diff --git a/.gitignore b/.gitignore
index f3219dd9..8b80a3f0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ CMakeFiles
 CMakeCache.txt
 cmake_install.cmake
 CMakeLists.txt.user
+.idea
\ No newline at end of file
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 00000000..dba15f30
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "libs/glfw"]
+	path = libs/glfw
+	url = https://github.com/glfw/glfw.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index bed4e57a..c9bf2069 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,11 +1,7 @@
-##################
-#     Thanda     #
-##################
-
 # credit - base CMake config : Yining Karl Li , edited CMake config: Akshay Shah & Debanshu Singh
 
 #name your project
-project(Thanda)
+project(FluidSolver)
 cmake_minimum_required(VERSION 2.8)
 
 # set creates a variable
@@ -16,6 +12,8 @@ include_directories(
         ${NUPARU}/src
 )
 
+set(CMAKE_INCLUDE_CURRENT_DIR ON)
+
 # Add path for pre-compiled libraries here (we will later link them with our compiled source)
 # Add Nuparu library to path for OSX, linux and windows
 if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
@@ -33,10 +31,20 @@ set(GLFW_LIBRARY_DIR ${CMAKE_LIBRARY_PATH})
 set(GLEW_LIBRARY_DIR ${CMAKE_LIBRARY_PATH})
 
 # Use find_package & find_library to link with
-find_package(OPENGL REQUIRED)
+find_package(OpenGL REQUIRED)
 find_package(GLEW)
 find_library(GLFW_LIBRARY "glfw3" HINTS ${GLFW_LIBRARY_DIR})
-find_library(JSONCPP "jsoncpp")
+find_library(JSONCPP "jsoncpp" REQUIRED)  # NOTE(review): find_library's REQUIRED option needs CMake >= 3.18, but this file declares minimum 2.8; on older versions a missing library is not reported at configure time -- TODO add explicit NOTFOUND checks
+find_library(BOOST_IOSTREAMS boost_iostreams)
+find_library(BOOST_SYSTEM boost_system)
+find_library(OPENVDB openvdb REQUIRED)
+find_library(OPENVDB_POINTS openvdb_points)
+find_library(HALF Half REQUIRED)
+find_library(TBB NAMES tbb tbbmalloc)  # stores the first of tbb / tbbmalloc that is found, not both
+find_library(ZLIB z)
+find_library(ANT AntTweakBar)
+
+#Iex IexMath Imath IlmThread
 
 add_definitions(
         -DTW_STATIC
@@ -46,7 +54,20 @@ add_definitions(
         -D_CRT_SECURE_NO_WARNINGS
 )
 
-set(CORE_LIBS ${GLFW_LIBRARY} ${GLUT_LIBRARY} ${GLEW_LIBRARY} ${JSONCPP} ${OPENGL_LIBRARY} )
+set(CORE_LIBS
+        ${GLFW_LIBRARY}
+        ${GLUT_LIBRARY}
+        ${GLEW_LIBRARY}
+        ${JSONCPP}
+        ${OPENGL_LIBRARY}
+        ${BOOST_IOSTREAMS}
+        ${BOOST_SYSTEM}
+        ${OPENVDB}
+        ${OPENVDB_POINTS}
+        ${TBB}
+        ${ZLIB}
+        ${HALF}
+        ${ANT})
 
 # OSX-specific hacks/fixes
 if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
@@ -59,22 +80,23 @@ endif(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
 
 # Linux specific hacks/fixes
 if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
-    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lX11 -lXxf86vm -lXrandr -lpthread -lXi")
+    set(CORE_LIBS ${CORE_LIBS} X11 Xxf86vm Xrandr pthread Xi)
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lX11 -lXxf86vm -lXrandr -lpthread -lXi -lz")
 endif()
 
 # set compiler flags for c++11
 if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O3 -m64 -msse2 -w")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Ofast -m64 -msse2 -w")
 elseif(WIN32)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
 endif()
 
 if(MSVC)
-    set(COMPILER_FLAGS 
+    set(COMPILER_FLAGS
             CMAKE_CXX_FLAGS
             CMAKE_CXX_FLAGS_DEBUG
             CMAKE_CXX_FLAGS_RELEASE
-            CMAKE_C_FLAGS 
+            CMAKE_C_FLAGS
             CMAKE_C_FLAGS_DEBUG
             CMAKE_C_FLAGS_RELEASE
         )
@@ -82,13 +104,24 @@ endif()
 
 # Add source files you want to compile (.cpp)
 set(CORE_SRC
-        src/main.cpp
-        src/camera/camera.cpp
-        src/viewer/viewer.cpp
-        src/fluidSolver/fluidSolver.cpp
-        src/scene/scene.cpp
-        src/geom/geom.cpp
-)
+        main.cpp
+        core/display/InputHandler.cpp
+        core/display/InputHandler.h
+        core/display/Window.cpp
+        core/display/Window.h
+        core/fileIO/SceneLoader.cpp
+        core/fileIO/SceneLoader.h
+        core/geometry/Bound.cpp
+        core/geometry/Bound.h
+        core/geometry/Box.cpp
+        core/geometry/Box.h
+        core/geometry/Geo.cpp
+        core/geometry/Geo.h
+        core/geometry/GeoObject.h
+        core/solver/FluidSolver.cpp
+        core/solver/FluidSolver.h
+        core/util/math.h
+        core/display/shaders/particle.frag.h core/display/shaders/particle.vert.h core/display/painters/ParticlesPainter.cpp core/display/painters/ParticlesPainter.h core/display/painters/Painter.cpp core/display/painters/Painter.h core/display/painters/BoxPainter.cpp core/display/painters/BoxPainter.h core/display/shaders/flat.vert.h core/display/shaders/flat.frag.h core/scenes/default.h core/camera/Camera.cpp core/camera/Camera.h core/solver/grid/Grid.cpp core/solver/grid/Grid.h core/solver/grid/MACGrid.cpp core/solver/grid/MACGrid.h core/solver/FluidParticle.h core/fileIO/ParticlesWriter.cpp core/fileIO/ParticlesWriter.h core/util/hacks.h core/display/painters/GridVectorAttributePainter.cpp core/display/painters/GridVectorAttributePainter.h core/display/shaders/gridAttr.geo.h core/util/flags.h core/display/painters/GridScalarAttributePainter.cpp core/display/painters/GridScalarAttributePainter.h core/display/shaders/gridScal.vert.h core/display/shaders/gridScal.frag.h)
 
-add_executable(Thanda ${CORE_SRC})
-target_link_libraries(Thanda ${CORE_LIBS})
+add_executable(FluidSolver ${CORE_SRC})
+target_link_libraries(FluidSolver ${CORE_LIBS})
diff --git a/README.md b/README.md
index 54283261..ebc5feed 100644
--- a/README.md
+++ b/README.md
@@ -1,16 +1,59 @@
-# CIS563-FluidSolver
-(Credit : CIS565 README)
+Fluid Solver
+===========
 
-Fluid Solver Submission guidelines:
+First steps toward building a fluid solver.
 
+## User Interface:
 
-- If you have modified any of the CMakeLists.txt files at all (aside from the list of CORE_SRC), you must test that your project can build. Beware of any build issues.
+* Middle mouse scroll to zoom in/out
+* Middle mouse click and drag to orbit the camera
+* SHIFT + middle mouse click and drag to track/slide
 
-- Open a GitHub pull request so that we can see that you have finished. The title should be "Submission: YOUR NAME".
+## Code Overview:
 
-- In the body of the pull request, include a link to your repository.
+### Scene Loading
 
-- Submit on canvas with a direct link to your pull request on GitHub
+Scenes are loaded by passing a path to the scene file as the first program argument.
+It does a simple parsing with jsoncpp to create objects.
 
+* containerDim: size and position of the container
+* particleDim: size and position of initial fluid object
+* resolution: number of grid divisions on the largest axis
 
-And you're done!
\ No newline at end of file
+#### AntTweakBar
+
+I've added AntTweakBar, but haven't gotten around to having it do anything yet. I also don't have a Mac, so I was unable to compile the libraries for it.
+
+### Geometry
+
+All geometry objects implement functions for collision detection. These come in a few different forms, allowing collision detection given next and previous points, given a point and distance tolerance, as well as given a point, ray, and timestep.
+All geometry objects also have a bounding box which at the moment is used to assist in converting the geometry to particles.
+
+### Fluid Solver
+
+Particles are created by looping over all of the geometries' bounding boxes and checking if the point is contained within them.
+For solving, particles are simply accelerated by a static gravity constant and then collisions are checked against the container geometry.
+All information is stored in a temporary buffer which is swapped with the particle buffer at the end of the solve.
+
+### Drawing
+
+A Painter class is used to define the drawing behavior of various elements in the scene. Each painter sets up its own shaders on initialization and implements methods to draw its respective objects.
+I found it much nicer to isolate my code this way so that I didn't have a billion gl calls in my geometry classes and a billion gl calls in my Window class.
+Shaders are stored as char arrays in header files. I found that the easiest way to package them with my code.
+
+The Window sets up a glfw window and a Singleton instance of InputHandler. glfw doesn't let you have non-static callback functions so I instead have callback functions to update the state of my static InputHandler which the Window can subscribe to.
+From there, I can get the window/keyboard/mouse data and do the appropriate camera calculations.
+
+## Parallelization:
+
+TBB is used heavily to parallelize calculations on the particles and grid. This can be toggled on/off by defining/undefining USETBB in <core/util/flags.h>.
+
+For the default scene, the parallelized code computes each frame in an average of 0.0494588 seconds. Without TBB, this is 0.0768101 seconds.
+
+## OpenVDB:
+
+Currently, each frame is written out as "particles_{frame}.vdb" in the current directory
+
+## Known Problems:
+
+There seems to be an edge indexing problem in the attribute transfer from grid to particle
\ No newline at end of file
diff --git a/core/camera/Camera.cpp b/core/camera/Camera.cpp
new file mode 100644
index 00000000..9b7c5680
--- /dev/null
+++ b/core/camera/Camera.cpp
@@ -0,0 +1,53 @@
+//
+// Created by austin on 2/29/16.
+//
+
+#include "Camera.h"
+
+Camera::Camera(int w, int h) :  // w, h: viewport width/height, used for the projection aspect ratio
+        world_up(glm::vec3(0,1,0)),  // initializers are listed in member declaration order (see Camera.h)
+        zoom(35),
+        eye(glm::vec3(0,0,zoom)),
+        tgt(glm::vec3(0,0,0)),
+        look(tgt - eye),
+        up(glm::cross(glm::cross(look, world_up), look)),  // `up` is declared (and so initialized) before `right`; it must not read `right`
+        right(glm::cross(look, world_up)),
+        width(w),
+        height(h),
+        fovy(45),
+        near_clip(0.001f),
+        far_clip(1000.f),
+        rotation(glm::mat4(1.f)) {
+    // Optional initial orbit, currently disabled:
+//    rotation = glm::rotate(rotation, -PI/6, glm::vec3(1,0,0));
+//    rotation = glm::rotate(rotation, PI/4, glm::vec3(0,1,0));
+    // recomputeEye() places the eye from zoom/rotation; recompute() then builds _viewProj.
+    recomputeEye();
+    recompute();
+}
+
+glm::mat4 Camera::viewProj() {
+    return _viewProj;
+}
+
+void Camera::recompute() {  // rebuilds look/right/up from eye, tgt and world_up, then refreshes the cached view-projection matrix
+    float aspect = (width > 0 && height > 0) ? (float)width/height : 1.f;  // guard: a zero-sized viewport would otherwise give a 0 or non-finite aspect
+    look = glm::normalize(tgt - eye);
+    right = glm::cross(look, world_up);
+    up = glm::cross(right, look);
+    // NOTE(review): fovy (45) is passed unchanged to glm::perspective -- confirm the angle unit this GLM configuration expects.
+    _viewProj = glm::perspective(fovy, aspect, near_clip, far_clip) * glm::lookAt(eye, tgt, up);
+}
+
+void Camera::resize(int w, int h) {
+    width = w;
+    height = h;
+    recompute();
+}
+
+void Camera::recomputeEye() {  // places the eye `zoom` units from tgt along the rotated +Z axis, then re-derives look/right/up
+    eye = glm::vec3(rotation * glm::vec4(0,0,zoom,0)) + tgt;
+    look = glm::normalize(tgt - eye);
+    right = glm::normalize(glm::cross(look, up));  // built from the current `up`, not from world_up
+    up = glm::normalize(glm::cross(right, look));
+}
diff --git a/core/camera/Camera.h b/core/camera/Camera.h
new file mode 100644
index 00000000..62be261a
--- /dev/null
+++ b/core/camera/Camera.h
@@ -0,0 +1,35 @@
+//
+// Created by austin on 2/29/16.
+//
+
+#ifndef FLUIDSOLVER_CAMERA_H
+#define FLUIDSOLVER_CAMERA_H
+
+#include <core/util/math.h>
+
+class Camera {
+private:
+    glm::vec3 world_up;
+public:
+    float zoom;
+    glm::vec3 eye;
+    glm::vec3 tgt;
+
+    Camera(int w, int h);
+    glm::mat4 viewProj();
+    void recomputeEye();
+    void recompute();
+    void resize(int w, int h);
+
+    glm::vec3 look, up, right;
+    int width, height;
+    float fovy, near_clip, far_clip;
+    glm::mat4 rotation;
+
+private:
+
+    glm::mat4 _viewProj;
+};
+
+
+#endif //FLUIDSOLVER_CAMERA_H
diff --git a/core/display/InputHandler.cpp b/core/display/InputHandler.cpp
new file mode 100644
index 00000000..799eb5f5
--- /dev/null
+++ b/core/display/InputHandler.cpp
@@ -0,0 +1,216 @@
+//
+// Created by austin on 2/26/16.
+//
+
+#include "InputHandler.h"
+#include <iostream>
+#include <algorithm>
+
+InputHandler::InputHandler() {
+
+}
+
+double InputHandler::x() const {
+    return _mouseState.x;
+}
+
+double InputHandler::y() const {
+    return _mouseState.y;
+}
+
+double InputHandler::delX() const {
+    return _mouseState.delX;
+}
+
+double InputHandler::delY() const {
+    return _mouseState.delY;
+}
+
+double InputHandler::delWheel() const {
+    return _mouseState.delWheel;
+}
+
+bool InputHandler::leftDown() const {
+    return _mouseState.leftDown;
+}
+
+bool InputHandler::wheelDown() const {
+    return _mouseState.wheelDown;
+}
+
+bool InputHandler::rightDown() const {
+    return _mouseState.rightDown;
+}
+
+bool InputHandler::key(int key) const {
+    return _keyboard.find(key) != _keyboard.end();
+}
+
+void InputHandler::x(double val, bool events) {
+    std::swap(_mouseState.x, val);
+
+    if (events) {
+        _mouseState.delX = _mouseState.x - val;
+
+        mouseMoved();
+
+        emit(_mouseState);
+        _mouseState.leftDragInit = false;
+        _mouseState.wheelDragInit = false;
+        _mouseState.rightDragInit = false;
+    }
+}
+
+void InputHandler::y(double val, bool events) {
+    std::swap(_mouseState.y, val);
+
+    if (events) {
+        _mouseState.delY = _mouseState.y - val;
+
+        mouseMoved();
+
+        emit(_mouseState);
+        _mouseState.leftDragInit = false;
+        _mouseState.wheelDragInit = false;
+        _mouseState.rightDragInit = false;
+    }
+}
+
+void InputHandler::mouseMoved() {
+    if (_mouseState.leftDown) {
+        if (!_mouseState.leftDragInit) {
+            _mouseState.leftDragInit = true;
+            _mouseState.startLeftX = _mouseState.x;
+            _mouseState.startLeftY = _mouseState.y;
+        }
+        _mouseState.leftDragging = true;
+    }
+    if (_mouseState.wheelDown) {
+        if (!_mouseState.wheelDragInit) {
+            _mouseState.wheelDragInit = true;
+            _mouseState.startWheelX = _mouseState.x;
+            _mouseState.startWheelY = _mouseState.y;
+        }
+        _mouseState.wheelDragging = true;
+    }
+    if (_mouseState.rightDown) {
+        if (!_mouseState.rightDragInit) {
+            _mouseState.rightDragInit = true;
+            _mouseState.startRightX = _mouseState.x;
+            _mouseState.startRightY = _mouseState.y;
+        }
+        _mouseState.rightDragging = true;
+    }
+}
+
+void InputHandler::pos(double x, double y, bool events) {
+    std::swap(_mouseState.x, x);
+    std::swap(_mouseState.y, y);
+
+    if (events) {
+        _mouseState.delX = _mouseState.x - x;
+        _mouseState.delY = _mouseState.y - y;
+
+        mouseMoved();
+
+        emit(_mouseState);
+        _mouseState.leftDragInit = false;
+        _mouseState.wheelDragInit = false;
+        _mouseState.rightDragInit = false;
+    }
+}
+
+void InputHandler::delX(double val, bool events) {
+    _mouseState.delX = val;
+    if (events) {
+        emit(_mouseState);
+    }
+}
+
+void InputHandler::delY(double val, bool events) {
+    _mouseState.delY = val;
+    if (events) {
+        emit(_mouseState);
+    }
+}
+
+
+void InputHandler::delWheel(double val, bool events) {  // stores a scroll delta and, if `events` is set, notifies mouse listeners
+    _mouseState.delWheel = val;
+    if (events) {
+        emit(_mouseState);
+    }
+    _mouseState.delWheel = 0;  // reset after emitting so later events do not carry this scroll amount again
+}
+
+void InputHandler::leftDown(bool val, bool events) {
+    _mouseState.leftDown = val;
+    if (events) {
+        if (!val) {
+            _mouseState.leftDragging = false;
+            _mouseState.leftDragFinish = true;
+        }
+        emit(_mouseState);
+        _mouseState.leftDragFinish = false;
+    }
+}
+
+void InputHandler::wheelDown(bool val, bool events) {
+    _mouseState.wheelDown = val;
+    if (events) {
+        if (!val) {
+            _mouseState.wheelDragging = false;
+            _mouseState.wheelDragFinish = true;
+        }
+        emit(_mouseState);
+        _mouseState.wheelDragFinish = false;
+    }
+}
+
+void InputHandler::rightDown(bool val, bool events) {
+    _mouseState.rightDown = val;
+    if (events) {
+        if (!val) {
+            _mouseState.rightDragging = false;
+            _mouseState.rightDragFinish = true;
+        }
+        emit(_mouseState);
+        _mouseState.rightDragFinish = false;
+    }
+}
+
+void InputHandler::key(int key, bool down, bool events) {  // marks `key` as held (down) or released; `events` is currently unused
+    if (down) {
+        _keyboard.insert(key);
+    } else {
+        _keyboard.erase(key);  // erase-by-value is a no-op for an untracked key; erase(find(key)) would pass end(), which is undefined
+    }
+}
+
+void InputHandler::emit(MouseState &event) {  // calls every registered mouse listener, in registration order
+    for (std::vector<MouseListener>::size_type idx = 0; idx < mouseSubscribers.size(); ++idx) {
+        mouseSubscribers[idx](event);
+    }
+}
+
+void InputHandler::registerMouseListener(InputHandler::MouseListener listener) {
+    mouseSubscribers.push_back(listener);
+}
+
+//void InputHandler::deregisterMouseListener(MouseListener listener) {
+//    mouseSubscribers.erase(std::remove(mouseSubscribers.begin(), mouseSubscribers.end(), listener), mouseSubscribers.end());
+//}
+
+void InputHandler::windowResized(int w, int h) {  // forwards the new window size to every registered window listener
+    for (std::vector<WindowListener>::size_type idx = 0; idx < windowSubscribers.size(); ++idx) {
+        windowSubscribers[idx](w, h);
+    }
+}
+
+void InputHandler::registerWindowListener(InputHandler::WindowListener listener) {
+    windowSubscribers.push_back(listener);
+}
+
+//void InputHandler::deregisterWindowListener(InputHandler::WindowListener listener) {
+//    windowSubscribers.erase(std::remove(windowSubscribers.begin(), windowSubscribers.end(), listener), windowSubscribers.end());
+//}
diff --git a/core/display/InputHandler.h b/core/display/InputHandler.h
new file mode 100644
index 00000000..8ab69b35
--- /dev/null
+++ b/core/display/InputHandler.h
@@ -0,0 +1,94 @@
+//
+// Created by austin on 2/26/16.
+//
+
+#ifndef FLUIDSOLVER_INPUTHANDLER_H
+#define FLUIDSOLVER_INPUTHANDLER_H
+
+#include <set>
+#include <vector>
+#include <functional>
+
+class InputHandler {
+
+public:
+    static InputHandler& getInputHandler() {
+        static InputHandler inputHandler;
+        return inputHandler;
+    }
+
+    double x() const;
+    double y() const;
+    double delX() const;
+    double delY() const;
+    double delWheel() const;
+    bool leftDown() const;
+    bool wheelDown() const;
+    bool rightDown() const;
+    bool key(int key) const;
+
+    void x(double val, bool events = true);
+    void y(double val, bool events = true);
+    void pos(double x, double y, bool events = true);
+    void delX(double val, bool events = true);
+    void delY(double val, bool events = true);
+    void delWheel(double val, bool events = true);
+    void leftDown(bool val, bool events = true);
+    void wheelDown(bool val, bool events = true);
+    void rightDown(bool val, bool events = true);
+    void key(int key, bool down, bool events = true);
+
+    struct MouseState {
+        double x;
+        double y;
+        double delX;
+        double delY;
+        double startLeftX;
+        double startLeftY;
+        double startWheelX;
+        double startWheelY;
+        double startRightX;
+        double startRightY;
+        double delWheel;
+        bool leftDown;
+        bool wheelDown;
+        bool rightDown;
+        bool leftDragInit;
+        bool wheelDragInit;
+        bool rightDragInit;
+        bool leftDragging;
+        bool wheelDragging;
+        bool rightDragging;
+        bool leftDragFinish;
+        bool wheelDragFinish;
+        bool rightDragFinish;
+    };
+
+    //typedef void(*MouseListener)(MouseState&);
+    //typedef void (*WindowListener)(int w, int h);
+    typedef std::function<void(int, int)> WindowListener;
+    typedef std::function<void(MouseState&)> MouseListener;
+    void registerMouseListener(MouseListener listener);
+    //void deregisterMouseListener(MouseListener listener);
+    void registerWindowListener(WindowListener listener);
+    //void deregisterWindowListener(WindowListener listener);
+    void windowResized(int w, int h);
+
+private:
+    InputHandler();
+    InputHandler(InputHandler const&) {} // prevent copies
+    void operator=(InputHandler const&) {} // prevent assignments
+
+    std::set<int> _keyboard;
+
+    MouseState _mouseState;
+
+    void mouseMoved();
+
+    std::vector<MouseListener> mouseSubscribers;
+    std::vector<WindowListener> windowSubscribers;
+    void emit(MouseState &event);
+};
+
+
+#endif //FLUIDSOLVER_INPUTHANDLER_H
diff --git a/core/display/Window.cpp b/core/display/Window.cpp
new file mode 100644
index 00000000..c5b129a8
--- /dev/null
+++ b/core/display/Window.cpp
@@ -0,0 +1,322 @@
+//
+// Created by austin on 2/25/16.
+//
+
+#include "Window.h"
+#include <algorithm>
+#include <iostream>
+#include <fstream>
+
+static void error_callback(int error, const char* description) {
+    fputs(description, stderr);
+}
+
+InputHandler &inputHandler = InputHandler::getInputHandler();
+
+Window::Window(const char *title) : Window(1200, 800, title) { }
+Window::Window(int w, int h) : Window(w, h, "GL Window"){ }
+Window::Window(int w, int h, const char* title) : _window(nullptr), camera(w, h), _w(w), _h(h),
+    loadSceneCB(NULL) {
+    glfwSetErrorCallback(error_callback);
+
+    if (!glfwInit()) exit(EXIT_FAILURE);
+
+    // use antialiasing
+    glfwWindowHint(GLFW_SAMPLES, 4);
+
+    // set version to OpenGL 3.3
+    glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
+    glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3);
+    #ifdef __APPLE__
+    glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE);
+    #endif
+    glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
+    glfwWindowHint(GLFW_RESIZABLE, GL_TRUE);
+
+    _window = glfwCreateWindow(w, h, title, NULL, NULL);
+    pixels.resize(_w*_h*4);
+
+    if (!_window) {
+        glfwTerminate();
+        exit(EXIT_FAILURE);
+    }
+
+    glfwMakeContextCurrent(_window);
+    glfwSwapInterval(1);
+    setupInputCBs();
+
+    glewExperimental= GL_TRUE;
+    if (glewInit() != GLEW_OK) {
+        fprintf(stderr, "Failed to initialize GLEW\n");
+        exit(EXIT_FAILURE);
+    }
+
+    glGenTextures(1, &texture);
+    glBindTexture(GL_TEXTURE_2D, texture);
+    glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR );
+    glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR );
+    glTexImage2D( GL_TEXTURE_2D, 0, GL_RGBA, w, h, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0);
+
+    glGenFramebuffers(1, &fbo);
+    glBindFramebuffer(GL_FRAMEBUFFER, fbo);
+    glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, texture, 0);
+
+    glBindFramebuffer(GL_FRAMEBUFFER, 0);
+}
+
+Window::~Window() {
+
+}
+
+void Window::initializeTweakBar() {
+    int w, h;
+    glfwGetWindowSize(_window, &w, &h);
+    TwInit(TW_OPENGL_CORE, NULL);
+    TwWindowSize(w, h);
+    TwBar *myBar;
+    myBar = TwNewBar("Settings");
+    TwAddButton(myBar, "loadsceneBtn", loadSceneCB, NULL, " label='Load Scene'");
+}
+
+
+void Window::setupInputCBs() {
+
+    glfwSetKeyCallback(_window, [](GLFWwindow *window, int key, int scancode, int action, int mods) {
+        switch(action) {
+            case GLFW_PRESS:
+                inputHandler.key(key, true);
+                break;
+            case GLFW_RELEASE:
+                inputHandler.key(key, false);
+                break;
+            default: break;
+        }
+        if (key == GLFW_KEY_ESCAPE && action == GLFW_PRESS)
+            glfwSetWindowShouldClose(window, GL_TRUE);
+
+        switch(action) {
+            case GLFW_PRESS:
+                inputHandler.key(scancode, true);
+                break;
+            case GLFW_RELEASE:
+                inputHandler.key(scancode, false);
+                break;
+            default:break;
+        }
+
+        TwEventKeyGLFW(key, action);
+        TwEventCharGLFW(key, action);
+    });
+    glfwSetCursorPosCallback(_window, [](GLFWwindow* window, double xpos, double ypos) {
+        inputHandler.pos(xpos, ypos);
+        TwEventMousePosGLFW(xpos, ypos);
+    });
+    glfwSetMouseButtonCallback(_window, [](GLFWwindow* window, int button, int action, int mods) {
+        switch(button) {
+            case GLFW_MOUSE_BUTTON_LEFT:
+                switch(action) {
+                    case GLFW_PRESS:
+                        inputHandler.leftDown(true);
+                        break;
+                    case GLFW_RELEASE:
+                        inputHandler.leftDown(false);
+                        break;
+                    default:break;
+                }
+                break;
+            case GLFW_MOUSE_BUTTON_MIDDLE:
+                switch(action) {
+                    case GLFW_PRESS:
+                        inputHandler.wheelDown(true);
+                        break;
+                    case GLFW_RELEASE:
+                        inputHandler.wheelDown(false);
+                        break;
+                    default:break;
+                }
+                break;
+            case GLFW_MOUSE_BUTTON_RIGHT:
+                switch(action) {
+                    case GLFW_PRESS:
+                        inputHandler.rightDown(true);
+                        break;
+                    case GLFW_RELEASE:
+                        inputHandler.rightDown(false);
+                        break;
+                    default:break;
+                }
+                break;
+            default:break;
+        }
+        TwEventMouseButtonGLFW(button, action);
+    });
+    glfwSetScrollCallback(_window, [](GLFWwindow* window, double xoffset, double yoffset) {  // capture-less: only a lambda without a capture list converts to the function pointer GLFW takes
+        inputHandler.delWheel(yoffset);  // inputHandler is a file-scope reference, so no capture is needed
+        TwEventMouseWheelGLFW(yoffset);
+    });
+
+    glfwSetWindowSizeCallback(_window, [](GLFWwindow *window, int width, int height) {
+        inputHandler.windowResized(width, height);
+        TwWindowSize(width, height);
+    });
+
+    inputHandler.registerMouseListener([&](InputHandler::MouseState &mouseState) {  // camera controls: wheel zooms, middle-drag orbits, SHIFT + middle-drag slides the target
+        if (!fequal(mouseState.delWheel, 0.0)) {
+            // change camera zoom level based on scroll direction
+            glm::vec3 vec = camera.tgt - camera.eye;
+            // limit zoom when very near to target
+            float fac = glm::min(glm::abs(glm::length(vec)/5.f), 1.f);
+            camera.zoom -= (float)mouseState.delWheel * fac;
+            camera.recomputeEye();
+            updateCamera();
+        }
+
+        if (mouseState.wheelDragging) {
+            if (inputHandler.key(GLFW_KEY_LEFT_SHIFT)) {
+                // pixel position offset from center
+                float x = (float) (_w/2 - mouseState.delX);
+                float y = (float) (_h/2 - mouseState.delY);
+
+                // offset in ndc
+                float sx = (2*x / _w) - 1.f;
+                float sy = 1.f - (2*y / _h);
+
+                // project camera up and right axes
+                float alpha = camera.fovy / 2;
+                float len = glm::length(camera.tgt - camera.eye);
+                glm::vec3 V = camera.up*(float)(len*tan(alpha));
+                glm::vec3 H = camera.right*(float)(len*((float)_w / _h)*tan(alpha)); // float aspect: int/int would truncate (e.g. 1200/800 -> 1)
+
+                camera.tgt = camera.tgt + sx*H + sy*V;
+                camera.recomputeEye();
+                updateCamera();
+                return;
+            }
+            if (mouseState.delX == 0.0 && mouseState.delY == 0.0) return; // no movement: the angle below would divide by a zero length and produce NaN
+            glm::vec4 y = glm::vec4(0,1,0,0); // y axis vector
+            glm::vec4 diff(mouseState.delX / _w, mouseState.delY / _h, 0, 0); // mouse offset
+            float a = (float) acos(glm::clamp(glm::dot(y, diff) / (glm::length(y) * (glm::length(diff))), -1.f, 1.f)); // offset angle from y axis; clamped so rounding cannot push acos outside [-1,1]
+            glm::vec4 para; // parallel axis to mouse movement
+            if (diff[0] > 0) {
+                para = glm::mat4_cast(glm::angleAxis(-a, glm::vec3(camera.look[0], camera.look[1], camera.look[2]))) * glm::vec4(camera.up, 1);
+            } else {
+                para = glm::mat4_cast(glm::angleAxis(a, glm::vec3(camera.look[0], camera.look[1], camera.look[2]))) * glm::vec4(camera.up, 1);
+            }
+            glm::vec3 perp = glm::normalize(glm::cross(camera.look, glm::vec3(para))); // perpendicular axis to mouse movement
+
+            // rotate camera on perpendicular axis
+            glm::mat4 rot = glm::mat4_cast(glm::angleAxis(-2*PI*glm::length(diff), perp));
+            camera.rotation = rot * camera.rotation;
+            camera.up = glm::vec3(rot * glm::vec4(camera.up, 0));
+            camera.right = glm::vec3(rot * glm::vec4(camera.right, 0));
+            camera.look = glm::vec3(rot * glm::vec4(camera.look, 0));
+            camera.recomputeEye();
+            updateCamera();
+        }
+    });
+
+    inputHandler.registerWindowListener([&](int w, int h){
+        _w = w;
+        _h = h;
+        glViewport(0, 0, w, h);
+        camera.resize(w, h);
+        updateCamera();
+    });
+}
+
+//https://danielbeard.wordpress.com/2011/06/06/image-saving-code-c/
+void Window::saveImage(const std::string &filename) {  // draws all painters into the offscreen framebuffer and writes the readback to `filename` as an uncompressed TGA
+    glBindFramebuffer(GL_FRAMEBUFFER, fbo);
+    glViewport(0,0,_w,_h);
+    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
+
+    for (Painter* painter : _painters) {
+        painter->draw();
+    }
+    pixels.resize(_w*_h*4);  // _w/_h follow window resizes but the buffer was only sized in the constructor; without this glReadPixels can overrun it
+    // NOTE(review): the texture attached to fbo is still allocated at the construction-time size -- confirm readback after a resize.
+    glReadPixels(0,0,_w,_h, GL_BGRA, GL_UNSIGNED_BYTE, pixels.data());
+
+    std::ofstream o(filename.c_str(), std::ios::out | std::ios::binary);
+    o.put(0);                         /* ID length */
+    o.put(0);                         /* color map type: none */
+    o.put(2);                         /* uncompressed RGB */
+    o.put(0);   o.put(0);             /* color map specification (5 bytes, unused) */
+    o.put(0);   o.put(0);
+    o.put(0);
+    o.put(0);   o.put(0);             /* X origin */
+    o.put(0);   o.put(0);             /* y origin */
+    o.put((_w & 0x00FF));             /* width, low byte then high byte */
+    o.put((_w & 0xFF00) / 256);
+    o.put((_h & 0x00FF));             /* height, low byte then high byte */
+    o.put((_h & 0xFF00) / 256);
+    o.put(32);                        /* 32 bits per pixel (BGRA) */
+    o.put(0);
+
+    for (int i=0;i<_w*_h*4;i+=4) {
+        // copy B, G, R from the readback and force the alpha byte to opaque
+        o.put(pixels[i+0]);
+        o.put(pixels[i+1]);
+        o.put(pixels[i+2]);
+        o.put(255);
+    }
+
+    o.close();
+
+    glBindFramebuffer(GL_FRAMEBUFFER, 0);
+}
+
+void Window::initloop(std::function<void(void)> predraw) {
+    GLuint vao;
+    glGenVertexArrays(1, &vao);
+    glBindVertexArray(vao);
+
+    for (Painter* painter : _painters) {
+        painter->setViewProj(glm::value_ptr(camera.viewProj()));
+    }
+
+    glClearColor(0.2f, 0.2f, 0.2f, 1.f);
+    glEnable(GL_DEPTH_TEST);
+
+    while (!glfwWindowShouldClose(_window)) {
+        predraw();
+        glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
+
+        glBindVertexArray(vao);
+
+        for (Painter* painter : _painters) {
+            painter->draw();
+        }
+
+
+//        TwDraw();
+
+        glfwSwapBuffers(_window);
+        glfwPollEvents();
+    }
+
+//    TwTerminate();
+
+    glfwDestroyWindow(_window);
+    glfwTerminate();
+}
+
+void Window::handleMouseInput(InputHandler::MouseState &mouseState) {
+
+}
+
+void Window::addPainter(Painter *painter) {
+    _painters.push_back(painter);
+}
+
+void Window::removePainter(Painter *painter) {
+    _painters.erase(std::remove(_painters.begin(), _painters.end(), painter), _painters.end());
+}
+
+void Window::updateCamera() {
+    camera.recompute();
+    // send camera uniforms to painters
+    for (Painter* painter : _painters) {
+        painter->setViewProj(glm::value_ptr(camera.viewProj()));
+    }
+}
diff --git a/core/display/Window.h b/core/display/Window.h
new file mode 100644
index 00000000..e8881ab4
--- /dev/null
+++ b/core/display/Window.h
@@ -0,0 +1,49 @@
+//
+// Created by austin on 2/25/16.
+//
+
+#ifndef FLUID_SIMULATOR_WINDOW_H
+#define FLUID_SIMULATOR_WINDOW_H
+
+#include <GL/glew.h>
+#include <GLFW/glfw3.h>
+#include "InputHandler.h"
+#include <core/solver/FluidSolver.h>
+#include <core/display/painters/Painter.h>
+#include <core/camera/Camera.h>
+#include <AntTweakBar/AntTweakBar.h>
+
+class Window {
+public:
+    Window(const char* title);
+    Window(int w = 1200, int h = 800);
+    Window(int w, int h, const char* title);
+    ~Window();
+    void initloop(std::function<void(void)> predraw = [](){});
+
+    void saveImage(const std::string &filename);
+
+    void addPainter(Painter* painter);
+    void removePainter(Painter* painter);
+
+    void initializeTweakBar();
+    TwButtonCallback loadSceneCB;
+
+private:
+    GLFWwindow* _window;
+    void setupInputCBs();
+    void handleMouseInput(InputHandler::MouseState &mouseState);
+    std::vector<Painter*> _painters;
+    Camera camera;
+    void updateCamera();
+
+    GLuint texture;
+    GLuint fbo;
+    std::vector<char> pixels;
+
+    int _w;
+    int _h;
+};
+
+
+#endif //FLUID_SIMULATOR_WINDOW_H
diff --git a/core/display/painters/BoxPainter.cpp b/core/display/painters/BoxPainter.cpp
new file mode 100644
index 00000000..daf93e00
--- /dev/null
+++ b/core/display/painters/BoxPainter.cpp
@@ -0,0 +1,114 @@
+//
+// Created by austin on 2/29/16.
+//
+
+#include "BoxPainter.h"
+#include <core/display/shaders/flat.vert.h>
+#include <core/display/shaders/flat.frag.h>
+
+struct vert { // interleaved vertex record uploaded by BoxPainter::create() and described to GL in draw()
+    glm::vec3 pos; // fed to shader attribute "v_pos"
+    glm::vec3 col; // fed to shader attribute "v_col"
+};
+
+BoxPainter::BoxPainter(Box *box) : _box(box) {
+    // build the flat-colour program from the embedded shader sources
+    std::vector<GLuint> stages;
+    stages.push_back(compileShader(flat_vert, GL_VERTEX_SHADER));
+    stages.push_back(compileShader(flat_frag, GL_FRAGMENT_SHADER));
+    prog = makeProgram(stages);
+    // look up the uniform/attribute slots used by draw() and setViewProj()
+    unifViewProj = glGetUniformLocation(prog, "u_viewProj");
+    attrPos = glGetAttribLocation(prog, "v_pos");
+    attrCol = glGetAttribLocation(prog, "v_col");
+    // one buffer for the 8 corners, one for the 24 line indices
+    glGenBuffers(1, &vertex_buffer);
+    glGenBuffers(1, &index_buffer);
+    // upload the initial geometry (create() does nothing when box is null)
+    create();
+}
+
+// Re-uploads the box's current bounds into the existing GL buffers (no-op without a box).
+void BoxPainter::update() {
+    if (_box != nullptr) {
+        create();  // glBufferData re-specifies both stores; the buffers must NOT be deleted afterwards or draw() would bind dead names
+    }
+}
+
+void BoxPainter::draw() const { // draws the box as 12 wireframe edges; does nothing without a box
+    if (_box != nullptr) {
+        glUseProgram(prog);
+        // vertex data: interleaved {pos, col} records written by create()
+        glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
+        // stride is one whole vert record; the offset selects the member inside it
+        glEnableVertexAttribArray(attrPos);
+        glVertexAttribPointer(attrPos, 3, GL_FLOAT, GL_FALSE, sizeof(vert), (void*)offsetof(vert, pos));
+
+        glEnableVertexAttribArray(attrCol);
+        glVertexAttribPointer(attrCol, 3, GL_FLOAT, GL_FALSE, sizeof(vert), (void*)offsetof(vert, col));
+        // 24 indices = 12 line segments
+        glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, index_buffer);
+        glDrawElements(GL_LINES, 24, GL_UNSIGNED_INT, (void*)0);
+        // switch the attribute arrays back off once the draw call has been issued
+        glDisableVertexAttribArray(attrPos);
+        glDisableVertexAttribArray(attrCol);
+    }
+}
+
+// Rebuilds the wireframe geometry for _box and uploads it into the already
+// generated vertex and index buffers. Does nothing when _box is null.
+void BoxPainter::create() {
+    if (_box == nullptr) {
+        return;
+    }
+
+    // read the six bounds of the box
+    const float loX = _box->minX();
+    const float hiX = _box->maxX();
+    const float loY = _box->minY();
+    const float hiY = _box->maxY();
+    const float loZ = _box->minZ();
+    const float hiZ = _box->maxZ();
+
+    // corners 0-3 lie on the min-X face, corners 4-7 on the max-X face
+    const glm::vec3 corners[8] = {
+        glm::vec3(loX, loY, loZ),
+        glm::vec3(loX, hiY, loZ),
+        glm::vec3(loX, hiY, hiZ),
+        glm::vec3(loX, loY, hiZ),
+        glm::vec3(hiX, loY, loZ),
+        glm::vec3(hiX, hiY, loZ),
+        glm::vec3(hiX, hiY, hiZ),
+        glm::vec3(hiX, loY, hiZ),
+    };
+
+    // 12 edges as index pairs, consumed by draw() as GL_LINES
+    const GLuint edges[24] = {
+        0, 1,  1, 2,  2, 3,  3, 0,   // loop around the min-X face
+        4, 5,  5, 6,  6, 7,  7, 4,   // loop around the max-X face
+        0, 4,  1, 5,  2, 6,  3, 7,   // edges joining the two faces
+    };
+
+    vert boxVerts[8];
+    for (int c = 0; c < 8; ++c) {
+        boxVerts[c].pos = corners[c];
+        boxVerts[c].col = glm::vec3(1,1,1);  // every corner is white
+    }
+
+    // (re)specify both buffer stores with the fresh data
+    glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
+    glBufferData(GL_ARRAY_BUFFER, sizeof(boxVerts), boxVerts, GL_STATIC_DRAW);
+
+    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, index_buffer);
+    glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(edges), edges, GL_STATIC_DRAW);
+}
+
+void BoxPainter::destroy() { // deletes both GL buffer objects; their names are invalid afterwards until glGenBuffers is called again
+    glDeleteBuffers(1, &vertex_buffer);
+    glDeleteBuffers(1, &index_buffer);
+}
+
+void BoxPainter::setViewProj(const float *viewProj) {
+    glUseProgram(prog);  // glUniform* writes to the program currently in use, so select ours first
+    glUniformMatrix4fv(unifViewProj, 1, GL_FALSE, viewProj);  // one mat4, not transposed: viewProj is read as 16 floats
+}
diff --git a/core/display/painters/BoxPainter.h b/core/display/painters/BoxPainter.h
new file mode 100644
index 00000000..a162eeb1
--- /dev/null
+++ b/core/display/painters/BoxPainter.h
@@ -0,0 +1,33 @@
+//
+// Created by austin on 2/29/16.
+//
+
+#ifndef FLUIDSOLVER_BOXPAINTER_H
+#define FLUIDSOLVER_BOXPAINTER_H
+
+#include "Painter.h"
+#include <core/geometry/Box.h>
+
+class BoxPainter : public Painter { // draws the wireframe outline of a Box
+public:
+    BoxPainter(Box* box); // stores the pointer (the Box is not copied); with a null box, update()/draw() do nothing
+    void update(); // calls create() when a box is set
+    void draw() const; // overrides Painter::draw()
+    virtual void setViewProj(const float* viewProj);
+
+private:
+    GLuint vertex_buffer; // holds the 8 corner vertices
+    GLuint index_buffer; // holds the 24 line indices
+    // locations looked up in the flat shader program
+    GLint unifViewProj;
+    GLint attrPos;
+    GLint attrCol;
+    // not deleted by this class (no destructor is declared)
+    Box* _box;
+
+    void create(); // fills both buffers from _box
+    void destroy(); // deletes both buffers
+};
+
+
+#endif //FLUIDSOLVER_BOXPAINTER_H
diff --git a/core/display/painters/GridScalarAttributePainter.cpp b/core/display/painters/GridScalarAttributePainter.cpp
new file mode 100644
index 00000000..2fd1c3df
--- /dev/null
+++ b/core/display/painters/GridScalarAttributePainter.cpp
@@ -0,0 +1,159 @@
+//
+// Created by austin on 3/28/16.
+//
+
+#include "GridScalarAttributePainter.h"
+#include <core/display/shaders/gridScal.frag.h>
+#include <core/display/shaders/gridScal.vert.h>
+#include <iostream>
+
+GridScalarAttributePainter::GridScalarAttributePainter(
+        Grid<float> *grid, float rangeStart, float rangeEnd,
+        float ptSizeStart, float ptSizeEnd, const glm::vec3 &colorStart,
+        const glm::vec3 &colorEnd)
+        :
+        GridScalarAttributePainter(grid, rangeStart, rangeEnd, ptSizeStart, ptSizeEnd, colorStart, colorEnd, FLOAT) {
+    // float-grid overload: all work is done by the templated constructor it delegates to, tagged FLOAT
+}
+
+GridScalarAttributePainter::GridScalarAttributePainter(
+        Grid<int> *grid, float rangeStart, float rangeEnd,
+        float ptSizeStart, float ptSizeEnd, const glm::vec3 &colorStart,
+        const glm::vec3 &colorEnd)
+        :
+        GridScalarAttributePainter(grid, rangeStart, rangeEnd, ptSizeStart, ptSizeEnd, colorStart, colorEnd, INT) {
+    // int-grid overload: all work is done by the templated constructor it delegates to, tagged INT
+}
+
+template <typename T>
+GridScalarAttributePainter::GridScalarAttributePainter(Grid<T> *grid, float rangeStart, float rangeEnd,
+                                                       float ptSizeStart, float ptSizeEnd, const glm::vec3 &colorStart,
+                                                       const glm::vec3 &colorEnd, Type type) : type(type) {
+    // shared constructor body: `type` decides which attribute pointer is set and which shader input is used
+    if (type == FLOAT) {
+        _attributesF = &dynamic_cast<Grid<float>*>(grid)->_contents; // NOTE(review): the cast result is dereferenced unchecked — assumes T is float whenever type == FLOAT
+        MAX_ATTRIBUTES = _attributesF->size();
+    } else if (type == INT) {
+        _attributesI = &dynamic_cast<Grid<int>*>(grid)->_contents; // NOTE(review): same unchecked assumption, for T == int
+        MAX_ATTRIBUTES = _attributesI->size();
+    }
+    // identity index list 0 .. MAX_ATTRIBUTES-1, one element per grid entry
+    std::vector<int> indices;
+    for (int i = 0; i < MAX_ATTRIBUTES; i++) {
+        indices.push_back(i);
+    }
+
+    // compile shaders
+    GLuint gridScalVert = compileShader(gridScal_vert, GL_VERTEX_SHADER);
+    GLuint gridScalFrag = compileShader(gridScal_frag, GL_FRAGMENT_SHADER);
+
+    std::vector<GLuint> programs = {gridScalVert, gridScalFrag};
+    prog = makeProgram(programs);
+
+    // setup shader locations
+    unifViewProj = glGetUniformLocation(prog, "u_viewProj");
+    if (type == FLOAT) {
+        attrData = glGetAttribLocation(prog, "f_data");
+    } else if (type == INT) {
+        attrData = glGetAttribLocation(prog, "i_data");
+    }
+    // colour / point-size / value-range ramp parameters and the float-vs-int switch
+    unifColStart = glGetUniformLocation(prog, "u_colStart");
+    unifColEnd = glGetUniformLocation(prog, "u_colEnd");
+    unifSizeStart = glGetUniformLocation(prog, "u_sizeStart");
+    unifSizeEnd = glGetUniformLocation(prog, "u_sizeEnd");
+    unifRangeStart = glGetUniformLocation(prog, "u_rangeStart");
+    unifRangeEnd = glGetUniformLocation(prog, "u_rangeEnd");
+    unifType = glGetUniformLocation(prog, "u_type");
+    // grid layout; the vertex shader rebuilds each cell position from gl_VertexID with these
+    unifCellSize = glGetUniformLocation(prog, "u_cellSize");
+    unifCellCount = glGetUniformLocation(prog, "u_cellCount");
+    unifOrigin = glGetUniformLocation(prog, "u_origin");
+
+    // make a buffer for the indices
+    glGenBuffers(1, &index_buffer);
+    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, index_buffer);
+    glBufferData(GL_ELEMENT_ARRAY_BUFFER, MAX_ATTRIBUTES * sizeof(int), indices.data(), GL_STATIC_DRAW);
+
+    // make a buffer for the attributes (storage only here; draw() uploads the values)
+    if (type == FLOAT) {
+        glGenBuffers(1, &attribute_buffer);
+        glBindBuffer(GL_ARRAY_BUFFER, attribute_buffer);
+        glBufferData(GL_ARRAY_BUFFER, MAX_ATTRIBUTES * sizeof(float), NULL, GL_STREAM_DRAW);
+    } else if (type == INT) {
+        glGenBuffers(1, &attribute_buffer);
+        glBindBuffer(GL_ARRAY_BUFFER, attribute_buffer);
+        glBufferData(GL_ARRAY_BUFFER, MAX_ATTRIBUTES * sizeof(int), NULL, GL_STREAM_DRAW);
+    }
+
+    // set grid uniforms (u_type: 0 selects f_data, 1 selects i_data in the vertex shader)
+    glUseProgram(prog);
+    if (type == FLOAT) {
+        glUniform1i(unifType, 0);
+    } else if (type == INT) {
+        glUniform1i(unifType, 1);
+    }
+    glUniform1f(unifCellSize, grid->_cellSize);
+    glm::ivec3 count = glm::ivec3(grid->_countX, grid->_countY, grid->_countZ);
+    glUniform3iv(unifCellCount, 1, &(count.x));
+    glm::vec3 o = grid->_origin + grid->_offset;
+    glUniform3fv(unifOrigin, 1, &(o[0]));
+    // ramp endpoints, set once here
+    glUniform3fv(unifColStart, 1, &(colorStart[0]));
+    glUniform3fv(unifColEnd, 1, &(colorEnd[0]));
+    glUniform1f(unifRangeStart, rangeStart);
+    glUniform1f(unifRangeEnd, rangeEnd);
+    glUniform1f(unifSizeStart, ptSizeStart);
+    glUniform1f(unifSizeEnd, ptSizeEnd);
+}
+
+void GridScalarAttributePainter::draw() const { // draws one GL point per grid entry; the shaders size and colour it from the entry's value
+    if (type == FLOAT) {
+        if (_attributesF != nullptr) {
+            // NOTE(review): the upload below copies MAX_ATTRIBUTES values, a count
+            // captured in the constructor, while glDrawElements uses the vector's
+            // current size(). The two only agree while the grid keeps its original
+            // size — confirm the grid is never resized after this painter is built.
+            // GL_PROGRAM_POINT_SIZE is enabled here and is not disabled afterwards.
+
+            glUseProgram(prog);
+            glEnable(GL_PROGRAM_POINT_SIZE);
+
+            // bind and send new data (glBufferData with NULL re-allocates the store before the copy)
+            glBindBuffer(GL_ARRAY_BUFFER, attribute_buffer);
+            glBufferData(GL_ARRAY_BUFFER, MAX_ATTRIBUTES * sizeof(float), NULL, GL_STREAM_DRAW);
+            glBufferSubData(GL_ARRAY_BUFFER, 0, MAX_ATTRIBUTES * sizeof(float), _attributesF->data());
+
+            glEnableVertexAttribArray(attrData);
+            glVertexAttribPointer(attrData, 1, GL_FLOAT, GL_FALSE, sizeof(float), (void*)0);
+
+            glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, index_buffer);
+            glDrawElements(GL_POINTS, _attributesF->size(), GL_UNSIGNED_INT, 0);
+
+            glDisableVertexAttribArray(attrData);
+        }
+    } else if (type == INT) {
+        if (_attributesI != nullptr) {
+            glUseProgram(prog);
+            glEnable(GL_PROGRAM_POINT_SIZE);
+
+            // bind and send new data (same pattern as the float branch, with int-sized elements)
+            glBindBuffer(GL_ARRAY_BUFFER, attribute_buffer);
+            glBufferData(GL_ARRAY_BUFFER, MAX_ATTRIBUTES * sizeof(int), NULL, GL_STREAM_DRAW);
+            glBufferSubData(GL_ARRAY_BUFFER, 0, MAX_ATTRIBUTES * sizeof(int), _attributesI->data());
+
+            glEnableVertexAttribArray(attrData);
+            glVertexAttribIPointer(attrData, 1, GL_INT, sizeof(int), (void*)0); // I-variant: values reach the shader's `in int` input as integers
+
+            glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, index_buffer);
+            glDrawElements(GL_POINTS, _attributesI->size(), GL_UNSIGNED_INT, 0);
+
+            glDisableVertexAttribArray(attrData);
+        }
+    }
+}
+
+void GridScalarAttributePainter::setViewProj(const float *viewProj) {
+    glUseProgram(prog);  // the uniform upload targets the program currently in use
+    glUniformMatrix4fv(unifViewProj, 1, GL_FALSE, viewProj);  // uploads one untransposed mat4 (16 floats) to u_viewProj
+}
diff --git a/core/display/painters/GridScalarAttributePainter.h b/core/display/painters/GridScalarAttributePainter.h
new file mode 100644
index 00000000..b96024f4
--- /dev/null
+++ b/core/display/painters/GridScalarAttributePainter.h
@@ -0,0 +1,74 @@
+//
+// Created by austin on 3/28/16.
+//
+
+#ifndef FLUIDSOLVER_GRIDSCALARATTRIBUTEPAINTER_H
+#define FLUIDSOLVER_GRIDSCALARATTRIBUTEPAINTER_H
+
+#include "Painter.h"
+#include <core/solver/grid/Grid.h>
+
+class GridScalarAttributePainter : public Painter { // draws a scalar Grid (float or int) as one GL point per entry
+public:
+    enum Type { // element type of the grid being painted
+        INT,
+        FLOAT
+    };
+    // shared implementation constructor; NOTE(review): it is defined in the .cpp, so other translation units cannot instantiate it — the typed overloads below are the usable entry points
+    template <typename T> explicit GridScalarAttributePainter(Grid<T>* grid,
+                                        float rangeStart,
+                                        float rangeEnd,
+                                        float ptSizeStart,
+                                        float ptSizeEnd,
+                                        const glm::vec3 &colorStart,
+                                        const glm::vec3 &colorEnd,
+                                        Type type);
+    // paint a Grid<float>; values are mapped from [rangeStart, rangeEnd] onto the point-size and colour ramps
+    explicit GridScalarAttributePainter(Grid<float>* grid,
+                                        float rangeStart,
+                                        float rangeEnd,
+                                        float ptSizeStart,
+                                        float ptSizeEnd,
+                                        const glm::vec3 &colorStart,
+                                        const glm::vec3 &colorEnd);
+    // paint a Grid<int>; same parameters as the float overload
+    explicit GridScalarAttributePainter(Grid<int>* grid,
+                                        float rangeStart,
+                                        float rangeEnd,
+                                        float ptSizeStart,
+                                        float ptSizeEnd,
+                                        const glm::vec3 &colorStart,
+                                        const glm::vec3 &colorEnd);
+
+    virtual void draw() const;
+    virtual void setViewProj(const float* viewProj);
+
+private:
+    Type type;
+    unsigned int MAX_ATTRIBUTES = 10000; // overwritten in the constructor with the grid's element count
+    GLuint index_buffer;
+    GLuint attribute_buffer;
+
+    GLint attrIndex; // NOTE(review): not assigned in the constructor shown in GridScalarAttributePainter.cpp — possibly unused
+    GLint attrData; // location of "f_data" or "i_data", depending on type
+
+    GLint unifViewProj;
+    // locations of the ramp uniforms and of the u_type switch
+    GLint unifColStart;
+    GLint unifColEnd;
+    GLint unifSizeStart;
+    GLint unifSizeEnd;
+    GLint unifRangeStart;
+    GLint unifRangeEnd;
+    GLint unifType;
+    // locations of the grid-layout uniforms
+    GLint unifCellSize;
+    GLint unifCellCount;
+    GLint unifOrigin;
+    // only the pointer matching `type` is set by the constructor; the other one is left uninitialised
+    std::vector<float>* _attributesF;
+    std::vector<int>* _attributesI;
+};
+
+
+#endif //FLUIDSOLVER_GRIDSCALARATTRIBUTEPAINTER_H
diff --git a/core/display/painters/GridVectorAttributePainter.cpp b/core/display/painters/GridVectorAttributePainter.cpp
new file mode 100644
index 00000000..e7f7121f
--- /dev/null
+++ b/core/display/painters/GridVectorAttributePainter.cpp
@@ -0,0 +1,83 @@
+//
+// Created by austin on 3/22/16.
+//
+
+#include "GridVectorAttributePainter.h"
+#include <core/display/shaders/gridAttr.frag.h>
+#include <core/display/shaders/gridAttr.vert.h>
+#include <core/display/shaders/gridAttr.geo.h>
+
+GridVectorAttributePainter::GridVectorAttributePainter(Grid<float> *grid,
+                                                       float ptSize,
+                                                       const glm::vec3 &color,
+                                                       const glm::vec3 &dir) :
+        _ptSize(ptSize), _attributes(&grid->_contents) {
+    // the grid's size at construction time fixes the size of both GL buffers
+    MAX_ATTRIBUTES = (unsigned int) _attributes->size();
+    // identity index list: element n refers to grid entry n
+    std::vector<int> cellIds(_attributes->size());
+    for (int n = 0; n < cellIds.size(); ++n) {
+        cellIds[n] = n;
+    }
+
+    // build the vertex -> geometry -> fragment program
+    std::vector<GLuint> stages;
+    stages.push_back(compileShader(gridAttr_vert, GL_VERTEX_SHADER));
+    stages.push_back(compileShader(gridAttr_geo, GL_GEOMETRY_SHADER));
+    stages.push_back(compileShader(gridAttr_frag, GL_FRAGMENT_SHADER));
+    prog = makeProgram(stages);
+
+    // cache the attribute and uniform locations
+    unifViewProj = glGetUniformLocation(prog, "u_viewProj");
+    attrData = glGetAttribLocation(prog, "v_data");
+    unifCol = glGetUniformLocation(prog, "u_col");
+    unifCellSize = glGetUniformLocation(prog, "u_cellSize");
+    unifCellCount = glGetUniformLocation(prog, "u_cellCount");
+    unifOrigin = glGetUniformLocation(prog, "u_origin");
+    unifVec = glGetUniformLocation(prog, "u_vec");
+
+    // static element buffer holding the index list
+    glGenBuffers(1, &index_buffer);
+    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, index_buffer);
+    glBufferData(GL_ELEMENT_ARRAY_BUFFER, MAX_ATTRIBUTES * sizeof(int), cellIds.data(), GL_STATIC_DRAW);
+
+    // streaming attribute buffer: storage only here, draw() fills it on each call
+    glGenBuffers(1, &attribute_buffer);
+    glBindBuffer(GL_ARRAY_BUFFER, attribute_buffer);
+    glBufferData(GL_ARRAY_BUFFER, MAX_ATTRIBUTES * sizeof(float), NULL, GL_STREAM_DRAW);
+
+    // uniforms set once here: grid layout, line colour and line direction
+    glUseProgram(prog);
+    glUniform1f(unifCellSize, grid->_cellSize);
+    const glm::ivec3 cells(grid->_countX, grid->_countY, grid->_countZ);
+    glUniform3iv(unifCellCount, 1, &(cells.x));
+    const glm::vec3 gridOrigin = grid->_origin + grid->_offset;
+    glUniform3fv(unifOrigin, 1, &(gridOrigin[0]));
+    glUniform3fv(unifCol, 1, &(color[0]));
+    glUniform3fv(unifVec, 1, &(dir[0]));
+}
+
+void GridVectorAttributePainter::draw() const { // uploads the grid values and draws one point per entry; the geometry shader expands each into a line
+    if (_attributes != nullptr) {
+        glUseProgram(prog);
+        // NOTE(review): the copy uses MAX_ATTRIBUTES (captured in the constructor) but the draw uses the current size() — confirm the grid is never resized
+        // bind and send new data
+        glBindBuffer(GL_ARRAY_BUFFER, attribute_buffer);
+        glBufferData(GL_ARRAY_BUFFER, MAX_ATTRIBUTES * sizeof(float), NULL, GL_STREAM_DRAW);
+        glBufferSubData(GL_ARRAY_BUFFER, 0, MAX_ATTRIBUTES * sizeof(float), _attributes->data());
+
+        glEnableVertexAttribArray(attrData);
+        glVertexAttribPointer(attrData, 1, GL_FLOAT, GL_FALSE, sizeof(float), (void*)0);
+
+        glPointSize(_ptSize);
+        glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, index_buffer);
+        glDrawElements(GL_POINTS, _attributes->size(), GL_UNSIGNED_INT, 0);
+
+        glDisableVertexAttribArray(attrData);
+    }
+}
+
+void GridVectorAttributePainter::setViewProj(const float *viewProj) {
+    glUseProgram(prog);  // select this painter's program before writing its uniform
+    glUniformMatrix4fv(unifViewProj, 1, GL_FALSE, viewProj);  // u_viewProj is consumed by the geometry shader; 16 floats, not transposed
+}
diff --git a/core/display/painters/GridVectorAttributePainter.h b/core/display/painters/GridVectorAttributePainter.h
new file mode 100644
index 00000000..68042ddd
--- /dev/null
+++ b/core/display/painters/GridVectorAttributePainter.h
@@ -0,0 +1,36 @@
+//
+// Created by austin on 3/22/16.
+//
+
+#ifndef FLUIDSOLVER_GRIDVECTORATTRIBUTEPAINTER_H
+#define FLUIDSOLVER_GRIDVECTORATTRIBUTEPAINTER_H
+
+#include "Painter.h"
+#include <core/solver/grid/Grid.h>
+
+class GridVectorAttributePainter : public Painter { // draws each Grid<float> entry as a line along a fixed direction, scaled by the entry's value
+public:
+    explicit GridVectorAttributePainter(Grid<float>* grid, float ptSize, const glm::vec3 &color, const glm::vec3 &dir); // color -> u_col, dir -> u_vec
+    virtual void draw() const;
+    virtual void setViewProj(const float* viewProj);
+
+private:
+    unsigned int MAX_ATTRIBUTES = 10000; // overwritten in the constructor with the grid's element count
+    GLuint index_buffer;
+    GLuint attribute_buffer;
+    // shader locations
+    GLint unifViewProj;
+    GLint attrIndex; // NOTE(review): not assigned in the constructor shown in GridVectorAttributePainter.cpp — possibly unused
+    GLint attrData;
+    GLint unifCol;
+    GLint unifCellSize;
+    GLint unifCellCount;
+    GLint unifOrigin;
+    GLint unifVec;
+
+    GLfloat _ptSize; // passed to glPointSize() in draw()
+    std::vector<float>* _attributes; // points at the grid's own _contents vector; nothing is copied
+};
+
+
+#endif //FLUIDSOLVER_GRIDVECTORATTRIBUTEPAINTER_H
diff --git a/core/display/painters/Painter.cpp b/core/display/painters/Painter.cpp
new file mode 100644
index 00000000..a5c0d0ab
--- /dev/null
+++ b/core/display/painters/Painter.cpp
@@ -0,0 +1,62 @@
+//
+// Created by austin on 2/29/16.
+//
+
+#include "Painter.h"
+#include <iostream>
+
+// Compiles one GLSL stage from `shader` source text. Returns the shader object name,
+// or 0 when compilation fails (the source and the driver's info log go to stderr).
+GLuint Painter::compileShader(const char* shader, GLenum type) {
+    GLuint shaderId = glCreateShader(type);
+    glShaderSource(shaderId, 1, &shader, NULL);  // core entry point: the ARB variant takes a GLhandleARB, which is not a GLuint on every platform
+    glCompileShader(shaderId);
+
+    GLint success = 0;
+    glGetShaderiv(shaderId, GL_COMPILE_STATUS, &success);
+    if (success == GL_FALSE) {
+        fprintf(stderr, "Failed to compile shader!\n%s\n", shader);
+
+        GLint maxLength = 0;
+        glGetShaderiv(shaderId, GL_INFO_LOG_LENGTH, &maxLength);
+        std::vector<GLchar> errorLog(maxLength > 0 ? maxLength : 1, '\0');  // never empty, so &errorLog[0] is valid and NUL-terminated
+        glGetShaderInfoLog(shaderId, maxLength, &maxLength, &errorLog[0]);
+        fprintf(stderr, "%s\n", &errorLog[0]);
+
+        glDeleteShader(shaderId);
+        return 0;  // do not hand back the name that was just deleted
+    }
+    return shaderId;
+}
+
+GLuint Painter::makeProgram(const std::vector<GLuint> &programs) {
+    // attach every compiled stage to a fresh program object and link it
+    const GLuint linkedProg = glCreateProgram();
+    for (auto stage = programs.begin(); stage != programs.end(); ++stage) {
+        glAttachShader(linkedProg, *stage);
+    }
+    glLinkProgram(linkedProg);
+    // the stage objects are released as soon as linking has been attempted
+    for (auto stage = programs.begin(); stage != programs.end(); ++stage) {
+        glDetachShader(linkedProg, *stage);
+        glDeleteShader(*stage);
+    }
+    GLint linkStatus;
+    glGetProgramiv(linkedProg, GL_LINK_STATUS, &linkStatus);
+    if (linkStatus) {
+        return linkedProg;
+    }
+    // link failed: report it, print the info log if there is one, and still return the program name
+    std::cerr << "Failed to link program!" << std::endl;
+    GLint logLength;
+    glGetProgramiv(linkedProg, GL_INFO_LOG_LENGTH, &logLength);
+    if (logLength > 0) {
+        std::vector<char> logText(logLength + 1);
+        glGetProgramInfoLog(linkedProg, logLength, NULL, &logText[0]);
+        fprintf(stderr, "%s\n", &logText[0]);
+    }
+    return linkedProg;
+}
+
+void Painter::setViewProj(const float *viewProj) {
+    // default implementation ignores viewProj; derived painters override this to upload the matrix
+}
diff --git a/core/display/painters/Painter.h b/core/display/painters/Painter.h
new file mode 100644
index 00000000..42d2a714
--- /dev/null
+++ b/core/display/painters/Painter.h
@@ -0,0 +1,24 @@
+//
+// Created by austin on 2/29/16.
+//
+
+#ifndef FLUIDSOLVER_PAINTER_H
+#define FLUIDSOLVER_PAINTER_H
+
+#include <GL/glew.h>
+#include <vector>
+
+class Painter { // abstract base for drawable objects; derived painters supply draw()
+public:
+    virtual ~Painter() {} // virtual so a derived painter is destroyed correctly when deleted through a Painter*
+    virtual void draw() const = 0; // issue the GL draw calls for this painter
+    virtual void setViewProj(const float* viewProj); // base version does nothing; overrides upload viewProj as one mat4 (16 floats)
+protected:
+    GLuint prog; // GL program name; derived constructors assign it from makeProgram()
+
+    GLuint compileShader(const char* shader, GLenum type); // compiles one GLSL stage from source text
+    GLuint makeProgram(const std::vector<GLuint> &programs); // links compiled stages into a program and deletes the stage objects
+};
+
+
+#endif //FLUIDSOLVER_PAINTER_H
diff --git a/core/display/painters/ParticlesPainter.cpp b/core/display/painters/ParticlesPainter.cpp
new file mode 100644
index 00000000..fc7bbac2
--- /dev/null
+++ b/core/display/painters/ParticlesPainter.cpp
@@ -0,0 +1,64 @@
+//
+// Created by austin on 2/29/16.
+//
+
+#include "ParticlesPainter.h"
+#include <core/display/shaders/particle.frag.h>
+#include <core/display/shaders/particle.vert.h>
+
+ParticlesPainter::ParticlesPainter(FluidSolver* solver, float ptSize) : _ptSize(ptSize), _particles(&solver->_particles) {
+    MAX_PARTICLES = (unsigned int) _particles->size(); // particle count at construction time; sizes the initial buffer store below
+
+    // compile shaders
+    GLuint particleVert = compileShader(particle_vert, GL_VERTEX_SHADER);
+    GLuint particleFrag = compileShader(particle_frag, GL_FRAGMENT_SHADER);
+
+    std::vector<GLuint> programs = {particleVert, particleFrag};
+    prog = makeProgram(programs);
+
+    // setup shader locations
+    unifViewProj = glGetUniformLocation(prog, "u_viewProj");
+    attrPos = glGetAttribLocation(prog, "v_pos");
+    attrVel = glGetAttribLocation(prog, "v_vel");
+    attrCol = glGetAttribLocation(prog, "v_col");
+
+    // make a buffer for the particles (storage only; draw() uploads the data)
+    glGenBuffers(1, &particle_buffer);
+    glBindBuffer(GL_ARRAY_BUFFER, particle_buffer);
+    glBufferData(GL_ARRAY_BUFFER, MAX_PARTICLES * sizeof(FluidParticle), NULL, GL_STREAM_DRAW);
+}
+
+// Uploads the solver's current particles and draws them as GL points.
+// Does nothing when there is no particle list or the list is empty.
+void ParticlesPainter::draw() const {
+    if (_particles != nullptr && !_particles->empty()) {
+        glUseProgram(prog);
+        // Size the upload from the vector as it is now, not from the count cached in the
+        // constructor: the particle list may have grown or shrunk since then, and both the
+        // copy and glDrawArrays below must stay inside the same range.
+        const GLsizei count = (GLsizei) _particles->size();
+        glBindBuffer(GL_ARRAY_BUFFER, particle_buffer);
+        glBufferData(GL_ARRAY_BUFFER, count * sizeof(FluidParticle), _particles->data(), GL_STREAM_DRAW);
+
+        // particle positions, offset by pos attribute, jumping by FluidParticle size
+        glEnableVertexAttribArray(attrPos);
+        glVertexAttribPointer(attrPos, 3, GL_FLOAT, GL_FALSE, sizeof(FluidParticle), (void*)offsetof(FluidParticle, pos));
+        // particle velocities, offset by vel attribute
+        glEnableVertexAttribArray(attrVel);
+        glVertexAttribPointer(attrVel, 3, GL_FLOAT, GL_FALSE, sizeof(FluidParticle), (void*)offsetof(FluidParticle, vel));
+        // particle colors, offset by col attribute
+        glEnableVertexAttribArray(attrCol);
+        glVertexAttribPointer(attrCol, 3, GL_FLOAT, GL_FALSE, sizeof(FluidParticle), (void*)offsetof(FluidParticle, col));
+
+        glPointSize(_ptSize);
+        glDrawArrays(GL_POINTS, 0, count);
+        glDisableVertexAttribArray(attrPos);
+        glDisableVertexAttribArray(attrVel);
+        glDisableVertexAttribArray(attrCol);
+    }
+}
+
+void ParticlesPainter::setViewProj(const float *viewProj) {
+    glUseProgram(prog);  // uniform writes go to the program in use, so switch to the particle program
+    glUniformMatrix4fv(unifViewProj, 1, GL_FALSE, viewProj);  // one untransposed mat4 read from viewProj (16 floats)
+}
diff --git a/core/display/painters/ParticlesPainter.h b/core/display/painters/ParticlesPainter.h
new file mode 100644
index 00000000..2b1cd41b
--- /dev/null
+++ b/core/display/painters/ParticlesPainter.h
@@ -0,0 +1,31 @@
+//
+// Created by austin on 2/29/16.
+//
+
+#ifndef FLUIDSOLVER_PARTICLESPAINTER_H
+#define FLUIDSOLVER_PARTICLESPAINTER_H
+
+#include "Painter.h"
+#include <core/solver/FluidSolver.h>
+
+class ParticlesPainter : public Painter { // draws a FluidSolver's particles as GL points
+public:
+    ParticlesPainter(FluidSolver* solver, float ptSize = 3.f); // keeps a pointer to solver->_particles; the particles are not copied
+    virtual void draw() const;
+    virtual void setViewProj(const float* viewProj);
+
+private:
+    unsigned int MAX_PARTICLES = 10000; // overwritten in the constructor with the particle count at that time
+    GLuint particle_buffer;
+    // locations looked up in the particle shader program
+    GLint unifViewProj;
+    GLint attrPos;
+    GLint attrVel;
+    GLint attrCol;
+
+    GLfloat _ptSize; // passed to glPointSize() in draw()
+    std::vector<FluidParticle>* _particles;
+};
+
+
+#endif //FLUIDSOLVER_PARTICLESPAINTER_H
diff --git a/core/display/shaders/flat.frag.h b/core/display/shaders/flat.frag.h
new file mode 100644
index 00000000..6e0ac8b7
--- /dev/null
+++ b/core/display/shaders/flat.frag.h
@@ -0,0 +1,14 @@
+//
+// Created by austin on 2/29/16.
+//
+
+const char* flat_frag = R"(
+#version 150
+
+in vec3 f_col;
+out vec4 out_Col;
+
+void main() {
+    out_Col = vec4(f_col.rgb, 1);
+}
+)";
diff --git a/core/display/shaders/flat.vert.h b/core/display/shaders/flat.vert.h
new file mode 100644
index 00000000..cfb8ade4
--- /dev/null
+++ b/core/display/shaders/flat.vert.h
@@ -0,0 +1,24 @@
+//
+// Created by austin on 2/29/16.
+//
+
+#ifndef FLUIDSOLVER_FLAT_VERT_H_H
+#define FLUIDSOLVER_FLAT_VERT_H_H
+// GLSL 1.50 vertex shader source: transforms v_pos by u_viewProj and passes v_col through as f_col.
+const char* flat_vert = R"(
+#version 150
+
+uniform mat4 u_viewProj;
+
+in vec3 v_pos;
+in vec3 v_col;
+
+out vec3 f_col;
+
+void main() {
+    f_col = v_col;
+    gl_Position = u_viewProj * vec4(v_pos, 1);
+}
+)";
+
+#endif //FLUIDSOLVER_FLAT_VERT_H_H
\ No newline at end of file
diff --git a/core/display/shaders/gridAttr.frag.h b/core/display/shaders/gridAttr.frag.h
new file mode 100644
index 00000000..9c6910aa
--- /dev/null
+++ b/core/display/shaders/gridAttr.frag.h
@@ -0,0 +1,19 @@
+//
+// Created by austin on 2/28/16.
+//
+
+#ifndef FLUIDSOLVER_GRIDATTR_FRAG_H
+#define FLUIDSOLVER_GRIDATTR_FRAG_H
+// GLSL 1.50 fragment shader source: outputs u_col scaled by the f_scale received from the geometry stage.
+const char* gridAttr_frag = R"(
+#version 150
+
+uniform vec3 u_col;
+out vec4 out_Col;
+in float f_scale;
+
+void main() {
+    out_Col = vec4(f_scale*u_col.rgb, 1);
+}
+)";
+#endif
\ No newline at end of file
diff --git a/core/display/shaders/gridAttr.geo.h b/core/display/shaders/gridAttr.geo.h
new file mode 100644
index 00000000..b17c128b
--- /dev/null
+++ b/core/display/shaders/gridAttr.geo.h
@@ -0,0 +1,32 @@
+//
+// Created by austin on 3/22/16.
+//
+
+#ifndef FLUIDSOLVER_GRIDATTR_GEO_H
+#define FLUIDSOLVER_GRIDATTR_GEO_H
+// GLSL 1.50 geometry shader source: turns each input point into a 2-vertex line from the point to point + g_data * u_vec, and sets f_scale = |g_data|.
+const char* gridAttr_geo = R"(
+#version 150
+
+layout(points) in;
+layout(line_strip, max_vertices = 2) out;
+
+uniform mat4 u_viewProj;
+uniform vec3 u_vec;
+
+in float g_data[];
+out float f_scale;
+
+void main() {
+    f_scale = abs(g_data[0]);
+
+    gl_Position = u_viewProj * gl_in[0].gl_Position;
+    EmitVertex();
+
+    gl_Position = u_viewProj * (gl_in[0].gl_Position + g_data[0]*vec4(u_vec, 0));
+    EmitVertex();
+    EndPrimitive();
+}
+)";
+
+#endif //FLUIDSOLVER_GRIDATTR_GEO_H
diff --git a/core/display/shaders/gridAttr.vert.h b/core/display/shaders/gridAttr.vert.h
new file mode 100644
index 00000000..b524d604
--- /dev/null
+++ b/core/display/shaders/gridAttr.vert.h
@@ -0,0 +1,32 @@
+//
+// Created by austin on 2/28/16.
+//
+
+#ifndef FLUIDSOLVER_GRIDATTR_VERT_H
+#define FLUIDSOLVER_GRIDATTR_VERT_H
+// GLSL 1.50 vertex shader source: derives cell (i, j, k) from gl_VertexID and u_cellCount, emits index * u_cellSize + u_origin untransformed (u_viewProj is applied in the geometry shader) and forwards v_data as g_data.
+const char* gridAttr_vert = R"(
+#version 150
+
+uniform float u_cellSize;
+uniform ivec3 u_cellCount;
+uniform vec3 u_origin;
+
+in float v_data;
+out float g_data;
+
+void main() {
+    g_data = v_data;
+
+    int i = int(mod(gl_VertexID, u_cellCount.x));
+    int j = int(mod(gl_VertexID / u_cellCount.x, u_cellCount.y));
+    int k = int(gl_VertexID / (u_cellCount.x * u_cellCount.y));
+
+    vec3 pos = vec3(float(i), float(j), float(k)) * u_cellSize + u_origin;
+
+    gl_Position = vec4(pos, 1);
+}
+)";
+
+
+#endif
\ No newline at end of file
diff --git a/core/display/shaders/gridScal.frag.h b/core/display/shaders/gridScal.frag.h
new file mode 100644
index 00000000..63b352fb
--- /dev/null
+++ b/core/display/shaders/gridScal.frag.h
@@ -0,0 +1,22 @@
+//
+// Created by austin on 3/28/16.
+//
+
+#ifndef FLUIDSOLVER_GRIDSCAL_FRAG_H
+#define FLUIDSOLVER_GRIDSCAL_FRAG_H
+// GLSL 1.50 fragment shader source: linear blend from u_colStart (amount = 0) to u_colEnd (amount = 1); amount is not clamped here.
+const char* gridScal_frag = R"(
+#version 150
+
+uniform vec3 u_colStart;
+uniform vec3 u_colEnd;
+
+in float amount;
+out vec4 out_Col;
+
+void main() {
+    out_Col = vec4((1 - amount)*u_colStart + amount*u_colEnd, 1);
+}
+)";
+
+#endif //FLUIDSOLVER_GRIDSCAL_FRAG_H
diff --git a/core/display/shaders/gridScal.vert.h b/core/display/shaders/gridScal.vert.h
new file mode 100644
index 00000000..19861d6f
--- /dev/null
+++ b/core/display/shaders/gridScal.vert.h
@@ -0,0 +1,48 @@
+//
+// Created by austin on 3/28/16.
+//
+
+#ifndef FLUIDSOLVER_GRIDSCAL_VERT_H
+#define FLUIDSOLVER_GRIDSCAL_VERT_H
+// GLSL 1.50 vertex shader source: places the point at its grid cell (from gl_VertexID), computes amount = (value - u_rangeStart) / (u_rangeEnd - u_rangeStart) from i_data when u_type > 0, else f_data, and interpolates gl_PointSize between u_sizeStart and u_sizeEnd by amount.
+const char* gridScal_vert = R"(
+#version 150
+
+uniform mat4 u_viewProj;
+
+uniform float u_cellSize;
+uniform ivec3 u_cellCount;
+uniform vec3 u_origin;
+
+uniform float u_sizeStart;
+uniform float u_sizeEnd;
+uniform float u_rangeStart;
+uniform float u_rangeEnd;
+
+uniform int u_type;
+
+in float f_data;
+in int i_data;
+out float amount;
+
+void main() {
+
+    int i = int(mod(gl_VertexID, u_cellCount.x));
+    int j = int(mod(gl_VertexID / u_cellCount.x, u_cellCount.y));
+    int k = int(gl_VertexID / (u_cellCount.x * u_cellCount.y));
+
+    vec3 pos = vec3(float(i), float(j), float(k)) * u_cellSize + u_origin;
+
+    gl_Position = u_viewProj * vec4(pos, 1);
+
+    if (u_type > 0) {
+        amount = (float(i_data) - u_rangeStart) / (u_rangeEnd - u_rangeStart);
+    } else {
+        amount = (f_data - u_rangeStart) / (u_rangeEnd - u_rangeStart);
+    }
+
+    gl_PointSize = u_sizeStart  + amount * (u_sizeEnd - u_sizeStart);
+}
+)";
+
+#endif //FLUIDSOLVER_GRIDSCAL_VERT_H
diff --git a/core/display/shaders/particle.frag.h b/core/display/shaders/particle.frag.h
new file mode 100644
index 00000000..ed0d9e43
--- /dev/null
+++ b/core/display/shaders/particle.frag.h
@@ -0,0 +1,21 @@
+//
+// Created by austin on 2/28/16.
+//
+
+#ifndef FLUIDSOLVER_PARTICLE_FRAG_H
+#define FLUIDSOLVER_PARTICLE_FRAG_H
+// GLSL 1.50 fragment shader source: outputs f_col with length(f_vel) / 20 added to every channel.
+const char* particle_frag = R"(
+#version 150
+
+in vec3 f_col;
+in vec3 f_vel;
+out vec4 out_Col;
+
+void main() {
+    float fac = length(f_vel) / 20.0;
+    vec3 col = f_col + vec3(fac,fac,fac);
+    out_Col = vec4(col, 1);
+}
+)";
+#endif
\ No newline at end of file
diff --git a/core/display/shaders/particle.vert.h b/core/display/shaders/particle.vert.h
new file mode 100644
index 00000000..d5bed065
--- /dev/null
+++ b/core/display/shaders/particle.vert.h
@@ -0,0 +1,28 @@
+//
+// Created by austin on 2/28/16.
+//
+
+#ifndef FLUIDSOLVER_PARTICLE_VERT_H
+#define FLUIDSOLVER_PARTICLE_VERT_H
+// GLSL 1.50 vertex shader source: transforms v_pos by u_viewProj, forwards v_col and v_vel, and writes gl_PointSize = 3 (NOTE(review): that write only takes effect while GL_PROGRAM_POINT_SIZE is enabled — confirm which size is intended versus glPointSize in ParticlesPainter::draw).
+const char* particle_vert = R"(
+#version 150
+
+uniform mat4 u_viewProj;
+
+in vec3 v_pos;
+in vec3 v_vel;
+in vec3 v_col;
+
+out vec3 f_col;
+out vec3 f_vel;
+
+void main() {
+    f_col = v_col;
+    f_vel = v_vel;
+    gl_Position = u_viewProj * vec4(v_pos, 1);
+    gl_PointSize = 3;
+}
+)";
+
+#endif
\ No newline at end of file
diff --git a/core/fileIO/ParticlesWriter.cpp b/core/fileIO/ParticlesWriter.cpp
new file mode 100644
index 00000000..4f5b0eca
--- /dev/null
+++ b/core/fileIO/ParticlesWriter.cpp
@@ -0,0 +1,54 @@
+//
+// Created by austin on 3/21/16.
+//
+
+#include "ParticlesWriter.h"
+//#include <openvdb_points/tools/PointDataGrid.h>
+//#include <openvdb_points/tools/PointConversion.h>
+//#include <openvdb_points/tools/PointCount.h>
+
+//using namespace openvdb::tools;
+// Calls openvdb::initialize() every time a writer is constructed; the openvdb_points initialisation below is currently disabled.
+ParticlesWriter::ParticlesWriter() {
+    openvdb::initialize();
+//    openvdb::points::initialize();
+}
+// Nothing to release: the class declares no data members.
+ParticlesWriter::~ParticlesWriter() {
+
+}
+// Meant to export the solver's particles through OpenVDB. Currently a no-op: both drafts in the body (a FloatGrid of occupied cells, then an openvdb_points PointDataGrid of positions) are commented out.
+void ParticlesWriter::writeData(const FluidSolver* const solver, const std::string &filename) {
+//    openvdb::FloatGrid::Ptr grid = openvdb::FloatGrid::create();
+//    openvdb::FloatGrid::Accessor accessor = grid->getAccessor();
+//    openvdb::Coord xyz(0, 0, 0);
+
+//    for (const FluidParticle &particle : solver->_particles) {
+//        xyz.reset(particle.cell.x, particle.cell.y, particle.cell.z);
+//        accessor.setValue(xyz, 1.f);
+//    }
+//
+//    openvdb::io::File file(filename);
+//    openvdb::GridPtrVec grids;
+//    grids.push_back(grid);
+//
+//    file.write(grids);
+//    file.close();
+    // NOTE(review): the second draft below opens the literal "filename" rather than the `filename` argument.
+//    const float voxelSize = 10.0f;
+//    openvdb::math::Transform::Ptr transform = openvdb::math::Transform::createLinearTransform(voxelSize);
+//
+//    std::vector<openvdb::Vec3f> positions;
+//    for (const FluidParticle &particle : solver->_particles) {
+//        positions.push_back(openvdb::Vec3f(particle.pos.x, particle.pos.y, particle.pos.z));
+//    }
+//
+//    PointDataGrid::Ptr grid = createPointDataGrid<PointDataGrid>(positions, TypedAttributeArray<openvdb::Vec3f>::attributeType(), *transform);
+//
+//    openvdb::io::File file("filename");
+//    openvdb::GridPtrVec grids;
+//    grids.push_back(grid);
+//
+//    file.write(grids);
+//    file.close();
+}
diff --git a/core/fileIO/ParticlesWriter.h b/core/fileIO/ParticlesWriter.h
new file mode 100644
index 00000000..c2e28767
--- /dev/null
+++ b/core/fileIO/ParticlesWriter.h
@@ -0,0 +1,22 @@
+//
+// Created by austin on 3/21/16.
+//
+
+#ifndef FLUIDSOLVER_PARTICLESWRITER_H
+#define FLUIDSOLVER_PARTICLESWRITER_H
+
+#include <openvdb/openvdb.h>
+//#include <openvdb_points/openvdb.h>
+#include <core/solver/FluidSolver.h>
+
+class ParticlesWriter {
+    // Stateless helper intended to write a FluidSolver's particle data via OpenVDB.
+public:
+    ParticlesWriter();   // calls openvdb::initialize()
+    ~ParticlesWriter();
+
+    void writeData(const FluidSolver* const solver, const std::string &filename); // intended export entry point; the implementation body is currently commented out
+};
+
+
+#endif //FLUIDSOLVER_PARTICLESWRITER_H
diff --git a/core/fileIO/SceneLoader.cpp b/core/fileIO/SceneLoader.cpp
new file mode 100644
index 00000000..be81cad9
--- /dev/null
+++ b/core/fileIO/SceneLoader.cpp
@@ -0,0 +1,69 @@
+//
+// Created by austin on 2/26/16.
+//
+
+#include <fstream>
+#include <core/geometry/Box.h>
+#include "SceneLoader.h"
+#include <core/scenes/default.h>
+
+// Load a scene from the JSON file at `filepath`; a null path selects the built-in default scene. Exits the process on failure.
+FluidSolver* SceneLoader::LoadScene(const char *filepath) {
+    // Decide on the default scene first: constructing std::ifstream from a null pointer is undefined behaviour.
+    if (filepath == nullptr) {
+        return LoadScene(std::string(default_scene));
+    }
+
+    std::ifstream fileStream(filepath, std::ifstream::binary);
+    Json::Reader reader;
+    Json::Value root;
+    // A file that cannot be opened is reported the same way as one that cannot be parsed.
+    if (!fileStream.is_open() || !reader.parse(fileStream, root, false)) {
+        fprintf(stderr, "Failed to load json file %s!", filepath);
+        exit(EXIT_FAILURE);
+    }
+    return parseJson(root);
+}
+
+// Parse a scene from an in-memory JSON document; on a parse error the document is printed and the process exits.
+FluidSolver* SceneLoader::LoadScene(const std::string &jsonstring) {
+    Json::Reader reader;
+    Json::Value root;
+    if (!reader.parse(jsonstring, root, false)) {
+        // %s needs a C string: passing the std::string object itself through varargs is undefined behaviour.
+        fprintf(stderr, "Failed to load json string!\n %s", jsonstring.c_str());
+        exit(EXIT_FAILURE);
+    }
+    return parseJson(root);
+}
+
+// Build a solver from a parsed scene. "containerDim" and "particleDim" each hold a 3-element "position"
+// and "scale"; "resolution" is the number of grid cells along the container's longest side.
+FluidSolver* SceneLoader::parseJson(const Json::Value &root) {
+    // Reads node[key] as three floats.
+    auto readVec3 = [](const Json::Value &node, const char *key) -> glm::vec3 {
+        const Json::Value &triple = node[key];
+        return glm::vec3(triple[0u].asFloat(), triple[1u].asFloat(), triple[2u].asFloat());
+    };
+    const Json::Value &containerDim = root["containerDim"];
+    const Json::Value &particleDim = root["particleDim"];
+    const glm::vec3 containerSize = readVec3(containerDim, "scale");
+    const glm::vec3 containerPos = readVec3(containerDim, "position");
+    const glm::vec3 fluidSize = readVec3(particleDim, "scale");
+    const glm::vec3 fluidPos = readVec3(particleDim, "position");
+
+    // Dividing by a missing (read as 0) or negative resolution would give an infinite/negative cell size.
+    const float resolution = root["resolution"].asFloat();
+    if (!(resolution > 0.f)) {
+        fprintf(stderr, "Scene \"resolution\" must be a positive number!\n");
+        exit(EXIT_FAILURE);
+    }
+    const float cellSize = std::max(std::max(containerSize.x, containerSize.y), containerSize.z) / resolution;
+
+    // The solver deletes the container in its destructor, so it is heap-allocated and handed over here.
+    Box* container = new Box(containerPos, containerSize);
+    FluidSolver* solver = new FluidSolver(cellSize/2, cellSize);
+    solver->setContainer(container);
+    solver->addFluid(Box(fluidPos, fluidSize));
+    return solver;
+}
diff --git a/core/fileIO/SceneLoader.h b/core/fileIO/SceneLoader.h
new file mode 100644
index 00000000..47c60612
--- /dev/null
+++ b/core/fileIO/SceneLoader.h
@@ -0,0 +1,23 @@
+//
+// Created by austin on 2/26/16.
+//
+
+#ifndef FLUIDSOLVER_SCENELOADER_H
+#define FLUIDSOLVER_SCENELOADER_H
+
+#include <json/json.h>
+#include <core/solver/FluidSolver.h>
+
+class SceneLoader { // static-only helper that turns scene JSON into a configured FluidSolver
+public:
+    static FluidSolver* LoadScene(const char* filepath);           // from a JSON file; a null path selects the built-in default scene
+    static FluidSolver* LoadScene(const std::string &jsonstring);  // from an in-memory JSON document
+
+    static const char * defaultScene; // NOTE(review): no definition is visible next to this declaration; SceneLoader.cpp reads `default_scene` from core/scenes/default.h instead
+
+private:
+    static FluidSolver* parseJson(const Json::Value &root);        // shared back end of both LoadScene overloads
+};
+
+
+#endif //FLUIDSOLVER_SCENELOADER_H
diff --git a/core/geometry/Bound.cpp b/core/geometry/Bound.cpp
new file mode 100644
index 00000000..2c0adc7f
--- /dev/null
+++ b/core/geometry/Bound.cpp
@@ -0,0 +1,158 @@
+//
+// Created by austin on 2/27/16.
+//
+
+#include "Bound.h"
+// Default bound: a degenerate box at the origin (all six limits zero) rather than six indeterminate floats.
+Bound::Bound() : Bound(0.f, 0.f, 0.f, 0.f, 0.f, 0.f) {}
+// Centre + FULL extents: each axis spans [centre - extent/2, centre + extent/2]; both forward to the six-limit constructor.
+Bound::Bound(const glm::vec3 &center, const glm::vec3 &dim)
+        : Bound(center, dim.x, dim.y, dim.z) {}
+Bound::Bound(const glm::vec3 &center, float sX, float sY, float sZ)
+        : Bound(center.x - sX / 2.f, center.y - sY / 2.f, center.z - sZ / 2.f, center.x + sX / 2.f, center.y + sY / 2.f, center.z + sZ / 2.f) {}
+// Centre given as scalars. `dim` is the FULL size, as in the vec3-centre overload; it was previously applied un-halved, which doubled the box.
+Bound::Bound(float cX, float cY, float cZ, const glm::vec3 &dim)
+        : Bound(glm::vec3(cX, cY, cZ), dim) {}
+// Primary constructor: stores the six limits exactly as given (min <= max is not checked).
+Bound::Bound(float minX, float minY, float minZ, float maxX, float maxY, float maxZ)
+        : _minX(minX), _minY(minY), _minZ(minZ), _maxX(maxX), _maxY(maxY), _maxZ(maxZ) {}
+// Trivial destructor: the class's own members are six floats.
+Bound::~Bound() {}
+// ---- Stored limits, returned as-is ----
+float Bound::minX() const { return _minX; }
+float Bound::minY() const { return _minY; }
+float Bound::minZ() const { return _minZ; }
+float Bound::maxX() const { return _maxX; }
+float Bound::maxY() const { return _maxY; }
+float Bound::maxZ() const { return _maxZ; }
+
+// ---- Extent along x / y / z (max - min) ----
+float Bound::width() const { return _maxX - _minX; }
+float Bound::height() const { return _maxY - _minY; }
+float Bound::depth() const { return _maxZ - _minZ; }
+
+// Full extents as a vector: (width, height, depth).
+glm::vec3 Bound::dim() const {
+    return glm::vec3(width(), height(), depth());
+}
+
+// Midpoint of the box on every axis.
+glm::vec3 Bound::center() const {
+    return glm::vec3(_minX + _maxX, _minY + _maxY, _minZ + _maxZ) / 2.f;
+}
+// Half-open containment test: points on the min faces are inside, points on the max faces are outside.
+bool Bound::contains(const glm::vec3 &pt) const {
+    const bool inX = pt.x >= _minX && pt.x < _maxX;
+    const bool inY = pt.y >= _minY && pt.y < _maxY;
+    return inX && inY && pt.z >= _minZ && pt.z < _maxZ;
+}
+// Point-on-surface test. Returns true when `pt` is within `tolerance` of a face plane (per fequal) and,
+// on the other two axes, inside the box's half-open [min, max) span. On a hit `normal` is the face axis,
+// signed towards the side of the plane `pt` lies on. Axes are tried in x, y, z order; on each axis the
+// max face is only considered when `pt` is not near the min face. `normal` is untouched on a miss.
+bool Bound::collidesPt(const glm::vec3 &pt, glm::vec3 &normal, float tolerance) const {
+    // Half-open span checks shared by the face tests below.
+    const bool withinX = pt.x >= _minX && pt.x < _maxX;
+    const bool withinY = pt.y >= _minY && pt.y < _maxY;
+    const bool withinZ = pt.z >= _minZ && pt.z < _maxZ;
+
+    // ---- x faces ----
+    const bool insideYZ = withinY && withinZ;
+    if (fequal(pt.x, _minX, tolerance)) {
+        if (insideYZ) {
+            const float side = (pt.x < _minX) ? -1.f : 1.f;
+            normal = glm::vec3(side, 0.f, 0.f);
+            return true;
+        }
+    } else if (fequal(pt.x, _maxX, tolerance)) {
+        if (insideYZ) {
+            const float side = (pt.x > _maxX) ? 1.f : -1.f;
+            normal = glm::vec3(side, 0.f, 0.f);
+            return true;
+        }
+    }
+
+    // ---- y faces ----
+    const bool insideXZ = withinX && withinZ;
+    if (fequal(pt.y, _minY, tolerance)) {
+        if (insideXZ) {
+            const float side = (pt.y < _minY) ? -1.f : 1.f;
+            normal = glm::vec3(0.f, side, 0.f);
+            return true;
+        }
+    } else if (fequal(pt.y, _maxY, tolerance)) {
+        if (insideXZ) {
+            const float side = (pt.y > _maxY) ? 1.f : -1.f;
+            normal = glm::vec3(0.f, side, 0.f);
+            return true;
+        }
+    }
+
+    // ---- z faces ----
+    const bool insideXY = withinX && withinY;
+    if (fequal(pt.z, _minZ, tolerance)) {
+        if (insideXY) {
+            const float side = (pt.z < _minZ) ? -1.f : 1.f;
+            normal = glm::vec3(0.f, 0.f, side);
+            return true;
+        }
+    } else if (fequal(pt.z, _maxZ, tolerance)) {
+        if (insideXY) {
+            const float side = (pt.z > _maxZ) ? 1.f : -1.f;
+            normal = glm::vec3(0.f, 0.f, side);
+            return true;
+        }
+    }
+
+    // Not close enough to any face.
+    return false;
+}
+// Segment-vs-face test: true when the move prev -> next crosses a face plane while `prev` is inside the box's half-open span on the other two axes; `normal` is the face axis pointing back towards the side `prev` was on.
+bool Bound::collides(const glm::vec3 &prev, const glm::vec3 &next, glm::vec3 &normal) const {
+    if (prev.y >= _minY && prev.y < _maxY && prev.z >= _minZ && prev.z < _maxZ) {
+        if (prev.x > _minX && next.x <= _minX) {        // cross minX plane
+            normal = glm::vec3(1.f, 0.f, 0.f);
+            return true;
+        } else if (prev.x < _minX && next.x >= _minX) {
+            normal = glm::vec3(-1.f, 0.f, 0.f);
+            return true;
+        } else if (prev.x < _maxX && next.x >= _maxX) { // cross maxX plane
+            normal = glm::vec3(-1.f, 0.f, 0.f);
+            return true;
+        } else if (prev.x > _maxX && next.x <= _maxX) {
+            normal = glm::vec3(1.f, 0.f, 0.f);
+            return true;
+        }
+    }
+    if (prev.x >= _minX && prev.x < _maxX && prev.z >= _minZ && prev.z < _maxZ) {
+        if (prev.y > _minY && next.y <= _minY) {        // cross minY plane
+            normal = glm::vec3(0.f, 1.f, 0.f);
+            return true;
+        } else if (prev.y < _minY && next.y >= _minY) {
+            normal = glm::vec3(0.f, -1.f, 0.f);
+            return true;
+        } else if (prev.y < _maxY && next.y >= _maxY) { // cross maxY plane
+            normal = glm::vec3(0.f, -1.f, 0.f);
+            return true;
+        } else if (prev.y > _maxY && next.y <= _maxY) {
+            normal = glm::vec3(0.f, 1.f, 0.f);
+            return true;
+        }
+    }
+    if (prev.x >= _minX && prev.x < _maxX && prev.y >= _minY && prev.y < _maxY) {
+        // The z faces report z-axis normals; they previously returned y-axis normals copied from the block above.
+        if (prev.z > _minZ && next.z <= _minZ) {        // cross minZ plane
+            normal = glm::vec3(0.f, 0.f, 1.f);
+            return true;
+        } else if (prev.z < _minZ && next.z >= _minZ) {
+            normal = glm::vec3(0.f, 0.f, -1.f);
+            return true;
+        } else if (prev.z < _maxZ && next.z >= _maxZ) { // cross maxZ plane
+            normal = glm::vec3(0.f, 0.f, -1.f);
+            return true;
+        } else if (prev.z > _maxZ && next.z <= _maxZ) { normal = glm::vec3(0.f, 0.f, 1.f);
+            return true;
+        }
+    }
+    return false;
+}
\ No newline at end of file
diff --git a/core/geometry/Bound.h b/core/geometry/Bound.h
new file mode 100644
index 00000000..5ce97bef
--- /dev/null
+++ b/core/geometry/Bound.h
@@ -0,0 +1,45 @@
+//
+// Created by austin on 2/27/16.
+//
+
+#ifndef FLUIDSOLVER_BOUND_H
+#define FLUIDSOLVER_BOUND_H
+
+#include "Geo.h"
+// Axis-aligned bounding box stored as six min/max limits; implements the Geo queries for a box.
+class Bound : public Geo {
+public:
+    Bound();
+    Bound(const glm::vec3 &center, const glm::vec3 &dim);              // centre + full extents
+    Bound(const glm::vec3 &center, float sX, float sY, float sZ);      // centre + full extent per axis
+    Bound(float cX, float cY, float cZ, const glm::vec3 &dim);         // centre as scalars + extents vector
+    Bound(float minX, float minY, float minZ, float maxX, float maxY, float maxZ); // explicit limits
+    virtual ~Bound();
+    // Limits, per-axis extents (max - min), and derived vectors.
+    float minX() const;
+    float minY() const;
+    float minZ() const;
+    float maxX() const;
+    float maxY() const;
+    float maxZ() const;
+    float width() const;   // extent along x
+    float height() const;  // extent along y
+    float depth() const;   // extent along z
+    glm::vec3 dim() const;     // (width, height, depth)
+    glm::vec3 center() const;  // midpoint on every axis
+    // Geo interface; containment and span checks are half-open: min inclusive, max exclusive.
+    virtual bool contains(const glm::vec3 &pt) const;
+    virtual bool collidesPt(const glm::vec3 &pt, glm::vec3 &normal, float tolerance = 0.001f) const;
+    virtual bool collides(const glm::vec3 &prev, const glm::vec3 &next, glm::vec3 &normal) const;
+
+private:
+    float _minX;
+    float _minY;
+    float _minZ;
+    float _maxX;
+    float _maxY;
+    float _maxZ;
+};
+
+
+#endif //FLUIDSOLVER_BOUND_H
diff --git a/core/geometry/Box.cpp b/core/geometry/Box.cpp
new file mode 100644
index 00000000..02f69ba2
--- /dev/null
+++ b/core/geometry/Box.cpp
@@ -0,0 +1,18 @@
+//
+// Created by austin on 2/28/16.
+//
+
+#include "Box.h"
+// Centre + FULL extents: each axis spans [centre - extent/2, centre + extent/2]; both forward to the six-limit constructor.
+Box::Box(const glm::vec3 &center, const glm::vec3 &dim)
+        : Box(center, dim.x, dim.y, dim.z) {}
+Box::Box(const glm::vec3 &center, float sX, float sY, float sZ)
+        : Box(center.x - sX / 2.f, center.y - sY / 2.f, center.z - sZ / 2.f, center.x + sX / 2.f, center.y + sY / 2.f, center.z + sZ / 2.f) {}
+// Centre given as scalars. `dim` is the FULL size, as in the vec3-centre overload; it was previously applied un-halved, which doubled the box.
+Box::Box(float cX, float cY, float cZ, const glm::vec3 &dim)
+        : Box(glm::vec3(cX, cY, cZ), dim) {}
+// Primary constructor: initialise the Bound base with the six limits, then cache a copy of it in GeoObject::_bound.
+Box::Box(float minX, float minY, float minZ, float maxX, float maxY, float maxZ) :
+    Bound(minX, minY, minZ, maxX, maxY, maxZ), GeoObject() {
+    computeBound(); // inside Box's constructor this runs Box::computeBound, i.e. `_bound = *this`
+}
diff --git a/core/geometry/Box.h b/core/geometry/Box.h
new file mode 100644
index 00000000..2ce520c0
--- /dev/null
+++ b/core/geometry/Box.h
@@ -0,0 +1,33 @@
+//
+// Created by austin on 2/28/16.
+//
+
+#ifndef FLUIDSOLVER_BOX_H
+#define FLUIDSOLVER_BOX_H
+
+#include "GeoObject.h"
+// Axis-aligned box that is both a Bound and a GeoObject. The Geo queries are answered by the
+// copy of the bound cached in GeoObject::_bound.
+class Box : public Bound, public GeoObject {
+public:
+    // Constructor overloads mirror Bound's.
+    Box(const glm::vec3 &center, const glm::vec3 &dim);
+    Box(const glm::vec3 &center, float sX, float sY, float sZ);
+    Box(float cX, float cY, float cZ, const glm::vec3 &dim);
+    Box(float minX, float minY, float minZ, float maxX, float maxY, float maxZ);
+
+    // Geo interface: each query is forwarded unchanged to the cached bound.
+    virtual bool contains(const glm::vec3 &pt) const { return _bound.contains(pt); }
+    virtual bool collidesPt(const glm::vec3 &pt, glm::vec3 &normal, float tolerance = 0.001f) const {
+        return _bound.collidesPt(pt, normal, tolerance);
+    }
+    virtual bool collides(const glm::vec3 &prev, const glm::vec3 &next, glm::vec3 &normal) const {
+        return _bound.collides(prev, next, normal);
+    }
+
+    // Copy this object's Bound base into the cached `_bound`.
+    virtual void computeBound() { _bound = *this; }
+};
+
+
+#endif //FLUIDSOLVER_BOX_H
diff --git a/core/geometry/Geo.cpp b/core/geometry/Geo.cpp
new file mode 100644
index 00000000..6d5b6e1a
--- /dev/null
+++ b/core/geometry/Geo.cpp
@@ -0,0 +1,28 @@
+//
+// Created by austin on 2/27/16.
+//
+
+
+#include "Geo.h"
+// Convenience overload for callers that do not need the surface normal.
+bool Geo::collidesPt(const glm::vec3 &pt, float tolerance) const {
+    glm::vec3 discardedNormal; // receives the normal from the full overload and is then dropped
+    return collidesPt(pt, discardedNormal, tolerance);
+}
+// Convenience overload of the prev -> next test for callers that do not need the surface normal.
+bool Geo::collides(const glm::vec3 &prev, const glm::vec3 &next) const {
+    glm::vec3 discardedNormal; // receives the normal from the full overload and is then dropped
+    return collides(prev, next, discardedNormal);
+}
+// Runs collides() on the segment from `pt` to the point `step` further along `dir`; `dir` is asserted to be unit length.
+bool Geo::collidesRay(const glm::vec3 &pt, const glm::vec3 &dir, float step) const {
+    assert(fequal(glm::length(dir), 1.f));
+    const glm::vec3 segmentEnd = pt + dir * step;
+    return collides(pt, segmentEnd);
+}
+// Same segment test as the normal-less collidesRay, additionally reporting the surface normal through `normal`.
+bool Geo::collidesRay(const glm::vec3 &pt, const glm::vec3 &dir, glm::vec3 &normal, float step) const {
+    assert(fequal(glm::length(dir), 1.f));
+    const glm::vec3 segmentEnd = pt + dir * step;
+    return collides(pt, segmentEnd, normal);
+}
\ No newline at end of file
diff --git a/core/geometry/Geo.h b/core/geometry/Geo.h
new file mode 100644
index 00000000..63f7930d
--- /dev/null
+++ b/core/geometry/Geo.h
@@ -0,0 +1,31 @@
+//
+// Created by austin on 2/27/16.
+//
+
+#ifndef FLUIDSOLVER_GEO_H
+#define FLUIDSOLVER_GEO_H
+
+#include <core/util/math.h>
+// Abstract geometry interface: containment and collision queries. The normal-less overloads are convenience wrappers implemented in Geo.cpp.
+class Geo {
+public:
+    Geo() { }
+    virtual ~Geo() { }
+    // Containment test for `pt`; implemented by subclasses.
+    virtual bool contains(const glm::vec3 &pt) const = 0;
+    // Point-vs-surface test without a normal; forwards to the overload below with a throwaway normal.
+    virtual bool collidesPt(const glm::vec3 &pt, float tolerance = 0.001f) const;
+    // Point-vs-surface test with a `tolerance`; implementations report the surface normal in `normal`.
+    virtual bool collidesPt(const glm::vec3 &pt, glm::vec3 &normal, float tolerance = 0.001f) const = 0;
+    // Collision test for the move prev -> next; forwards to the overload below with a throwaway normal.
+    virtual bool collides(const glm::vec3 &prev, const glm::vec3 &next) const;
+    // Collision test for the move prev -> next; implementations report the surface normal in `normal`.
+    virtual bool collides(const glm::vec3 &prev, const glm::vec3 &next, glm::vec3 &normal) const = 0;
+    // collides() on the segment from `pt` to pt + dir*step; `dir` is asserted to be unit length.
+    virtual bool collidesRay(const glm::vec3 &pt, const glm::vec3 &dir, float step = 0.001f) const;
+    // Same segment test, also reporting the normal.
+    virtual bool collidesRay(const glm::vec3 &pt, const glm::vec3 &dir, glm::vec3 &normal, float step = 0.001f) const;
+};
+
+
+#endif //FLUIDSOLVER_GEO_H
diff --git a/core/geometry/GeoObject.h b/core/geometry/GeoObject.h
new file mode 100644
index 00000000..525a8b37
--- /dev/null
+++ b/core/geometry/GeoObject.h
@@ -0,0 +1,23 @@
+//
+// Created by austin on 2/28/16.
+//
+
+#ifndef FLUIDSOLVER_GEOOBJECT_H
+#define FLUIDSOLVER_GEOOBJECT_H
+
+#include "Geo.h"
+#include "Bound.h"
+// A Geo that carries a cached Bound; subclasses are expected to fill `_bound` in computeBound().
+class GeoObject : public Geo {
+public:
+    GeoObject() { }
+    virtual ~GeoObject() {}
+    virtual void computeBound() = 0;              // (re)compute `_bound`; implemented by subclasses
+    const Bound& bound() const { return _bound; } // read-only access to the cached bound
+
+protected:
+    Bound _bound; // default-constructed here; assigned by the subclass's computeBound()
+};
+
+
+#endif //FLUIDSOLVER_GEOOBJECT_H
diff --git a/core/scenes/default.h b/core/scenes/default.h
new file mode 100644
index 00000000..58527e6f
--- /dev/null
+++ b/core/scenes/default.h
@@ -0,0 +1,29 @@
+//
+// Created by austin on 2/29/16.
+//
+// Built-in fallback scene (JSON text). `static` gives each including translation unit its own copy, so the header can be included more than once per program without duplicate-symbol link errors.
+static const char* default_scene = R"(
+{
+  "containerDim": {
+    "position": [0,0,0],
+    "scale" : [6,6,6]
+  },
+  "particleDim": {
+    "position": [0,0,0],
+    "scale": [4,4,4]
+  },
+  "resolution": 20
+}
+
+/*{
+    "containerDim" : {
+        "position": [0, 0, 0],
+        "scale" : [40, 20, 20]
+    },
+    "particleDim" : {
+       "position": [-10, 0, 0],
+        "scale" : [10, 19, 19]
+    },
+    "resolution" : 20
+};*/
+)";
diff --git a/core/solver/FluidParticle.h b/core/solver/FluidParticle.h
new file mode 100644
index 00000000..1fd3eb7b
--- /dev/null
+++ b/core/solver/FluidParticle.h
@@ -0,0 +1,25 @@
+//
+// Created by austin on 3/20/16.
+//
+
+#ifndef FLUIDSOLVER_FLUIDPARTICLE_H
+#define FLUIDSOLVER_FLUIDPARTICLE_H
+
+#include <core/util/math.h>
+
+// One particle carried by the fluid solver.
+struct FluidParticle {
+    glm::vec3 pos;      // current position
+    glm::vec3 pos_old;  // presumably the position before the latest update — TODO confirm in the solver
+    glm::vec3 vel;      // velocity
+    glm::vec3 col;      // colour (blue by default)
+    glm::ivec3 cell;    // grid cell index; FluidSolver::init() assigns it from `pos`
+
+    // Starts at rest at the origin, coloured blue. `cell` is zeroed too: it was previously never
+    // assigned here, leaving it indeterminate when GLM does not zero-initialise vectors.
+    FluidParticle()
+        : pos(0.f), pos_old(0.f), vel(0.f),
+          col(0.0f, 0.0f, 1.f), cell(0) {}
+};
+
+#endif //FLUIDSOLVER_FLUIDPARTICLE_H
diff --git a/core/solver/FluidSolver.cpp b/core/solver/FluidSolver.cpp
new file mode 100644
index 00000000..0d781b84
--- /dev/null
+++ b/core/solver/FluidSolver.cpp
@@ -0,0 +1,752 @@
+//
+// Created by austin on 2/28/16.
+//
+
+#include <iostream>
+#include "FluidSolver.h"
+#include <core/util/hacks.h>
+#include <tbb/parallel_for.h>
+#include <core/util/flags.h>
+#include <tbb/parallel_invoke.h>
+#include <tbb/parallel_reduce.h>
+#include <tbb/blocked_range3d.h>
+#include <Eigen/Sparse>
+#include <iostream>
+
+// Gravity constant shared by all solvers: the negated standard-gravity value 9.80665 (presumably m/s^2 along -y — TODO confirm at the use site).
+float FluidSolver::g = -9.80665f;
+// Create a solver with the given particle spacing and grid cell size; the container and grid are attached later by setContainer().
+FluidSolver::FluidSolver(float particleSep, float gridSize) : particle_radius(particleSep), _cell_size(gridSize), frame(0) {
+    _container = nullptr; _MAC = nullptr; // the destructor deletes both, so they must never be indeterminate (set in the body so member declaration order does not matter)
+}
+// Frees the container stored by setContainer() and the MAC grid allocated there.
+FluidSolver::~FluidSolver() {
+    delete _container;
+    delete _MAC;
+}
+// Adopt `container` as the simulation domain and allocate the MAC grid around it. The grid extends one
+// cell beyond the container's bound on every side; the six outermost cell layers are then marked SOLID.
+// The solver owns `container` afterwards (it is deleted in the destructor).
+void FluidSolver::setContainer(GeoObject* container) {
+    _container = container;
+    const Bound &domain = _container->bound();
+    const glm::vec3 extent = domain.dim();
+    const glm::vec3 lowCorner = (domain.center() - extent / 2.f);
+    const glm::vec3 pad(_cell_size, _cell_size, _cell_size);
+    _MAC = new MACGrid<std::vector<FluidParticle*> >(lowCorner - pad, extent + 2.f*pad, _cell_size);
+
+    std::function<void(size_t, size_t, size_t)> markSolid = [&](size_t i, size_t j, size_t k) {
+        _MAC->_gType(i,j,k) = SOLID;
+    };
+
+    // Region arguments are (startI, startJ, startK, endI, endJ, endK); the end looks exclusive — TODO confirm.
+    auto &cellTypes = _MAC->_gType;
+    const auto nx = cellTypes.countX();
+    const auto ny = cellTypes.countY();
+    const auto nz = cellTypes.countZ();
+
+    // One call per outer layer of the grid.
+    cellTypes.iterateRegion(0,0,0,    1,ny,nz,  markSolid);   // low-x layer
+    cellTypes.iterateRegion(nx-1,0,0, nx,ny,nz, markSolid);   // high-x layer
+    cellTypes.iterateRegion(0,0,0,    nx,1,nz,  markSolid);   // low-y layer
+    cellTypes.iterateRegion(0,ny-1,0, nx,ny,nz, markSolid);   // high-y layer
+    cellTypes.iterateRegion(0,0,0,    nx,ny,1,  markSolid);   // low-z layer
+    cellTypes.iterateRegion(0,0,nz-1, nx,ny,nz, markSolid);   // high-z layer
+}
+
+/*
+ * Loop over fluid bounds to generate particles
+ */
+void FluidSolver::addFluid(const GeoObject &fluid) {
+//    FluidParticle p;
+//
+//    p.pos = glm::vec3(-1,5,1);
+//    _particles.push_back(p);
+//    _MAC->_gType.at(p.pos) = FLUID;
+//
+//    p.pos = glm::vec3(-1,3,1);
+//    _particles.push_back(p);
+//    _MAC->_gType.at(p.pos) = FLUID;
+//
+//    p.pos = glm::vec3(-1,5,3);
+//    _particles.push_back(p);
+//    _MAC->_gType.at(p.pos) = FLUID;
+//
+//    p.pos = glm::vec3(-1,3,3);
+//    _particles.push_back(p);
+//    _MAC->_gType.at(p.pos) = FLUID;
+//
+//    p.pos = glm::vec3(-3,5,1);
+//    _particles.push_back(p);
+//    _MAC->_gType.at(p.pos) = FLUID;
+//
+//    p.pos = glm::vec3(-3,3,1);
+//    _particles.push_back(p);
+//    _MAC->_gType.at(p.pos) = FLUID;
+//
+//    p.pos = glm::vec3(-3,5,3);
+//    _particles.push_back(p);
+//    _MAC->_gType.at(p.pos) = FLUID;
+//
+//    p.pos = glm::vec3(-3,3,3);
+//    _particles.push_back(p);
+//    _MAC->_gType.at(p.pos) = FLUID;
+//    return;
+    // Seed one particle every `particle_radius` along each axis of the fluid's bounding box.
+    const Bound& region = fluid.bound();
+    const glm::vec3 halfCell = glm::vec3(_cell_size)/2.f;   // every sample is shifted by half a grid cell
+    for (float sx = region.minX(); sx < region.maxX(); sx += particle_radius) {
+        for (float sy = region.minY(); sy < region.maxY(); sy += particle_radius) {
+            for (float sz = region.minZ(); sz < region.maxZ(); sz += particle_radius) {
+                const glm::vec3 sample = glm::vec3(sx, sy, sz) + halfCell;
+                if (!fluid.contains(sample)) continue;   // samples the fluid shape rejects are skipped
+                FluidParticle seeded;
+                seeded.pos = sample;
+                _particles.push_back(seeded);
+                _MAC->_gType.at(sample) = FLUID;   // flag the grid entry at the sample position as fluid
+            }
+        }
+    }
+    std::cout << "Added " << _particles.size() << " particles" << std::endl;
+}
+// Record each particle's grid cell and register the particle in that cell's list, then call clear(0) on _gU/_gV/_gW.
+void FluidSolver::init() {
+    for (size_t n = 0; n < _particles.size(); ++n) {
+        FluidParticle *p = &_particles[n];
+        p->cell = _MAC->indexOf(p->pos);
+        _MAC->atIdx(p->cell).push_back(p);   // raw pointer into _particles: the vector must not reallocate while the grid holds it
+    }
+    _MAC->_gU.clear(0);
+    _MAC->_gV.clear(0);
+    _MAC->_gW.clear(0);
+
+//    for (size_t idx = 0; idx < _MAC->_gU.size(); idx++) {
+//        size_t i, j, k;
+//        _MAC->_gU.toIJK(idx, i,j,k);
+//        std::cout << idx << "; " << i << "," << j << "," << k << "; " << _MAC->_gU.fromIJK(i,j,k) << std::endl;
+//    }
+//
+//    std::size_t velOffset = offsetof(FluidParticle, vel);
+//    std::size_t U_offset = velOffset + offsetof(glm::vec3, x);
+//    std::size_t V_offset = velOffset + offsetof(glm::vec3, y);
+//    std::size_t W_offset = velOffset + offsetof(glm::vec3, z);
+//    particleAttributeToGrid(U_offset, _MAC->_gU_old, _cell_size, 0.f);
+//    particleAttributeToGrid(V_offset, _MAC->_gV_old, _cell_size, 0.f);
+//    particleAttributeToGrid(W_offset, _MAC->_gW_old, _cell_size, 0.f);
+//    particleAttributeToGrid(U_offset, _MAC->_gU, _cell_size, 0.f);
+//    particleAttributeToGrid(V_offset, _MAC->_gV, _cell_size, 0.f);
+//    particleAttributeToGrid(W_offset, _MAC->_gW, _cell_size, 0.f);
+}
+
+// Transfer particle velocities to the grid (one particleAttributeToGrid() call per component), then
+// copy the fresh _gU/_gV/_gW into _gU_old/_gV_old/_gW_old, which transferVelocitiesToParticles()
+// samples alongside the current grids.
+// With USETBB defined, the three components of each stage run as parallel tasks.
+void FluidSolver::projectVelocitiesToGrid() {
+    // Each component is identified by its byte offset inside FluidParticle (offset of `vel` plus the
+    // offset of x / y / z inside glm::vec3).
+    std::size_t velBase = offsetof(FluidParticle, vel);
+    std::size_t offU = velBase + offsetof(glm::vec3, x);
+    std::size_t offV = velBase + offsetof(glm::vec3, y);
+    std::size_t offW = velBase + offsetof(glm::vec3, z);
+
+    // Both build variants below perform the same two stages.
+#ifdef USETBB
+    // Stage 1: particles -> grid.
+    tbb::parallel_invoke(
+            [&]() { particleAttributeToGrid(offU, _MAC->_gU, _cell_size, 0.f); },
+            [&]() { particleAttributeToGrid(offV, _MAC->_gV, _cell_size, 0.f); },
+            [&]() { particleAttributeToGrid(offW, _MAC->_gW, _cell_size, 0.f); }
+    );
+
+    // Stage 2: snapshot the grids.
+    tbb::parallel_invoke(
+            [&]() { _MAC->_gU_old = _MAC->_gU; },
+            [&]() { _MAC->_gV_old = _MAC->_gV; },
+            [&]() { _MAC->_gW_old = _MAC->_gW; }
+    );
+
+#else
+    // Stage 1: particles -> grid.
+    particleAttributeToGrid(offU, _MAC->_gU, _cell_size, 0.f);
+    particleAttributeToGrid(offV, _MAC->_gV, _cell_size, 0.f);
+    particleAttributeToGrid(offW, _MAC->_gW, _cell_size, 0.f);
+
+    // Stage 2: snapshot the grids.
+    _MAC->_gU_old = _MAC->_gU;
+    _MAC->_gV_old = _MAC->_gV;
+    _MAC->_gW_old = _MAC->_gW;
+#endif
+}
+// Pull velocities back from the grid onto the particles. Per component the result is
+//   smooth * gridSample + (1 - smooth) * (particleVelocity + (gridSample - oldGridSample)),
+// where the "old" sample comes from the *_old grids. With USETBB the three components run as parallel tasks.
+void FluidSolver::transferVelocitiesToParticles() {
+    const float smooth = 0.05f;      // weight of the direct grid sample
+    const float keep = 1.f - smooth; // weight of the increment-based update
+
+#ifdef USETBB
+    tbb::parallel_invoke(
+            [&]() {
+                tbb::parallel_for(tbb::blocked_range<size_t>(0, _particles.size()), [&](const tbb::blocked_range<size_t> &chunk) {
+                    for (size_t n = chunk.begin(); n != chunk.end(); ++n) {
+                        FluidParticle &p = _particles[n];
+                        const float now = interpolateAttribute(p.pos, _MAC->_gU);
+                        const float before = interpolateAttribute(p.pos, _MAC->_gU_old);
+                        p.vel.x = now*smooth + (p.vel.x +(now - before))*keep;
+                    }
+                });
+            },
+            [&]() {
+                tbb::parallel_for(tbb::blocked_range<size_t>(0, _particles.size()), [&](const tbb::blocked_range<size_t> &chunk) {
+                    for (size_t n = chunk.begin(); n != chunk.end(); ++n) {
+                        FluidParticle &p = _particles[n];
+                        const float now = interpolateAttribute(p.pos, _MAC->_gV);
+                        const float before = interpolateAttribute(p.pos, _MAC->_gV_old);
+                        p.vel.y = now*smooth + (p.vel.y +(now - before))*keep;
+                    }
+                });
+            },
+            [&]() {
+                tbb::parallel_for(tbb::blocked_range<size_t>(0, _particles.size()), [&](const tbb::blocked_range<size_t> &chunk) {
+                    for (size_t n = chunk.begin(); n != chunk.end(); ++n) {
+                        FluidParticle &p = _particles[n];
+                        const float now = interpolateAttribute(p.pos, _MAC->_gW);
+                        const float before = interpolateAttribute(p.pos, _MAC->_gW_old);
+                        p.vel.z = now*smooth + (p.vel.z +(now - before))*keep;
+                    }
+                });
+            }
+    );
+#else
+    for (FluidParticle &p : _particles) {
+        const float now = interpolateAttribute(p.pos, _MAC->_gU);
+        const float before = interpolateAttribute(p.pos, _MAC->_gU_old);
+        p.vel.x = now*smooth + (p.vel.x +(now - before))*keep;
+    }
+    for (FluidParticle &p : _particles) {
+        const float now = interpolateAttribute(p.pos, _MAC->_gV);
+        const float before = interpolateAttribute(p.pos, _MAC->_gV_old);
+        p.vel.y = now*smooth + (p.vel.y +(now - before))*keep;
+    }
+    for (FluidParticle &p : _particles) {
+        const float now = interpolateAttribute(p.pos, _MAC->_gW);
+        const float before = interpolateAttribute(p.pos, _MAC->_gW_old);
+        p.vel.z = now*smooth + (p.vel.z +(now - before))*keep;
+    }
+#endif
+}
+// Clamp grid velocities on the faces of SOLID cells: where a SOLID cell borders a non-SOLID cell (or the grid edge), the shared face velocity is limited so it cannot point into the solid.
+void FluidSolver::enforceBoundary() {
+    _MAC->_gType.iterate([&](size_t i, size_t j, size_t k) {
+        switch (_MAC->_gType(i,j,k)) {
+            case EMPTY:break;   // nothing to enforce
+            case FLUID:break;   // nothing to enforce
+            case SOLID:
+                /*_MAC->_gU(i,j,k) = 0;
+                _MAC->_gV(i,j,k) = 0;
+                _MAC->_gW(i,j,k) = 0;
+                _MAC->_gU(i+1,j,k) = 0;
+                _MAC->_gV(i,j+1,k) = 0;
+                _MAC->_gW(i,j,k+1) = 0;
+                */
+                // In every test the edge check comes first, so the unsigned i-1 / j-1 / k-1 (and the +1 lookups) are never evaluated out of range.
+                if (i == 0 || _MAC->_gType(i-1,j,k) != SOLID) {
+                    _MAC->_gU(i,j,k) = std::min(0.f, _MAC->_gU(i,j,k));       // low-x face: only non-positive U allowed
+                }
+                if (i == _MAC->_gType.countX() - 1 || _MAC->_gType(i+1,j,k) != SOLID) {
+                    _MAC->_gU(i+1,j,k) = std::max(0.f, _MAC->_gU(i+1,j,k));   // high-x face: only non-negative U allowed
+                }
+                if (j == 0 || _MAC->_gType(i,j-1,k) != SOLID) {
+                    _MAC->_gV(i,j,k) = std::min(0.f, _MAC->_gV(i,j,k));       // low-y face
+                }
+                if (j == _MAC->_gType.countY() - 1 || _MAC->_gType(i,j+1,k) != SOLID) {
+                    _MAC->_gV(i,j+1,k) = std::max(0.f, _MAC->_gV(i,j+1,k));   // high-y face
+                }
+                if (k == 0 || _MAC->_gType(i,j,k-1) != SOLID) {
+                    _MAC->_gW(i,j,k) = std::min(0.f, _MAC->_gW(i,j,k));       // low-z face
+                }
+                if (k == _MAC->_gType.countZ() - 1 || _MAC->_gType(i,j,k+1) != SOLID) {
+                    _MAC->_gW(i,j,k+1) = std::max(0.f, _MAC->_gW(i,j,k+1));   // high-z face
+                }
+                break;
+            default:break;
+        }
+    });
+    //return;
+
+    // The block below is an earlier, disabled variant that clamped or zeroed only the outermost layers of each velocity grid.
+    /*_MAC->_gU.iterateRegion(0,0,0, 1,_MAC->_gU.countY(),_MAC->_gU.countZ(), [&](size_t i, size_t j, size_t k) {
+        _MAC->_gU(i,j,k) = std::min(0.f, _MAC->_gU(i,j,k));
+    });
+    _MAC->_gU.iterateRegion(_MAC->_gU.countX()-1,0,0, _MAC->_gU.countX(),_MAC->_gU.countY(),_MAC->_gU.countZ(), [&](size_t i, size_t j, size_t k) {
+        _MAC->_gU(i,j,k) = std::max(0.f, _MAC->_gU(i,j,k));
+    });
+    _MAC->_gV.iterateRegion(0,0,0, _MAC->_gV.countX(),1,_MAC->_gV.countZ(), [&](size_t i, size_t j, size_t k) {
+        _MAC->_gV(i,j,k) = 0;
+    });
+    _MAC->_gV.iterateRegion(0,_MAC->_gV.countY()-1,0, _MAC->_gV.countX(),_MAC->_gV.countY(),_MAC->_gV.countZ(), [&](size_t i, size_t j, size_t k) {
+        _MAC->_gV(i,j,k) = 0;
+    });
+    _MAC->_gW.iterateRegion(0,0,0, _MAC->_gW.countX(),_MAC->_gW.countY(),1, [&](size_t i, size_t j, size_t k) {
+        _MAC->_gW(i,j,k) = 0;
+    });
+    _MAC->_gW.iterateRegion(0,0,_MAC->_gW.countZ()-1, _MAC->_gW.countX(),_MAC->_gW.countY(),_MAC->_gW.countZ(), [&](size_t i, size_t j, size_t k) {
+        _MAC->_gW(i,j,k) = 0;
+    });*/
+}
+// Account for one neighbour (i,j,k) of pressure row IDX: FLUID and EMPTY neighbours increment `count`, and a FLUID one also adds the matrix entry (IDX, neighbour index) = -scale.
+inline void pressureMatrixHelper(std::vector<Eigen::Triplet<double>> &coeffs, const Grid<int> &grid, size_t &IDX,
+                                 int &count, const float &scale, size_t i, size_t j, size_t k) {
+    const size_t column = grid.fromIJK(i,j,k);
+    const bool isFluid = grid(i,j,k) == FLUID;
+    if (!isFluid && !(grid(i,j,k) == EMPTY)) return;   // any other cell type contributes nothing
+    ++count;
+    if (isFluid) {
+        coeffs.push_back(Eigen::Triplet<double>(IDX, column, -scale));
+    }
+}
+
+// Pressure projection for one time step of length `step`.
+//   1. For every FLUID cell, assemble one row of the sparse system A*x = b: the diagonal is
+//      (number of FLUID/EMPTY face-neighbours) * step/cell_size^2, each FLUID neighbour adds
+//      -step/cell_size^2, and b holds the negative divergence with corrections at faces that
+//      touch SOLID cells or the domain edge.
+//   2. Solve with Eigen's conjugate gradient (identity preconditioner).
+//   3. Store the solution in _gP (FLUID cells only, everything else 0).
+//   4. Subtract step/cell_size * (pressure difference across the face) from every U/V/W face
+//      that touches a FLUID cell.
+// Faces on the outer domain boundary have a pressure sample on one side only, so step 4 skips
+// them; the previous code indexed _gP out of range there whenever the adjacent cell was FLUID.
+void FluidSolver::pressureSolve(float step) {
+    typedef Eigen::Triplet<double> T;
+    std::vector<T> coefficientsA;
+    std::vector<T> coefficientsB;
+    Eigen::SparseMatrix<double> A(_MAC->_gType.size(), _MAC->_gType.size());
+    Eigen::SparseMatrix<double> b(_MAC->_gType.size(), 1);
+    Eigen::SparseVector<double> x(_MAC->_gType.size());
+    A.setZero();
+    b.setZero();
+    x.setZero();
+
+    float scale = step / (1.f*_cell_size*_cell_size);
+
+    // Serial iteration (parallel = false): the lambda appends to shared triplet vectors.
+    _MAC->_gType.iterate([&](size_t I, size_t J, size_t K) {
+        size_t IDX = _MAC->_gType.fromIJK(I,J,K);
+        if (_MAC->_gType(I,J,K) == FLUID) {
+            int count = 0;
+
+            if (I > 0) {                            // if I-1 >= 0
+                pressureMatrixHelper(coefficientsA, _MAC->_gType, IDX, count, scale, I-1,J,K);
+            }
+            if (I + 1 < _MAC->_gType.countX()) {     // if I + 1 < countX
+                pressureMatrixHelper(coefficientsA, _MAC->_gType, IDX, count, scale, I+1,J,K);
+            }
+            if (J > 0) {                            // if J-1 >= 0
+                pressureMatrixHelper(coefficientsA, _MAC->_gType, IDX, count, scale, I,J-1,K);
+            }
+            if (J + 1 < _MAC->_gType.countY()) {     // if J + 1 < countY
+                pressureMatrixHelper(coefficientsA, _MAC->_gType, IDX, count, scale, I,J+1,K);
+            }
+            if (K > 0) {                            // if K-1 >= 0
+                pressureMatrixHelper(coefficientsA, _MAC->_gType, IDX, count, scale, I,J,K-1);
+            }
+            if (K + 1 < _MAC->_gType.countZ()) {     // if K + 1 < countZ
+                pressureMatrixHelper(coefficientsA, _MAC->_gType, IDX, count, scale, I,J,K+1);
+            }
+
+            coefficientsA.push_back(T(IDX, IDX, count*scale));
+
+            // Negative divergence of the face velocities around this cell.
+            float div =
+            -(_MAC->_gU(I+1,J,K) - _MAC->_gU(I,J,K)) / _cell_size +
+            -(_MAC->_gV(I,J+1,K) - _MAC->_gV(I,J,K)) / _cell_size +
+            -(_MAC->_gW(I,J,K+1) - _MAC->_gW(I,J,K)) / _cell_size;
+
+            // Faces shared with a SOLID cell (or the domain edge): remove that face's
+            // contribution relative to a wall velocity of 0.
+            if (I == 0 || _MAC->_gType(I-1,J,K) == SOLID) {
+                div -= (_MAC->_gU(I,J,K) - 0) / _cell_size;
+            }
+            if (I == _MAC->_gType.countX() - 1 || _MAC->_gType(I+1,J,K) == SOLID) {
+                div -= (0 - _MAC->_gU(I+1,J,K)) / _cell_size;
+            }
+            if (J == 0 || _MAC->_gType(I,J-1,K) == SOLID) {
+                div -= (_MAC->_gV(I,J,K) - 0) / _cell_size;
+            }
+            if (J == _MAC->_gType.countY() - 1 || _MAC->_gType(I,J+1,K) == SOLID) {
+                div -= (0 - _MAC->_gV(I,J+1,K)) / _cell_size;
+            }
+            if (K == 0 || _MAC->_gType(I,J,K-1) == SOLID) {
+                div -= (_MAC->_gW(I,J,K) - 0) / _cell_size;
+            }
+            if (K == _MAC->_gType.countZ() - 1 || _MAC->_gType(I,J,K+1) == SOLID) {
+                div -= (0 - _MAC->_gW(I,J,K+1)) / _cell_size;
+            }
+
+            coefficientsB.push_back(T(IDX,0,div));
+        }
+    }, false);
+
+    A.setFromTriplets(coefficientsA.begin(), coefficientsA.end());
+
+    b.setFromTriplets(coefficientsB.begin(), coefficientsB.end());
+//    Eigen::ConjugateGradient<Eigen::SparseMatrix<double>, Eigen::Lower, Eigen::IncompleteCholesky<double> > cg(A);
+    Eigen::ConjugateGradient<Eigen::SparseMatrix<double>, Eigen::Lower, Eigen::IdentityPreconditioner> cg(A);
+    x = cg.solve(b);
+
+    // Copy the solution into the pressure grid; non-FLUID cells stay at 0.
+    _MAC->_gP.clear(0);
+    for (Eigen::SparseVector<double>::InnerIterator it(x); it; ++it) {
+        if (_MAC->_gType(it.index()) == FLUID) {
+            _MAC->_gP(it.index()) = it.value();
+        }
+    }
+
+    scale = step/(1.f*_cell_size);
+    _MAC->_gU.iterate([&](size_t i, size_t j, size_t k) {
+        bool leftExists = i > 0;
+        bool rightExists = i < _MAC->_gP.countX();
+        if (!leftExists || !rightExists) {
+            return; // domain-boundary face: no pressure sample on one side
+        }
+        bool leftFluid = _MAC->_gType(i-1,j,k) == FLUID;
+        bool rightFluid = _MAC->_gType(i,j,k) == FLUID;
+        if ((leftFluid || rightFluid)) {
+            float delP = _MAC->_gP(i,j,k) - _MAC->_gP(i-1,j,k);
+            _MAC->_gU(i,j,k) -= scale * delP;
+        }
+    });
+
+    _MAC->_gV.iterate([&](size_t i, size_t j, size_t k) {
+        bool leftExists = j > 0;
+        bool rightExists = j < _MAC->_gP.countY();
+        if (!leftExists || !rightExists) {
+            return; // domain-boundary face: no pressure sample on one side
+        }
+        bool leftFluid = _MAC->_gType(i,j-1,k) == FLUID;
+        bool rightFluid = _MAC->_gType(i,j,k) == FLUID;
+        if ((leftFluid || rightFluid)) {
+            float delP = _MAC->_gP(i,j,k) - _MAC->_gP(i,j-1,k);
+            _MAC->_gV(i,j,k) -= scale * delP;
+        }
+    });
+
+    _MAC->_gW.iterate([&](size_t i, size_t j, size_t k) {
+        bool leftExists = k > 0;
+        bool rightExists = k < _MAC->_gP.countZ();
+        if (!leftExists || !rightExists) {
+            return; // domain-boundary face: no pressure sample on one side
+        }
+        bool leftFluid = _MAC->_gType(i,j,k-1) == FLUID;
+        bool rightFluid = _MAC->_gType(i,j,k) == FLUID;
+        if ((leftFluid || rightFluid)) {
+            float delP = _MAC->_gP(i,j,k) - _MAC->_gP(i,j,k-1);
+            _MAC->_gW(i,j,k) -= scale * delP;
+        }
+    });
+
+}
+
+// Applies gravity for one step: every sample of the V velocity grid gains g*step.
+void FluidSolver::gravitySolve(float step) {
+    auto &gridV = _MAC->_gV;
+    gridV.iterate([&](size_t i, size_t j, size_t k) {
+        gridV(i,j,k) += g*step;
+    });
+}
+
+// Overwrites face velocities that border a non-fluid cell with the average of the in-plane
+// neighbouring faces that border fluid. A face is processed when at least one of the two cells
+// it separates is inside the type grid and not FLUID; a neighbouring face donates its value when
+// at least one of the two cells it separates is FLUID. Faces with no donor keep their value.
+// Index arithmetic such as i-1 at i == 0 wraps around (size_t) and is rejected by checkIdx.
+void FluidSolver::extrapolateVelocity() {
+    auto &types = _MAC->_gType;
+
+    // In-bounds and FLUID.
+    auto fluidAt = [&types](size_t a, size_t b, size_t c) {
+        return types.checkIdx(a,b,c) && types(a,b,c) == FLUID;
+    };
+    // In-bounds and anything other than FLUID.
+    auto nonFluidAt = [&types](size_t a, size_t b, size_t c) {
+        return types.checkIdx(a,b,c) && types(a,b,c) != FLUID;
+    };
+
+    // U faces sit between cells (i-1,j,k) and (i,j,k); donors are the +/-y and +/-z neighbours.
+    auto &gridU = _MAC->_gU;
+    gridU.iterate([&](size_t i, size_t j, size_t k) {
+        if (!nonFluidAt(i-1,j,k) && !nonFluidAt(i,j,k)) {
+            return;
+        }
+        const bool yPlus  = fluidAt(i-1,j+1,k) || fluidAt(i,j+1,k);
+        const bool yMinus = fluidAt(i-1,j-1,k) || fluidAt(i,j-1,k);
+        const bool zPlus  = fluidAt(i-1,j,k+1) || fluidAt(i,j,k+1);
+        const bool zMinus = fluidAt(i-1,j,k-1) || fluidAt(i,j,k-1);
+
+        float sum = 0;
+        if (yPlus)  sum += gridU(i,j+1,k);
+        if (yMinus) sum += gridU(i,j-1,k);
+        if (zPlus)  sum += gridU(i,j,k+1);
+        if (zMinus) sum += gridU(i,j,k-1);
+
+        const int donors = yPlus + yMinus + zPlus + zMinus;
+        if (donors > 0) {
+            gridU(i,j,k) = sum / donors;
+        }
+    });
+
+    // V faces sit between cells (i,j-1,k) and (i,j,k); donors are the +/-x and +/-z neighbours.
+    auto &gridV = _MAC->_gV;
+    gridV.iterate([&](size_t i, size_t j, size_t k) {
+        if (!nonFluidAt(i,j-1,k) && !nonFluidAt(i,j,k)) {
+            return;
+        }
+        const bool xPlus  = fluidAt(i+1,j-1,k) || fluidAt(i+1,j,k);
+        const bool xMinus = fluidAt(i-1,j-1,k) || fluidAt(i-1,j,k);
+        const bool zPlus  = fluidAt(i,j-1,k+1) || fluidAt(i,j,k+1);
+        const bool zMinus = fluidAt(i,j-1,k-1) || fluidAt(i,j,k-1);
+
+        float sum = 0;
+        if (xPlus)  sum += gridV(i+1,j,k);
+        if (xMinus) sum += gridV(i-1,j,k);
+        if (zPlus)  sum += gridV(i,j,k+1);
+        if (zMinus) sum += gridV(i,j,k-1);
+
+        const int donors = xPlus + xMinus + zPlus + zMinus;
+        if (donors > 0) {
+            gridV(i,j,k) = sum / donors;
+        }
+    });
+
+    // W faces sit between cells (i,j,k-1) and (i,j,k); donors are the +/-x and +/-y neighbours.
+    auto &gridW = _MAC->_gW;
+    gridW.iterate([&](size_t i, size_t j, size_t k) {
+        if (!nonFluidAt(i,j,k-1) && !nonFluidAt(i,j,k)) {
+            return;
+        }
+        const bool xPlus  = fluidAt(i+1,j,k-1) || fluidAt(i+1,j,k);
+        const bool xMinus = fluidAt(i-1,j,k-1) || fluidAt(i-1,j,k);
+        const bool yPlus  = fluidAt(i,j+1,k-1) || fluidAt(i,j+1,k);
+        const bool yMinus = fluidAt(i,j-1,k-1) || fluidAt(i,j-1,k);
+
+        float sum = 0;
+        if (xPlus)  sum += gridW(i+1,j,k);
+        if (xMinus) sum += gridW(i-1,j,k);
+        if (yPlus)  sum += gridW(i,j+1,k);
+        if (yMinus) sum += gridW(i,j-1,k);
+
+        const int donors = xPlus + xMinus + yPlus + yMinus;
+        if (donors > 0) {
+            gridW(i,j,k) = sum / donors;
+        }
+    });
+
+}
+
+// Moves every particle forward by one step, saving the previous position in pos_old first.
+// With USETBB, particles are processed in parallel and displaced by step times the grid velocity
+// sampled at pos + 0.5*step*vel. Without USETBB, each particle simply moves by vel*step.
+// NOTE(review): the two build configurations integrate differently - confirm this is intended.
+void FluidSolver::updateParticlePositions(float step) {
+#ifdef USETBB
+    iterParticles([&](FluidParticle &p) {
+        p.pos_old = p.pos;
+
+        const glm::vec3 k1 = step*p.vel;
+        const glm::vec3 samplePos = p.pos + 0.5f*k1;
+        const glm::vec3 gridVelocity(
+                interpolateAttribute(samplePos, _MAC->_gU),
+                interpolateAttribute(samplePos, _MAC->_gV),
+                interpolateAttribute(samplePos, _MAC->_gW)
+        );
+        p.pos += step * gridVelocity;
+    });
+#else
+    iterParticles([&](FluidParticle &p) {
+        p.pos_old = p.pos;
+        p.pos += p.vel * step;
+    });
+#endif
+}
+
+// Undoes the last move of every particle whose segment pos_old -> pos collides with the container.
+// With USETBB (parallel), the velocity component along the reported normal is also zeroed.
+// Without USETBB, the particle is coloured red instead and its velocity is left unchanged.
+// NOTE(review): the two build configurations differ - confirm which response is intended.
+void FluidSolver::resolveCollisions() {
+#ifdef USETBB
+    iterParticles([&](FluidParticle &p) {
+        glm::vec3 hitNormal;
+        if (!_container->collides(p.pos_old, p.pos, hitNormal)) {
+            return;
+        }
+        // 1 on the axes tangent to the surface, 0 along an axis-aligned normal.
+        const glm::vec3 tangentMask = glm::vec3(1,1,1) - glm::abs(hitNormal);
+        p.vel *= tangentMask;
+        p.pos = p.pos_old;
+    });
+#else
+    iterParticles([&](FluidParticle &p) {
+        glm::vec3 hitNormal;
+        if (!_container->collides(p.pos_old, p.pos, hitNormal)) {
+            return;
+        }
+        p.col = glm::vec3(1,0,0);
+        p.pos = p.pos_old;
+    });
+#endif
+}
+
+// Rebuilds the per-cell bookkeeping after particles have moved:
+//   - empties every cell's particle list and resets every cell type to EMPTY,
+//   - assigns each particle to its cell, marks that cell FLUID and registers the particle there,
+//   - finally marks the six outer slabs of the type grid SOLID (overriding FLUID/EMPTY).
+// The cell index is validated before it is used; previously the type grid was written first and
+// only then checked, so an invalid index would have been written through.
+void FluidSolver::updateCells() {
+    _MAC->clear(std::vector<FluidParticle*>());
+    _MAC->_gType.clear(EMPTY);
+    for (FluidParticle &particle : _particles) {
+        particle.cell = _MAC->indexOf(particle.pos);
+        if (!_MAC->checkIdx(particle.cell)) {
+            //std::cerr << "particle out of bounds" << std::endl;
+            continue;
+        }
+        _MAC->_gType(particle.cell) = FLUID;
+        _MAC->atIdx(particle.cell).push_back(&particle);
+    }
+
+    std::function<void(size_t, size_t, size_t)> setSolid = [&](size_t i, size_t j, size_t k) {
+        _MAC->_gType(i,j,k) = SOLID;
+    };
+
+    const size_t nx = _MAC->_gType.countX();
+    const size_t ny = _MAC->_gType.countY();
+    const size_t nz = _MAC->_gType.countZ();
+
+    _MAC->_gType.iterateRegion(0,0,0, 1,ny,nz, setSolid);       // x = 0 slab
+    _MAC->_gType.iterateRegion(nx-1,0,0, nx,ny,nz, setSolid);   // x = nx-1 slab
+    _MAC->_gType.iterateRegion(0,0,0, nx,1,nz, setSolid);       // y = 0 slab
+    _MAC->_gType.iterateRegion(0,ny-1,0, nx,ny,nz, setSolid);   // y = ny-1 slab
+    _MAC->_gType.iterateRegion(0,0,0, nx,ny,1, setSolid);       // z = 0 slab
+    _MAC->_gType.iterateRegion(0,0,nz-1, nx,ny,nz, setSolid);   // z = nz-1 slab
+
+}
+
+// Linear falloff weight: 1 - r/h when 0 <= r <= h, otherwise 0.
+// Assumes MATHIFELSE(a, b, cond) yields b when cond holds and a otherwise, as its other uses in
+// this file suggest. Earlier cubic-spline and poly6 forms were replaced by this linear falloff.
+inline float kernel(float r, float h) {
+    const bool insideSupport = 0 <= r && r <= h;
+    const float falloff = 1.f-r/h;
+    return MATHIFELSE(0, falloff, insideSupport);
+}
+
+
+// Splats one particle attribute onto `grid` as a normalised weighted average.
+//   offset  - byte offset of the attribute (of type T) inside a FluidParticle
+//   grid    - destination grid; every sample is overwritten
+//   radius  - influence radius; particles are gathered from MAC cells within
+//             ceil(radius / cell size) cells of the sample
+//   zeroVal - value written when no particle contributes, and the accumulator's start value
+// For each grid sample the weights kernel(squared distance, 2*radius^2) are summed first and
+// then used to normalise the second accumulation pass. Samples are processed via grid.iterate;
+// each invocation writes only its own grid(I,J,K).
+// The attribute address is computed with const char* arithmetic: arithmetic on void* (used
+// previously) is a compiler extension and also discarded the particle's constness.
+template<typename T> void FluidSolver::particleAttributeToGrid(std::size_t offset, Grid<T> &grid, float radius, T zeroVal) {
+    const std::size_t attributeSize = sizeof(T);
+    const std::size_t cellRadius = (size_t) glm::ceil(radius / _cell_size);
+    const float support = 2*radius*radius;
+
+    grid.iterate([&](size_t I, size_t J, size_t K) {
+        glm::vec3 gridPos = grid.positionOf(I,J,K);
+
+        // Half-open range [s*, e*) of MAC cells around the cell containing this sample.
+        size_t mI, mJ, mK, si, ei, sj, ej, sk, ek;
+        _MAC->indexOf(gridPos, mI, mJ, mK);
+        _MAC->getNeighboorhood(mI, mJ, mK, cellRadius, si, ei, sj, ej, sk, ek);
+
+        // Pass 1: total weight of all nearby particles.
+        float totalWeight = 0.f;
+        for (size_t i = si; i < ei; i++) {
+            for (size_t j = sj; j < ej; j++) {
+                for (size_t k = sk; k < ek; k++) {
+                    for (FluidParticle const *particle : _MAC->atIdx(i, j, k)) {
+                        float dist = glm::distance2(particle->pos, gridPos);
+                        totalWeight += kernel(dist, support);
+                    }
+                }
+            }
+        }
+
+        if (totalWeight == 0) {
+            grid(I,J,K) = zeroVal;
+            return;
+        }
+
+        // Pass 2: accumulate the attribute, each particle scaled by its share of the weight.
+        T temp;
+        T gridVal = zeroVal;
+        for (size_t i = si; i < ei; i++) {
+            for (size_t j = sj; j < ej; j++) {
+                for (size_t k = sk; k < ek; k++) {
+                    for (FluidParticle const *particle : _MAC->atIdx(i, j, k)) {
+                        float dist = glm::distance2(particle->pos, gridPos);
+                        float weight = kernel(dist, support);
+                        const char *address = reinterpret_cast<const char *>(particle) + offset;
+                        std::memcpy(&temp, address, attributeSize);
+                        gridVal += temp * (weight / totalWeight);
+                    }
+                }
+            }
+        }
+
+        grid(I,J,K) = gridVal;
+    });
+
+}
+
+// Trilinearly interpolates `grid` at world position `pos`.
+// (i,j,k) is the lower and (I,J,K) the upper corner of the enclosing cell, both clamped to the
+// last valid index. When the two indices coincide on an axis, the lower sample is used as-is.
+// Fixes over the previous version:
+//   - the fourth z-blend mixed grid(I,J,k) with grid(i,J,K) instead of grid(I,J,K);
+//   - the lower indices were not clamped although fractionalIndexOf may return exactly count,
+//     which read one past the end of the grid.
+template<typename T> T FluidSolver::interpolateAttribute(const glm::vec3 &pos, Grid<T> &grid) {
+    glm::vec3 idx = grid.fractionalIndexOf(pos);
+    const size_t lastX = grid.countX()-1;
+    const size_t lastY = grid.countY()-1;
+    const size_t lastZ = grid.countZ()-1;
+    size_t i = std::min((size_t) floor(idx.x), lastX);
+    size_t j = std::min((size_t) floor(idx.y), lastY);
+    size_t k = std::min((size_t) floor(idx.z), lastZ);
+    size_t I = std::min((size_t) ceil(idx.x), lastX);
+    size_t J = std::min((size_t) ceil(idx.y), lastY);
+    size_t K = std::min((size_t) ceil(idx.z), lastZ);
+
+    T k1, k2, k3, k4, j1, j2, val;
+
+    // this is reverse from what is expected because we want smaller value (closer distance) to have larger influence
+    // Blend along z on the four vertical edges of the cell.
+    k1 = MATHIFELSE((K-idx.z) * grid(i,j,k) + (idx.z-k) * grid(i,j,K), grid(i,j,k), k==K);
+    k2 = MATHIFELSE((K-idx.z) * grid(i,J,k) + (idx.z-k) * grid(i,J,K), grid(i,J,k), k==K);
+    k3 = MATHIFELSE((K-idx.z) * grid(I,j,k) + (idx.z-k) * grid(I,j,K), grid(I,j,k), k==K);
+    k4 = MATHIFELSE((K-idx.z) * grid(I,J,k) + (idx.z-k) * grid(I,J,K), grid(I,J,k), k==K);
+
+    // Blend along y, then along x.
+    j1 = MATHIFELSE((J-idx.y) * k1 + (idx.y-j) * k2, k1, j==J);
+    j2 = MATHIFELSE((J-idx.y) * k3 + (idx.y-j) * k4, k3, j==J);
+
+    val = MATHIFELSE((I-idx.x) * j1 + (idx.x-i) * j2, j1, i==I);
+
+    return val;
+}
+
+void FluidSolver::update(float step) { // Runs one full simulation step of length `step`; the call order below is the pipeline.
+    projectVelocitiesToGrid();
+    gravitySolve(step); // adds g*step to every V-grid sample
+    enforceBoundary();
+    pressureSolve(step); // solves for pressure and subtracts its gradient from the U/V/W grids
+    enforceBoundary();
+    extrapolateVelocity(); // fills faces next to non-fluid cells from neighbouring faces that touch fluid
+    enforceBoundary();
+    transferVelocitiesToParticles();
+    updateParticlePositions(step); // saves pos into pos_old, then moves each particle
+    resolveCollisions(); // particles that collide with the container are put back at pos_old
+    updateCells(); // rebuilds per-cell particle lists and the FLUID/EMPTY/SOLID cell types
+
+    frame++; // one more completed step
+}
+
+// Invokes `cb` once for every particle. When built with USETBB and `parallel` is true the calls
+// are distributed with tbb::parallel_for; in every other case they run sequentially in order.
+void FluidSolver::iterParticles(const std::function<void(FluidParticle &particle)> &cb, bool parallel) {
+#ifdef USETBB
+    if (parallel) {
+        tbb::parallel_for(tbb::blocked_range<size_t>(0, _particles.size()), [&](const tbb::blocked_range<size_t> &chunk) {
+            for (size_t n = chunk.begin(); n != chunk.end(); ++n) {
+                cb(_particles[n]);
+            }
+        });
+        return;
+    }
+#endif
+    for (FluidParticle &current : _particles) {
+        cb(current);
+    }
+}
\ No newline at end of file
diff --git a/core/solver/FluidSolver.h b/core/solver/FluidSolver.h
new file mode 100644
index 00000000..6074dd97
--- /dev/null
+++ b/core/solver/FluidSolver.h
@@ -0,0 +1,55 @@
+//
+// Created by austin on 2/28/16.
+//
+
+#ifndef FLUIDSOLVER_FLUIDSOLVER_H
+#define FLUIDSOLVER_FLUIDSOLVER_H
+
+#include <core/util/math.h>
+#include <core/geometry/GeoObject.h>
+#include <vector>
+#include "grid/MACGrid.h"
+#include "FluidParticle.h"
+
+class FluidSolver { // Fluid solver that owns the particle list and advances it through update().
+    friend class ParticlesPainter;
+    friend class ParticlesWriter;
+public:
+    FluidSolver(float particleSep, float gridSize);
+    ~FluidSolver();
+
+    void setContainer(GeoObject* container);
+    void addFluid(const GeoObject &fluid);
+    void init();
+
+    void projectVelocitiesToGrid();
+    void transferVelocitiesToParticles();
+    void enforceBoundary();
+    void pressureSolve(float step); // builds and solves the sparse pressure system, then subtracts the pressure gradient from the U/V/W grids
+    void gravitySolve(float step); // adds g*step to every V-grid sample
+    void extrapolateVelocity(); // fills faces next to non-fluid cells from neighbouring faces that touch fluid
+    void updateParticlePositions(float step); // saves pos into pos_old and moves each particle
+    void resolveCollisions(); // restores pos_old for particles that collide with _container
+    void updateCells(); // rebuilds per-cell particle lists and cell types; outer slabs become SOLID
+
+    void update(float step = 0.04166f); // one full step; NOTE(review): default is about 1/24, presumably one frame at 24 fps - confirm
+
+    GeoObject* _container; // collision geometry queried by resolveCollisions()
+    MACGrid<std::vector<FluidParticle*> >* _MAC; // grid whose cells hold pointers into _particles
+
+private:
+    std::vector<FluidParticle> _particles;
+    float particle_radius;
+    float _cell_size;
+    int frame; // incremented at the end of every update()
+
+    template <typename T> void particleAttributeToGrid(std::size_t offset, Grid<T> &grid, float radius, T zeroVal); // offset = byte offset of the attribute inside FluidParticle
+    template <typename T> T interpolateAttribute(const glm::vec3 &pos, Grid<T> &grid); // trilinear sample of grid at pos
+
+    void iterParticles(const std::function<void(FluidParticle &particle)> &cb, bool parallel=true); // calls cb per particle; parallel only takes effect when built with USETBB
+
+    static float g; // acceleration added to the V grid (as g*step) in gravitySolve()
+};
+
+
+#endif //FLUIDSOLVER_FLUIDSOLVER_H
diff --git a/core/solver/grid/Grid.cpp b/core/solver/grid/Grid.cpp
new file mode 100644
index 00000000..30e5bd75
--- /dev/null
+++ b/core/solver/grid/Grid.cpp
@@ -0,0 +1,315 @@
+//
+// Created by austin on 3/20/16.
+//
+
+#include "Grid.h"
+#include <core/solver/FluidParticle.h>
+
+#include <tbb/parallel_for.h>
+#include <tbb/blocked_range3d.h>
+#include <core/util/hacks.h>
+#include <iostream>
+
+// Default constructor: an empty grid with zero extent.
+// All scalar members are initialised explicitly so that countX()/countY()/countZ() on a
+// default-constructed grid return 0 instead of reading indeterminate values.
+template <typename T> Grid<T>::Grid() :
+        _origin(0.f),
+        _offset(0.f),
+        _dim(0.f),
+        _cellSize(0.f),
+        _countX(0),
+        _countY(0),
+        _countZ(0) {
+}
+
+// Nothing to release by hand: the members clean up after themselves.
+template <typename T> Grid<T>::~Grid() = default;
+
+// Builds a grid anchored at `origin` with cells of edge length `size`.
+// The sample count per axis is floor((dim - offset) / size) + 1, and the storage is filled with
+// value-initialised elements. The resulting dimensions are logged to stdout.
+template <typename T> Grid<T>::Grid(const glm::vec3 &origin, const glm::vec3 &offset, const glm::vec3 &dim, float size) :
+        _origin(origin),
+        _offset(offset),
+        _dim(dim),
+        _cellSize(size),
+        _countX((size_t) (std::floor((dim.x - offset.x) / size)+1)),
+        _countY((size_t) (std::floor((dim.y - offset.y) / size)+1)),
+        _countZ((size_t) (std::floor((dim.z - offset.z) / size)+1)) {
+    const unsigned long cellTotal = (unsigned long) (_countX * _countY * _countZ);
+    _contents.assign(cellTotal, T());
+
+    std::cout << "Constructing " << _countX << "x" << _countY << "x" << _countZ << " grid...";
+    std::cout << glm::to_string(origin) << " to " << glm::to_string(origin+dim) << std::endl;
+}
+/*
+template <typename T> template <typename C> Grid<T>::Grid(const Grid<C> &rhs) :
+        _origin(rhs._origin), 
+        _offset(rhs._offset), 
+        _dim(rhs._dim), 
+        _cellSize(rhs._cellSize),
+        _countX(rhs._countX),
+        _countY(rhs._countY),
+        _countZ(rhs._countZ) {
+    _contents = std::vector<T>((unsigned long) (_countX * _countY * _countZ));
+}*/
+
+// ---- Unchecked element access ------------------------------------------------------------
+// Storage is x-fastest: flat index = k*countX*countY + j*countX + i (see fromIJK).
+// None of these accessors validates its indices.
+
+// Access by flat index.
+template <typename T> T& Grid<T>::operator()(std::size_t idx) {
+    return _contents[idx];
+}
+
+template <typename T> const T& Grid<T>::operator()(std::size_t idx) const {
+    return _contents[idx];
+}
+
+// Access by (i,j,k).
+template <typename T> T& Grid<T>::operator()(std::size_t i, std::size_t j, std::size_t k) {
+    return _contents[fromIJK(i, j, k)];
+}
+
+template <typename T> const T& Grid<T>::operator()(std::size_t i, std::size_t j, std::size_t k) const {
+    return _contents[fromIJK(i, j, k)];
+}
+
+
+// Named equivalents of operator()(i,j,k).
+template <typename T> T& Grid<T>::atIdx(std::size_t i, std::size_t j, std::size_t k) {
+    return _contents[fromIJK(i, j, k)];
+}
+
+template <typename T> const T& Grid<T>::atIdx(std::size_t i, std::size_t j, std::size_t k) const {
+    return _contents[fromIJK(i, j, k)];
+}
+
+
+// Access by integer vector (x,y,z) = (i,j,k).
+template <typename T> T& Grid<T>::operator()(const glm::ivec3 &idx) {
+    return _contents[fromIJK(idx)];
+}
+
+template <typename T> const T& Grid<T>::operator()(const glm::ivec3 &idx) const {
+    return _contents[fromIJK(idx)];
+}
+
+// Named equivalents of operator()(ivec3).
+template <typename T> T& Grid<T>::atIdx(const glm::ivec3 &idx) {
+    return _contents[fromIJK(idx)];
+}
+
+template <typename T> const T& Grid<T>::atIdx(const glm::ivec3 &idx) const {
+    return _contents[fromIJK(idx)];
+}
+
+
+// ---- Access by world position -------------------------------------------------------------
+// The position is mapped to a cell with indexOf(), which clamps to the grid, so these accessors
+// always return a valid element.
+
+template <typename T> T& Grid<T>::at(float x, float y, float z) {
+    const glm::vec3 pos(x, y, z);
+    return at(pos);
+}
+
+template <typename T> const T& Grid<T>::at(float x, float y, float z) const {
+    const glm::vec3 pos(x, y, z);
+    return at(pos);
+}
+
+template <typename T> T& Grid<T>::at(const glm::vec3 &pos) {
+    const glm::ivec3 cell = indexOf(pos);
+    return (*this)((size_t) cell.x, (size_t) cell.y, (size_t) cell.z);
+}
+
+template <typename T> const T& Grid<T>::at(const glm::vec3 &pos) const {
+    const glm::ivec3 cell = indexOf(pos);
+    return (*this)((size_t) cell.x, (size_t) cell.y, (size_t) cell.z);
+}
+
+// Maps a world position to the integer cell index, clamped to [0, count-1] per axis.
+// The coordinates are truncated to int first and clamped afterwards.
+// The offset term is multiplied by 0, so _offset does not shift the result here.
+template <typename T> glm::ivec3 Grid<T>::indexOf(const glm::vec3 &pos) const {
+    const glm::vec3 scaled = (pos - 0.f*_offset - _origin) / _cellSize;
+    const glm::ivec3 truncated((int) scaled.x, (int) scaled.y, (int) scaled.z);
+    const glm::ivec3 lowest(0,0,0);
+    const glm::ivec3 highest(_countX-1, _countY-1, _countZ-1);
+    return glm::clamp(truncated, lowest, highest);
+}
+
+// Same mapping, returned through i/j/k. Here the coordinates are clamped as floats first and
+// truncated to size_t afterwards.
+template <typename T> void Grid<T>::indexOf(const glm::vec3 &pos, size_t &i, size_t &j, size_t &k) const {
+    const glm::vec3 scaled = (pos - 0.f*_offset - _origin) / _cellSize;
+    const glm::vec3 lowest(0,0,0);
+    const glm::vec3 highest(_countX-1, _countY-1, _countZ-1);
+    const glm::vec3 bounded = glm::clamp(scaled, lowest, highest);
+    i = (size_t) bounded.x;
+    j = (size_t) bounded.y;
+    k = (size_t) bounded.z;
+}
+
+
+// World position of sample (i,j,k): index * cell size, shifted by _offset and then _origin.
+template <typename T> glm::vec3 Grid<T>::positionOf(const glm::ivec3 &idx) const {
+    const glm::vec3 local(idx.x * _cellSize, idx.y * _cellSize, idx.z * _cellSize);
+    return local + _offset + _origin;
+}
+
+template <typename T> glm::vec3 Grid<T>::positionOf(size_t i, size_t j, size_t k) const {
+    const glm::vec3 local(i * _cellSize, j * _cellSize, k * _cellSize);
+    return local + _offset + _origin;
+}
+
+// Continuous (non-truncated) index of a world position, taking _offset and _origin into account.
+// Each component is clamped to [0, count]; note the upper bound is count, not count-1.
+template <typename T> glm::vec3 Grid<T>::fractionalIndexOf(const glm::vec3 &pos) const {
+    const glm::vec3 unclamped = (pos - _offset - _origin) / _cellSize;
+    const glm::vec3 lowest(0,0,0);
+    const glm::vec3 highest(_countX, _countY, _countZ);
+    return glm::clamp(unclamped, lowest, highest);
+}
+
+// Inverse of fromIJK: splits a flat index into (i,j,k), returned as an integer vector.
+template <typename T> glm::ivec3 Grid<T>::toIJK(const std::size_t index) const {
+    return glm::ivec3(index % _countX,
+                      (index / _countX) % _countY,
+                      index / (_countX * _countY));
+}
+
+// Inverse of fromIJK: splits a flat index into (i,j,k), returned through the references.
+template <typename T> void Grid<T>::toIJK(const std::size_t index, size_t &i, size_t &j, size_t &k) const {
+    const size_t slab = _countX * _countY;   // elements per z-layer
+    const size_t withinSlab = index % slab;
+    k = index / slab;
+    j = withinSlab / _countX;
+    i = withinSlab % _countX;
+}
+
+// Flat storage index of (i,j,k): k*countX*countY + j*countX + i, written in nested form.
+template <typename T> std::size_t Grid<T>::fromIJK(const std::size_t i, const std::size_t j, const std::size_t k) const {
+    return (k * _countY + j) * _countX + i;
+}
+
+// Same, for an integer vector; each component is converted to size_t before the arithmetic.
+template <typename T> std::size_t Grid<T>::fromIJK(const glm::ivec3 &ijk) const {
+    const std::size_t x = (std::size_t) ijk.x;
+    const std::size_t y = (std::size_t) ijk.y;
+    const std::size_t z = (std::size_t) ijk.z;
+    return (z * _countY + y) * _countX + x;
+}
+
+// Calls cb(i,j,k) once for every element of the grid. When built with USETBB and `parallel` is
+// true the flat index range is split with tbb::parallel_for; otherwise elements are visited
+// sequentially in flat-index order.
+template <typename T> void Grid<T>::iterate(const std::function<void(size_t i, size_t j, size_t k)> &cb, bool parallel) {
+#ifdef USETBB
+    if (parallel) {
+        tbb::parallel_for(tbb::blocked_range<size_t>(0, _contents.size()), [&](const tbb::blocked_range<size_t>& chunk) {
+            for (size_t flat = chunk.begin(); flat != chunk.end(); ++flat) {
+                const glm::ivec3 cell = toIJK(flat);
+                cb(cell.x, cell.y, cell.z);
+            }
+        });
+        return;
+    }
+#endif
+    for (size_t flat = 0; flat < _contents.size(); flat++) {
+        const glm::ivec3 cell = toIJK(flat);
+        cb(cell.x, cell.y, cell.z);
+    }
+}
+
+// Calls cb(ii,jj,kk) for every index in the half-open box [i,I) x [j,J) x [k,K).
+// When built with USETBB and `parallel` is true the box is split with tbb::parallel_for;
+// otherwise it is walked sequentially with ii outermost and kk innermost.
+// An unused blocked_range3d built with its arguments in the wrong order was removed.
+template <typename T> void Grid<T>::iterateRegion(size_t i, size_t j, size_t k, size_t I, size_t J, size_t K, const std::function<void(size_t i, size_t j, size_t k)> &cb, bool parallel) {
+#ifdef USETBB
+    if (parallel) {
+        // blocked_range3d takes (page_begin, page_end, row_begin, row_end, col_begin, col_end):
+        // pages carry the first index, rows the second, cols the third.
+        tbb::parallel_for(tbb::blocked_range3d<size_t>(i,I,j,J,k,K), [&](const tbb::blocked_range3d<size_t> &r) {
+            for(size_t ii=r.pages().begin(), i_end=r.pages().end(); ii<i_end; ii++){
+                for (size_t jj=r.rows().begin(), j_end=r.rows().end(); jj<j_end; jj++){
+                    for (size_t kk=r.cols().begin(), k_end=r.cols().end(); kk<k_end; kk++){
+                        cb(ii,jj,kk);
+                    }
+                }
+            }
+        });
+        return;
+    }
+#endif
+    for (size_t ii = i; ii < I; ii++) {
+        for (size_t jj = j; jj < J; jj++) {
+            for (size_t kk = k; kk < K; kk++) {
+                cb(ii,jj,kk);
+            }
+        }
+    }
+}
+
+// Calls cb for every index within `r` cells of (i,j,k) on each axis, clipped to the grid,
+// i.e. the inclusive box [max(i-r,0), min(i+r,countX-1)] and likewise for j and k.
+// The bounds come from getNeighboorhood() and the traversal from iterateRegion(), which fixes
+// two defects of the previous hand-rolled version:
+//   - the lower bound i-r wrapped around (size_t) whenever 0 < i < r;
+//   - the parallel path treated the inclusive upper bound as exclusive and so skipped the last
+//     slice on every axis, unlike the serial path.
+template <typename T> void Grid<T>::iterateNeighborhood(size_t i, size_t j, size_t k, size_t r, const std::function<void(size_t i, size_t j, size_t k)> &cb, bool parallel) {
+    // Half-open bounds [s*, e*).
+    size_t si, ei, sj, ej, sk, ek;
+    getNeighboorhood(i, j, k, r, si, ei, sj, ej, sk, ek);
+    iterateRegion(si, sj, sk, ei, ej, ek, cb, parallel);
+}
+
+// Computes the half-open index box [si,ei) x [sj,ej) x [sk,ek) that covers every cell within
+// `r` cells of (i,j,k), clipped to the grid. Lower bounds saturate at 0 instead of wrapping.
+template <typename T> void Grid<T>::getNeighboorhood(size_t i, size_t j, size_t k, size_t r, size_t &si, size_t &ei, size_t &sj, size_t &ej, size_t &sk, size_t &ek) {
+    si = (r > i) ? 0 : i - r;
+    sj = (r > j) ? 0 : j - r;
+    sk = (r > k) ? 0 : k - r;
+    ei = std::min(i + r + 1, _countX);
+    ej = std::min(j + r + 1, _countY);
+    ek = std::min(k + r + 1, _countZ);
+}
+
+
+// Overwrites every element with a copy of `zeroVal` (in parallel when built with USETBB).
+template <typename T> void Grid<T>::clear(const T &zeroVal) {
+#ifdef USETBB
+    tbb::parallel_for(tbb::blocked_range<size_t>(0, _contents.size()), [&](const tbb::blocked_range<size_t>& chunk) {
+        for (size_t n = chunk.begin(), last = chunk.end(); n != last; ++n) {
+            _contents[n] = zeroVal;
+        }
+    });
+#else
+    for (T &cell : _contents) {
+        cell = zeroVal;
+    }
+#endif
+}
+
+// True when (i,j,k) addresses an element of the grid. The indices are unsigned, so only the
+// upper bounds need testing; a wrapped-around "negative" index is simply too large.
+template <typename T> bool Grid<T>::checkIdx(size_t i, size_t j, size_t k) const {
+    if (i >= _countX) return false;
+    if (j >= _countY) return false;
+    return k < _countZ;
+}
+// Integer-vector overload; negative components convert to huge unsigned values and fail the test.
+template <typename T> bool Grid<T>::checkIdx(const glm::ivec3 &idx) const {
+    const size_t i = (size_t) idx.x;
+    const size_t j = (size_t) idx.y;
+    const size_t k = (size_t) idx.z;
+    return checkIdx(i, j, k);
+}
+
+// ---- Size queries -------------------------------------------------------------------------
+
+// Number of samples along x.
+template <typename T> size_t Grid<T>::countX() const { return _countX; }
+
+// Number of samples along y.
+template <typename T> size_t Grid<T>::countY() const { return _countY; }
+
+// Number of samples along z.
+template <typename T> size_t Grid<T>::countZ() const { return _countZ; }
+
+// Total number of stored elements.
+template <typename T> size_t Grid<T>::size() const { return _contents.size(); }
+
+// Explicit instantiations: the member definitions live in this .cpp file, so each element type
+// instantiated here is the set for which Grid's members are emitted from this translation unit.
+template class Grid<float>;
+template class Grid<int>;
+template class Grid<std::vector<FluidParticle*> >;
diff --git a/core/solver/grid/Grid.h b/core/solver/grid/Grid.h
new file mode 100644
index 00000000..b7fe527d
--- /dev/null
+++ b/core/solver/grid/Grid.h
@@ -0,0 +1,78 @@
+//
+// Created by austin on 3/20/16.
+//
+
+#ifndef FLUIDSOLVER_GRID_H
+#define FLUIDSOLVER_GRID_H
+
+#include <vector>
+#include <core/util/math.h>
+#include <functional>
+#include <core/util/flags.h>
+
+template <typename T> class Grid {
+    friend class GridIterator;
+    friend class GridVectorAttributePainter;
+    friend class GridScalarAttributePainter;
+public:
+    Grid();
+    Grid(const glm::vec3 &origin, const glm::vec3 &offset, const glm::vec3 &dim, float size);
+    //template <typename C> Grid(const Grid<C> &rhs);
+    // Cell access by flat index, by (i, j, k) triple, or by integer vector.
+    T& operator()(std::size_t idx);
+    const T& operator()(std::size_t idx) const;
+    T& operator()(std::size_t i, std::size_t j, std::size_t k);
+    const T& operator()(std::size_t i, std::size_t j, std::size_t k) const;
+    T& atIdx(std::size_t i, std::size_t j, std::size_t k);
+    const T& atIdx(std::size_t i, std::size_t j, std::size_t k) const;
+    T& operator()(const glm::ivec3 &idx);
+    const T& operator()(const glm::ivec3 &idx) const;
+    T& atIdx(const glm::ivec3 &idx);
+    const T& atIdx(const glm::ivec3 &idx) const;
+    // Cell access by floating-point coordinates (the position-to-cell mapping lives in the implementation file).
+    T& at(float x, float y, float z);
+    const T& at(float x, float y, float z) const;
+    T& at(const glm::vec3 &pos);
+    const T& at(const glm::vec3 &pos) const;
+    // Conversions between floating-point positions and cell indices.
+    glm::ivec3 indexOf(const glm::vec3 &pos) const;
+    void indexOf(const glm::vec3 &pos, size_t &i, size_t &j, size_t &k) const;
+    glm::vec3 positionOf(const glm::ivec3 &idx) const;
+    glm::vec3 positionOf(size_t i, size_t j, size_t k) const;
+    glm::vec3 fractionalIndexOf(const glm::vec3 &pos) const;
+    // Conversions between a flat index and an (i, j, k) triple.
+    glm::ivec3 toIJK(const std::size_t index) const;
+    void toIJK(const std::size_t index, size_t &i, size_t &j, size_t &k) const;
+    std::size_t fromIJK(const std::size_t i, const std::size_t j, const std::size_t k) const;
+    std::size_t fromIJK(const glm::ivec3 &ijk) const;
+    // Callback-based traversal; cb receives an (i, j, k) triple and the parallel flag defaults to true.
+    void iterate(const std::function<void(size_t i, size_t j, size_t k)> &cb, bool parallel=true);
+    void iterateRegion(size_t i, size_t j, size_t k, size_t I, size_t J, size_t K, const std::function<void(size_t i, size_t j, size_t k)> &cb, bool parallel=true);
+    // Neighborhood of radius r around cell (i, j, k); getNeighboorhood writes the index bounds into the s*/e* outputs.
+    void iterateNeighborhood(size_t i, size_t j, size_t k, size_t r, const std::function<void(size_t i, size_t j, size_t k)> &cb, bool parallel=true);
+    void getNeighboorhood(size_t i, size_t j, size_t k, size_t r, size_t &si, size_t &ei, size_t &sj, size_t &ej, size_t &sk, size_t &ek);
+    // Assigns zeroVal to every element of the backing storage.
+    void clear(const T &zeroVal);
+    // True when every index is below the matching per-axis count.
+    bool checkIdx(size_t i, size_t j, size_t k) const;
+    bool checkIdx(const glm::ivec3 &idx) const;
+    // Per-axis counts used by the bounds checks, and the number of stored elements.
+    size_t countX() const;
+    size_t countY() const;
+    size_t countZ() const;
+    size_t size() const;
+    // Virtual because Grid is used as a base class (MACGrid derives from it).
+    virtual ~Grid();
+
+private:
+    std::vector<T> _contents; // flat backing storage for the cells
+    glm::vec3 _origin;   // presumably the constructor's origin argument — confirm in Grid.cpp
+    glm::vec3 _offset;   // presumably the constructor's offset argument — confirm in Grid.cpp
+    glm::vec3 _dim;      // presumably the constructor's dim argument — confirm in Grid.cpp
+    float _cellSize;     // presumably the constructor's size argument — confirm in Grid.cpp
+    size_t _countX;      // exclusive upper bound for index i (see checkIdx)
+    size_t _countY;      // exclusive upper bound for index j (see checkIdx)
+    size_t _countZ;      // exclusive upper bound for index k (see checkIdx)
+};
+
+#endif //FLUIDSOLVER_GRID_H
diff --git a/core/solver/grid/MACGrid.cpp b/core/solver/grid/MACGrid.cpp
new file mode 100644
index 00000000..c304c9b5
--- /dev/null
+++ b/core/solver/grid/MACGrid.cpp
@@ -0,0 +1,29 @@
+//
+// Created by austin on 3/20/16.
+//
+
+#include "MACGrid.h"
+#include <core/solver/FluidParticle.h>
+
+template <typename T> MACGrid<T>::MACGrid() {
+    // Nothing to do: the Grid<T> base and every member grid are default-constructed.
+}
+
+template <typename T> MACGrid<T>::~MACGrid() {
+    // Nothing to release explicitly: member grids and the base are destroyed automatically.
+}
+
+template <typename T> MACGrid<T>::MACGrid(const glm::vec3 &origin, const glm::vec3 &dim, float size) :
+        Grid<T>(origin, size*glm::vec3(0.5f,0.5f,0.5f), dim, size),
+        _gU(origin, size*glm::vec3(0.0f,0.5f,0.5f), dim, size),
+        _gV(origin, size*glm::vec3(0.5f,0.0f,0.5f), dim, size),
+        _gW(origin, size*glm::vec3(0.5f,0.5f,0.0f), dim, size),
+        _gU_old(origin, size*glm::vec3(0.0f,0.5f,0.5f), dim, size),
+        _gV_old(origin, size*glm::vec3(0.5f,0.0f,0.5f), dim, size),
+        _gW_old(origin, size*glm::vec3(0.5f,0.5f,0.0f), dim, size),
+        _gP(origin, size*glm::vec3(0.5f,0.5f,0.5f), dim, size),
+        _gType(origin, size*glm::vec3(0.5f,0.5f,0.5f), dim, size) {
+    // Every grid shares origin, dim and size; only the per-grid offset passed above differs.
+}
+
+template class MACGrid<std::vector<FluidParticle*, std::allocator<FluidParticle*> > >; // explicit instantiation definition: the only MACGrid element type emitted by this file
\ No newline at end of file
diff --git a/core/solver/grid/MACGrid.h b/core/solver/grid/MACGrid.h
new file mode 100644
index 00000000..85a3c71a
--- /dev/null
+++ b/core/solver/grid/MACGrid.h
@@ -0,0 +1,40 @@
+//
+// Created by austin on 3/20/16.
+//
+
+#ifndef FLUIDSOLVER_MACGRID_H
+#define FLUIDSOLVER_MACGRID_H
+
+#include "Grid.h"
+#include <core/util/flags.h>
+
+enum CellType { // per-cell classification; presumably the values stored in MACGrid::_gType — confirm
+    EMPTY, // = 0
+    FLUID, // = 1
+    SOLID  // = 2
+};
+
+template <typename T> class MACGrid : public Grid<T> {
+    friend class FluidSolver; // gives FluidSolver access to the private section below
+public:
+    // The sized constructor builds the Grid<T> base and every member grid from the same origin/dim/size.
+    MACGrid();
+    MACGrid(const glm::vec3 &origin, const glm::vec3 &dim, float size);
+    virtual ~MACGrid();
+    // Member grids; names suggest staggered velocity components, pressure and cell type — confirm with the solver.
+    Grid<float> _gU;     // constructed with offset size*(0, 0.5, 0.5)
+    Grid<float> _gV;     // constructed with offset size*(0.5, 0, 0.5)
+    Grid<float> _gW;     // constructed with offset size*(0.5, 0.5, 0)
+    Grid<float> _gU_old; // same construction arguments as _gU
+    Grid<float> _gV_old; // same construction arguments as _gV
+    Grid<float> _gW_old; // same construction arguments as _gW
+    Grid<float> _gP;     // constructed with offset size*(0.5, 0.5, 0.5)
+    Grid<int> _gType;    // constructed with offset size*(0.5, 0.5, 0.5)
+//    Grid<float> _gDiv;
+
+private:
+    std::vector<T> _contents; // NOTE(review): distinct from the private Grid<T>::_contents and not referenced in MACGrid.cpp — confirm it is needed
+};
+
+
+#endif //FLUIDSOLVER_MACGRID_H
diff --git a/core/util/flags.h b/core/util/flags.h
new file mode 100644
index 00000000..265b643c
--- /dev/null
+++ b/core/util/flags.h
@@ -0,0 +1,11 @@
+//
+// Created by austin on 3/22/16.
+//
+
+#ifndef FLUIDSOLVER_FLAGS_H
+#define FLUIDSOLVER_FLAGS_H
+
+#define USETBB // enables the code paths wrapped in #ifdef USETBB (e.g. tbb::parallel_for in Grid<T>::clear)
+//#define SPLATTING
+
+#endif //FLUIDSOLVER_FLAGS_H
\ No newline at end of file
diff --git a/core/util/hacks.h b/core/util/hacks.h
new file mode 100644
index 00000000..d5297554
--- /dev/null
+++ b/core/util/hacks.h
@@ -0,0 +1,10 @@
+//
+// Created by austin on 3/21/16.
+//
+
+#ifndef FLUIDSOLVER_HACKS_H
+#define FLUIDSOLVER_HACKS_H
+// Arithmetic select: yields expr1 when cond is 0 and expr2 when cond is 1; both operands are always evaluated.
+#define MATHIFELSE(expr1, expr2, cond) ( (expr1)*(1-(cond)) + (expr2)*(cond) )
+
+#endif //FLUIDSOLVER_HACKS_H
diff --git a/core/util/math.h b/core/util/math.h
new file mode 100644
index 00000000..c0bcf7f8
--- /dev/null
+++ b/core/util/math.h
@@ -0,0 +1,37 @@
+//
+// Created by austin on 2/27/16.
+//
+
+#ifndef FLUIDSOLVER_MATH_H
+#define FLUIDSOLVER_MATH_H
+
+#define GLM_FORCE_RADIANS
+
+#include <glm/glm.hpp>
+#include <glm/gtc/matrix_transform.hpp>
+#include <glm/gtc/type_ptr.hpp>
+#include <glm/gtx/string_cast.hpp>
+
+static const float PI = 3.14159265358979323846f; // literal is rounded to float precision
+static const float TWO_PI = 2 * PI;
+static const float DEG2RAD = PI / 180.f; // multiply a value in degrees by this to get radians
+static const float RAD2DEG = 180.f / PI; // multiply a value in radians by this to get degrees
+
+// Approximate equality for floating-point values, based on relative error.
+template<typename T>
+inline bool fequal(T a, T b, T epsilon = 0.0001){
+    // Identical values need no tolerance test.
+    if (a == b) {
+        return true;
+    }
+
+    const T gap = std::abs(a - b);
+    // A zero product means relative error is not meaningful; fall back to an absolute bound.
+    if (a * b == 0) {
+        return gap < (epsilon * epsilon);
+    }
+    const T magnitude = std::abs(a) + std::abs(b);
+    return gap / magnitude < epsilon;
+}
+
+#endif //FLUIDSOLVER_MATH_H
diff --git a/main.cpp b/main.cpp
new file mode 100644
index 00000000..dddba300
--- /dev/null
+++ b/main.cpp
@@ -0,0 +1,81 @@
+#include <core/display/Window.h>
+#include <core/fileIO/SceneLoader.h>
+#include <core/fileIO/ParticlesWriter.h>
+#include <core/display/painters/ParticlesPainter.h>
+#include <core/display/painters/GridVectorAttributePainter.h>
+#include <core/display/painters/GridScalarAttributePainter.h>
+#include <core/display/painters/BoxPainter.h>
+#include <ctime>
+
+int main(int argc, char* argv[]) { // usage: <program> <scene-file>; argv[1] is handed to SceneLoader::LoadScene
+    if (argc < 2) { // without this guard argv[1] is a null pointer when no scene file is given
+        std::cerr << "Missing scene file argument. Usage: FluidSolver <scene-file>" << std::endl;
+        return 1;
+    }
+
+    Window* window = new Window("Fluid Solver");
+    FluidSolver* solver = SceneLoader::LoadScene(argv[1]);
+//    ParticlesWriter particlesWriter;
+    solver->init();
+//    particlesWriter.writeData(solver, "particles_0.vdb");
+
+    ParticlesPainter particlesPainter(solver, 2);
+    BoxPainter boxPainter((Box *) solver->_container);
+    GridVectorAttributePainter uPainter (&solver->_MAC->_gU, 1.f, glm::vec3(1,0,0), glm::vec3(0.2,0,0));
+    GridVectorAttributePainter vPainter (&solver->_MAC->_gV, 1.f, glm::vec3(0,1,0), glm::vec3(0,0.2,0));
+    GridVectorAttributePainter wPainter (&solver->_MAC->_gW, 1.f, glm::vec3(0,0,1), glm::vec3(0,0,0.2));
+    GridScalarAttributePainter tPainter (
+            &solver->_MAC->_gType, 0.f, 2.f, 2.f, 8.f, glm::vec3(0,1,1), glm::vec3(0,0,0));
+    GridScalarAttributePainter posPressurePainter (
+            &solver->_MAC->_gP, 0.f, 50.f, 0.f, 5.f, glm::vec3(1,1,0), glm::vec3(1,0,0));
+    GridScalarAttributePainter negPressurePainter (
+            &solver->_MAC->_gP, 0.f, -2.f, 0.f, 5.f, glm::vec3(0,1,1), glm::vec3(0,0,1));
+
+    window->addPainter(&particlesPainter);
+    window->addPainter(&boxPainter);
+//    window->addPainter(&uPainter);
+//    window->addPainter(&vPainter);
+//    window->addPainter(&wPainter);
+//    window->addPainter(&tPainter);
+//    window->addPainter(&posPressurePainter);
+//    window->addPainter(&negPressurePainter);
+
+    window->loadSceneCB = [](void*) {
+        std::cout << "what" << std::endl;
+    };
+//    window->initializeTweakBar();
+
+    int framerate = 24;
+    double start = glfwGetTime();
+    int frame = 0;
+    float totalComputeTime = 0;
+
+    window->initloop([&]() {
+        double now = glfwGetTime();
+        float duration = (float) (now - start);
+
+        // limit solver update to 24fps
+        if (duration >= 1.f / framerate) {
+            start = now;
+            //solver->update(duration);
+            solver->update(1.f / framerate);
+            totalComputeTime += glfwGetTime() - start;
+
+            std::string filename = "particles_";
+            filename.append(std::to_string(++frame));
+            filename.append(".tga");
+//            window->saveImage(filename);
+//            particlesWriter.writeData(solver, filename);
+            if (frame >= 960) {
+                exit(0); // NOTE(review): exits without reaching the average-time report or the deletes below
+            }
+        }
+    });
+
+    std::cout << "Average compute time: " << (frame > 0 ? totalComputeTime / frame : 0.f) << " seconds" << std::endl; // guard: no frame may have been simulated
+
+    delete window;
+    delete solver;
+
+    return 0;
+}
\ No newline at end of file
diff --git a/nuparu/include/AntTweakBar/AntTweakBar.h b/nuparu/include/AntTweakBar/AntTweakBar.h
new file mode 100644
index 00000000..a1cf277d
--- /dev/null
+++ b/nuparu/include/AntTweakBar/AntTweakBar.h
@@ -0,0 +1,378 @@
+// ----------------------------------------------------------------------------
+//
+//  @file       AntTweakBar.h
+//
+//  @brief      AntTweakBar is a light and intuitive graphical user interface 
+//              that can be readily integrated into OpenGL and DirectX 
+//              applications in order to interactively tweak parameters.
+//
+//  @author     Philippe Decaudin
+//
+//  @doc        http://anttweakbar.sourceforge.net/doc
+//
+//  @license    This file is part of the AntTweakBar library.
+//              AntTweakBar is a free software released under the zlib license.
+//              For conditions of distribution and use, see License.txt
+//
+// ----------------------------------------------------------------------------
+
+
+#if !defined TW_INCLUDED
+#define TW_INCLUDED
+
+#include <stddef.h>
+
+#define TW_VERSION  116 // Version Mmm : M=Major mm=minor (e.g., 102 is version 1.02)
+
+
+#ifdef  __cplusplus
+#   if defined(_MSC_VER)
+#       pragma warning(push)
+#       pragma warning(disable: 4995 4530)
+#       include <string>
+#       pragma warning(pop)
+#   else
+#       include <string>
+#   endif
+    extern "C" {
+#endif  // __cplusplus
+
+
+// ----------------------------------------------------------------------------
+//  OS specific definitions
+// ----------------------------------------------------------------------------
+
+#if (defined(_WIN32) || defined(_WIN64)) && !defined(TW_STATIC)
+#   define TW_CALL          __stdcall
+#   define TW_CDECL_CALL    __cdecl
+#   define TW_EXPORT_API    __declspec(dllexport)
+#   define TW_IMPORT_API    __declspec(dllimport)
+#else
+#   define TW_CALL
+#   define TW_CDECL_CALL
+#   define TW_EXPORT_API
+#   define TW_IMPORT_API
+#endif
+
+#if defined TW_EXPORTS
+#   define TW_API TW_EXPORT_API
+#elif defined TW_STATIC
+#   define TW_API
+#   if defined(_MSC_VER) && !defined(TW_NO_LIB_PRAGMA)
+#       ifdef _WIN64
+#           pragma comment(lib, "AntTweakBarStatic64")
+#       else
+#           pragma comment(lib, "AntTweakBarStatic")
+#       endif
+#   endif
+#else
+#   define TW_API TW_IMPORT_API
+#   if defined(_MSC_VER) && !defined(TW_NO_LIB_PRAGMA)
+#       ifdef _WIN64
+#           pragma comment(lib, "AntTweakBar64")
+#       else
+#           pragma comment(lib, "AntTweakBar")
+#       endif
+#   endif
+#endif
+
+
+// ----------------------------------------------------------------------------
+//  Bar functions and definitions
+// ----------------------------------------------------------------------------
+
+typedef struct CTwBar TwBar; // structure CTwBar is not exposed.
+
+TW_API TwBar *      TW_CALL TwNewBar(const char *barName);
+TW_API int          TW_CALL TwDeleteBar(TwBar *bar);
+TW_API int          TW_CALL TwDeleteAllBars();
+TW_API int          TW_CALL TwSetTopBar(const TwBar *bar);
+TW_API TwBar *      TW_CALL TwGetTopBar();
+TW_API int          TW_CALL TwSetBottomBar(const TwBar *bar);
+TW_API TwBar *      TW_CALL TwGetBottomBar();
+TW_API const char * TW_CALL TwGetBarName(const TwBar *bar);
+TW_API int          TW_CALL TwGetBarCount();
+TW_API TwBar *      TW_CALL TwGetBarByIndex(int barIndex);
+TW_API TwBar *      TW_CALL TwGetBarByName(const char *barName);
+TW_API int          TW_CALL TwRefreshBar(TwBar *bar);
+
+// ----------------------------------------------------------------------------
+//  Var functions and definitions
+// ----------------------------------------------------------------------------
+
+typedef enum ETwType
+{
+    TW_TYPE_UNDEF   = 0,
+#ifdef __cplusplus
+    TW_TYPE_BOOLCPP = 1,
+#endif // __cplusplus
+    TW_TYPE_BOOL8   = 2,
+    TW_TYPE_BOOL16,
+    TW_TYPE_BOOL32,
+    TW_TYPE_CHAR,
+    TW_TYPE_INT8,
+    TW_TYPE_UINT8,
+    TW_TYPE_INT16,
+    TW_TYPE_UINT16,
+    TW_TYPE_INT32,
+    TW_TYPE_UINT32,
+    TW_TYPE_FLOAT,
+    TW_TYPE_DOUBLE,
+    TW_TYPE_COLOR32,    // 32 bits color. Order is RGBA if API is OpenGL or Direct3D10, and inversed if API is Direct3D9 (can be modified by defining 'colorOrder=...', see doc)
+    TW_TYPE_COLOR3F,    // 3 floats color. Order is RGB.
+    TW_TYPE_COLOR4F,    // 4 floats color. Order is RGBA.
+    TW_TYPE_CDSTRING,   // Null-terminated C Dynamic String (pointer to an array of char dynamically allocated with malloc/realloc/strdup)
+#ifdef __cplusplus
+# if defined(_MSC_VER) && (_MSC_VER == 1600)
+    TW_TYPE_STDSTRING = (0x2ffe0000+sizeof(std::string)),  // VS2010 C++ STL string (std::string)
+# else
+    TW_TYPE_STDSTRING = (0x2fff0000+sizeof(std::string)),  // C++ STL string (std::string)
+# endif
+#endif // __cplusplus
+    TW_TYPE_QUAT4F = TW_TYPE_CDSTRING+2, // 4 floats encoding a quaternion {qx,qy,qz,qs}
+    TW_TYPE_QUAT4D,     // 4 doubles encoding a quaternion {qx,qy,qz,qs}
+    TW_TYPE_DIR3F,      // direction vector represented by 3 floats
+    TW_TYPE_DIR3D       // direction vector represented by 3 doubles
+} TwType;
+#define TW_TYPE_CSSTRING(n) ((TwType)(0x30000000+((n)&0xfffffff))) // Null-terminated C Static String of size n (defined as char[n], with n<2^28)
+
+typedef void (TW_CALL * TwSetVarCallback)(const void *value, void *clientData);
+typedef void (TW_CALL * TwGetVarCallback)(void *value, void *clientData);
+typedef void (TW_CALL * TwButtonCallback)(void *clientData);
+
+TW_API int      TW_CALL TwAddVarRW(TwBar *bar, const char *name, TwType type, void *var, const char *def);
+TW_API int      TW_CALL TwAddVarRO(TwBar *bar, const char *name, TwType type, const void *var, const char *def);
+TW_API int      TW_CALL TwAddVarCB(TwBar *bar, const char *name, TwType type, TwSetVarCallback setCallback, TwGetVarCallback getCallback, void *clientData, const char *def);
+TW_API int      TW_CALL TwAddButton(TwBar *bar, const char *name, TwButtonCallback callback, void *clientData, const char *def);
+TW_API int      TW_CALL TwAddSeparator(TwBar *bar, const char *name, const char *def);
+TW_API int      TW_CALL TwRemoveVar(TwBar *bar, const char *name);
+TW_API int      TW_CALL TwRemoveAllVars(TwBar *bar);
+
+typedef struct CTwEnumVal
+{
+    int           Value;
+    const char *  Label;
+} TwEnumVal;
+typedef struct CTwStructMember
+{
+    const char *  Name;
+    TwType        Type;
+    size_t        Offset;
+    const char *  DefString;
+} TwStructMember;
+typedef void (TW_CALL * TwSummaryCallback)(char *summaryString, size_t summaryMaxLength, const void *value, void *clientData);
+
+TW_API int      TW_CALL TwDefine(const char *def);
+TW_API TwType   TW_CALL TwDefineEnum(const char *name, const TwEnumVal *enumValues, unsigned int nbValues);
+TW_API TwType   TW_CALL TwDefineEnumFromString(const char *name, const char *enumString);
+TW_API TwType   TW_CALL TwDefineStruct(const char *name, const TwStructMember *structMembers, unsigned int nbMembers, size_t structSize, TwSummaryCallback summaryCallback, void *summaryClientData);
+
+typedef void (TW_CALL * TwCopyCDStringToClient)(char **destinationClientStringPtr, const char *sourceString);
+TW_API void     TW_CALL TwCopyCDStringToClientFunc(TwCopyCDStringToClient copyCDStringFunc);
+TW_API void     TW_CALL TwCopyCDStringToLibrary(char **destinationLibraryStringPtr, const char *sourceClientString);
+#ifdef __cplusplus
+typedef void (TW_CALL * TwCopyStdStringToClient)(std::string& destinationClientString, const std::string& sourceString);
+TW_API void     TW_CALL TwCopyStdStringToClientFunc(TwCopyStdStringToClient copyStdStringToClientFunc);
+TW_API void     TW_CALL TwCopyStdStringToLibrary(std::string& destinationLibraryString, const std::string& sourceClientString);
+#endif // __cplusplus
+
+typedef enum ETwParamValueType
+{
+    TW_PARAM_INT32,
+    TW_PARAM_FLOAT,
+    TW_PARAM_DOUBLE,
+    TW_PARAM_CSTRING // Null-terminated array of char (ie, c-string)
+} TwParamValueType;
+TW_API int      TW_CALL TwGetParam(TwBar *bar, const char *varName, const char *paramName, TwParamValueType paramValueType, unsigned int outValueMaxCount, void *outValues);
+TW_API int      TW_CALL TwSetParam(TwBar *bar, const char *varName, const char *paramName, TwParamValueType paramValueType, unsigned int inValueCount, const void *inValues);
+
+
+// ----------------------------------------------------------------------------
+//  Management functions and definitions
+// ----------------------------------------------------------------------------
+
+typedef enum ETwGraphAPI
+{
+    TW_OPENGL           = 1,
+    TW_DIRECT3D9        = 2,
+    TW_DIRECT3D10       = 3,
+    TW_DIRECT3D11       = 4,
+    TW_OPENGL_CORE      = 5
+} TwGraphAPI;
+
+TW_API int      TW_CALL TwInit(TwGraphAPI graphAPI, void *device);
+TW_API int      TW_CALL TwTerminate();
+
+TW_API int      TW_CALL TwDraw();
+TW_API int      TW_CALL TwWindowSize(int width, int height);
+
+TW_API int      TW_CALL TwSetCurrentWindow(int windowID); // multi-windows support
+TW_API int      TW_CALL TwGetCurrentWindow();
+TW_API int      TW_CALL TwWindowExists(int windowID);
+
+typedef enum ETwKeyModifier
+{
+    TW_KMOD_NONE        = 0x0000,   // same codes as SDL keysym.mod
+    TW_KMOD_SHIFT       = 0x0003,
+    TW_KMOD_CTRL        = 0x00c0,
+    TW_KMOD_ALT         = 0x0100,
+    TW_KMOD_META        = 0x0c00
+} TwKeyModifier;
+typedef enum EKeySpecial
+{
+    TW_KEY_BACKSPACE    = '\b',
+    TW_KEY_TAB          = '\t',
+    TW_KEY_CLEAR        = 0x0c,
+    TW_KEY_RETURN       = '\r',
+    TW_KEY_PAUSE        = 0x13,
+    TW_KEY_ESCAPE       = 0x1b,
+    TW_KEY_SPACE        = ' ',
+    TW_KEY_DELETE       = 0x7f,
+    TW_KEY_UP           = 273,      // same codes and order as SDL 1.2 keysym.sym
+    TW_KEY_DOWN,
+    TW_KEY_RIGHT,
+    TW_KEY_LEFT,
+    TW_KEY_INSERT,
+    TW_KEY_HOME,
+    TW_KEY_END,
+    TW_KEY_PAGE_UP,
+    TW_KEY_PAGE_DOWN,
+    TW_KEY_F1,
+    TW_KEY_F2,
+    TW_KEY_F3,
+    TW_KEY_F4,
+    TW_KEY_F5,
+    TW_KEY_F6,
+    TW_KEY_F7,
+    TW_KEY_F8,
+    TW_KEY_F9,
+    TW_KEY_F10,
+    TW_KEY_F11,
+    TW_KEY_F12,
+    TW_KEY_F13,
+    TW_KEY_F14,
+    TW_KEY_F15,
+    TW_KEY_LAST
+} TwKeySpecial;
+
+TW_API int      TW_CALL TwKeyPressed(int key, int modifiers);
+TW_API int      TW_CALL TwKeyTest(int key, int modifiers);
+
+typedef enum ETwMouseAction
+{
+    TW_MOUSE_RELEASED,
+    TW_MOUSE_PRESSED  
+} TwMouseAction;
+typedef enum ETwMouseButtonID
+{
+    TW_MOUSE_LEFT       = 1,    // same code as SDL_BUTTON_LEFT
+    TW_MOUSE_MIDDLE     = 2,    // same code as SDL_BUTTON_MIDDLE
+    TW_MOUSE_RIGHT      = 3     // same code as SDL_BUTTON_RIGHT
+} TwMouseButtonID;
+
+TW_API int      TW_CALL TwMouseButton(TwMouseAction action, TwMouseButtonID button);
+TW_API int      TW_CALL TwMouseMotion(int mouseX, int mouseY);
+TW_API int      TW_CALL TwMouseWheel(int pos);
+
+TW_API const char * TW_CALL TwGetLastError();
+typedef void (TW_CALL * TwErrorHandler)(const char *errorMessage);
+TW_API void     TW_CALL TwHandleErrors(TwErrorHandler errorHandler);
+
+
+// ----------------------------------------------------------------------------
+//  Helper functions to translate events from some common window management
+//  frameworks to AntTweakBar.
+//  They call TwKeyPressed, TwMouse* and TwWindowSize for you (implemented in
+//  files TwEventWin.c TwEventSDL*.c TwEventGLFW.c TwEventGLUT.c)
+// ----------------------------------------------------------------------------
+
+// For Windows message proc
+#ifndef _W64    // Microsoft specific (detection of 64 bits portability issues)
+#   define _W64
+#endif  // _W64
+#ifdef _WIN64
+    TW_API int  TW_CALL TwEventWin(void *wnd, unsigned int msg, unsigned __int64 _W64 wParam, __int64 _W64 lParam);
+#else
+    TW_API int  TW_CALL TwEventWin(void *wnd, unsigned int msg, unsigned int _W64 wParam, int _W64 lParam);
+#endif
+#define TwEventWin32    TwEventWin // For compatibility with AntTweakBar versions prior to 1.11
+
+// For libSDL event loop
+TW_API int      TW_CALL TwEventSDL(const void *sdlEvent, unsigned char sdlMajorVersion, unsigned char sdlMinorVersion);
+
+// For GLFW event callbacks
+// You should define GLFW_CDECL before including AntTweakBar.h if your version of GLFW uses cdecl calling conventions
+#ifdef GLFW_CDECL
+    TW_API int TW_CDECL_CALL TwEventMouseButtonGLFWcdecl(int glfwButton, int glfwAction);
+    TW_API int TW_CDECL_CALL TwEventKeyGLFWcdecl(int glfwKey, int glfwAction);
+    TW_API int TW_CDECL_CALL TwEventCharGLFWcdecl(int glfwChar, int glfwAction);
+    TW_API int TW_CDECL_CALL TwEventMousePosGLFWcdecl(int mouseX, int mouseY);
+    TW_API int TW_CDECL_CALL TwEventMouseWheelGLFWcdecl(int wheelPos);
+#   define TwEventMouseButtonGLFW TwEventMouseButtonGLFWcdecl
+#   define TwEventKeyGLFW         TwEventKeyGLFWcdecl
+#   define TwEventCharGLFW        TwEventCharGLFWcdecl
+#   define TwEventMousePosGLFW    TwEventMousePosGLFWcdecl
+#   define TwEventMouseWheelGLFW  TwEventMouseWheelGLFWcdecl
+#else
+    TW_API int  TW_CALL TwEventMouseButtonGLFW(int glfwButton, int glfwAction);
+    TW_API int  TW_CALL TwEventKeyGLFW(int glfwKey, int glfwAction);
+    TW_API int  TW_CALL TwEventCharGLFW(int glfwChar, int glfwAction);
+#   define TwEventMousePosGLFW     TwMouseMotion
+#   define TwEventMouseWheelGLFW   TwMouseWheel
+#endif
+
+// For GLUT event callbacks (Windows calling convention for GLUT callbacks is cdecl)
+#if defined(_WIN32) || defined(_WIN64)
+#   define TW_GLUT_CALL TW_CDECL_CALL
+#else
+#   define TW_GLUT_CALL
+#endif
+TW_API int TW_GLUT_CALL TwEventMouseButtonGLUT(int glutButton, int glutState, int mouseX, int mouseY);
+TW_API int TW_GLUT_CALL TwEventMouseMotionGLUT(int mouseX, int mouseY);
+TW_API int TW_GLUT_CALL TwEventKeyboardGLUT(unsigned char glutKey, int mouseX, int mouseY);
+TW_API int TW_GLUT_CALL TwEventSpecialGLUT(int glutKey, int mouseX, int mouseY);
+TW_API int TW_CALL      TwGLUTModifiersFunc(int (TW_CALL *glutGetModifiersFunc)(void));
+typedef void (TW_GLUT_CALL *GLUTmousebuttonfun)(int glutButton, int glutState, int mouseX, int mouseY);
+typedef void (TW_GLUT_CALL *GLUTmousemotionfun)(int mouseX, int mouseY);
+typedef void (TW_GLUT_CALL *GLUTkeyboardfun)(unsigned char glutKey, int mouseX, int mouseY);
+typedef void (TW_GLUT_CALL *GLUTspecialfun)(int glutKey, int mouseX, int mouseY);
+
+// For SFML event loop
+TW_API int      TW_CALL TwEventSFML(const void *sfmlEvent, unsigned char sfmlMajorVersion, unsigned char sfmlMinorVersion);
+
+// For X11 event loop
+#if defined(_UNIX)
+    TW_API int TW_CDECL_CALL TwEventX11(void *xevent);
+#endif
+
+// ----------------------------------------------------------------------------
+//  Make sure the types have the right sizes
+// ----------------------------------------------------------------------------
+
+#define TW_COMPILE_TIME_ASSERT(name, x) typedef int TW_DUMMY_ ## name[(x) * 2 - 1]
+
+TW_COMPILE_TIME_ASSERT(TW_CHAR,    sizeof(char)    == 1);
+TW_COMPILE_TIME_ASSERT(TW_SHORT,   sizeof(short)   == 2);
+TW_COMPILE_TIME_ASSERT(TW_INT,     sizeof(int)     == 4);
+TW_COMPILE_TIME_ASSERT(TW_FLOAT,   sizeof(float)   == 4);
+TW_COMPILE_TIME_ASSERT(TW_DOUBLE,  sizeof(double)  == 8);
+
+// Check pointer size on Windows
+#if !defined(_WIN64) && defined(_WIN32)
+    // If the following assert failed, the platform is not 32-bit and _WIN64 is not defined.
+    // When targeting a 64-bit Windows platform, _WIN64 must be defined.
+    TW_COMPILE_TIME_ASSERT(TW_PTR32, sizeof(void*) == 4);
+#elif defined(_WIN64)
+    // If the following assert failed, _WIN64 is defined but the targeted platform is not 64-bit.
+    TW_COMPILE_TIME_ASSERT(TW_PTR64, sizeof(void*) == 8);
+#endif
+
+//  ---------------------------------------------------------------------------
+
+
+#ifdef  __cplusplus
+    }   // extern "C"
+#endif  // __cplusplus
+
+
+#endif  // !defined TW_INCLUDED
diff --git a/nuparu/include/Eigen/Array b/nuparu/include/Eigen/Array
deleted file mode 100644
index 3d004fb6..00000000
--- a/nuparu/include/Eigen/Array
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef EIGEN_ARRAY_MODULE_H
-#define EIGEN_ARRAY_MODULE_H
-
-// include Core first to handle Eigen2 support macros
-#include "Core"
-
-#ifndef EIGEN2_SUPPORT
-  #error The Eigen/Array header does no longer exist in Eigen3. All that functionality has moved to Eigen/Core.
-#endif
-
-#endif // EIGEN_ARRAY_MODULE_H
diff --git a/nuparu/include/Eigen/COPYING.BSD b/nuparu/include/Eigen/COPYING.BSD
deleted file mode 100644
index 11971ffe..00000000
--- a/nuparu/include/Eigen/COPYING.BSD
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- Copyright (c) 2011, Intel Corporation. All rights reserved.
-
- Redistribution and use in source and binary forms, with or without modification,
- are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
- * Neither the name of Intel Corporation nor the names of its contributors may
-   be used to endorse or promote products derived from this software without
-   specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
\ No newline at end of file
diff --git a/nuparu/include/Eigen/COPYING.GPL b/nuparu/include/Eigen/COPYING.GPL
deleted file mode 100644
index 94a9ed02..00000000
--- a/nuparu/include/Eigen/COPYING.GPL
+++ /dev/null
@@ -1,674 +0,0 @@
-                    GNU GENERAL PUBLIC LICENSE
-                       Version 3, 29 June 2007
-
- Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
- Everyone is permitted to copy and distribute verbatim copies
- of this license document, but changing it is not allowed.
-
-                            Preamble
-
-  The GNU General Public License is a free, copyleft license for
-software and other kinds of works.
-
-  The licenses for most software and other practical works are designed
-to take away your freedom to share and change the works.  By contrast,
-the GNU General Public License is intended to guarantee your freedom to
-share and change all versions of a program--to make sure it remains free
-software for all its users.  We, the Free Software Foundation, use the
-GNU General Public License for most of our software; it applies also to
-any other work released this way by its authors.  You can apply it to
-your programs, too.
-
-  When we speak of free software, we are referring to freedom, not
-price.  Our General Public Licenses are designed to make sure that you
-have the freedom to distribute copies of free software (and charge for
-them if you wish), that you receive source code or can get it if you
-want it, that you can change the software or use pieces of it in new
-free programs, and that you know you can do these things.
-
-  To protect your rights, we need to prevent others from denying you
-these rights or asking you to surrender the rights.  Therefore, you have
-certain responsibilities if you distribute copies of the software, or if
-you modify it: responsibilities to respect the freedom of others.
-
-  For example, if you distribute copies of such a program, whether
-gratis or for a fee, you must pass on to the recipients the same
-freedoms that you received.  You must make sure that they, too, receive
-or can get the source code.  And you must show them these terms so they
-know their rights.
-
-  Developers that use the GNU GPL protect your rights with two steps:
-(1) assert copyright on the software, and (2) offer you this License
-giving you legal permission to copy, distribute and/or modify it.
-
-  For the developers' and authors' protection, the GPL clearly explains
-that there is no warranty for this free software.  For both users' and
-authors' sake, the GPL requires that modified versions be marked as
-changed, so that their problems will not be attributed erroneously to
-authors of previous versions.
-
-  Some devices are designed to deny users access to install or run
-modified versions of the software inside them, although the manufacturer
-can do so.  This is fundamentally incompatible with the aim of
-protecting users' freedom to change the software.  The systematic
-pattern of such abuse occurs in the area of products for individuals to
-use, which is precisely where it is most unacceptable.  Therefore, we
-have designed this version of the GPL to prohibit the practice for those
-products.  If such problems arise substantially in other domains, we
-stand ready to extend this provision to those domains in future versions
-of the GPL, as needed to protect the freedom of users.
-
-  Finally, every program is threatened constantly by software patents.
-States should not allow patents to restrict development and use of
-software on general-purpose computers, but in those that do, we wish to
-avoid the special danger that patents applied to a free program could
-make it effectively proprietary.  To prevent this, the GPL assures that
-patents cannot be used to render the program non-free.
-
-  The precise terms and conditions for copying, distribution and
-modification follow.
-
-                       TERMS AND CONDITIONS
-
-  0. Definitions.
-
-  "This License" refers to version 3 of the GNU General Public License.
-
-  "Copyright" also means copyright-like laws that apply to other kinds of
-works, such as semiconductor masks.
-
-  "The Program" refers to any copyrightable work licensed under this
-License.  Each licensee is addressed as "you".  "Licensees" and
-"recipients" may be individuals or organizations.
-
-  To "modify" a work means to copy from or adapt all or part of the work
-in a fashion requiring copyright permission, other than the making of an
-exact copy.  The resulting work is called a "modified version" of the
-earlier work or a work "based on" the earlier work.
-
-  A "covered work" means either the unmodified Program or a work based
-on the Program.
-
-  To "propagate" a work means to do anything with it that, without
-permission, would make you directly or secondarily liable for
-infringement under applicable copyright law, except executing it on a
-computer or modifying a private copy.  Propagation includes copying,
-distribution (with or without modification), making available to the
-public, and in some countries other activities as well.
-
-  To "convey" a work means any kind of propagation that enables other
-parties to make or receive copies.  Mere interaction with a user through
-a computer network, with no transfer of a copy, is not conveying.
-
-  An interactive user interface displays "Appropriate Legal Notices"
-to the extent that it includes a convenient and prominently visible
-feature that (1) displays an appropriate copyright notice, and (2)
-tells the user that there is no warranty for the work (except to the
-extent that warranties are provided), that licensees may convey the
-work under this License, and how to view a copy of this License.  If
-the interface presents a list of user commands or options, such as a
-menu, a prominent item in the list meets this criterion.
-
-  1. Source Code.
-
-  The "source code" for a work means the preferred form of the work
-for making modifications to it.  "Object code" means any non-source
-form of a work.
-
-  A "Standard Interface" means an interface that either is an official
-standard defined by a recognized standards body, or, in the case of
-interfaces specified for a particular programming language, one that
-is widely used among developers working in that language.
-
-  The "System Libraries" of an executable work include anything, other
-than the work as a whole, that (a) is included in the normal form of
-packaging a Major Component, but which is not part of that Major
-Component, and (b) serves only to enable use of the work with that
-Major Component, or to implement a Standard Interface for which an
-implementation is available to the public in source code form.  A
-"Major Component", in this context, means a major essential component
-(kernel, window system, and so on) of the specific operating system
-(if any) on which the executable work runs, or a compiler used to
-produce the work, or an object code interpreter used to run it.
-
-  The "Corresponding Source" for a work in object code form means all
-the source code needed to generate, install, and (for an executable
-work) run the object code and to modify the work, including scripts to
-control those activities.  However, it does not include the work's
-System Libraries, or general-purpose tools or generally available free
-programs which are used unmodified in performing those activities but
-which are not part of the work.  For example, Corresponding Source
-includes interface definition files associated with source files for
-the work, and the source code for shared libraries and dynamically
-linked subprograms that the work is specifically designed to require,
-such as by intimate data communication or control flow between those
-subprograms and other parts of the work.
-
-  The Corresponding Source need not include anything that users
-can regenerate automatically from other parts of the Corresponding
-Source.
-
-  The Corresponding Source for a work in source code form is that
-same work.
-
-  2. Basic Permissions.
-
-  All rights granted under this License are granted for the term of
-copyright on the Program, and are irrevocable provided the stated
-conditions are met.  This License explicitly affirms your unlimited
-permission to run the unmodified Program.  The output from running a
-covered work is covered by this License only if the output, given its
-content, constitutes a covered work.  This License acknowledges your
-rights of fair use or other equivalent, as provided by copyright law.
-
-  You may make, run and propagate covered works that you do not
-convey, without conditions so long as your license otherwise remains
-in force.  You may convey covered works to others for the sole purpose
-of having them make modifications exclusively for you, or provide you
-with facilities for running those works, provided that you comply with
-the terms of this License in conveying all material for which you do
-not control copyright.  Those thus making or running the covered works
-for you must do so exclusively on your behalf, under your direction
-and control, on terms that prohibit them from making any copies of
-your copyrighted material outside their relationship with you.
-
-  Conveying under any other circumstances is permitted solely under
-the conditions stated below.  Sublicensing is not allowed; section 10
-makes it unnecessary.
-
-  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
-
-  No covered work shall be deemed part of an effective technological
-measure under any applicable law fulfilling obligations under article
-11 of the WIPO copyright treaty adopted on 20 December 1996, or
-similar laws prohibiting or restricting circumvention of such
-measures.
-
-  When you convey a covered work, you waive any legal power to forbid
-circumvention of technological measures to the extent such circumvention
-is effected by exercising rights under this License with respect to
-the covered work, and you disclaim any intention to limit operation or
-modification of the work as a means of enforcing, against the work's
-users, your or third parties' legal rights to forbid circumvention of
-technological measures.
-
-  4. Conveying Verbatim Copies.
-
-  You may convey verbatim copies of the Program's source code as you
-receive it, in any medium, provided that you conspicuously and
-appropriately publish on each copy an appropriate copyright notice;
-keep intact all notices stating that this License and any
-non-permissive terms added in accord with section 7 apply to the code;
-keep intact all notices of the absence of any warranty; and give all
-recipients a copy of this License along with the Program.
-
-  You may charge any price or no price for each copy that you convey,
-and you may offer support or warranty protection for a fee.
-
-  5. Conveying Modified Source Versions.
-
-  You may convey a work based on the Program, or the modifications to
-produce it from the Program, in the form of source code under the
-terms of section 4, provided that you also meet all of these conditions:
-
-    a) The work must carry prominent notices stating that you modified
-    it, and giving a relevant date.
-
-    b) The work must carry prominent notices stating that it is
-    released under this License and any conditions added under section
-    7.  This requirement modifies the requirement in section 4 to
-    "keep intact all notices".
-
-    c) You must license the entire work, as a whole, under this
-    License to anyone who comes into possession of a copy.  This
-    License will therefore apply, along with any applicable section 7
-    additional terms, to the whole of the work, and all its parts,
-    regardless of how they are packaged.  This License gives no
-    permission to license the work in any other way, but it does not
-    invalidate such permission if you have separately received it.
-
-    d) If the work has interactive user interfaces, each must display
-    Appropriate Legal Notices; however, if the Program has interactive
-    interfaces that do not display Appropriate Legal Notices, your
-    work need not make them do so.
-
-  A compilation of a covered work with other separate and independent
-works, which are not by their nature extensions of the covered work,
-and which are not combined with it such as to form a larger program,
-in or on a volume of a storage or distribution medium, is called an
-"aggregate" if the compilation and its resulting copyright are not
-used to limit the access or legal rights of the compilation's users
-beyond what the individual works permit.  Inclusion of a covered work
-in an aggregate does not cause this License to apply to the other
-parts of the aggregate.
-
-  6. Conveying Non-Source Forms.
-
-  You may convey a covered work in object code form under the terms
-of sections 4 and 5, provided that you also convey the
-machine-readable Corresponding Source under the terms of this License,
-in one of these ways:
-
-    a) Convey the object code in, or embodied in, a physical product
-    (including a physical distribution medium), accompanied by the
-    Corresponding Source fixed on a durable physical medium
-    customarily used for software interchange.
-
-    b) Convey the object code in, or embodied in, a physical product
-    (including a physical distribution medium), accompanied by a
-    written offer, valid for at least three years and valid for as
-    long as you offer spare parts or customer support for that product
-    model, to give anyone who possesses the object code either (1) a
-    copy of the Corresponding Source for all the software in the
-    product that is covered by this License, on a durable physical
-    medium customarily used for software interchange, for a price no
-    more than your reasonable cost of physically performing this
-    conveying of source, or (2) access to copy the
-    Corresponding Source from a network server at no charge.
-
-    c) Convey individual copies of the object code with a copy of the
-    written offer to provide the Corresponding Source.  This
-    alternative is allowed only occasionally and noncommercially, and
-    only if you received the object code with such an offer, in accord
-    with subsection 6b.
-
-    d) Convey the object code by offering access from a designated
-    place (gratis or for a charge), and offer equivalent access to the
-    Corresponding Source in the same way through the same place at no
-    further charge.  You need not require recipients to copy the
-    Corresponding Source along with the object code.  If the place to
-    copy the object code is a network server, the Corresponding Source
-    may be on a different server (operated by you or a third party)
-    that supports equivalent copying facilities, provided you maintain
-    clear directions next to the object code saying where to find the
-    Corresponding Source.  Regardless of what server hosts the
-    Corresponding Source, you remain obligated to ensure that it is
-    available for as long as needed to satisfy these requirements.
-
-    e) Convey the object code using peer-to-peer transmission, provided
-    you inform other peers where the object code and Corresponding
-    Source of the work are being offered to the general public at no
-    charge under subsection 6d.
-
-  A separable portion of the object code, whose source code is excluded
-from the Corresponding Source as a System Library, need not be
-included in conveying the object code work.
-
-  A "User Product" is either (1) a "consumer product", which means any
-tangible personal property which is normally used for personal, family,
-or household purposes, or (2) anything designed or sold for incorporation
-into a dwelling.  In determining whether a product is a consumer product,
-doubtful cases shall be resolved in favor of coverage.  For a particular
-product received by a particular user, "normally used" refers to a
-typical or common use of that class of product, regardless of the status
-of the particular user or of the way in which the particular user
-actually uses, or expects or is expected to use, the product.  A product
-is a consumer product regardless of whether the product has substantial
-commercial, industrial or non-consumer uses, unless such uses represent
-the only significant mode of use of the product.
-
-  "Installation Information" for a User Product means any methods,
-procedures, authorization keys, or other information required to install
-and execute modified versions of a covered work in that User Product from
-a modified version of its Corresponding Source.  The information must
-suffice to ensure that the continued functioning of the modified object
-code is in no case prevented or interfered with solely because
-modification has been made.
-
-  If you convey an object code work under this section in, or with, or
-specifically for use in, a User Product, and the conveying occurs as
-part of a transaction in which the right of possession and use of the
-User Product is transferred to the recipient in perpetuity or for a
-fixed term (regardless of how the transaction is characterized), the
-Corresponding Source conveyed under this section must be accompanied
-by the Installation Information.  But this requirement does not apply
-if neither you nor any third party retains the ability to install
-modified object code on the User Product (for example, the work has
-been installed in ROM).
-
-  The requirement to provide Installation Information does not include a
-requirement to continue to provide support service, warranty, or updates
-for a work that has been modified or installed by the recipient, or for
-the User Product in which it has been modified or installed.  Access to a
-network may be denied when the modification itself materially and
-adversely affects the operation of the network or violates the rules and
-protocols for communication across the network.
-
-  Corresponding Source conveyed, and Installation Information provided,
-in accord with this section must be in a format that is publicly
-documented (and with an implementation available to the public in
-source code form), and must require no special password or key for
-unpacking, reading or copying.
-
-  7. Additional Terms.
-
-  "Additional permissions" are terms that supplement the terms of this
-License by making exceptions from one or more of its conditions.
-Additional permissions that are applicable to the entire Program shall
-be treated as though they were included in this License, to the extent
-that they are valid under applicable law.  If additional permissions
-apply only to part of the Program, that part may be used separately
-under those permissions, but the entire Program remains governed by
-this License without regard to the additional permissions.
-
-  When you convey a copy of a covered work, you may at your option
-remove any additional permissions from that copy, or from any part of
-it.  (Additional permissions may be written to require their own
-removal in certain cases when you modify the work.)  You may place
-additional permissions on material, added by you to a covered work,
-for which you have or can give appropriate copyright permission.
-
-  Notwithstanding any other provision of this License, for material you
-add to a covered work, you may (if authorized by the copyright holders of
-that material) supplement the terms of this License with terms:
-
-    a) Disclaiming warranty or limiting liability differently from the
-    terms of sections 15 and 16 of this License; or
-
-    b) Requiring preservation of specified reasonable legal notices or
-    author attributions in that material or in the Appropriate Legal
-    Notices displayed by works containing it; or
-
-    c) Prohibiting misrepresentation of the origin of that material, or
-    requiring that modified versions of such material be marked in
-    reasonable ways as different from the original version; or
-
-    d) Limiting the use for publicity purposes of names of licensors or
-    authors of the material; or
-
-    e) Declining to grant rights under trademark law for use of some
-    trade names, trademarks, or service marks; or
-
-    f) Requiring indemnification of licensors and authors of that
-    material by anyone who conveys the material (or modified versions of
-    it) with contractual assumptions of liability to the recipient, for
-    any liability that these contractual assumptions directly impose on
-    those licensors and authors.
-
-  All other non-permissive additional terms are considered "further
-restrictions" within the meaning of section 10.  If the Program as you
-received it, or any part of it, contains a notice stating that it is
-governed by this License along with a term that is a further
-restriction, you may remove that term.  If a license document contains
-a further restriction but permits relicensing or conveying under this
-License, you may add to a covered work material governed by the terms
-of that license document, provided that the further restriction does
-not survive such relicensing or conveying.
-
-  If you add terms to a covered work in accord with this section, you
-must place, in the relevant source files, a statement of the
-additional terms that apply to those files, or a notice indicating
-where to find the applicable terms.
-
-  Additional terms, permissive or non-permissive, may be stated in the
-form of a separately written license, or stated as exceptions;
-the above requirements apply either way.
-
-  8. Termination.
-
-  You may not propagate or modify a covered work except as expressly
-provided under this License.  Any attempt otherwise to propagate or
-modify it is void, and will automatically terminate your rights under
-this License (including any patent licenses granted under the third
-paragraph of section 11).
-
-  However, if you cease all violation of this License, then your
-license from a particular copyright holder is reinstated (a)
-provisionally, unless and until the copyright holder explicitly and
-finally terminates your license, and (b) permanently, if the copyright
-holder fails to notify you of the violation by some reasonable means
-prior to 60 days after the cessation.
-
-  Moreover, your license from a particular copyright holder is
-reinstated permanently if the copyright holder notifies you of the
-violation by some reasonable means, this is the first time you have
-received notice of violation of this License (for any work) from that
-copyright holder, and you cure the violation prior to 30 days after
-your receipt of the notice.
-
-  Termination of your rights under this section does not terminate the
-licenses of parties who have received copies or rights from you under
-this License.  If your rights have been terminated and not permanently
-reinstated, you do not qualify to receive new licenses for the same
-material under section 10.
-
-  9. Acceptance Not Required for Having Copies.
-
-  You are not required to accept this License in order to receive or
-run a copy of the Program.  Ancillary propagation of a covered work
-occurring solely as a consequence of using peer-to-peer transmission
-to receive a copy likewise does not require acceptance.  However,
-nothing other than this License grants you permission to propagate or
-modify any covered work.  These actions infringe copyright if you do
-not accept this License.  Therefore, by modifying or propagating a
-covered work, you indicate your acceptance of this License to do so.
-
-  10. Automatic Licensing of Downstream Recipients.
-
-  Each time you convey a covered work, the recipient automatically
-receives a license from the original licensors, to run, modify and
-propagate that work, subject to this License.  You are not responsible
-for enforcing compliance by third parties with this License.
-
-  An "entity transaction" is a transaction transferring control of an
-organization, or substantially all assets of one, or subdividing an
-organization, or merging organizations.  If propagation of a covered
-work results from an entity transaction, each party to that
-transaction who receives a copy of the work also receives whatever
-licenses to the work the party's predecessor in interest had or could
-give under the previous paragraph, plus a right to possession of the
-Corresponding Source of the work from the predecessor in interest, if
-the predecessor has it or can get it with reasonable efforts.
-
-  You may not impose any further restrictions on the exercise of the
-rights granted or affirmed under this License.  For example, you may
-not impose a license fee, royalty, or other charge for exercise of
-rights granted under this License, and you may not initiate litigation
-(including a cross-claim or counterclaim in a lawsuit) alleging that
-any patent claim is infringed by making, using, selling, offering for
-sale, or importing the Program or any portion of it.
-
-  11. Patents.
-
-  A "contributor" is a copyright holder who authorizes use under this
-License of the Program or a work on which the Program is based.  The
-work thus licensed is called the contributor's "contributor version".
-
-  A contributor's "essential patent claims" are all patent claims
-owned or controlled by the contributor, whether already acquired or
-hereafter acquired, that would be infringed by some manner, permitted
-by this License, of making, using, or selling its contributor version,
-but do not include claims that would be infringed only as a
-consequence of further modification of the contributor version.  For
-purposes of this definition, "control" includes the right to grant
-patent sublicenses in a manner consistent with the requirements of
-this License.
-
-  Each contributor grants you a non-exclusive, worldwide, royalty-free
-patent license under the contributor's essential patent claims, to
-make, use, sell, offer for sale, import and otherwise run, modify and
-propagate the contents of its contributor version.
-
-  In the following three paragraphs, a "patent license" is any express
-agreement or commitment, however denominated, not to enforce a patent
-(such as an express permission to practice a patent or covenant not to
-sue for patent infringement).  To "grant" such a patent license to a
-party means to make such an agreement or commitment not to enforce a
-patent against the party.
-
-  If you convey a covered work, knowingly relying on a patent license,
-and the Corresponding Source of the work is not available for anyone
-to copy, free of charge and under the terms of this License, through a
-publicly available network server or other readily accessible means,
-then you must either (1) cause the Corresponding Source to be so
-available, or (2) arrange to deprive yourself of the benefit of the
-patent license for this particular work, or (3) arrange, in a manner
-consistent with the requirements of this License, to extend the patent
-license to downstream recipients.  "Knowingly relying" means you have
-actual knowledge that, but for the patent license, your conveying the
-covered work in a country, or your recipient's use of the covered work
-in a country, would infringe one or more identifiable patents in that
-country that you have reason to believe are valid.
-
-  If, pursuant to or in connection with a single transaction or
-arrangement, you convey, or propagate by procuring conveyance of, a
-covered work, and grant a patent license to some of the parties
-receiving the covered work authorizing them to use, propagate, modify
-or convey a specific copy of the covered work, then the patent license
-you grant is automatically extended to all recipients of the covered
-work and works based on it.
-
-  A patent license is "discriminatory" if it does not include within
-the scope of its coverage, prohibits the exercise of, or is
-conditioned on the non-exercise of one or more of the rights that are
-specifically granted under this License.  You may not convey a covered
-work if you are a party to an arrangement with a third party that is
-in the business of distributing software, under which you make payment
-to the third party based on the extent of your activity of conveying
-the work, and under which the third party grants, to any of the
-parties who would receive the covered work from you, a discriminatory
-patent license (a) in connection with copies of the covered work
-conveyed by you (or copies made from those copies), or (b) primarily
-for and in connection with specific products or compilations that
-contain the covered work, unless you entered into that arrangement,
-or that patent license was granted, prior to 28 March 2007.
-
-  Nothing in this License shall be construed as excluding or limiting
-any implied license or other defenses to infringement that may
-otherwise be available to you under applicable patent law.
-
-  12. No Surrender of Others' Freedom.
-
-  If conditions are imposed on you (whether by court order, agreement or
-otherwise) that contradict the conditions of this License, they do not
-excuse you from the conditions of this License.  If you cannot convey a
-covered work so as to satisfy simultaneously your obligations under this
-License and any other pertinent obligations, then as a consequence you may
-not convey it at all.  For example, if you agree to terms that obligate you
-to collect a royalty for further conveying from those to whom you convey
-the Program, the only way you could satisfy both those terms and this
-License would be to refrain entirely from conveying the Program.
-
-  13. Use with the GNU Affero General Public License.
-
-  Notwithstanding any other provision of this License, you have
-permission to link or combine any covered work with a work licensed
-under version 3 of the GNU Affero General Public License into a single
-combined work, and to convey the resulting work.  The terms of this
-License will continue to apply to the part which is the covered work,
-but the special requirements of the GNU Affero General Public License,
-section 13, concerning interaction through a network will apply to the
-combination as such.
-
-  14. Revised Versions of this License.
-
-  The Free Software Foundation may publish revised and/or new versions of
-the GNU General Public License from time to time.  Such new versions will
-be similar in spirit to the present version, but may differ in detail to
-address new problems or concerns.
-
-  Each version is given a distinguishing version number.  If the
-Program specifies that a certain numbered version of the GNU General
-Public License "or any later version" applies to it, you have the
-option of following the terms and conditions either of that numbered
-version or of any later version published by the Free Software
-Foundation.  If the Program does not specify a version number of the
-GNU General Public License, you may choose any version ever published
-by the Free Software Foundation.
-
-  If the Program specifies that a proxy can decide which future
-versions of the GNU General Public License can be used, that proxy's
-public statement of acceptance of a version permanently authorizes you
-to choose that version for the Program.
-
-  Later license versions may give you additional or different
-permissions.  However, no additional obligations are imposed on any
-author or copyright holder as a result of your choosing to follow a
-later version.
-
-  15. Disclaimer of Warranty.
-
-  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
-APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
-HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
-OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
-IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
-ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
-
-  16. Limitation of Liability.
-
-  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
-WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
-THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
-GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
-USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
-DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
-PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
-EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
-SUCH DAMAGES.
-
-  17. Interpretation of Sections 15 and 16.
-
-  If the disclaimer of warranty and limitation of liability provided
-above cannot be given local legal effect according to their terms,
-reviewing courts shall apply local law that most closely approximates
-an absolute waiver of all civil liability in connection with the
-Program, unless a warranty or assumption of liability accompanies a
-copy of the Program in return for a fee.
-
-                     END OF TERMS AND CONDITIONS
-
-            How to Apply These Terms to Your New Programs
-
-  If you develop a new program, and you want it to be of the greatest
-possible use to the public, the best way to achieve this is to make it
-free software which everyone can redistribute and change under these terms.
-
-  To do so, attach the following notices to the program.  It is safest
-to attach them to the start of each source file to most effectively
-state the exclusion of warranty; and each file should have at least
-the "copyright" line and a pointer to where the full notice is found.
-
-    <one line to give the program's name and a brief idea of what it does.>
-    Copyright (C) <year>  <name of author>
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-Also add information on how to contact you by electronic and paper mail.
-
-  If the program does terminal interaction, make it output a short
-notice like this when it starts in an interactive mode:
-
-    <program>  Copyright (C) <year>  <name of author>
-    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
-    This is free software, and you are welcome to redistribute it
-    under certain conditions; type `show c' for details.
-
-The hypothetical commands `show w' and `show c' should show the appropriate
-parts of the General Public License.  Of course, your program's commands
-might be different; for a GUI interface, you would use an "about box".
-
-  You should also get your employer (if you work as a programmer) or school,
-if any, to sign a "copyright disclaimer" for the program, if necessary.
-For more information on this, and how to apply and follow the GNU GPL, see
-<http://www.gnu.org/licenses/>.
-
-  The GNU General Public License does not permit incorporating your program
-into proprietary programs.  If your program is a subroutine library, you
-may consider it more useful to permit linking proprietary applications with
-the library.  If this is what you want to do, use the GNU Lesser General
-Public License instead of this License.  But first, please read
-<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/nuparu/include/Eigen/COPYING.LGPL b/nuparu/include/Eigen/COPYING.LGPL
deleted file mode 100644
index 4362b491..00000000
--- a/nuparu/include/Eigen/COPYING.LGPL
+++ /dev/null
@@ -1,502 +0,0 @@
-                  GNU LESSER GENERAL PUBLIC LICENSE
-                       Version 2.1, February 1999
-
- Copyright (C) 1991, 1999 Free Software Foundation, Inc.
- 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
- Everyone is permitted to copy and distribute verbatim copies
- of this license document, but changing it is not allowed.
-
-[This is the first released version of the Lesser GPL.  It also counts
- as the successor of the GNU Library Public License, version 2, hence
- the version number 2.1.]
-
-                            Preamble
-
-  The licenses for most software are designed to take away your
-freedom to share and change it.  By contrast, the GNU General Public
-Licenses are intended to guarantee your freedom to share and change
-free software--to make sure the software is free for all its users.
-
-  This license, the Lesser General Public License, applies to some
-specially designated software packages--typically libraries--of the
-Free Software Foundation and other authors who decide to use it.  You
-can use it too, but we suggest you first think carefully about whether
-this license or the ordinary General Public License is the better
-strategy to use in any particular case, based on the explanations below.
-
-  When we speak of free software, we are referring to freedom of use,
-not price.  Our General Public Licenses are designed to make sure that
-you have the freedom to distribute copies of free software (and charge
-for this service if you wish); that you receive source code or can get
-it if you want it; that you can change the software and use pieces of
-it in new free programs; and that you are informed that you can do
-these things.
-
-  To protect your rights, we need to make restrictions that forbid
-distributors to deny you these rights or to ask you to surrender these
-rights.  These restrictions translate to certain responsibilities for
-you if you distribute copies of the library or if you modify it.
-
-  For example, if you distribute copies of the library, whether gratis
-or for a fee, you must give the recipients all the rights that we gave
-you.  You must make sure that they, too, receive or can get the source
-code.  If you link other code with the library, you must provide
-complete object files to the recipients, so that they can relink them
-with the library after making changes to the library and recompiling
-it.  And you must show them these terms so they know their rights.
-
-  We protect your rights with a two-step method: (1) we copyright the
-library, and (2) we offer you this license, which gives you legal
-permission to copy, distribute and/or modify the library.
-
-  To protect each distributor, we want to make it very clear that
-there is no warranty for the free library.  Also, if the library is
-modified by someone else and passed on, the recipients should know
-that what they have is not the original version, so that the original
-author's reputation will not be affected by problems that might be
-introduced by others.
-
-  Finally, software patents pose a constant threat to the existence of
-any free program.  We wish to make sure that a company cannot
-effectively restrict the users of a free program by obtaining a
-restrictive license from a patent holder.  Therefore, we insist that
-any patent license obtained for a version of the library must be
-consistent with the full freedom of use specified in this license.
-
-  Most GNU software, including some libraries, is covered by the
-ordinary GNU General Public License.  This license, the GNU Lesser
-General Public License, applies to certain designated libraries, and
-is quite different from the ordinary General Public License.  We use
-this license for certain libraries in order to permit linking those
-libraries into non-free programs.
-
-  When a program is linked with a library, whether statically or using
-a shared library, the combination of the two is legally speaking a
-combined work, a derivative of the original library.  The ordinary
-General Public License therefore permits such linking only if the
-entire combination fits its criteria of freedom.  The Lesser General
-Public License permits more lax criteria for linking other code with
-the library.
-
-  We call this license the "Lesser" General Public License because it
-does Less to protect the user's freedom than the ordinary General
-Public License.  It also provides other free software developers Less
-of an advantage over competing non-free programs.  These disadvantages
-are the reason we use the ordinary General Public License for many
-libraries.  However, the Lesser license provides advantages in certain
-special circumstances.
-
-  For example, on rare occasions, there may be a special need to
-encourage the widest possible use of a certain library, so that it becomes
-a de-facto standard.  To achieve this, non-free programs must be
-allowed to use the library.  A more frequent case is that a free
-library does the same job as widely used non-free libraries.  In this
-case, there is little to gain by limiting the free library to free
-software only, so we use the Lesser General Public License.
-
-  In other cases, permission to use a particular library in non-free
-programs enables a greater number of people to use a large body of
-free software.  For example, permission to use the GNU C Library in
-non-free programs enables many more people to use the whole GNU
-operating system, as well as its variant, the GNU/Linux operating
-system.
-
-  Although the Lesser General Public License is Less protective of the
-users' freedom, it does ensure that the user of a program that is
-linked with the Library has the freedom and the wherewithal to run
-that program using a modified version of the Library.
-
-  The precise terms and conditions for copying, distribution and
-modification follow.  Pay close attention to the difference between a
-"work based on the library" and a "work that uses the library".  The
-former contains code derived from the library, whereas the latter must
-be combined with the library in order to run.
-
-                  GNU LESSER GENERAL PUBLIC LICENSE
-   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
-
-  0. This License Agreement applies to any software library or other
-program which contains a notice placed by the copyright holder or
-other authorized party saying it may be distributed under the terms of
-this Lesser General Public License (also called "this License").
-Each licensee is addressed as "you".
-
-  A "library" means a collection of software functions and/or data
-prepared so as to be conveniently linked with application programs
-(which use some of those functions and data) to form executables.
-
-  The "Library", below, refers to any such software library or work
-which has been distributed under these terms.  A "work based on the
-Library" means either the Library or any derivative work under
-copyright law: that is to say, a work containing the Library or a
-portion of it, either verbatim or with modifications and/or translated
-straightforwardly into another language.  (Hereinafter, translation is
-included without limitation in the term "modification".)
-
-  "Source code" for a work means the preferred form of the work for
-making modifications to it.  For a library, complete source code means
-all the source code for all modules it contains, plus any associated
-interface definition files, plus the scripts used to control compilation
-and installation of the library.
-
-  Activities other than copying, distribution and modification are not
-covered by this License; they are outside its scope.  The act of
-running a program using the Library is not restricted, and output from
-such a program is covered only if its contents constitute a work based
-on the Library (independent of the use of the Library in a tool for
-writing it).  Whether that is true depends on what the Library does
-and what the program that uses the Library does.
-
-  1. You may copy and distribute verbatim copies of the Library's
-complete source code as you receive it, in any medium, provided that
-you conspicuously and appropriately publish on each copy an
-appropriate copyright notice and disclaimer of warranty; keep intact
-all the notices that refer to this License and to the absence of any
-warranty; and distribute a copy of this License along with the
-Library.
-
-  You may charge a fee for the physical act of transferring a copy,
-and you may at your option offer warranty protection in exchange for a
-fee.
-
-  2. You may modify your copy or copies of the Library or any portion
-of it, thus forming a work based on the Library, and copy and
-distribute such modifications or work under the terms of Section 1
-above, provided that you also meet all of these conditions:
-
-    a) The modified work must itself be a software library.
-
-    b) You must cause the files modified to carry prominent notices
-    stating that you changed the files and the date of any change.
-
-    c) You must cause the whole of the work to be licensed at no
-    charge to all third parties under the terms of this License.
-
-    d) If a facility in the modified Library refers to a function or a
-    table of data to be supplied by an application program that uses
-    the facility, other than as an argument passed when the facility
-    is invoked, then you must make a good faith effort to ensure that,
-    in the event an application does not supply such function or
-    table, the facility still operates, and performs whatever part of
-    its purpose remains meaningful.
-
-    (For example, a function in a library to compute square roots has
-    a purpose that is entirely well-defined independent of the
-    application.  Therefore, Subsection 2d requires that any
-    application-supplied function or table used by this function must
-    be optional: if the application does not supply it, the square
-    root function must still compute square roots.)
-
-These requirements apply to the modified work as a whole.  If
-identifiable sections of that work are not derived from the Library,
-and can be reasonably considered independent and separate works in
-themselves, then this License, and its terms, do not apply to those
-sections when you distribute them as separate works.  But when you
-distribute the same sections as part of a whole which is a work based
-on the Library, the distribution of the whole must be on the terms of
-this License, whose permissions for other licensees extend to the
-entire whole, and thus to each and every part regardless of who wrote
-it.
-
-Thus, it is not the intent of this section to claim rights or contest
-your rights to work written entirely by you; rather, the intent is to
-exercise the right to control the distribution of derivative or
-collective works based on the Library.
-
-In addition, mere aggregation of another work not based on the Library
-with the Library (or with a work based on the Library) on a volume of
-a storage or distribution medium does not bring the other work under
-the scope of this License.
-
-  3. You may opt to apply the terms of the ordinary GNU General Public
-License instead of this License to a given copy of the Library.  To do
-this, you must alter all the notices that refer to this License, so
-that they refer to the ordinary GNU General Public License, version 2,
-instead of to this License.  (If a newer version than version 2 of the
-ordinary GNU General Public License has appeared, then you can specify
-that version instead if you wish.)  Do not make any other change in
-these notices.
-
-  Once this change is made in a given copy, it is irreversible for
-that copy, so the ordinary GNU General Public License applies to all
-subsequent copies and derivative works made from that copy.
-
-  This option is useful when you wish to copy part of the code of
-the Library into a program that is not a library.
-
-  4. You may copy and distribute the Library (or a portion or
-derivative of it, under Section 2) in object code or executable form
-under the terms of Sections 1 and 2 above provided that you accompany
-it with the complete corresponding machine-readable source code, which
-must be distributed under the terms of Sections 1 and 2 above on a
-medium customarily used for software interchange.
-
-  If distribution of object code is made by offering access to copy
-from a designated place, then offering equivalent access to copy the
-source code from the same place satisfies the requirement to
-distribute the source code, even though third parties are not
-compelled to copy the source along with the object code.
-
-  5. A program that contains no derivative of any portion of the
-Library, but is designed to work with the Library by being compiled or
-linked with it, is called a "work that uses the Library".  Such a
-work, in isolation, is not a derivative work of the Library, and
-therefore falls outside the scope of this License.
-
-  However, linking a "work that uses the Library" with the Library
-creates an executable that is a derivative of the Library (because it
-contains portions of the Library), rather than a "work that uses the
-library".  The executable is therefore covered by this License.
-Section 6 states terms for distribution of such executables.
-
-  When a "work that uses the Library" uses material from a header file
-that is part of the Library, the object code for the work may be a
-derivative work of the Library even though the source code is not.
-Whether this is true is especially significant if the work can be
-linked without the Library, or if the work is itself a library.  The
-threshold for this to be true is not precisely defined by law.
-
-  If such an object file uses only numerical parameters, data
-structure layouts and accessors, and small macros and small inline
-functions (ten lines or less in length), then the use of the object
-file is unrestricted, regardless of whether it is legally a derivative
-work.  (Executables containing this object code plus portions of the
-Library will still fall under Section 6.)
-
-  Otherwise, if the work is a derivative of the Library, you may
-distribute the object code for the work under the terms of Section 6.
-Any executables containing that work also fall under Section 6,
-whether or not they are linked directly with the Library itself.
-
-  6. As an exception to the Sections above, you may also combine or
-link a "work that uses the Library" with the Library to produce a
-work containing portions of the Library, and distribute that work
-under terms of your choice, provided that the terms permit
-modification of the work for the customer's own use and reverse
-engineering for debugging such modifications.
-
-  You must give prominent notice with each copy of the work that the
-Library is used in it and that the Library and its use are covered by
-this License.  You must supply a copy of this License.  If the work
-during execution displays copyright notices, you must include the
-copyright notice for the Library among them, as well as a reference
-directing the user to the copy of this License.  Also, you must do one
-of these things:
-
-    a) Accompany the work with the complete corresponding
-    machine-readable source code for the Library including whatever
-    changes were used in the work (which must be distributed under
-    Sections 1 and 2 above); and, if the work is an executable linked
-    with the Library, with the complete machine-readable "work that
-    uses the Library", as object code and/or source code, so that the
-    user can modify the Library and then relink to produce a modified
-    executable containing the modified Library.  (It is understood
-    that the user who changes the contents of definitions files in the
-    Library will not necessarily be able to recompile the application
-    to use the modified definitions.)
-
-    b) Use a suitable shared library mechanism for linking with the
-    Library.  A suitable mechanism is one that (1) uses at run time a
-    copy of the library already present on the user's computer system,
-    rather than copying library functions into the executable, and (2)
-    will operate properly with a modified version of the library, if
-    the user installs one, as long as the modified version is
-    interface-compatible with the version that the work was made with.
-
-    c) Accompany the work with a written offer, valid for at
-    least three years, to give the same user the materials
-    specified in Subsection 6a, above, for a charge no more
-    than the cost of performing this distribution.
-
-    d) If distribution of the work is made by offering access to copy
-    from a designated place, offer equivalent access to copy the above
-    specified materials from the same place.
-
-    e) Verify that the user has already received a copy of these
-    materials or that you have already sent this user a copy.
-
-  For an executable, the required form of the "work that uses the
-Library" must include any data and utility programs needed for
-reproducing the executable from it.  However, as a special exception,
-the materials to be distributed need not include anything that is
-normally distributed (in either source or binary form) with the major
-components (compiler, kernel, and so on) of the operating system on
-which the executable runs, unless that component itself accompanies
-the executable.
-
-  It may happen that this requirement contradicts the license
-restrictions of other proprietary libraries that do not normally
-accompany the operating system.  Such a contradiction means you cannot
-use both them and the Library together in an executable that you
-distribute.
-
-  7. You may place library facilities that are a work based on the
-Library side-by-side in a single library together with other library
-facilities not covered by this License, and distribute such a combined
-library, provided that the separate distribution of the work based on
-the Library and of the other library facilities is otherwise
-permitted, and provided that you do these two things:
-
-    a) Accompany the combined library with a copy of the same work
-    based on the Library, uncombined with any other library
-    facilities.  This must be distributed under the terms of the
-    Sections above.
-
-    b) Give prominent notice with the combined library of the fact
-    that part of it is a work based on the Library, and explaining
-    where to find the accompanying uncombined form of the same work.
-
-  8. You may not copy, modify, sublicense, link with, or distribute
-the Library except as expressly provided under this License.  Any
-attempt otherwise to copy, modify, sublicense, link with, or
-distribute the Library is void, and will automatically terminate your
-rights under this License.  However, parties who have received copies,
-or rights, from you under this License will not have their licenses
-terminated so long as such parties remain in full compliance.
-
-  9. You are not required to accept this License, since you have not
-signed it.  However, nothing else grants you permission to modify or
-distribute the Library or its derivative works.  These actions are
-prohibited by law if you do not accept this License.  Therefore, by
-modifying or distributing the Library (or any work based on the
-Library), you indicate your acceptance of this License to do so, and
-all its terms and conditions for copying, distributing or modifying
-the Library or works based on it.
-
-  10. Each time you redistribute the Library (or any work based on the
-Library), the recipient automatically receives a license from the
-original licensor to copy, distribute, link with or modify the Library
-subject to these terms and conditions.  You may not impose any further
-restrictions on the recipients' exercise of the rights granted herein.
-You are not responsible for enforcing compliance by third parties with
-this License.
-
-  11. If, as a consequence of a court judgment or allegation of patent
-infringement or for any other reason (not limited to patent issues),
-conditions are imposed on you (whether by court order, agreement or
-otherwise) that contradict the conditions of this License, they do not
-excuse you from the conditions of this License.  If you cannot
-distribute so as to satisfy simultaneously your obligations under this
-License and any other pertinent obligations, then as a consequence you
-may not distribute the Library at all.  For example, if a patent
-license would not permit royalty-free redistribution of the Library by
-all those who receive copies directly or indirectly through you, then
-the only way you could satisfy both it and this License would be to
-refrain entirely from distribution of the Library.
-
-If any portion of this section is held invalid or unenforceable under any
-particular circumstance, the balance of the section is intended to apply,
-and the section as a whole is intended to apply in other circumstances.
-
-It is not the purpose of this section to induce you to infringe any
-patents or other property right claims or to contest validity of any
-such claims; this section has the sole purpose of protecting the
-integrity of the free software distribution system which is
-implemented by public license practices.  Many people have made
-generous contributions to the wide range of software distributed
-through that system in reliance on consistent application of that
-system; it is up to the author/donor to decide if he or she is willing
-to distribute software through any other system and a licensee cannot
-impose that choice.
-
-This section is intended to make thoroughly clear what is believed to
-be a consequence of the rest of this License.
-
-  12. If the distribution and/or use of the Library is restricted in
-certain countries either by patents or by copyrighted interfaces, the
-original copyright holder who places the Library under this License may add
-an explicit geographical distribution limitation excluding those countries,
-so that distribution is permitted only in or among countries not thus
-excluded.  In such case, this License incorporates the limitation as if
-written in the body of this License.
-
-  13. The Free Software Foundation may publish revised and/or new
-versions of the Lesser General Public License from time to time.
-Such new versions will be similar in spirit to the present version,
-but may differ in detail to address new problems or concerns.
-
-Each version is given a distinguishing version number.  If the Library
-specifies a version number of this License which applies to it and
-"any later version", you have the option of following the terms and
-conditions either of that version or of any later version published by
-the Free Software Foundation.  If the Library does not specify a
-license version number, you may choose any version ever published by
-the Free Software Foundation.
-
-  14. If you wish to incorporate parts of the Library into other free
-programs whose distribution conditions are incompatible with these,
-write to the author to ask for permission.  For software which is
-copyrighted by the Free Software Foundation, write to the Free
-Software Foundation; we sometimes make exceptions for this.  Our
-decision will be guided by the two goals of preserving the free status
-of all derivatives of our free software and of promoting the sharing
-and reuse of software generally.
-
-                            NO WARRANTY
-
-  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
-WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
-EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
-OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
-KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
-LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
-THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
-
-  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
-WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
-AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
-FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
-CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
-LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
-RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
-FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
-SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
-DAMAGES.
-
-                     END OF TERMS AND CONDITIONS
-
-           How to Apply These Terms to Your New Libraries
-
-  If you develop a new library, and you want it to be of the greatest
-possible use to the public, we recommend making it free software that
-everyone can redistribute and change.  You can do so by permitting
-redistribution under these terms (or, alternatively, under the terms of the
-ordinary General Public License).
-
-  To apply these terms, attach the following notices to the library.  It is
-safest to attach them to the start of each source file to most effectively
-convey the exclusion of warranty; and each file should have at least the
-"copyright" line and a pointer to where the full notice is found.
-
-    <one line to give the library's name and a brief idea of what it does.>
-    Copyright (C) <year>  <name of author>
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Lesser General Public
-    License as published by the Free Software Foundation; either
-    version 2.1 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public
-    License along with this library; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
-
-Also add information on how to contact you by electronic and paper mail.
-
-You should also get your employer (if you work as a programmer) or your
-school, if any, to sign a "copyright disclaimer" for the library, if
-necessary.  Here is a sample; alter the names:
-
-  Yoyodyne, Inc., hereby disclaims all copyright interest in the
-  library `Frob' (a library for tweaking knobs) written by James Random Hacker.
-
-  <signature of Ty Coon>, 1 April 1990
-  Ty Coon, President of Vice
-
-That's all there is to it!
diff --git a/nuparu/include/Eigen/COPYING.MINPACK b/nuparu/include/Eigen/COPYING.MINPACK
deleted file mode 100644
index ae7984da..00000000
--- a/nuparu/include/Eigen/COPYING.MINPACK
+++ /dev/null
@@ -1,52 +0,0 @@
-Minpack Copyright Notice (1999) University of Chicago.  All rights reserved
-
-Redistribution and use in source and binary forms, with or
-without modification, are permitted provided that the
-following conditions are met:
-
-1. Redistributions of source code must retain the above
-copyright notice, this list of conditions and the following
-disclaimer.
-
-2. Redistributions in binary form must reproduce the above
-copyright notice, this list of conditions and the following
-disclaimer in the documentation and/or other materials
-provided with the distribution.
-
-3. The end-user documentation included with the
-redistribution, if any, must include the following
-acknowledgment:
-
-   "This product includes software developed by the
-   University of Chicago, as Operator of Argonne National
-   Laboratory.
-
-Alternately, this acknowledgment may appear in the software
-itself, if and wherever such third-party acknowledgments
-normally appear.
-
-4. WARRANTY DISCLAIMER. THE SOFTWARE IS SUPPLIED "AS IS"
-WITHOUT WARRANTY OF ANY KIND. THE COPYRIGHT HOLDER, THE
-UNITED STATES, THE UNITED STATES DEPARTMENT OF ENERGY, AND
-THEIR EMPLOYEES: (1) DISCLAIM ANY WARRANTIES, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO ANY IMPLIED WARRANTIES
-OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE
-OR NON-INFRINGEMENT, (2) DO NOT ASSUME ANY LEGAL LIABILITY
-OR RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR
-USEFULNESS OF THE SOFTWARE, (3) DO NOT REPRESENT THAT USE OF
-THE SOFTWARE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS, (4)
-DO NOT WARRANT THAT THE SOFTWARE WILL FUNCTION
-UNINTERRUPTED, THAT IT IS ERROR-FREE OR THAT ANY ERRORS WILL
-BE CORRECTED.
-
-5. LIMITATION OF LIABILITY. IN NO EVENT WILL THE COPYRIGHT
-HOLDER, THE UNITED STATES, THE UNITED STATES DEPARTMENT OF
-ENERGY, OR THEIR EMPLOYEES: BE LIABLE FOR ANY INDIRECT,
-INCIDENTAL, CONSEQUENTIAL, SPECIAL OR PUNITIVE DAMAGES OF
-ANY KIND OR NATURE, INCLUDING BUT NOT LIMITED TO LOSS OF
-PROFITS OR LOSS OF DATA, FOR ANY REASON WHATSOEVER, WHETHER
-SUCH LIABILITY IS ASSERTED ON THE BASIS OF CONTRACT, TORT
-(INCLUDING NEGLIGENCE OR STRICT LIABILITY), OR OTHERWISE,
-EVEN IF ANY OF SAID PARTIES HAS BEEN WARNED OF THE
-POSSIBILITY OF SUCH LOSS OR DAMAGES.
-
diff --git a/nuparu/include/Eigen/COPYING.MPL2 b/nuparu/include/Eigen/COPYING.MPL2
deleted file mode 100644
index 14e2f777..00000000
--- a/nuparu/include/Eigen/COPYING.MPL2
+++ /dev/null
@@ -1,373 +0,0 @@
-Mozilla Public License Version 2.0
-==================================
-
-1. Definitions
---------------
-
-1.1. "Contributor"
-    means each individual or legal entity that creates, contributes to
-    the creation of, or owns Covered Software.
-
-1.2. "Contributor Version"
-    means the combination of the Contributions of others (if any) used
-    by a Contributor and that particular Contributor's Contribution.
-
-1.3. "Contribution"
-    means Covered Software of a particular Contributor.
-
-1.4. "Covered Software"
-    means Source Code Form to which the initial Contributor has attached
-    the notice in Exhibit A, the Executable Form of such Source Code
-    Form, and Modifications of such Source Code Form, in each case
-    including portions thereof.
-
-1.5. "Incompatible With Secondary Licenses"
-    means
-
-    (a) that the initial Contributor has attached the notice described
-        in Exhibit B to the Covered Software; or
-
-    (b) that the Covered Software was made available under the terms of
-        version 1.1 or earlier of the License, but not also under the
-        terms of a Secondary License.
-
-1.6. "Executable Form"
-    means any form of the work other than Source Code Form.
-
-1.7. "Larger Work"
-    means a work that combines Covered Software with other material, in 
-    a separate file or files, that is not Covered Software.
-
-1.8. "License"
-    means this document.
-
-1.9. "Licensable"
-    means having the right to grant, to the maximum extent possible,
-    whether at the time of the initial grant or subsequently, any and
-    all of the rights conveyed by this License.
-
-1.10. "Modifications"
-    means any of the following:
-
-    (a) any file in Source Code Form that results from an addition to,
-        deletion from, or modification of the contents of Covered
-        Software; or
-
-    (b) any new file in Source Code Form that contains any Covered
-        Software.
-
-1.11. "Patent Claims" of a Contributor
-    means any patent claim(s), including without limitation, method,
-    process, and apparatus claims, in any patent Licensable by such
-    Contributor that would be infringed, but for the grant of the
-    License, by the making, using, selling, offering for sale, having
-    made, import, or transfer of either its Contributions or its
-    Contributor Version.
-
-1.12. "Secondary License"
-    means either the GNU General Public License, Version 2.0, the GNU
-    Lesser General Public License, Version 2.1, the GNU Affero General
-    Public License, Version 3.0, or any later versions of those
-    licenses.
-
-1.13. "Source Code Form"
-    means the form of the work preferred for making modifications.
-
-1.14. "You" (or "Your")
-    means an individual or a legal entity exercising rights under this
-    License. For legal entities, "You" includes any entity that
-    controls, is controlled by, or is under common control with You. For
-    purposes of this definition, "control" means (a) the power, direct
-    or indirect, to cause the direction or management of such entity,
-    whether by contract or otherwise, or (b) ownership of more than
-    fifty percent (50%) of the outstanding shares or beneficial
-    ownership of such entity.
-
-2. License Grants and Conditions
---------------------------------
-
-2.1. Grants
-
-Each Contributor hereby grants You a world-wide, royalty-free,
-non-exclusive license:
-
-(a) under intellectual property rights (other than patent or trademark)
-    Licensable by such Contributor to use, reproduce, make available,
-    modify, display, perform, distribute, and otherwise exploit its
-    Contributions, either on an unmodified basis, with Modifications, or
-    as part of a Larger Work; and
-
-(b) under Patent Claims of such Contributor to make, use, sell, offer
-    for sale, have made, import, and otherwise transfer either its
-    Contributions or its Contributor Version.
-
-2.2. Effective Date
-
-The licenses granted in Section 2.1 with respect to any Contribution
-become effective for each Contribution on the date the Contributor first
-distributes such Contribution.
-
-2.3. Limitations on Grant Scope
-
-The licenses granted in this Section 2 are the only rights granted under
-this License. No additional rights or licenses will be implied from the
-distribution or licensing of Covered Software under this License.
-Notwithstanding Section 2.1(b) above, no patent license is granted by a
-Contributor:
-
-(a) for any code that a Contributor has removed from Covered Software;
-    or
-
-(b) for infringements caused by: (i) Your and any other third party's
-    modifications of Covered Software, or (ii) the combination of its
-    Contributions with other software (except as part of its Contributor
-    Version); or
-
-(c) under Patent Claims infringed by Covered Software in the absence of
-    its Contributions.
-
-This License does not grant any rights in the trademarks, service marks,
-or logos of any Contributor (except as may be necessary to comply with
-the notice requirements in Section 3.4).
-
-2.4. Subsequent Licenses
-
-No Contributor makes additional grants as a result of Your choice to
-distribute the Covered Software under a subsequent version of this
-License (see Section 10.2) or under the terms of a Secondary License (if
-permitted under the terms of Section 3.3).
-
-2.5. Representation
-
-Each Contributor represents that the Contributor believes its
-Contributions are its original creation(s) or it has sufficient rights
-to grant the rights to its Contributions conveyed by this License.
-
-2.6. Fair Use
-
-This License is not intended to limit any rights You have under
-applicable copyright doctrines of fair use, fair dealing, or other
-equivalents.
-
-2.7. Conditions
-
-Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
-in Section 2.1.
-
-3. Responsibilities
--------------------
-
-3.1. Distribution of Source Form
-
-All distribution of Covered Software in Source Code Form, including any
-Modifications that You create or to which You contribute, must be under
-the terms of this License. You must inform recipients that the Source
-Code Form of the Covered Software is governed by the terms of this
-License, and how they can obtain a copy of this License. You may not
-attempt to alter or restrict the recipients' rights in the Source Code
-Form.
-
-3.2. Distribution of Executable Form
-
-If You distribute Covered Software in Executable Form then:
-
-(a) such Covered Software must also be made available in Source Code
-    Form, as described in Section 3.1, and You must inform recipients of
-    the Executable Form how they can obtain a copy of such Source Code
-    Form by reasonable means in a timely manner, at a charge no more
-    than the cost of distribution to the recipient; and
-
-(b) You may distribute such Executable Form under the terms of this
-    License, or sublicense it under different terms, provided that the
-    license for the Executable Form does not attempt to limit or alter
-    the recipients' rights in the Source Code Form under this License.
-
-3.3. Distribution of a Larger Work
-
-You may create and distribute a Larger Work under terms of Your choice,
-provided that You also comply with the requirements of this License for
-the Covered Software. If the Larger Work is a combination of Covered
-Software with a work governed by one or more Secondary Licenses, and the
-Covered Software is not Incompatible With Secondary Licenses, this
-License permits You to additionally distribute such Covered Software
-under the terms of such Secondary License(s), so that the recipient of
-the Larger Work may, at their option, further distribute the Covered
-Software under the terms of either this License or such Secondary
-License(s).
-
-3.4. Notices
-
-You may not remove or alter the substance of any license notices
-(including copyright notices, patent notices, disclaimers of warranty,
-or limitations of liability) contained within the Source Code Form of
-the Covered Software, except that You may alter any license notices to
-the extent required to remedy known factual inaccuracies.
-
-3.5. Application of Additional Terms
-
-You may choose to offer, and to charge a fee for, warranty, support,
-indemnity or liability obligations to one or more recipients of Covered
-Software. However, You may do so only on Your own behalf, and not on
-behalf of any Contributor. You must make it absolutely clear that any
-such warranty, support, indemnity, or liability obligation is offered by
-You alone, and You hereby agree to indemnify every Contributor for any
-liability incurred by such Contributor as a result of warranty, support,
-indemnity or liability terms You offer. You may include additional
-disclaimers of warranty and limitations of liability specific to any
-jurisdiction.
-
-4. Inability to Comply Due to Statute or Regulation
----------------------------------------------------
-
-If it is impossible for You to comply with any of the terms of this
-License with respect to some or all of the Covered Software due to
-statute, judicial order, or regulation then You must: (a) comply with
-the terms of this License to the maximum extent possible; and (b)
-describe the limitations and the code they affect. Such description must
-be placed in a text file included with all distributions of the Covered
-Software under this License. Except to the extent prohibited by statute
-or regulation, such description must be sufficiently detailed for a
-recipient of ordinary skill to be able to understand it.
-
-5. Termination
---------------
-
-5.1. The rights granted under this License will terminate automatically
-if You fail to comply with any of its terms. However, if You become
-compliant, then the rights granted under this License from a particular
-Contributor are reinstated (a) provisionally, unless and until such
-Contributor explicitly and finally terminates Your grants, and (b) on an
-ongoing basis, if such Contributor fails to notify You of the
-non-compliance by some reasonable means prior to 60 days after You have
-come back into compliance. Moreover, Your grants from a particular
-Contributor are reinstated on an ongoing basis if such Contributor
-notifies You of the non-compliance by some reasonable means, this is the
-first time You have received notice of non-compliance with this License
-from such Contributor, and You become compliant prior to 30 days after
-Your receipt of the notice.
-
-5.2. If You initiate litigation against any entity by asserting a patent
-infringement claim (excluding declaratory judgment actions,
-counter-claims, and cross-claims) alleging that a Contributor Version
-directly or indirectly infringes any patent, then the rights granted to
-You by any and all Contributors for the Covered Software under Section
-2.1 of this License shall terminate.
-
-5.3. In the event of termination under Sections 5.1 or 5.2 above, all
-end user license agreements (excluding distributors and resellers) which
-have been validly granted by You or Your distributors under this License
-prior to termination shall survive termination.
-
-************************************************************************
-*                                                                      *
-*  6. Disclaimer of Warranty                                           *
-*  -------------------------                                           *
-*                                                                      *
-*  Covered Software is provided under this License on an "as is"       *
-*  basis, without warranty of any kind, either expressed, implied, or  *
-*  statutory, including, without limitation, warranties that the       *
-*  Covered Software is free of defects, merchantable, fit for a        *
-*  particular purpose or non-infringing. The entire risk as to the     *
-*  quality and performance of the Covered Software is with You.        *
-*  Should any Covered Software prove defective in any respect, You     *
-*  (not any Contributor) assume the cost of any necessary servicing,   *
-*  repair, or correction. This disclaimer of warranty constitutes an   *
-*  essential part of this License. No use of any Covered Software is   *
-*  authorized under this License except under this disclaimer.         *
-*                                                                      *
-************************************************************************
-
-************************************************************************
-*                                                                      *
-*  7. Limitation of Liability                                          *
-*  --------------------------                                          *
-*                                                                      *
-*  Under no circumstances and under no legal theory, whether tort      *
-*  (including negligence), contract, or otherwise, shall any           *
-*  Contributor, or anyone who distributes Covered Software as          *
-*  permitted above, be liable to You for any direct, indirect,         *
-*  special, incidental, or consequential damages of any character      *
-*  including, without limitation, damages for lost profits, loss of    *
-*  goodwill, work stoppage, computer failure or malfunction, or any    *
-*  and all other commercial damages or losses, even if such party      *
-*  shall have been informed of the possibility of such damages. This   *
-*  limitation of liability shall not apply to liability for death or   *
-*  personal injury resulting from such party's negligence to the       *
-*  extent applicable law prohibits such limitation. Some               *
-*  jurisdictions do not allow the exclusion or limitation of           *
-*  incidental or consequential damages, so this exclusion and          *
-*  limitation may not apply to You.                                    *
-*                                                                      *
-************************************************************************
-
-8. Litigation
--------------
-
-Any litigation relating to this License may be brought only in the
-courts of a jurisdiction where the defendant maintains its principal
-place of business and such litigation shall be governed by laws of that
-jurisdiction, without reference to its conflict-of-law provisions.
-Nothing in this Section shall prevent a party's ability to bring
-cross-claims or counter-claims.
-
-9. Miscellaneous
-----------------
-
-This License represents the complete agreement concerning the subject
-matter hereof. If any provision of this License is held to be
-unenforceable, such provision shall be reformed only to the extent
-necessary to make it enforceable. Any law or regulation which provides
-that the language of a contract shall be construed against the drafter
-shall not be used to construe this License against a Contributor.
-
-10. Versions of the License
----------------------------
-
-10.1. New Versions
-
-Mozilla Foundation is the license steward. Except as provided in Section
-10.3, no one other than the license steward has the right to modify or
-publish new versions of this License. Each version will be given a
-distinguishing version number.
-
-10.2. Effect of New Versions
-
-You may distribute the Covered Software under the terms of the version
-of the License under which You originally received the Covered Software,
-or under the terms of any subsequent version published by the license
-steward.
-
-10.3. Modified Versions
-
-If you create software not governed by this License, and you want to
-create a new license for such software, you may create and use a
-modified version of this License if you rename the license and remove
-any references to the name of the license steward (except to note that
-such modified license differs from this License).
-
-10.4. Distributing Source Code Form that is Incompatible With Secondary
-Licenses
-
-If You choose to distribute Source Code Form that is Incompatible With
-Secondary Licenses under the terms of this version of the License, the
-notice described in Exhibit B of this License must be attached.
-
-Exhibit A - Source Code Form License Notice
--------------------------------------------
-
-  This Source Code Form is subject to the terms of the Mozilla Public
-  License, v. 2.0. If a copy of the MPL was not distributed with this
-  file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-If it is not possible or desirable to put the notice in a particular
-file, then You may include the notice in a location (such as a LICENSE
-file in a relevant directory) where a recipient would be likely to look
-for such a notice.
-
-You may add additional accurate notices of copyright ownership.
-
-Exhibit B - "Incompatible With Secondary Licenses" Notice
----------------------------------------------------------
-
-  This Source Code Form is "Incompatible With Secondary Licenses", as
-  defined by the Mozilla Public License, v. 2.0.
diff --git a/nuparu/include/Eigen/COPYING.README b/nuparu/include/Eigen/COPYING.README
deleted file mode 100644
index de5b6321..00000000
--- a/nuparu/include/Eigen/COPYING.README
+++ /dev/null
@@ -1,18 +0,0 @@
-Eigen is primarily MPL2 licensed. See COPYING.MPL2 and these links:
-  http://www.mozilla.org/MPL/2.0/
-  http://www.mozilla.org/MPL/2.0/FAQ.html
-
-Some files contain third-party code under BSD or LGPL licenses, whence the other
-COPYING.* files here.
-
-All the LGPL code is either LGPL 2.1-only, or LGPL 2.1-or-later.
-For this reason, the COPYING.LGPL file contains the LGPL 2.1 text.
-
-If you want to guarantee that the Eigen code that you are #including is licensed
-under the MPL2 and possibly more permissive licenses (like BSD), #define this
-preprocessor symbol:
-  EIGEN_MPL2_ONLY
-For example, with most compilers, you could add this to your project CXXFLAGS:
-  -DEIGEN_MPL2_ONLY
-This will cause a compilation error to be generated if you #include any code that is
-LGPL licensed.
diff --git a/nuparu/include/Eigen/Cholesky b/nuparu/include/Eigen/Cholesky
index f727f5d8..705a04cc 100644
--- a/nuparu/include/Eigen/Cholesky
+++ b/nuparu/include/Eigen/Cholesky
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_CHOLESKY_MODULE_H
 #define EIGEN_CHOLESKY_MODULE_H
 
@@ -10,16 +17,17 @@
   *
   *
   * This module provides two variants of the Cholesky decomposition for selfadjoint (hermitian) matrices.
-  * Those decompositions are accessible via the following MatrixBase methods:
-  *  - MatrixBase::llt(),
+  * Those decompositions are also accessible via the following methods:
+  *  - MatrixBase::llt()
   *  - MatrixBase::ldlt()
+  *  - SelfAdjointView::llt()
+  *  - SelfAdjointView::ldlt()
   *
   * \code
   * #include <Eigen/Cholesky>
   * \endcode
   */
 
-#include "src/misc/Solve.h"
 #include "src/Cholesky/LLT.h"
 #include "src/Cholesky/LDLT.h"
 #ifdef EIGEN_USE_LAPACKE
diff --git a/nuparu/include/Eigen/CholmodSupport b/nuparu/include/Eigen/CholmodSupport
index 745b884e..83e2c1da 100644
--- a/nuparu/include/Eigen/CholmodSupport
+++ b/nuparu/include/Eigen/CholmodSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_CHOLMODSUPPORT_MODULE_H
 #define EIGEN_CHOLMODSUPPORT_MODULE_H
 
@@ -33,12 +40,8 @@ extern "C" {
   *
   */
 
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
-
 #include "src/CholmodSupport/CholmodSupport.h"
 
-
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_CHOLMODSUPPORT_MODULE_H
diff --git a/nuparu/include/Eigen/Core b/nuparu/include/Eigen/Core
index 9131cc3f..63602f4c 100644
--- a/nuparu/include/Eigen/Core
+++ b/nuparu/include/Eigen/Core
@@ -14,6 +14,48 @@
 // first thing Eigen does: stop the compiler from committing suicide
 #include "src/Core/util/DisableStupidWarnings.h"
 
+// Handle NVCC/CUDA
+#ifdef __CUDACC__
+  // Do not try asserts on CUDA!
+  #ifndef EIGEN_NO_DEBUG
+  #define EIGEN_NO_DEBUG
+  #endif
+
+  #ifdef EIGEN_INTERNAL_DEBUGGING
+  #undef EIGEN_INTERNAL_DEBUGGING
+  #endif
+
+  // Do not try to vectorize on CUDA!
+  #ifndef EIGEN_DONT_VECTORIZE
+  #define EIGEN_DONT_VECTORIZE
+  #endif
+
+  #ifdef EIGEN_EXCEPTIONS
+  #undef EIGEN_EXCEPTIONS
+  #endif
+  
+  // All functions callable from CUDA code must be qualified with __device__
+  #define EIGEN_DEVICE_FUNC __host__ __device__
+  
+#else
+  #define EIGEN_DEVICE_FUNC
+  
+#endif
+
+#if defined(__CUDA_ARCH__)
+  #define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
+#else
+  #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
+#endif
+
+#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(EIGEN_EXCEPTIONS)
+  #define EIGEN_EXCEPTIONS
+#endif
+
+#ifdef EIGEN_EXCEPTIONS
+  #include <new>
+#endif
+
 // then include this file where all our macros are defined. It's really important to do it first because
 // it's where we do all the alignment settings (platform detection and honoring the user's will if he
 // defined e.g. EIGEN_DONT_ALIGN) so it needs to be done before we do anything with vectorization.
@@ -21,7 +63,7 @@
 
 // Disable the ipa-cp-clone optimization flag with MinGW 6.x or newer (enabled by default with -O3)
 // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=556 for details.
-#if defined(__MINGW32__) && EIGEN_GNUC_AT_LEAST(4,6)
+#if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_LEAST(4,6)
   #pragma GCC optimize ("-fno-ipa-cp-clone")
 #endif
 
@@ -31,26 +73,26 @@
 // and inclusion of their respective header files
 #include "src/Core/util/MKL_support.h"
 
-// if alignment is disabled, then disable vectorization. Note: EIGEN_ALIGN is the proper check, it takes into
-// account both the user's will (EIGEN_DONT_ALIGN) and our own platform checks
-#if !EIGEN_ALIGN
+// if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into
+// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks
+#if EIGEN_MAX_ALIGN_BYTES==0
   #ifndef EIGEN_DONT_VECTORIZE
     #define EIGEN_DONT_VECTORIZE
   #endif
 #endif
 
-#ifdef _MSC_VER
+#if EIGEN_COMP_MSVC
   #include <malloc.h> // for _aligned_malloc -- need it regardless of whether vectorization is enabled
-  #if (_MSC_VER >= 1500) // 2008 or later
+  #if (EIGEN_COMP_MSVC >= 1500) // 2008 or later
     // Remember that usage of defined() in a #define is undefined by the standard.
     // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP.
-    #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || defined(_M_X64)
+    #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64
       #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER
     #endif
   #endif
 #else
   // Remember that usage of defined() in a #define is undefined by the standard
-  #if (defined __SSE2__) && ( (!defined __GNUC__) || (defined __INTEL_COMPILER) || EIGEN_GNUC_AT_LEAST(4,2) )
+  #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) )
     #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC
   #endif
 #endif
@@ -82,6 +124,19 @@
     #ifdef __SSE4_2__
       #define EIGEN_VECTORIZE_SSE4_2
     #endif
+    #ifdef __AVX__
+      #define EIGEN_VECTORIZE_AVX
+      #define EIGEN_VECTORIZE_SSE3
+      #define EIGEN_VECTORIZE_SSSE3
+      #define EIGEN_VECTORIZE_SSE4_1
+      #define EIGEN_VECTORIZE_SSE4_2
+    #endif
+    #ifdef __AVX2__
+      #define EIGEN_VECTORIZE_AVX2
+    #endif
+    #ifdef __FMA__
+      #define EIGEN_VECTORIZE_FMA
+    #endif
 
     // include files
 
@@ -95,7 +150,7 @@
     extern "C" {
       // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
       // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus:
-      #ifdef __INTEL_COMPILER
+      #if EIGEN_COMP_ICC >= 1110
         #include <immintrin.h>
       #else
         #include <emmintrin.h>
@@ -112,8 +167,20 @@
         #ifdef EIGEN_VECTORIZE_SSE4_2
         #include <nmmintrin.h>
         #endif
+        #ifdef EIGEN_VECTORIZE_AVX
+        #include <immintrin.h>
+        #endif
       #endif
     } // end extern "C"
+  #elif defined __VSX__
+    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_VSX
+    #include <altivec.h>
+    // We need to #undef all these ugly tokens defined in <altivec.h>
+    // => use __vector instead of vector
+    #undef bool
+    #undef vector
+    #undef pixel
   #elif defined __ALTIVEC__
     #define EIGEN_VECTORIZE
     #define EIGEN_VECTORIZE_ALTIVEC
@@ -123,13 +190,18 @@
     #undef bool
     #undef vector
     #undef pixel
-  #elif defined  __ARM_NEON__
+  #elif (defined  __ARM_NEON) || (defined __ARM_NEON__)
     #define EIGEN_VECTORIZE
     #define EIGEN_VECTORIZE_NEON
     #include <arm_neon.h>
   #endif
 #endif
 
+#if defined __CUDACC__
+  #define EIGEN_VECTORIZE_CUDA
+  #include <vector_types.h>
+#endif
+
 #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
   #define EIGEN_HAS_OPENMP
 #endif
@@ -139,7 +211,7 @@
 #endif
 
 // MSVC for windows mobile does not have the errno.h file
-#if !(defined(_MSC_VER) && defined(_WIN32_WCE)) && !defined(__ARMCC_VERSION)
+#if !(EIGEN_COMP_MSVC && EIGEN_OS_WINCE) && !EIGEN_COMP_ARM
 #define EIGEN_HAS_ERRNO
 #endif
 
@@ -165,23 +237,17 @@
 #endif
 
 // required for __cpuid, needs to be included after cmath
-#if defined(_MSC_VER) && (defined(_M_IX86)||defined(_M_X64))
+#if EIGEN_COMP_MSVC && EIGEN_ARCH_i386_OR_x86_64 && !EIGEN_OS_WINCE
   #include <intrin.h>
 #endif
 
-#if defined(_CPPUNWIND) || defined(__EXCEPTIONS)
-  #define EIGEN_EXCEPTIONS
-#endif
-
-#ifdef EIGEN_EXCEPTIONS
-  #include <new>
-#endif
-
 /** \brief Namespace containing all symbols from the %Eigen library. */
 namespace Eigen {
 
 inline static const char *SimdInstructionSetsInUse(void) {
-#if defined(EIGEN_VECTORIZE_SSE4_2)
+#if defined(EIGEN_VECTORIZE_AVX)
+  return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_SSE4_2)
   return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
 #elif defined(EIGEN_VECTORIZE_SSE4_1)
   return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
@@ -193,6 +259,8 @@ inline static const char *SimdInstructionSetsInUse(void) {
   return "SSE, SSE2";
 #elif defined(EIGEN_VECTORIZE_ALTIVEC)
   return "AltiVec";
+#elif defined(EIGEN_VECTORIZE_VSX)
+  return "VSX";
 #elif defined(EIGEN_VECTORIZE_NEON)
   return "ARM NEON";
 #else
@@ -202,34 +270,9 @@ inline static const char *SimdInstructionSetsInUse(void) {
 
 } // end namespace Eigen
 
-#define STAGE10_FULL_EIGEN2_API             10
-#define STAGE20_RESOLVE_API_CONFLICTS       20
-#define STAGE30_FULL_EIGEN3_API             30
-#define STAGE40_FULL_EIGEN3_STRICTNESS      40
-#define STAGE99_NO_EIGEN2_SUPPORT           99
-
-#if   defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS
-  #define EIGEN2_SUPPORT
-  #define EIGEN2_SUPPORT_STAGE STAGE40_FULL_EIGEN3_STRICTNESS
-#elif defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API
-  #define EIGEN2_SUPPORT
-  #define EIGEN2_SUPPORT_STAGE STAGE30_FULL_EIGEN3_API
-#elif defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS
-  #define EIGEN2_SUPPORT
-  #define EIGEN2_SUPPORT_STAGE STAGE20_RESOLVE_API_CONFLICTS
-#elif defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API
-  #define EIGEN2_SUPPORT
-  #define EIGEN2_SUPPORT_STAGE STAGE10_FULL_EIGEN2_API
-#elif defined EIGEN2_SUPPORT
-  // default to stage 3, that's what it's always meant
-  #define EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API
-  #define EIGEN2_SUPPORT_STAGE STAGE30_FULL_EIGEN3_API
-#else
-  #define EIGEN2_SUPPORT_STAGE STAGE99_NO_EIGEN2_SUPPORT
-#endif
-
-#ifdef EIGEN2_SUPPORT
-#undef minor
+#if defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS || defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API || defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS || defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API || defined EIGEN2_SUPPORT
+// This will generate an error message:
+#error Eigen2-support is only available up to version 3.2. Please go to "http://eigen.tuxfamily.org/index.php?title=Eigen2" for further information
 #endif
 
 // we use size_t frequently and we'll never remember to prepend it with std:: everytime just to
@@ -249,45 +292,75 @@ using std::ptrdiff_t;
   */
 
 #include "src/Core/util/Constants.h"
-#include "src/Core/util/ForwardDeclarations.h"
 #include "src/Core/util/Meta.h"
+#include "src/Core/util/ForwardDeclarations.h"
 #include "src/Core/util/StaticAssert.h"
 #include "src/Core/util/XprHelper.h"
 #include "src/Core/util/Memory.h"
 
 #include "src/Core/NumTraits.h"
 #include "src/Core/MathFunctions.h"
+#include "src/Core/SpecialFunctions.h"
 #include "src/Core/GenericPacketMath.h"
 
-#if defined EIGEN_VECTORIZE_SSE
+#if defined EIGEN_VECTORIZE_AVX
+  // Use AVX for floats and doubles, SSE for integers
+  #include "src/Core/arch/SSE/PacketMath.h"
+  #include "src/Core/arch/SSE/Complex.h"
+  #include "src/Core/arch/SSE/MathFunctions.h"
+  #include "src/Core/arch/AVX/PacketMath.h"
+  #include "src/Core/arch/AVX/MathFunctions.h"
+  #include "src/Core/arch/AVX/Complex.h"
+  #include "src/Core/arch/AVX/TypeCasting.h"
+#elif defined EIGEN_VECTORIZE_SSE
   #include "src/Core/arch/SSE/PacketMath.h"
   #include "src/Core/arch/SSE/MathFunctions.h"
   #include "src/Core/arch/SSE/Complex.h"
-#elif defined EIGEN_VECTORIZE_ALTIVEC
+  #include "src/Core/arch/SSE/TypeCasting.h"
+#elif defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
   #include "src/Core/arch/AltiVec/PacketMath.h"
+  #include "src/Core/arch/AltiVec/MathFunctions.h"
   #include "src/Core/arch/AltiVec/Complex.h"
 #elif defined EIGEN_VECTORIZE_NEON
   #include "src/Core/arch/NEON/PacketMath.h"
+  #include "src/Core/arch/NEON/MathFunctions.h"
   #include "src/Core/arch/NEON/Complex.h"
 #endif
 
+#if defined EIGEN_VECTORIZE_CUDA
+  #include "src/Core/arch/CUDA/PacketMath.h"
+  #include "src/Core/arch/CUDA/MathFunctions.h"
+#endif
+
 #include "src/Core/arch/Default/Settings.h"
 
-#include "src/Core/Functors.h"
+#include "src/Core/functors/BinaryFunctors.h"
+#include "src/Core/functors/UnaryFunctors.h"
+#include "src/Core/functors/NullaryFunctors.h"
+#include "src/Core/functors/StlFunctors.h"
+#include "src/Core/functors/AssignmentFunctors.h"
+
 #include "src/Core/DenseCoeffsBase.h"
 #include "src/Core/DenseBase.h"
 #include "src/Core/MatrixBase.h"
 #include "src/Core/EigenBase.h"
 
+#include "src/Core/Product.h"
+#include "src/Core/CoreEvaluators.h"
+#include "src/Core/AssignEvaluator.h"
+
 #ifndef EIGEN_PARSED_BY_DOXYGEN // work around Doxygen bug triggered by Assign.h r814874
                                 // at least confirmed with Doxygen 1.5.5 and 1.5.6
   #include "src/Core/Assign.h"
 #endif
 
+#include "src/Core/ArrayBase.h"
 #include "src/Core/util/BlasUtil.h"
 #include "src/Core/DenseStorage.h"
 #include "src/Core/NestByValue.h"
-#include "src/Core/ForceAlignedAccess.h"
+
+// #include "src/Core/ForceAlignedAccess.h"
+
 #include "src/Core/ReturnByValue.h"
 #include "src/Core/NoAlias.h"
 #include "src/Core/PlainObjectBase.h"
@@ -300,32 +373,33 @@ using std::ptrdiff_t;
 #include "src/Core/SelfCwiseBinaryOp.h"
 #include "src/Core/Dot.h"
 #include "src/Core/StableNorm.h"
-#include "src/Core/MapBase.h"
 #include "src/Core/Stride.h"
+#include "src/Core/MapBase.h"
 #include "src/Core/Map.h"
+#include "src/Core/Ref.h"
 #include "src/Core/Block.h"
 #include "src/Core/VectorBlock.h"
-#include "src/Core/Ref.h"
 #include "src/Core/Transpose.h"
 #include "src/Core/DiagonalMatrix.h"
 #include "src/Core/Diagonal.h"
 #include "src/Core/DiagonalProduct.h"
-#include "src/Core/PermutationMatrix.h"
-#include "src/Core/Transpositions.h"
 #include "src/Core/Redux.h"
 #include "src/Core/Visitor.h"
 #include "src/Core/Fuzzy.h"
 #include "src/Core/IO.h"
 #include "src/Core/Swap.h"
 #include "src/Core/CommaInitializer.h"
-#include "src/Core/Flagged.h"
-#include "src/Core/ProductBase.h"
 #include "src/Core/GeneralProduct.h"
+#include "src/Core/Solve.h"
+#include "src/Core/Inverse.h"
+#include "src/Core/SolverBase.h"
+#include "src/Core/PermutationMatrix.h"
+#include "src/Core/Transpositions.h"
 #include "src/Core/TriangularMatrix.h"
 #include "src/Core/SelfAdjointView.h"
 #include "src/Core/products/GeneralBlockPanelKernel.h"
 #include "src/Core/products/Parallelizer.h"
-#include "src/Core/products/CoeffBasedProduct.h"
+#include "src/Core/ProductEvaluators.h"
 #include "src/Core/products/GeneralMatrixVector.h"
 #include "src/Core/products/GeneralMatrixMatrix.h"
 #include "src/Core/SolveTriangular.h"
@@ -347,7 +421,6 @@ using std::ptrdiff_t;
 #include "src/Core/Random.h"
 #include "src/Core/Replicate.h"
 #include "src/Core/Reverse.h"
-#include "src/Core/ArrayBase.h"
 #include "src/Core/ArrayWrapper.h"
 
 #ifdef EIGEN_USE_BLAS
@@ -369,8 +442,4 @@ using std::ptrdiff_t;
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
-#ifdef EIGEN2_SUPPORT
-#include "Eigen2Support"
-#endif
-
 #endif // EIGEN_CORE_H
diff --git a/nuparu/include/Eigen/Eigen b/nuparu/include/Eigen/Eigen
index 19b40ea4..654c8dc6 100644
--- a/nuparu/include/Eigen/Eigen
+++ b/nuparu/include/Eigen/Eigen
@@ -1,2 +1,2 @@
 #include "Dense"
-//#include "Sparse"
+#include "Sparse"
diff --git a/nuparu/include/Eigen/Eigen2Support b/nuparu/include/Eigen/Eigen2Support
deleted file mode 100644
index 36156d29..00000000
--- a/nuparu/include/Eigen/Eigen2Support
+++ /dev/null
@@ -1,82 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN2SUPPORT_H
-#define EIGEN2SUPPORT_H
-
-#if (!defined(EIGEN2_SUPPORT)) || (!defined(EIGEN_CORE_H))
-#error Eigen2 support must be enabled by defining EIGEN2_SUPPORT before including any Eigen header
-#endif
-
-#include "src/Core/util/DisableStupidWarnings.h"
-
-/** \ingroup Support_modules
-  * \defgroup Eigen2Support_Module Eigen2 support module
-  * This module provides a couple of deprecated functions improving the compatibility with Eigen2.
-  *
-  * To use it, define EIGEN2_SUPPORT before including any Eigen header
-  * \code
-  * #define EIGEN2_SUPPORT
-  * \endcode
-  *
-  */
-
-#include "src/Eigen2Support/Macros.h"
-#include "src/Eigen2Support/Memory.h"
-#include "src/Eigen2Support/Meta.h"
-#include "src/Eigen2Support/Lazy.h"
-#include "src/Eigen2Support/Cwise.h"
-#include "src/Eigen2Support/CwiseOperators.h"
-#include "src/Eigen2Support/TriangularSolver.h"
-#include "src/Eigen2Support/Block.h"
-#include "src/Eigen2Support/VectorBlock.h"
-#include "src/Eigen2Support/Minor.h"
-#include "src/Eigen2Support/MathFunctions.h"
-
-
-#include "src/Core/util/ReenableStupidWarnings.h"
-
-// Eigen2 used to include iostream
-#include<iostream>
-
-#define EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, SizeSuffix) \
-using Eigen::Matrix##SizeSuffix##TypeSuffix; \
-using Eigen::Vector##SizeSuffix##TypeSuffix; \
-using Eigen::RowVector##SizeSuffix##TypeSuffix;
-
-#define EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(TypeSuffix) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 2) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 3) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 4) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, X) \
-
-#define EIGEN_USING_MATRIX_TYPEDEFS \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(i) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(f) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(d) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(cf) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(cd)
-
-#define USING_PART_OF_NAMESPACE_EIGEN \
-EIGEN_USING_MATRIX_TYPEDEFS \
-using Eigen::Matrix; \
-using Eigen::MatrixBase; \
-using Eigen::ei_random; \
-using Eigen::ei_real; \
-using Eigen::ei_imag; \
-using Eigen::ei_conj; \
-using Eigen::ei_abs; \
-using Eigen::ei_abs2; \
-using Eigen::ei_sqrt; \
-using Eigen::ei_exp; \
-using Eigen::ei_log; \
-using Eigen::ei_sin; \
-using Eigen::ei_cos;
-
-#endif // EIGEN2SUPPORT_H
diff --git a/nuparu/include/Eigen/Eigenvalues b/nuparu/include/Eigen/Eigenvalues
index 53c5a73a..ea93eb30 100644
--- a/nuparu/include/Eigen/Eigenvalues
+++ b/nuparu/include/Eigen/Eigenvalues
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_EIGENVALUES_MODULE_H
 #define EIGEN_EIGENVALUES_MODULE_H
 
diff --git a/nuparu/include/Eigen/Geometry b/nuparu/include/Eigen/Geometry
index efd9d450..06b736e3 100644
--- a/nuparu/include/Eigen/Geometry
+++ b/nuparu/include/Eigen/Geometry
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_GEOMETRY_MODULE_H
 #define EIGEN_GEOMETRY_MODULE_H
 
@@ -9,10 +16,6 @@
 #include "LU"
 #include <limits>
 
-#ifndef M_PI
-#define M_PI 3.14159265358979323846
-#endif
-
 /** \defgroup Geometry_Module Geometry module
   *
   *
@@ -33,27 +36,23 @@
 #include "src/Geometry/OrthoMethods.h"
 #include "src/Geometry/EulerAngles.h"
 
-#if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
-  #include "src/Geometry/Homogeneous.h"
-  #include "src/Geometry/RotationBase.h"
-  #include "src/Geometry/Rotation2D.h"
-  #include "src/Geometry/Quaternion.h"
-  #include "src/Geometry/AngleAxis.h"
-  #include "src/Geometry/Transform.h"
-  #include "src/Geometry/Translation.h"
-  #include "src/Geometry/Scaling.h"
-  #include "src/Geometry/Hyperplane.h"
-  #include "src/Geometry/ParametrizedLine.h"
-  #include "src/Geometry/AlignedBox.h"
-  #include "src/Geometry/Umeyama.h"
-
-  #if defined EIGEN_VECTORIZE_SSE
-    #include "src/Geometry/arch/Geometry_SSE.h"
-  #endif
-#endif
-
-#ifdef EIGEN2_SUPPORT
-#include "src/Eigen2Support/Geometry/All.h"
+#include "src/Geometry/Homogeneous.h"
+#include "src/Geometry/RotationBase.h"
+#include "src/Geometry/Rotation2D.h"
+#include "src/Geometry/Quaternion.h"
+#include "src/Geometry/AngleAxis.h"
+#include "src/Geometry/Transform.h"
+#include "src/Geometry/Translation.h"
+#include "src/Geometry/Scaling.h"
+#include "src/Geometry/Hyperplane.h"
+#include "src/Geometry/ParametrizedLine.h"
+#include "src/Geometry/AlignedBox.h"
+#include "src/Geometry/Umeyama.h"
+
+// Use the SSE optimized version whenever possible. At the moment the
+// SSE version doesn't compile when AVX is enabled
+#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX
+#include "src/Geometry/arch/Geometry_SSE.h"
 #endif
 
 #include "src/Core/util/ReenableStupidWarnings.h"
diff --git a/nuparu/include/Eigen/Householder b/nuparu/include/Eigen/Householder
index 6e348db5..89cd81b1 100644
--- a/nuparu/include/Eigen/Householder
+++ b/nuparu/include/Eigen/Householder
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_HOUSEHOLDER_MODULE_H
 #define EIGEN_HOUSEHOLDER_MODULE_H
 
diff --git a/nuparu/include/Eigen/IterativeLinearSolvers b/nuparu/include/Eigen/IterativeLinearSolvers
index 0f4159dc..957d5750 100644
--- a/nuparu/include/Eigen/IterativeLinearSolvers
+++ b/nuparu/include/Eigen/IterativeLinearSolvers
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_ITERATIVELINEARSOLVERS_MODULE_H
 #define EIGEN_ITERATIVELINEARSOLVERS_MODULE_H
 
@@ -12,28 +19,29 @@
   * This module currently provides iterative methods to solve problems of the form \c A \c x = \c b, where \c A is a squared matrix, usually very large and sparse.
   * Those solvers are accessible via the following classes:
   *  - ConjugateGradient for selfadjoint (hermitian) matrices,
+  *  - LeastSquaresConjugateGradient for rectangular least-squares problems,
   *  - BiCGSTAB for general square matrices.
   *
   * These iterative solvers are associated with some preconditioners:
   *  - IdentityPreconditioner - not really useful
-  *  - DiagonalPreconditioner - also called JAcobi preconditioner, work very well on diagonal dominant matrices.
-  *  - IncompleteILUT - incomplete LU factorization with dual thresholding
+  *  - DiagonalPreconditioner - also called Jacobi preconditioner, works very well on diagonally dominant matrices.
+  *  - IncompleteLUT - incomplete LU factorization with dual thresholding
   *
   * Such problems can also be solved using the direct sparse decomposition modules: SparseCholesky, CholmodSupport, UmfPackSupport, SuperLUSupport.
   *
-  * \code
-  * #include <Eigen/IterativeLinearSolvers>
-  * \endcode
+    \code
+    #include <Eigen/IterativeLinearSolvers>
+    \endcode
   */
 
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
-
+#include "src/IterativeLinearSolvers/SolveWithGuess.h"
 #include "src/IterativeLinearSolvers/IterativeSolverBase.h"
 #include "src/IterativeLinearSolvers/BasicPreconditioners.h"
 #include "src/IterativeLinearSolvers/ConjugateGradient.h"
+#include "src/IterativeLinearSolvers/LeastSquareConjugateGradient.h"
 #include "src/IterativeLinearSolvers/BiCGSTAB.h"
 #include "src/IterativeLinearSolvers/IncompleteLUT.h"
+#include "src/IterativeLinearSolvers/IncompleteCholesky.h"
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/nuparu/include/Eigen/Jacobi b/nuparu/include/Eigen/Jacobi
index ba8a4dc3..17c1d785 100644
--- a/nuparu/include/Eigen/Jacobi
+++ b/nuparu/include/Eigen/Jacobi
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_JACOBI_MODULE_H
 #define EIGEN_JACOBI_MODULE_H
 
diff --git a/nuparu/include/Eigen/LU b/nuparu/include/Eigen/LU
index db579550..2d70c92d 100644
--- a/nuparu/include/Eigen/LU
+++ b/nuparu/include/Eigen/LU
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_LU_MODULE_H
 #define EIGEN_LU_MODULE_H
 
@@ -16,7 +23,6 @@
   * \endcode
   */
 
-#include "src/misc/Solve.h"
 #include "src/misc/Kernel.h"
 #include "src/misc/Image.h"
 #include "src/LU/FullPivLU.h"
@@ -25,16 +31,14 @@
 #include "src/LU/PartialPivLU_MKL.h"
 #endif
 #include "src/LU/Determinant.h"
-#include "src/LU/Inverse.h"
+#include "src/LU/InverseImpl.h"
 
-#if defined EIGEN_VECTORIZE_SSE
+// Use the SSE optimized version whenever possible. At the moment the
+// SSE version doesn't compile when AVX is enabled
+#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX
   #include "src/LU/arch/Inverse_SSE.h"
 #endif
 
-#ifdef EIGEN2_SUPPORT
-  #include "src/Eigen2Support/LU.h"
-#endif
-
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_LU_MODULE_H
diff --git a/nuparu/include/Eigen/LeastSquares b/nuparu/include/Eigen/LeastSquares
deleted file mode 100644
index 35137c25..00000000
--- a/nuparu/include/Eigen/LeastSquares
+++ /dev/null
@@ -1,32 +0,0 @@
-#ifndef EIGEN_REGRESSION_MODULE_H
-#define EIGEN_REGRESSION_MODULE_H
-
-#ifndef EIGEN2_SUPPORT
-#error LeastSquares is only available in Eigen2 support mode (define EIGEN2_SUPPORT)
-#endif
-
-// exclude from normal eigen3-only documentation
-#ifdef EIGEN2_SUPPORT
-
-#include "Core"
-
-#include "src/Core/util/DisableStupidWarnings.h"
-
-#include "Eigenvalues"
-#include "Geometry"
-
-/** \defgroup LeastSquares_Module LeastSquares module
-  * This module provides linear regression and related features.
-  *
-  * \code
-  * #include <Eigen/LeastSquares>
-  * \endcode
-  */
-
-#include "src/Eigen2Support/LeastSquares.h"
-
-#include "src/Core/util/ReenableStupidWarnings.h"
-
-#endif // EIGEN2_SUPPORT
-
-#endif // EIGEN_REGRESSION_MODULE_H
diff --git a/nuparu/include/Eigen/MetisSupport b/nuparu/include/Eigen/MetisSupport
index 6a113f7a..85c41bf3 100644
--- a/nuparu/include/Eigen/MetisSupport
+++ b/nuparu/include/Eigen/MetisSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_METISSUPPORT_MODULE_H
 #define EIGEN_METISSUPPORT_MODULE_H
 
diff --git a/nuparu/include/Eigen/OrderingMethods b/nuparu/include/Eigen/OrderingMethods
index 7c0f1fff..d8ea3619 100644
--- a/nuparu/include/Eigen/OrderingMethods
+++ b/nuparu/include/Eigen/OrderingMethods
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_ORDERINGMETHODS_MODULE_H
 #define EIGEN_ORDERINGMETHODS_MODULE_H
 
diff --git a/nuparu/include/Eigen/PaStiXSupport b/nuparu/include/Eigen/PaStiXSupport
index 7c616ee5..3411dfac 100644
--- a/nuparu/include/Eigen/PaStiXSupport
+++ b/nuparu/include/Eigen/PaStiXSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_PASTIXSUPPORT_MODULE_H
 #define EIGEN_PASTIXSUPPORT_MODULE_H
 
@@ -35,12 +42,8 @@ extern "C" {
   *
   */
 
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
-
 #include "src/PaStiXSupport/PaStiXSupport.h"
 
-
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_PASTIXSUPPORT_MODULE_H
diff --git a/nuparu/include/Eigen/PardisoSupport b/nuparu/include/Eigen/PardisoSupport
index 99330ce7..340edf51 100644
--- a/nuparu/include/Eigen/PardisoSupport
+++ b/nuparu/include/Eigen/PardisoSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_PARDISOSUPPORT_MODULE_H
 #define EIGEN_PARDISOSUPPORT_MODULE_H
 
@@ -7,8 +14,6 @@
 
 #include <mkl_pardiso.h>
 
-#include <unsupported/Eigen/SparseExtra>
-
 /** \ingroup Support_modules
   * \defgroup PardisoSupport_Module PardisoSupport module
   *
diff --git a/nuparu/include/Eigen/QR b/nuparu/include/Eigen/QR
index ac5b0269..f74f365f 100644
--- a/nuparu/include/Eigen/QR
+++ b/nuparu/include/Eigen/QR
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_QR_MODULE_H
 #define EIGEN_QR_MODULE_H
 
@@ -15,14 +22,15 @@
   *
   * This module provides various QR decompositions
   * This module also provides some MatrixBase methods, including:
-  *  - MatrixBase::qr(),
+  *  - MatrixBase::householderQr()
+  *  - MatrixBase::colPivHouseholderQr()
+  *  - MatrixBase::fullPivHouseholderQr()
   *
   * \code
   * #include <Eigen/QR>
   * \endcode
   */
 
-#include "src/misc/Solve.h"
 #include "src/QR/HouseholderQR.h"
 #include "src/QR/FullPivHouseholderQR.h"
 #include "src/QR/ColPivHouseholderQR.h"
@@ -31,15 +39,7 @@
 #include "src/QR/ColPivHouseholderQR_MKL.h"
 #endif
 
-#ifdef EIGEN2_SUPPORT
-#include "src/Eigen2Support/QR.h"
-#endif
-
 #include "src/Core/util/ReenableStupidWarnings.h"
 
-#ifdef EIGEN2_SUPPORT
-#include "Eigenvalues"
-#endif
-
 #endif // EIGEN_QR_MODULE_H
 /* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/nuparu/include/Eigen/QtAlignedMalloc b/nuparu/include/Eigen/QtAlignedMalloc
index 46f7d83b..4044d5ac 100644
--- a/nuparu/include/Eigen/QtAlignedMalloc
+++ b/nuparu/include/Eigen/QtAlignedMalloc
@@ -1,3 +1,9 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 #ifndef EIGEN_QTMALLOC_MODULE_H
 #define EIGEN_QTMALLOC_MODULE_H
diff --git a/nuparu/include/Eigen/SPQRSupport b/nuparu/include/Eigen/SPQRSupport
index 77016442..f9489dcd 100644
--- a/nuparu/include/Eigen/SPQRSupport
+++ b/nuparu/include/Eigen/SPQRSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SPQRSUPPORT_MODULE_H
 #define EIGEN_SPQRSUPPORT_MODULE_H
 
@@ -21,8 +28,6 @@
   *
   */
 
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
 #include "src/CholmodSupport/CholmodSupport.h"
 #include "src/SPQRSupport/SuiteSparseQRSupport.h"
 
diff --git a/nuparu/include/Eigen/SVD b/nuparu/include/Eigen/SVD
index fd310017..b353f3f5 100644
--- a/nuparu/include/Eigen/SVD
+++ b/nuparu/include/Eigen/SVD
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SVD_MODULE_H
 #define EIGEN_SVD_MODULE_H
 
@@ -12,24 +19,25 @@
   *
   *
   * This module provides SVD decomposition for matrices (both real and complex).
-  * This decomposition is accessible via the following MatrixBase method:
+  * Two decomposition algorithms are provided:
+  *  - JacobiSVD implementing two-sided Jacobi iterations is numerically very accurate, fast for small matrices, but very slow for larger ones.
+  *  - BDCSVD implementing a recursive divide & conquer strategy on top of an upper-bidiagonalization which remains fast for large problems.
+  * These decompositions are accessible via the respective classes and following MatrixBase methods:
   *  - MatrixBase::jacobiSvd()
+  *  - MatrixBase::bdcSvd()
   *
   * \code
   * #include <Eigen/SVD>
   * \endcode
   */
 
-#include "src/misc/Solve.h"
+#include "src/SVD/UpperBidiagonalization.h"
+#include "src/SVD/SVDBase.h"
 #include "src/SVD/JacobiSVD.h"
+#include "src/SVD/BDCSVD.h"
 #if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT)
 #include "src/SVD/JacobiSVD_MKL.h"
 #endif
-#include "src/SVD/UpperBidiagonalization.h"
-
-#ifdef EIGEN2_SUPPORT
-#include "src/Eigen2Support/SVD.h"
-#endif
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/nuparu/include/Eigen/Sparse b/nuparu/include/Eigen/Sparse
index 7cc9c091..a2ef7a66 100644
--- a/nuparu/include/Eigen/Sparse
+++ b/nuparu/include/Eigen/Sparse
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SPARSE_MODULE_H
 #define EIGEN_SPARSE_MODULE_H
 
@@ -11,9 +18,9 @@
   * - \ref SparseQR_Module
   * - \ref IterativeLinearSolvers_Module
   *
-  * \code
-  * #include <Eigen/Sparse>
-  * \endcode
+    \code
+    #include <Eigen/Sparse>
+    \endcode
   */
 
 #include "SparseCore"
diff --git a/nuparu/include/Eigen/SparseCholesky b/nuparu/include/Eigen/SparseCholesky
index 9f5056aa..b6a320c4 100644
--- a/nuparu/include/Eigen/SparseCholesky
+++ b/nuparu/include/Eigen/SparseCholesky
@@ -34,8 +34,6 @@
 #error The SparseCholesky module has nothing to offer in MPL2 only mode
 #endif
 
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
 #include "src/SparseCholesky/SimplicialCholesky.h"
 
 #ifndef EIGEN_MPL2_ONLY
diff --git a/nuparu/include/Eigen/SparseCore b/nuparu/include/Eigen/SparseCore
index 9b5be5e1..76966c4c 100644
--- a/nuparu/include/Eigen/SparseCore
+++ b/nuparu/include/Eigen/SparseCore
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SPARSECORE_MODULE_H
 #define EIGEN_SPARSECORE_MODULE_H
 
@@ -14,7 +21,7 @@
 /** 
   * \defgroup SparseCore_Module SparseCore module
   *
-  * This module provides a sparse matrix representation, and basic associatd matrix manipulations
+  * This module provides a sparse matrix representation, and basic associated matrix manipulations
   * and operations.
   *
   * See the \ref TutorialSparse "Sparse tutorial"
@@ -26,37 +33,35 @@
   * This module depends on: Core.
   */
 
-namespace Eigen {
-
-/** The type used to identify a general sparse storage. */
-struct Sparse {};
-
-}
-
 #include "src/SparseCore/SparseUtil.h"
 #include "src/SparseCore/SparseMatrixBase.h"
+#include "src/SparseCore/SparseAssign.h"
 #include "src/SparseCore/CompressedStorage.h"
 #include "src/SparseCore/AmbiVector.h"
+#include "src/SparseCore/SparseCompressedBase.h"
 #include "src/SparseCore/SparseMatrix.h"
+#include "src/SparseCore/SparseMap.h"
 #include "src/SparseCore/MappedSparseMatrix.h"
 #include "src/SparseCore/SparseVector.h"
-#include "src/SparseCore/SparseBlock.h"
-#include "src/SparseCore/SparseTranspose.h"
+#include "src/SparseCore/SparseRef.h"
 #include "src/SparseCore/SparseCwiseUnaryOp.h"
 #include "src/SparseCore/SparseCwiseBinaryOp.h"
+#include "src/SparseCore/SparseTranspose.h"
+#include "src/SparseCore/SparseBlock.h"
 #include "src/SparseCore/SparseDot.h"
-#include "src/SparseCore/SparsePermutation.h"
 #include "src/SparseCore/SparseRedux.h"
-#include "src/SparseCore/SparseFuzzy.h"
+#include "src/SparseCore/SparseView.h"
+#include "src/SparseCore/SparseDiagonalProduct.h"
 #include "src/SparseCore/ConservativeSparseSparseProduct.h"
 #include "src/SparseCore/SparseSparseProductWithPruning.h"
 #include "src/SparseCore/SparseProduct.h"
 #include "src/SparseCore/SparseDenseProduct.h"
-#include "src/SparseCore/SparseDiagonalProduct.h"
-#include "src/SparseCore/SparseTriangularView.h"
 #include "src/SparseCore/SparseSelfAdjointView.h"
+#include "src/SparseCore/SparseTriangularView.h"
 #include "src/SparseCore/TriangularSolver.h"
-#include "src/SparseCore/SparseView.h"
+#include "src/SparseCore/SparsePermutation.h"
+#include "src/SparseCore/SparseFuzzy.h"
+#include "src/SparseCore/SparseSolverBase.h"
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/nuparu/include/Eigen/SparseLU b/nuparu/include/Eigen/SparseLU
index 8527a49b..38b38b53 100644
--- a/nuparu/include/Eigen/SparseLU
+++ b/nuparu/include/Eigen/SparseLU
@@ -20,9 +20,6 @@
   * Please, see the documentation of the SparseLU class for more details.
   */
 
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
-
 // Ordering interface
 #include "OrderingMethods"
 
diff --git a/nuparu/include/Eigen/SparseQR b/nuparu/include/Eigen/SparseQR
index 4ee42065..a6f3b7f7 100644
--- a/nuparu/include/Eigen/SparseQR
+++ b/nuparu/include/Eigen/SparseQR
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SPARSEQR_MODULE_H
 #define EIGEN_SPARSEQR_MODULE_H
 
@@ -21,9 +28,6 @@
   * 
   */
 
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
-
 #include "OrderingMethods"
 #include "src/SparseCore/SparseColEtree.h"
 #include "src/SparseQR/SparseQR.h"
diff --git a/nuparu/include/Eigen/StdDeque b/nuparu/include/Eigen/StdDeque
index f2723477..be3a7f82 100644
--- a/nuparu/include/Eigen/StdDeque
+++ b/nuparu/include/Eigen/StdDeque
@@ -14,7 +14,7 @@
 #include "Core"
 #include <deque>
 
-#if (defined(_MSC_VER) && defined(_WIN64)) /* MSVC auto aligns in 64 bit builds */
+#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */
 
 #define EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(...)
 
diff --git a/nuparu/include/Eigen/StdList b/nuparu/include/Eigen/StdList
index 225c1e18..07ba1297 100644
--- a/nuparu/include/Eigen/StdList
+++ b/nuparu/include/Eigen/StdList
@@ -13,7 +13,7 @@
 #include "Core"
 #include <list>
 
-#if (defined(_MSC_VER) && defined(_WIN64)) /* MSVC auto aligns in 64 bit builds */    
+#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */    
 
 #define EIGEN_DEFINE_STL_LIST_SPECIALIZATION(...)
 
diff --git a/nuparu/include/Eigen/StdVector b/nuparu/include/Eigen/StdVector
index 6b22627f..fdfc3776 100644
--- a/nuparu/include/Eigen/StdVector
+++ b/nuparu/include/Eigen/StdVector
@@ -14,7 +14,7 @@
 #include "Core"
 #include <vector>
 
-#if (defined(_MSC_VER) && defined(_WIN64)) /* MSVC auto aligns in 64 bit builds */
+#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */
 
 #define EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(...)
 
diff --git a/nuparu/include/Eigen/SuperLUSupport b/nuparu/include/Eigen/SuperLUSupport
index 575e14fb..113f58ee 100644
--- a/nuparu/include/Eigen/SuperLUSupport
+++ b/nuparu/include/Eigen/SuperLUSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SUPERLUSUPPORT_MODULE_H
 #define EIGEN_SUPERLUSUPPORT_MODULE_H
 
@@ -36,6 +43,8 @@ namespace Eigen { struct SluMatrix; }
   * - class SuperLU: a supernodal sequential LU factorization.
   * - class SuperILU: a supernodal sequential incomplete LU factorization (to be used as a preconditioner for iterative methods).
   *
+  * \warning This wrapper is only for the 4.x versions of SuperLU. The 3.x and 5.x versions are not supported.
+  *
   * \warning When including this module, you have to use SUPERLU_EMPTY instead of EMPTY which is no longer defined because it is too polluting.
   *
   * \code
@@ -48,12 +57,8 @@ namespace Eigen { struct SluMatrix; }
   *
   */
 
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
-
 #include "src/SuperLUSupport/SuperLUSupport.h"
 
-
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_SUPERLUSUPPORT_MODULE_H
diff --git a/nuparu/include/Eigen/UmfPackSupport b/nuparu/include/Eigen/UmfPackSupport
index 984f64a8..4a9f46a1 100644
--- a/nuparu/include/Eigen/UmfPackSupport
+++ b/nuparu/include/Eigen/UmfPackSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_UMFPACKSUPPORT_MODULE_H
 #define EIGEN_UMFPACKSUPPORT_MODULE_H
 
@@ -26,9 +33,6 @@ extern "C" {
   *
   */
 
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
-
 #include "src/UmfPackSupport/UmfPackSupport.h"
 
 #include "src/Core/util/ReenableStupidWarnings.h"
diff --git a/nuparu/include/Eigen/src/Cholesky/LDLT.h b/nuparu/include/Eigen/src/Cholesky/LDLT.h
index d19cb396..6fcae01f 100644
--- a/nuparu/include/Eigen/src/Cholesky/LDLT.h
+++ b/nuparu/include/Eigen/src/Cholesky/LDLT.h
@@ -16,7 +16,10 @@
 namespace Eigen { 
 
 namespace internal {
-template<typename MatrixType, int UpLo> struct LDLT_Traits;
+  template<typename MatrixType, int UpLo> struct LDLT_Traits;
+
+  // PositiveSemiDef means positive semi-definite and non-zero; same for NegativeSemiDef
+  enum SignMatrix { PositiveSemiDef, NegativeSemiDef, ZeroSign, Indefinite };
 }
 
 /** \ingroup Cholesky_Module
@@ -40,7 +43,7 @@ template<typename MatrixType, int UpLo> struct LDLT_Traits;
   * Remember that Cholesky decompositions are not rank-revealing. Also, do not use a Cholesky
   * decomposition to determine whether a system of equations has a solution.
   *
-  * \sa MatrixBase::ldlt(), class LLT
+  * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT
   */
 template<typename _MatrixType, int _UpLo> class LDLT
 {
@@ -56,7 +59,8 @@ template<typename _MatrixType, int _UpLo> class LDLT
     };
     typedef typename MatrixType::Scalar Scalar;
     typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
+    typedef typename MatrixType::StorageIndex StorageIndex;
     typedef Matrix<Scalar, RowsAtCompileTime, 1, Options, MaxRowsAtCompileTime, 1> TmpMatrixType;
 
     typedef Transpositions<RowsAtCompileTime, MaxRowsAtCompileTime> TranspositionType;
@@ -69,7 +73,12 @@ template<typename _MatrixType, int _UpLo> class LDLT
       * The default constructor is useful in cases in which the user intends to
       * perform decompositions via LDLT::compute(const MatrixType&).
       */
-    LDLT() : m_matrix(), m_transpositions(), m_isInitialized(false) {}
+    LDLT() 
+      : m_matrix(), 
+        m_transpositions(), 
+        m_sign(internal::ZeroSign),
+        m_isInitialized(false) 
+    {}
 
     /** \brief Default Constructor with memory preallocation
       *
@@ -77,10 +86,11 @@ template<typename _MatrixType, int _UpLo> class LDLT
       * according to the specified problem \a size.
       * \sa LDLT()
       */
-    LDLT(Index size)
+    explicit LDLT(Index size)
       : m_matrix(size, size),
         m_transpositions(size),
         m_temporary(size),
+        m_sign(internal::ZeroSign),
         m_isInitialized(false)
     {}
 
@@ -89,13 +99,15 @@ template<typename _MatrixType, int _UpLo> class LDLT
       * This calculates the decomposition for the input \a matrix.
       * \sa LDLT(Index size)
       */
-    LDLT(const MatrixType& matrix)
+    template<typename InputType>
+    explicit LDLT(const EigenBase<InputType>& matrix)
       : m_matrix(matrix.rows(), matrix.cols()),
         m_transpositions(matrix.rows()),
         m_temporary(matrix.rows()),
+        m_sign(internal::ZeroSign),
         m_isInitialized(false)
     {
-      compute(matrix);
+      compute(matrix.derived());
     }
 
     /** Clear any existing decomposition
@@ -139,21 +151,14 @@ template<typename _MatrixType, int _UpLo> class LDLT
     inline bool isPositive() const
     {
       eigen_assert(m_isInitialized && "LDLT is not initialized.");
-      return m_sign == 1;
-    }
-    
-    #ifdef EIGEN2_SUPPORT
-    inline bool isPositiveDefinite() const
-    {
-      return isPositive();
+      return m_sign == internal::PositiveSemiDef || m_sign == internal::ZeroSign;
     }
-    #endif
 
     /** \returns true if the matrix is negative (semidefinite) */
     inline bool isNegative(void) const
     {
       eigen_assert(m_isInitialized && "LDLT is not initialized.");
-      return m_sign == -1;
+      return m_sign == internal::NegativeSemiDef || m_sign == internal::ZeroSign;
     }
 
     /** \returns a solution x of \f$ A x = b \f$ using the current decomposition of A.
@@ -169,31 +174,23 @@ template<typename _MatrixType, int _UpLo> class LDLT
       * least-square solution of \f$ D y_3 = y_2 \f$ is computed. This does not mean that this function
       * computes the least-square solution of \f$ A x = b \f$ is \f$ A \f$ is singular.
       *
-      * \sa MatrixBase::ldlt()
+      * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt()
       */
     template<typename Rhs>
-    inline const internal::solve_retval<LDLT, Rhs>
+    inline const Solve<LDLT, Rhs>
     solve(const MatrixBase<Rhs>& b) const
     {
       eigen_assert(m_isInitialized && "LDLT is not initialized.");
       eigen_assert(m_matrix.rows()==b.rows()
                 && "LDLT::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<LDLT, Rhs>(*this, b.derived());
+      return Solve<LDLT, Rhs>(*this, b.derived());
     }
 
-    #ifdef EIGEN2_SUPPORT
-    template<typename OtherDerived, typename ResultType>
-    bool solve(const MatrixBase<OtherDerived>& b, ResultType *result) const
-    {
-      *result = this->solve(b);
-      return true;
-    }
-    #endif
-
     template<typename Derived>
     bool solveInPlace(MatrixBase<Derived> &bAndX) const;
 
-    LDLT& compute(const MatrixType& matrix);
+    template<typename InputType>
+    LDLT& compute(const EigenBase<InputType>& matrix);
 
     template <typename Derived>
     LDLT& rankUpdate(const MatrixBase<Derived>& w, const RealScalar& alpha=1);
@@ -223,8 +220,19 @@ template<typename _MatrixType, int _UpLo> class LDLT
       eigen_assert(m_isInitialized && "LDLT is not initialized.");
       return Success;
     }
+    
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
+    void _solve_impl(const RhsType &rhs, DstType &dst) const;
+    #endif
 
   protected:
+    
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    }
 
     /** \internal
       * Used to compute and store the Cholesky decomposition A = L D L^* = U^* D U.
@@ -235,7 +243,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
     MatrixType m_matrix;
     TranspositionType m_transpositions;
     TmpMatrixType m_temporary;
-    int m_sign;
+    internal::SignMatrix m_sign;
     bool m_isInitialized;
 };
 
@@ -246,49 +254,32 @@ template<int UpLo> struct ldlt_inplace;
 template<> struct ldlt_inplace<Lower>
 {
   template<typename MatrixType, typename TranspositionType, typename Workspace>
-  static bool unblocked(MatrixType& mat, TranspositionType& transpositions, Workspace& temp, int* sign=0)
+  static bool unblocked(MatrixType& mat, TranspositionType& transpositions, Workspace& temp, SignMatrix& sign)
   {
     using std::abs;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef typename TranspositionType::StorageIndex IndexType;
     eigen_assert(mat.rows()==mat.cols());
     const Index size = mat.rows();
 
     if (size <= 1)
     {
       transpositions.setIdentity();
-      if(sign)
-        *sign = numext::real(mat.coeff(0,0))>0 ? 1:-1;
+      if (numext::real(mat.coeff(0,0)) > 0) sign = PositiveSemiDef;
+      else if (numext::real(mat.coeff(0,0)) < 0) sign = NegativeSemiDef;
+      else sign = ZeroSign;
       return true;
     }
 
-    RealScalar cutoff(0), biggest_in_corner;
-
     for (Index k = 0; k < size; ++k)
     {
       // Find largest diagonal element
       Index index_of_biggest_in_corner;
-      biggest_in_corner = mat.diagonal().tail(size-k).cwiseAbs().maxCoeff(&index_of_biggest_in_corner);
+      mat.diagonal().tail(size-k).cwiseAbs().maxCoeff(&index_of_biggest_in_corner);
       index_of_biggest_in_corner += k;
 
-      if(k == 0)
-      {
-        // The biggest overall is the point of reference to which further diagonals
-        // are compared; if any diagonal is negligible compared
-        // to the largest overall, the algorithm bails.
-        cutoff = abs(NumTraits<Scalar>::epsilon() * biggest_in_corner);
-      }
-
-      // Finish early if the matrix is not full rank.
-      if(biggest_in_corner < cutoff)
-      {
-        for(Index i = k; i < size; i++) transpositions.coeffRef(i) = i;
-        if(sign) *sign = 0;
-        break;
-      }
-
-      transpositions.coeffRef(k) = index_of_biggest_in_corner;
+      transpositions.coeffRef(k) = IndexType(index_of_biggest_in_corner);
       if(k != index_of_biggest_in_corner)
       {
         // apply the transposition while taking care to consider only
@@ -297,7 +288,7 @@ template<> struct ldlt_inplace<Lower>
         mat.row(k).head(k).swap(mat.row(index_of_biggest_in_corner).head(k));
         mat.col(k).tail(s).swap(mat.col(index_of_biggest_in_corner).tail(s));
         std::swap(mat.coeffRef(k,k),mat.coeffRef(index_of_biggest_in_corner,index_of_biggest_in_corner));
-        for(int i=k+1;i<index_of_biggest_in_corner;++i)
+        for(Index i=k+1;i<index_of_biggest_in_corner;++i)
         {
           Scalar tmp = mat.coeffRef(i,k);
           mat.coeffRef(i,k) = numext::conj(mat.coeffRef(index_of_biggest_in_corner,i));
@@ -318,22 +309,27 @@ template<> struct ldlt_inplace<Lower>
 
       if(k>0)
       {
-        temp.head(k) = mat.diagonal().head(k).asDiagonal() * A10.adjoint();
+        temp.head(k) = mat.diagonal().real().head(k).asDiagonal() * A10.adjoint();
         mat.coeffRef(k,k) -= (A10 * temp.head(k)).value();
         if(rs>0)
           A21.noalias() -= A20 * temp.head(k);
       }
-      if((rs>0) && (abs(mat.coeffRef(k,k)) > cutoff))
-        A21 /= mat.coeffRef(k,k);
       
-      if(sign)
-      {
-        // LDLT is not guaranteed to work for indefinite matrices, but let's try to get the sign right
-        int newSign = numext::real(mat.diagonal().coeff(index_of_biggest_in_corner)) > 0;
-        if(k == 0)
-          *sign = newSign;
-        else if(*sign != newSign)
-          *sign = 0;
+      // In some previous versions of Eigen (e.g., 3.2.1), the scaling was omitted if the pivot
+      // was smaller than the cutoff value. However, since LDLT is not rank-revealing
+      // we should only make sure that we do not introduce INF or NaN values.
+      // Note that LAPACK also uses 0 as the cutoff value.
+      RealScalar realAkk = numext::real(mat.coeffRef(k,k));
+      if((rs>0) && (abs(realAkk) > RealScalar(0)))
+        A21 /= realAkk;
+
+      if (sign == PositiveSemiDef) {
+        if (realAkk < 0) sign = Indefinite;
+      } else if (sign == NegativeSemiDef) {
+        if (realAkk > 0) sign = Indefinite;
+      } else if (sign == ZeroSign) {
+        if (realAkk > 0) sign = PositiveSemiDef;
+        else if (realAkk < 0) sign = NegativeSemiDef;
       }
     }
 
@@ -353,7 +349,6 @@ template<> struct ldlt_inplace<Lower>
     using numext::isfinite;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
 
     const Index size = mat.rows();
     eigen_assert(mat.cols() == size && w.size()==size);
@@ -399,7 +394,7 @@ template<> struct ldlt_inplace<Lower>
 template<> struct ldlt_inplace<Upper>
 {
   template<typename MatrixType, typename TranspositionType, typename Workspace>
-  static EIGEN_STRONG_INLINE bool unblocked(MatrixType& mat, TranspositionType& transpositions, Workspace& temp, int* sign=0)
+  static EIGEN_STRONG_INLINE bool unblocked(MatrixType& mat, TranspositionType& transpositions, Workspace& temp, SignMatrix& sign)
   {
     Transpose<MatrixType> matt(mat);
     return ldlt_inplace<Lower>::unblocked(matt, transpositions, temp, sign);
@@ -417,16 +412,16 @@ template<typename MatrixType> struct LDLT_Traits<MatrixType,Lower>
 {
   typedef const TriangularView<const MatrixType, UnitLower> MatrixL;
   typedef const TriangularView<const typename MatrixType::AdjointReturnType, UnitUpper> MatrixU;
-  static inline MatrixL getL(const MatrixType& m) { return m; }
-  static inline MatrixU getU(const MatrixType& m) { return m.adjoint(); }
+  static inline MatrixL getL(const MatrixType& m) { return MatrixL(m); }
+  static inline MatrixU getU(const MatrixType& m) { return MatrixU(m.adjoint()); }
 };
 
 template<typename MatrixType> struct LDLT_Traits<MatrixType,Upper>
 {
   typedef const TriangularView<const typename MatrixType::AdjointReturnType, UnitLower> MatrixL;
   typedef const TriangularView<const MatrixType, UnitUpper> MatrixU;
-  static inline MatrixL getL(const MatrixType& m) { return m.adjoint(); }
-  static inline MatrixU getU(const MatrixType& m) { return m; }
+  static inline MatrixL getL(const MatrixType& m) { return MatrixL(m.adjoint()); }
+  static inline MatrixU getU(const MatrixType& m) { return MatrixU(m); }
 };
 
 } // end namespace internal
@@ -434,18 +429,22 @@ template<typename MatrixType> struct LDLT_Traits<MatrixType,Upper>
 /** Compute / recompute the LDLT decomposition A = L D L^* = U^* D U of \a matrix
   */
 template<typename MatrixType, int _UpLo>
-LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::compute(const MatrixType& a)
+template<typename InputType>
+LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>& a)
 {
+  check_template_parameters();
+  
   eigen_assert(a.rows()==a.cols());
   const Index size = a.rows();
 
-  m_matrix = a;
+  m_matrix = a.derived();
 
   m_transpositions.resize(size);
   m_isInitialized = false;
   m_temporary.resize(size);
+  m_sign = internal::ZeroSign;
 
-  internal::ldlt_inplace<UpLo>::unblocked(m_matrix, m_transpositions, m_temporary, &m_sign);
+  internal::ldlt_inplace<UpLo>::unblocked(m_matrix, m_transpositions, m_temporary, m_sign);
 
   m_isInitialized = true;
   return *this;
@@ -458,8 +457,9 @@ LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::compute(const MatrixType& a)
   */
 template<typename MatrixType, int _UpLo>
 template<typename Derived>
-LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Derived>& w, const typename NumTraits<typename MatrixType::Scalar>::Real& sigma)
+LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Derived>& w, const typename LDLT<MatrixType,_UpLo>::RealScalar& sigma)
 {
+  typedef typename TranspositionType::StorageIndex IndexType;
   const Index size = w.rows();
   if (m_isInitialized)
   {
@@ -471,9 +471,9 @@ LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Deri
     m_matrix.setZero();
     m_transpositions.resize(size);
     for (Index i = 0; i < size; i++)
-      m_transpositions.coeffRef(i) = i;
+      m_transpositions.coeffRef(i) = IndexType(i);
     m_temporary.resize(size);
-    m_sign = sigma>=0 ? 1 : -1;
+    m_sign = sigma>=0 ? internal::PositiveSemiDef : internal::NegativeSemiDef;
     m_isInitialized = true;
   }
 
@@ -482,48 +482,45 @@ LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Deri
   return *this;
 }
 
-namespace internal {
-template<typename _MatrixType, int _UpLo, typename Rhs>
-struct solve_retval<LDLT<_MatrixType,_UpLo>, Rhs>
-  : solve_retval_base<LDLT<_MatrixType,_UpLo>, Rhs>
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename _MatrixType, int _UpLo>
+template<typename RhsType, typename DstType>
+void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
-  typedef LDLT<_MatrixType,_UpLo> LDLTType;
-  EIGEN_MAKE_SOLVE_HELPERS(LDLTType,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
+  eigen_assert(rhs.rows() == rows());
+  // dst = P b
+  dst = m_transpositions * rhs;
+
+  // dst = L^-1 (P b)
+  matrixL().solveInPlace(dst);
+
+  // dst = D^-1 (L^-1 P b)
+  // more precisely, use pseudo-inverse of D (see bug 241)
+  using std::abs;
+  const typename Diagonal<const MatrixType>::RealReturnType vecD(vectorD());
+  // In some previous versions, tolerance was set to the max of 1/highest and the maximal diagonal entry * epsilon
+  // as motivated by LAPACK's xGELSS:
+  // RealScalar tolerance = numext::maxi(vecD.array().abs().maxCoeff() * NumTraits<RealScalar>::epsilon(),RealScalar(1) / NumTraits<RealScalar>::highest());
+  // However, LDLT is not rank revealing, and so adjusting the tolerance with respect to the highest
+  // diagonal element is not well justified and leads to numerical issues in some cases.
+  // Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
+  RealScalar tolerance = RealScalar(1) / NumTraits<RealScalar>::highest();
+  
+  for (Index i = 0; i < vecD.size(); ++i)
   {
-    eigen_assert(rhs().rows() == dec().matrixLDLT().rows());
-    // dst = P b
-    dst = dec().transpositionsP() * rhs();
-
-    // dst = L^-1 (P b)
-    dec().matrixL().solveInPlace(dst);
-
-    // dst = D^-1 (L^-1 P b)
-    // more precisely, use pseudo-inverse of D (see bug 241)
-    using std::abs;
-    using std::max;
-    typedef typename LDLTType::MatrixType MatrixType;
-    typedef typename LDLTType::Scalar Scalar;
-    typedef typename LDLTType::RealScalar RealScalar;
-    const Diagonal<const MatrixType> vectorD = dec().vectorD();
-    RealScalar tolerance = (max)(vectorD.array().abs().maxCoeff() * NumTraits<Scalar>::epsilon(),
-				 RealScalar(1) / NumTraits<RealScalar>::highest()); // motivated by LAPACK's xGELSS
-    for (Index i = 0; i < vectorD.size(); ++i) {
-      if(abs(vectorD(i)) > tolerance)
-	dst.row(i) /= vectorD(i);
-      else
-	dst.row(i).setZero();
-    }
+    if(abs(vecD(i)) > tolerance)
+      dst.row(i) /= vecD(i);
+    else
+      dst.row(i).setZero();
+  }
 
-    // dst = L^-T (D^-1 L^-1 P b)
-    dec().matrixU().solveInPlace(dst);
+  // dst = L^-T (D^-1 L^-1 P b)
+  matrixU().solveInPlace(dst);
 
-    // dst = P^-1 (L^-T D^-1 L^-1 P b) = A^-1 b
-    dst = dec().transpositionsP().transpose() * dst;
-  }
-};
+  // dst = P^-1 (L^-T D^-1 L^-1 P b) = A^-1 b
+  dst = m_transpositions.transpose() * dst;
 }
+#endif
 
 /** \internal use x = ldlt_object.solve(x);
   *
@@ -566,7 +563,7 @@ MatrixType LDLT<MatrixType,_UpLo>::reconstructedMatrix() const
   // L^* P
   res = matrixU() * res;
   // D(L^*P)
-  res = vectorD().asDiagonal() * res;
+  res = vectorD().real().asDiagonal() * res;
   // L(DL^*P)
   res = matrixL() * res;
   // P^T (LDL^*P)
@@ -575,8 +572,10 @@ MatrixType LDLT<MatrixType,_UpLo>::reconstructedMatrix() const
   return res;
 }
 
+#ifndef __CUDACC__
 /** \cholesky_module
   * \returns the Cholesky decomposition with full pivoting without square root of \c *this
+  * \sa MatrixBase::ldlt()
   */
 template<typename MatrixType, unsigned int UpLo>
 inline const LDLT<typename SelfAdjointView<MatrixType, UpLo>::PlainObject, UpLo>
@@ -587,6 +586,7 @@ SelfAdjointView<MatrixType, UpLo>::ldlt() const
 
 /** \cholesky_module
   * \returns the Cholesky decomposition with full pivoting without square root of \c *this
+  * \sa SelfAdjointView::ldlt()
   */
 template<typename Derived>
 inline const LDLT<typename MatrixBase<Derived>::PlainObject>
@@ -594,6 +594,7 @@ MatrixBase<Derived>::ldlt() const
 {
   return LDLT<PlainObject>(derived());
 }
+#endif // __CUDACC__
 
 } // end namespace Eigen
 
diff --git a/nuparu/include/Eigen/src/Cholesky/LLT.h b/nuparu/include/Eigen/src/Cholesky/LLT.h
index 2e6189f7..1f0091f3 100644
--- a/nuparu/include/Eigen/src/Cholesky/LLT.h
+++ b/nuparu/include/Eigen/src/Cholesky/LLT.h
@@ -41,7 +41,7 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
   * Example: \include LLT_example.cpp
   * Output: \verbinclude LLT_example.out
   *    
-  * \sa MatrixBase::llt(), class LDLT
+  * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
   */
  /* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH)
   * Note that during the decomposition, only the upper triangular part of A is considered. Therefore,
@@ -59,7 +59,8 @@ template<typename _MatrixType, int _UpLo> class LLT
     };
     typedef typename MatrixType::Scalar Scalar;
     typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
+    typedef typename MatrixType::StorageIndex StorageIndex;
 
     enum {
       PacketSize = internal::packet_traits<Scalar>::size,
@@ -83,14 +84,15 @@ template<typename _MatrixType, int _UpLo> class LLT
       * according to the specified problem \a size.
       * \sa LLT()
       */
-    LLT(Index size) : m_matrix(size, size),
+    explicit LLT(Index size) : m_matrix(size, size),
                     m_isInitialized(false) {}
 
-    LLT(const MatrixType& matrix)
+    template<typename InputType>
+    explicit LLT(const EigenBase<InputType>& matrix)
       : m_matrix(matrix.rows(), matrix.cols()),
         m_isInitialized(false)
     {
-      compute(matrix);
+      compute(matrix.derived());
     }
 
     /** \returns a view of the upper triangular matrix U */
@@ -115,33 +117,23 @@ template<typename _MatrixType, int _UpLo> class LLT
       * Example: \include LLT_solve.cpp
       * Output: \verbinclude LLT_solve.out
       *
-      * \sa solveInPlace(), MatrixBase::llt()
+      * \sa solveInPlace(), MatrixBase::llt(), SelfAdjointView::llt()
       */
     template<typename Rhs>
-    inline const internal::solve_retval<LLT, Rhs>
+    inline const Solve<LLT, Rhs>
     solve(const MatrixBase<Rhs>& b) const
     {
       eigen_assert(m_isInitialized && "LLT is not initialized.");
       eigen_assert(m_matrix.rows()==b.rows()
                 && "LLT::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<LLT, Rhs>(*this, b.derived());
+      return Solve<LLT, Rhs>(*this, b.derived());
     }
 
-    #ifdef EIGEN2_SUPPORT
-    template<typename OtherDerived, typename ResultType>
-    bool solve(const MatrixBase<OtherDerived>& b, ResultType *result) const
-    {
-      *result = this->solve(b);
-      return true;
-    }
-    
-    bool isPositiveDefinite() const { return true; }
-    #endif
-
     template<typename Derived>
     void solveInPlace(MatrixBase<Derived> &bAndX) const;
 
-    LLT& compute(const MatrixType& matrix);
+    template<typename InputType>
+    LLT& compute(const EigenBase<InputType>& matrix);
 
     /** \returns the LLT decomposition matrix
       *
@@ -172,8 +164,20 @@ template<typename _MatrixType, int _UpLo> class LLT
 
     template<typename VectorType>
     LLT rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
+    
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
+    void _solve_impl(const RhsType &rhs, DstType &dst) const;
+    #endif
 
   protected:
+    
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    }
+    
     /** \internal
       * Used to compute and store L
       * The strict upper part is not used and even not initialized.
@@ -188,12 +192,11 @@ namespace internal {
 template<typename Scalar, int UpLo> struct llt_inplace;
 
 template<typename MatrixType, typename VectorType>
-static typename MatrixType::Index llt_rank_update_lower(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma)
+static Index llt_rank_update_lower(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma)
 {
   using std::sqrt;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::ColXpr ColXpr;
   typedef typename internal::remove_all<ColXpr>::type ColXprCleaned;
   typedef typename ColXprCleaned::SegmentReturnType ColXprSegment;
@@ -262,10 +265,9 @@ template<typename Scalar> struct llt_inplace<Scalar, Lower>
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
   template<typename MatrixType>
-  static typename MatrixType::Index unblocked(MatrixType& mat)
+  static Index unblocked(MatrixType& mat)
   {
     using std::sqrt;
-    typedef typename MatrixType::Index Index;
     
     eigen_assert(mat.rows()==mat.cols());
     const Index size = mat.rows();
@@ -283,15 +285,14 @@ template<typename Scalar> struct llt_inplace<Scalar, Lower>
         return k;
       mat.coeffRef(k,k) = x = sqrt(x);
       if (k>0 && rs>0) A21.noalias() -= A20 * A10.adjoint();
-      if (rs>0) A21 *= RealScalar(1)/x;
+      if (rs>0) A21 /= x;
     }
     return -1;
   }
 
   template<typename MatrixType>
-  static typename MatrixType::Index blocked(MatrixType& m)
+  static Index blocked(MatrixType& m)
   {
-    typedef typename MatrixType::Index Index;
     eigen_assert(m.rows()==m.cols());
     Index size = m.rows();
     if(size<32)
@@ -322,7 +323,7 @@ template<typename Scalar> struct llt_inplace<Scalar, Lower>
   }
 
   template<typename MatrixType, typename VectorType>
-  static typename MatrixType::Index rankUpdate(MatrixType& mat, const VectorType& vec, const RealScalar& sigma)
+  static Index rankUpdate(MatrixType& mat, const VectorType& vec, const RealScalar& sigma)
   {
     return Eigen::internal::llt_rank_update_lower(mat, vec, sigma);
   }
@@ -333,19 +334,19 @@ template<typename Scalar> struct llt_inplace<Scalar, Upper>
   typedef typename NumTraits<Scalar>::Real RealScalar;
 
   template<typename MatrixType>
-  static EIGEN_STRONG_INLINE typename MatrixType::Index unblocked(MatrixType& mat)
+  static EIGEN_STRONG_INLINE Index unblocked(MatrixType& mat)
   {
     Transpose<MatrixType> matt(mat);
     return llt_inplace<Scalar, Lower>::unblocked(matt);
   }
   template<typename MatrixType>
-  static EIGEN_STRONG_INLINE typename MatrixType::Index blocked(MatrixType& mat)
+  static EIGEN_STRONG_INLINE Index blocked(MatrixType& mat)
   {
     Transpose<MatrixType> matt(mat);
     return llt_inplace<Scalar, Lower>::blocked(matt);
   }
   template<typename MatrixType, typename VectorType>
-  static typename MatrixType::Index rankUpdate(MatrixType& mat, const VectorType& vec, const RealScalar& sigma)
+  static Index rankUpdate(MatrixType& mat, const VectorType& vec, const RealScalar& sigma)
   {
     Transpose<MatrixType> matt(mat);
     return llt_inplace<Scalar, Lower>::rankUpdate(matt, vec.conjugate(), sigma);
@@ -356,8 +357,8 @@ template<typename MatrixType> struct LLT_Traits<MatrixType,Lower>
 {
   typedef const TriangularView<const MatrixType, Lower> MatrixL;
   typedef const TriangularView<const typename MatrixType::AdjointReturnType, Upper> MatrixU;
-  static inline MatrixL getL(const MatrixType& m) { return m; }
-  static inline MatrixU getU(const MatrixType& m) { return m.adjoint(); }
+  static inline MatrixL getL(const MatrixType& m) { return MatrixL(m); }
+  static inline MatrixU getU(const MatrixType& m) { return MatrixU(m.adjoint()); }
   static bool inplace_decomposition(MatrixType& m)
   { return llt_inplace<typename MatrixType::Scalar, Lower>::blocked(m)==-1; }
 };
@@ -366,8 +367,8 @@ template<typename MatrixType> struct LLT_Traits<MatrixType,Upper>
 {
   typedef const TriangularView<const typename MatrixType::AdjointReturnType, Lower> MatrixL;
   typedef const TriangularView<const MatrixType, Upper> MatrixU;
-  static inline MatrixL getL(const MatrixType& m) { return m.adjoint(); }
-  static inline MatrixU getU(const MatrixType& m) { return m; }
+  static inline MatrixL getL(const MatrixType& m) { return MatrixL(m.adjoint()); }
+  static inline MatrixU getU(const MatrixType& m) { return MatrixU(m); }
   static bool inplace_decomposition(MatrixType& m)
   { return llt_inplace<typename MatrixType::Scalar, Upper>::blocked(m)==-1; }
 };
@@ -382,12 +383,15 @@ template<typename MatrixType> struct LLT_Traits<MatrixType,Upper>
   * Output: \verbinclude TutorialLinAlgComputeTwice.out
   */
 template<typename MatrixType, int _UpLo>
-LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const MatrixType& a)
+template<typename InputType>
+LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>& a)
 {
+  check_template_parameters();
+  
   eigen_assert(a.rows()==a.cols());
   const Index size = a.rows();
   m_matrix.resize(size, size);
-  m_matrix = a;
+  m_matrix = a.derived();
 
   m_isInitialized = true;
   bool ok = Traits::inplace_decomposition(m_matrix);
@@ -415,22 +419,16 @@ LLT<_MatrixType,_UpLo> LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, c
 
   return *this;
 }
-    
-namespace internal {
-template<typename _MatrixType, int UpLo, typename Rhs>
-struct solve_retval<LLT<_MatrixType, UpLo>, Rhs>
-  : solve_retval_base<LLT<_MatrixType, UpLo>, Rhs>
+ 
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename _MatrixType,int _UpLo>
+template<typename RhsType, typename DstType>
+void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
-  typedef LLT<_MatrixType,UpLo> LLTType;
-  EIGEN_MAKE_SOLVE_HELPERS(LLTType,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dst = rhs();
-    dec().solveInPlace(dst);
-  }
-};
+  dst = rhs;
+  solveInPlace(dst);
 }
+#endif
 
 /** \internal use x = llt_object.solve(x);
   * 
@@ -465,8 +463,10 @@ MatrixType LLT<MatrixType,_UpLo>::reconstructedMatrix() const
   return matrixL() * matrixL().adjoint().toDenseMatrix();
 }
 
+#ifndef __CUDACC__
 /** \cholesky_module
   * \returns the LLT decomposition of \c *this
+  * \sa SelfAdjointView::llt()
   */
 template<typename Derived>
 inline const LLT<typename MatrixBase<Derived>::PlainObject>
@@ -477,6 +477,7 @@ MatrixBase<Derived>::llt() const
 
 /** \cholesky_module
   * \returns the LLT decomposition of \c *this
+  * \sa MatrixBase::llt()
   */
 template<typename MatrixType, unsigned int UpLo>
 inline const LLT<typename SelfAdjointView<MatrixType, UpLo>::PlainObject, UpLo>
@@ -484,7 +485,8 @@ SelfAdjointView<MatrixType, UpLo>::llt() const
 {
   return LLT<PlainObject,UpLo>(m_matrix);
 }
-
+#endif // __CUDACC__
+  
 } // end namespace Eigen
 
 #endif // EIGEN_LLT_H
diff --git a/nuparu/include/Eigen/src/Cholesky/LLT_MKL.h b/nuparu/include/Eigen/src/Cholesky/LLT_MKL.h
index 64daa445..0d42cb5b 100644
--- a/nuparu/include/Eigen/src/Cholesky/LLT_MKL.h
+++ b/nuparu/include/Eigen/src/Cholesky/LLT_MKL.h
@@ -46,7 +46,7 @@ template<typename Scalar> struct mkl_llt;
 template<> struct mkl_llt<EIGTYPE> \
 { \
   template<typename MatrixType> \
-  static inline typename MatrixType::Index potrf(MatrixType& m, char uplo) \
+  static inline Index potrf(MatrixType& m, char uplo) \
   { \
     lapack_int matrix_order; \
     lapack_int size, lda, info, StorageOrder; \
@@ -60,30 +60,30 @@ template<> struct mkl_llt<EIGTYPE> \
     lda = m.outerStride(); \
 \
     info = LAPACKE_##MKLPREFIX##potrf( matrix_order, uplo, size, (MKLTYPE*)a, lda ); \
-    info = (info==0) ? Success : NumericalIssue; \
+    info = (info==0) ? -1 : info>0 ? info-1 : size; \
     return info; \
   } \
 }; \
 template<> struct llt_inplace<EIGTYPE, Lower> \
 { \
   template<typename MatrixType> \
-  static typename MatrixType::Index blocked(MatrixType& m) \
+  static Index blocked(MatrixType& m) \
   { \
     return mkl_llt<EIGTYPE>::potrf(m, 'L'); \
   } \
   template<typename MatrixType, typename VectorType> \
-  static typename MatrixType::Index rankUpdate(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma) \
+  static Index rankUpdate(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma) \
   { return Eigen::internal::llt_rank_update_lower(mat, vec, sigma); } \
 }; \
 template<> struct llt_inplace<EIGTYPE, Upper> \
 { \
   template<typename MatrixType> \
-  static typename MatrixType::Index blocked(MatrixType& m) \
+  static Index blocked(MatrixType& m) \
   { \
     return mkl_llt<EIGTYPE>::potrf(m, 'U'); \
   } \
   template<typename MatrixType, typename VectorType> \
-  static typename MatrixType::Index rankUpdate(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma) \
+  static Index rankUpdate(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma) \
   { \
     Transpose<MatrixType> matt(mat); \
     return llt_inplace<EIGTYPE, Lower>::rankUpdate(matt, vec.conjugate(), sigma); \
diff --git a/nuparu/include/Eigen/src/CholmodSupport/CholmodSupport.h b/nuparu/include/Eigen/src/CholmodSupport/CholmodSupport.h
index 783324b0..06421d5e 100644
--- a/nuparu/include/Eigen/src/CholmodSupport/CholmodSupport.h
+++ b/nuparu/include/Eigen/src/CholmodSupport/CholmodSupport.h
@@ -48,8 +48,8 @@ void cholmod_configure_matrix(CholmodType& mat)
 /** Wraps the Eigen sparse matrix \a mat into a Cholmod sparse matrix object.
   * Note that the data are shared.
   */
-template<typename _Scalar, int _Options, typename _Index>
-cholmod_sparse viewAsCholmod(SparseMatrix<_Scalar,_Options,_Index>& mat)
+template<typename _Scalar, int _Options, typename _StorageIndex>
+cholmod_sparse viewAsCholmod(SparseMatrix<_Scalar,_Options,_StorageIndex>& mat)
 {
   cholmod_sparse res;
   res.nzmax   = mat.nonZeros();
@@ -58,10 +58,12 @@ cholmod_sparse viewAsCholmod(SparseMatrix<_Scalar,_Options,_Index>& mat)
   res.p       = mat.outerIndexPtr();
   res.i       = mat.innerIndexPtr();
   res.x       = mat.valuePtr();
+  res.z       = 0;
   res.sorted  = 1;
   if(mat.isCompressed())
   {
     res.packed  = 1;
+    res.nz = 0;
   }
   else
   {
@@ -72,11 +74,11 @@ cholmod_sparse viewAsCholmod(SparseMatrix<_Scalar,_Options,_Index>& mat)
   res.dtype   = 0;
   res.stype   = -1;
   
-  if (internal::is_same<_Index,int>::value)
+  if (internal::is_same<_StorageIndex,int>::value)
   {
     res.itype = CHOLMOD_INT;
   }
-  else if (internal::is_same<_Index,UF_long>::value)
+  else if (internal::is_same<_StorageIndex,SuiteSparse_long>::value)
   {
     res.itype = CHOLMOD_LONG;
   }
@@ -103,7 +105,7 @@ const cholmod_sparse viewAsCholmod(const SparseMatrix<_Scalar,_Options,_Index>&
 /** Returns a view of the Eigen sparse matrix \a mat as Cholmod sparse matrix.
   * The data are not copied but shared. */
 template<typename _Scalar, int _Options, typename _Index, unsigned int UpLo>
-cholmod_sparse viewAsCholmod(const SparseSelfAdjointView<SparseMatrix<_Scalar,_Options,_Index>, UpLo>& mat)
+cholmod_sparse viewAsCholmod(const SparseSelfAdjointView<const SparseMatrix<_Scalar,_Options,_Index>, UpLo>& mat)
 {
   cholmod_sparse res = viewAsCholmod(mat.matrix().const_cast_derived());
   
@@ -136,12 +138,12 @@ cholmod_dense viewAsCholmod(MatrixBase<Derived>& mat)
 
 /** Returns a view of the Cholmod sparse matrix \a cm as an Eigen sparse matrix.
   * The data are not copied but shared. */
-template<typename Scalar, int Flags, typename Index>
-MappedSparseMatrix<Scalar,Flags,Index> viewAsEigen(cholmod_sparse& cm)
+template<typename Scalar, int Flags, typename StorageIndex>
+MappedSparseMatrix<Scalar,Flags,StorageIndex> viewAsEigen(cholmod_sparse& cm)
 {
-  return MappedSparseMatrix<Scalar,Flags,Index>
-         (cm.nrow, cm.ncol, static_cast<Index*>(cm.p)[cm.ncol],
-          static_cast<Index*>(cm.p), static_cast<Index*>(cm.i),static_cast<Scalar*>(cm.x) );
+  return MappedSparseMatrix<Scalar,Flags,StorageIndex>
+         (cm.nrow, cm.ncol, static_cast<StorageIndex*>(cm.p)[cm.ncol],
+          static_cast<StorageIndex*>(cm.p), static_cast<StorageIndex*>(cm.i),static_cast<Scalar*>(cm.x) );
 }
 
 enum CholmodMode {
@@ -155,26 +157,35 @@ enum CholmodMode {
   * \sa class CholmodSupernodalLLT, class CholmodSimplicialLDLT, class CholmodSimplicialLLT
   */
 template<typename _MatrixType, int _UpLo, typename Derived>
-class CholmodBase : internal::noncopyable
+class CholmodBase : public SparseSolverBase<Derived>
 {
+  protected:
+    typedef SparseSolverBase<Derived> Base;
+    using Base::derived;
+    using Base::m_isInitialized;
   public:
     typedef _MatrixType MatrixType;
     enum { UpLo = _UpLo };
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
     typedef MatrixType CholMatrixType;
-    typedef typename MatrixType::Index Index;
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    enum {
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
 
   public:
 
     CholmodBase()
-      : m_cholmodFactor(0), m_info(Success), m_isInitialized(false)
+      : m_cholmodFactor(0), m_info(Success)
     {
+      m_shiftOffset[0] = m_shiftOffset[1] = RealScalar(0.0);
       cholmod_start(&m_cholmod);
     }
 
-    CholmodBase(const MatrixType& matrix)
-      : m_cholmodFactor(0), m_info(Success), m_isInitialized(false)
+    explicit CholmodBase(const MatrixType& matrix)
+      : m_cholmodFactor(0), m_info(Success)
     {
       m_shiftOffset[0] = m_shiftOffset[1] = RealScalar(0.0);
       cholmod_start(&m_cholmod);
@@ -188,11 +199,8 @@ class CholmodBase : internal::noncopyable
       cholmod_finish(&m_cholmod);
     }
     
-    inline Index cols() const { return m_cholmodFactor->n; }
-    inline Index rows() const { return m_cholmodFactor->n; }
-    
-    Derived& derived() { return *static_cast<Derived*>(this); }
-    const Derived& derived() const { return *static_cast<const Derived*>(this); }
+    inline StorageIndex cols() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); }
+    inline StorageIndex rows() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); }
     
     /** \brief Reports whether previous computation was successful.
       *
@@ -213,35 +221,7 @@ class CholmodBase : internal::noncopyable
       return derived();
     }
     
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<CholmodBase, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "LLT is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "CholmodDecomposition::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<CholmodBase, Rhs>(*this, b.derived());
-    }
-    
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::sparse_solve_retval<CholmodBase, Rhs>
-    solve(const SparseMatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "LLT is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "CholmodDecomposition::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::sparse_solve_retval<CholmodBase, Rhs>(*this, b.derived());
-    }
-    
-    /** Performs a symbolic decomposition on the sparcity of \a matrix.
+    /** Performs a symbolic decomposition on the sparsity pattern of \a matrix.
       *
       * This function is particularly useful when solving for several problems having the same structure.
       * 
@@ -265,7 +245,7 @@ class CholmodBase : internal::noncopyable
     
     /** Performs a numeric decomposition of \a matrix
       *
-      * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
+      * The given matrix must have the same sparsity pattern as the matrix on which the symbolic decomposition has been performed.
       *
       * \sa analyzePattern()
       */
@@ -287,7 +267,7 @@ class CholmodBase : internal::noncopyable
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     /** \internal */
     template<typename Rhs,typename Dest>
-    void _solve(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
+    void _solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
     {
       eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or symbolic()/numeric()");
       const Index size = m_cholmodFactor->n;
@@ -301,15 +281,16 @@ class CholmodBase : internal::noncopyable
       if(!x_cd)
       {
         this->m_info = NumericalIssue;
+        return;
       }
-      // TODO optimize this copy by swapping when possible (be carreful with alignment, etc.)
+      // TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
       dest = Matrix<Scalar,Dest::RowsAtCompileTime,Dest::ColsAtCompileTime>::Map(reinterpret_cast<Scalar*>(x_cd->x),b.rows(),b.cols());
       cholmod_free_dense(&x_cd, &m_cholmod);
     }
     
     /** \internal */
     template<typename RhsScalar, int RhsOptions, typename RhsIndex, typename DestScalar, int DestOptions, typename DestIndex>
-    void _solve(const SparseMatrix<RhsScalar,RhsOptions,RhsIndex> &b, SparseMatrix<DestScalar,DestOptions,DestIndex> &dest) const
+    void _solve_impl(const SparseMatrix<RhsScalar,RhsOptions,RhsIndex> &b, SparseMatrix<DestScalar,DestOptions,DestIndex> &dest) const
     {
       eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or symbolic()/numeric()");
       const Index size = m_cholmodFactor->n;
@@ -322,8 +303,9 @@ class CholmodBase : internal::noncopyable
       if(!x_cs)
       {
         this->m_info = NumericalIssue;
+        return;
       }
-      // TODO optimize this copy by swapping when possible (be carreful with alignment, etc.)
+      // TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
       dest = viewAsEigen<DestScalar,DestOptions,DestIndex>(*x_cs);
       cholmod_free_sparse(&x_cs, &m_cholmod);
     }
@@ -354,7 +336,6 @@ class CholmodBase : internal::noncopyable
     cholmod_factor* m_cholmodFactor;
     RealScalar m_shiftOffset[2];
     mutable ComputationInfo m_info;
-    bool m_isInitialized;
     int m_factorizationIsOk;
     int m_analysisIsOk;
 };
@@ -365,14 +346,16 @@ class CholmodBase : internal::noncopyable
   *
   * This class allows to solve for A.X = B sparse linear problems via a simplicial LL^T Cholesky factorization
   * using the Cholmod library.
-  * This simplicial variant is equivalent to Eigen's built-in SimplicialLLT class. Thefore, it has little practical interest.
-  * The sparse matrix A must be selfajoint and positive definite. The vectors or matrices
+  * This simplicial variant is equivalent to Eigen's built-in SimplicialLLT class. Therefore, it has little practical interest.
+  * The sparse matrix A must be selfadjoint and positive definite. The vectors or matrices
   * X and B can be either dense or sparse.
   *
   * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
   *               or Upper. Default is Lower.
   *
+  * \implsparsesolverconcept
+  *
   * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
   *
   * \sa \ref TutorialSparseDirectSolvers, class CholmodSupernodalLLT, class SimplicialLLT
@@ -392,7 +375,7 @@ class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimpl
     CholmodSimplicialLLT(const MatrixType& matrix) : Base()
     {
       init();
-      compute(matrix);
+      this->compute(matrix);
     }
 
     ~CholmodSimplicialLLT() {}
@@ -412,14 +395,16 @@ class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimpl
   *
   * This class allows to solve for A.X = B sparse linear problems via a simplicial LDL^T Cholesky factorization
   * using the Cholmod library.
-  * This simplicial variant is equivalent to Eigen's built-in SimplicialLDLT class. Thefore, it has little practical interest.
-  * The sparse matrix A must be selfajoint and positive definite. The vectors or matrices
+  * This simplicial variant is equivalent to Eigen's built-in SimplicialLDLT class. Therefore, it has little practical interest.
+  * The sparse matrix A must be selfadjoint and positive definite. The vectors or matrices
   * X and B can be either dense or sparse.
   *
   * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
   *               or Upper. Default is Lower.
   *
+  * \implsparsesolverconcept
+  *
   * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
   *
   * \sa \ref TutorialSparseDirectSolvers, class CholmodSupernodalLLT, class SimplicialLDLT
@@ -439,7 +424,7 @@ class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimp
     CholmodSimplicialLDLT(const MatrixType& matrix) : Base()
     {
       init();
-      compute(matrix);
+      this->compute(matrix);
     }
 
     ~CholmodSimplicialLDLT() {}
@@ -458,13 +443,15 @@ class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimp
   * This class allows to solve for A.X = B sparse linear problems via a supernodal LL^T Cholesky factorization
   * using the Cholmod library.
   * This supernodal variant performs best on dense enough problems, e.g., 3D FEM, or very high order 2D FEM.
-  * The sparse matrix A must be selfajoint and positive definite. The vectors or matrices
+  * The sparse matrix A must be selfadjoint and positive definite. The vectors or matrices
   * X and B can be either dense or sparse.
   *
   * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
   *               or Upper. Default is Lower.
   *
+  * \implsparsesolverconcept
+  *
   * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
   *
   * \sa \ref TutorialSparseDirectSolvers
@@ -484,7 +471,7 @@ class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSuper
     CholmodSupernodalLLT(const MatrixType& matrix) : Base()
     {
       init();
-      compute(matrix);
+      this->compute(matrix);
     }
 
     ~CholmodSupernodalLLT() {}
@@ -501,7 +488,7 @@ class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSuper
   * \brief A general Cholesky factorization and solver based on Cholmod
   *
   * This class allows to solve for A.X = B sparse linear problems via a LL^T or LDL^T Cholesky factorization
-  * using the Cholmod library. The sparse matrix A must be selfajoint and positive definite. The vectors or matrices
+  * using the Cholmod library. The sparse matrix A must be selfadjoint and positive definite. The vectors or matrices
   * X and B can be either dense or sparse.
   *
   * This variant permits to change the underlying Cholesky method at runtime.
@@ -512,6 +499,8 @@ class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSuper
   * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
   *               or Upper. Default is Lower.
   *
+  * \implsparsesolverconcept
+  *
   * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
   *
   * \sa \ref TutorialSparseDirectSolvers
@@ -531,7 +520,7 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom
     CholmodDecomposition(const MatrixType& matrix) : Base()
     {
       init();
-      compute(matrix);
+      this->compute(matrix);
     }
 
     ~CholmodDecomposition() {}
@@ -569,36 +558,6 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom
     }
 };
 
-namespace internal {
-  
-template<typename _MatrixType, int _UpLo, typename Derived, typename Rhs>
-struct solve_retval<CholmodBase<_MatrixType,_UpLo,Derived>, Rhs>
-  : solve_retval_base<CholmodBase<_MatrixType,_UpLo,Derived>, Rhs>
-{
-  typedef CholmodBase<_MatrixType,_UpLo,Derived> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-template<typename _MatrixType, int _UpLo, typename Derived, typename Rhs>
-struct sparse_solve_retval<CholmodBase<_MatrixType,_UpLo,Derived>, Rhs>
-  : sparse_solve_retval_base<CholmodBase<_MatrixType,_UpLo,Derived>, Rhs>
-{
-  typedef CholmodBase<_MatrixType,_UpLo,Derived> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-} // end namespace internal
-
 } // end namespace Eigen
 
 #endif // EIGEN_CHOLMODSUPPORT_H
diff --git a/nuparu/include/Eigen/src/Core/Array.h b/nuparu/include/Eigen/src/Core/Array.h
index 497efff6..e38eda72 100644
--- a/nuparu/include/Eigen/src/Core/Array.h
+++ b/nuparu/include/Eigen/src/Core/Array.h
@@ -24,6 +24,9 @@ namespace Eigen {
   * API for the %Matrix class provides easy access to linear-algebra
   * operations.
   *
+  * See documentation of class Matrix for detailed information on the template parameters
+  * and storage layout.
+  * 
   * This class can be extended with the help of the plugin mechanism described on the page
   * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_ARRAY_PLUGIN.
   *
@@ -69,11 +72,27 @@ class Array
       * the usage of 'using'. This should be done only for operator=.
       */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Array& operator=(const EigenBase<OtherDerived> &other)
     {
       return Base::operator=(other);
     }
 
+    /** Set all the entries to \a value.
+      * \sa DenseBase::setConstant(), DenseBase::fill()
+      */
+    /* This overload is needed because the usage of
+      *   using Base::operator=;
+      * fails on MSVC. Since the code below is working with GCC and MSVC, we skipped
+      * the usage of 'using'. This should be done only for operator=.
+      */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Array& operator=(const Scalar &value)
+    {
+      Base::setConstant(value);
+      return *this;
+    }
+
     /** Copies the value of the expression \a other into \c *this with automatic resizing.
       *
       * *this might be resized to match the dimensions of \a other. If *this was a null matrix (not already initialized),
@@ -84,7 +103,8 @@ class Array
       * remain row-vectors and vectors remain vectors.
       */
     template<typename OtherDerived>
-    EIGEN_STRONG_INLINE Array& operator=(const ArrayBase<OtherDerived>& other)
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Array& operator=(const DenseBase<OtherDerived>& other)
     {
       return Base::_set(other);
     }
@@ -92,11 +112,12 @@ class Array
     /** This is a special case of the templated operator=. Its purpose is to
       * prevent a default operator= from hiding the templated operator=.
       */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Array& operator=(const Array& other)
     {
       return Base::_set(other);
     }
-
+    
     /** Default constructor.
       *
       * For fixed-size matrices, does nothing.
@@ -107,6 +128,7 @@ class Array
       *
       * \sa resize(Index,Index)
       */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Array() : Base()
     {
       Base::_check_template_params();
@@ -116,6 +138,7 @@ class Array
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     // FIXME is it still needed ??
     /** \internal */
+    EIGEN_DEVICE_FUNC
     Array(internal::constructor_without_unaligned_array_assert)
       : Base(internal::constructor_without_unaligned_array_assert())
     {
@@ -124,41 +147,64 @@ class Array
     }
 #endif
 
-    /** Constructs a vector or row-vector with given dimension. \only_for_vectors
-      *
-      * Note that this is only useful for dynamic-size vectors. For fixed-size vectors,
-      * it is redundant to pass the dimension here, so it makes more sense to use the default
-      * constructor Matrix() instead.
-      */
-    EIGEN_STRONG_INLINE explicit Array(Index dim)
-      : Base(dim, RowsAtCompileTime == 1 ? 1 : dim, ColsAtCompileTime == 1 ? 1 : dim)
+#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC
+    Array(Array&& other)
+      : Base(std::move(other))
     {
       Base::_check_template_params();
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Array)
-      eigen_assert(dim >= 0);
-      eigen_assert(SizeAtCompileTime == Dynamic || SizeAtCompileTime == dim);
-      EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+      if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic)
+        Base::_set_noalias(other);
     }
+    EIGEN_DEVICE_FUNC
+    Array& operator=(Array&& other)
+    {
+      other.swap(*this);
+      return *this;
+    }
+#endif
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE explicit Array(const T& x)
+    {
+      Base::_check_template_params();
+      Base::template _init1<T>(x);
+    }
+
     template<typename T0, typename T1>
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Array(const T0& val0, const T1& val1)
     {
       Base::_check_template_params();
       this->template _init2<T0,T1>(val0, val1);
     }
     #else
-    /** constructs an uninitialized matrix with \a rows rows and \a cols columns.
+    /** \brief Constructs a fixed-sized array initialized with coefficients starting at \a data */
+    EIGEN_DEVICE_FUNC explicit Array(const Scalar *data);
+    /** Constructs a vector or row-vector with given dimension. \only_for_vectors
       *
-      * This is useful for dynamic-size matrices. For fixed-size matrices,
+      * Note that this is only useful for dynamic-size vectors. For fixed-size vectors,
+      * it is redundant to pass the dimension here, so it makes more sense to use the default
+      * constructor Array() instead.
+      */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE explicit Array(Index dim);
+    /** constructs an initialized 1x1 Array with the given coefficient */
+    Array(const Scalar& value);
+    /** constructs an uninitialized array with \a rows rows and \a cols columns.
+      *
+      * This is useful for dynamic-size arrays. For fixed-size arrays,
       * it is redundant to pass these parameters, so one should use the default constructor
-      * Matrix() instead. */
+      * Array() instead. */
     Array(Index rows, Index cols);
     /** constructs an initialized 2D vector with given coefficients */
     Array(const Scalar& val0, const Scalar& val1);
     #endif
 
     /** constructs an initialized 3D vector with given coefficients */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2)
     {
       Base::_check_template_params();
@@ -168,6 +214,7 @@ class Array
       m_storage.data()[2] = val2;
     }
     /** constructs an initialized 4D vector with given coefficients */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2, const Scalar& val3)
     {
       Base::_check_template_params();
@@ -178,51 +225,21 @@ class Array
       m_storage.data()[3] = val3;
     }
 
-    explicit Array(const Scalar *data);
-
-    /** Constructor copying the value of the expression \a other */
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE Array(const ArrayBase<OtherDerived>& other)
-             : Base(other.rows() * other.cols(), other.rows(), other.cols())
-    {
-      Base::_check_template_params();
-      Base::_set_noalias(other);
-    }
     /** Copy constructor */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Array(const Array& other)
-            : Base(other.rows() * other.cols(), other.rows(), other.cols())
-    {
-      Base::_check_template_params();
-      Base::_set_noalias(other);
-    }
-    /** Copy constructor with in-place evaluation */
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE Array(const ReturnByValue<OtherDerived>& other)
-    {
-      Base::_check_template_params();
-      Base::resize(other.rows(), other.cols());
-      other.evalTo(*this);
-    }
+            : Base(other)
+    { }
 
     /** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other)
-      : Base(other.derived().rows() * other.derived().cols(), other.derived().rows(), other.derived().cols())
-    {
-      Base::_check_template_params();
-      Base::resize(other.rows(), other.cols());
-      *this = other;
-    }
-
-    /** Override MatrixBase::swap() since for dynamic-sized matrices of same type it is enough to swap the
-      * data pointers.
-      */
-    template<typename OtherDerived>
-    void swap(ArrayBase<OtherDerived> const & other)
-    { this->_swap(other.derived()); }
+      : Base(other.derived())
+    { }
 
-    inline Index innerStride() const { return 1; }
-    inline Index outerStride() const { return this->innerSize(); }
+    EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; }
+    EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); }
 
     #ifdef EIGEN_ARRAY_PLUGIN
     #include EIGEN_ARRAY_PLUGIN
diff --git a/nuparu/include/Eigen/src/Core/ArrayBase.h b/nuparu/include/Eigen/src/Core/ArrayBase.h
index 38852600..b4c24a27 100644
--- a/nuparu/include/Eigen/src/Core/ArrayBase.h
+++ b/nuparu/include/Eigen/src/Core/ArrayBase.h
@@ -46,16 +46,14 @@ template<typename Derived> class ArrayBase
 
     typedef ArrayBase Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl;
 
-    using internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
-                typename NumTraits<typename internal::traits<Derived>::Scalar>::Real>::operator*;
-
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef typename internal::packet_traits<Scalar>::type PacketScalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
 
     typedef DenseBase<Derived> Base;
+    using Base::operator*;
+    using Base::operator/;
     using Base::RowsAtCompileTime;
     using Base::ColsAtCompileTime;
     using Base::SizeAtCompileTime;
@@ -64,8 +62,7 @@ template<typename Derived> class ArrayBase
     using Base::MaxSizeAtCompileTime;
     using Base::IsVectorAtCompileTime;
     using Base::Flags;
-    using Base::CoeffReadCost;
-
+    
     using Base::derived;
     using Base::const_cast_derived;
     using Base::rows;
@@ -85,22 +82,10 @@ template<typename Derived> class ArrayBase
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** \internal the plain matrix type corresponding to this expression. Note that is not necessarily
-      * exactly the return type of eval(): in the case of plain matrices, the return type of eval() is a const
-      * reference to a matrix, not a matrix! It is however guaranteed that the return type of eval() is either
-      * PlainObject or const PlainObject&.
-      */
-    typedef Array<typename internal::traits<Derived>::Scalar,
-                internal::traits<Derived>::RowsAtCompileTime,
-                internal::traits<Derived>::ColsAtCompileTime,
-                AutoAlign | (internal::traits<Derived>::Flags&RowMajorBit ? RowMajor : ColMajor),
-                internal::traits<Derived>::MaxRowsAtCompileTime,
-                internal::traits<Derived>::MaxColsAtCompileTime
-          > PlainObject;
-
+    typedef typename Base::PlainObject PlainObject;
 
     /** \internal Represents a matrix with all coefficients equal to one another*/
-    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,Derived> ConstantReturnType;
+    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,PlainObject> ConstantReturnType;
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::ArrayBase
@@ -118,40 +103,57 @@ template<typename Derived> class ArrayBase
     /** Special case of the template operator=, in order to prevent the compiler
       * from generating a default operator= (issue hit with g++ 4.1)
       */
+    EIGEN_DEVICE_FUNC
     Derived& operator=(const ArrayBase& other)
     {
-      return internal::assign_selector<Derived,Derived>::run(derived(), other.derived());
+      internal::call_assignment(derived(), other.derived());
+      return derived();
     }
-
-    Derived& operator+=(const Scalar& scalar)
-    { return *this = derived() + scalar; }
-    Derived& operator-=(const Scalar& scalar)
-    { return *this = derived() - scalar; }
+    
+    /** Set all the entries to \a value.
+      * \sa DenseBase::setConstant(), DenseBase::fill() */
+    EIGEN_DEVICE_FUNC
+    Derived& operator=(const Scalar &value)
+    { Base::setConstant(value); return derived(); }
+
+    EIGEN_DEVICE_FUNC
+    Derived& operator+=(const Scalar& scalar);
+    EIGEN_DEVICE_FUNC
+    Derived& operator-=(const Scalar& scalar);
 
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     Derived& operator+=(const ArrayBase<OtherDerived>& other);
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     Derived& operator-=(const ArrayBase<OtherDerived>& other);
 
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     Derived& operator*=(const ArrayBase<OtherDerived>& other);
 
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     Derived& operator/=(const ArrayBase<OtherDerived>& other);
 
   public:
+    EIGEN_DEVICE_FUNC
     ArrayBase<Derived>& array() { return *this; }
+    EIGEN_DEVICE_FUNC
     const ArrayBase<Derived>& array() const { return *this; }
 
     /** \returns an \link Eigen::MatrixBase Matrix \endlink expression of this array
       * \sa MatrixBase::array() */
-    MatrixWrapper<Derived> matrix() { return derived(); }
-    const MatrixWrapper<const Derived> matrix() const { return derived(); }
+    EIGEN_DEVICE_FUNC
+    MatrixWrapper<Derived> matrix() { return MatrixWrapper<Derived>(derived()); }
+    EIGEN_DEVICE_FUNC
+    const MatrixWrapper<const Derived> matrix() const { return MatrixWrapper<const Derived>(derived()); }
 
 //     template<typename Dest>
 //     inline void evalTo(Dest& dst) const { dst = matrix(); }
 
   protected:
+    EIGEN_DEVICE_FUNC
     ArrayBase() : Base() {}
 
   private:
@@ -176,8 +178,7 @@ template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
 {
-  SelfCwiseBinaryOp<internal::scalar_difference_op<Scalar>, Derived, OtherDerived> tmp(derived());
-  tmp = other.derived();
+  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar>());
   return derived();
 }
 
@@ -190,8 +191,7 @@ template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
 {
-  SelfCwiseBinaryOp<internal::scalar_sum_op<Scalar>, Derived, OtherDerived> tmp(derived());
-  tmp = other.derived();
+  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar>());
   return derived();
 }
 
@@ -204,8 +204,7 @@ template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other)
 {
-  SelfCwiseBinaryOp<internal::scalar_product_op<Scalar>, Derived, OtherDerived> tmp(derived());
-  tmp = other.derived();
+  call_assignment(derived(), other.derived(), internal::mul_assign_op<Scalar,typename OtherDerived::Scalar>());
   return derived();
 }
 
@@ -218,8 +217,7 @@ template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 ArrayBase<Derived>::operator/=(const ArrayBase<OtherDerived>& other)
 {
-  SelfCwiseBinaryOp<internal::scalar_quotient_op<Scalar>, Derived, OtherDerived> tmp(derived());
-  tmp = other.derived();
+  call_assignment(derived(), other.derived(), internal::div_assign_op<Scalar>());
   return derived();
 }
 
diff --git a/nuparu/include/Eigen/src/Core/ArrayWrapper.h b/nuparu/include/Eigen/src/Core/ArrayWrapper.h
index a791bc35..4e484f29 100644
--- a/nuparu/include/Eigen/src/Core/ArrayWrapper.h
+++ b/nuparu/include/Eigen/src/Core/ArrayWrapper.h
@@ -29,6 +29,11 @@ struct traits<ArrayWrapper<ExpressionType> >
   : public traits<typename remove_all<typename ExpressionType::Nested>::type >
 {
   typedef ArrayXpr XprKind;
+  // Let's remove NestByRefBit
+  enum {
+    Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
+    Flags = Flags0 & ~NestByRefBit
+  };
 };
 }
 
@@ -39,6 +44,7 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
     typedef ArrayBase<ArrayWrapper> Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(ArrayWrapper)
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ArrayWrapper)
+    typedef typename internal::remove_all<ExpressionType>::type NestedExpression;
 
     typedef typename internal::conditional<
                        internal::is_lvalue<ExpressionType>::value,
@@ -46,43 +52,56 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
                        const Scalar
                      >::type ScalarWithConstIfNotLvalue;
 
-    typedef typename internal::nested<ExpressionType>::type NestedExpressionType;
+    typedef typename internal::ref_selector<ExpressionType>::type NestedExpressionType;
 
-    inline ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {}
+    EIGEN_DEVICE_FUNC
+    explicit EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {}
 
+    EIGEN_DEVICE_FUNC
     inline Index rows() const { return m_expression.rows(); }
+    EIGEN_DEVICE_FUNC
     inline Index cols() const { return m_expression.cols(); }
+    EIGEN_DEVICE_FUNC
     inline Index outerStride() const { return m_expression.outerStride(); }
+    EIGEN_DEVICE_FUNC
     inline Index innerStride() const { return m_expression.innerStride(); }
 
+    EIGEN_DEVICE_FUNC
     inline ScalarWithConstIfNotLvalue* data() { return m_expression.const_cast_derived().data(); }
+    EIGEN_DEVICE_FUNC
     inline const Scalar* data() const { return m_expression.data(); }
 
+    EIGEN_DEVICE_FUNC
     inline CoeffReturnType coeff(Index rowId, Index colId) const
     {
       return m_expression.coeff(rowId, colId);
     }
 
+    EIGEN_DEVICE_FUNC
     inline Scalar& coeffRef(Index rowId, Index colId)
     {
       return m_expression.const_cast_derived().coeffRef(rowId, colId);
     }
 
+    EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index rowId, Index colId) const
     {
       return m_expression.const_cast_derived().coeffRef(rowId, colId);
     }
 
+    EIGEN_DEVICE_FUNC
     inline CoeffReturnType coeff(Index index) const
     {
       return m_expression.coeff(index);
     }
 
+    EIGEN_DEVICE_FUNC
     inline Scalar& coeffRef(Index index)
     {
       return m_expression.const_cast_derived().coeffRef(index);
     }
 
+    EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index index) const
     {
       return m_expression.const_cast_derived().coeffRef(index);
@@ -113,9 +132,11 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
     }
 
     template<typename Dest>
+    EIGEN_DEVICE_FUNC
     inline void evalTo(Dest& dst) const { dst = m_expression; }
 
     const typename internal::remove_all<NestedExpressionType>::type& 
+    EIGEN_DEVICE_FUNC
     nestedExpression() const 
     {
       return m_expression;
@@ -123,10 +144,12 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
 
     /** Forwards the resizing request to the nested expression
       * \sa DenseBase::resize(Index)  */
+    EIGEN_DEVICE_FUNC
     void resize(Index newSize) { m_expression.const_cast_derived().resize(newSize); }
     /** Forwards the resizing request to the nested expression
       * \sa DenseBase::resize(Index,Index)*/
-    void resize(Index nbRows, Index nbCols) { m_expression.const_cast_derived().resize(nbRows,nbCols); }
+    EIGEN_DEVICE_FUNC
+    void resize(Index rows, Index cols) { m_expression.const_cast_derived().resize(rows,cols); }
 
   protected:
     NestedExpressionType m_expression;
@@ -149,6 +172,11 @@ struct traits<MatrixWrapper<ExpressionType> >
  : public traits<typename remove_all<typename ExpressionType::Nested>::type >
 {
   typedef MatrixXpr XprKind;
+  // Let's remove NestByRefBit
+  enum {
+    Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
+    Flags = Flags0 & ~NestByRefBit
+  };
 };
 }
 
@@ -159,6 +187,7 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
     typedef MatrixBase<MatrixWrapper<ExpressionType> > Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(MatrixWrapper)
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(MatrixWrapper)
+    typedef typename internal::remove_all<ExpressionType>::type NestedExpression;
 
     typedef typename internal::conditional<
                        internal::is_lvalue<ExpressionType>::value,
@@ -166,43 +195,56 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
                        const Scalar
                      >::type ScalarWithConstIfNotLvalue;
 
-    typedef typename internal::nested<ExpressionType>::type NestedExpressionType;
+    typedef typename internal::ref_selector<ExpressionType>::type NestedExpressionType;
 
-    inline MatrixWrapper(ExpressionType& a_matrix) : m_expression(a_matrix) {}
+    EIGEN_DEVICE_FUNC
+    explicit inline MatrixWrapper(ExpressionType& matrix) : m_expression(matrix) {}
 
+    EIGEN_DEVICE_FUNC
     inline Index rows() const { return m_expression.rows(); }
+    EIGEN_DEVICE_FUNC
     inline Index cols() const { return m_expression.cols(); }
+    EIGEN_DEVICE_FUNC
     inline Index outerStride() const { return m_expression.outerStride(); }
+    EIGEN_DEVICE_FUNC
     inline Index innerStride() const { return m_expression.innerStride(); }
 
+    EIGEN_DEVICE_FUNC
     inline ScalarWithConstIfNotLvalue* data() { return m_expression.const_cast_derived().data(); }
+    EIGEN_DEVICE_FUNC
     inline const Scalar* data() const { return m_expression.data(); }
 
+    EIGEN_DEVICE_FUNC
     inline CoeffReturnType coeff(Index rowId, Index colId) const
     {
       return m_expression.coeff(rowId, colId);
     }
 
+    EIGEN_DEVICE_FUNC
     inline Scalar& coeffRef(Index rowId, Index colId)
     {
       return m_expression.const_cast_derived().coeffRef(rowId, colId);
     }
 
+    EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index rowId, Index colId) const
     {
       return m_expression.derived().coeffRef(rowId, colId);
     }
 
+    EIGEN_DEVICE_FUNC
     inline CoeffReturnType coeff(Index index) const
     {
       return m_expression.coeff(index);
     }
 
+    EIGEN_DEVICE_FUNC
     inline Scalar& coeffRef(Index index)
     {
       return m_expression.const_cast_derived().coeffRef(index);
     }
 
+    EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index index) const
     {
       return m_expression.const_cast_derived().coeffRef(index);
@@ -232,6 +274,7 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
       m_expression.const_cast_derived().template writePacket<LoadMode>(index, val);
     }
 
+    EIGEN_DEVICE_FUNC
     const typename internal::remove_all<NestedExpressionType>::type& 
     nestedExpression() const 
     {
@@ -240,10 +283,12 @@ class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
 
     /** Forwards the resizing request to the nested expression
       * \sa DenseBase::resize(Index)  */
+    EIGEN_DEVICE_FUNC
     void resize(Index newSize) { m_expression.const_cast_derived().resize(newSize); }
     /** Forwards the resizing request to the nested expression
       * \sa DenseBase::resize(Index,Index)*/
-    void resize(Index nbRows, Index nbCols) { m_expression.const_cast_derived().resize(nbRows,nbCols); }
+    EIGEN_DEVICE_FUNC
+    void resize(Index rows, Index cols) { m_expression.const_cast_derived().resize(rows,cols); }
 
   protected:
     NestedExpressionType m_expression;
diff --git a/nuparu/include/Eigen/src/Core/Assign.h b/nuparu/include/Eigen/src/Core/Assign.h
index 1dccc2f4..53806ba3 100644
--- a/nuparu/include/Eigen/src/Core/Assign.h
+++ b/nuparu/include/Eigen/src/Core/Assign.h
@@ -14,471 +14,6 @@
 
 namespace Eigen {
 
-namespace internal {
-
-/***************************************************************************
-* Part 1 : the logic deciding a strategy for traversal and unrolling       *
-***************************************************************************/
-
-template <typename Derived, typename OtherDerived>
-struct assign_traits
-{
-public:
-  enum {
-    DstIsAligned = Derived::Flags & AlignedBit,
-    DstHasDirectAccess = Derived::Flags & DirectAccessBit,
-    SrcIsAligned = OtherDerived::Flags & AlignedBit,
-    JointAlignment = bool(DstIsAligned) && bool(SrcIsAligned) ? Aligned : Unaligned
-  };
-
-private:
-  enum {
-    InnerSize = int(Derived::IsVectorAtCompileTime) ? int(Derived::SizeAtCompileTime)
-              : int(Derived::Flags)&RowMajorBit ? int(Derived::ColsAtCompileTime)
-              : int(Derived::RowsAtCompileTime),
-    InnerMaxSize = int(Derived::IsVectorAtCompileTime) ? int(Derived::MaxSizeAtCompileTime)
-              : int(Derived::Flags)&RowMajorBit ? int(Derived::MaxColsAtCompileTime)
-              : int(Derived::MaxRowsAtCompileTime),
-    MaxSizeAtCompileTime = Derived::SizeAtCompileTime,
-    PacketSize = packet_traits<typename Derived::Scalar>::size
-  };
-
-  enum {
-    StorageOrdersAgree = (int(Derived::IsRowMajor) == int(OtherDerived::IsRowMajor)),
-    MightVectorize = StorageOrdersAgree
-                  && (int(Derived::Flags) & int(OtherDerived::Flags) & ActualPacketAccessBit),
-    MayInnerVectorize  = MightVectorize && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0
-                       && int(DstIsAligned) && int(SrcIsAligned),
-    MayLinearize = StorageOrdersAgree && (int(Derived::Flags) & int(OtherDerived::Flags) & LinearAccessBit),
-    MayLinearVectorize = MightVectorize && MayLinearize && DstHasDirectAccess
-                       && (DstIsAligned || MaxSizeAtCompileTime == Dynamic),
-      /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
-         so it's only good for large enough sizes. */
-    MaySliceVectorize  = MightVectorize && DstHasDirectAccess
-                       && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=3*PacketSize)
-      /* slice vectorization can be slow, so we only want it if the slices are big, which is
-         indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block
-         in a fixed-size matrix */
-  };
-
-public:
-  enum {
-    Traversal = int(MayInnerVectorize)  ? int(InnerVectorizedTraversal)
-              : int(MayLinearVectorize) ? int(LinearVectorizedTraversal)
-              : int(MaySliceVectorize)  ? int(SliceVectorizedTraversal)
-              : int(MayLinearize)       ? int(LinearTraversal)
-                                        : int(DefaultTraversal),
-    Vectorized = int(Traversal) == InnerVectorizedTraversal
-              || int(Traversal) == LinearVectorizedTraversal
-              || int(Traversal) == SliceVectorizedTraversal
-  };
-
-private:
-  enum {
-    UnrollingLimit      = EIGEN_UNROLLING_LIMIT * (Vectorized ? int(PacketSize) : 1),
-    MayUnrollCompletely = int(Derived::SizeAtCompileTime) != Dynamic
-                       && int(OtherDerived::CoeffReadCost) != Dynamic
-                       && int(Derived::SizeAtCompileTime) * int(OtherDerived::CoeffReadCost) <= int(UnrollingLimit),
-    MayUnrollInner      = int(InnerSize) != Dynamic
-                       && int(OtherDerived::CoeffReadCost) != Dynamic
-                       && int(InnerSize) * int(OtherDerived::CoeffReadCost) <= int(UnrollingLimit)
-  };
-
-public:
-  enum {
-    Unrolling = (int(Traversal) == int(InnerVectorizedTraversal) || int(Traversal) == int(DefaultTraversal))
-                ? (
-                    int(MayUnrollCompletely) ? int(CompleteUnrolling)
-                  : int(MayUnrollInner)      ? int(InnerUnrolling)
-                                             : int(NoUnrolling)
-                  )
-              : int(Traversal) == int(LinearVectorizedTraversal)
-                ? ( bool(MayUnrollCompletely) && bool(DstIsAligned) ? int(CompleteUnrolling) : int(NoUnrolling) )
-              : int(Traversal) == int(LinearTraversal)
-                ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) : int(NoUnrolling) )
-              : int(NoUnrolling)
-  };
-
-#ifdef EIGEN_DEBUG_ASSIGN
-  static void debug()
-  {
-    EIGEN_DEBUG_VAR(DstIsAligned)
-    EIGEN_DEBUG_VAR(SrcIsAligned)
-    EIGEN_DEBUG_VAR(JointAlignment)
-    EIGEN_DEBUG_VAR(InnerSize)
-    EIGEN_DEBUG_VAR(InnerMaxSize)
-    EIGEN_DEBUG_VAR(PacketSize)
-    EIGEN_DEBUG_VAR(StorageOrdersAgree)
-    EIGEN_DEBUG_VAR(MightVectorize)
-    EIGEN_DEBUG_VAR(MayLinearize)
-    EIGEN_DEBUG_VAR(MayInnerVectorize)
-    EIGEN_DEBUG_VAR(MayLinearVectorize)
-    EIGEN_DEBUG_VAR(MaySliceVectorize)
-    EIGEN_DEBUG_VAR(Traversal)
-    EIGEN_DEBUG_VAR(UnrollingLimit)
-    EIGEN_DEBUG_VAR(MayUnrollCompletely)
-    EIGEN_DEBUG_VAR(MayUnrollInner)
-    EIGEN_DEBUG_VAR(Unrolling)
-  }
-#endif
-};
-
-/***************************************************************************
-* Part 2 : meta-unrollers
-***************************************************************************/
-
-/************************
-*** Default traversal ***
-************************/
-
-template<typename Derived1, typename Derived2, int Index, int Stop>
-struct assign_DefaultTraversal_CompleteUnrolling
-{
-  enum {
-    outer = Index / Derived1::InnerSizeAtCompileTime,
-    inner = Index % Derived1::InnerSizeAtCompileTime
-  };
-
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    dst.copyCoeffByOuterInner(outer, inner, src);
-    assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, Index+1, Stop>::run(dst, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Stop>
-struct assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, Stop, Stop>
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &) {}
-};
-
-template<typename Derived1, typename Derived2, int Index, int Stop>
-struct assign_DefaultTraversal_InnerUnrolling
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src, typename Derived1::Index outer)
-  {
-    dst.copyCoeffByOuterInner(outer, Index, src);
-    assign_DefaultTraversal_InnerUnrolling<Derived1, Derived2, Index+1, Stop>::run(dst, src, outer);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Stop>
-struct assign_DefaultTraversal_InnerUnrolling<Derived1, Derived2, Stop, Stop>
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &, typename Derived1::Index) {}
-};
-
-/***********************
-*** Linear traversal ***
-***********************/
-
-template<typename Derived1, typename Derived2, int Index, int Stop>
-struct assign_LinearTraversal_CompleteUnrolling
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    dst.copyCoeff(Index, src);
-    assign_LinearTraversal_CompleteUnrolling<Derived1, Derived2, Index+1, Stop>::run(dst, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Stop>
-struct assign_LinearTraversal_CompleteUnrolling<Derived1, Derived2, Stop, Stop>
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &) {}
-};
-
-/**************************
-*** Inner vectorization ***
-**************************/
-
-template<typename Derived1, typename Derived2, int Index, int Stop>
-struct assign_innervec_CompleteUnrolling
-{
-  enum {
-    outer = Index / Derived1::InnerSizeAtCompileTime,
-    inner = Index % Derived1::InnerSizeAtCompileTime,
-    JointAlignment = assign_traits<Derived1,Derived2>::JointAlignment
-  };
-
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    dst.template copyPacketByOuterInner<Derived2, Aligned, JointAlignment>(outer, inner, src);
-    assign_innervec_CompleteUnrolling<Derived1, Derived2,
-      Index+packet_traits<typename Derived1::Scalar>::size, Stop>::run(dst, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Stop>
-struct assign_innervec_CompleteUnrolling<Derived1, Derived2, Stop, Stop>
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &) {}
-};
-
-template<typename Derived1, typename Derived2, int Index, int Stop>
-struct assign_innervec_InnerUnrolling
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src, typename Derived1::Index outer)
-  {
-    dst.template copyPacketByOuterInner<Derived2, Aligned, Aligned>(outer, Index, src);
-    assign_innervec_InnerUnrolling<Derived1, Derived2,
-      Index+packet_traits<typename Derived1::Scalar>::size, Stop>::run(dst, src, outer);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Stop>
-struct assign_innervec_InnerUnrolling<Derived1, Derived2, Stop, Stop>
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &, typename Derived1::Index) {}
-};
-
-/***************************************************************************
-* Part 3 : implementation of all cases
-***************************************************************************/
-
-template<typename Derived1, typename Derived2,
-         int Traversal = assign_traits<Derived1, Derived2>::Traversal,
-         int Unrolling = assign_traits<Derived1, Derived2>::Unrolling,
-         int Version = Specialized>
-struct assign_impl;
-
-/************************
-*** Default traversal ***
-************************/
-
-template<typename Derived1, typename Derived2, int Unrolling, int Version>
-struct assign_impl<Derived1, Derived2, InvalidTraversal, Unrolling, Version>
-{
-  static inline void run(Derived1 &, const Derived2 &) { }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, DefaultTraversal, NoUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    const Index innerSize = dst.innerSize();
-    const Index outerSize = dst.outerSize();
-    for(Index outer = 0; outer < outerSize; ++outer)
-      for(Index inner = 0; inner < innerSize; ++inner)
-        dst.copyCoeffByOuterInner(outer, inner, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, DefaultTraversal, CompleteUnrolling, Version>
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, 0, Derived1::SizeAtCompileTime>
-      ::run(dst, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, DefaultTraversal, InnerUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    const Index outerSize = dst.outerSize();
-    for(Index outer = 0; outer < outerSize; ++outer)
-      assign_DefaultTraversal_InnerUnrolling<Derived1, Derived2, 0, Derived1::InnerSizeAtCompileTime>
-        ::run(dst, src, outer);
-  }
-};
-
-/***********************
-*** Linear traversal ***
-***********************/
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, LinearTraversal, NoUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    const Index size = dst.size();
-    for(Index i = 0; i < size; ++i)
-      dst.copyCoeff(i, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, LinearTraversal, CompleteUnrolling, Version>
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    assign_LinearTraversal_CompleteUnrolling<Derived1, Derived2, 0, Derived1::SizeAtCompileTime>
-      ::run(dst, src);
-  }
-};
-
-/**************************
-*** Inner vectorization ***
-**************************/
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, InnerVectorizedTraversal, NoUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    const Index innerSize = dst.innerSize();
-    const Index outerSize = dst.outerSize();
-    const Index packetSize = packet_traits<typename Derived1::Scalar>::size;
-    for(Index outer = 0; outer < outerSize; ++outer)
-      for(Index inner = 0; inner < innerSize; inner+=packetSize)
-        dst.template copyPacketByOuterInner<Derived2, Aligned, Aligned>(outer, inner, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, InnerVectorizedTraversal, CompleteUnrolling, Version>
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    assign_innervec_CompleteUnrolling<Derived1, Derived2, 0, Derived1::SizeAtCompileTime>
-      ::run(dst, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, InnerVectorizedTraversal, InnerUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    const Index outerSize = dst.outerSize();
-    for(Index outer = 0; outer < outerSize; ++outer)
-      assign_innervec_InnerUnrolling<Derived1, Derived2, 0, Derived1::InnerSizeAtCompileTime>
-        ::run(dst, src, outer);
-  }
-};
-
-/***************************
-*** Linear vectorization ***
-***************************/
-
-template <bool IsAligned = false>
-struct unaligned_assign_impl
-{
-  template <typename Derived, typename OtherDerived>
-  static EIGEN_STRONG_INLINE void run(const Derived&, OtherDerived&, typename Derived::Index, typename Derived::Index) {}
-};
-
-template <>
-struct unaligned_assign_impl<false>
-{
-  // MSVC must not inline this functions. If it does, it fails to optimize the
-  // packet access path.
-#ifdef _MSC_VER
-  template <typename Derived, typename OtherDerived>
-  static EIGEN_DONT_INLINE void run(const Derived& src, OtherDerived& dst, typename Derived::Index start, typename Derived::Index end)
-#else
-  template <typename Derived, typename OtherDerived>
-  static EIGEN_STRONG_INLINE void run(const Derived& src, OtherDerived& dst, typename Derived::Index start, typename Derived::Index end)
-#endif
-  {
-    for (typename Derived::Index index = start; index < end; ++index)
-      dst.copyCoeff(index, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, LinearVectorizedTraversal, NoUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    const Index size = dst.size();
-    typedef packet_traits<typename Derived1::Scalar> PacketTraits;
-    enum {
-      packetSize = PacketTraits::size,
-      dstAlignment = PacketTraits::AlignedOnScalar ? Aligned : int(assign_traits<Derived1,Derived2>::DstIsAligned) ,
-      srcAlignment = assign_traits<Derived1,Derived2>::JointAlignment
-    };
-    const Index alignedStart = assign_traits<Derived1,Derived2>::DstIsAligned ? 0
-                             : internal::first_aligned(&dst.coeffRef(0), size);
-    const Index alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;
-
-    unaligned_assign_impl<assign_traits<Derived1,Derived2>::DstIsAligned!=0>::run(src,dst,0,alignedStart);
-
-    for(Index index = alignedStart; index < alignedEnd; index += packetSize)
-    {
-      dst.template copyPacket<Derived2, dstAlignment, srcAlignment>(index, src);
-    }
-
-    unaligned_assign_impl<>::run(src,dst,alignedEnd,size);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, LinearVectorizedTraversal, CompleteUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    enum { size = Derived1::SizeAtCompileTime,
-           packetSize = packet_traits<typename Derived1::Scalar>::size,
-           alignedSize = (size/packetSize)*packetSize };
-
-    assign_innervec_CompleteUnrolling<Derived1, Derived2, 0, alignedSize>::run(dst, src);
-    assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, alignedSize, size>::run(dst, src);
-  }
-};
-
-/**************************
-*** Slice vectorization ***
-***************************/
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, SliceVectorizedTraversal, NoUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    typedef packet_traits<typename Derived1::Scalar> PacketTraits;
-    enum {
-      packetSize = PacketTraits::size,
-      alignable = PacketTraits::AlignedOnScalar,
-      dstAlignment = alignable ? Aligned : int(assign_traits<Derived1,Derived2>::DstIsAligned) ,
-      srcAlignment = assign_traits<Derived1,Derived2>::JointAlignment
-    };
-    const Index packetAlignedMask = packetSize - 1;
-    const Index innerSize = dst.innerSize();
-    const Index outerSize = dst.outerSize();
-    const Index alignedStep = alignable ? (packetSize - dst.outerStride() % packetSize) & packetAlignedMask : 0;
-    Index alignedStart = ((!alignable) || assign_traits<Derived1,Derived2>::DstIsAligned) ? 0
-                       : internal::first_aligned(&dst.coeffRef(0,0), innerSize);
-
-    for(Index outer = 0; outer < outerSize; ++outer)
-    {
-      const Index alignedEnd = alignedStart + ((innerSize-alignedStart) & ~packetAlignedMask);
-      // do the non-vectorizable part of the assignment
-      for(Index inner = 0; inner<alignedStart ; ++inner)
-        dst.copyCoeffByOuterInner(outer, inner, src);
-
-      // do the vectorizable part of the assignment
-      for(Index inner = alignedStart; inner<alignedEnd; inner+=packetSize)
-        dst.template copyPacketByOuterInner<Derived2, dstAlignment, Unaligned>(outer, inner, src);
-
-      // do the non-vectorizable part of the assignment
-      for(Index inner = alignedEnd; inner<innerSize ; ++inner)
-        dst.copyCoeffByOuterInner(outer, inner, src);
-
-      alignedStart = std::min<Index>((alignedStart+alignedStep)%packetSize, innerSize);
-    }
-  }
-};
-
-} // end namespace internal
-
-/***************************************************************************
-* Part 4 : implementation of DenseBase methods
-***************************************************************************/
-
 template<typename Derived>
 template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
@@ -492,90 +27,62 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
   EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Derived,OtherDerived)
   EIGEN_STATIC_ASSERT(SameType,YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
 
-#ifdef EIGEN_DEBUG_ASSIGN
-  internal::assign_traits<Derived, OtherDerived>::debug();
-#endif
   eigen_assert(rows() == other.rows() && cols() == other.cols());
-  internal::assign_impl<Derived, OtherDerived, int(SameType) ? int(internal::assign_traits<Derived, OtherDerived>::Traversal)
-                                                       : int(InvalidTraversal)>::run(derived(),other.derived());
-#ifndef EIGEN_NO_DEBUG
-  checkTransposeAliasing(other.derived());
-#endif
+  internal::call_assignment_no_alias(derived(),other.derived());
+  
   return derived();
 }
 
-namespace internal {
-
-template<typename Derived, typename OtherDerived,
-         bool EvalBeforeAssigning = (int(internal::traits<OtherDerived>::Flags) & EvalBeforeAssigningBit) != 0,
-         bool NeedToTranspose = ((int(Derived::RowsAtCompileTime) == 1 && int(OtherDerived::ColsAtCompileTime) == 1)
-                              |   // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&".
-                                  // revert to || as soon as not needed anymore.
-                                  (int(Derived::ColsAtCompileTime) == 1 && int(OtherDerived::RowsAtCompileTime) == 1))
-                              && int(Derived::SizeAtCompileTime) != 1>
-struct assign_selector;
-
-template<typename Derived, typename OtherDerived>
-struct assign_selector<Derived,OtherDerived,false,false> {
-  static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.derived()); }
-  template<typename ActualDerived, typename ActualOtherDerived>
-  static EIGEN_STRONG_INLINE Derived& evalTo(ActualDerived& dst, const ActualOtherDerived& other) { other.evalTo(dst); return dst; }
-};
-template<typename Derived, typename OtherDerived>
-struct assign_selector<Derived,OtherDerived,true,false> {
-  static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.eval()); }
-};
-template<typename Derived, typename OtherDerived>
-struct assign_selector<Derived,OtherDerived,false,true> {
-  static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose()); }
-  template<typename ActualDerived, typename ActualOtherDerived>
-  static EIGEN_STRONG_INLINE Derived& evalTo(ActualDerived& dst, const ActualOtherDerived& other) { Transpose<ActualDerived> dstTrans(dst); other.evalTo(dstTrans); return dst; }
-};
-template<typename Derived, typename OtherDerived>
-struct assign_selector<Derived,OtherDerived,true,true> {
-  static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose().eval()); }
-};
-
-} // end namespace internal
-
 template<typename Derived>
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator=(const DenseBase<OtherDerived>& other)
 {
-  return internal::assign_selector<Derived,OtherDerived>::run(derived(), other.derived());
+  internal::call_assignment(derived(), other.derived());
+  return derived();
 }
 
 template<typename Derived>
+EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator=(const DenseBase& other)
 {
-  return internal::assign_selector<Derived,Derived>::run(derived(), other.derived());
+  internal::call_assignment(derived(), other.derived());
+  return derived();
 }
 
 template<typename Derived>
+EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const MatrixBase& other)
 {
-  return internal::assign_selector<Derived,Derived>::run(derived(), other.derived());
+  internal::call_assignment(derived(), other.derived());
+  return derived();
 }
 
 template<typename Derived>
 template <typename OtherDerived>
+EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const DenseBase<OtherDerived>& other)
 {
-  return internal::assign_selector<Derived,OtherDerived>::run(derived(), other.derived());
+  internal::call_assignment(derived(), other.derived());
+  return derived();
 }
 
 template<typename Derived>
 template <typename OtherDerived>
+EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const EigenBase<OtherDerived>& other)
 {
-  return internal::assign_selector<Derived,OtherDerived,false>::evalTo(derived(), other.derived());
+  internal::call_assignment(derived(), other.derived());
+  return derived();
 }
 
 template<typename Derived>
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
 {
-  return internal::assign_selector<Derived,OtherDerived,false>::evalTo(derived(), other.derived());
+  other.derived().evalTo(derived());
+  return derived();
 }
 
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Core/AssignEvaluator.h b/nuparu/include/Eigen/src/Core/AssignEvaluator.h
new file mode 100644
index 00000000..9dfffbcc
--- /dev/null
+++ b/nuparu/include/Eigen/src/Core/AssignEvaluator.h
@@ -0,0 +1,810 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2012 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ASSIGN_EVALUATOR_H
+#define EIGEN_ASSIGN_EVALUATOR_H
+
+namespace Eigen {
+
+// This implementation is based on Assign.h
+
+namespace internal {
+  
+/***************************************************************************
+* Part 1 : the logic deciding a strategy for traversal and unrolling       *
+***************************************************************************/
+
+// copy_using_evaluator_traits is based on assign_traits. From the flags, alignments and sizes of the
+// two evaluators it derives, at compile time, the Traversal and Unrolling used by dense_assignment_loop.
+template <typename DstEvaluator, typename SrcEvaluator, typename AssignFunc>
+struct copy_using_evaluator_traits
+{
+  typedef typename DstEvaluator::XprType Dst;
+  typedef typename Dst::Scalar DstScalar;
+  // TODO distinguish between linear traversal and inner-traversals
+  typedef typename find_best_packet<DstScalar,Dst::SizeAtCompileTime>::type PacketType; 
+  // Evaluator flags, and the alignment (from unpacket_traits) that PacketType requires.
+  enum {
+    DstFlags = DstEvaluator::Flags,
+    SrcFlags = SrcEvaluator::Flags,
+    RequiredAlignment = unpacket_traits<PacketType>::alignment
+  };
+  
+public:
+  enum {
+    DstAlignment = DstEvaluator::Alignment,
+    SrcAlignment = SrcEvaluator::Alignment,
+    DstHasDirectAccess = DstFlags & DirectAccessBit,
+    JointAlignment = EIGEN_PLAIN_ENUM_MIN(DstAlignment,SrcAlignment)
+  };
+
+private:
+  enum {
+    InnerSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::SizeAtCompileTime)
+              : int(DstFlags)&RowMajorBit ? int(Dst::ColsAtCompileTime)
+              : int(Dst::RowsAtCompileTime),
+    InnerMaxSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::MaxSizeAtCompileTime)
+              : int(DstFlags)&RowMajorBit ? int(Dst::MaxColsAtCompileTime)
+              : int(Dst::MaxRowsAtCompileTime),
+    OuterStride = int(outer_stride_at_compile_time<Dst>::ret),
+    MaxSizeAtCompileTime = Dst::SizeAtCompileTime,
+    PacketSize = unpacket_traits<PacketType>::size
+  };
+  // Feasibility of each strategy; the public Traversal enum below picks the first one that holds.
+  enum {
+    DstIsRowMajor = DstFlags&RowMajorBit,
+    SrcIsRowMajor = SrcFlags&RowMajorBit,
+    StorageOrdersAgree = (int(DstIsRowMajor) == int(SrcIsRowMajor)),
+    MightVectorize = StorageOrdersAgree
+                  && (int(DstFlags) & int(SrcFlags) & ActualPacketAccessBit)
+                  && (functor_traits<AssignFunc>::PacketAccess),
+    MayInnerVectorize  = MightVectorize
+                       && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0
+                       && int(OuterStride)!=Dynamic && int(OuterStride)%int(PacketSize)==0
+                       && int(JointAlignment)>=int(RequiredAlignment),
+    MayLinearize = StorageOrdersAgree && (int(DstFlags) & int(SrcFlags) & LinearAccessBit),
+    MayLinearVectorize = MightVectorize && MayLinearize && DstHasDirectAccess
+                       && ((int(DstAlignment)>=int(RequiredAlignment)) || MaxSizeAtCompileTime == Dynamic),
+      /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
+         so it's only good for large enough sizes. */
+    MaySliceVectorize  = MightVectorize && DstHasDirectAccess
+                       && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=3*PacketSize)
+      /* slice vectorization can be slow, so we only want it if the slices are big, which is
+         indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block
+         in a fixed-size matrix */
+  };
+
+public:
+  enum {
+    Traversal = int(MayInnerVectorize)   ? int(InnerVectorizedTraversal)
+              : int(MayLinearVectorize)  ? int(LinearVectorizedTraversal)
+              : int(MaySliceVectorize)   ? int(SliceVectorizedTraversal)
+              : int(MayLinearize)        ? int(LinearTraversal)
+                                         : int(DefaultTraversal),
+    Vectorized = int(Traversal) == InnerVectorizedTraversal
+              || int(Traversal) == LinearVectorizedTraversal
+              || int(Traversal) == SliceVectorizedTraversal
+  };
+  // Unrolling is allowed when (element count) * SrcEvaluator::CoeffReadCost fits in UnrollingLimit.
+private:
+  enum {
+    UnrollingLimit      = EIGEN_UNROLLING_LIMIT * (Vectorized ? int(PacketSize) : 1),
+    MayUnrollCompletely = int(Dst::SizeAtCompileTime) != Dynamic
+                       && int(Dst::SizeAtCompileTime) * int(SrcEvaluator::CoeffReadCost) <= int(UnrollingLimit),
+    MayUnrollInner      = int(InnerSize) != Dynamic
+                       && int(InnerSize) * int(SrcEvaluator::CoeffReadCost) <= int(UnrollingLimit)
+  };
+
+public:
+  enum {
+    Unrolling = (int(Traversal) == int(InnerVectorizedTraversal) || int(Traversal) == int(DefaultTraversal))
+                ? (
+                    int(MayUnrollCompletely) ? int(CompleteUnrolling)
+                  : int(MayUnrollInner)      ? int(InnerUnrolling)
+                                             : int(NoUnrolling)
+                  )
+              : int(Traversal) == int(LinearVectorizedTraversal)
+                ? ( bool(MayUnrollCompletely) && (int(DstAlignment)>=int(RequiredAlignment)) ? int(CompleteUnrolling)
+                                                                                             : int(NoUnrolling) )
+              : int(Traversal) == int(LinearTraversal)
+                ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) 
+                                              : int(NoUnrolling) )
+              : int(NoUnrolling)
+  };
+  // Prints every intermediate decision to std::cerr; compiled only when EIGEN_DEBUG_ASSIGN is defined.
+#ifdef EIGEN_DEBUG_ASSIGN
+  static void debug()
+  {
+    std::cerr << "DstXpr: " << typeid(typename DstEvaluator::XprType).name() << std::endl;
+    std::cerr << "SrcXpr: " << typeid(typename SrcEvaluator::XprType).name() << std::endl;
+    std::cerr.setf(std::ios::hex, std::ios::basefield);
+    std::cerr << "DstFlags" << " = " << DstFlags << " (" << demangle_flags(DstFlags) << " )" << std::endl;
+    std::cerr << "SrcFlags" << " = " << SrcFlags << " (" << demangle_flags(SrcFlags) << " )" << std::endl;
+    std::cerr.unsetf(std::ios::hex);
+    EIGEN_DEBUG_VAR(DstAlignment)
+    EIGEN_DEBUG_VAR(SrcAlignment)
+    EIGEN_DEBUG_VAR(RequiredAlignment)
+    EIGEN_DEBUG_VAR(JointAlignment)
+    EIGEN_DEBUG_VAR(InnerSize)
+    EIGEN_DEBUG_VAR(InnerMaxSize)
+    EIGEN_DEBUG_VAR(PacketSize)
+    EIGEN_DEBUG_VAR(StorageOrdersAgree)
+    EIGEN_DEBUG_VAR(MightVectorize)
+    EIGEN_DEBUG_VAR(MayLinearize)
+    EIGEN_DEBUG_VAR(MayInnerVectorize)
+    EIGEN_DEBUG_VAR(MayLinearVectorize)
+    EIGEN_DEBUG_VAR(MaySliceVectorize)
+    std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl;
+    EIGEN_DEBUG_VAR(UnrollingLimit)
+    EIGEN_DEBUG_VAR(MayUnrollCompletely)
+    EIGEN_DEBUG_VAR(MayUnrollInner)
+    std::cerr << "Unrolling" << " = " << Unrolling << " (" << demangle_unrolling(Unrolling) << ")" << std::endl;
+    std::cerr << std::endl;
+  }
+#endif
+};
+
+/***************************************************************************
+* Part 2 : meta-unrollers
+***************************************************************************/
+
+/************************
+*** Default traversal ***
+************************/
+
+template<typename Kernel, int Index, int Stop>
+struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling
+{
+  // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
+  typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
+  typedef typename DstEvaluatorType::XprType DstXprType;
+  // Split the flat compile-time Index into (outer, inner) using the destination's inner size.
+  enum {
+    outer = Index / DstXprType::InnerSizeAtCompileTime,
+    inner = Index % DstXprType::InnerSizeAtCompileTime
+  };
+  // Assign coefficient number Index, then recurse on Index+1 until Stop is reached.
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    kernel.assignCoeffByOuterInner(outer, inner);
+    copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Index+1, Stop>::run(kernel);
+  }
+};
+// Recursion terminator (Index == Stop): nothing left to assign.
+template<typename Kernel, int Stop>
+struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Stop, Stop>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
+};
+
+template<typename Kernel, int Index_, int Stop>
+struct copy_using_evaluator_DefaultTraversal_InnerUnrolling // unrolled copy of inner indices [Index_, Stop) for one outer index
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, Index outer)
+  {
+    kernel.assignCoeffByOuterInner(outer, Index_);
+    copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Index_+1, Stop>::run(kernel, outer);
+  }
+};
+// Recursion terminator (Index_ == Stop): nothing left to assign.
+template<typename Kernel, int Stop>
+struct copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Stop, Stop>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&, Index) { }
+};
+
+/***********************
+*** Linear traversal ***
+***********************/
+
+template<typename Kernel, int Index, int Stop>
+struct copy_using_evaluator_LinearTraversal_CompleteUnrolling // unrolled copy over linear indices [Index, Stop)
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel)
+  {
+    kernel.assignCoeff(Index);
+    copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Index+1, Stop>::run(kernel);
+  }
+};
+// Recursion terminator (Index == Stop): nothing left to assign.
+template<typename Kernel, int Stop>
+struct copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Stop, Stop>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
+};
+
+/**************************
+*** Inner vectorization ***
+**************************/
+
+template<typename Kernel, int Index, int Stop>
+struct copy_using_evaluator_innervec_CompleteUnrolling
+{
+  // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
+  typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
+  typedef typename DstEvaluatorType::XprType DstXprType;
+  typedef typename Kernel::PacketType PacketType;
+  // (outer, inner) of the packet starting at flat Index; JointAlignment is used as the load mode.
+  enum {
+    outer = Index / DstXprType::InnerSizeAtCompileTime,
+    inner = Index % DstXprType::InnerSizeAtCompileTime,
+    JointAlignment = Kernel::AssignmentTraits::JointAlignment
+  };
+  // Assign one packet, then recurse with Index advanced by the size of PacketType.
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    kernel.template assignPacketByOuterInner<Aligned, JointAlignment, PacketType>(outer, inner);
+    enum { NextIndex = Index + unpacket_traits<PacketType>::size };
+    copy_using_evaluator_innervec_CompleteUnrolling<Kernel, NextIndex, Stop>::run(kernel);
+  }
+};
+// Recursion terminator (Index == Stop): nothing left to assign.
+template<typename Kernel, int Stop>
+struct copy_using_evaluator_innervec_CompleteUnrolling<Kernel, Stop, Stop>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
+};
+
+template<typename Kernel, int Index_, int Stop>
+struct copy_using_evaluator_innervec_InnerUnrolling // unrolled packet copy of inner indices [Index_, Stop) for one outer index
+{
+  typedef typename Kernel::PacketType PacketType;
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, Index outer)
+  {
+    kernel.template assignPacketByOuterInner<Aligned, Aligned, PacketType>(outer, Index_);
+    enum { NextIndex = Index_ + unpacket_traits<PacketType>::size };
+    copy_using_evaluator_innervec_InnerUnrolling<Kernel, NextIndex, Stop>::run(kernel, outer);
+  }
+};
+// Recursion terminator (Index_ == Stop): nothing left to assign.
+template<typename Kernel, int Stop>
+struct copy_using_evaluator_innervec_InnerUnrolling<Kernel, Stop, Stop>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &, Index) { }
+};
+
+/***************************************************************************
+* Part 3 : implementation of all cases
+***************************************************************************/
+
+// dense_assignment_loop is based on assign_impl.
+// It is specialized per (Traversal, Unrolling) pair; both default to the kernel's AssignmentTraits.
+template<typename Kernel,
+         int Traversal = Kernel::AssignmentTraits::Traversal,
+         int Unrolling = Kernel::AssignmentTraits::Unrolling>
+struct dense_assignment_loop;
+
+/************************
+*** Default traversal ***
+************************/
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, DefaultTraversal, NoUnrolling>
+{
+  // Fallback strategy: a plain double loop assigning one coefficient at a time.
+  // Both bounds are re-read from the kernel on every iteration, outer index slowest.
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    for(Index o = 0; o < kernel.outerSize(); ++o)
+      for(Index i = 0; i < kernel.innerSize(); ++i)
+        kernel.assignCoeffByOuterInner(o, i);
+  }
+};
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, DefaultTraversal, CompleteUnrolling>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    // Expands at compile time into one coefficient assignment per destination element.
+    copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, 0, Kernel::DstEvaluatorType::XprType::SizeAtCompileTime>::run(kernel);
+  }
+};
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, DefaultTraversal, InnerUnrolling>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    // Run-time loop over the outer dimension; each inner run is unrolled at compile time.
+    typedef copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, 0, Kernel::DstEvaluatorType::XprType::InnerSizeAtCompileTime> InnerCopy;
+    const Index nOuter = kernel.outerSize();
+    for(Index o = 0; o < nOuter; ++o)
+      InnerCopy::run(kernel, o);
+  }
+};
+
+/***************************
+*** Linear vectorization ***
+***************************/
+
+
+// The goal of unaligned_dense_assignment_loop is simply to factorize the handling
+// of the non-vectorizable beginning and ending parts, i.e. the coefficients of a linear
+// index range [start, end) that are copied one by one rather than by packets.
+template <bool IsAligned = false>
+struct unaligned_dense_assignment_loop
+{
+  // Primary template: only reached with IsAligned == true (false is specialized below), so do nothing.
+  template <typename Kernel>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&, Index, Index) {}
+};
+
+template <>
+struct unaligned_dense_assignment_loop<false> // scalar copy of the linear index range [start, end)
+{
+  // MSVC must not inline this function. If it does, it fails to optimize the
+  // packet access path.
+  // FIXME check which version exhibits this issue
+#if EIGEN_COMP_MSVC
+  template <typename Kernel>
+  static EIGEN_DONT_INLINE void run(Kernel &kernel,
+                                    Index start,
+                                    Index end)
+#else
+  template <typename Kernel>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel,
+                                      Index start,
+                                      Index end)
+#endif
+  {
+    for (Index index = start; index < end; ++index)
+      kernel.assignCoeff(index);
+  }
+};
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    const Index size = kernel.size();
+    typedef typename Kernel::Scalar Scalar;
+    typedef typename Kernel::PacketType PacketType;
+    enum {
+      requestedAlignment = Kernel::AssignmentTraits::RequiredAlignment,
+      packetSize = unpacket_traits<PacketType>::size,
+      dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment),
+      dstAlignment = packet_traits<Scalar>::AlignedOnScalar ? int(requestedAlignment)
+                                                            : int(Kernel::AssignmentTraits::DstAlignment),
+      srcAlignment = Kernel::AssignmentTraits::JointAlignment
+    };
+    const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned<requestedAlignment>(&kernel.dstEvaluator().coeffRef(0), size);
+    const Index alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;
+    // Scalar prologue over [0, alignedStart); a no-op when the destination is statically aligned.
+    unaligned_dense_assignment_loop<dstIsAligned!=0>::run(kernel, 0, alignedStart);
+    // Packet-wise copy over [alignedStart, alignedEnd), a whole number of packets.
+    for(Index index = alignedStart; index < alignedEnd; index += packetSize)
+      kernel.template assignPacket<dstAlignment, srcAlignment, PacketType>(index);
+    // Scalar epilogue over [alignedEnd, size).
+    unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size);
+  }
+};
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, CompleteUnrolling>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
+    // packetSize must be the size of Kernel::PacketType, the packet the unroller below steps by.
+    enum { size = DstXprType::SizeAtCompileTime,
+           packetSize = unpacket_traits<typename Kernel::PacketType>::size,
+           alignedSize = (size/packetSize)*packetSize }; // largest multiple of packetSize not exceeding size
+
+    copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, alignedSize>::run(kernel);          // packets: [0, alignedSize)
+    copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, alignedSize, size>::run(kernel); // scalars: [alignedSize, size)
+  }
+};
+
+/**************************
+*** Inner vectorization ***
+**************************/
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, NoUnrolling>
+{
+  typedef typename Kernel::PacketType PacketType;
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    // Every inner run is copied packet by packet; no scalar prologue or epilogue is emitted.
+    const Index step = unpacket_traits<PacketType>::size;
+    const Index nInner = kernel.innerSize(), nOuter = kernel.outerSize();
+    for(Index o = 0; o < nOuter; ++o)
+      for(Index i = 0; i < nInner; i += step)
+        kernel.template assignPacketByOuterInner<Aligned, Aligned, PacketType>(o, i);
+  }
+};
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, CompleteUnrolling>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    // Expands at compile time into one packet assignment per packet of the destination.
+    copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, Kernel::DstEvaluatorType::XprType::SizeAtCompileTime>::run(kernel);
+  }
+};
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, InnerUnrolling>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    typedef copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, Kernel::DstEvaluatorType::XprType::InnerSizeAtCompileTime> InnerCopy;
+    const Index nOuter = kernel.outerSize();
+    for(Index o = 0; o < nOuter; ++o) // only the outer loop runs at run time; the inner packets are unrolled
+      InnerCopy::run(kernel, o);
+  }
+};
+
+/***********************
+*** Linear traversal ***
+***********************/
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, LinearTraversal, NoUnrolling>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    // One scalar assignment per linear index; the size is read once before the loop.
+    for(Index idx = 0, count = kernel.size(); idx < count; ++idx)
+      kernel.assignCoeff(idx);
+  }
+};
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, LinearTraversal, CompleteUnrolling>
+{
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
+  {
+    // Expands at compile time into one assignCoeff(index) call per linear index.
+    copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, 0, Kernel::DstEvaluatorType::XprType::SizeAtCompileTime>::run(kernel);
+  }
+};
+
+/**************************
+*** Slice vectorization ***
+***************************/
+
+template<typename Kernel>
+struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
+{
+  EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel)
+  {
+    typedef typename Kernel::Scalar Scalar;
+    typedef typename Kernel::PacketType PacketType;
+    enum {
+      packetSize = unpacket_traits<PacketType>::size,
+      requestedAlignment = int(Kernel::AssignmentTraits::RequiredAlignment),
+      alignable = packet_traits<Scalar>::AlignedOnScalar || int(Kernel::AssignmentTraits::DstAlignment)>=sizeof(Scalar),
+      dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment),
+      dstAlignment = alignable ? int(requestedAlignment)
+                               : int(Kernel::AssignmentTraits::DstAlignment)
+    };
+    const Scalar *dst_ptr = &kernel.dstEvaluator().coeffRef(0,0);
+    if((!bool(dstIsAligned)) && (size_t(dst_ptr) % sizeof(Scalar))>0)
+    {
+      // the pointer is not aligned on a scalar boundary, so alignment is not possible: fall back to the scalar loop
+      return dense_assignment_loop<Kernel,DefaultTraversal,NoUnrolling>::run(kernel);
+    }
+    const Index packetAlignedMask = packetSize - 1;
+    const Index innerSize = kernel.innerSize();
+    const Index outerSize = kernel.outerSize();
+    const Index alignedStep = alignable ? (packetSize - kernel.outerStride() % packetSize) & packetAlignedMask : 0;
+    Index alignedStart = ((!alignable) || bool(dstIsAligned)) ? 0 : internal::first_aligned<requestedAlignment>(dst_ptr, innerSize);
+
+    for(Index outer = 0; outer < outerSize; ++outer)
+    {
+      const Index alignedEnd = alignedStart + ((innerSize-alignedStart) & ~packetAlignedMask);
+      // scalar prologue: the non-vectorizable part before alignedStart
+      for(Index inner = 0; inner<alignedStart ; ++inner)
+        kernel.assignCoeffByOuterInner(outer, inner);
+
+      // do the vectorizable part of the assignment
+      for(Index inner = alignedStart; inner<alignedEnd; inner+=packetSize)
+        kernel.template assignPacketByOuterInner<dstAlignment, Unaligned, PacketType>(outer, inner);
+
+      // scalar epilogue: the non-vectorizable part from alignedEnd on
+      for(Index inner = alignedEnd; inner<innerSize ; ++inner)
+        kernel.assignCoeffByOuterInner(outer, inner);
+      // advance the aligned start by alignedStep (modulo packetSize, capped at innerSize) for the next slice
+      alignedStart = std::min<Index>((alignedStart+alignedStep)%packetSize, innerSize);
+    }
+  }
+};
+
+/***************************************************************************
+* Part 4 : Generic dense assignment kernel
+***************************************************************************/
+
+// This class generalizes the assignment of a coefficient (or packet) from one dense evaluator
+// to another dense writable evaluator.
+// It is parametrized by the two evaluators, and the actual assignment functor.
+// This abstraction level keeps the evaluation loops as simple and as generic as possible.
+// One can customize the assignment using this generic dense_assignment_kernel with different
+// functors, or by completely overloading it, by-passing a functor.
+template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor, int Version = Specialized>
+class generic_dense_assignment_kernel
+{
+protected:
+  typedef typename DstEvaluatorTypeT::XprType DstXprType;
+  typedef typename SrcEvaluatorTypeT::XprType SrcXprType;
+public:
+  
+  typedef DstEvaluatorTypeT DstEvaluatorType;
+  typedef SrcEvaluatorTypeT SrcEvaluatorType;
+  typedef typename DstEvaluatorType::Scalar Scalar;
+  typedef copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor> AssignmentTraits;
+  typedef typename AssignmentTraits::PacketType PacketType;
+  
+  /// Binds both evaluators, the functor and the destination expression; all four are held by reference.
+  EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
+    : m_dst(dst), m_src(src), m_functor(func), m_dstExpr(dstExpr)
+  {
+    #ifdef EIGEN_DEBUG_ASSIGN
+    AssignmentTraits::debug();
+    #endif
+  }
+  // Size and stride queries are all forwarded to the destination expression.
+  EIGEN_DEVICE_FUNC Index size() const        { return m_dstExpr.size(); }
+  EIGEN_DEVICE_FUNC Index innerSize() const   { return m_dstExpr.innerSize(); }
+  EIGEN_DEVICE_FUNC Index outerSize() const   { return m_dstExpr.outerSize(); }
+  EIGEN_DEVICE_FUNC Index rows() const        { return m_dstExpr.rows(); }
+  EIGEN_DEVICE_FUNC Index cols() const        { return m_dstExpr.cols(); }
+  EIGEN_DEVICE_FUNC Index outerStride() const { return m_dstExpr.outerStride(); }
+  
+  EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() { return m_dst; }
+  EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const { return m_src; }
+  
+  /// Assign src(row,col) to dst(row,col) through the assignment functor.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index row, Index col)
+  {
+    m_functor.assignCoeff(m_dst.coeffRef(row,col), m_src.coeff(row,col));
+  }
+  
+  /// \sa assignCoeff(Index,Index)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index index)
+  {
+    m_functor.assignCoeff(m_dst.coeffRef(index), m_src.coeff(index));
+  }
+  
+  /// \sa assignCoeff(Index,Index)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeffByOuterInner(Index outer, Index inner)
+  {
+    Index row = rowIndexByOuterInner(outer, inner); 
+    Index col = colIndexByOuterInner(outer, inner); 
+    assignCoeff(row, col);
+  }
+  
+  /// Packet counterparts of assignCoeff: StoreMode goes to the functor's store, LoadMode to the source packet load.
+  template<int StoreMode, int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index row, Index col)
+  {
+    m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(row,col), m_src.template packet<LoadMode,PacketType>(row,col));
+  }
+  
+  template<int StoreMode, int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index index)
+  {
+    m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(index), m_src.template packet<LoadMode,PacketType>(index));
+  }
+  
+  template<int StoreMode, int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner)
+  {
+    Index row = rowIndexByOuterInner(outer, inner); 
+    Index col = colIndexByOuterInner(outer, inner);
+    assignPacket<StoreMode,LoadMode,PacketType>(row, col);
+  }
+  /// Row of (outer, inner): 0 for a single-row destination, inner for a single-column one, else outer if row-major, else inner.
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner)
+  {
+    typedef typename DstEvaluatorType::ExpressionTraits Traits;
+    return int(Traits::RowsAtCompileTime) == 1 ? 0
+      : int(Traits::ColsAtCompileTime) == 1 ? inner
+      : int(DstEvaluatorType::Flags)&RowMajorBit ? outer
+      : inner;
+  }
+  /// Column of (outer, inner): 0 for a single-column destination, inner for a single-row one, else inner if row-major, else outer.
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index colIndexByOuterInner(Index outer, Index inner)
+  {
+    typedef typename DstEvaluatorType::ExpressionTraits Traits;
+    return int(Traits::ColsAtCompileTime) == 1 ? 0
+      : int(Traits::RowsAtCompileTime) == 1 ? inner
+      : int(DstEvaluatorType::Flags)&RowMajorBit ? inner
+      : outer;
+  }
+  
+protected:
+  DstEvaluatorType& m_dst;
+  const SrcEvaluatorType& m_src;
+  const Functor &m_functor;
+  // TODO find a way to avoid the need for the original expression
+  DstXprType& m_dstExpr;
+};
+
+/***************************************************************************
+* Part 5 : Entry point for dense rectangular assignment
+***************************************************************************/
+
+template<typename DstXprType, typename SrcXprType, typename Functor>
+EIGEN_DEVICE_FUNC void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src, const Functor &func)
+{
+  // Both expressions must already have the same dimensions: nothing is resized here.
+  eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+  // Build the evaluators (destination first), then bind them with the functor into a kernel.
+  typedef evaluator<DstXprType> DstEval;
+  typedef evaluator<SrcXprType> SrcEval;
+  typedef generic_dense_assignment_kernel<DstEval,SrcEval,Functor> AssignKernel;
+  DstEval dstEval(dst);
+  SrcEval srcEval(src);
+  AssignKernel assignKernel(dstEval, srcEval, func, dst.const_cast_derived());
+
+  // The traversal/unrolling strategy is selected from the kernel's AssignmentTraits.
+  dense_assignment_loop<AssignKernel>::run(assignKernel);
+}
+
+template<typename DstXprType, typename SrcXprType>
+EIGEN_DEVICE_FUNC void call_dense_assignment_loop(const DstXprType& dst, const SrcXprType& src)
+{ // Convenience overload: forwards with a default-constructed assign_op on the destination's Scalar.
+  call_dense_assignment_loop(dst, src, internal::assign_op<typename DstXprType::Scalar>());
+}
+
+/***************************************************************************
+* Part 6 : Generic assignment
+***************************************************************************/
+
+// Based on the respective shapes of the destination and source,
+// the class AssignmentKind determines the kind of assignment mechanism.
+// AssignmentKind must define a Kind typedef.
+template<typename DstShape, typename SrcShape> struct AssignmentKind;
+
+// Assignment kinds defined in this file:
+struct Dense2Dense {};
+struct EigenBase2EigenBase {};
+
+template<typename,typename> struct AssignmentKind { typedef EigenBase2EigenBase Kind; }; // default for any shape pair
+template<> struct AssignmentKind<DenseShape,DenseShape> { typedef Dense2Dense Kind; };   // dense <- dense
+    
+// This is the main assignment class; it is specialized on Kind, which defaults to the AssignmentKind of both shapes.
+template< typename DstXprType, typename SrcXprType, typename Functor,
+          typename Kind = typename AssignmentKind< typename evaluator_traits<DstXprType>::Shape , typename evaluator_traits<SrcXprType>::Shape >::Kind,
+          typename Scalar = typename DstXprType::Scalar>
+struct Assignment;
+
+
+// The only purpose of this call_assignment() function is to deal with noalias() / AssumeAliasing and automatic transposition.
+// Indeed, I (Gael) think that this concept of AssumeAliasing was a mistake, and it makes things quite complicated.
+// So this intermediate function removes everything related to AssumeAliasing such that Assignment
+// does not have to bother about these annoying details.
+// The two overloads below (non-const and const destination) forward with a default assign_op functor.
+template<typename Dst, typename Src>
+EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src)
+{
+  call_assignment(dst, src, internal::assign_op<typename Dst::Scalar>());
+}
+template<typename Dst, typename Src>
+EIGEN_DEVICE_FUNC void call_assignment(const Dst& dst, const Src& src)
+{
+  call_assignment(dst, src, internal::assign_op<typename Dst::Scalar>());
+}
+                     
+// Deal with AssumeAliasing: when evaluator_traits<Src>::AssumeAliasing is 1, evaluate src into a plain temporary first.
+template<typename Dst, typename Src, typename Func>
+EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if<evaluator_traits<Src>::AssumeAliasing==1, void*>::type = 0)
+{
+  typename plain_matrix_type<Src>::type tmp(src);
+  call_assignment_no_alias(dst, tmp, func);
+}
+// When AssumeAliasing is 0, assign directly from the source expression, without a temporary.
+template<typename Dst, typename Src, typename Func>
+EIGEN_DEVICE_FUNC void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if<evaluator_traits<Src>::AssumeAliasing==0, void*>::type = 0)
+{
+  call_assignment_no_alias(dst, src, func);
+}
+
+// by-pass AssumeAliasing: a NoAlias wrapper is unwrapped with expression() and assigned without any temporary
+// When there is no aliasing, we require that 'dst' has been properly resized
+template<typename Dst, template <typename> class StorageBase, typename Src, typename Func>
+EIGEN_DEVICE_FUNC void call_assignment(NoAlias<Dst,StorageBase>& dst, const Src& src, const Func& func)
+{
+  call_assignment_no_alias(dst.expression(), src, func);
+}
+
+
+template<typename Dst, typename Src, typename Func>
+EIGEN_DEVICE_FUNC void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
+{
+  enum { // true when Dst has one row and Src one column at compile time (or vice versa), unless Dst is 1x1
+    NeedToTranspose = (    (int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1)
+                        || (int(Dst::ColsAtCompileTime) == 1 && int(Src::RowsAtCompileTime) == 1)
+                      ) && int(Dst::SizeAtCompileTime) != 1
+  };
+  // Destination dimensions: those of the source, swapped when a transposition is needed.
+  Index dstRows = NeedToTranspose ? src.cols() : src.rows();
+  Index dstCols = NeedToTranspose ? src.rows() : src.cols();
+  if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+    dst.resize(dstRows, dstCols);
+  // When transposing, the actual destination is a Transpose<Dst> built from dst; otherwise it is dst itself.
+  typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst>::type ActualDstTypeCleaned;
+  typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst&>::type ActualDstType;
+  ActualDstType actualDst(dst);
+  
+  // TODO check whether this is the right place to perform these checks:
+  EIGEN_STATIC_ASSERT_LVALUE(Dst)
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(ActualDstTypeCleaned,Src)
+  EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename ActualDstTypeCleaned::Scalar,typename Src::Scalar);
+  // Dispatch to the Assignment specialization selected for these expression types.
+  Assignment<ActualDstTypeCleaned,Src,Func>::run(actualDst, src, func);
+}
+template<typename Dst, typename Src>
+EIGEN_DEVICE_FUNC void call_assignment_no_alias(Dst& dst, const Src& src)
+{ // Convenience overload: forwards with a default-constructed assign_op on the destination's Scalar.
+  call_assignment_no_alias(dst, src, internal::assign_op<typename Dst::Scalar>());
+}
+
+template<typename Dst, typename Src, typename Func>
+EIGEN_DEVICE_FUNC void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src, const Func& func)
+{
+  // Give the destination the source's dimensions; resize() is only called on an actual mismatch.
+  const Index srcRows = src.rows(), srcCols = src.cols();
+  const bool sameShape = (dst.rows()==srcRows) && (dst.cols()==srcCols);
+  if(!sameShape)
+    dst.resize(srcRows, srcCols);
+  // TODO check whether this is the right place to perform these checks:
+  EIGEN_STATIC_ASSERT_LVALUE(Dst)
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst,Src)
+
+  Assignment<Dst,Src,Func>::run(dst, src, func);
+}
+template<typename Dst, typename Src>
+EIGEN_DEVICE_FUNC void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src)
+{ // Convenience overload: forwards with a default-constructed assign_op on the destination's Scalar.
+  call_assignment_no_alias_no_transpose(dst, src, internal::assign_op<typename Dst::Scalar>());
+}
+
+// forward declaration (called by the Dense2Dense assignment below unless EIGEN_NO_DEBUG is defined)
+template<typename Dst, typename Src> void check_for_aliasing(const Dst &dst, const Src &src);
+
+// Generic Dense to Dense assignment
+template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
+struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Scalar>
+{
+  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
+  {
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+    // The aliasing check is compiled out when EIGEN_NO_DEBUG is defined.
+#ifndef EIGEN_NO_DEBUG
+    internal::check_for_aliasing(dst, src);
+#endif
+    // dst and src are asserted above to have equal sizes; run the dense copy loop.
+    call_dense_assignment_loop(dst, src, func);
+  }
+};
+
+// Generic assignment through evalTo: src writes itself into dst; run() only accepts a plain assign_op, which it ignores.
+// TODO: not sure we have to keep that one, but it helps porting current code to new evaluator mechanism.
+template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
+struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Scalar>
+{
+  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/)
+  {
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+    src.evalTo(dst);
+  }
+};
+
+} // namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_ASSIGN_EVALUATOR_H
diff --git a/nuparu/include/Eigen/src/Core/Assign_MKL.h b/nuparu/include/Eigen/src/Core/Assign_MKL.h
index 7772951b..897187a3 100644
--- a/nuparu/include/Eigen/src/Core/Assign_MKL.h
+++ b/nuparu/include/Eigen/src/Core/Assign_MKL.h
@@ -1,6 +1,7 @@
 /*
  Copyright (c) 2011, Intel Corporation. All rights reserved.
-
+ Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+ 
  Redistribution and use in source and binary forms, with or without modification,
  are permitted provided that the following conditions are met:
 
@@ -37,17 +38,13 @@ namespace Eigen {
 
 namespace internal {
 
-template<typename Op> struct vml_call
-{ enum { IsSupported = 0 }; };
-
-template<typename Dst, typename Src, typename UnaryOp>
+template<typename Dst, typename Src>
 class vml_assign_traits
 {
   private:
     enum {
       DstHasDirectAccess = Dst::Flags & DirectAccessBit,
       SrcHasDirectAccess = Src::Flags & DirectAccessBit,
-
       StorageOrdersAgree = (int(Dst::IsRowMajor) == int(Src::IsRowMajor)),
       InnerSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::SizeAtCompileTime)
                 : int(Dst::Flags)&RowMajorBit ? int(Dst::ColsAtCompileTime)
@@ -57,165 +54,118 @@ class vml_assign_traits
                     : int(Dst::MaxRowsAtCompileTime),
       MaxSizeAtCompileTime = Dst::SizeAtCompileTime,
 
-      MightEnableVml =  vml_call<UnaryOp>::IsSupported && StorageOrdersAgree && DstHasDirectAccess && SrcHasDirectAccess
-                     && Src::InnerStrideAtCompileTime==1 && Dst::InnerStrideAtCompileTime==1,
+      MightEnableVml = StorageOrdersAgree && DstHasDirectAccess && SrcHasDirectAccess && Src::InnerStrideAtCompileTime==1 && Dst::InnerStrideAtCompileTime==1,
       MightLinearize = MightEnableVml && (int(Dst::Flags) & int(Src::Flags) & LinearAccessBit),
       VmlSize = MightLinearize ? MaxSizeAtCompileTime : InnerMaxSize,
-      LargeEnough = VmlSize==Dynamic || VmlSize>=EIGEN_MKL_VML_THRESHOLD,
-      MayEnableVml = MightEnableVml && LargeEnough,
-      MayLinearize = MayEnableVml && MightLinearize
+      LargeEnough = VmlSize==Dynamic || VmlSize>=EIGEN_MKL_VML_THRESHOLD
     };
   public:
     enum {
-      Traversal = MayLinearize ? LinearVectorizedTraversal
-                : MayEnableVml ? InnerVectorizedTraversal
-                : DefaultTraversal
+      EnableVml = MightEnableVml && LargeEnough,
+      Traversal = MightLinearize ? LinearTraversal : DefaultTraversal
     };
 };
 
-template<typename Derived1, typename Derived2, typename UnaryOp, int Traversal, int Unrolling,
-         int VmlTraversal = vml_assign_traits<Derived1, Derived2, UnaryOp>::Traversal >
-struct vml_assign_impl
-  : assign_impl<Derived1, Eigen::CwiseUnaryOp<UnaryOp, Derived2>,Traversal,Unrolling,BuiltIn>
-{
-};
-
-template<typename Derived1, typename Derived2, typename UnaryOp, int Traversal, int Unrolling>
-struct vml_assign_impl<Derived1, Derived2, UnaryOp, Traversal, Unrolling, InnerVectorizedTraversal>
-{
-  typedef typename Derived1::Scalar Scalar;
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1& dst, const CwiseUnaryOp<UnaryOp, Derived2>& src)
-  {
-    // in case we want to (or have to) skip VML at runtime we can call:
-    // assign_impl<Derived1,Eigen::CwiseUnaryOp<UnaryOp, Derived2>,Traversal,Unrolling,BuiltIn>::run(dst,src);
-    const Index innerSize = dst.innerSize();
-    const Index outerSize = dst.outerSize();
-    for(Index outer = 0; outer < outerSize; ++outer) {
-      const Scalar *src_ptr = src.IsRowMajor ?  &(src.nestedExpression().coeffRef(outer,0)) :
-                                                &(src.nestedExpression().coeffRef(0, outer));
-      Scalar *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer));
-      vml_call<UnaryOp>::run(src.functor(), innerSize, src_ptr, dst_ptr );
-    }
-  }
-};
-
-template<typename Derived1, typename Derived2, typename UnaryOp, int Traversal, int Unrolling>
-struct vml_assign_impl<Derived1, Derived2, UnaryOp, Traversal, Unrolling, LinearVectorizedTraversal>
-{
-  static inline void run(Derived1& dst, const CwiseUnaryOp<UnaryOp, Derived2>& src)
-  {
-    // in case we want to (or have to) skip VML at runtime we can call:
-    // assign_impl<Derived1,Eigen::CwiseUnaryOp<UnaryOp, Derived2>,Traversal,Unrolling,BuiltIn>::run(dst,src);
-    vml_call<UnaryOp>::run(src.functor(), dst.size(), src.nestedExpression().data(), dst.data() );
-  }
-};
-
-// Macroses
-
-#define EIGEN_MKL_VML_SPECIALIZE_ASSIGN(TRAVERSAL,UNROLLING) \
-  template<typename Derived1, typename Derived2, typename UnaryOp> \
-  struct assign_impl<Derived1, Eigen::CwiseUnaryOp<UnaryOp, Derived2>, TRAVERSAL, UNROLLING, Specialized>  {  \
-    static inline void run(Derived1 &dst, const Eigen::CwiseUnaryOp<UnaryOp, Derived2> &src) { \
-      vml_assign_impl<Derived1,Derived2,UnaryOp,TRAVERSAL,UNROLLING>::run(dst, src); \
-    } \
-  };
-
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(DefaultTraversal,NoUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(DefaultTraversal,CompleteUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(DefaultTraversal,InnerUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearTraversal,NoUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearTraversal,CompleteUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(InnerVectorizedTraversal,NoUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(InnerVectorizedTraversal,CompleteUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(InnerVectorizedTraversal,InnerUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearVectorizedTraversal,CompleteUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearVectorizedTraversal,NoUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(SliceVectorizedTraversal,NoUnrolling)
-
-
+#define EIGEN_PP_EXPAND(ARG) ARG
 #if !defined (EIGEN_FAST_MATH) || (EIGEN_FAST_MATH != 1)
-#define  EIGEN_MKL_VML_MODE VML_HA
+#define EIGEN_VMLMODE_EXPAND_LA , VML_HA
 #else
-#define  EIGEN_MKL_VML_MODE VML_LA
+#define EIGEN_VMLMODE_EXPAND_LA , VML_LA
 #endif
 
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE)     \
-  template<> struct vml_call< scalar_##EIGENOP##_op<EIGENTYPE> > {               \
-    enum { IsSupported = 1 };                                                    \
-    static inline void run( const scalar_##EIGENOP##_op<EIGENTYPE>& /*func*/,        \
-                            int size, const EIGENTYPE* src, EIGENTYPE* dst) {    \
-      VMLOP(size, (const VMLTYPE*)src, (VMLTYPE*)dst);                           \
-    }                                                                            \
+#define EIGEN_VMLMODE_EXPAND__ 
+// VML function-name prefix per mode tag: "vm" for LA (calls that receive a trailing mode argument), "v" for _ (no mode argument).
+#define EIGEN_VMLMODE_PREFIX_LA vm
+#define EIGEN_VMLMODE_PREFIX__  v
+#define EIGEN_VMLMODE_PREFIX(VMLMODE) EIGEN_CAT(EIGEN_VMLMODE_PREFIX_,VMLMODE)
+// Declares the VML-backed Assignment specialization for "dst = EIGENOP(src)" on EIGENTYPE, usable only when vml_assign_traits<...>::EnableVml holds. With LinearTraversal it issues one VMLOP call over dst.size() coefficients; otherwise one call per outer slice over dst.innerSize() coefficients.
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE)                                           \
+  template< typename DstXprType, typename SrcXprNested>                                                                         \
+  struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE>,             \
+                   Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml,EIGENTYPE>::type> {    \
+    typedef CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested> SrcXprType;                                            \
+    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE> &/*func*/) {                             \
+      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());                                                       \
+      if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) {                                              \
+        VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(),                                                        \
+              (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE) );                                           \
+      } else {                                                                                                                  \
+        const Index outerSize = dst.outerSize();                                                                                \
+        for(Index outer = 0; outer < outerSize; ++outer) {                                                                      \
+          const EIGENTYPE *src_ptr = src.IsRowMajor ? &(src.nestedExpression().coeffRef(outer,0)) :                             \
+                                                      &(src.nestedExpression().coeffRef(0, outer));                             \
+          EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer));                           \
+          VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr,                                                                      \
+                (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE));                                             \
+        }                                                                                                                       \
+      }                                                                                                                         \
+    }                                                                                                                           \
+  };                                                                                                                            \
+
+// Per-scalar-type instantiation helpers: the VML function name is pasted together as <mode prefix><s|d|c|z><VMLOP>, e.g. prefix "vm" + "s" + "Sin".
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP, VMLMODE)                                                         \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE),s##VMLOP), float, float, VMLMODE)           \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE),d##VMLOP), double, double, VMLMODE)
+
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_CPLX(EIGENOP, VMLOP, VMLMODE)                                                         \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE),c##VMLOP), scomplex, MKL_Complex8, VMLMODE) \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE),z##VMLOP), dcomplex, MKL_Complex16, VMLMODE)
+  
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS(EIGENOP, VMLOP, VMLMODE)                                                              \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP, VMLMODE)                                                               \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_CPLX(EIGENOP, VMLOP, VMLMODE)
+
+// Functors routed to VML: "LA" entries expand to the vm* functions and pass a mode argument; "_" entries expand to the v* functions with no mode argument.
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(sin,   Sin,   LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(asin,  Asin,  LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(sinh,  Sinh,  LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(cos,   Cos,   LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(acos,  Acos,  LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(cosh,  Cosh,  LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(tan,   Tan,   LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(atan,  Atan,  LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(tanh,  Tanh,  LA)
+// EIGEN_MKL_VML_DECLARE_UNARY_CALLS(abs,   Abs,    _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(exp,   Exp,   LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(log,   Ln,    LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(log10, Log10, LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(sqrt,  Sqrt,  _)
+
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(square, Sqr,   _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_CPLX(arg, Arg,      _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(round, Round,  _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(floor, Floor,  _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil,  Ceil,   _)
+// Same scheme for pow with a scalar exponent: the functor's m_exponent is reinterpreted as VMLTYPE and passed by value as the third VMLOP argument, for both the linear and the per-outer-slice paths.
+#define EIGEN_MKL_VML_DECLARE_POW_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE)                                           \
+  template< typename DstXprType, typename SrcXprNested>                                                                       \
+  struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE>,           \
+                   Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml,EIGENTYPE>::type> {  \
+    typedef CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested> SrcXprType;                                          \
+    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE> &/*func*/) {                           \
+      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());                                                     \
+      VMLTYPE exponent = reinterpret_cast<const VMLTYPE&>(src.functor().m_exponent);                                          \
+      if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal)                                              \
+      {                                                                                                                       \
+        VMLOP( dst.size(), (const VMLTYPE*)src.nestedExpression().data(), exponent,                                           \
+              (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE) );                                         \
+      } else {                                                                                                                \
+        const Index outerSize = dst.outerSize();                                                                              \
+        for(Index outer = 0; outer < outerSize; ++outer) {                                                                    \
+          const EIGENTYPE *src_ptr = src.IsRowMajor ? &(src.nestedExpression().coeffRef(outer,0)) :                           \
+                                                      &(src.nestedExpression().coeffRef(0, outer));                           \
+          EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer));                         \
+          VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr, exponent,                                                          \
+                 (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE));                                          \
+        }                                                                                                                     \
+      }                                                                                                                       \
+    }                                                                                                                         \
   };
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE)  \
-  template<> struct vml_call< scalar_##EIGENOP##_op<EIGENTYPE> > {               \
-    enum { IsSupported = 1 };                                                    \
-    static inline void run( const scalar_##EIGENOP##_op<EIGENTYPE>& /*func*/,        \
-                            int size, const EIGENTYPE* src, EIGENTYPE* dst) {    \
-      MKL_INT64 vmlMode = EIGEN_MKL_VML_MODE;                                    \
-      VMLOP(size, (const VMLTYPE*)src, (VMLTYPE*)dst, vmlMode);                  \
-    }                                                                            \
-  };
-
-#define EIGEN_MKL_VML_DECLARE_POW_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE)       \
-  template<> struct vml_call< scalar_##EIGENOP##_op<EIGENTYPE> > {               \
-    enum { IsSupported = 1 };                                                    \
-    static inline void run( const scalar_##EIGENOP##_op<EIGENTYPE>& func,        \
-                          int size, const EIGENTYPE* src, EIGENTYPE* dst) {      \
-      EIGENTYPE exponent = func.m_exponent;                                      \
-      MKL_INT64 vmlMode = EIGEN_MKL_VML_MODE;                                    \
-      VMLOP(&size, (const VMLTYPE*)src, (const VMLTYPE*)&exponent,               \
-                        (VMLTYPE*)dst, &vmlMode);                                \
-    }                                                                            \
-  };
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP)                   \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vs##VMLOP, float, float)             \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vd##VMLOP, double, double)
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX(EIGENOP, VMLOP)                \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vc##VMLOP, scomplex, MKL_Complex8)   \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vz##VMLOP, dcomplex, MKL_Complex16)
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS(EIGENOP, VMLOP)                        \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP)                         \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX(EIGENOP, VMLOP)
-
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL_LA(EIGENOP, VMLOP)                \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vms##VMLOP, float, float)         \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vmd##VMLOP, double, double)
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX_LA(EIGENOP, VMLOP)             \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vmc##VMLOP, scomplex, MKL_Complex8)  \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vmz##VMLOP, dcomplex, MKL_Complex16)
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(EIGENOP, VMLOP)                     \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL_LA(EIGENOP, VMLOP)                      \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX_LA(EIGENOP, VMLOP)
-
-
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(sin,  Sin)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(asin, Asin)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(cos,  Cos)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(acos, Acos)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(tan,  Tan)
-//EIGEN_MKL_VML_DECLARE_UNARY_CALLS(abs,  Abs)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(exp,  Exp)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(log,  Ln)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(sqrt, Sqrt)
-
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(square, Sqr)
-
-// The vm*powx functions are not avaibale in the windows version of MKL.
-#ifndef _WIN32
-EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmspowx_, float, float)
-EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmdpowx_, double, double)
-EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmcpowx_, scomplex, MKL_Complex8)
-EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmzpowx_, dcomplex, MKL_Complex16)
-#endif
+// pow specializations for the four supported scalar types; all pass the LA mode tag, so each call receives a trailing mode argument.
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmsPowx, float,    float,         LA)
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmdPowx, double,   double,        LA)
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmcPowx, scomplex, MKL_Complex8,  LA)
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmzPowx, dcomplex, MKL_Complex16, LA)
 
 } // end namespace internal
 
diff --git a/nuparu/include/Eigen/src/Core/BandMatrix.h b/nuparu/include/Eigen/src/Core/BandMatrix.h
index ffd7fe8b..87c124fd 100644
--- a/nuparu/include/Eigen/src/Core/BandMatrix.h
+++ b/nuparu/include/Eigen/src/Core/BandMatrix.h
@@ -32,7 +32,7 @@ class BandMatrixBase : public EigenBase<Derived>
     };
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef Matrix<Scalar,RowsAtCompileTime,ColsAtCompileTime> DenseMatrixType;
-    typedef typename DenseMatrixType::Index Index;
+    typedef typename DenseMatrixType::StorageIndex StorageIndex;
     typedef typename internal::traits<Derived>::CoefficientsType CoefficientsType;
     typedef EigenBase<Derived> Base;
 
@@ -179,7 +179,7 @@ struct traits<BandMatrix<_Scalar,_Rows,_Cols,_Supers,_Subs,_Options> >
 {
   typedef _Scalar Scalar;
   typedef Dense StorageKind;
-  typedef DenseIndex Index;
+  typedef Eigen::Index StorageIndex;
   enum {
     CoeffReadCost = NumTraits<Scalar>::ReadCost,
     RowsAtCompileTime = _Rows,
@@ -201,10 +201,10 @@ class BandMatrix : public BandMatrixBase<BandMatrix<_Scalar,Rows,Cols,Supers,Sub
   public:
 
     typedef typename internal::traits<BandMatrix>::Scalar Scalar;
-    typedef typename internal::traits<BandMatrix>::Index Index;
+    typedef typename internal::traits<BandMatrix>::StorageIndex StorageIndex;
     typedef typename internal::traits<BandMatrix>::CoefficientsType CoefficientsType;
 
-    inline BandMatrix(Index rows=Rows, Index cols=Cols, Index supers=Supers, Index subs=Subs)
+    explicit inline BandMatrix(Index rows=Rows, Index cols=Cols, Index supers=Supers, Index subs=Subs)
       : m_coeffs(1+supers+subs,cols),
         m_rows(rows), m_supers(supers), m_subs(subs)
     {
@@ -241,7 +241,7 @@ struct traits<BandMatrixWrapper<_CoefficientsType,_Rows,_Cols,_Supers,_Subs,_Opt
 {
   typedef typename _CoefficientsType::Scalar Scalar;
   typedef typename _CoefficientsType::StorageKind StorageKind;
-  typedef typename _CoefficientsType::Index Index;
+  typedef typename _CoefficientsType::StorageIndex StorageIndex;
   enum {
     CoeffReadCost = internal::traits<_CoefficientsType>::CoeffReadCost,
     RowsAtCompileTime = _Rows,
@@ -264,9 +264,9 @@ class BandMatrixWrapper : public BandMatrixBase<BandMatrixWrapper<_CoefficientsT
 
     typedef typename internal::traits<BandMatrixWrapper>::Scalar Scalar;
     typedef typename internal::traits<BandMatrixWrapper>::CoefficientsType CoefficientsType;
-    typedef typename internal::traits<BandMatrixWrapper>::Index Index;
+    typedef typename internal::traits<BandMatrixWrapper>::StorageIndex StorageIndex;
 
-    inline BandMatrixWrapper(const CoefficientsType& coeffs, Index rows=_Rows, Index cols=_Cols, Index supers=_Supers, Index subs=_Subs)
+    explicit inline BandMatrixWrapper(const CoefficientsType& coeffs, Index rows=_Rows, Index cols=_Cols, Index supers=_Supers, Index subs=_Subs)
       : m_coeffs(coeffs),
         m_rows(rows), m_supers(supers), m_subs(subs)
     {
@@ -312,9 +312,9 @@ template<typename Scalar, int Size, int Options>
 class TridiagonalMatrix : public BandMatrix<Scalar,Size,Size,Options&SelfAdjoint?0:1,1,Options|RowMajor>
 {
     typedef BandMatrix<Scalar,Size,Size,Options&SelfAdjoint?0:1,1,Options|RowMajor> Base;
-    typedef typename Base::Index Index;
+    typedef typename Base::StorageIndex StorageIndex;
   public:
-    TridiagonalMatrix(Index size = Size) : Base(size,size,Options&SelfAdjoint?0:1,1) {}
+    explicit TridiagonalMatrix(Index size = Size) : Base(size,size,Options&SelfAdjoint?0:1,1) {}
 
     inline typename Base::template DiagonalIntReturnType<1>::Type super()
     { return Base::template diagonal<1>(); }
@@ -327,6 +327,25 @@ class TridiagonalMatrix : public BandMatrix<Scalar,Size,Size,Options&SelfAdjoint
   protected:
 };
 
+
+struct BandShape {}; // empty tag type reported as evaluator_traits<...>::Shape by the two band matrix types below
+
+template<typename _Scalar, int _Rows, int _Cols, int _Supers, int _Subs, int _Options>
+struct evaluator_traits<BandMatrix<_Scalar,_Rows,_Cols,_Supers,_Subs,_Options> >
+  : public evaluator_traits_base<BandMatrix<_Scalar,_Rows,_Cols,_Supers,_Subs,_Options> >
+{
+  typedef BandShape Shape; // overrides the Shape inherited from evaluator_traits_base
+};
+
+template<typename _CoefficientsType,int _Rows, int _Cols, int _Supers, int _Subs,int _Options>
+struct evaluator_traits<BandMatrixWrapper<_CoefficientsType,_Rows,_Cols,_Supers,_Subs,_Options> >
+  : public evaluator_traits_base<BandMatrixWrapper<_CoefficientsType,_Rows,_Cols,_Supers,_Subs,_Options> >
+{
+  typedef BandShape Shape; // same tag as BandMatrix, so both types share one assignment path
+};
+
+template<> struct AssignmentKind<DenseShape,BandShape> { typedef EigenBase2EigenBase Kind; }; // (DenseShape, BandShape) selects the EigenBase2EigenBase kind; presumably the order is (destination, source) -- confirm against the primary AssignmentKind template
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Core/Block.h b/nuparu/include/Eigen/src/Core/Block.h
index 358b3188..3748e259 100644
--- a/nuparu/include/Eigen/src/Core/Block.h
+++ b/nuparu/include/Eigen/src/Core/Block.h
@@ -21,6 +21,9 @@ namespace Eigen {
   * \param XprType the type of the expression in which we are taking a block
   * \param BlockRows the number of rows of the block we are taking at compile time (optional)
   * \param BlockCols the number of columns of the block we are taking at compile time (optional)
+  * \param InnerPanel is true if the block maps to a set of rows of a row-major matrix or
+  *        to a set of columns of a column-major matrix (optional). This parameter makes it possible
+  *        to determine at compile time whether aligned access is possible on the block expression.
   *
   * This class represents an expression of either a fixed-size or dynamic-size block. It is the return
   * type of DenseBase::block(Index,Index,Index,Index) and DenseBase::block<int,int>(Index,Index) and
@@ -52,7 +55,7 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprTyp
   typedef typename traits<XprType>::Scalar Scalar;
   typedef typename traits<XprType>::StorageKind StorageKind;
   typedef typename traits<XprType>::XprKind XprKind;
-  typedef typename nested<XprType>::type XprTypeNested;
+  typedef typename ref_selector<XprType>::type XprTypeNested;
   typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
   enum{
     MatrixRows = traits<XprType>::RowsAtCompileTime,
@@ -65,6 +68,7 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprTyp
     MaxColsAtCompileTime = BlockCols==0 ? 0
                          : ColsAtCompileTime != Dynamic ? int(ColsAtCompileTime)
                          : int(traits<XprType>::MaxColsAtCompileTime),
+
     XprTypeIsRowMajor = (int(traits<XprType>::Flags)&RowMajorBit) != 0,
     IsRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
                : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
@@ -77,18 +81,16 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprTyp
     OuterStrideAtCompileTime = HasSameStorageOrderAsXprType
                              ? int(outer_stride_at_compile_time<XprType>::ret)
                              : int(inner_stride_at_compile_time<XprType>::ret),
-    MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % packet_traits<Scalar>::size) == 0)
-                       && (InnerStrideAtCompileTime == 1)
-                        ? PacketAccessBit : 0,
-    MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % 16) == 0)) ? AlignedBit : 0,
-    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0,
+
+    // FIXME: these traits are rather specialized for dense objects and need to be cleaned up further
     FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
     FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
-    Flags0 = traits<XprType>::Flags & ( (HereditaryBits & ~RowMajorBit) |
-                                        DirectAccessBit |
-                                        MaskPacketAccessBit |
-                                        MaskAlignedBit),
-    Flags = Flags0 | FlagsLinearAccessBit | FlagsLvalueBit | FlagsRowMajorBit
+    Flags = (traits<XprType>::Flags & (DirectAccessBit | (InnerPanel?CompressedAccessBit:0))) | FlagsLvalueBit | FlagsRowMajorBit,
+    // FIXME DirectAccessBit should not be handled by expressions
+    // 
+    // Alignment is needed by MapBase's assertions
+    // We can safely set it to 0 here. Internal alignment errors will be detected by an eigen_internal_assert in the respective evaluator
+    Alignment = 0
   };
 };
 
@@ -108,9 +110,12 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class
     typedef Impl Base;
     EIGEN_GENERIC_PUBLIC_INTERFACE(Block)
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Block)
+    
+    typedef typename internal::remove_all<XprType>::type NestedExpression;
   
     /** Column or Row constructor
       */
+    EIGEN_DEVICE_FUNC
     inline Block(XprType& xpr, Index i) : Impl(xpr,i)
     {
       eigen_assert( (i>=0) && (
@@ -120,25 +125,27 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class
 
     /** Fixed-size constructor
       */
-    inline Block(XprType& xpr, Index a_startRow, Index a_startCol)
-      : Impl(xpr, a_startRow, a_startCol)
+    EIGEN_DEVICE_FUNC
+    inline Block(XprType& xpr, Index startRow, Index startCol)
+      : Impl(xpr, startRow, startCol)
     {
       EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE)
-      eigen_assert(a_startRow >= 0 && BlockRows >= 1 && a_startRow + BlockRows <= xpr.rows()
-             && a_startCol >= 0 && BlockCols >= 1 && a_startCol + BlockCols <= xpr.cols());
+      eigen_assert(startRow >= 0 && BlockRows >= 1 && startRow + BlockRows <= xpr.rows()
+             && startCol >= 0 && BlockCols >= 1 && startCol + BlockCols <= xpr.cols());
     }
 
     /** Dynamic-size constructor
       */
+    EIGEN_DEVICE_FUNC
     inline Block(XprType& xpr,
-          Index a_startRow, Index a_startCol,
+          Index startRow, Index startCol,
           Index blockRows, Index blockCols)
-      : Impl(xpr, a_startRow, a_startCol, blockRows, blockCols)
+      : Impl(xpr, startRow, startCol, blockRows, blockCols)
     {
       eigen_assert((RowsAtCompileTime==Dynamic || RowsAtCompileTime==blockRows)
           && (ColsAtCompileTime==Dynamic || ColsAtCompileTime==blockCols));
-      eigen_assert(a_startRow >= 0 && blockRows >= 0 && a_startRow  <= xpr.rows() - blockRows
-          && a_startCol >= 0 && blockCols >= 0 && a_startCol <= xpr.cols() - blockCols);
+      eigen_assert(startRow >= 0 && blockRows >= 0 && startRow  <= xpr.rows() - blockRows
+          && startCol >= 0 && blockCols >= 0 && startCol <= xpr.cols() - blockCols);
     }
 };
          
@@ -149,14 +156,15 @@ class BlockImpl<XprType, BlockRows, BlockCols, InnerPanel, Dense>
   : public internal::BlockImpl_dense<XprType, BlockRows, BlockCols, InnerPanel>
 {
     typedef internal::BlockImpl_dense<XprType, BlockRows, BlockCols, InnerPanel> Impl;
-    typedef typename XprType::Index Index;
+    typedef typename XprType::StorageIndex StorageIndex;
   public:
     typedef Impl Base;
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
-    inline BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {}
-    inline BlockImpl(XprType& xpr, Index a_startRow, Index a_startCol) : Impl(xpr, a_startRow, a_startCol) {}
-    inline BlockImpl(XprType& xpr, Index a_startRow, Index a_startCol, Index blockRows, Index blockCols)
-      : Impl(xpr, a_startRow, a_startCol, blockRows, blockCols) {}
+    EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {}
+    EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) {}
+    EIGEN_DEVICE_FUNC
+    inline BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
+      : Impl(xpr, startRow, startCol, blockRows, blockCols) {}
 };
 
 namespace internal {
@@ -172,10 +180,11 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
     EIGEN_DENSE_PUBLIC_INTERFACE(BlockType)
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl_dense)
 
-    class InnerIterator;
+    // class InnerIterator; // FIXME apparently never used
 
     /** Column or Row constructor
       */
+    EIGEN_DEVICE_FUNC
     inline BlockImpl_dense(XprType& xpr, Index i)
       : m_xpr(xpr),
         // It is a row if and only if BlockRows==1 and BlockCols==XprType::ColsAtCompileTime,
@@ -190,23 +199,26 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
 
     /** Fixed-size constructor
       */
-    inline BlockImpl_dense(XprType& xpr, Index a_startRow, Index a_startCol)
-      : m_xpr(xpr), m_startRow(a_startRow), m_startCol(a_startCol),
+    EIGEN_DEVICE_FUNC
+    inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
+      : m_xpr(xpr), m_startRow(startRow), m_startCol(startCol),
                     m_blockRows(BlockRows), m_blockCols(BlockCols)
     {}
 
     /** Dynamic-size constructor
       */
+    EIGEN_DEVICE_FUNC
     inline BlockImpl_dense(XprType& xpr,
-          Index a_startRow, Index a_startCol,
+          Index startRow, Index startCol,
           Index blockRows, Index blockCols)
-      : m_xpr(xpr), m_startRow(a_startRow), m_startCol(a_startCol),
+      : m_xpr(xpr), m_startRow(startRow), m_startCol(startCol),
                     m_blockRows(blockRows), m_blockCols(blockCols)
     {}
 
-    inline Index rows() const { return m_blockRows.value(); }
-    inline Index cols() const { return m_blockCols.value(); }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_blockRows.value(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_blockCols.value(); }
 
+    EIGEN_DEVICE_FUNC
     inline Scalar& coeffRef(Index rowId, Index colId)
     {
       EIGEN_STATIC_ASSERT_LVALUE(XprType)
@@ -214,17 +226,20 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
                .coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
     }
 
+    EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index rowId, Index colId) const
     {
       return m_xpr.derived()
                .coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
     }
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index rowId, Index colId) const
     {
       return m_xpr.coeff(rowId + m_startRow.value(), colId + m_startCol.value());
     }
 
+    EIGEN_DEVICE_FUNC
     inline Scalar& coeffRef(Index index)
     {
       EIGEN_STATIC_ASSERT_LVALUE(XprType)
@@ -233,6 +248,7 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
                        m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
     }
 
+    EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index index) const
     {
       return m_xpr.const_cast_derived()
@@ -240,6 +256,7 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
                        m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
     }
 
+    EIGEN_DEVICE_FUNC
     inline const CoeffReturnType coeff(Index index) const
     {
       return m_xpr
@@ -279,22 +296,25 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
 
     #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** \sa MapBase::data() */
-    inline const Scalar* data() const;
-    inline Index innerStride() const;
-    inline Index outerStride() const;
+    EIGEN_DEVICE_FUNC inline const Scalar* data() const;
+    EIGEN_DEVICE_FUNC inline Index innerStride() const;
+    EIGEN_DEVICE_FUNC inline Index outerStride() const;
     #endif
 
-    const typename internal::remove_all<typename XprType::Nested>::type& nestedExpression() const 
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename XprType::Nested>::type& nestedExpression() const
     { 
       return m_xpr; 
     }
       
-    Index startRow() const 
+    EIGEN_DEVICE_FUNC
+    StorageIndex startRow() const
     { 
       return m_startRow.value(); 
     }
       
-    Index startCol() const 
+    EIGEN_DEVICE_FUNC
+    StorageIndex startCol() const
     { 
       return m_startCol.value(); 
     }
@@ -302,10 +322,10 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
   protected:
 
     const typename XprType::Nested m_xpr;
-    const internal::variable_if_dynamic<Index, XprType::RowsAtCompileTime == 1 ? 0 : Dynamic> m_startRow;
-    const internal::variable_if_dynamic<Index, XprType::ColsAtCompileTime == 1 ? 0 : Dynamic> m_startCol;
-    const internal::variable_if_dynamic<Index, RowsAtCompileTime> m_blockRows;
-    const internal::variable_if_dynamic<Index, ColsAtCompileTime> m_blockCols;
+    const internal::variable_if_dynamic<StorageIndex, XprType::RowsAtCompileTime == 1 ? 0 : Dynamic> m_startRow;
+    const internal::variable_if_dynamic<StorageIndex, XprType::ColsAtCompileTime == 1 ? 0 : Dynamic> m_startCol;
+    const internal::variable_if_dynamic<StorageIndex, RowsAtCompileTime> m_blockRows;
+    const internal::variable_if_dynamic<StorageIndex, ColsAtCompileTime> m_blockCols;
 };
 
 /** \internal Internal implementation of dense Blocks in the direct access case.*/
@@ -314,6 +334,9 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
   : public MapBase<Block<XprType, BlockRows, BlockCols, InnerPanel> >
 {
     typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
+    enum {
+      XprTypeIsRowMajor = (int(traits<XprType>::Flags)&RowMajorBit) != 0
+    };
   public:
 
     typedef MapBase<BlockType> Base;
@@ -322,10 +345,10 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
 
     /** Column or Row constructor
       */
+    EIGEN_DEVICE_FUNC
     inline BlockImpl_dense(XprType& xpr, Index i)
-      : Base(internal::const_cast_ptr(&xpr.coeffRef(
-              (BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? i : 0,
-              (BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) ? i : 0)),
+      : Base(xpr.data() + i * (    ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor)) 
+                                || ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride()),
              BlockRows==1 ? 1 : xpr.rows(),
              BlockCols==1 ? 1 : xpr.cols()),
         m_xpr(xpr)
@@ -335,29 +358,34 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
 
     /** Fixed-size constructor
       */
+    EIGEN_DEVICE_FUNC
     inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
-      : Base(internal::const_cast_ptr(&xpr.coeffRef(startRow,startCol))), m_xpr(xpr)
+      : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)),
+        m_xpr(xpr)
     {
       init();
     }
 
     /** Dynamic-size constructor
       */
+    EIGEN_DEVICE_FUNC
     inline BlockImpl_dense(XprType& xpr,
           Index startRow, Index startCol,
           Index blockRows, Index blockCols)
-      : Base(internal::const_cast_ptr(&xpr.coeffRef(startRow,startCol)), blockRows, blockCols),
+      : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol), blockRows, blockCols),
         m_xpr(xpr)
     {
       init();
     }
 
-    const typename internal::remove_all<typename XprType::Nested>::type& nestedExpression() const 
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename XprType::Nested>::type& nestedExpression() const
     { 
       return m_xpr; 
     }
       
     /** \sa MapBase::innerStride() */
+    EIGEN_DEVICE_FUNC
     inline Index innerStride() const
     {
       return internal::traits<BlockType>::HasSameStorageOrderAsXprType
@@ -366,6 +394,7 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
     }
 
     /** \sa MapBase::outerStride() */
+    EIGEN_DEVICE_FUNC
     inline Index outerStride() const
     {
       return m_outerStride;
@@ -379,6 +408,7 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     /** \internal used by allowAligned() */
+    EIGEN_DEVICE_FUNC
     inline BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows, Index blockCols)
       : Base(data, blockRows, blockCols), m_xpr(xpr)
     {
@@ -387,6 +417,7 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
     #endif
 
   protected:
+    EIGEN_DEVICE_FUNC
     void init()
     {
       m_outerStride = internal::traits<BlockType>::HasSameStorageOrderAsXprType
diff --git a/nuparu/include/Eigen/src/Core/BooleanRedux.h b/nuparu/include/Eigen/src/Core/BooleanRedux.h
index 6e37e031..8409d874 100644
--- a/nuparu/include/Eigen/src/Core/BooleanRedux.h
+++ b/nuparu/include/Eigen/src/Core/BooleanRedux.h
@@ -17,9 +17,10 @@ namespace internal {
 template<typename Derived, int UnrollCount>
 struct all_unroller
 {
+  typedef typename Derived::ExpressionTraits Traits;
   enum {
-    col = (UnrollCount-1) / Derived::RowsAtCompileTime,
-    row = (UnrollCount-1) % Derived::RowsAtCompileTime
+    col = (UnrollCount-1) / Traits::RowsAtCompileTime,
+    row = (UnrollCount-1) % Traits::RowsAtCompileTime
   };
 
   static inline bool run(const Derived &mat)
@@ -29,9 +30,9 @@ struct all_unroller
 };
 
 template<typename Derived>
-struct all_unroller<Derived, 1>
+struct all_unroller<Derived, 0>
 {
-  static inline bool run(const Derived &mat) { return mat.coeff(0, 0); }
+  static inline bool run(const Derived &/*mat*/) { return true; }
 };
 
 template<typename Derived>
@@ -43,11 +44,12 @@ struct all_unroller<Derived, Dynamic>
 template<typename Derived, int UnrollCount>
 struct any_unroller
 {
+  typedef typename Derived::ExpressionTraits Traits;
   enum {
-    col = (UnrollCount-1) / Derived::RowsAtCompileTime,
-    row = (UnrollCount-1) % Derived::RowsAtCompileTime
+    col = (UnrollCount-1) / Traits::RowsAtCompileTime,
+    row = (UnrollCount-1) % Traits::RowsAtCompileTime
   };
-
+  
   static inline bool run(const Derived &mat)
   {
     return any_unroller<Derived, UnrollCount-1>::run(mat) || mat.coeff(row, col);
@@ -55,9 +57,9 @@ struct any_unroller
 };
 
 template<typename Derived>
-struct any_unroller<Derived, 1>
+struct any_unroller<Derived, 0>
 {
-  static inline bool run(const Derived &mat) { return mat.coeff(0, 0); }
+  static inline bool run(const Derived & /*mat*/) { return false; }
 };
 
 template<typename Derived>
@@ -78,19 +80,19 @@ struct any_unroller<Derived, Dynamic>
 template<typename Derived>
 inline bool DenseBase<Derived>::all() const
 {
+  typedef internal::evaluator<Derived> Evaluator;
   enum {
     unroll = SizeAtCompileTime != Dynamic
-          && CoeffReadCost != Dynamic
-          && NumTraits<Scalar>::AddCost != Dynamic
-          && SizeAtCompileTime * (CoeffReadCost + NumTraits<Scalar>::AddCost) <= EIGEN_UNROLLING_LIMIT
+          && SizeAtCompileTime * (Evaluator::CoeffReadCost + NumTraits<Scalar>::AddCost) <= EIGEN_UNROLLING_LIMIT
   };
+  Evaluator evaluator(derived());
   if(unroll)
-    return internal::all_unroller<Derived, unroll ? int(SizeAtCompileTime) : Dynamic>::run(derived());
+    return internal::all_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic>::run(evaluator);
   else
   {
     for(Index j = 0; j < cols(); ++j)
       for(Index i = 0; i < rows(); ++i)
-        if (!coeff(i, j)) return false;
+        if (!evaluator.coeff(i, j)) return false;
     return true;
   }
 }
@@ -102,19 +104,19 @@ inline bool DenseBase<Derived>::all() const
 template<typename Derived>
 inline bool DenseBase<Derived>::any() const
 {
+  typedef internal::evaluator<Derived> Evaluator;
   enum {
     unroll = SizeAtCompileTime != Dynamic
-          && CoeffReadCost != Dynamic
-          && NumTraits<Scalar>::AddCost != Dynamic
-          && SizeAtCompileTime * (CoeffReadCost + NumTraits<Scalar>::AddCost) <= EIGEN_UNROLLING_LIMIT
+          && SizeAtCompileTime * (Evaluator::CoeffReadCost + NumTraits<Scalar>::AddCost) <= EIGEN_UNROLLING_LIMIT
   };
+  Evaluator evaluator(derived());
   if(unroll)
-    return internal::any_unroller<Derived, unroll ? int(SizeAtCompileTime) : Dynamic>::run(derived());
+    return internal::any_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic>::run(evaluator);
   else
   {
     for(Index j = 0; j < cols(); ++j)
       for(Index i = 0; i < rows(); ++i)
-        if (coeff(i, j)) return true;
+        if (evaluator.coeff(i, j)) return true;
     return false;
   }
 }
@@ -124,7 +126,7 @@ inline bool DenseBase<Derived>::any() const
   * \sa all(), any()
   */
 template<typename Derived>
-inline typename DenseBase<Derived>::Index DenseBase<Derived>::count() const
+inline Eigen::Index DenseBase<Derived>::count() const
 {
   return derived().template cast<bool>().template cast<Index>().sum();
 }
@@ -136,7 +138,11 @@ inline typename DenseBase<Derived>::Index DenseBase<Derived>::count() const
 template<typename Derived>
 inline bool DenseBase<Derived>::hasNaN() const
 {
+#if EIGEN_COMP_MSVC || (defined __FAST_MATH__)
+  return derived().array().isNaN().any();
+#else
   return !((derived().array()==derived().array()).all());
+#endif
 }
 
 /** \returns true if \c *this contains only finite numbers, i.e., no NaN and no +/-INF values.
@@ -146,7 +152,11 @@ inline bool DenseBase<Derived>::hasNaN() const
 template<typename Derived>
 inline bool DenseBase<Derived>::allFinite() const
 {
+#if EIGEN_COMP_MSVC || (defined __FAST_MATH__)
+  return derived().array().isFinite().all();
+#else
   return !((derived()-derived()).hasNaN());
+#endif
 }
     
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Core/CMakeLists.txt b/nuparu/include/Eigen/src/Core/CMakeLists.txt
index 2346fc2b..38c3afde 100644
--- a/nuparu/include/Eigen/src/Core/CMakeLists.txt
+++ b/nuparu/include/Eigen/src/Core/CMakeLists.txt
@@ -8,3 +8,4 @@ INSTALL(FILES
 ADD_SUBDIRECTORY(products)
 ADD_SUBDIRECTORY(util)
 ADD_SUBDIRECTORY(arch)
+ADD_SUBDIRECTORY(functors)
diff --git a/nuparu/include/Eigen/src/Core/CommaInitializer.h b/nuparu/include/Eigen/src/Core/CommaInitializer.h
index a96867af..89bcd750 100644
--- a/nuparu/include/Eigen/src/Core/CommaInitializer.h
+++ b/nuparu/include/Eigen/src/Core/CommaInitializer.h
@@ -28,8 +28,8 @@ template<typename XprType>
 struct CommaInitializer
 {
   typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::Index Index;
 
+  EIGEN_DEVICE_FUNC
   inline CommaInitializer(XprType& xpr, const Scalar& s)
     : m_xpr(xpr), m_row(0), m_col(1), m_currentBlockRows(1)
   {
@@ -37,13 +37,27 @@ struct CommaInitializer
   }
 
   template<typename OtherDerived>
+  EIGEN_DEVICE_FUNC
   inline CommaInitializer(XprType& xpr, const DenseBase<OtherDerived>& other)
     : m_xpr(xpr), m_row(0), m_col(other.cols()), m_currentBlockRows(other.rows())
   {
     m_xpr.block(0, 0, other.rows(), other.cols()) = other;
   }
 
+  /* Copy/Move constructor which transfers ownership. This is crucial in the
+   * absence of return value optimization to avoid assertions during destruction. */
+  // FIXME in C++11 mode this could be replaced by a proper RValue constructor
+  EIGEN_DEVICE_FUNC
+  inline CommaInitializer(const CommaInitializer& o)
+  : m_xpr(o.m_xpr), m_row(o.m_row), m_col(o.m_col), m_currentBlockRows(o.m_currentBlockRows) {
+    // Mark original object as finished. In absence of R-value references we need to const_cast:
+    const_cast<CommaInitializer&>(o).m_row = m_xpr.rows();
+    const_cast<CommaInitializer&>(o).m_col = m_xpr.cols();
+    const_cast<CommaInitializer&>(o).m_currentBlockRows = 0;
+  }
+
   /* inserts a scalar value in the target matrix */
+  EIGEN_DEVICE_FUNC
   CommaInitializer& operator,(const Scalar& s)
   {
     if (m_col==m_xpr.cols())
@@ -63,6 +77,7 @@ struct CommaInitializer
 
   /* inserts a matrix expression in the target matrix */
   template<typename OtherDerived>
+  EIGEN_DEVICE_FUNC
   CommaInitializer& operator,(const DenseBase<OtherDerived>& other)
   {
     if(other.cols()==0 || other.rows()==0)
@@ -88,7 +103,11 @@ struct CommaInitializer
     return *this;
   }
 
+  EIGEN_DEVICE_FUNC
   inline ~CommaInitializer()
+#if defined VERIFY_RAISES_ASSERT && (!defined EIGEN_NO_ASSERTION_CHECKING) && defined EIGEN_EXCEPTIONS
+  EIGEN_EXCEPTION_SPEC(Eigen::eigen_assert_exception)
+#endif
   {
     eigen_assert((m_row+m_currentBlockRows) == m_xpr.rows()
          && m_col == m_xpr.cols()
@@ -102,9 +121,10 @@ struct CommaInitializer
     * quaternion.fromRotationMatrix((Matrix3f() << axis0, axis1, axis2).finished());
     * \endcode
     */
+  EIGEN_DEVICE_FUNC
   inline XprType& finished() { return m_xpr; }
 
-  XprType& m_xpr;   // target expression
+  XprType& m_xpr;           // target expression
   Index m_row;              // current row id
   Index m_col;              // current col id
   Index m_currentBlockRows; // current block height
diff --git a/nuparu/include/Eigen/src/Core/CoreEvaluators.h b/nuparu/include/Eigen/src/Core/CoreEvaluators.h
new file mode 100644
index 00000000..f97dc33d
--- /dev/null
+++ b/nuparu/include/Eigen/src/Core/CoreEvaluators.h
@@ -0,0 +1,1376 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2012 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_COREEVALUATORS_H
+#define EIGEN_COREEVALUATORS_H
+
+namespace Eigen {
+  
+namespace internal {
+
+// Maps the storage kind of an expression (traits<T>::StorageKind) to the evaluator kind used for dispatching.
+// The unspecialized template below selects IndexBased, i.e. it assumes index-based accessors.
+template<typename StorageKind>
+struct storage_kind_to_evaluator_kind {
+  typedef IndexBased Kind;
+};
+
+// This class returns the evaluator shape from the expression storage kind.
+// It can be Dense, Sparse, Triangular, Diagonal, SelfAdjoint, Band, etc.
+template<typename StorageKind> struct storage_kind_to_shape;
+
+template<> struct storage_kind_to_shape<Dense>                  { typedef DenseShape Shape;           };
+template<> struct storage_kind_to_shape<SolverStorage>          { typedef SolverShape Shape;           };
+template<> struct storage_kind_to_shape<PermutationStorage>     { typedef PermutationShape Shape;     };
+template<> struct storage_kind_to_shape<TranspositionsStorage>  { typedef TranspositionsShape Shape;  };
+
+// Evaluators have to be specialized with respect to various criteria such as:
+//  - storage/structure/shape
+//  - scalar type
+//  - etc.
+// Therefore, we need specializations of evaluator providing additional template arguments for each kind of evaluator.
+// We currently distinguish the following kinds of evaluators:
+// - unary_evaluator    for expressions taking only one argument (CwiseUnaryOp, CwiseUnaryView, Transpose, MatrixWrapper, ArrayWrapper, Reverse, Replicate)
+// - binary_evaluator   for expressions taking two arguments (CwiseBinaryOp)
+// - product_evaluator  for linear algebra products (Product); special case of binary_evaluator because it requires additional tags for dispatching.
+// - mapbase_evaluator  for Map, Block, Ref
+// - block_evaluator    for Block (special dispatching to a mapbase_evaluator or unary_evaluator)
+
+template< typename T,
+          typename LhsKind   = typename evaluator_traits<typename T::Lhs>::Kind,
+          typename RhsKind   = typename evaluator_traits<typename T::Rhs>::Kind,
+          typename LhsScalar = typename traits<typename T::Lhs>::Scalar,
+          typename RhsScalar = typename traits<typename T::Rhs>::Scalar> struct binary_evaluator;
+
+template< typename T,
+          typename Kind   = typename evaluator_traits<typename T::NestedExpression>::Kind,
+          typename Scalar = typename T::Scalar> struct unary_evaluator;
+          
+// evaluator_traits<T> contains traits for evaluator<T> 
+
+template<typename T>
+struct evaluator_traits_base
+{
+  // By default, both the evaluator kind and the shape are derived from traits<T>::StorageKind.
+  typedef typename storage_kind_to_evaluator_kind<typename traits<T>::StorageKind>::Kind Kind;
+  typedef typename storage_kind_to_shape<typename traits<T>::StorageKind>::Shape Shape;
+  
+  // AssumeAliasing is 1 if an assignment A = B must assume aliasing when B is of type T, so that B has to be
+  // evaluated into a temporary first; it is 0 otherwise. The default declared here is 0 (no aliasing assumed).
+  static const int AssumeAliasing = 0;
+};
+
+// Default evaluator traits
+template<typename T>
+struct evaluator_traits : public evaluator_traits_base<T>
+{
+};
+
+
+// By default, we assume a unary expression:
+template<typename T>
+struct evaluator : public unary_evaluator<T>
+{
+  typedef unary_evaluator<T> Base;
+  EIGEN_DEVICE_FUNC explicit evaluator(const T& xpr) : Base(xpr) {}
+};
+
+
+// Evaluator of a const-qualified expression type: it simply inherits from evaluator<T>. TODO: Think about const-correctness
+template<typename T>
+struct evaluator<const T>
+  : evaluator<T>
+{
+  EIGEN_DEVICE_FUNC
+  explicit evaluator(const T& xpr) : evaluator<T>(xpr) {}
+};
+
+// ---------- base class for all evaluators ----------
+
+template<typename ExpressionType>
+struct evaluator_base : public noncopyable
+{
+  // TODO: it is not very nice to have to propagate all these traits. They are currently only needed to handle outer/inner indices.
+  typedef traits<ExpressionType> ExpressionTraits;
+  
+  enum {
+    Alignment = 0 // default value; evaluators deriving from this base may declare their own Alignment
+  };
+};
+
+// -------------------- Matrix and Array --------------------
+//
+// evaluator<PlainObjectBase> is a common base class for the
+// Matrix and Array evaluators.
+// Here we directly specialize evaluator. This is not really a unary expression, and it is, by definition, dense,
+// so no need for more sophisticated dispatching.
+
+template<typename Derived>
+struct evaluator<PlainObjectBase<Derived> >
+  : evaluator_base<Derived>
+{
+  typedef PlainObjectBase<Derived> PlainObjectType;
+  typedef typename PlainObjectType::Scalar Scalar;
+  typedef typename PlainObjectType::CoeffReturnType CoeffReturnType;
+
+  enum {
+    IsRowMajor = PlainObjectType::IsRowMajor,
+    IsVectorAtCompileTime = PlainObjectType::IsVectorAtCompileTime,
+    RowsAtCompileTime = PlainObjectType::RowsAtCompileTime,
+    ColsAtCompileTime = PlainObjectType::ColsAtCompileTime,
+    
+    CoeffReadCost = NumTraits<Scalar>::ReadCost,
+    Flags = traits<Derived>::EvaluatorFlags,
+    Alignment = traits<Derived>::Alignment
+  };
+  
+  EIGEN_DEVICE_FUNC evaluator()
+    : m_data(0),
+      m_outerStride(IsVectorAtCompileTime  ? 0 
+                                           : int(IsRowMajor) ? ColsAtCompileTime 
+                                           : RowsAtCompileTime)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+  
+  EIGEN_DEVICE_FUNC explicit evaluator(const PlainObjectType& m)
+    : m_data(m.data()), m_outerStride(IsVectorAtCompileTime ? 0 : m.outerStride()) 
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
+  {
+    if (IsRowMajor)
+      return m_data[row * m_outerStride.value() + col];
+    else
+      return m_data[row + col * m_outerStride.value()];
+  }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+  {
+    return m_data[index];
+  }
+
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
+  {
+    if (IsRowMajor)
+      return const_cast<Scalar*>(m_data)[row * m_outerStride.value() + col];
+    else
+      return const_cast<Scalar*>(m_data)[row + col * m_outerStride.value()];
+  }
+
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
+  {
+    return const_cast<Scalar*>(m_data)[index];
+  }
+
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index row, Index col) const
+  {
+    if (IsRowMajor)
+      return ploadt<PacketType, LoadMode>(m_data + row * m_outerStride.value() + col);
+    else
+      return ploadt<PacketType, LoadMode>(m_data + row + col * m_outerStride.value());
+  }
+
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index index) const
+  {
+    return ploadt<PacketType, LoadMode>(m_data + index);
+  }
+
+  template<int StoreMode,typename PacketType>
+  void writePacket(Index row, Index col, const PacketType& x)
+  {
+    if (IsRowMajor)
+      return pstoret<Scalar, PacketType, StoreMode>
+	            (const_cast<Scalar*>(m_data) + row * m_outerStride.value() + col, x);
+    else
+      return pstoret<Scalar, PacketType, StoreMode>
+                    (const_cast<Scalar*>(m_data) + row + col * m_outerStride.value(), x);
+  }
+
+  template<int StoreMode, typename PacketType>
+  void writePacket(Index index, const PacketType& x)
+  {
+    return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_data) + index, x);
+  }
+
+protected:
+  const Scalar *m_data;
+
+  // We do not need to know the outer stride for vectors
+  variable_if_dynamic<Index, IsVectorAtCompileTime  ? 0 
+                                                    : int(IsRowMajor) ? ColsAtCompileTime 
+                                                    : RowsAtCompileTime> m_outerStride;
+};
+
+template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+struct evaluator<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
+  : evaluator<PlainObjectBase<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> > >
+{
+  typedef Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType;
+  
+  EIGEN_DEVICE_FUNC evaluator() {}
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& m)
+    : evaluator<PlainObjectBase<XprType> >(m) 
+  { }
+};
+
+template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+struct evaluator<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
+  : evaluator<PlainObjectBase<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> > >
+{
+  typedef Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType;
+
+  EIGEN_DEVICE_FUNC evaluator() {}
+  
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& m)
+    : evaluator<PlainObjectBase<XprType> >(m) 
+  { }
+};
+
+// -------------------- Transpose --------------------
+
+template<typename ArgType>
+struct unary_evaluator<Transpose<ArgType>, IndexBased>
+  : evaluator_base<Transpose<ArgType> >
+{
+  typedef Transpose<ArgType> XprType;
+  
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,    
+    Flags = evaluator<ArgType>::Flags ^ RowMajorBit,
+    Alignment = evaluator<ArgType>::Alignment
+  };
+
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {}
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_argImpl.coeff(col, row);
+  }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+  {
+    return m_argImpl.coeff(index);
+  }
+
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
+  {
+    return m_argImpl.coeffRef(col, row);
+  }
+
+  EIGEN_DEVICE_FUNC typename XprType::Scalar& coeffRef(Index index)
+  {
+    return m_argImpl.coeffRef(index);
+  }
+
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index row, Index col) const
+  {
+    return m_argImpl.template packet<LoadMode,PacketType>(col, row);
+  }
+
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index index) const
+  {
+    return m_argImpl.template packet<LoadMode,PacketType>(index);
+  }
+
+  template<int StoreMode, typename PacketType> 
+  void writePacket(Index row, Index col, const PacketType& x)
+  {
+    m_argImpl.template writePacket<StoreMode,PacketType>(col, row, x);
+  }
+
+  template<int StoreMode, typename PacketType> 
+  void writePacket(Index index, const PacketType& x)
+  {
+    m_argImpl.template writePacket<StoreMode,PacketType>(index, x);
+  }
+
+protected:
+  evaluator<ArgType> m_argImpl;
+};
+
+// -------------------- CwiseNullaryOp --------------------
+// Like Matrix and Array, this is not really a unary expression, so we directly specialize evaluator.
+// Likewise, there is no need for more sophisticated dispatching here.
+
+template<typename NullaryOp, typename PlainObjectType>
+struct evaluator<CwiseNullaryOp<NullaryOp,PlainObjectType> >
+  : evaluator_base<CwiseNullaryOp<NullaryOp,PlainObjectType> >
+{
+  typedef CwiseNullaryOp<NullaryOp,PlainObjectType> XprType;
+  typedef typename internal::remove_all<PlainObjectType>::type PlainObjectTypeCleaned;
+  
+  enum {
+    CoeffReadCost = internal::functor_traits<NullaryOp>::Cost,
+    
+    Flags = (evaluator<PlainObjectTypeCleaned>::Flags
+          &  (  HereditaryBits
+              | (functor_has_linear_access<NullaryOp>::ret  ? LinearAccessBit : 0)
+              | (functor_traits<NullaryOp>::PacketAccess    ? PacketAccessBit : 0)))
+          | (functor_traits<NullaryOp>::IsRepeatable ? 0 : EvalBeforeNestingBit),
+    Alignment = AlignedMax
+  };
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& n)
+    : m_functor(n.functor()) 
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_functor(row, col);
+  }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+  {
+    return m_functor(index);
+  }
+
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index row, Index col) const
+  {
+    return m_functor.template packetOp<Index,PacketType>(row, col);
+  }
+
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index index) const
+  {
+    return m_functor.template packetOp<Index,PacketType>(index);
+  }
+
+protected:
+  const NullaryOp m_functor;
+};
+
+// -------------------- CwiseUnaryOp --------------------
+
+template<typename UnaryOp, typename ArgType>
+struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased >
+  : evaluator_base<CwiseUnaryOp<UnaryOp, ArgType> >
+{
+  typedef CwiseUnaryOp<UnaryOp, ArgType> XprType;
+  
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<UnaryOp>::Cost,
+    
+    Flags = evaluator<ArgType>::Flags
+          & (HereditaryBits | LinearAccessBit | (functor_traits<UnaryOp>::PacketAccess ? PacketAccessBit : 0)),
+    Alignment = evaluator<ArgType>::Alignment
+  };
+
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op)
+    : m_functor(op.functor()), 
+      m_argImpl(op.nestedExpression()) 
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<UnaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_functor(m_argImpl.coeff(row, col));
+  }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+  {
+    return m_functor(m_argImpl.coeff(index));
+  }
+
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index row, Index col) const
+  {
+    return m_functor.packetOp(m_argImpl.template packet<LoadMode, PacketType>(row, col));
+  }
+
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index index) const
+  {
+    return m_functor.packetOp(m_argImpl.template packet<LoadMode, PacketType>(index));
+  }
+
+protected:
+  const UnaryOp m_functor;
+  evaluator<ArgType> m_argImpl;
+};
+
+// -------------------- CwiseBinaryOp --------------------
+
+// this is a binary expression
+template<typename BinaryOp, typename Lhs, typename Rhs>
+struct evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
+  : public binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
+{
+  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
+  typedef binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs> > Base;
+  
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
+};
+
+template<typename BinaryOp, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBased>
+  : evaluator_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
+{
+  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
+  // Evaluator of a coefficient-wise binary expression for two index-based operands: applies the functor to matching coefficients (or packets) of lhs and rhs.
+  enum {
+    CoeffReadCost = evaluator<Lhs>::CoeffReadCost + evaluator<Rhs>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
+    // Operand flags and the compile-time conditions that gate the access bits computed in Flags0.
+    LhsFlags = evaluator<Lhs>::Flags,
+    RhsFlags = evaluator<Rhs>::Flags,
+    SameType = is_same<typename Lhs::Scalar,typename Rhs::Scalar>::value,
+    StorageOrdersAgree = (int(LhsFlags)&RowMajorBit)==(int(RhsFlags)&RowMajorBit),
+    Flags0 = (int(LhsFlags) | int(RhsFlags)) & (
+        HereditaryBits // hereditary bits survive when either operand carries them
+      | (int(LhsFlags) & int(RhsFlags) & // the access bits below must be set on both operands
+           ( (StorageOrdersAgree ? LinearAccessBit : 0)
+           | (functor_traits<BinaryOp>::PacketAccess && StorageOrdersAgree && SameType ? PacketAccessBit : 0)
+           )
+        )
+     ),
+    Flags = (Flags0 & ~RowMajorBit) | (LhsFlags & RowMajorBit), // RowMajorBit is taken from the left operand only
+    Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<Lhs>::Alignment,evaluator<Rhs>::Alignment)
+  };
+
+  EIGEN_DEVICE_FUNC explicit binary_evaluator(const XprType& xpr)
+    : m_functor(xpr.functor()), // the functor is copied into the evaluator
+      m_lhsImpl(xpr.lhs()), // evaluator built from the left operand
+      m_rhsImpl(xpr.rhs()) // evaluator built from the right operand
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  // Coefficient access by (row, col): functor applied to the coefficients of both operands at that position.
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_functor(m_lhsImpl.coeff(row, col), m_rhsImpl.coeff(row, col));
+  }
+  // Coefficient access by linear index: same as above with a single index.
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+  {
+    return m_functor(m_lhsImpl.coeff(index), m_rhsImpl.coeff(index));
+  }
+  // Packet access by (row, col): loads one packet from each operand and combines them with the functor's packetOp.
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index row, Index col) const
+  {
+    return m_functor.packetOp(m_lhsImpl.template packet<LoadMode,PacketType>(row, col),
+                              m_rhsImpl.template packet<LoadMode,PacketType>(row, col));
+  }
+  // Packet access by linear index.
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index index) const
+  {
+    return m_functor.packetOp(m_lhsImpl.template packet<LoadMode,PacketType>(index),
+                              m_rhsImpl.template packet<LoadMode,PacketType>(index));
+  }
+
+protected:
+  const BinaryOp m_functor; // copy of the expression's functor
+  evaluator<Lhs> m_lhsImpl; // evaluator of the left operand
+  evaluator<Rhs> m_rhsImpl; // evaluator of the right operand
+};
+
+// -------------------- CwiseUnaryView --------------------
+
+template<typename UnaryOp, typename ArgType>
+struct unary_evaluator<CwiseUnaryView<UnaryOp, ArgType>, IndexBased>
+  : evaluator_base<CwiseUnaryView<UnaryOp, ArgType> >
+{
+  typedef CwiseUnaryView<UnaryOp, ArgType> XprType;
+  // Evaluator of a unary view: every access goes through the functor applied to the nested evaluator's coefficient (or coefficient reference).
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<UnaryOp>::Cost,
+    // Only the hereditary, linear-access and direct-access bits of the argument are kept; PacketAccessBit is not in the mask.
+    Flags = (evaluator<ArgType>::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit)),
+    
+    Alignment = 0 // FIXME it is not very clear why alignment is necessarily lost...
+  };
+
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op)
+    : m_unaryOp(op.functor()), // the functor is copied into the evaluator
+      m_argImpl(op.nestedExpression()) // evaluator built from the viewed expression
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<UnaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  // Read access by (row, col): functor applied to the nested coefficient.
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_unaryOp(m_argImpl.coeff(row, col));
+  }
+  // Read access by linear index.
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+  {
+    return m_unaryOp(m_argImpl.coeff(index));
+  }
+  // Write access by (row, col): the functor is applied to the nested coeffRef and its result is returned as Scalar&.
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
+  {
+    return m_unaryOp(m_argImpl.coeffRef(row, col));
+  }
+  // Write access by linear index.
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
+  {
+    return m_unaryOp(m_argImpl.coeffRef(index));
+  }
+
+protected:
+  const UnaryOp m_unaryOp; // copy of the view functor
+  evaluator<ArgType> m_argImpl; // evaluator of the viewed expression
+};
+
+// -------------------- Map --------------------
+
+// FIXME perhaps the PlainObjectType could be provided by Derived::PlainObject ?
+// but that might complicate template specialization
+template<typename Derived, typename PlainObjectType>
+struct mapbase_evaluator; // forward declaration; the definition follows immediately
+
+template<typename Derived, typename PlainObjectType>
+struct mapbase_evaluator : evaluator_base<Derived>
+{
+  typedef Derived  XprType;
+  typedef typename XprType::PointerType PointerType;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  // Evaluator working directly on the raw data pointer of the expression, addressed through the expression's strides.
+  enum {
+    IsRowMajor = XprType::RowsAtCompileTime, // NOTE(review): initialised from RowsAtCompileTime, not from a storage-order flag; unused in this struct - confirm intent
+    ColsAtCompileTime = XprType::ColsAtCompileTime,
+    CoeffReadCost = NumTraits<Scalar>::ReadCost
+  };
+  // Captures map.data() (constness cast away so coeffRef/writePacket can write) and keeps a reference to the expression for stride queries.
+  EIGEN_DEVICE_FUNC explicit mapbase_evaluator(const XprType& map)
+    : m_data(const_cast<PointerType>(map.data())),  
+      m_xpr(map)
+  {
+    EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(evaluator<Derived>::Flags&PacketAccessBit, internal::inner_stride_at_compile_time<Derived>::ret==1),
+                        PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+  // Read access by (row, col): offset is col*colStride + row*rowStride.
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()];
+  }
+  // Read access by linear index: offset is index*innerStride.
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+  {
+    return m_data[index * m_xpr.innerStride()];
+  }
+  // Write access by (row, col), same addressing as coeff(row, col).
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
+  {
+    return m_data[col * m_xpr.colStride() + row * m_xpr.rowStride()];
+  }
+  // Write access by linear index, same addressing as coeff(index).
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
+  {
+    return m_data[index * m_xpr.innerStride()];
+  }
+  // Packet load starting at the address of coefficient (row, col).
+  template<int LoadMode, typename PacketType> 
+  PacketType packet(Index row, Index col) const 
+  {
+    PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride();
+    return internal::ploadt<PacketType, LoadMode>(ptr);
+  }
+  // Packet load starting at the address of the coefficient with the given linear index.
+  template<int LoadMode, typename PacketType> 
+  PacketType packet(Index index) const 
+  {
+    return internal::ploadt<PacketType, LoadMode>(m_data + index * m_xpr.innerStride());
+  }
+  // Packet store starting at the address of coefficient (row, col).
+  template<int StoreMode, typename PacketType> 
+  void writePacket(Index row, Index col, const PacketType& x) 
+  {
+    PointerType ptr = m_data + row * m_xpr.rowStride() + col * m_xpr.colStride();
+    return internal::pstoret<Scalar, PacketType, StoreMode>(ptr, x);
+  }
+  // Packet store starting at the address of the coefficient with the given linear index.
+  template<int StoreMode, typename PacketType> 
+  void writePacket(Index index, const PacketType& x) 
+  {
+    internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_xpr.innerStride(), x);
+  }
+ 
+protected:
+  PointerType m_data; // raw pointer obtained from map.data() in the constructor
+  const XprType& m_xpr; // reference to the evaluated expression; must outlive this evaluator
+};
+
+template<typename PlainObjectType, int MapOptions, typename StrideType> 
+struct evaluator<Map<PlainObjectType, MapOptions, StrideType> >
+  : public mapbase_evaluator<Map<PlainObjectType, MapOptions, StrideType>, PlainObjectType>
+{
+  typedef Map<PlainObjectType, MapOptions, StrideType> XprType;
+  typedef typename XprType::Scalar Scalar;
+  // TODO: should check for smaller packet types once we can handle multi-sized packet types
+  typedef typename packet_traits<Scalar>::type PacketScalar;
+  // Evaluator of Map: data access is inherited from mapbase_evaluator; this struct only derives Flags and Alignment from the stride type and map options.
+  enum {
+    InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0 // a stride of 0 in StrideType falls back to the plain object's stride
+                             ? int(PlainObjectType::InnerStrideAtCompileTime)
+                             : int(StrideType::InnerStrideAtCompileTime),
+    OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
+                             ? int(PlainObjectType::OuterStrideAtCompileTime)
+                             : int(StrideType::OuterStrideAtCompileTime),
+    HasNoInnerStride = InnerStrideAtCompileTime == 1,
+    HasNoOuterStride = StrideType::OuterStrideAtCompileTime == 0,
+    HasNoStride = HasNoInnerStride && HasNoOuterStride,
+    IsDynamicSize = PlainObjectType::SizeAtCompileTime==Dynamic, // not referenced elsewhere in this struct
+    // The masks clear PacketAccessBit unless the inner stride is 1, and LinearAccessBit unless there is no stride or the object is a vector.
+    PacketAccessMask = bool(HasNoInnerStride) ? ~int(0) : ~int(PacketAccessBit),
+    LinearAccessMask = bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime) ? ~int(0) : ~int(LinearAccessBit),
+    Flags = int( evaluator<PlainObjectType>::Flags) & (LinearAccessMask&PacketAccessMask),
+    // Alignment comes from the alignment bits of MapOptions.
+    Alignment = int(MapOptions)&int(AlignedMask)
+  };
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& map)
+    : mapbase_evaluator<XprType, PlainObjectType>(map) // all state lives in the base class
+  { }
+};
+
+// -------------------- Ref --------------------
+
+template<typename PlainObjectType, int RefOptions, typename StrideType> 
+struct evaluator<Ref<PlainObjectType, RefOptions, StrideType> >
+  : public mapbase_evaluator<Ref<PlainObjectType, RefOptions, StrideType>, PlainObjectType>
+{
+  typedef Ref<PlainObjectType, RefOptions, StrideType> XprType;
+  // Evaluator of Ref: data access comes from mapbase_evaluator; Flags and Alignment are those of the evaluator of a Map with the same template arguments.
+  enum {
+    Flags = evaluator<Map<PlainObjectType, RefOptions, StrideType> >::Flags,
+    Alignment = evaluator<Map<PlainObjectType, RefOptions, StrideType> >::Alignment
+  };
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& ref)
+    : mapbase_evaluator<XprType, PlainObjectType>(ref) // all state lives in the base class
+  { }
+};
+
+// -------------------- Block --------------------
+
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel,
+         bool HasDirectAccess = internal::has_direct_access<ArgType>::ret> struct block_evaluator;
+         // block_evaluator is specialised below on HasDirectAccess; evaluator<Block> derives from it and adds the compile-time traits.
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel> 
+struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
+  : block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel>
+{
+  typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
+  typedef typename XprType::Scalar Scalar;
+  // TODO: should check for smaller packet types once we can handle multi-sized packet types
+  typedef typename packet_traits<Scalar>::type PacketScalar;
+  
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+    // Sizes are taken from traits<XprType>.
+    RowsAtCompileTime = traits<XprType>::RowsAtCompileTime,
+    ColsAtCompileTime = traits<XprType>::ColsAtCompileTime,
+    MaxRowsAtCompileTime = traits<XprType>::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = traits<XprType>::MaxColsAtCompileTime,
+    // Storage order: a row vector is row-major, a column vector is column-major, anything else follows the argument.
+    ArgTypeIsRowMajor = (int(evaluator<ArgType>::Flags)&RowMajorBit) != 0,
+    IsRowMajor = (MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1) ? 1
+               : (MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1) ? 0
+               : ArgTypeIsRowMajor,
+    HasSameStorageOrderAsArgType = (IsRowMajor == ArgTypeIsRowMajor),
+    InnerSize = IsRowMajor ? int(ColsAtCompileTime) : int(RowsAtCompileTime),
+    InnerStrideAtCompileTime = HasSameStorageOrderAsArgType // inner and outer strides are swapped when the storage order differs from the argument's
+                             ? int(inner_stride_at_compile_time<ArgType>::ret)
+                             : int(outer_stride_at_compile_time<ArgType>::ret),
+    OuterStrideAtCompileTime = HasSameStorageOrderAsArgType
+                             ? int(outer_stride_at_compile_time<ArgType>::ret)
+                             : int(inner_stride_at_compile_time<ArgType>::ret),
+    MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % packet_traits<Scalar>::size) == 0) // inner size dynamic or a multiple of the packet size...
+                       && (InnerStrideAtCompileTime == 1) // ...and unit inner stride
+                        ? PacketAccessBit : 0,
+    // Linear access: compile-time vectors, or inner panels of an argument that itself has linear access.
+    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator<ArgType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,    
+    FlagsRowMajorBit = XprType::Flags&RowMajorBit,
+    Flags0 = evaluator<ArgType>::Flags & ( (HereditaryBits & ~RowMajorBit) |
+                                           DirectAccessBit |
+                                           MaskPacketAccessBit),
+    Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit,
+    // Alignment0 is the packet alignment only for inner panels whose compile-time outer stride, in bytes, is a multiple of it; otherwise 0.
+    PacketAlignment = unpacket_traits<PacketScalar>::alignment,
+    Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0,
+    Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ArgType>::Alignment, Alignment0)
+  };
+  typedef block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel> block_evaluator_type;
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& block) : block_evaluator_type(block)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+};
+
+// no direct-access => dispatch to a unary evaluator
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
+struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /*HasDirectAccess*/ false>
+  : unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
+{
+  typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
+  // Adds nothing of its own: construction is forwarded to unary_evaluator<Block<...> >.
+  EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block)
+    : unary_evaluator<XprType>(block) 
+  {}
+};
+
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
+struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBased>
+  : evaluator_base<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
+{
+  typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
+  // Block evaluator that goes through the nested evaluator: every access is shifted by the block's start row and start column.
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& block)
+    : m_argImpl(block.nestedExpression()), // evaluator of the expression the block is taken from
+      m_startRow(block.startRow()), // row offset of the block inside the nested expression
+      m_startCol(block.startCol()) // column offset of the block inside the nested expression
+  { }
+ 
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  enum {
+    RowsAtCompileTime = XprType::RowsAtCompileTime // used below to turn a linear index into (row, col)
+  };
+  // Read access by (row, col), shifted by the block offsets.
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
+  { 
+    return m_argImpl.coeff(m_startRow.value() + row, m_startCol.value() + col); 
+  }
+  // Read access by linear index: (0, index) for a compile-time single-row block, (index, 0) otherwise.
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+  { 
+    return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
+  }
+  // Write access by (row, col), shifted by the block offsets.
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
+  { 
+    return m_argImpl.coeffRef(m_startRow.value() + row, m_startCol.value() + col); 
+  }
+  // Write access by linear index, same index mapping as coeff(index).
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
+  { 
+    return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
+  }
+  // Packet load at (row, col), shifted by the block offsets.
+  template<int LoadMode, typename PacketType> 
+  PacketType packet(Index row, Index col) const 
+  { 
+    return m_argImpl.template packet<LoadMode,PacketType>(m_startRow.value() + row, m_startCol.value() + col); 
+  }
+  // Packet load by linear index, same index mapping as coeff(index).
+  template<int LoadMode, typename PacketType> 
+  PacketType packet(Index index) const 
+  { 
+    return packet<LoadMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
+                                       RowsAtCompileTime == 1 ? index : 0);
+  }
+  // Packet store at (row, col), shifted by the block offsets.
+  template<int StoreMode, typename PacketType> 
+  void writePacket(Index row, Index col, const PacketType& x) 
+  { 
+    return m_argImpl.template writePacket<StoreMode,PacketType>(m_startRow.value() + row, m_startCol.value() + col, x); 
+  }
+  // Packet store by linear index, same index mapping as coeff(index).
+  template<int StoreMode, typename PacketType> 
+  void writePacket(Index index, const PacketType& x) 
+  { 
+    return writePacket<StoreMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
+                                             RowsAtCompileTime == 1 ? index : 0,
+                                             x);
+  }
+ 
+protected:
+  evaluator<ArgType> m_argImpl; // evaluator of the nested expression
+  const variable_if_dynamic<Index, ArgType::RowsAtCompileTime == 1 ? 0 : Dynamic> m_startRow; // fixed to 0 when the argument has one row at compile time; presumably stored at run time otherwise - confirm in variable_if_dynamic
+  const variable_if_dynamic<Index, ArgType::ColsAtCompileTime == 1 ? 0 : Dynamic> m_startCol; // same scheme for the start column
+};
+
+// TODO: This evaluator does not actually use the child evaluator; 
+// all action is via the data() as returned by the Block expression.
+
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel> 
+struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /* HasDirectAccess */ true>
+  : mapbase_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>,
+                      typename Block<ArgType, BlockRows, BlockCols, InnerPanel>::PlainObject>
+{
+  typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
+  typedef typename XprType::Scalar Scalar;
+  // Direct-access case: all coefficient and packet access is inherited from mapbase_evaluator, which works on block.data().
+  EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block)
+    : mapbase_evaluator<XprType, typename XprType::PlainObject>(block) 
+  {
+    // TODO: for the 3.3 release, this should be turned to an internal assertion, but let's keep it as is for the beta lifetime
+    eigen_assert(((size_t(block.data()) % EIGEN_PLAIN_ENUM_MAX(1,evaluator<XprType>::Alignment)) == 0) && "data is not aligned"); // the data address must be a multiple of max(1, Alignment)
+  }
+};
+
+
+// -------------------- Select --------------------
+// NOTE shall we introduce a ternary_evaluator?
+
+// TODO enable vectorization for Select
+template<typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType>
+struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
+  : evaluator_base<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
+{
+  typedef Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> XprType;
+  enum {
+    CoeffReadCost = evaluator<ConditionMatrixType>::CoeffReadCost // cost of the condition plus the more expensive of the two branches
+                  + EIGEN_PLAIN_ENUM_MAX(evaluator<ThenMatrixType>::CoeffReadCost,
+                                         evaluator<ElseMatrixType>::CoeffReadCost),
+
+    Flags = (unsigned int)evaluator<ThenMatrixType>::Flags & evaluator<ElseMatrixType>::Flags & HereditaryBits, // only hereditary bits shared by both branches
+    
+    Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ThenMatrixType>::Alignment, evaluator<ElseMatrixType>::Alignment)
+  };
+
+  inline EIGEN_DEVICE_FUNC  explicit evaluator(const XprType& select)
+    : m_conditionImpl(select.conditionMatrix()),
+      m_thenImpl(select.thenMatrix()),
+      m_elseImpl(select.elseMatrix())
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+ 
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  // Access by (row, col): returns the "then" coefficient where the condition coefficient is true, the "else" coefficient otherwise.
+  inline EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
+  {
+    if (m_conditionImpl.coeff(row, col))
+      return m_thenImpl.coeff(row, col);
+    else
+      return m_elseImpl.coeff(row, col);
+  }
+  // Access by linear index, same selection rule.
+  inline EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+  {
+    if (m_conditionImpl.coeff(index))
+      return m_thenImpl.coeff(index);
+    else
+      return m_elseImpl.coeff(index);
+  }
+ 
+protected:
+  evaluator<ConditionMatrixType> m_conditionImpl; // evaluator of the condition expression
+  evaluator<ThenMatrixType> m_thenImpl; // evaluator of the expression used where the condition holds
+  evaluator<ElseMatrixType> m_elseImpl; // evaluator of the expression used elsewhere
+};
+
+
+// -------------------- Replicate --------------------
+
+template<typename ArgType, int RowFactor, int ColFactor> 
+struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
+  : evaluator_base<Replicate<ArgType, RowFactor, ColFactor> >
+{
+  typedef Replicate<ArgType, RowFactor, ColFactor> XprType;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  enum {
+    Factor = (RowFactor==Dynamic || ColFactor==Dynamic) ? Dynamic : RowFactor*ColFactor // total replication count, Dynamic if either factor is
+  };
+  typedef typename internal::nested_eval<ArgType,Factor>::type ArgTypeNested;
+  typedef typename internal::remove_all<ArgTypeNested>::type ArgTypeNestedCleaned;
+  // Evaluator of Replicate: indices into the replicated expression are folded back into the nested expression, using modulo when needed.
+  enum {
+    CoeffReadCost = evaluator<ArgTypeNestedCleaned>::CoeffReadCost,
+    LinearAccessMask = XprType::IsVectorAtCompileTime ? LinearAccessBit : 0,
+    Flags = (evaluator<ArgTypeNestedCleaned>::Flags & (HereditaryBits|LinearAccessMask) & ~RowMajorBit) | (traits<XprType>::Flags & RowMajorBit),
+    
+    Alignment = evaluator<ArgTypeNestedCleaned>::Alignment
+  };
+
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& replicate)
+    : m_arg(replicate.nestedExpression()),
+      m_argImpl(m_arg), // the evaluator is built on the stored m_arg, not on the caller's expression
+      m_rows(replicate.nestedExpression().rows()),
+      m_cols(replicate.nestedExpression().cols())
+  {}
+ 
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
+  {
+    // try to avoid using modulo; this is a pure optimization strategy
+    const Index actual_row = internal::traits<XprType>::RowsAtCompileTime==1 ? 0
+                           : RowFactor==1 ? row
+                           : row % m_rows.value();
+    const Index actual_col = internal::traits<XprType>::ColsAtCompileTime==1 ? 0
+                           : ColFactor==1 ? col
+                           : col % m_cols.value();
+    
+    return m_argImpl.coeff(actual_row, actual_col);
+  }
+  
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+  {
+    // try to avoid using modulo; this is a pure optimization strategy
+    const Index actual_index = internal::traits<XprType>::RowsAtCompileTime==1
+                                  ? (ColFactor==1 ?  index : index%m_cols.value())
+                                  : (RowFactor==1 ?  index : index%m_rows.value());
+    
+    return m_argImpl.coeff(actual_index);
+  }
+  // Packet load by (row, col): same index folding as coeff(row, col).
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index row, Index col) const
+  {
+    const Index actual_row = internal::traits<XprType>::RowsAtCompileTime==1 ? 0
+                           : RowFactor==1 ? row
+                           : row % m_rows.value();
+    const Index actual_col = internal::traits<XprType>::ColsAtCompileTime==1 ? 0
+                           : ColFactor==1 ? col
+                           : col % m_cols.value();
+
+    return m_argImpl.template packet<LoadMode,PacketType>(actual_row, actual_col);
+  }
+  // Packet load by linear index: same index folding as coeff(index).
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index index) const
+  {
+    const Index actual_index = internal::traits<XprType>::RowsAtCompileTime==1
+                                  ? (ColFactor==1 ?  index : index%m_cols.value())
+                                  : (RowFactor==1 ?  index : index%m_rows.value());
+
+    return m_argImpl.template packet<LoadMode,PacketType>(actual_index);
+  }
+ 
+protected:
+  const ArgTypeNested m_arg; // nested expression as selected by nested_eval (declared before m_argImpl, which is constructed from it)
+  evaluator<ArgTypeNestedCleaned> m_argImpl;
+  const variable_if_dynamic<Index, ArgType::RowsAtCompileTime> m_rows; // row count of the nested expression, used as the modulus
+  const variable_if_dynamic<Index, ArgType::ColsAtCompileTime> m_cols; // column count of the nested expression, used as the modulus
+};
+
+
+// -------------------- PartialReduxExpr --------------------
+
+template< typename ArgType, typename MemberOp, int Direction>
+struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
+  : evaluator_base<PartialReduxExpr<ArgType, MemberOp, Direction> >
+{
+  typedef PartialReduxExpr<ArgType, MemberOp, Direction> XprType;
+  typedef typename internal::nested_eval<ArgType,1>::type ArgTypeNested;
+  typedef typename internal::remove_all<ArgTypeNested>::type ArgTypeNestedCleaned;
+  typedef typename ArgType::Scalar InputScalar;
+  typedef typename XprType::Scalar Scalar;
+  enum {
+    TraversalSize = Direction==int(Vertical) ? int(ArgType::RowsAtCompileTime) :  int(ArgType::ColsAtCompileTime) // length of each reduced column (Vertical) or row
+  };
+  typedef typename MemberOp::template Cost<InputScalar,int(TraversalSize)> CostOpType;
+  enum {
+    CoeffReadCost = TraversalSize==Dynamic ? HugeCost // unknown traversal length is treated as HugeCost
+                  : TraversalSize * evaluator<ArgType>::CoeffReadCost + int(CostOpType::value),
+    
+    Flags = (traits<XprType>::Flags&RowMajorBit) | (evaluator<ArgType>::Flags&(HereditaryBits&(~RowMajorBit))),
+    
+    Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized
+  };
+  // NOTE(review): xpr is taken by value here, whereas the other evaluators in this file take const XprType& - confirm this is intentional.
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType xpr)
+    : m_arg(xpr.nestedExpression()), m_functor(xpr.functor())
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(TraversalSize==Dynamic ? HugeCost : int(CostOpType::value));
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  // Access by (i, j): the functor is applied to column j for a vertical reduction, to row i otherwise.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index i, Index j) const
+  {
+    if (Direction==Vertical)
+      return m_functor(m_arg.col(j));
+    else
+      return m_functor(m_arg.row(i));
+  }
+  // Access by linear index: the index selects the column (vertical) or the row to reduce.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
+  {
+    if (Direction==Vertical)
+      return m_functor(m_arg.col(index));
+    else
+      return m_functor(m_arg.row(index));
+  }
+
+protected:
+  const ArgTypeNested m_arg; // nested expression as selected by nested_eval; no child evaluator is stored
+  const MemberOp m_functor; // copy of the reduction functor
+};
+
+
+// -------------------- MatrixWrapper and ArrayWrapper --------------------
+//
+// evaluator_wrapper_base<T> is a common base class for the
+// MatrixWrapper and ArrayWrapper evaluators.
+
+template<typename XprType>
+struct evaluator_wrapper_base
+  : evaluator_base<XprType>
+{
+  typedef typename remove_all<typename XprType::NestedExpressionType>::type ArgType;
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost, // cost, flags and alignment are those of the wrapped expression
+    Flags = evaluator<ArgType>::Flags,
+    Alignment = evaluator<ArgType>::Alignment
+  };
+  // Takes the wrapped expression itself (not the wrapper) and builds its evaluator.
+  EIGEN_DEVICE_FUNC explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {}
+
+  typedef typename ArgType::Scalar Scalar;
+  typedef typename ArgType::CoeffReturnType CoeffReturnType;
+  // Every accessor below forwards unchanged to the wrapped expression's evaluator.
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_argImpl.coeff(row, col);
+  }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+  {
+    return m_argImpl.coeff(index);
+  }
+
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
+  {
+    return m_argImpl.coeffRef(row, col);
+  }
+
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
+  {
+    return m_argImpl.coeffRef(index);
+  }
+
+  template<int LoadMode, typename PacketType> 
+  PacketType packet(Index row, Index col) const
+  {
+    return m_argImpl.template packet<LoadMode,PacketType>(row, col);
+  }
+
+  template<int LoadMode, typename PacketType> 
+  PacketType packet(Index index) const
+  {
+    return m_argImpl.template packet<LoadMode,PacketType>(index);
+  }
+
+  template<int StoreMode, typename PacketType> 
+  void writePacket(Index row, Index col, const PacketType& x)
+  {
+    m_argImpl.template writePacket<StoreMode>(row, col, x); // PacketType is left to be deduced from x
+  }
+
+  template<int StoreMode, typename PacketType> 
+  void writePacket(Index index, const PacketType& x)
+  {
+    m_argImpl.template writePacket<StoreMode>(index, x); // PacketType is left to be deduced from x
+  }
+
+protected:
+  evaluator<ArgType> m_argImpl; // evaluator of the wrapped expression
+};
+
+template<typename TArgType>
+struct unary_evaluator<MatrixWrapper<TArgType> >
+  : evaluator_wrapper_base<MatrixWrapper<TArgType> >
+{
+  typedef MatrixWrapper<TArgType> XprType;
+  // Unwraps the MatrixWrapper and hands its nested expression to evaluator_wrapper_base.
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& wrapper)
+    : evaluator_wrapper_base<MatrixWrapper<TArgType> >(wrapper.nestedExpression())
+  { }
+};
+
+template<typename TArgType>
+struct unary_evaluator<ArrayWrapper<TArgType> >
+  : evaluator_wrapper_base<ArrayWrapper<TArgType> >
+{
+  typedef ArrayWrapper<TArgType> XprType;
+  // Unwraps the ArrayWrapper and hands its nested expression to evaluator_wrapper_base.
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& wrapper)
+    : evaluator_wrapper_base<ArrayWrapper<TArgType> >(wrapper.nestedExpression())
+  { }
+};
+
+
+// -------------------- Reverse --------------------
+
+// defined in Reverse.h:
+template<typename PacketType, bool ReversePacket> struct reverse_packet_cond;
+
+template<typename ArgType, int Direction>
+struct unary_evaluator<Reverse<ArgType, Direction> >
+  : evaluator_base<Reverse<ArgType, Direction> >
+{
+  typedef Reverse<ArgType, Direction> XprType;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  // Evaluator of Reverse: indices are mirrored (size - index - 1) along the reversed direction(s) before being passed to the nested evaluator.
+  enum {
+    IsRowMajor = XprType::IsRowMajor,
+    IsColMajor = !IsRowMajor,
+    ReverseRow = (Direction == Vertical)   || (Direction == BothDirections),
+    ReverseCol = (Direction == Horizontal) || (Direction == BothDirections),
+    ReversePacket = (Direction == BothDirections) // true when the reversal runs along the direction in which packets are laid out
+                    || ((Direction == Vertical)   && IsColMajor)
+                    || ((Direction == Horizontal) && IsRowMajor),
+                    
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+    
+    // let's enable LinearAccess only with vectorization because of the product overhead
+    // FIXME enable DirectAccess with negative strides?
+    Flags0 = evaluator<ArgType>::Flags,
+    LinearAccess = ( (Direction==BothDirections) && (int(Flags0)&PacketAccessBit) )
+                  || ((ReverseRow && XprType::ColsAtCompileTime==1) || (ReverseCol && XprType::RowsAtCompileTime==1))
+                 ? LinearAccessBit : 0,
+
+    Flags = int(Flags0) & (HereditaryBits | PacketAccessBit | LinearAccess),
+    
+    Alignment = 0 // FIXME in some rare cases, Alignment could be preserved, like a Vector4f.
+  };
+
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& reverse)
+    : m_argImpl(reverse.nestedExpression()),
+      m_rows(ReverseRow ? reverse.nestedExpression().rows() : 1), // a non-reversed dimension is stored as 1 (see the member comment below)
+      m_cols(ReverseCol ? reverse.nestedExpression().cols() : 1)
+  { }
+ 
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
+  {
+    return m_argImpl.coeff(ReverseRow ? m_rows.value() - row - 1 : row,
+                           ReverseCol ? m_cols.value() - col - 1 : col);
+  }
+  // Linear access: mirrors the index over m_rows * m_cols.
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+  {
+    return m_argImpl.coeff(m_rows.value() * m_cols.value() - index - 1);
+  }
+
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index col)
+  {
+    return m_argImpl.coeffRef(ReverseRow ? m_rows.value() - row - 1 : row,
+                              ReverseCol ? m_cols.value() - col - 1 : col);
+  }
+
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
+  {
+    return m_argImpl.coeffRef(m_rows.value() * m_cols.value() - index - 1);
+  }
+  // Packet load by (row, col): along the reversed inner direction the mirrored start is shifted by PacketSize instead of 1; the packet is reversed when ReversePacket is set.
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index row, Index col) const
+  {
+    enum {
+      PacketSize = unpacket_traits<PacketType>::size,
+      OffsetRow  = ReverseRow && IsColMajor ? PacketSize : 1,
+      OffsetCol  = ReverseCol && IsRowMajor ? PacketSize : 1
+    };
+    typedef internal::reverse_packet_cond<PacketType,ReversePacket> reverse_packet;
+    return reverse_packet::run(m_argImpl.template packet<LoadMode,PacketType>(
+                                  ReverseRow ? m_rows.value() - row - OffsetRow : row,
+                                  ReverseCol ? m_cols.value() - col - OffsetCol : col));
+  }
+  // Packet load by linear index: loads the mirrored packet and always reverses it with preverse.
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index index) const
+  {
+    enum { PacketSize = unpacket_traits<PacketType>::size };
+    return preverse(m_argImpl.template packet<LoadMode,PacketType>(m_rows.value() * m_cols.value() - index - PacketSize));
+  }
+  // Packet store by (row, col); note that the template parameter is named LoadMode here although it is passed on as the store mode.
+  template<int LoadMode, typename PacketType>
+  void writePacket(Index row, Index col, const PacketType& x)
+  {
+    // FIXME we could factorize some code with packet(i,j)
+    enum {
+      PacketSize = unpacket_traits<PacketType>::size,
+      OffsetRow  = ReverseRow && IsColMajor ? PacketSize : 1,
+      OffsetCol  = ReverseCol && IsRowMajor ? PacketSize : 1
+    };
+    typedef internal::reverse_packet_cond<PacketType,ReversePacket> reverse_packet;
+    m_argImpl.template writePacket<LoadMode>(
+                                  ReverseRow ? m_rows.value() - row - OffsetRow : row,
+                                  ReverseCol ? m_cols.value() - col - OffsetCol : col,
+                                  reverse_packet::run(x));
+  }
+  // Packet store by linear index: the packet is reversed with preverse and written at the mirrored position.
+  template<int LoadMode, typename PacketType>
+  void writePacket(Index index, const PacketType& x)
+  {
+    enum { PacketSize = unpacket_traits<PacketType>::size };
+    m_argImpl.template writePacket<LoadMode>
+      (m_rows.value() * m_cols.value() - index - PacketSize, preverse(x));
+  }
+ 
+protected:
+  evaluator<ArgType> m_argImpl;
+
+  // If we do not reverse rows, then we do not need to know the number of rows; same for columns
+  // Nonetheless, in this case it is important to set to 1 such that the coeff(index) method works fine for vectors.
+  const variable_if_dynamic<Index, ReverseRow ? ArgType::RowsAtCompileTime : 1> m_rows;
+  const variable_if_dynamic<Index, ReverseCol ? ArgType::ColsAtCompileTime : 1> m_cols;
+};
+
+
+// -------------------- Diagonal --------------------
+
+template<typename ArgType, int DiagIndex>
+struct evaluator<Diagonal<ArgType, DiagIndex> >
+  : evaluator_base<Diagonal<ArgType, DiagIndex> >
+{
+  typedef Diagonal<ArgType, DiagIndex> XprType;
+  // Evaluator of Diagonal: element k maps to nested coefficient (k + rowOffset(), k + colOffset()).
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+    // Keeps the hereditary, linear-access and direct-access bits of the argument, with RowMajorBit cleared.
+    Flags = (unsigned int)evaluator<ArgType>::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit) & ~RowMajorBit,
+    
+    Alignment = 0
+  };
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& diagonal)
+    : m_argImpl(diagonal.nestedExpression()),
+      m_index(diagonal.index()) // diagonal index; its sign picks a row or a column offset (see rowOffset/colOffset)
+  { }
+ 
+  typedef typename XprType::Scalar Scalar;
+  // FIXME having to check whether ArgType is sparse here is not very nice.
+  typedef typename internal::conditional<!internal::is_same<typename ArgType::StorageKind,Sparse>::value,
+                                         typename XprType::CoeffReturnType,Scalar>::type CoeffReturnType;
+  // Two-index read access: the second index is ignored.
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index) const
+  {
+    return m_argImpl.coeff(row + rowOffset(), row + colOffset());
+  }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+  {
+    return m_argImpl.coeff(index + rowOffset(), index + colOffset());
+  }
+  // Two-index write access: the second index is ignored.
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index row, Index)
+  {
+    return m_argImpl.coeffRef(row + rowOffset(), row + colOffset());
+  }
+
+  EIGEN_DEVICE_FUNC Scalar& coeffRef(Index index)
+  {
+    return m_argImpl.coeffRef(index + rowOffset(), index + colOffset());
+  }
+
+protected:
+  evaluator<ArgType> m_argImpl;
+  const internal::variable_if_dynamicindex<Index, XprType::DiagIndex> m_index;
+
+private:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value() > 0 ? 0 : -m_index.value(); } // non-positive index: start |index| rows down
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value() > 0 ? m_index.value() : 0; } // positive index: start index columns to the right
+};
+
+
+//----------------------------------------------------------------------
+// deprecated code
+//----------------------------------------------------------------------
+
+// -------------------- EvalToTemp --------------------
+
+// expression class for evaluating nested expression to a temporary
+
+template<typename ArgType> class EvalToTemp;
+
+template<typename ArgType>
+struct traits<EvalToTemp<ArgType> >
+  : public traits<ArgType> // EvalToTemp reuses the traits of the expression it wraps, unchanged
+{ };
+
+template<typename ArgType>
+class EvalToTemp
+  : public dense_xpr_base<EvalToTemp<ArgType> >::type
+{
+ public:
+  // Expression wrapper that only stores a reference to its argument; its evaluator specialisation performs the evaluation.
+  typedef typename dense_xpr_base<EvalToTemp>::type Base;
+  EIGEN_GENERIC_PUBLIC_INTERFACE(EvalToTemp)
+  // Stores a reference to arg, which must outlive this object.
+  explicit EvalToTemp(const ArgType& arg)
+    : m_arg(arg)
+  { }
+  // Returns the wrapped expression.
+  const ArgType& arg() const
+  {
+    return m_arg;
+  }
+  // Row count, forwarded from the wrapped expression.
+  Index rows() const 
+  {
+    return m_arg.rows();
+  }
+  // Column count, forwarded from the wrapped expression.
+  Index cols() const 
+  {
+    return m_arg.cols();
+  }
+
+ private:
+  const ArgType& m_arg; // reference only; no copy of the expression is made
+};
+ 
+template<typename ArgType>
+struct evaluator<EvalToTemp<ArgType> >
+  : public evaluator<typename ArgType::PlainObject>
+{
+  typedef EvalToTemp<ArgType>                   XprType;
+  typedef typename ArgType::PlainObject         PlainObject;
+  typedef evaluator<PlainObject> Base;
+  // Builds m_result from the wrapped expression, then re-constructs the Base sub-object in place so that it evaluates m_result.
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
+    : m_result(xpr.arg())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result); // placement new: Base is not in the initializer list because m_result must exist first
+  }
+
+  // This constructor is used when nesting an EvalTo evaluator in another evaluator
+  EIGEN_DEVICE_FUNC evaluator(const ArgType& arg)
+    : m_result(arg)
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result); // same in-place re-construction as above
+  }
+
+protected:
+  PlainObject m_result; // temporary holding the evaluated argument; the Base evaluator reads from it
+};
+
+} // namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COREEVALUATORS_H
diff --git a/nuparu/include/Eigen/src/Core/CoreIterators.h b/nuparu/include/Eigen/src/Core/CoreIterators.h
index 6da4683d..4eb42b93 100644
--- a/nuparu/include/Eigen/src/Core/CoreIterators.h
+++ b/nuparu/include/Eigen/src/Core/CoreIterators.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -15,47 +15,113 @@ namespace Eigen {
 /* This file contains the respective InnerIterator definition of the expressions defined in Eigen/Core
  */
 
-/** \ingroup SparseCore_Module
-  * \class InnerIterator
-  * \brief An InnerIterator allows to loop over the element of a sparse (or dense) matrix or expression
-  *
-  * todo
+namespace internal {
+
+template<typename XprType, typename EvaluatorKind>
+class inner_iterator_selector;
+
+}
+
+/** \class InnerIterator
+  * \brief An InnerIterator allows looping over the elements of any matrix expression.
+  * 
+  * \warning To be used with care because an evaluator is constructed every time an InnerIterator is constructed.
+  * 
+  * TODO: add a usage example
   */
+template<typename XprType>
+class InnerIterator
+{
+protected:
+  typedef internal::inner_iterator_selector<XprType, typename internal::evaluator_traits<XprType>::Kind> IteratorType;
+  typedef internal::evaluator<XprType> EvaluatorType;
+  typedef typename internal::traits<XprType>::Scalar Scalar;
+public:
+  /** Construct an iterator over the \a outerId -th row or column of \a xpr */
+  InnerIterator(const XprType &xpr, const Index &outerId)
+    : m_eval(xpr), m_iter(m_eval, outerId, xpr.innerSize())
+  {}
+  
+  /// \returns the value of the current coefficient.
+  EIGEN_STRONG_INLINE Scalar value() const          { return m_iter.value(); }
+  /** Increment the iterator \c *this to the next non-zero coefficient.
+    * Explicit zeros are not skipped over. To skip explicit zeros, see class SparseView
+    */
+  EIGEN_STRONG_INLINE InnerIterator& operator++()   { m_iter.operator++(); return *this; }
+  /// \returns the column or row index of the current coefficient.
+  EIGEN_STRONG_INLINE Index index() const           { return m_iter.index(); }
+  /// \returns the row index of the current coefficient.
+  EIGEN_STRONG_INLINE Index row() const             { return m_iter.row(); }
+  /// \returns the column index of the current coefficient.
+  EIGEN_STRONG_INLINE Index col() const             { return m_iter.col(); }
+  /// \returns \c true if the iterator \c *this still references a valid coefficient.
+  EIGEN_STRONG_INLINE operator bool() const         { return m_iter; }
+  
+protected:
+  EvaluatorType m_eval;
+  IteratorType m_iter;
+private:
+  // If you get here, then you're not using the right InnerIterator type, e.g.:
+  //   SparseMatrix<double,RowMajor> A;
+  //   SparseMatrix<double>::InnerIterator it(A,0);
+  template<typename T> InnerIterator(const EigenBase<T>&,Index outer);
+};
+
+namespace internal {
 
-// generic version for dense matrix and expressions
-template<typename Derived> class DenseBase<Derived>::InnerIterator
+// Generic inner iterator implementation for dense objects
+template<typename XprType>
+class inner_iterator_selector<XprType, IndexBased>
 {
-  protected:
-    typedef typename Derived::Scalar Scalar;
-    typedef typename Derived::Index Index;
-
-    enum { IsRowMajor = (Derived::Flags&RowMajorBit)==RowMajorBit };
-  public:
-    EIGEN_STRONG_INLINE InnerIterator(const Derived& expr, Index outer)
-      : m_expression(expr), m_inner(0), m_outer(outer), m_end(expr.innerSize())
-    {}
-
-    EIGEN_STRONG_INLINE Scalar value() const
-    {
-      return (IsRowMajor) ? m_expression.coeff(m_outer, m_inner)
-                          : m_expression.coeff(m_inner, m_outer);
-    }
-
-    EIGEN_STRONG_INLINE InnerIterator& operator++() { m_inner++; return *this; }
-
-    EIGEN_STRONG_INLINE Index index() const { return m_inner; }
-    inline Index row() const { return IsRowMajor ? m_outer : index(); }
-    inline Index col() const { return IsRowMajor ? index() : m_outer; }
-
-    EIGEN_STRONG_INLINE operator bool() const { return m_inner < m_end && m_inner>=0; }
-
-  protected:
-    const Derived& m_expression;
-    Index m_inner;
-    const Index m_outer;
-    const Index m_end;
+protected:
+  typedef evaluator<XprType> EvaluatorType;
+  typedef typename traits<XprType>::Scalar Scalar;
+  enum { IsRowMajor = (XprType::Flags&RowMajorBit)==RowMajorBit };
+  
+public:
+  EIGEN_STRONG_INLINE inner_iterator_selector(const EvaluatorType &eval, const Index &outerId, const Index &innerSize)
+    : m_eval(eval), m_inner(0), m_outer(outerId), m_end(innerSize)
+  {}
+
+  EIGEN_STRONG_INLINE Scalar value() const
+  {
+    return (IsRowMajor) ? m_eval.coeff(m_outer, m_inner)
+                        : m_eval.coeff(m_inner, m_outer);
+  }
+
+  EIGEN_STRONG_INLINE inner_iterator_selector& operator++() { m_inner++; return *this; }
+
+  EIGEN_STRONG_INLINE Index index() const { return m_inner; }
+  inline Index row() const { return IsRowMajor ? m_outer : index(); }
+  inline Index col() const { return IsRowMajor ? index() : m_outer; }
+
+  EIGEN_STRONG_INLINE operator bool() const { return m_inner < m_end && m_inner>=0; }
+
+protected:
+  const EvaluatorType& m_eval;
+  Index m_inner;
+  const Index m_outer;
+  const Index m_end;
 };
 
+// For iterator-based evaluators, the inner iterator is already implemented as
+// evaluator<>::InnerIterator
+template<typename XprType>
+class inner_iterator_selector<XprType, IteratorBased>
+ : public evaluator<XprType>::InnerIterator
+{
+protected:
+  typedef typename evaluator<XprType>::InnerIterator Base;
+  typedef evaluator<XprType> EvaluatorType;
+  
+public:
+  EIGEN_STRONG_INLINE inner_iterator_selector(const EvaluatorType &eval, const Index &outerId, const Index &/*innerSize*/)
+    : Base(eval, outerId)
+  {}  
+};
+
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_COREITERATORS_H
diff --git a/nuparu/include/Eigen/src/Core/CwiseBinaryOp.h b/nuparu/include/Eigen/src/Core/CwiseBinaryOp.h
index 586f77aa..e42c3031 100644
--- a/nuparu/include/Eigen/src/Core/CwiseBinaryOp.h
+++ b/nuparu/include/Eigen/src/Core/CwiseBinaryOp.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -56,72 +56,51 @@ struct traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
                        typename Rhs::Scalar
                      )
                    >::type Scalar;
-  typedef typename promote_storage_type<typename traits<Lhs>::StorageKind,
-                                           typename traits<Rhs>::StorageKind>::ret StorageKind;
-  typedef typename promote_index_type<typename traits<Lhs>::Index,
-                                         typename traits<Rhs>::Index>::type Index;
+  typedef typename cwise_promote_storage_type<typename traits<Lhs>::StorageKind,
+                                              typename traits<Rhs>::StorageKind,
+                                              BinaryOp>::ret StorageKind;
+  typedef typename promote_index_type<typename traits<Lhs>::StorageIndex,
+                                      typename traits<Rhs>::StorageIndex>::type StorageIndex;
   typedef typename Lhs::Nested LhsNested;
   typedef typename Rhs::Nested RhsNested;
   typedef typename remove_reference<LhsNested>::type _LhsNested;
   typedef typename remove_reference<RhsNested>::type _RhsNested;
   enum {
-    LhsCoeffReadCost = _LhsNested::CoeffReadCost,
-    RhsCoeffReadCost = _RhsNested::CoeffReadCost,
-    LhsFlags = _LhsNested::Flags,
-    RhsFlags = _RhsNested::Flags,
-    SameType = is_same<typename _LhsNested::Scalar,typename _RhsNested::Scalar>::value,
-    StorageOrdersAgree = (int(Lhs::Flags)&RowMajorBit)==(int(Rhs::Flags)&RowMajorBit),
-    Flags0 = (int(LhsFlags) | int(RhsFlags)) & (
-        HereditaryBits
-      | (int(LhsFlags) & int(RhsFlags) &
-           ( AlignedBit
-           | (StorageOrdersAgree ? LinearAccessBit : 0)
-           | (functor_traits<BinaryOp>::PacketAccess && StorageOrdersAgree && SameType ? PacketAccessBit : 0)
-           )
-        )
-     ),
-    Flags = (Flags0 & ~RowMajorBit) | (LhsFlags & RowMajorBit),
-    CoeffReadCost = LhsCoeffReadCost + RhsCoeffReadCost + functor_traits<BinaryOp>::Cost
+    Flags = _LhsNested::Flags & RowMajorBit
   };
 };
 } // end namespace internal
 
-// we require Lhs and Rhs to have the same scalar type. Currently there is no example of a binary functor
-// that would take two operands of different types. If there were such an example, then this check should be
-// moved to the BinaryOp functors, on a per-case basis. This would however require a change in the BinaryOp functors, as
-// currently they take only one typename Scalar template parameter.
-// It is tempting to always allow mixing different types but remember that this is often impossible in the vectorized paths.
-// So allowing mixing different types gives very unexpected errors when enabling vectorization, when the user tries to
-// add together a float matrix and a double matrix.
-#define EIGEN_CHECK_BINARY_COMPATIBILIY(BINOP,LHS,RHS) \
-  EIGEN_STATIC_ASSERT((internal::functor_is_product_like<BINOP>::ret \
-                        ? int(internal::scalar_product_traits<LHS, RHS>::Defined) \
-                        : int(internal::is_same<LHS, RHS>::value)), \
-    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-
 template<typename BinaryOp, typename Lhs, typename Rhs, typename StorageKind>
 class CwiseBinaryOpImpl;
 
-template<typename BinaryOp, typename Lhs, typename Rhs>
-class CwiseBinaryOp : internal::no_assignment_operator,
+template<typename BinaryOp, typename LhsType, typename RhsType>
+class CwiseBinaryOp : 
   public CwiseBinaryOpImpl<
-          BinaryOp, Lhs, Rhs,
-          typename internal::promote_storage_type<typename internal::traits<Lhs>::StorageKind,
-                                           typename internal::traits<Rhs>::StorageKind>::ret>
+          BinaryOp, LhsType, RhsType,
+          typename internal::cwise_promote_storage_type<typename internal::traits<LhsType>::StorageKind,
+                                                        typename internal::traits<RhsType>::StorageKind,
+                                                        BinaryOp>::ret>,
+  internal::no_assignment_operator
 {
   public:
+    
+    typedef typename internal::remove_all<LhsType>::type Lhs;
+    typedef typename internal::remove_all<RhsType>::type Rhs;
 
     typedef typename CwiseBinaryOpImpl<
-        BinaryOp, Lhs, Rhs,
-        typename internal::promote_storage_type<typename internal::traits<Lhs>::StorageKind,
-                                         typename internal::traits<Rhs>::StorageKind>::ret>::Base Base;
+        BinaryOp, LhsType, RhsType,
+        typename internal::cwise_promote_storage_type<typename internal::traits<LhsType>::StorageKind,
+                                                      typename internal::traits<Rhs>::StorageKind,
+                                                      BinaryOp>::ret>::Base Base;
     EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseBinaryOp)
 
-    typedef typename internal::nested<Lhs>::type LhsNested;
-    typedef typename internal::nested<Rhs>::type RhsNested;
+    typedef typename internal::ref_selector<LhsType>::type LhsNested;
+    typedef typename internal::ref_selector<RhsType>::type RhsNested;
     typedef typename internal::remove_reference<LhsNested>::type _LhsNested;
     typedef typename internal::remove_reference<RhsNested>::type _RhsNested;
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp())
       : m_lhs(aLhs), m_rhs(aRhs), m_functor(func)
     {
@@ -131,6 +110,7 @@ class CwiseBinaryOp : internal::no_assignment_operator,
       eigen_assert(aLhs.rows() == aRhs.rows() && aLhs.cols() == aRhs.cols());
     }
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Index rows() const {
       // return the fixed size type if available to enable compile time optimizations
       if (internal::traits<typename internal::remove_all<LhsNested>::type>::RowsAtCompileTime==Dynamic)
@@ -138,6 +118,7 @@ class CwiseBinaryOp : internal::no_assignment_operator,
       else
         return m_lhs.rows();
     }
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Index cols() const {
       // return the fixed size type if available to enable compile time optimizations
       if (internal::traits<typename internal::remove_all<LhsNested>::type>::ColsAtCompileTime==Dynamic)
@@ -147,10 +128,13 @@ class CwiseBinaryOp : internal::no_assignment_operator,
     }
 
     /** \returns the left hand side nested expression */
+    EIGEN_DEVICE_FUNC
     const _LhsNested& lhs() const { return m_lhs; }
     /** \returns the right hand side nested expression */
+    EIGEN_DEVICE_FUNC
     const _RhsNested& rhs() const { return m_rhs; }
     /** \returns the functor representing the binary operation */
+    EIGEN_DEVICE_FUNC
     const BinaryOp& functor() const { return m_functor; }
 
   protected:
@@ -159,41 +143,13 @@ class CwiseBinaryOp : internal::no_assignment_operator,
     const BinaryOp m_functor;
 };
 
-template<typename BinaryOp, typename Lhs, typename Rhs>
-class CwiseBinaryOpImpl<BinaryOp, Lhs, Rhs, Dense>
-  : public internal::dense_xpr_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type
+// Generic API dispatcher
+template<typename BinaryOp, typename Lhs, typename Rhs, typename StorageKind>
+class CwiseBinaryOpImpl
+  : public internal::generic_xpr_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type
 {
-    typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> Derived;
-  public:
-
-    typedef typename internal::dense_xpr_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE( Derived )
-
-    EIGEN_STRONG_INLINE const Scalar coeff(Index rowId, Index colId) const
-    {
-      return derived().functor()(derived().lhs().coeff(rowId, colId),
-                                 derived().rhs().coeff(rowId, colId));
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index rowId, Index colId) const
-    {
-      return derived().functor().packetOp(derived().lhs().template packet<LoadMode>(rowId, colId),
-                                          derived().rhs().template packet<LoadMode>(rowId, colId));
-    }
-
-    EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
-    {
-      return derived().functor()(derived().lhs().coeff(index),
-                                 derived().rhs().coeff(index));
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index index) const
-    {
-      return derived().functor().packetOp(derived().lhs().template packet<LoadMode>(index),
-                                          derived().rhs().template packet<LoadMode>(index));
-    }
+public:
+  typedef typename internal::generic_xpr_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type Base;
 };
 
 /** replaces \c *this by \c *this - \a other.
@@ -205,8 +161,7 @@ template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
 {
-  SelfCwiseBinaryOp<internal::scalar_difference_op<Scalar>, Derived, OtherDerived> tmp(derived());
-  tmp = other.derived();
+  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar>());
   return derived();
 }
 
@@ -219,11 +174,11 @@ template<typename OtherDerived>
 EIGEN_STRONG_INLINE Derived &
 MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
 {
-  SelfCwiseBinaryOp<internal::scalar_sum_op<Scalar>, Derived, OtherDerived> tmp(derived());
-  tmp = other.derived();
+  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar>());
   return derived();
 }
 
 } // end namespace Eigen
 
 #endif // EIGEN_CWISE_BINARY_OP_H
+
diff --git a/nuparu/include/Eigen/src/Core/CwiseNullaryOp.h b/nuparu/include/Eigen/src/Core/CwiseNullaryOp.h
index a93bab2d..2bc6933d 100644
--- a/nuparu/include/Eigen/src/Core/CwiseNullaryOp.h
+++ b/nuparu/include/Eigen/src/Core/CwiseNullaryOp.h
@@ -35,37 +35,35 @@ template<typename NullaryOp, typename PlainObjectType>
 struct traits<CwiseNullaryOp<NullaryOp, PlainObjectType> > : traits<PlainObjectType>
 {
   enum {
-    Flags = (traits<PlainObjectType>::Flags
-      & (  HereditaryBits
-         | (functor_has_linear_access<NullaryOp>::ret ? LinearAccessBit : 0)
-         | (functor_traits<NullaryOp>::PacketAccess ? PacketAccessBit : 0)))
-      | (functor_traits<NullaryOp>::IsRepeatable ? 0 : EvalBeforeNestingBit),
-    CoeffReadCost = functor_traits<NullaryOp>::Cost
+    Flags = traits<PlainObjectType>::Flags & RowMajorBit
   };
 };
 }
 
 template<typename NullaryOp, typename PlainObjectType>
-class CwiseNullaryOp : internal::no_assignment_operator,
-  public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp, PlainObjectType> >::type
+class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp, PlainObjectType> >::type, internal::no_assignment_operator
 {
   public:
 
     typedef typename internal::dense_xpr_base<CwiseNullaryOp>::type Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(CwiseNullaryOp)
 
-    CwiseNullaryOp(Index nbRows, Index nbCols, const NullaryOp& func = NullaryOp())
-      : m_rows(nbRows), m_cols(nbCols), m_functor(func)
+    EIGEN_DEVICE_FUNC
+    CwiseNullaryOp(Index rows, Index cols, const NullaryOp& func = NullaryOp())
+      : m_rows(rows), m_cols(cols), m_functor(func)
     {
-      eigen_assert(nbRows >= 0
-            && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == nbRows)
-            &&  nbCols >= 0
-            && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == nbCols));
+      eigen_assert(rows >= 0
+            && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows)
+            &&  cols >= 0
+            && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols));
     }
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Index rows() const { return m_rows.value(); }
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Index cols() const { return m_cols.value(); }
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Scalar coeff(Index rowId, Index colId) const
     {
       return m_functor(rowId, colId);
@@ -77,6 +75,7 @@ class CwiseNullaryOp : internal::no_assignment_operator,
       return m_functor.packetOp(rowId, colId);
     }
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
     {
       return m_functor(index);
@@ -89,6 +88,7 @@ class CwiseNullaryOp : internal::no_assignment_operator,
     }
 
     /** \returns the functor representing the nullary operation */
+    EIGEN_DEVICE_FUNC
     const NullaryOp& functor() const { return m_functor; }
 
   protected:
@@ -113,10 +113,10 @@ class CwiseNullaryOp : internal::no_assignment_operator,
   */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, Derived>
+EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
 DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func)
 {
-  return CwiseNullaryOp<CustomNullaryOp, Derived>(rows, cols, func);
+  return CwiseNullaryOp<CustomNullaryOp, PlainObject>(rows, cols, func);
 }
 
 /** \returns an expression of a matrix defined by a custom functor \a func
@@ -132,16 +132,19 @@ DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& f
   *
   * The template parameter \a CustomNullaryOp is the type of the functor.
   *
+  * Here is an example with C++11 random generators: \include random_cpp11.cpp
+  * Output: \verbinclude random_cpp11.out
+  * 
   * \sa class CwiseNullaryOp
   */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, Derived>
+EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
 DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  if(RowsAtCompileTime == 1) return CwiseNullaryOp<CustomNullaryOp, Derived>(1, size, func);
-  else return CwiseNullaryOp<CustomNullaryOp, Derived>(size, 1, func);
+  if(RowsAtCompileTime == 1) return CwiseNullaryOp<CustomNullaryOp, PlainObject>(1, size, func);
+  else return CwiseNullaryOp<CustomNullaryOp, PlainObject>(size, 1, func);
 }
 
 /** \returns an expression of a matrix defined by a custom functor \a func
@@ -155,19 +158,19 @@ DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
   */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, Derived>
+EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
 DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
 {
-  return CwiseNullaryOp<CustomNullaryOp, Derived>(RowsAtCompileTime, ColsAtCompileTime, func);
+  return CwiseNullaryOp<CustomNullaryOp, PlainObject>(RowsAtCompileTime, ColsAtCompileTime, func);
 }
 
 /** \returns an expression of a constant matrix of value \a value
   *
-  * The parameters \a nbRows and \a nbCols are the number of rows and of columns of
+  * The parameters \a rows and \a cols are the number of rows and of columns of
   * the returned matrix. Must be compatible with this DenseBase type.
   *
   * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
-  * it is redundant to pass \a nbRows and \a nbCols as arguments, so Zero() should be used
+  * it is redundant to pass \a rows and \a cols as arguments, so Constant(const Scalar&) should be used
   * instead.
   *
   * The template parameter \a CustomNullaryOp is the type of the functor.
@@ -176,9 +179,9 @@ DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
   */
 template<typename Derived>
 EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
-DenseBase<Derived>::Constant(Index nbRows, Index nbCols, const Scalar& value)
+DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value)
 {
-  return DenseBase<Derived>::NullaryExpr(nbRows, nbCols, internal::scalar_constant_op<Scalar>(value));
+  return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_constant_op<Scalar>(value));
 }
 
 /** \returns an expression of a constant matrix of value \a value
@@ -242,7 +245,7 @@ EIGEN_STRONG_INLINE const typename DenseBase<Derived>::SequentialLinSpacedReturn
 DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,false>(low,high,size));
+  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,PacketScalar,false>(low,high,size));
 }
 
 /**
@@ -255,7 +258,7 @@ DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& hig
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
-  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,false>(low,high,Derived::SizeAtCompileTime));
+  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,PacketScalar,false>(low,high,Derived::SizeAtCompileTime));
 }
 
 /**
@@ -276,7 +279,7 @@ EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedRetu
 DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,true>(low,high,size));
+  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,PacketScalar,true>(low,high,size));
 }
 
 /**
@@ -289,7 +292,7 @@ DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
-  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,true>(low,high,Derived::SizeAtCompileTime));
+  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,PacketScalar,true>(low,high,Derived::SizeAtCompileTime));
 }
 
 /** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */
@@ -297,9 +300,10 @@ template<typename Derived>
 bool DenseBase<Derived>::isApproxToConstant
 (const Scalar& val, const RealScalar& prec) const
 {
+  typename internal::nested_eval<Derived,1>::type self(derived());
   for(Index j = 0; j < cols(); ++j)
     for(Index i = 0; i < rows(); ++i)
-      if(!internal::isApprox(this->coeff(i, j), val, prec))
+      if(!internal::isApprox(self.coeff(i, j), val, prec))
         return false;
   return true;
 }
@@ -353,8 +357,8 @@ PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
 
 /** Resizes to the given size, and sets all coefficients in this expression to the given \a value.
   *
-  * \param nbRows the new number of rows
-  * \param nbCols the new number of columns
+  * \param rows the new number of rows
+  * \param cols the new number of columns
   * \param val the value to which all coefficients are set
   *
   * Example: \include Matrix_setConstant_int_int.cpp
@@ -364,9 +368,9 @@ PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
   */
 template<typename Derived>
 EIGEN_STRONG_INLINE Derived&
-PlainObjectBase<Derived>::setConstant(Index nbRows, Index nbCols, const Scalar& val)
+PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val)
 {
-  resize(nbRows, nbCols);
+  resize(rows, cols);
   return setConstant(val);
 }
 
@@ -387,7 +391,7 @@ template<typename Derived>
 EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar,false>(low,high,newSize));
+  return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar,PacketScalar,false>(low,high,newSize));
 }
 
 /**
@@ -425,9 +429,9 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low,
   */
 template<typename Derived>
 EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
-DenseBase<Derived>::Zero(Index nbRows, Index nbCols)
+DenseBase<Derived>::Zero(Index rows, Index cols)
 {
-  return Constant(nbRows, nbCols, Scalar(0));
+  return Constant(rows, cols, Scalar(0));
 }
 
 /** \returns an expression of a zero vector.
@@ -481,9 +485,10 @@ DenseBase<Derived>::Zero()
 template<typename Derived>
 bool DenseBase<Derived>::isZero(const RealScalar& prec) const
 {
+  typename internal::nested_eval<Derived,1>::type self(derived());
   for(Index j = 0; j < cols(); ++j)
     for(Index i = 0; i < rows(); ++i)
-      if(!internal::isMuchSmallerThan(this->coeff(i, j), static_cast<Scalar>(1), prec))
+      if(!internal::isMuchSmallerThan(self.coeff(i, j), static_cast<Scalar>(1), prec))
         return false;
   return true;
 }
@@ -520,8 +525,8 @@ PlainObjectBase<Derived>::setZero(Index newSize)
 
 /** Resizes to the given size, and sets all coefficients in this expression to zero.
   *
-  * \param nbRows the new number of rows
-  * \param nbCols the new number of columns
+  * \param rows the new number of rows
+  * \param cols the new number of columns
   *
   * Example: \include Matrix_setZero_int_int.cpp
   * Output: \verbinclude Matrix_setZero_int_int.out
@@ -530,9 +535,9 @@ PlainObjectBase<Derived>::setZero(Index newSize)
   */
 template<typename Derived>
 EIGEN_STRONG_INLINE Derived&
-PlainObjectBase<Derived>::setZero(Index nbRows, Index nbCols)
+PlainObjectBase<Derived>::setZero(Index rows, Index cols)
 {
-  resize(nbRows, nbCols);
+  resize(rows, cols);
   return setConstant(Scalar(0));
 }
 
@@ -540,7 +545,7 @@ PlainObjectBase<Derived>::setZero(Index nbRows, Index nbCols)
 
 /** \returns an expression of a matrix where all coefficients equal one.
   *
-  * The parameters \a nbRows and \a nbCols are the number of rows and of columns of
+  * The parameters \a rows and \a cols are the number of rows and of columns of
   * the returned matrix. Must be compatible with this MatrixBase type.
   *
   * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
@@ -554,9 +559,9 @@ PlainObjectBase<Derived>::setZero(Index nbRows, Index nbCols)
   */
 template<typename Derived>
 EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
-DenseBase<Derived>::Ones(Index nbRows, Index nbCols)
+DenseBase<Derived>::Ones(Index rows, Index cols)
 {
-  return Constant(nbRows, nbCols, Scalar(1));
+  return Constant(rows, cols, Scalar(1));
 }
 
 /** \returns an expression of a vector where all coefficients equal one.
@@ -646,8 +651,8 @@ PlainObjectBase<Derived>::setOnes(Index newSize)
 
 /** Resizes to the given size, and sets all coefficients in this expression to one.
   *
-  * \param nbRows the new number of rows
-  * \param nbCols the new number of columns
+  * \param rows the new number of rows
+  * \param cols the new number of columns
   *
   * Example: \include Matrix_setOnes_int_int.cpp
   * Output: \verbinclude Matrix_setOnes_int_int.out
@@ -656,9 +661,9 @@ PlainObjectBase<Derived>::setOnes(Index newSize)
   */
 template<typename Derived>
 EIGEN_STRONG_INLINE Derived&
-PlainObjectBase<Derived>::setOnes(Index nbRows, Index nbCols)
+PlainObjectBase<Derived>::setOnes(Index rows, Index cols)
 {
-  resize(nbRows, nbCols);
+  resize(rows, cols);
   return setConstant(Scalar(1));
 }
 
@@ -666,7 +671,7 @@ PlainObjectBase<Derived>::setOnes(Index nbRows, Index nbCols)
 
 /** \returns an expression of the identity matrix (not necessarily square).
   *
-  * The parameters \a nbRows and \a nbCols are the number of rows and of columns of
+  * The parameters \a rows and \a cols are the number of rows and of columns of
   * the returned matrix. Must be compatible with this MatrixBase type.
   *
   * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
@@ -680,9 +685,9 @@ PlainObjectBase<Derived>::setOnes(Index nbRows, Index nbCols)
   */
 template<typename Derived>
 EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
-MatrixBase<Derived>::Identity(Index nbRows, Index nbCols)
+MatrixBase<Derived>::Identity(Index rows, Index cols)
 {
-  return DenseBase<Derived>::NullaryExpr(nbRows, nbCols, internal::scalar_identity_op<Scalar>());
+  return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_identity_op<Scalar>());
 }
 
 /** \returns an expression of the identity matrix (not necessarily square).
@@ -716,18 +721,19 @@ template<typename Derived>
 bool MatrixBase<Derived>::isIdentity
 (const RealScalar& prec) const
 {
+  typename internal::nested_eval<Derived,1>::type self(derived());
   for(Index j = 0; j < cols(); ++j)
   {
     for(Index i = 0; i < rows(); ++i)
     {
       if(i == j)
       {
-        if(!internal::isApprox(this->coeff(i, j), static_cast<Scalar>(1), prec))
+        if(!internal::isApprox(self.coeff(i, j), static_cast<Scalar>(1), prec))
           return false;
       }
       else
       {
-        if(!internal::isMuchSmallerThan(this->coeff(i, j), static_cast<RealScalar>(1), prec))
+        if(!internal::isMuchSmallerThan(self.coeff(i, j), static_cast<RealScalar>(1), prec))
           return false;
       }
     }
@@ -740,6 +746,7 @@ namespace internal {
 template<typename Derived, bool Big = (Derived::SizeAtCompileTime>=16)>
 struct setIdentity_impl
 {
+  EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE Derived& run(Derived& m)
   {
     return m = Derived::Identity(m.rows(), m.cols());
@@ -749,7 +756,7 @@ struct setIdentity_impl
 template<typename Derived>
 struct setIdentity_impl<Derived, true>
 {
-  typedef typename Derived::Index Index;
+  EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE Derived& run(Derived& m)
   {
     m.setZero();
@@ -776,8 +783,8 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
 
 /** \brief Resizes to the given size, and writes the identity expression (not necessarily square) into *this.
   *
-  * \param nbRows the new number of rows
-  * \param nbCols the new number of columns
+  * \param rows the new number of rows
+  * \param cols the new number of columns
   *
   * Example: \include Matrix_setIdentity_int_int.cpp
   * Output: \verbinclude Matrix_setIdentity_int_int.out
@@ -785,9 +792,9 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
   * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Identity()
   */
 template<typename Derived>
-EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index nbRows, Index nbCols)
+EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index cols)
 {
-  derived().resize(nbRows, nbCols);
+  derived().resize(rows, cols);
   return setIdentity();
 }
 
diff --git a/nuparu/include/Eigen/src/Core/CwiseUnaryOp.h b/nuparu/include/Eigen/src/Core/CwiseUnaryOp.h
index f2de749f..da1d1992 100644
--- a/nuparu/include/Eigen/src/Core/CwiseUnaryOp.h
+++ b/nuparu/include/Eigen/src/Core/CwiseUnaryOp.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -44,10 +44,7 @@ struct traits<CwiseUnaryOp<UnaryOp, XprType> >
   typedef typename XprType::Nested XprTypeNested;
   typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
   enum {
-    Flags = _XprTypeNested::Flags & (
-      HereditaryBits | LinearAccessBit | AlignedBit
-      | (functor_traits<UnaryOp>::PacketAccess ? PacketAccessBit : 0)),
-    CoeffReadCost = _XprTypeNested::CoeffReadCost + functor_traits<UnaryOp>::Cost
+    Flags = _XprTypeNested::Flags & RowMajorBit 
   };
 };
 }
@@ -56,28 +53,34 @@ template<typename UnaryOp, typename XprType, typename StorageKind>
 class CwiseUnaryOpImpl;
 
 template<typename UnaryOp, typename XprType>
-class CwiseUnaryOp : internal::no_assignment_operator,
-  public CwiseUnaryOpImpl<UnaryOp, XprType, typename internal::traits<XprType>::StorageKind>
+class CwiseUnaryOp : public CwiseUnaryOpImpl<UnaryOp, XprType, typename internal::traits<XprType>::StorageKind>, internal::no_assignment_operator
 {
   public:
 
     typedef typename CwiseUnaryOpImpl<UnaryOp, XprType,typename internal::traits<XprType>::StorageKind>::Base Base;
     EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryOp)
+    typedef typename internal::remove_all<XprType>::type NestedExpression;
 
-    inline CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
+    EIGEN_DEVICE_FUNC
+    explicit inline CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
       : m_xpr(xpr), m_functor(func) {}
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Index rows() const { return m_xpr.rows(); }
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Index cols() const { return m_xpr.cols(); }
 
     /** \returns the functor representing the unary operation */
+    EIGEN_DEVICE_FUNC
     const UnaryOp& functor() const { return m_functor; }
 
     /** \returns the nested expression */
+    EIGEN_DEVICE_FUNC
     const typename internal::remove_all<typename XprType::Nested>::type&
     nestedExpression() const { return m_xpr; }
 
     /** \returns the nested expression */
+    EIGEN_DEVICE_FUNC
     typename internal::remove_all<typename XprType::Nested>::type&
     nestedExpression() { return m_xpr.const_cast_derived(); }
 
@@ -86,39 +89,13 @@ class CwiseUnaryOp : internal::no_assignment_operator,
     const UnaryOp m_functor;
 };
 
-// This is the generic implementation for dense storage.
-// It can be used for any expression types implementing the dense concept.
-template<typename UnaryOp, typename XprType>
-class CwiseUnaryOpImpl<UnaryOp,XprType,Dense>
-  : public internal::dense_xpr_base<CwiseUnaryOp<UnaryOp, XprType> >::type
+// Generic API dispatcher
+template<typename UnaryOp, typename XprType, typename StorageKind>
+class CwiseUnaryOpImpl
+  : public internal::generic_xpr_base<CwiseUnaryOp<UnaryOp, XprType> >::type
 {
-  public:
-
-    typedef CwiseUnaryOp<UnaryOp, XprType> Derived;
-    typedef typename internal::dense_xpr_base<CwiseUnaryOp<UnaryOp, XprType> >::type Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
-
-    EIGEN_STRONG_INLINE const Scalar coeff(Index rowId, Index colId) const
-    {
-      return derived().functor()(derived().nestedExpression().coeff(rowId, colId));
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index rowId, Index colId) const
-    {
-      return derived().functor().packetOp(derived().nestedExpression().template packet<LoadMode>(rowId, colId));
-    }
-
-    EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
-    {
-      return derived().functor()(derived().nestedExpression().coeff(index));
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index index) const
-    {
-      return derived().functor().packetOp(derived().nestedExpression().template packet<LoadMode>(index));
-    }
+public:
+  typedef typename internal::generic_xpr_base<CwiseUnaryOp<UnaryOp, XprType> >::type Base;
 };
 
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Core/CwiseUnaryView.h b/nuparu/include/Eigen/src/Core/CwiseUnaryView.h
index b2638d32..72244751 100644
--- a/nuparu/include/Eigen/src/Core/CwiseUnaryView.h
+++ b/nuparu/include/Eigen/src/Core/CwiseUnaryView.h
@@ -37,8 +37,8 @@ struct traits<CwiseUnaryView<ViewOp, MatrixType> >
   typedef typename MatrixType::Nested MatrixTypeNested;
   typedef typename remove_all<MatrixTypeNested>::type _MatrixTypeNested;
   enum {
-    Flags = (traits<_MatrixTypeNested>::Flags & (HereditaryBits | LvalueBit | LinearAccessBit | DirectAccessBit)),
-    CoeffReadCost = traits<_MatrixTypeNested>::CoeffReadCost + functor_traits<ViewOp>::Cost,
+    FlagsLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
+    Flags = traits<_MatrixTypeNested>::Flags & (RowMajorBit | FlagsLvalueBit | DirectAccessBit), // FIXME DirectAccessBit should not be handled by expressions
     MatrixTypeInnerStride =  inner_stride_at_compile_time<MatrixType>::ret,
     // need to cast the sizeof's from size_t to int explicitly, otherwise:
     // "error: no integral type can represent all of the enumerator values
@@ -62,8 +62,9 @@ class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename in
 
     typedef typename CwiseUnaryViewImpl<ViewOp, MatrixType,typename internal::traits<MatrixType>::StorageKind>::Base Base;
     EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryView)
+    typedef typename internal::remove_all<MatrixType>::type NestedExpression;
 
-    inline CwiseUnaryView(const MatrixType& mat, const ViewOp& func = ViewOp())
+    explicit inline CwiseUnaryView(MatrixType& mat, const ViewOp& func = ViewOp())
       : m_matrix(mat), m_functor(func) {}
 
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryView)
@@ -83,11 +84,19 @@ class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename in
     nestedExpression() { return m_matrix.const_cast_derived(); }
 
   protected:
-    // FIXME changed from MatrixType::Nested because of a weird compilation error with sun CC
-    typename internal::nested<MatrixType>::type m_matrix;
+    typename internal::ref_selector<MatrixType>::type m_matrix;
     ViewOp m_functor;
 };
 
+// Generic API dispatcher
+template<typename ViewOp, typename XprType, typename StorageKind>
+class CwiseUnaryViewImpl
+  : public internal::generic_xpr_base<CwiseUnaryView<ViewOp, XprType> >::type
+{
+public:
+  typedef typename internal::generic_xpr_base<CwiseUnaryView<ViewOp, XprType> >::type Base;
+};
+
 template<typename ViewOp, typename MatrixType>
 class CwiseUnaryViewImpl<ViewOp,MatrixType,Dense>
   : public internal::dense_xpr_base< CwiseUnaryView<ViewOp, MatrixType> >::type
@@ -100,38 +109,18 @@ class CwiseUnaryViewImpl<ViewOp,MatrixType,Dense>
     EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryViewImpl)
     
-    inline Scalar* data() { return &coeffRef(0); }
-    inline const Scalar* data() const { return &coeff(0); }
+    EIGEN_DEVICE_FUNC inline Scalar* data() { return &(this->coeffRef(0)); }
+    EIGEN_DEVICE_FUNC inline const Scalar* data() const { return &(this->coeff(0)); }
 
-    inline Index innerStride() const
+    EIGEN_DEVICE_FUNC inline Index innerStride() const
     {
       return derived().nestedExpression().innerStride() * sizeof(typename internal::traits<MatrixType>::Scalar) / sizeof(Scalar);
     }
 
-    inline Index outerStride() const
+    EIGEN_DEVICE_FUNC inline Index outerStride() const
     {
       return derived().nestedExpression().outerStride() * sizeof(typename internal::traits<MatrixType>::Scalar) / sizeof(Scalar);
     }
-
-    EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const
-    {
-      return derived().functor()(derived().nestedExpression().coeff(row, col));
-    }
-
-    EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
-    {
-      return derived().functor()(derived().nestedExpression().coeff(index));
-    }
-
-    EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col)
-    {
-      return derived().functor()(const_cast_derived().nestedExpression().coeffRef(row, col));
-    }
-
-    EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
-    {
-      return derived().functor()(const_cast_derived().nestedExpression().coeffRef(index));
-    }
 };
 
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Core/DenseBase.h b/nuparu/include/Eigen/src/Core/DenseBase.h
index c5800f6c..e181dafa 100644
--- a/nuparu/include/Eigen/src/Core/DenseBase.h
+++ b/nuparu/include/Eigen/src/Core/DenseBase.h
@@ -40,31 +40,43 @@ static inline void check_DenseIndex_is_signed() {
   */
 template<typename Derived> class DenseBase
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-  : public internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
-                                     typename NumTraits<typename internal::traits<Derived>::Scalar>::Real>
+  : public internal::special_scalar_op_base<Derived, typename internal::traits<Derived>::Scalar,
+                                            typename NumTraits<typename internal::traits<Derived>::Scalar>::Real,
+                                            DenseCoeffsBase<Derived> >
 #else
   : public DenseCoeffsBase<Derived>
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 {
   public:
-    using internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
-                typename NumTraits<typename internal::traits<Derived>::Scalar>::Real>::operator*;
 
-    class InnerIterator;
+    /** Inner iterator type to iterate over the coefficients of a row or column.
+      * \sa class InnerIterator
+      */
+    typedef Eigen::InnerIterator<Derived> InnerIterator;
 
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
 
-    /** \brief The type of indices 
-      * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE.
-      * \sa \ref TopicPreprocessorDirectives.
-      */
-    typedef typename internal::traits<Derived>::Index Index; 
+    /**
+      * \brief The type used to store indices
+      * \details This typedef is relevant for types that store multiple indices such as
+      *          PermutationMatrix or Transpositions, otherwise it defaults to Eigen::Index
+      * \sa \ref TopicPreprocessorDirectives, Eigen::Index, SparseMatrixBase.
+     */
+    typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
 
+    /** The numeric type of the expression's coefficients, e.g. float, double, int or std::complex<float>, etc. */
     typedef typename internal::traits<Derived>::Scalar Scalar;
-    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
+    
+    /** The numeric type of the expression's coefficients, e.g. float, double, int or std::complex<float>, etc.
+      *
+      * It is an alias for the Scalar type */
+    typedef Scalar value_type;
+    
     typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef internal::special_scalar_op_base<Derived,Scalar,RealScalar, DenseCoeffsBase<Derived> > Base;
 
-    typedef DenseCoeffsBase<Derived> Base;
+    using Base::operator*;
+    using Base::operator/;
     using Base::derived;
     using Base::const_cast_derived;
     using Base::rows;
@@ -74,16 +86,6 @@ template<typename Derived> class DenseBase
     using Base::colIndexByOuterInner;
     using Base::coeff;
     using Base::coeffByOuterInner;
-    using Base::packet;
-    using Base::packetByOuterInner;
-    using Base::writePacket;
-    using Base::writePacketByOuterInner;
-    using Base::coeffRef;
-    using Base::coeffRefByOuterInner;
-    using Base::copyCoeff;
-    using Base::copyCoeffByOuterInner;
-    using Base::copyPacket;
-    using Base::copyPacketByOuterInner;
     using Base::operator();
     using Base::operator[];
     using Base::x;
@@ -169,30 +171,54 @@ template<typename Derived> class DenseBase
       InnerSizeAtCompileTime = int(IsVectorAtCompileTime) ? int(SizeAtCompileTime)
                              : int(IsRowMajor) ? int(ColsAtCompileTime) : int(RowsAtCompileTime),
 
-      CoeffReadCost = internal::traits<Derived>::CoeffReadCost,
-        /**< This is a rough measure of how expensive it is to read one coefficient from
-          * this expression.
-          */
-
       InnerStrideAtCompileTime = internal::inner_stride_at_compile_time<Derived>::ret,
       OuterStrideAtCompileTime = internal::outer_stride_at_compile_time<Derived>::ret
     };
+    
+    typedef typename internal::find_best_packet<Scalar,SizeAtCompileTime>::type PacketScalar;
 
-    enum { ThisConstantIsPrivateInPlainObjectBase };
+    enum { IsPlainObjectBase = 0 };
+    
+    /** The plain matrix type corresponding to this expression.
+      * \sa PlainObject */
+    typedef Matrix<typename internal::traits<Derived>::Scalar,
+                internal::traits<Derived>::RowsAtCompileTime,
+                internal::traits<Derived>::ColsAtCompileTime,
+                AutoAlign | (internal::traits<Derived>::Flags&RowMajorBit ? RowMajor : ColMajor),
+                internal::traits<Derived>::MaxRowsAtCompileTime,
+                internal::traits<Derived>::MaxColsAtCompileTime
+          > PlainMatrix;
+    
+    /** The plain array type corresponding to this expression.
+      * \sa PlainObject */
+    typedef Array<typename internal::traits<Derived>::Scalar,
+                internal::traits<Derived>::RowsAtCompileTime,
+                internal::traits<Derived>::ColsAtCompileTime,
+                AutoAlign | (internal::traits<Derived>::Flags&RowMajorBit ? RowMajor : ColMajor),
+                internal::traits<Derived>::MaxRowsAtCompileTime,
+                internal::traits<Derived>::MaxColsAtCompileTime
+          > PlainArray;
+
+    /** \brief The plain matrix or array type corresponding to this expression.
+      *
+      * This is not necessarily exactly the return type of eval(). In the case of plain matrices,
+      * the return type of eval() is a const reference to a matrix, not a matrix! It is however guaranteed
+      * that the return type of eval() is either PlainObject or const PlainObject&.
+      */
+    typedef typename internal::conditional<internal::is_same<typename internal::traits<Derived>::XprKind,MatrixXpr >::value,
+                                 PlainMatrix, PlainArray>::type PlainObject;
 
     /** \returns the number of nonzero coefficients which is in practice the number
       * of stored coefficients. */
+    EIGEN_DEVICE_FUNC
     inline Index nonZeros() const { return size(); }
-    /** \returns true if either the number of rows or the number of columns is equal to 1.
-      * In other words, this function returns
-      * \code rows()==1 || cols()==1 \endcode
-      * \sa rows(), cols(), IsVectorAtCompileTime. */
 
     /** \returns the outer size.
       *
       * \note For a vector, this returns just 1. For a matrix (non-vector), this is the major dimension
       * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of columns for a
       * column-major matrix, and the number of rows for a row-major matrix. */
+    EIGEN_DEVICE_FUNC
     Index outerSize() const
     {
       return IsVectorAtCompileTime ? 1
@@ -204,6 +230,7 @@ template<typename Derived> class DenseBase
       * \note For a vector, this is just the size. For a matrix (non-vector), this is the minor dimension
       * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of rows for a 
       * column-major matrix, and the number of columns for a row-major matrix. */
+    EIGEN_DEVICE_FUNC
     Index innerSize() const
     {
       return IsVectorAtCompileTime ? this->size()
@@ -214,6 +241,7 @@ template<typename Derived> class DenseBase
       * Matrix::resize() and Array::resize(). The present method only asserts that the new size equals the old size, and does
       * nothing else.
       */
+    EIGEN_DEVICE_FUNC
     void resize(Index newSize)
     {
       EIGEN_ONLY_USED_FOR_DEBUG(newSize);
@@ -224,22 +252,22 @@ template<typename Derived> class DenseBase
       * Matrix::resize() and Array::resize(). The present method only asserts that the new size equals the old size, and does
       * nothing else.
       */
-    void resize(Index nbRows, Index nbCols)
+    EIGEN_DEVICE_FUNC
+    void resize(Index rows, Index cols)
     {
-      EIGEN_ONLY_USED_FOR_DEBUG(nbRows);
-      EIGEN_ONLY_USED_FOR_DEBUG(nbCols);
-      eigen_assert(nbRows == this->rows() && nbCols == this->cols()
+      EIGEN_ONLY_USED_FOR_DEBUG(rows);
+      EIGEN_ONLY_USED_FOR_DEBUG(cols);
+      eigen_assert(rows == this->rows() && cols == this->cols()
                 && "DenseBase::resize() does not actually allow to resize.");
     }
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-
     /** \internal Represents a matrix with all coefficients equal to one another*/
-    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,Derived> ConstantReturnType;
+    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,PlainObject> ConstantReturnType;
     /** \internal Represents a vector with linearly spaced coefficients that allows sequential access only. */
-    typedef CwiseNullaryOp<internal::linspaced_op<Scalar,false>,Derived> SequentialLinSpacedReturnType;
+    typedef CwiseNullaryOp<internal::linspaced_op<Scalar,PacketScalar,false>,PlainObject> SequentialLinSpacedReturnType;
     /** \internal Represents a vector with linearly spaced coefficients that allows random access. */
-    typedef CwiseNullaryOp<internal::linspaced_op<Scalar,true>,Derived> RandomAccessLinSpacedReturnType;
+    typedef CwiseNullaryOp<internal::linspaced_op<Scalar,PacketScalar,true>,PlainObject> RandomAccessLinSpacedReturnType;
     /** \internal the return type of MatrixBase::eigenvalues() */
     typedef Matrix<typename NumTraits<typename internal::traits<Derived>::Scalar>::Real, internal::traits<Derived>::ColsAtCompileTime, 1> EigenvaluesReturnType;
 
@@ -247,110 +275,122 @@ template<typename Derived> class DenseBase
 
     /** Copies \a other into *this. \returns a reference to *this. */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     Derived& operator=(const DenseBase<OtherDerived>& other);
 
     /** Special case of the template operator=, in order to prevent the compiler
       * from generating a default operator= (issue hit with g++ 4.1)
       */
+    EIGEN_DEVICE_FUNC
     Derived& operator=(const DenseBase& other);
 
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     Derived& operator=(const EigenBase<OtherDerived> &other);
 
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     Derived& operator+=(const EigenBase<OtherDerived> &other);
 
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     Derived& operator-=(const EigenBase<OtherDerived> &other);
 
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     Derived& operator=(const ReturnByValue<OtherDerived>& func);
 
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** Copies \a other into *this without evaluating other. \returns a reference to *this. */
+    /** \internal
+      * Copies \a other into *this without evaluating other. \returns a reference to *this.
+      * \deprecated */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     Derived& lazyAssign(const DenseBase<OtherDerived>& other);
-#endif // not EIGEN_PARSED_BY_DOXYGEN
 
+    EIGEN_DEVICE_FUNC
     CommaInitializer<Derived> operator<< (const Scalar& s);
 
+    /** \deprecated it now returns \c *this */
     template<unsigned int Added,unsigned int Removed>
-    const Flagged<Derived, Added, Removed> flagged() const;
+    EIGEN_DEPRECATED
+    const Derived& flagged() const
+    { return derived(); }
 
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     CommaInitializer<Derived> operator<< (const DenseBase<OtherDerived>& other);
 
-    Eigen::Transpose<Derived> transpose();
-	typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
+    typedef Transpose<Derived> TransposeReturnType;
+    EIGEN_DEVICE_FUNC
+    TransposeReturnType transpose();
+    typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
+    EIGEN_DEVICE_FUNC
     ConstTransposeReturnType transpose() const;
+    EIGEN_DEVICE_FUNC
     void transposeInPlace();
-#ifndef EIGEN_NO_DEBUG
-  protected:
-    template<typename OtherDerived>
-    void checkTransposeAliasing(const OtherDerived& other) const;
-  public:
-#endif
-
 
-    static const ConstantReturnType
+    EIGEN_DEVICE_FUNC static const ConstantReturnType
     Constant(Index rows, Index cols, const Scalar& value);
-    static const ConstantReturnType
+    EIGEN_DEVICE_FUNC static const ConstantReturnType
     Constant(Index size, const Scalar& value);
-    static const ConstantReturnType
+    EIGEN_DEVICE_FUNC static const ConstantReturnType
     Constant(const Scalar& value);
 
-    static const SequentialLinSpacedReturnType
+    EIGEN_DEVICE_FUNC static const SequentialLinSpacedReturnType
     LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high);
-    static const RandomAccessLinSpacedReturnType
+    EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
     LinSpaced(Index size, const Scalar& low, const Scalar& high);
-    static const SequentialLinSpacedReturnType
+    EIGEN_DEVICE_FUNC static const SequentialLinSpacedReturnType
     LinSpaced(Sequential_t, const Scalar& low, const Scalar& high);
-    static const RandomAccessLinSpacedReturnType
+    EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
     LinSpaced(const Scalar& low, const Scalar& high);
 
-    template<typename CustomNullaryOp>
-    static const CwiseNullaryOp<CustomNullaryOp, Derived>
+    template<typename CustomNullaryOp> EIGEN_DEVICE_FUNC
+    static const CwiseNullaryOp<CustomNullaryOp, PlainObject>
     NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func);
-    template<typename CustomNullaryOp>
-    static const CwiseNullaryOp<CustomNullaryOp, Derived>
+    template<typename CustomNullaryOp> EIGEN_DEVICE_FUNC
+    static const CwiseNullaryOp<CustomNullaryOp, PlainObject>
     NullaryExpr(Index size, const CustomNullaryOp& func);
-    template<typename CustomNullaryOp>
-    static const CwiseNullaryOp<CustomNullaryOp, Derived>
+    template<typename CustomNullaryOp> EIGEN_DEVICE_FUNC
+    static const CwiseNullaryOp<CustomNullaryOp, PlainObject>
     NullaryExpr(const CustomNullaryOp& func);
 
-    static const ConstantReturnType Zero(Index rows, Index cols);
-    static const ConstantReturnType Zero(Index size);
-    static const ConstantReturnType Zero();
-    static const ConstantReturnType Ones(Index rows, Index cols);
-    static const ConstantReturnType Ones(Index size);
-    static const ConstantReturnType Ones();
-
-    void fill(const Scalar& value);
-    Derived& setConstant(const Scalar& value);
-    Derived& setLinSpaced(Index size, const Scalar& low, const Scalar& high);
-    Derived& setLinSpaced(const Scalar& low, const Scalar& high);
-    Derived& setZero();
-    Derived& setOnes();
-    Derived& setRandom();
-
-    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC static const ConstantReturnType Zero(Index rows, Index cols);
+    EIGEN_DEVICE_FUNC static const ConstantReturnType Zero(Index size);
+    EIGEN_DEVICE_FUNC static const ConstantReturnType Zero();
+    EIGEN_DEVICE_FUNC static const ConstantReturnType Ones(Index rows, Index cols);
+    EIGEN_DEVICE_FUNC static const ConstantReturnType Ones(Index size);
+    EIGEN_DEVICE_FUNC static const ConstantReturnType Ones();
+
+    EIGEN_DEVICE_FUNC void fill(const Scalar& value);
+    EIGEN_DEVICE_FUNC Derived& setConstant(const Scalar& value);
+    EIGEN_DEVICE_FUNC Derived& setLinSpaced(Index size, const Scalar& low, const Scalar& high);
+    EIGEN_DEVICE_FUNC Derived& setLinSpaced(const Scalar& low, const Scalar& high);
+    EIGEN_DEVICE_FUNC Derived& setZero();
+    EIGEN_DEVICE_FUNC Derived& setOnes();
+    EIGEN_DEVICE_FUNC Derived& setRandom();
+
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC
     bool isApprox(const DenseBase<OtherDerived>& other,
                   const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    EIGEN_DEVICE_FUNC 
     bool isMuchSmallerThan(const RealScalar& other,
                            const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    template<typename OtherDerived>
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC
     bool isMuchSmallerThan(const DenseBase<OtherDerived>& other,
                            const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
 
-    bool isApproxToConstant(const Scalar& value, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    bool isConstant(const Scalar& value, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    bool isZero(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    bool isOnes(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    EIGEN_DEVICE_FUNC bool isApproxToConstant(const Scalar& value, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    EIGEN_DEVICE_FUNC bool isConstant(const Scalar& value, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    EIGEN_DEVICE_FUNC bool isZero(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+    EIGEN_DEVICE_FUNC bool isOnes(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
     
     inline bool hasNaN() const;
     inline bool allFinite() const;
 
+    EIGEN_DEVICE_FUNC
     inline Derived& operator*=(const Scalar& other);
+    EIGEN_DEVICE_FUNC
     inline Derived& operator/=(const Scalar& other);
 
     typedef typename internal::add_const_on_value_type<typename internal::eval<Derived>::type>::type EvalReturnType;
@@ -358,7 +398,10 @@ template<typename Derived> class DenseBase
       *
       * Notice that in the case of a plain matrix or vector (not an expression) this function just returns
       * a const reference, in order to avoid a useless copy.
+      * 
+      * \warning Be careful with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink.
       */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE EvalReturnType eval() const
     {
       // Even though MSVC does not honor strong inlining when the return type
@@ -366,61 +409,68 @@ template<typename Derived> class DenseBase
       // size types on MSVC.
       return typename internal::eval<Derived>::type(derived());
     }
-
+    
     /** swaps *this with the expression \a other.
       *
       */
     template<typename OtherDerived>
-    void swap(const DenseBase<OtherDerived>& other,
-              int = OtherDerived::ThisConstantIsPrivateInPlainObjectBase)
+    EIGEN_DEVICE_FUNC
+    void swap(const DenseBase<OtherDerived>& other)
     {
-      SwapWrapper<Derived>(derived()).lazyAssign(other.derived());
+      EIGEN_STATIC_ASSERT(!OtherDerived::IsPlainObjectBase,THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
+      eigen_assert(rows()==other.rows() && cols()==other.cols());
+      call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op<Scalar>());
     }
 
     /** swaps *this with the matrix or array \a other.
       *
       */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     void swap(PlainObjectBase<OtherDerived>& other)
     {
-      SwapWrapper<Derived>(derived()).lazyAssign(other.derived());
+      eigen_assert(rows()==other.rows() && cols()==other.cols());
+      call_assignment(derived(), other.derived(), internal::swap_assign_op<Scalar>());
     }
 
+    EIGEN_DEVICE_FUNC inline const NestByValue<Derived> nestByValue() const;
+    EIGEN_DEVICE_FUNC inline const ForceAlignedAccess<Derived> forceAlignedAccess() const;
+    EIGEN_DEVICE_FUNC inline ForceAlignedAccess<Derived> forceAlignedAccess();
+    template<bool Enable> EIGEN_DEVICE_FUNC
+    inline const typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf() const;
+    template<bool Enable> EIGEN_DEVICE_FUNC
+    inline typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf();
 
-    inline const NestByValue<Derived> nestByValue() const;
-    inline const ForceAlignedAccess<Derived> forceAlignedAccess() const;
-    inline ForceAlignedAccess<Derived> forceAlignedAccess();
-    template<bool Enable> inline const typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf() const;
-    template<bool Enable> inline typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf();
+    EIGEN_DEVICE_FUNC Scalar sum() const;
+    EIGEN_DEVICE_FUNC Scalar mean() const;
+    EIGEN_DEVICE_FUNC Scalar trace() const;
 
-    Scalar sum() const;
-    Scalar mean() const;
-    Scalar trace() const;
+    EIGEN_DEVICE_FUNC Scalar prod() const;
 
-    Scalar prod() const;
+    EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar minCoeff() const;
+    EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar maxCoeff() const;
 
-    typename internal::traits<Derived>::Scalar minCoeff() const;
-    typename internal::traits<Derived>::Scalar maxCoeff() const;
-
-    template<typename IndexType>
+    template<typename IndexType> EIGEN_DEVICE_FUNC
     typename internal::traits<Derived>::Scalar minCoeff(IndexType* row, IndexType* col) const;
-    template<typename IndexType>
+    template<typename IndexType> EIGEN_DEVICE_FUNC
     typename internal::traits<Derived>::Scalar maxCoeff(IndexType* row, IndexType* col) const;
-    template<typename IndexType>
+    template<typename IndexType> EIGEN_DEVICE_FUNC
     typename internal::traits<Derived>::Scalar minCoeff(IndexType* index) const;
-    template<typename IndexType>
+    template<typename IndexType> EIGEN_DEVICE_FUNC
     typename internal::traits<Derived>::Scalar maxCoeff(IndexType* index) const;
 
     template<typename BinaryOp>
-    typename internal::result_of<BinaryOp(typename internal::traits<Derived>::Scalar)>::type
-    redux(const BinaryOp& func) const;
+    EIGEN_DEVICE_FUNC
+    Scalar redux(const BinaryOp& func) const;
 
     template<typename Visitor>
+    EIGEN_DEVICE_FUNC
     void visit(Visitor& func) const;
 
     inline const WithFormat<Derived> format(const IOFormat& fmt) const;
 
     /** \returns the unique coefficient of a 1x1 expression */
+    EIGEN_DEVICE_FUNC
     CoeffReturnType value() const
     {
       EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
@@ -428,8 +478,8 @@ template<typename Derived> class DenseBase
       return derived().coeff(0,0);
     }
 
-    bool all(void) const;
-    bool any(void) const;
+    bool all() const;
+    bool any() const;
     Index count() const;
 
     typedef VectorwiseOp<Derived, Horizontal> RowwiseReturnType;
@@ -437,14 +487,35 @@ template<typename Derived> class DenseBase
     typedef VectorwiseOp<Derived, Vertical> ColwiseReturnType;
     typedef const VectorwiseOp<const Derived, Vertical> ConstColwiseReturnType;
 
-    ConstRowwiseReturnType rowwise() const;
-    RowwiseReturnType rowwise();
-    ConstColwiseReturnType colwise() const;
-    ColwiseReturnType colwise();
+    /** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations
+    *
+    * Example: \include MatrixBase_rowwise.cpp
+    * Output: \verbinclude MatrixBase_rowwise.out
+    *
+    * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
+    */
+    //Code moved here due to a CUDA compiler bug
+    EIGEN_DEVICE_FUNC inline ConstRowwiseReturnType rowwise() const {
+      return ConstRowwiseReturnType(derived());
+    }
+    EIGEN_DEVICE_FUNC RowwiseReturnType rowwise();
+
+    /** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations
+    *
+    * Example: \include MatrixBase_colwise.cpp
+    * Output: \verbinclude MatrixBase_colwise.out
+    *
+    * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
+    */
+    EIGEN_DEVICE_FUNC inline ConstColwiseReturnType colwise() const {
+      return ConstColwiseReturnType(derived());
+    }
+    EIGEN_DEVICE_FUNC ColwiseReturnType colwise();
 
-    static const CwiseNullaryOp<internal::scalar_random_op<Scalar>,Derived> Random(Index rows, Index cols);
-    static const CwiseNullaryOp<internal::scalar_random_op<Scalar>,Derived> Random(Index size);
-    static const CwiseNullaryOp<internal::scalar_random_op<Scalar>,Derived> Random();
+    typedef CwiseNullaryOp<internal::scalar_random_op<Scalar>,PlainObject> RandomReturnType;
+    static const RandomReturnType Random(Index rows, Index cols);
+    static const RandomReturnType Random(Index size);
+    static const RandomReturnType Random();
 
     template<typename ThenDerived,typename ElseDerived>
     const Select<Derived,ThenDerived,ElseDerived>
@@ -462,14 +533,33 @@ template<typename Derived> class DenseBase
     template<int p> RealScalar lpNorm() const;
 
     template<int RowFactor, int ColFactor>
+    EIGEN_DEVICE_FUNC
     const Replicate<Derived,RowFactor,ColFactor> replicate() const;
-    const Replicate<Derived,Dynamic,Dynamic> replicate(Index rowFacor,Index colFactor) const;
+    /**
+    * \return an expression of the replication of \c *this
+    *
+    * Example: \include MatrixBase_replicate_int_int.cpp
+    * Output: \verbinclude MatrixBase_replicate_int_int.out
+    *
+    * \sa VectorwiseOp::replicate(), DenseBase::replicate<int,int>(), class Replicate
+    */
+    //Code moved here due to a CUDA compiler bug
+    EIGEN_DEVICE_FUNC
+    const Replicate<Derived, Dynamic, Dynamic> replicate(Index rowFactor, Index colFactor) const
+    {
+      return Replicate<Derived, Dynamic, Dynamic>(derived(), rowFactor, colFactor);
+    }
 
     typedef Reverse<Derived, BothDirections> ReverseReturnType;
     typedef const Reverse<const Derived, BothDirections> ConstReverseReturnType;
-    ReverseReturnType reverse();
-    ConstReverseReturnType reverse() const;
-    void reverseInPlace();
+    EIGEN_DEVICE_FUNC ReverseReturnType reverse();
+    /** This is the const version of reverse(). */
+    //Code moved here due to a CUDA compiler bug
+    EIGEN_DEVICE_FUNC ConstReverseReturnType reverse() const
+    {
+      return ConstReverseReturnType(derived());
+    }
+    EIGEN_DEVICE_FUNC void reverseInPlace();
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::DenseBase
 #   include "../plugins/BlockMethods.h"
@@ -478,27 +568,18 @@ template<typename Derived> class DenseBase
 #   endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
 
-#ifdef EIGEN2_SUPPORT
-
-    Block<Derived> corner(CornerType type, Index cRows, Index cCols);
-    const Block<Derived> corner(CornerType type, Index cRows, Index cCols) const;
-    template<int CRows, int CCols>
-    Block<Derived, CRows, CCols> corner(CornerType type);
-    template<int CRows, int CCols>
-    const Block<Derived, CRows, CCols> corner(CornerType type) const;
-
-#endif // EIGEN2_SUPPORT
-
 
     // disable the use of evalTo for dense objects with a nice compilation error
-    template<typename Dest> inline void evalTo(Dest& ) const
+    template<typename Dest>
+    EIGEN_DEVICE_FUNC
+    inline void evalTo(Dest& ) const
     {
       EIGEN_STATIC_ASSERT((internal::is_same<Dest,void>::value),THE_EVAL_EVALTO_FUNCTION_SHOULD_NEVER_BE_CALLED_FOR_DENSE_OBJECTS);
     }
 
   protected:
     /** Default constructor. Do nothing. */
-    DenseBase()
+    EIGEN_DEVICE_FUNC DenseBase()
     {
       /* Just checks for self-consistency of the flags.
        * Only do it when debugging Eigen, as this borders on paranoiac and could slow compilation down
@@ -511,9 +592,9 @@ template<typename Derived> class DenseBase
     }
 
   private:
-    explicit DenseBase(int);
-    DenseBase(int,int);
-    template<typename OtherDerived> explicit DenseBase(const DenseBase<OtherDerived>&);
+    EIGEN_DEVICE_FUNC explicit DenseBase(int);
+    EIGEN_DEVICE_FUNC DenseBase(int,int);
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC explicit DenseBase(const DenseBase<OtherDerived>&);
 };
 
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Core/DenseCoeffsBase.h b/nuparu/include/Eigen/src/Core/DenseCoeffsBase.h
index 3c890f21..820a90e6 100644
--- a/nuparu/include/Eigen/src/Core/DenseCoeffsBase.h
+++ b/nuparu/include/Eigen/src/Core/DenseCoeffsBase.h
@@ -35,7 +35,6 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
 {
   public:
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef typename internal::packet_traits<Scalar>::type PacketScalar;
 
@@ -61,6 +60,7 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
     using Base::size;
     using Base::derived;
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner) const
     {
       return int(Derived::RowsAtCompileTime) == 1 ? 0
@@ -69,6 +69,7 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
           : inner;
     }
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Index colIndexByOuterInner(Index outer, Index inner) const
     {
       return int(Derived::ColsAtCompileTime) == 1 ? 0
@@ -91,13 +92,15 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
       *
       * \sa operator()(Index,Index) const, coeffRef(Index,Index), coeff(Index) const
       */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const
     {
       eigen_internal_assert(row >= 0 && row < rows()
-                        && col >= 0 && col < cols());
-      return derived().coeff(row, col);
+                         && col >= 0 && col < cols());
+      return internal::evaluator<Derived>(derived()).coeff(row,col);
     }
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
     {
       return coeff(rowIndexByOuterInner(outer, inner),
@@ -108,11 +111,12 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
       *
       * \sa operator()(Index,Index), operator[](Index)
       */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE CoeffReturnType operator()(Index row, Index col) const
     {
       eigen_assert(row >= 0 && row < rows()
           && col >= 0 && col < cols());
-      return derived().coeff(row, col);
+      return coeff(row, col);
     }
 
     /** Short version: don't use this function, use
@@ -130,11 +134,14 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
       * \sa operator[](Index) const, coeffRef(Index), coeff(Index,Index) const
       */
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE CoeffReturnType
     coeff(Index index) const
     {
+      EIGEN_STATIC_ASSERT(internal::evaluator<Derived>::Flags & LinearAccessBit,
+                          THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS)
       eigen_internal_assert(index >= 0 && index < size());
-      return derived().coeff(index);
+      return internal::evaluator<Derived>(derived()).coeff(index);
     }
 
 
@@ -146,15 +153,14 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
       * z() const, w() const
       */
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE CoeffReturnType
     operator[](Index index) const
     {
-      #ifndef EIGEN2_SUPPORT
       EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime,
                           THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD)
-      #endif
       eigen_assert(index >= 0 && index < size());
-      return derived().coeff(index);
+      return coeff(index);
     }
 
     /** \returns the coefficient at given index.
@@ -167,30 +173,35 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
       * z() const, w() const
       */
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE CoeffReturnType
     operator()(Index index) const
     {
       eigen_assert(index >= 0 && index < size());
-      return derived().coeff(index);
+      return coeff(index);
     }
 
     /** equivalent to operator[](0).  */
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE CoeffReturnType
     x() const { return (*this)[0]; }
 
     /** equivalent to operator[](1).  */
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE CoeffReturnType
     y() const { return (*this)[1]; }
 
     /** equivalent to operator[](2).  */
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE CoeffReturnType
     z() const { return (*this)[2]; }
 
     /** equivalent to operator[](3).  */
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE CoeffReturnType
     w() const { return (*this)[3]; }
 
@@ -207,9 +218,9 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
     template<int LoadMode>
     EIGEN_STRONG_INLINE PacketReturnType packet(Index row, Index col) const
     {
-      eigen_internal_assert(row >= 0 && row < rows()
-                      && col >= 0 && col < cols());
-      return derived().template packet<LoadMode>(row,col);
+      typedef typename internal::packet_traits<Scalar>::type DefaultPacketType;
+      eigen_internal_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
+      return internal::evaluator<Derived>(derived()).template packet<LoadMode,DefaultPacketType>(row,col);
     }
 
 
@@ -234,8 +245,11 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
     template<int LoadMode>
     EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
     {
+      EIGEN_STATIC_ASSERT(internal::evaluator<Derived>::Flags & LinearAccessBit,
+                          THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS)
+      typedef typename internal::packet_traits<Scalar>::type DefaultPacketType;
       eigen_internal_assert(index >= 0 && index < size());
-      return derived().template packet<LoadMode>(index);
+      return internal::evaluator<Derived>(derived()).template packet<LoadMode,DefaultPacketType>(index);
     }
 
   protected:
@@ -278,7 +292,6 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
     typedef DenseCoeffsBase<Derived, ReadOnlyAccessors> Base;
 
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef typename internal::packet_traits<Scalar>::type PacketScalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
@@ -311,13 +324,15 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
       *
       * \sa operator()(Index,Index), coeff(Index, Index) const, coeffRef(Index)
       */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col)
     {
       eigen_internal_assert(row >= 0 && row < rows()
-                        && col >= 0 && col < cols());
-      return derived().coeffRef(row, col);
+                         && col >= 0 && col < cols());
+      return internal::evaluator<Derived>(derived()).coeffRef(row,col);
     }
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar&
     coeffRefByOuterInner(Index outer, Index inner)
     {
@@ -330,12 +345,13 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
       * \sa operator[](Index)
       */
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar&
     operator()(Index row, Index col)
     {
       eigen_assert(row >= 0 && row < rows()
           && col >= 0 && col < cols());
-      return derived().coeffRef(row, col);
+      return coeffRef(row, col);
     }
 
 
@@ -354,11 +370,14 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
       * \sa operator[](Index), coeff(Index) const, coeffRef(Index,Index)
       */
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar&
     coeffRef(Index index)
     {
+      EIGEN_STATIC_ASSERT(internal::evaluator<Derived>::Flags & LinearAccessBit,
+                          THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS)
       eigen_internal_assert(index >= 0 && index < size());
-      return derived().coeffRef(index);
+      return internal::evaluator<Derived>(derived()).coeffRef(index);
     }
 
     /** \returns a reference to the coefficient at given index.
@@ -368,15 +387,14 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
       * \sa operator[](Index) const, operator()(Index,Index), x(), y(), z(), w()
       */
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar&
     operator[](Index index)
     {
-      #ifndef EIGEN2_SUPPORT
       EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime,
                           THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD)
-      #endif
       eigen_assert(index >= 0 && index < size());
-      return derived().coeffRef(index);
+      return coeffRef(index);
     }
 
     /** \returns a reference to the coefficient at given index.
@@ -388,167 +406,37 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
       * \sa operator[](Index) const, operator()(Index,Index), x(), y(), z(), w()
       */
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar&
     operator()(Index index)
     {
       eigen_assert(index >= 0 && index < size());
-      return derived().coeffRef(index);
+      return coeffRef(index);
     }
 
     /** equivalent to operator[](0).  */
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar&
     x() { return (*this)[0]; }
 
     /** equivalent to operator[](1).  */
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar&
     y() { return (*this)[1]; }
 
     /** equivalent to operator[](2).  */
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar&
     z() { return (*this)[2]; }
 
     /** equivalent to operator[](3).  */
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar&
     w() { return (*this)[3]; }
-
-    /** \internal
-      * Stores the given packet of coefficients, at the given row and column of this expression. It is your responsibility
-      * to ensure that a packet really starts there. This method is only available on expressions having the
-      * PacketAccessBit.
-      *
-      * The \a LoadMode parameter may have the value \a #Aligned or \a #Unaligned. Its effect is to select
-      * the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
-      * starting at an address which is a multiple of the packet size.
-      */
-
-    template<int StoreMode>
-    EIGEN_STRONG_INLINE void writePacket
-    (Index row, Index col, const typename internal::packet_traits<Scalar>::type& val)
-    {
-      eigen_internal_assert(row >= 0 && row < rows()
-                        && col >= 0 && col < cols());
-      derived().template writePacket<StoreMode>(row,col,val);
-    }
-
-
-    /** \internal */
-    template<int StoreMode>
-    EIGEN_STRONG_INLINE void writePacketByOuterInner
-    (Index outer, Index inner, const typename internal::packet_traits<Scalar>::type& val)
-    {
-      writePacket<StoreMode>(rowIndexByOuterInner(outer, inner),
-                            colIndexByOuterInner(outer, inner),
-                            val);
-    }
-
-    /** \internal
-      * Stores the given packet of coefficients, at the given index in this expression. It is your responsibility
-      * to ensure that a packet really starts there. This method is only available on expressions having the
-      * PacketAccessBit and the LinearAccessBit.
-      *
-      * The \a LoadMode parameter may have the value \a Aligned or \a Unaligned. Its effect is to select
-      * the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
-      * starting at an address which is a multiple of the packet size.
-      */
-    template<int StoreMode>
-    EIGEN_STRONG_INLINE void writePacket
-    (Index index, const typename internal::packet_traits<Scalar>::type& val)
-    {
-      eigen_internal_assert(index >= 0 && index < size());
-      derived().template writePacket<StoreMode>(index,val);
-    }
-
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-
-    /** \internal Copies the coefficient at position (row,col) of other into *this.
-      *
-      * This method is overridden in SwapWrapper, allowing swap() assignments to share 99% of their code
-      * with usual assignments.
-      *
-      * Outside of this internal usage, this method has probably no usefulness. It is hidden in the public API dox.
-      */
-
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE void copyCoeff(Index row, Index col, const DenseBase<OtherDerived>& other)
-    {
-      eigen_internal_assert(row >= 0 && row < rows()
-                        && col >= 0 && col < cols());
-      derived().coeffRef(row, col) = other.derived().coeff(row, col);
-    }
-
-    /** \internal Copies the coefficient at the given index of other into *this.
-      *
-      * This method is overridden in SwapWrapper, allowing swap() assignments to share 99% of their code
-      * with usual assignments.
-      *
-      * Outside of this internal usage, this method has probably no usefulness. It is hidden in the public API dox.
-      */
-
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE void copyCoeff(Index index, const DenseBase<OtherDerived>& other)
-    {
-      eigen_internal_assert(index >= 0 && index < size());
-      derived().coeffRef(index) = other.derived().coeff(index);
-    }
-
-
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE void copyCoeffByOuterInner(Index outer, Index inner, const DenseBase<OtherDerived>& other)
-    {
-      const Index row = rowIndexByOuterInner(outer,inner);
-      const Index col = colIndexByOuterInner(outer,inner);
-      // derived() is important here: copyCoeff() may be reimplemented in Derived!
-      derived().copyCoeff(row, col, other);
-    }
-
-    /** \internal Copies the packet at position (row,col) of other into *this.
-      *
-      * This method is overridden in SwapWrapper, allowing swap() assignments to share 99% of their code
-      * with usual assignments.
-      *
-      * Outside of this internal usage, this method has probably no usefulness. It is hidden in the public API dox.
-      */
-
-    template<typename OtherDerived, int StoreMode, int LoadMode>
-    EIGEN_STRONG_INLINE void copyPacket(Index row, Index col, const DenseBase<OtherDerived>& other)
-    {
-      eigen_internal_assert(row >= 0 && row < rows()
-                        && col >= 0 && col < cols());
-      derived().template writePacket<StoreMode>(row, col,
-        other.derived().template packet<LoadMode>(row, col));
-    }
-
-    /** \internal Copies the packet at the given index of other into *this.
-      *
-      * This method is overridden in SwapWrapper, allowing swap() assignments to share 99% of their code
-      * with usual assignments.
-      *
-      * Outside of this internal usage, this method has probably no usefulness. It is hidden in the public API dox.
-      */
-
-    template<typename OtherDerived, int StoreMode, int LoadMode>
-    EIGEN_STRONG_INLINE void copyPacket(Index index, const DenseBase<OtherDerived>& other)
-    {
-      eigen_internal_assert(index >= 0 && index < size());
-      derived().template writePacket<StoreMode>(index,
-        other.derived().template packet<LoadMode>(index));
-    }
-
-    /** \internal */
-    template<typename OtherDerived, int StoreMode, int LoadMode>
-    EIGEN_STRONG_INLINE void copyPacketByOuterInner(Index outer, Index inner, const DenseBase<OtherDerived>& other)
-    {
-      const Index row = rowIndexByOuterInner(outer,inner);
-      const Index col = colIndexByOuterInner(outer,inner);
-      // derived() is important here: copyCoeff() may be reimplemented in Derived!
-      derived().template copyPacket< OtherDerived, StoreMode, LoadMode>(row, col, other);
-    }
-#endif
-
 };
 
 /** \brief Base class providing direct read-only coefficient access to matrices and arrays.
@@ -568,7 +456,6 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
   public:
 
     typedef DenseCoeffsBase<Derived, ReadOnlyAccessors> Base;
-    typedef typename internal::traits<Derived>::Index Index;
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
 
@@ -581,6 +468,7 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
       *
       * \sa outerStride(), rowStride(), colStride()
       */
+    EIGEN_DEVICE_FUNC
     inline Index innerStride() const
     {
       return derived().innerStride();
@@ -591,6 +479,7 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
       *
       * \sa innerStride(), rowStride(), colStride()
       */
+    EIGEN_DEVICE_FUNC
     inline Index outerStride() const
     {
       return derived().outerStride();
@@ -606,6 +495,7 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
       *
       * \sa innerStride(), outerStride(), colStride()
       */
+    EIGEN_DEVICE_FUNC
     inline Index rowStride() const
     {
       return Derived::IsRowMajor ? outerStride() : innerStride();
@@ -615,6 +505,7 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
       *
       * \sa innerStride(), outerStride(), rowStride()
       */
+    EIGEN_DEVICE_FUNC
     inline Index colStride() const
     {
       return Derived::IsRowMajor ? innerStride() : outerStride();
@@ -639,7 +530,6 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
   public:
 
     typedef DenseCoeffsBase<Derived, WriteAccessors> Base;
-    typedef typename internal::traits<Derived>::Index Index;
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
 
@@ -652,6 +542,7 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
       *
       * \sa outerStride(), rowStride(), colStride()
       */
+    EIGEN_DEVICE_FUNC
     inline Index innerStride() const
     {
       return derived().innerStride();
@@ -662,6 +553,7 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
       *
       * \sa innerStride(), rowStride(), colStride()
       */
+    EIGEN_DEVICE_FUNC
     inline Index outerStride() const
     {
       return derived().outerStride();
@@ -677,6 +569,7 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
       *
       * \sa innerStride(), outerStride(), colStride()
       */
+    EIGEN_DEVICE_FUNC
     inline Index rowStride() const
     {
       return Derived::IsRowMajor ? outerStride() : innerStride();
@@ -686,6 +579,7 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
       *
       * \sa innerStride(), outerStride(), rowStride()
       */
+    EIGEN_DEVICE_FUNC
     inline Index colStride() const
     {
       return Derived::IsRowMajor ? innerStride() : outerStride();
@@ -694,33 +588,42 @@ class DenseCoeffsBase<Derived, DirectWriteAccessors>
 
 namespace internal {
 
-template<typename Derived, bool JustReturnZero>
+template<int Alignment, typename Derived, bool JustReturnZero>
 struct first_aligned_impl
 {
-  static inline typename Derived::Index run(const Derived&)
+  static inline Index run(const Derived&)
   { return 0; }
 };
 
-template<typename Derived>
-struct first_aligned_impl<Derived, false>
+template<int Alignment, typename Derived>
+struct first_aligned_impl<Alignment, Derived, false>
 {
-  static inline typename Derived::Index run(const Derived& m)
+  static inline Index run(const Derived& m)
   {
-    return internal::first_aligned(&m.const_cast_derived().coeffRef(0,0), m.size());
+    return internal::first_aligned<Alignment>(&m.const_cast_derived().coeffRef(0,0), m.size());
   }
 };
 
-/** \internal \returns the index of the first element of the array that is well aligned for vectorization.
+/** \internal \returns the index of the first element of the array stored by \a m that is properly aligned with respect to \a Alignment for vectorization.
+  *
+  * \tparam Alignment requested alignment in Bytes.
   *
   * There is also the variant first_aligned(const Scalar*, Integer) defined in Memory.h. See it for more
   * documentation.
   */
+template<int Alignment, typename Derived>
+static inline Index first_aligned(const DenseBase<Derived>& m)
+{
+  enum { ReturnZero = (int(evaluator<Derived>::Alignment) >= Alignment) || !(Derived::Flags & DirectAccessBit) };
+  return first_aligned_impl<Alignment, Derived, ReturnZero>::run(m.derived());
+}
+
 template<typename Derived>
-static inline typename Derived::Index first_aligned(const Derived& m)
+static inline Index first_default_aligned(const DenseBase<Derived>& m)
 {
-  return first_aligned_impl
-          <Derived, (Derived::Flags & AlignedBit) || !(Derived::Flags & DirectAccessBit)>
-          ::run(m);
+  typedef typename Derived::Scalar Scalar;
+  typedef typename packet_traits<Scalar>::type DefaultPacketType;
+  return internal::first_aligned<int(unpacket_traits<DefaultPacketType>::alignment),Derived>(m);
 }
 
 template<typename Derived, bool HasDirectAccess = has_direct_access<Derived>::ret>
diff --git a/nuparu/include/Eigen/src/Core/DenseStorage.h b/nuparu/include/Eigen/src/Core/DenseStorage.h
index 3e7f9c1b..34048461 100644
--- a/nuparu/include/Eigen/src/Core/DenseStorage.h
+++ b/nuparu/include/Eigen/src/Core/DenseStorage.h
@@ -3,7 +3,7 @@
 //
 // Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-// Copyright (C) 2010 Hauke Heibel <hauke.heibel@gmail.com>
+// Copyright (C) 2010-2013 Hauke Heibel <hauke.heibel@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -24,26 +24,37 @@ namespace internal {
 
 struct constructor_without_unaligned_array_assert {};
 
+template<typename T, int Size>
+EIGEN_DEVICE_FUNC
+void check_static_allocation_size()
+{
+  // if EIGEN_STACK_ALLOCATION_LIMIT is defined to 0, then no limit
+  #if EIGEN_STACK_ALLOCATION_LIMIT
+  EIGEN_STATIC_ASSERT(Size * sizeof(T) <= EIGEN_STACK_ALLOCATION_LIMIT, OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG);
+  #endif
+}
+
 /** \internal
   * Static array. If the MatrixOrArrayOptions require auto-alignment, the array will be automatically aligned:
   * to 16 bytes boundary if the total size is a multiple of 16 bytes.
   */
 template <typename T, int Size, int MatrixOrArrayOptions,
           int Alignment = (MatrixOrArrayOptions&DontAlign) ? 0
-                        : (((Size*sizeof(T))%16)==0) ? 16
-                        : 0 >
+                        : compute_default_alignment<T,Size>::value >
 struct plain_array
 {
   T array[Size];
 
-  plain_array() 
+  EIGEN_DEVICE_FUNC
+  plain_array()
   { 
-    EIGEN_STATIC_ASSERT(Size * sizeof(T) <= 128 * 128 * 8, OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG);
+    check_static_allocation_size<T,Size>();
   }
 
-  plain_array(constructor_without_unaligned_array_assert) 
+  EIGEN_DEVICE_FUNC
+  plain_array(constructor_without_unaligned_array_assert)
   { 
-    EIGEN_STATIC_ASSERT(Size * sizeof(T) <= 128 * 128 * 8, OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG);
+    check_static_allocation_size<T,Size>();
   }
 };
 
@@ -56,41 +67,100 @@ struct plain_array
   template<typename PtrType>
   EIGEN_ALWAYS_INLINE PtrType eigen_unaligned_array_assert_workaround_gcc47(PtrType array) { return array; }
   #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
-    eigen_assert((reinterpret_cast<size_t>(eigen_unaligned_array_assert_workaround_gcc47(array)) & sizemask) == 0 \
+    eigen_assert((reinterpret_cast<size_t>(eigen_unaligned_array_assert_workaround_gcc47(array)) & (sizemask)) == 0 \
               && "this assertion is explained here: " \
               "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
               " **** READ THIS WEB PAGE !!! ****");
 #else
   #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
-    eigen_assert((reinterpret_cast<size_t>(array) & sizemask) == 0 \
+    eigen_assert((reinterpret_cast<size_t>(array) & (sizemask)) == 0 \
               && "this assertion is explained here: " \
               "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
               " **** READ THIS WEB PAGE !!! ****");
 #endif
 
+template <typename T, int Size, int MatrixOrArrayOptions>
+struct plain_array<T, Size, MatrixOrArrayOptions, 8>
+{
+  EIGEN_ALIGN_TO_BOUNDARY(8) T array[Size];
+
+  EIGEN_DEVICE_FUNC
+  plain_array() 
+  {
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(7);
+    check_static_allocation_size<T,Size>();
+  }
+
+  EIGEN_DEVICE_FUNC
+  plain_array(constructor_without_unaligned_array_assert) 
+  { 
+    check_static_allocation_size<T,Size>();
+  }
+};
+
 template <typename T, int Size, int MatrixOrArrayOptions>
 struct plain_array<T, Size, MatrixOrArrayOptions, 16>
 {
-  EIGEN_USER_ALIGN16 T array[Size];
+  EIGEN_ALIGN_TO_BOUNDARY(16) T array[Size];
+
+  EIGEN_DEVICE_FUNC
+  plain_array() 
+  { 
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(15);
+    check_static_allocation_size<T,Size>();
+  }
+
+  EIGEN_DEVICE_FUNC
+  plain_array(constructor_without_unaligned_array_assert) 
+  { 
+    check_static_allocation_size<T,Size>();
+  }
+};
+
+template <typename T, int Size, int MatrixOrArrayOptions>
+struct plain_array<T, Size, MatrixOrArrayOptions, 32>
+{
+  EIGEN_ALIGN_TO_BOUNDARY(32) T array[Size];
+
+  EIGEN_DEVICE_FUNC
+  plain_array() 
+  {
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(31);
+    check_static_allocation_size<T,Size>();
+  }
+
+  EIGEN_DEVICE_FUNC
+  plain_array(constructor_without_unaligned_array_assert) 
+  { 
+    check_static_allocation_size<T,Size>();
+  }
+};
+
+template <typename T, int Size, int MatrixOrArrayOptions>
+struct plain_array<T, Size, MatrixOrArrayOptions, 64>
+{
+  EIGEN_ALIGN_TO_BOUNDARY(64) T array[Size];
 
+  EIGEN_DEVICE_FUNC
   plain_array() 
   { 
-    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(0xf);
-    EIGEN_STATIC_ASSERT(Size * sizeof(T) <= 128 * 128 * 8, OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG);
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(63);
+    check_static_allocation_size<T,Size>();
   }
 
+  EIGEN_DEVICE_FUNC
   plain_array(constructor_without_unaligned_array_assert) 
   { 
-    EIGEN_STATIC_ASSERT(Size * sizeof(T) <= 128 * 128 * 8, OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG);
+    check_static_allocation_size<T,Size>();
   }
 };
 
 template <typename T, int MatrixOrArrayOptions, int Alignment>
 struct plain_array<T, 0, MatrixOrArrayOptions, Alignment>
 {
-  EIGEN_USER_ALIGN16 T array[1];
-  plain_array() {}
-  plain_array(constructor_without_unaligned_array_assert) {}
+  T array[1];
+  EIGEN_DEVICE_FUNC plain_array() {}
+  EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) {}
 };
 
 } // end namespace internal
@@ -114,33 +184,50 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
 {
     internal::plain_array<T,Size,_Options> m_data;
   public:
-    inline DenseStorage() {}
-    inline DenseStorage(internal::constructor_without_unaligned_array_assert)
+    EIGEN_DEVICE_FUNC DenseStorage() {}
+    EIGEN_DEVICE_FUNC
+    explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
       : m_data(internal::constructor_without_unaligned_array_assert()) {}
-    inline DenseStorage(DenseIndex,DenseIndex,DenseIndex) {}
-    inline void swap(DenseStorage& other) { std::swap(m_data,other.m_data); }
-    static inline DenseIndex rows(void) {return _Rows;}
-    static inline DenseIndex cols(void) {return _Cols;}
-    inline void conservativeResize(DenseIndex,DenseIndex,DenseIndex) {}
-    inline void resize(DenseIndex,DenseIndex,DenseIndex) {}
-    inline const T *data() const { return m_data.array; }
-    inline T *data() { return m_data.array; }
+    EIGEN_DEVICE_FUNC 
+    DenseStorage(const DenseStorage& other) : m_data(other.m_data) {}
+    EIGEN_DEVICE_FUNC 
+    DenseStorage& operator=(const DenseStorage& other)
+    { 
+      if (this != &other) m_data = other.m_data;
+      return *this; 
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) { // fixed-size storage: the arguments are checked but never stored
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+      eigen_internal_assert(size==rows*cols && rows==_Rows && cols==_Cols); // arguments must agree with the compile-time dimensions
+      EIGEN_UNUSED_VARIABLE(size); // presumably silences unused-parameter warnings when the assert compiles away -- confirm
+      EIGEN_UNUSED_VARIABLE(rows);
+      EIGEN_UNUSED_VARIABLE(cols);
+    }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); }
+    EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
+    EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
+    EIGEN_DEVICE_FUNC void resize(Index,Index,Index) {}
+    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
+    EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
 };
 
 // null matrix
 template<typename T, int _Rows, int _Cols, int _Options> class DenseStorage<T, 0, _Rows, _Cols, _Options>
 {
   public:
-    inline DenseStorage() {}
-    inline DenseStorage(internal::constructor_without_unaligned_array_assert) {}
-    inline DenseStorage(DenseIndex,DenseIndex,DenseIndex) {}
-    inline void swap(DenseStorage& ) {}
-    static inline DenseIndex rows(void) {return _Rows;}
-    static inline DenseIndex cols(void) {return _Cols;}
-    inline void conservativeResize(DenseIndex,DenseIndex,DenseIndex) {}
-    inline void resize(DenseIndex,DenseIndex,DenseIndex) {}
-    inline const T *data() const { return 0; }
-    inline T *data() { return 0; }
+    EIGEN_DEVICE_FUNC DenseStorage() {}
+    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) {}
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage&) {}
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage&) { return *this; }
+    EIGEN_DEVICE_FUNC DenseStorage(Index,Index,Index) {}
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& ) {}
+    EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
+    EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
+    EIGEN_DEVICE_FUNC void resize(Index,Index,Index) {}
+    EIGEN_DEVICE_FUNC const T *data() const { return 0; }
+    EIGEN_DEVICE_FUNC T *data() { return 0; }
 };
 
 // more specializations for null matrices; these are necessary to resolve ambiguities
@@ -157,86 +244,157 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, 0, Dynamic,
 template<typename T, int Size, int _Options> class DenseStorage<T, Size, Dynamic, Dynamic, _Options>
 {
     internal::plain_array<T,Size,_Options> m_data;
-    DenseIndex m_rows;
-    DenseIndex m_cols;
+    Index m_rows;
+    Index m_cols;
   public:
-    inline DenseStorage() : m_rows(0), m_cols(0) {}
-    inline DenseStorage(internal::constructor_without_unaligned_array_assert)
+    EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0), m_cols(0) {}
+    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
       : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0), m_cols(0) {}
-    inline DenseStorage(DenseIndex, DenseIndex nbRows, DenseIndex nbCols) : m_rows(nbRows), m_cols(nbCols) {}
-    inline void swap(DenseStorage& other)
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_rows(other.m_rows), m_cols(other.m_cols) {}
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) 
+    { 
+      if (this != &other)
+      {
+        m_data = other.m_data;
+        m_rows = other.m_rows;
+        m_cols = other.m_cols;
+      }
+      return *this; 
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index cols) : m_rows(rows), m_cols(cols) {}
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
     { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
-    inline DenseIndex rows() const {return m_rows;}
-    inline DenseIndex cols() const {return m_cols;}
-    inline void conservativeResize(DenseIndex, DenseIndex nbRows, DenseIndex nbCols) { m_rows = nbRows; m_cols = nbCols; }
-    inline void resize(DenseIndex, DenseIndex nbRows, DenseIndex nbCols) { m_rows = nbRows; m_cols = nbCols; }
-    inline const T *data() const { return m_data.array; }
-    inline T *data() { return m_data.array; }
+    EIGEN_DEVICE_FUNC Index rows() const {return m_rows;}
+    EIGEN_DEVICE_FUNC Index cols() const {return m_cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index cols) { m_rows = rows; m_cols = cols; }
+    EIGEN_DEVICE_FUNC void resize(Index, Index rows, Index cols) { m_rows = rows; m_cols = cols; }
+    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
+    EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
 };
 
 // dynamic-size matrix with fixed-size storage and fixed width
 template<typename T, int Size, int _Cols, int _Options> class DenseStorage<T, Size, Dynamic, _Cols, _Options>
 {
     internal::plain_array<T,Size,_Options> m_data;
-    DenseIndex m_rows;
+    Index m_rows;
   public:
-    inline DenseStorage() : m_rows(0) {}
-    inline DenseStorage(internal::constructor_without_unaligned_array_assert)
+    EIGEN_DEVICE_FUNC DenseStorage() : m_rows(0) {}
+    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
       : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0) {}
-    inline DenseStorage(DenseIndex, DenseIndex nbRows, DenseIndex) : m_rows(nbRows) {}
-    inline void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
-    inline DenseIndex rows(void) const {return m_rows;}
-    inline DenseIndex cols(void) const {return _Cols;}
-    inline void conservativeResize(DenseIndex, DenseIndex nbRows, DenseIndex) { m_rows = nbRows; }
-    inline void resize(DenseIndex, DenseIndex nbRows, DenseIndex) { m_rows = nbRows; }
-    inline const T *data() const { return m_data.array; }
-    inline T *data() { return m_data.array; }
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_rows(other.m_rows) {}
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) 
+    {
+      if (this != &other)
+      {
+        m_data = other.m_data;
+        m_rows = other.m_rows;
+      }
+      return *this; 
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index) : m_rows(rows) {}
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
+    EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
+    EIGEN_DEVICE_FUNC Index cols(void) const {return _Cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index) { m_rows = rows; }
+    EIGEN_DEVICE_FUNC void resize(Index, Index rows, Index) { m_rows = rows; }
+    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
+    EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
 };
 
 // dynamic-size matrix with fixed-size storage and fixed height
 template<typename T, int Size, int _Rows, int _Options> class DenseStorage<T, Size, _Rows, Dynamic, _Options>
 {
     internal::plain_array<T,Size,_Options> m_data;
-    DenseIndex m_cols;
+    Index m_cols;
   public:
-    inline DenseStorage() : m_cols(0) {}
-    inline DenseStorage(internal::constructor_without_unaligned_array_assert)
+    EIGEN_DEVICE_FUNC DenseStorage() : m_cols(0) {}
+    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
       : m_data(internal::constructor_without_unaligned_array_assert()), m_cols(0) {}
-    inline DenseStorage(DenseIndex, DenseIndex, DenseIndex nbCols) : m_cols(nbCols) {}
-    inline void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
-    inline DenseIndex rows(void) const {return _Rows;}
-    inline DenseIndex cols(void) const {return m_cols;}
-    inline void conservativeResize(DenseIndex, DenseIndex, DenseIndex nbCols) { m_cols = nbCols; }
-    inline void resize(DenseIndex, DenseIndex, DenseIndex nbCols) { m_cols = nbCols; }
-    inline const T *data() const { return m_data.array; }
-    inline T *data() { return m_data.array; }
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_cols(other.m_cols) {}
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
+    {
+      if (this != &other)
+      {
+        m_data = other.m_data;
+        m_cols = other.m_cols;
+      }
+      return *this;
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(Index, Index, Index cols) : m_cols(cols) {}
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
+    EIGEN_DEVICE_FUNC Index rows(void) const {return _Rows;}
+    EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index, Index cols) { m_cols = cols; } // storage is the fixed m_data array: only the column count is recorded
+    EIGEN_DEVICE_FUNC void resize(Index, Index, Index cols) { m_cols = cols; } // same here: no allocation, just stores the new column count
+    EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
+    EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
 };
 
 // purely dynamic matrix.
 template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynamic, _Options>
 {
     T *m_data;
-    DenseIndex m_rows;
-    DenseIndex m_cols;
+    Index m_rows;
+    Index m_cols;
   public:
-    inline DenseStorage() : m_data(0), m_rows(0), m_cols(0) {}
-    inline DenseStorage(internal::constructor_without_unaligned_array_assert)
+    EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_rows(0), m_cols(0) {}
+    EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert)
        : m_data(0), m_rows(0), m_cols(0) {}
-    inline DenseStorage(DenseIndex size, DenseIndex nbRows, DenseIndex nbCols)
-      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(nbRows), m_cols(nbCols)
-    { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
-    inline ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols); }
-    inline void swap(DenseStorage& other)
+    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols)
+      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(rows), m_cols(cols)
+    {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+      eigen_internal_assert(size==rows*cols && rows>=0 && cols >=0);
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) // copy constructor: allocates a separate buffer instead of sharing other's pointer
+      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(other.m_rows*other.m_cols)) // requests rows*cols elements; the bool argument is true when DontAlign is not set in _Options
+      , m_rows(other.m_rows)
+      , m_cols(other.m_cols)
+    {
+      internal::smart_copy(other.m_data, other.m_data+other.m_rows*other.m_cols, m_data); // presumably copies [begin, end) of other's elements into the new buffer -- confirm smart_copy's contract
+    }
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
+    {
+      if (this != &other) // self-assignment: nothing to do
+      {
+        DenseStorage tmp(other); // make the copy first ...
+        this->swap(tmp); // ... then exchange members; tmp now holds the previous buffer and its destructor runs at the end of this scope
+      }
+      return *this;
+    }
+#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC
+    DenseStorage(DenseStorage&& other) // move constructor: takes over other's pointer and dimensions, no allocation
+      : m_data(std::move(other.m_data))
+      , m_rows(std::move(other.m_rows))
+      , m_cols(std::move(other.m_cols))
+    {
+      other.m_data = nullptr; // reset the source to null / 0x0 so the two objects do not hold the same pointer
+      other.m_rows = 0;
+      other.m_cols = 0;
+    }
+    EIGEN_DEVICE_FUNC
+    DenseStorage& operator=(DenseStorage&& other) // move assignment written as a member-wise swap
+    {
+      using std::swap;
+      swap(m_data, other.m_data); // this object's previous buffer ends up in other
+      swap(m_rows, other.m_rows);
+      swap(m_cols, other.m_cols);
+      return *this;
+    }
+#endif // EIGEN_HAVE_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
     { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
-    inline DenseIndex rows(void) const {return m_rows;}
-    inline DenseIndex cols(void) const {return m_cols;}
-    inline void conservativeResize(DenseIndex size, DenseIndex nbRows, DenseIndex nbCols)
+    EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
+    EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
+    void conservativeResize(Index size, Index rows, Index cols)
     {
       m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, m_rows*m_cols);
-      m_rows = nbRows;
-      m_cols = nbCols;
+      m_rows = rows;
+      m_cols = cols;
     }
-    void resize(DenseIndex size, DenseIndex nbRows, DenseIndex nbCols)
+    EIGEN_DEVICE_FUNC void resize(Index size, Index rows, Index cols)
     {
       if(size != m_rows*m_cols)
       {
@@ -247,33 +405,70 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
           m_data = 0;
         EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
       }
-      m_rows = nbRows;
-      m_cols = nbCols;
+      m_rows = rows;
+      m_cols = cols;
     }
-    inline const T *data() const { return m_data; }
-    inline T *data() { return m_data; }
+    EIGEN_DEVICE_FUNC const T *data() const { return m_data; }
+    EIGEN_DEVICE_FUNC T *data() { return m_data; }
 };
 
 // matrix with dynamic width and fixed height (so that matrix has dynamic size).
 template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Rows, Dynamic, _Options>
 {
     T *m_data;
-    DenseIndex m_cols;
+    Index m_cols;
   public:
-    inline DenseStorage() : m_data(0), m_cols(0) {}
-    inline DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {}
-    inline DenseStorage(DenseIndex size, DenseIndex, DenseIndex nbCols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_cols(nbCols)
-    { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
-    inline ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols); }
-    inline void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
-    static inline DenseIndex rows(void) {return _Rows;}
-    inline DenseIndex cols(void) const {return m_cols;}
-    inline void conservativeResize(DenseIndex size, DenseIndex, DenseIndex nbCols)
+    EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_cols(0) {}
+    explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {}
+    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_cols(cols)
+    {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+      eigen_internal_assert(size==rows*cols && rows==_Rows && cols >=0);
+      EIGEN_UNUSED_VARIABLE(rows);
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) // copy constructor: allocates a separate buffer of _Rows*other.m_cols elements
+      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(_Rows*other.m_cols))
+      , m_cols(other.m_cols)
+    {
+      internal::smart_copy(other.m_data, other.m_data+_Rows*m_cols, m_data); // reads this->m_cols, which the initializer list above has already set to other.m_cols
+    }
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
+    {
+      if (this != &other)
+      {
+        DenseStorage tmp(other);
+        this->swap(tmp);
+      }
+      return *this;
+    }    
+#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC
+    DenseStorage(DenseStorage&& other)
+      : m_data(std::move(other.m_data))
+      , m_cols(std::move(other.m_cols))
+    {
+      other.m_data = nullptr;
+      other.m_cols = 0;
+    }
+    EIGEN_DEVICE_FUNC
+    DenseStorage& operator=(DenseStorage&& other)
+    {
+      using std::swap;
+      swap(m_data, other.m_data);
+      swap(m_cols, other.m_cols);
+      return *this;
+    }
+#endif
+    EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
+    EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
+    EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
+    EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols)
     {
       m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, _Rows*m_cols);
-      m_cols = nbCols;
+      m_cols = cols;
     }
-    EIGEN_STRONG_INLINE void resize(DenseIndex size, DenseIndex, DenseIndex nbCols)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(Index size, Index, Index cols)
     {
       if(size != _Rows*m_cols)
       {
@@ -284,32 +479,69 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
           m_data = 0;
         EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
       }
-      m_cols = nbCols;
+      m_cols = cols;
     }
-    inline const T *data() const { return m_data; }
-    inline T *data() { return m_data; }
+    EIGEN_DEVICE_FUNC const T *data() const { return m_data; }
+    EIGEN_DEVICE_FUNC T *data() { return m_data; }
 };
 
 // matrix with dynamic height and fixed width (so that matrix has dynamic size).
 template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dynamic, _Cols, _Options>
 {
     T *m_data;
-    DenseIndex m_rows;
+    Index m_rows;
   public:
-    inline DenseStorage() : m_data(0), m_rows(0) {}
-    inline DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {}
-    inline DenseStorage(DenseIndex size, DenseIndex nbRows, DenseIndex) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(nbRows)
-    { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
-    inline ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows); }
-    inline void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
-    inline DenseIndex rows(void) const {return m_rows;}
-    static inline DenseIndex cols(void) {return _Cols;}
-    inline void conservativeResize(DenseIndex size, DenseIndex nbRows, DenseIndex)
+    EIGEN_DEVICE_FUNC DenseStorage() : m_data(0), m_rows(0) {}
+    explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {}
+    EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(rows)
+    {
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+      eigen_internal_assert(size==rows*cols && rows>=0 && cols == _Cols);
+      EIGEN_UNUSED_VARIABLE(cols);
+    }
+    EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other)
+      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(other.m_rows*_Cols))
+      , m_rows(other.m_rows)
+    {
+      internal::smart_copy(other.m_data, other.m_data+other.m_rows*_Cols, m_data);
+    }
+    EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other)
+    {
+      if (this != &other)
+      {
+        DenseStorage tmp(other);
+        this->swap(tmp);
+      }
+      return *this;
+    }    
+#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC
+    DenseStorage(DenseStorage&& other)
+      : m_data(std::move(other.m_data))
+      , m_rows(std::move(other.m_rows))
+    {
+      other.m_data = nullptr;
+      other.m_rows = 0;
+    }
+    EIGEN_DEVICE_FUNC
+    DenseStorage& operator=(DenseStorage&& other)
+    {
+      using std::swap;
+      swap(m_data, other.m_data);
+      swap(m_rows, other.m_rows);
+      return *this;
+    }
+#endif
+    EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows); }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
+    EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
+    EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
+    void conservativeResize(Index size, Index rows, Index)
     {
       m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, m_rows*_Cols);
-      m_rows = nbRows;
+      m_rows = rows;
     }
-    EIGEN_STRONG_INLINE void resize(DenseIndex size, DenseIndex nbRows, DenseIndex)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(Index size, Index rows, Index)
     {
       if(size != m_rows*_Cols)
       {
@@ -320,10 +552,10 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
           m_data = 0;
         EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
       }
-      m_rows = nbRows;
+      m_rows = rows;
     }
-    inline const T *data() const { return m_data; }
-    inline T *data() { return m_data; }
+    EIGEN_DEVICE_FUNC const T *data() const { return m_data; }
+    EIGEN_DEVICE_FUNC T *data() { return m_data; }
 };
 
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Core/Diagonal.h b/nuparu/include/Eigen/src/Core/Diagonal.h
index aab8007b..fa317626 100644
--- a/nuparu/include/Eigen/src/Core/Diagonal.h
+++ b/nuparu/include/Eigen/src/Core/Diagonal.h
@@ -37,7 +37,7 @@ template<typename MatrixType, int DiagIndex>
 struct traits<Diagonal<MatrixType,DiagIndex> >
  : traits<MatrixType>
 {
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
   typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
   typedef typename MatrixType::StorageKind StorageKind;
   enum {
@@ -52,8 +52,7 @@ struct traits<Diagonal<MatrixType,DiagIndex> >
                                                  MatrixType::MaxColsAtCompileTime - EIGEN_PLAIN_ENUM_MAX( DiagIndex, 0))),
     MaxColsAtCompileTime = 1,
     MaskLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
-    Flags = (unsigned int)_MatrixTypeNested::Flags & (HereditaryBits | LinearAccessBit | MaskLvalueBit | DirectAccessBit) & ~RowMajorBit,
-    CoeffReadCost = _MatrixTypeNested::CoeffReadCost,
+    Flags = (unsigned int)_MatrixTypeNested::Flags & (RowMajorBit | MaskLvalueBit | DirectAccessBit) & ~RowMajorBit, // FIXME DirectAccessBit should not be handled by expressions
     MatrixTypeOuterStride = outer_stride_at_compile_time<MatrixType>::ret,
     InnerStrideAtCompileTime = MatrixTypeOuterStride == Dynamic ? Dynamic : MatrixTypeOuterStride+1,
     OuterStrideAtCompileTime = 0
@@ -70,20 +69,28 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
     typedef typename internal::dense_xpr_base<Diagonal>::type Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(Diagonal)
 
-    inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) {}
+    EIGEN_DEVICE_FUNC
+    explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) {}
 
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Diagonal)
 
+    EIGEN_DEVICE_FUNC
     inline Index rows() const
-    { return m_index.value()<0 ? (std::min<Index>)(m_matrix.cols(),m_matrix.rows()+m_index.value()) : (std::min<Index>)(m_matrix.rows(),m_matrix.cols()-m_index.value()); }
+    { // row count of this expression (cols() is always 1)
+      return m_index.value()<0 ? numext::mini<Index>(m_matrix.cols(),m_matrix.rows()+m_index.value()) // negative index: min(cols, rows + index)
+                               : numext::mini<Index>(m_matrix.rows(),m_matrix.cols()-m_index.value()); // zero or positive index: min(rows, cols - index)
+    }
 
+    EIGEN_DEVICE_FUNC
     inline Index cols() const { return 1; }
 
+    EIGEN_DEVICE_FUNC
     inline Index innerStride() const
     {
       return m_matrix.outerStride() + 1;
     }
 
+    EIGEN_DEVICE_FUNC
     inline Index outerStride() const
     {
       return 0;
@@ -95,48 +102,58 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
                        const Scalar
                      >::type ScalarWithConstIfNotLvalue;
 
+    EIGEN_DEVICE_FUNC
     inline ScalarWithConstIfNotLvalue* data() { return &(m_matrix.const_cast_derived().coeffRef(rowOffset(), colOffset())); }
+    EIGEN_DEVICE_FUNC
     inline const Scalar* data() const { return &(m_matrix.const_cast_derived().coeffRef(rowOffset(), colOffset())); }
 
+    EIGEN_DEVICE_FUNC
     inline Scalar& coeffRef(Index row, Index)
     {
       EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
       return m_matrix.const_cast_derived().coeffRef(row+rowOffset(), row+colOffset());
     }
 
+    EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index row, Index) const
     {
       return m_matrix.const_cast_derived().coeffRef(row+rowOffset(), row+colOffset());
     }
 
+    EIGEN_DEVICE_FUNC
     inline CoeffReturnType coeff(Index row, Index) const
     {
       return m_matrix.coeff(row+rowOffset(), row+colOffset());
     }
 
+    EIGEN_DEVICE_FUNC
     inline Scalar& coeffRef(Index idx)
     {
       EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
       return m_matrix.const_cast_derived().coeffRef(idx+rowOffset(), idx+colOffset());
     }
 
+    EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index idx) const
     {
       return m_matrix.const_cast_derived().coeffRef(idx+rowOffset(), idx+colOffset());
     }
 
+    EIGEN_DEVICE_FUNC
     inline CoeffReturnType coeff(Index idx) const
     {
       return m_matrix.coeff(idx+rowOffset(), idx+colOffset());
     }
 
-    const typename internal::remove_all<typename MatrixType::Nested>::type& 
+    EIGEN_DEVICE_FUNC
+    inline const typename internal::remove_all<typename MatrixType::Nested>::type& 
     nestedExpression() const 
     {
       return m_matrix;
     }
 
-    int index() const
+    EIGEN_DEVICE_FUNC
+    inline Index index() const
     {
       return m_index.value();
     }
@@ -147,10 +164,13 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
 
   private:
     // some compilers may fail to optimize std::max etc in case of compile-time constants...
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Index absDiagIndex() const { return m_index.value()>0 ? m_index.value() : -m_index.value(); }
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value()>0 ? 0 : -m_index.value(); }
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value()>0 ? m_index.value() : 0; }
-    // triger a compile time error is someone try to call packet
+    // trigger a compile-time error if someone tries to call packet
     template<int LoadMode> typename MatrixType::PacketReturnType packet(Index) const;
     template<int LoadMode> typename MatrixType::PacketReturnType packet(Index,Index) const;
 };
@@ -167,7 +187,7 @@ template<typename Derived>
 inline typename MatrixBase<Derived>::DiagonalReturnType
 MatrixBase<Derived>::diagonal()
 {
-  return derived();
+  return DiagonalReturnType(derived());
 }
 
 /** This is the const version of diagonal(). */
@@ -190,18 +210,18 @@ MatrixBase<Derived>::diagonal() const
   *
   * \sa MatrixBase::diagonal(), class Diagonal */
 template<typename Derived>
-inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<DynamicIndex>::Type
+inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
 MatrixBase<Derived>::diagonal(Index index)
 {
-  return typename DiagonalIndexReturnType<DynamicIndex>::Type(derived(), index);
+  return DiagonalDynamicIndexReturnType(derived(), index);
 }
 
 /** This is the const version of diagonal(Index). */
 template<typename Derived>
-inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<DynamicIndex>::Type
+inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
 MatrixBase<Derived>::diagonal(Index index) const
 {
-  return typename ConstDiagonalIndexReturnType<DynamicIndex>::Type(derived(), index);
+  return ConstDiagonalDynamicIndexReturnType(derived(), index);
 }
 
 /** \returns an expression of the \a DiagIndex-th sub or super diagonal of the matrix \c *this
@@ -216,20 +236,20 @@ MatrixBase<Derived>::diagonal(Index index) const
   *
   * \sa MatrixBase::diagonal(), class Diagonal */
 template<typename Derived>
-template<int Index>
-inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index>::Type
+template<int Index_>
+inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index_>::Type
 MatrixBase<Derived>::diagonal()
 {
-  return derived();
+  return typename DiagonalIndexReturnType<Index_>::Type(derived());
 }
 
 /** This is the const version of diagonal<int>(). */
 template<typename Derived>
-template<int Index>
-inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index>::Type
+template<int Index_>
+inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index_>::Type
 MatrixBase<Derived>::diagonal() const
 {
-  return derived();
+  return typename ConstDiagonalIndexReturnType<Index_>::Type(derived());
 }
 
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Core/DiagonalMatrix.h b/nuparu/include/Eigen/src/Core/DiagonalMatrix.h
index e6c220f4..5a9e3abd 100644
--- a/nuparu/include/Eigen/src/Core/DiagonalMatrix.h
+++ b/nuparu/include/Eigen/src/Core/DiagonalMatrix.h
@@ -22,7 +22,7 @@ class DiagonalBase : public EigenBase<Derived>
     typedef typename DiagonalVectorType::Scalar Scalar;
     typedef typename DiagonalVectorType::RealScalar RealScalar;
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
+    typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
 
     enum {
       RowsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
@@ -30,79 +30,62 @@ class DiagonalBase : public EigenBase<Derived>
       MaxRowsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
       MaxColsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
       IsVectorAtCompileTime = 0,
-      Flags = 0
+      Flags = NoPreferredStorageOrderBit
     };
 
     typedef Matrix<Scalar, RowsAtCompileTime, ColsAtCompileTime, 0, MaxRowsAtCompileTime, MaxColsAtCompileTime> DenseMatrixType;
     typedef DenseMatrixType DenseType;
     typedef DiagonalMatrix<Scalar,DiagonalVectorType::SizeAtCompileTime,DiagonalVectorType::MaxSizeAtCompileTime> PlainObject;
 
+    EIGEN_DEVICE_FUNC
     inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
+    EIGEN_DEVICE_FUNC
     inline Derived& derived() { return *static_cast<Derived*>(this); }
 
+    EIGEN_DEVICE_FUNC
     DenseMatrixType toDenseMatrix() const { return derived(); }
-    template<typename DenseDerived>
-    void evalTo(MatrixBase<DenseDerived> &other) const;
-    template<typename DenseDerived>
-    void addTo(MatrixBase<DenseDerived> &other) const
-    { other.diagonal() += diagonal(); }
-    template<typename DenseDerived>
-    void subTo(MatrixBase<DenseDerived> &other) const
-    { other.diagonal() -= diagonal(); }
-
+    
+    EIGEN_DEVICE_FUNC
     inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); }
+    EIGEN_DEVICE_FUNC
     inline DiagonalVectorType& diagonal() { return derived().diagonal(); }
 
+    EIGEN_DEVICE_FUNC
     inline Index rows() const { return diagonal().size(); }
+    EIGEN_DEVICE_FUNC
     inline Index cols() const { return diagonal().size(); }
 
-    /** \returns the diagonal matrix product of \c *this by the matrix \a matrix.
-      */
     template<typename MatrixDerived>
-    const DiagonalProduct<MatrixDerived, Derived, OnTheLeft>
+    EIGEN_DEVICE_FUNC
+    const Product<Derived,MatrixDerived,LazyProduct>
     operator*(const MatrixBase<MatrixDerived> &matrix) const
     {
-      return DiagonalProduct<MatrixDerived, Derived, OnTheLeft>(matrix.derived(), derived());
+      return Product<Derived, MatrixDerived, LazyProduct>(derived(),matrix.derived());
     }
 
-    inline const DiagonalWrapper<const CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const DiagonalVectorType> >
+    typedef DiagonalWrapper<const CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const DiagonalVectorType> > InverseReturnType;
+    EIGEN_DEVICE_FUNC
+    inline const InverseReturnType
     inverse() const
     {
-      return diagonal().cwiseInverse();
+      return InverseReturnType(diagonal().cwiseInverse());
     }
     
-    inline const DiagonalWrapper<const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DiagonalVectorType> >
+    typedef DiagonalWrapper<const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DiagonalVectorType> > ScalarMultipleReturnType;
+    EIGEN_DEVICE_FUNC
+    inline const ScalarMultipleReturnType
     operator*(const Scalar& scalar) const
     {
-      return diagonal() * scalar;
+      return ScalarMultipleReturnType(diagonal() * scalar);
     }
-    friend inline const DiagonalWrapper<const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DiagonalVectorType> >
+    EIGEN_DEVICE_FUNC
+    friend inline const ScalarMultipleReturnType
     operator*(const Scalar& scalar, const DiagonalBase& other)
     {
-      return other.diagonal() * scalar;
-    }
-    
-    #ifdef EIGEN2_SUPPORT
-    template<typename OtherDerived>
-    bool isApprox(const DiagonalBase<OtherDerived>& other, typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) const
-    {
-      return diagonal().isApprox(other.diagonal(), precision);
+      return ScalarMultipleReturnType(other.diagonal() * scalar);
     }
-    template<typename OtherDerived>
-    bool isApprox(const MatrixBase<OtherDerived>& other, typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) const
-    {
-      return toDenseMatrix().isApprox(other, precision);
-    }
-    #endif
 };
 
-template<typename Derived>
-template<typename DenseDerived>
-void DiagonalBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
-{
-  other.setZero();
-  other.diagonal() = diagonal();
-}
 #endif
 
 /** \class DiagonalMatrix
@@ -124,10 +107,9 @@ struct traits<DiagonalMatrix<_Scalar,SizeAtCompileTime,MaxSizeAtCompileTime> >
  : traits<Matrix<_Scalar,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
 {
   typedef Matrix<_Scalar,SizeAtCompileTime,1,0,MaxSizeAtCompileTime,1> DiagonalVectorType;
-  typedef Dense StorageKind;
-  typedef DenseIndex Index;
+  typedef DiagonalShape StorageKind;
   enum {
-    Flags = LvalueBit
+    Flags = LvalueBit | NoPreferredStorageOrderBit
   };
 };
 }
@@ -141,7 +123,7 @@ class DiagonalMatrix
     typedef const DiagonalMatrix& Nested;
     typedef _Scalar Scalar;
     typedef typename internal::traits<DiagonalMatrix>::StorageKind StorageKind;
-    typedef typename internal::traits<DiagonalMatrix>::Index Index;
+    typedef typename internal::traits<DiagonalMatrix>::StorageIndex StorageIndex;
     #endif
 
   protected:
@@ -151,24 +133,31 @@ class DiagonalMatrix
   public:
 
     /** const version of diagonal(). */
+    EIGEN_DEVICE_FUNC
     inline const DiagonalVectorType& diagonal() const { return m_diagonal; }
     /** \returns a reference to the stored vector of diagonal coefficients. */
+    EIGEN_DEVICE_FUNC
     inline DiagonalVectorType& diagonal() { return m_diagonal; }
 
     /** Default constructor without initialization */
+    EIGEN_DEVICE_FUNC
     inline DiagonalMatrix() {}
 
     /** Constructs a diagonal matrix with given dimension  */
-    inline DiagonalMatrix(Index dim) : m_diagonal(dim) {}
+    EIGEN_DEVICE_FUNC
+    explicit inline DiagonalMatrix(Index dim) : m_diagonal(dim) {}
 
     /** 2D constructor. */
+    EIGEN_DEVICE_FUNC
     inline DiagonalMatrix(const Scalar& x, const Scalar& y) : m_diagonal(x,y) {}
 
     /** 3D constructor. */
+    EIGEN_DEVICE_FUNC
     inline DiagonalMatrix(const Scalar& x, const Scalar& y, const Scalar& z) : m_diagonal(x,y,z) {}
 
     /** Copy constructor. */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     inline DiagonalMatrix(const DiagonalBase<OtherDerived>& other) : m_diagonal(other.diagonal()) {}
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
@@ -178,11 +167,13 @@ class DiagonalMatrix
 
     /** generic constructor from expression of the diagonal coefficients */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     explicit inline DiagonalMatrix(const MatrixBase<OtherDerived>& other) : m_diagonal(other)
     {}
 
     /** Copy operator. */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     DiagonalMatrix& operator=(const DiagonalBase<OtherDerived>& other)
     {
       m_diagonal = other.diagonal();
@@ -193,6 +184,7 @@ class DiagonalMatrix
     /** This is a special case of the templated operator=. Its purpose is to
       * prevent a default operator= from hiding the templated operator=.
       */
+    EIGEN_DEVICE_FUNC
     DiagonalMatrix& operator=(const DiagonalMatrix& other)
     {
       m_diagonal = other.diagonal();
@@ -201,14 +193,19 @@ class DiagonalMatrix
     #endif
 
     /** Resizes to given size. */
+    EIGEN_DEVICE_FUNC
     inline void resize(Index size) { m_diagonal.resize(size); }
     /** Sets all coefficients to zero. */
+    EIGEN_DEVICE_FUNC
     inline void setZero() { m_diagonal.setZero(); }
     /** Resizes and sets all coefficients to zero. */
+    EIGEN_DEVICE_FUNC
     inline void setZero(Index size) { m_diagonal.setZero(size); }
     /** Sets this matrix to be the identity matrix of the current size. */
+    EIGEN_DEVICE_FUNC
     inline void setIdentity() { m_diagonal.setOnes(); }
     /** Sets this matrix to be the identity matrix of the given size. */
+    EIGEN_DEVICE_FUNC
     inline void setIdentity(Index size) { m_diagonal.setOnes(size); }
 };
 
@@ -232,14 +229,15 @@ struct traits<DiagonalWrapper<_DiagonalVectorType> >
 {
   typedef _DiagonalVectorType DiagonalVectorType;
   typedef typename DiagonalVectorType::Scalar Scalar;
-  typedef typename DiagonalVectorType::Index Index;
-  typedef typename DiagonalVectorType::StorageKind StorageKind;
+  typedef typename DiagonalVectorType::StorageIndex StorageIndex;
+  typedef DiagonalShape StorageKind;
+  typedef typename traits<DiagonalVectorType>::XprKind XprKind;
   enum {
     RowsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
     ColsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
-    MaxRowsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
-    MaxColsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
-    Flags =  traits<DiagonalVectorType>::Flags & LvalueBit
+    MaxRowsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
+    MaxColsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
+    Flags =  (traits<DiagonalVectorType>::Flags & LvalueBit) | NoPreferredStorageOrderBit
   };
 };
 }
@@ -255,9 +253,11 @@ class DiagonalWrapper
     #endif
 
     /** Constructor from expression of diagonal coefficients to wrap. */
-    inline DiagonalWrapper(DiagonalVectorType& a_diagonal) : m_diagonal(a_diagonal) {}
+    EIGEN_DEVICE_FUNC
+    explicit inline DiagonalWrapper(DiagonalVectorType& a_diagonal) : m_diagonal(a_diagonal) {}
 
     /** \returns a const reference to the wrapped expression of diagonal coefficients. */
+    EIGEN_DEVICE_FUNC
     const DiagonalVectorType& diagonal() const { return m_diagonal; }
 
   protected:
@@ -277,7 +277,7 @@ template<typename Derived>
 inline const DiagonalWrapper<const Derived>
 MatrixBase<Derived>::asDiagonal() const
 {
-  return derived();
+  return DiagonalWrapper<const Derived>(derived()); // the DiagonalWrapper constructor is explicit, so wrap derived() by name
 }
 
 /** \returns true if *this is approximately equal to a diagonal matrix,
@@ -308,6 +308,33 @@ bool MatrixBase<Derived>::isDiagonal(const RealScalar& prec) const
   return true;
 }
 
+namespace internal {
+// For the DiagonalShape storage kind, the associated Shape is DiagonalShape itself.
+template<> struct storage_kind_to_shape<DiagonalShape> { typedef DiagonalShape Shape; };
+// Empty tag type used below to select the diagonal-to-dense Assignment specialization.
+struct Diagonal2Dense {};
+// Registers Diagonal2Dense as the assignment Kind for the (DenseShape, DiagonalShape) pair.
+template<> struct AssignmentKind<DenseShape,DiagonalShape> { typedef Diagonal2Dense Kind; };
+
+// Diagonal matrix to Dense assignment: one run() overload per assignment functor (=, +=, -=).
+template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
+struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Dense, Scalar>
+{
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/)
+  {
+    dst.setZero(); // zero dst first; only its diagonal is written on the next line
+    dst.diagonal() = src.diagonal();
+  }
+  // dst += src: only the diagonal coefficients of dst are updated.
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar> &/*func*/)
+  { dst.diagonal() += src.diagonal(); }
+  // dst -= src: only the diagonal coefficients of dst are updated.
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar> &/*func*/)
+  { dst.diagonal() -= src.diagonal(); }
+};
+
+} // namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_DIAGONALMATRIX_H
diff --git a/nuparu/include/Eigen/src/Core/DiagonalProduct.h b/nuparu/include/Eigen/src/Core/DiagonalProduct.h
index c03a0c2e..d372b938 100644
--- a/nuparu/include/Eigen/src/Core/DiagonalProduct.h
+++ b/nuparu/include/Eigen/src/Core/DiagonalProduct.h
@@ -13,116 +13,14 @@
 
 namespace Eigen { 
 
-namespace internal {
-template<typename MatrixType, typename DiagonalType, int ProductOrder>
-struct traits<DiagonalProduct<MatrixType, DiagonalType, ProductOrder> >
- : traits<MatrixType>
-{
-  typedef typename scalar_product_traits<typename MatrixType::Scalar, typename DiagonalType::Scalar>::ReturnType Scalar;
-  enum {
-    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-
-    _StorageOrder = MatrixType::Flags & RowMajorBit ? RowMajor : ColMajor,
-    _ScalarAccessOnDiag =  !((int(_StorageOrder) == ColMajor && int(ProductOrder) == OnTheLeft)
-                          ||(int(_StorageOrder) == RowMajor && int(ProductOrder) == OnTheRight)),
-    _SameTypes = is_same<typename MatrixType::Scalar, typename DiagonalType::Scalar>::value,
-    // FIXME currently we need same types, but in the future the next rule should be the one
-    //_Vectorizable = bool(int(MatrixType::Flags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagonalType::DiagonalVectorType::Flags)&PacketAccessBit))),
-    _Vectorizable = bool(int(MatrixType::Flags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagonalType::DiagonalVectorType::Flags)&PacketAccessBit))),
-    _LinearAccessMask = (RowsAtCompileTime==1 || ColsAtCompileTime==1) ? LinearAccessBit : 0,
-
-    Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixType::Flags)) | (_Vectorizable ? PacketAccessBit : 0) | AlignedBit,//(int(MatrixType::Flags)&int(DiagonalType::DiagonalVectorType::Flags)&AlignedBit),
-    CoeffReadCost = NumTraits<Scalar>::MulCost + MatrixType::CoeffReadCost + DiagonalType::DiagonalVectorType::CoeffReadCost
-  };
-};
-}
-
-template<typename MatrixType, typename DiagonalType, int ProductOrder>
-class DiagonalProduct : internal::no_assignment_operator,
-                        public MatrixBase<DiagonalProduct<MatrixType, DiagonalType, ProductOrder> >
-{
-  public:
-
-    typedef MatrixBase<DiagonalProduct> Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(DiagonalProduct)
-
-    inline DiagonalProduct(const MatrixType& matrix, const DiagonalType& diagonal)
-      : m_matrix(matrix), m_diagonal(diagonal)
-    {
-      eigen_assert(diagonal.diagonal().size() == (ProductOrder == OnTheLeft ? matrix.rows() : matrix.cols()));
-    }
-
-    EIGEN_STRONG_INLINE Index rows() const { return m_matrix.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return m_matrix.cols(); }
-
-    EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
-    {
-      return m_diagonal.diagonal().coeff(ProductOrder == OnTheLeft ? row : col) * m_matrix.coeff(row, col);
-    }
-    
-    EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
-    {
-      enum {
-        StorageOrder = int(MatrixType::Flags) & RowMajorBit ? RowMajor : ColMajor
-      };
-      return coeff(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index row, Index col) const
-    {
-      enum {
-        StorageOrder = Flags & RowMajorBit ? RowMajor : ColMajor
-      };
-      const Index indexInDiagonalVector = ProductOrder == OnTheLeft ? row : col;
-      return packet_impl<LoadMode>(row,col,indexInDiagonalVector,typename internal::conditional<
-        ((int(StorageOrder) == RowMajor && int(ProductOrder) == OnTheLeft)
-       ||(int(StorageOrder) == ColMajor && int(ProductOrder) == OnTheRight)), internal::true_type, internal::false_type>::type());
-    }
-    
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index idx) const
-    {
-      enum {
-        StorageOrder = int(MatrixType::Flags) & RowMajorBit ? RowMajor : ColMajor
-      };
-      return packet<LoadMode>(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
-    }
-
-  protected:
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet_impl(Index row, Index col, Index id, internal::true_type) const
-    {
-      return internal::pmul(m_matrix.template packet<LoadMode>(row, col),
-                     internal::pset1<PacketScalar>(m_diagonal.diagonal().coeff(id)));
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet_impl(Index row, Index col, Index id, internal::false_type) const
-    {
-      enum {
-        InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime,
-        DiagonalVectorPacketLoadMode = (LoadMode == Aligned && (((InnerSize%16) == 0) || (int(DiagonalType::DiagonalVectorType::Flags)&AlignedBit)==AlignedBit) ? Aligned : Unaligned)
-      };
-      return internal::pmul(m_matrix.template packet<LoadMode>(row, col),
-                     m_diagonal.diagonal().template packet<DiagonalVectorPacketLoadMode>(id));
-    }
-
-    typename MatrixType::Nested m_matrix;
-    typename DiagonalType::Nested m_diagonal;
-};
-
 /** \returns the diagonal matrix product of \c *this by the diagonal matrix \a diagonal.
   */
 template<typename Derived>
 template<typename DiagonalDerived>
-inline const DiagonalProduct<Derived, DiagonalDerived, OnTheRight>
+inline const Product<Derived, DiagonalDerived, LazyProduct>
 MatrixBase<Derived>::operator*(const DiagonalBase<DiagonalDerived> &a_diagonal) const
 {
-  return DiagonalProduct<Derived, DiagonalDerived, OnTheRight>(derived(), a_diagonal.derived());
+  return Product<Derived, DiagonalDerived, LazyProduct>(derived(),a_diagonal.derived()); // *this is the left operand, the diagonal the right one
 }
 
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Core/Dot.h b/nuparu/include/Eigen/src/Core/Dot.h
index 9d7651f1..003450f1 100644
--- a/nuparu/include/Eigen/src/Core/Dot.h
+++ b/nuparu/include/Eigen/src/Core/Dot.h
@@ -29,6 +29,7 @@ template<typename T, typename U,
 struct dot_nocheck
 {
   typedef typename scalar_product_traits<typename traits<T>::Scalar,typename traits<U>::Scalar>::ReturnType ResScalar;
+  EIGEN_DEVICE_FUNC
   static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
   {
     return a.template binaryExpr<scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> >(b).sum();
@@ -39,6 +40,7 @@ template<typename T, typename U>
 struct dot_nocheck<T, U, true>
 {
   typedef typename scalar_product_traits<typename traits<T>::Scalar,typename traits<U>::Scalar>::ReturnType ResScalar;
+  EIGEN_DEVICE_FUNC
   static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
   {
     return a.transpose().template binaryExpr<scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> >(b).sum();
@@ -59,6 +61,7 @@ struct dot_nocheck<T, U, true>
   */
 template<typename Derived>
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
 typename internal::scalar_product_traits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
 MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
 {
@@ -73,34 +76,6 @@ MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
   return internal::dot_nocheck<Derived,OtherDerived>::run(*this, other);
 }
 
-#ifdef EIGEN2_SUPPORT
-/** \returns the dot product of *this with other, with the Eigen2 convention that the dot product is linear in the first variable
-  * (conjugating the second variable). Of course this only makes a difference in the complex case.
-  *
-  * This method is only available in EIGEN2_SUPPORT mode.
-  *
-  * \only_for_vectors
-  *
-  * \sa dot()
-  */
-template<typename Derived>
-template<typename OtherDerived>
-typename internal::traits<Derived>::Scalar
-MatrixBase<Derived>::eigen2_dot(const MatrixBase<OtherDerived>& other) const
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
-  EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(Derived,OtherDerived)
-  EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
-    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-
-  eigen_assert(size() == other.size());
-
-  return internal::dot_nocheck<OtherDerived,Derived>::run(other,*this);
-}
-#endif
-
-
 //---------- implementation of L2 norm and related functions ----------
 
 /** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the Frobenius norm.
@@ -124,7 +99,7 @@ EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scala
 template<typename Derived>
 inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
 {
-  using std::sqrt;
+  EIGEN_USING_STD_MATH(sqrt)
   return sqrt(squaredNorm());
 }
 
@@ -138,8 +113,7 @@ template<typename Derived>
 inline const typename MatrixBase<Derived>::PlainObject
 MatrixBase<Derived>::normalized() const
 {
-  typedef typename internal::nested<Derived>::type Nested;
-  typedef typename internal::remove_reference<Nested>::type _Nested;
+  typedef typename internal::nested_eval<Derived,2>::type _Nested;
   _Nested n(derived());
   return n / n.norm();
 }
@@ -164,9 +138,10 @@ template<typename Derived, int p>
 struct lpNorm_selector
 {
   typedef typename NumTraits<typename traits<Derived>::Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
   static inline RealScalar run(const MatrixBase<Derived>& m)
   {
-    using std::pow;
+    EIGEN_USING_STD_MATH(pow)
     return pow(m.cwiseAbs().array().pow(p).sum(), RealScalar(1)/p);
   }
 };
@@ -174,6 +149,7 @@ struct lpNorm_selector
 template<typename Derived>
 struct lpNorm_selector<Derived, 1>
 {
+  EIGEN_DEVICE_FUNC
   static inline typename NumTraits<typename traits<Derived>::Scalar>::Real run(const MatrixBase<Derived>& m)
   {
     return m.cwiseAbs().sum();
@@ -183,6 +159,7 @@ struct lpNorm_selector<Derived, 1>
 template<typename Derived>
 struct lpNorm_selector<Derived, 2>
 {
+  EIGEN_DEVICE_FUNC
   static inline typename NumTraits<typename traits<Derived>::Scalar>::Real run(const MatrixBase<Derived>& m)
   {
     return m.norm();
@@ -192,6 +169,7 @@ struct lpNorm_selector<Derived, 2>
 template<typename Derived>
 struct lpNorm_selector<Derived, Infinity>
 {
+  EIGEN_DEVICE_FUNC
   static inline typename NumTraits<typename traits<Derived>::Scalar>::Real run(const MatrixBase<Derived>& m)
   {
     return m.cwiseAbs().maxCoeff();
@@ -200,9 +178,11 @@ struct lpNorm_selector<Derived, Infinity>
 
 } // end namespace internal
 
-/** \returns the \f$ \ell^p \f$ norm of *this, that is, returns the p-th root of the sum of the p-th powers of the absolute values
-  *          of the coefficients of *this. If \a p is the special value \a Eigen::Infinity, this function returns the \f$ \ell^\infty \f$
-  *          norm, that is the maximum of the absolute values of the coefficients of *this.
+/** \returns the \b coefficient-wise \f$ \ell^p \f$ norm of \c *this, that is, returns the p-th root of the sum of the p-th powers of the absolute values
+  *          of the coefficients of \c *this. If \a p is the special value \a Eigen::Infinity, this function returns the \f$ \ell^\infty \f$
+  *          norm, that is the maximum of the absolute values of the coefficients of \c *this.
+  *
+  * \note For matrices, this function does not compute the <a href="https://en.wikipedia.org/wiki/Operator_norm">operator-norm</a>. That is, if \c *this is a matrix, then its coefficients are interpreted as a 1D vector. Nonetheless, you can easily compute the 1-norm and \f$\infty\f$-norm matrix operator norms using \link TutorialReductionsVisitorsBroadcastingReductionsNorm partial reductions \endlink.
   *
   * \sa norm()
   */
@@ -227,8 +207,8 @@ template<typename OtherDerived>
 bool MatrixBase<Derived>::isOrthogonal
 (const MatrixBase<OtherDerived>& other, const RealScalar& prec) const
 {
-  typename internal::nested<Derived,2>::type nested(derived());
-  typename internal::nested<OtherDerived,2>::type otherNested(other.derived());
+  typename internal::nested_eval<Derived,2>::type nested(derived());
+  typename internal::nested_eval<OtherDerived,2>::type otherNested(other.derived());
   return numext::abs2(nested.dot(otherNested)) <= prec * prec * nested.squaredNorm() * otherNested.squaredNorm();
 }
 
@@ -246,13 +226,13 @@ bool MatrixBase<Derived>::isOrthogonal
 template<typename Derived>
 bool MatrixBase<Derived>::isUnitary(const RealScalar& prec) const
 {
-  typename Derived::Nested nested(derived());
+  typename internal::nested_eval<Derived,1>::type self(derived());
   for(Index i = 0; i < cols(); ++i)
   {
-    if(!internal::isApprox(nested.col(i).squaredNorm(), static_cast<RealScalar>(1), prec))
+    if(!internal::isApprox(self.col(i).squaredNorm(), static_cast<RealScalar>(1), prec))
       return false;
     for(Index j = 0; j < i; ++j)
-      if(!internal::isMuchSmallerThan(nested.col(i).dot(nested.col(j)), static_cast<Scalar>(1), prec))
+      if(!internal::isMuchSmallerThan(self.col(i).dot(self.col(j)), static_cast<Scalar>(1), prec))
         return false;
   }
   return true;
diff --git a/nuparu/include/Eigen/src/Core/EigenBase.h b/nuparu/include/Eigen/src/Core/EigenBase.h
index 2b8dd1b7..79dabda3 100644
--- a/nuparu/include/Eigen/src/Core/EigenBase.h
+++ b/nuparu/include/Eigen/src/Core/EigenBase.h
@@ -13,7 +13,9 @@
 
 namespace Eigen {
 
-/** Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T).
+/** \class EigenBase
+  * 
+  * Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T).
   *
   * In other words, an EigenBase object is an object that can be copied into a MatrixBase.
   *
@@ -26,34 +28,52 @@ namespace Eigen {
 template<typename Derived> struct EigenBase
 {
 //   typedef typename internal::plain_matrix_type<Derived>::type PlainObject;
-
+  
+  /** \brief The interface type of indices
+    * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE.
+    * \deprecated Since Eigen 3.3, its usage is deprecated. Use Eigen::Index instead.
+    * \sa StorageIndex, \ref TopicPreprocessorDirectives.
+    */
+  typedef Eigen::Index Index;
+
+  // FIXME is it needed?
   typedef typename internal::traits<Derived>::StorageKind StorageKind;
-  typedef typename internal::traits<Derived>::Index Index;
 
   /** \returns a reference to the derived object */
+  EIGEN_DEVICE_FUNC
   Derived& derived() { return *static_cast<Derived*>(this); }
   /** \returns a const reference to the derived object */
+  EIGEN_DEVICE_FUNC
   const Derived& derived() const { return *static_cast<const Derived*>(this); }
 
+  EIGEN_DEVICE_FUNC
   inline Derived& const_cast_derived() const
   { return *static_cast<Derived*>(const_cast<EigenBase*>(this)); }
+  EIGEN_DEVICE_FUNC
   inline const Derived& const_derived() const
   { return *static_cast<const Derived*>(this); }
 
   /** \returns the number of rows. \sa cols(), RowsAtCompileTime */
+  EIGEN_DEVICE_FUNC
   inline Index rows() const { return derived().rows(); }
   /** \returns the number of columns. \sa rows(), ColsAtCompileTime*/
+  EIGEN_DEVICE_FUNC
   inline Index cols() const { return derived().cols(); }
   /** \returns the number of coefficients, which is rows()*cols().
     * \sa rows(), cols(), SizeAtCompileTime. */
+  EIGEN_DEVICE_FUNC
   inline Index size() const { return rows() * cols(); }
 
   /** \internal Don't use it, but do the equivalent: \code dst = *this; \endcode */
-  template<typename Dest> inline void evalTo(Dest& dst) const
+  template<typename Dest>
+  EIGEN_DEVICE_FUNC
+  inline void evalTo(Dest& dst) const
   { derived().evalTo(dst); }
 
   /** \internal Don't use it, but do the equivalent: \code dst += *this; \endcode */
-  template<typename Dest> inline void addTo(Dest& dst) const
+  template<typename Dest>
+  EIGEN_DEVICE_FUNC
+  inline void addTo(Dest& dst) const
   {
     // This is the default implementation,
     // derived class can reimplement it in a more optimized way.
@@ -63,7 +83,9 @@ template<typename Derived> struct EigenBase
   }
 
   /** \internal Don't use it, but do the equivalent: \code dst -= *this; \endcode */
-  template<typename Dest> inline void subTo(Dest& dst) const
+  template<typename Dest>
+  EIGEN_DEVICE_FUNC
+  inline void subTo(Dest& dst) const
   {
     // This is the default implementation,
     // derived class can reimplement it in a more optimized way.
@@ -73,7 +95,8 @@ template<typename Derived> struct EigenBase
   }
 
   /** \internal Don't use it, but do the equivalent: \code dst.applyOnTheRight(*this); \endcode */
-  template<typename Dest> inline void applyThisOnTheRight(Dest& dst) const
+  template<typename Dest>
+  EIGEN_DEVICE_FUNC inline void applyThisOnTheRight(Dest& dst) const
   {
     // This is the default implementation,
     // derived class can reimplement it in a more optimized way.
@@ -81,7 +104,8 @@ template<typename Derived> struct EigenBase
   }
 
   /** \internal Don't use it, but do the equivalent: \code dst.applyOnTheLeft(*this); \endcode */
-  template<typename Dest> inline void applyThisOnTheLeft(Dest& dst) const
+  template<typename Dest>
+  EIGEN_DEVICE_FUNC inline void applyThisOnTheLeft(Dest& dst) const
   {
     // This is the default implementation,
     // derived class can reimplement it in a more optimized way.
@@ -106,7 +130,7 @@ template<typename Derived>
 template<typename OtherDerived>
 Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
 {
-  other.derived().evalTo(derived());
+  call_assignment(derived(), other.derived());
   return derived();
 }
 
@@ -114,7 +138,7 @@ template<typename Derived>
 template<typename OtherDerived>
 Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
 {
-  other.derived().addTo(derived());
+  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar>());
   return derived();
 }
 
@@ -122,40 +146,10 @@ template<typename Derived>
 template<typename OtherDerived>
 Derived& DenseBase<Derived>::operator-=(const EigenBase<OtherDerived> &other)
 {
-  other.derived().subTo(derived());
-  return derived();
-}
-
-/** replaces \c *this by \c *this * \a other.
-  *
-  * \returns a reference to \c *this
-  */
-template<typename Derived>
-template<typename OtherDerived>
-inline Derived&
-MatrixBase<Derived>::operator*=(const EigenBase<OtherDerived> &other)
-{
-  other.derived().applyThisOnTheRight(derived());
+  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar>());
   return derived();
 }
 
-/** replaces \c *this by \c *this * \a other. It is equivalent to MatrixBase::operator*=().
-  */
-template<typename Derived>
-template<typename OtherDerived>
-inline void MatrixBase<Derived>::applyOnTheRight(const EigenBase<OtherDerived> &other)
-{
-  other.derived().applyThisOnTheRight(derived());
-}
-
-/** replaces \c *this by \c *this * \a other. */
-template<typename Derived>
-template<typename OtherDerived>
-inline void MatrixBase<Derived>::applyOnTheLeft(const EigenBase<OtherDerived> &other)
-{
-  other.derived().applyThisOnTheLeft(derived());
-}
-
 } // end namespace Eigen
 
 #endif // EIGEN_EIGENBASE_H
diff --git a/nuparu/include/Eigen/src/Core/Flagged.h b/nuparu/include/Eigen/src/Core/Flagged.h
deleted file mode 100644
index 1f2955fc..00000000
--- a/nuparu/include/Eigen/src/Core/Flagged.h
+++ /dev/null
@@ -1,140 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_FLAGGED_H
-#define EIGEN_FLAGGED_H
-
-namespace Eigen { 
-
-/** \class Flagged
-  * \ingroup Core_Module
-  *
-  * \brief Expression with modified flags
-  *
-  * \param ExpressionType the type of the object of which we are modifying the flags
-  * \param Added the flags added to the expression
-  * \param Removed the flags removed from the expression (has priority over Added).
-  *
-  * This class represents an expression whose flags have been modified.
-  * It is the return type of MatrixBase::flagged()
-  * and most of the time this is the only way it is used.
-  *
-  * \sa MatrixBase::flagged()
-  */
-
-namespace internal {
-template<typename ExpressionType, unsigned int Added, unsigned int Removed>
-struct traits<Flagged<ExpressionType, Added, Removed> > : traits<ExpressionType>
-{
-  enum { Flags = (ExpressionType::Flags | Added) & ~Removed };
-};
-}
-
-template<typename ExpressionType, unsigned int Added, unsigned int Removed> class Flagged
-  : public MatrixBase<Flagged<ExpressionType, Added, Removed> >
-{
-  public:
-
-    typedef MatrixBase<Flagged> Base;
-    
-    EIGEN_DENSE_PUBLIC_INTERFACE(Flagged)
-    typedef typename internal::conditional<internal::must_nest_by_value<ExpressionType>::ret,
-        ExpressionType, const ExpressionType&>::type ExpressionTypeNested;
-    typedef typename ExpressionType::InnerIterator InnerIterator;
-
-    inline Flagged(const ExpressionType& matrix) : m_matrix(matrix) {}
-
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
-    inline Index outerStride() const { return m_matrix.outerStride(); }
-    inline Index innerStride() const { return m_matrix.innerStride(); }
-
-    inline CoeffReturnType coeff(Index row, Index col) const
-    {
-      return m_matrix.coeff(row, col);
-    }
-
-    inline CoeffReturnType coeff(Index index) const
-    {
-      return m_matrix.coeff(index);
-    }
-    
-    inline const Scalar& coeffRef(Index row, Index col) const
-    {
-      return m_matrix.const_cast_derived().coeffRef(row, col);
-    }
-
-    inline const Scalar& coeffRef(Index index) const
-    {
-      return m_matrix.const_cast_derived().coeffRef(index);
-    }
-
-    inline Scalar& coeffRef(Index row, Index col)
-    {
-      return m_matrix.const_cast_derived().coeffRef(row, col);
-    }
-
-    inline Scalar& coeffRef(Index index)
-    {
-      return m_matrix.const_cast_derived().coeffRef(index);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index row, Index col) const
-    {
-      return m_matrix.template packet<LoadMode>(row, col);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index row, Index col, const PacketScalar& x)
-    {
-      m_matrix.const_cast_derived().template writePacket<LoadMode>(row, col, x);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index index) const
-    {
-      return m_matrix.template packet<LoadMode>(index);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& x)
-    {
-      m_matrix.const_cast_derived().template writePacket<LoadMode>(index, x);
-    }
-
-    const ExpressionType& _expression() const { return m_matrix; }
-
-    template<typename OtherDerived>
-    typename ExpressionType::PlainObject solveTriangular(const MatrixBase<OtherDerived>& other) const;
-
-    template<typename OtherDerived>
-    void solveTriangularInPlace(const MatrixBase<OtherDerived>& other) const;
-
-  protected:
-    ExpressionTypeNested m_matrix;
-};
-
-/** \returns an expression of *this with added and removed flags
-  *
-  * This is mostly for internal use.
-  *
-  * \sa class Flagged
-  */
-template<typename Derived>
-template<unsigned int Added,unsigned int Removed>
-inline const Flagged<Derived, Added, Removed>
-DenseBase<Derived>::flagged() const
-{
-  return derived();
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_FLAGGED_H
diff --git a/nuparu/include/Eigen/src/Core/ForceAlignedAccess.h b/nuparu/include/Eigen/src/Core/ForceAlignedAccess.h
index 807c7a29..7b08b45e 100644
--- a/nuparu/include/Eigen/src/Core/ForceAlignedAccess.h
+++ b/nuparu/include/Eigen/src/Core/ForceAlignedAccess.h
@@ -39,29 +39,29 @@ template<typename ExpressionType> class ForceAlignedAccess
     typedef typename internal::dense_xpr_base<ForceAlignedAccess>::type Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(ForceAlignedAccess)
 
-    inline ForceAlignedAccess(const ExpressionType& matrix) : m_expression(matrix) {}
+    EIGEN_DEVICE_FUNC explicit inline ForceAlignedAccess(const ExpressionType& matrix) : m_expression(matrix) {}
 
-    inline Index rows() const { return m_expression.rows(); }
-    inline Index cols() const { return m_expression.cols(); }
-    inline Index outerStride() const { return m_expression.outerStride(); }
-    inline Index innerStride() const { return m_expression.innerStride(); }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_expression.rows(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_expression.cols(); }
+    EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_expression.outerStride(); }
+    EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_expression.innerStride(); }
 
-    inline const CoeffReturnType coeff(Index row, Index col) const
+    EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const
     {
       return m_expression.coeff(row, col);
     }
 
-    inline Scalar& coeffRef(Index row, Index col)
+    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col)
     {
       return m_expression.const_cast_derived().coeffRef(row, col);
     }
 
-    inline const CoeffReturnType coeff(Index index) const
+    EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index index) const
     {
       return m_expression.coeff(index);
     }
 
-    inline Scalar& coeffRef(Index index)
+    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index)
     {
       return m_expression.const_cast_derived().coeffRef(index);
     }
@@ -90,7 +90,7 @@ template<typename ExpressionType> class ForceAlignedAccess
       m_expression.const_cast_derived().template writePacket<Aligned>(index, x);
     }
 
-    operator const ExpressionType&() const { return m_expression; }
+    EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; }
 
   protected:
     const ExpressionType& m_expression;
@@ -127,7 +127,7 @@ template<bool Enable>
 inline typename internal::add_const_on_value_type<typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type>::type
 MatrixBase<Derived>::forceAlignedAccessIf() const
 {
-  return derived();
+  return derived();  // FIXME This should not work but apparently is never used
 }
 
 /** \returns an expression of *this with forced aligned access if \a Enable is true.
@@ -138,7 +138,7 @@ template<bool Enable>
 inline typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type
 MatrixBase<Derived>::forceAlignedAccessIf()
 {
-  return derived();
+  return derived();  // FIXME This should not work but apparently is never used
 }
 
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Core/Functors.h b/nuparu/include/Eigen/src/Core/Functors.h
deleted file mode 100644
index 04fb2173..00000000
--- a/nuparu/include/Eigen/src/Core/Functors.h
+++ /dev/null
@@ -1,985 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_FUNCTORS_H
-#define EIGEN_FUNCTORS_H
-
-namespace Eigen {
-
-namespace internal {
-
-// associative functors:
-
-/** \internal
-  * \brief Template functor to compute the sum of two scalars
-  *
-  * \sa class CwiseBinaryOp, MatrixBase::operator+, class VectorwiseOp, MatrixBase::sum()
-  */
-template<typename Scalar> struct scalar_sum_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
-  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a + b; }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::padd(a,b); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
-  { return internal::predux(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_sum_op<Scalar> > {
-  enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasAdd
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the product of two scalars
-  *
-  * \sa class CwiseBinaryOp, Cwise::operator*(), class VectorwiseOp, MatrixBase::redux()
-  */
-template<typename LhsScalar,typename RhsScalar> struct scalar_product_op {
-  enum {
-    // TODO vectorize mixed product
-    Vectorizable = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasMul && packet_traits<RhsScalar>::HasMul
-  };
-  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
-  EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::pmul(a,b); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
-  { return internal::predux_mul(a); }
-};
-template<typename LhsScalar,typename RhsScalar>
-struct functor_traits<scalar_product_op<LhsScalar,RhsScalar> > {
-  enum {
-    Cost = (NumTraits<LhsScalar>::MulCost + NumTraits<RhsScalar>::MulCost)/2, // rough estimate!
-    PacketAccess = scalar_product_op<LhsScalar,RhsScalar>::Vectorizable
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the conjugate product of two scalars
-  *
-  * This is a short cut for conj(x) * y which is needed for optimization purpose; in Eigen2 support mode, this becomes x * conj(y)
-  */
-template<typename LhsScalar,typename RhsScalar> struct scalar_conj_product_op {
-
-  enum {
-    Conj = NumTraits<LhsScalar>::IsComplex
-  };
-  
-  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
-  
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_conj_product_op)
-  EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const
-  { return conj_helper<LhsScalar,RhsScalar,Conj,false>().pmul(a,b); }
-  
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return conj_helper<Packet,Packet,Conj,false>().pmul(a,b); }
-};
-template<typename LhsScalar,typename RhsScalar>
-struct functor_traits<scalar_conj_product_op<LhsScalar,RhsScalar> > {
-  enum {
-    Cost = NumTraits<LhsScalar>::MulCost,
-    PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMul
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the min of two scalars
-  *
-  * \sa class CwiseBinaryOp, MatrixBase::cwiseMin, class VectorwiseOp, MatrixBase::minCoeff()
-  */
-template<typename Scalar> struct scalar_min_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_min_op)
-  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { using std::min; return (min)(a, b); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::pmin(a,b); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
-  { return internal::predux_min(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_min_op<Scalar> > {
-  enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasMin
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the max of two scalars
-  *
-  * \sa class CwiseBinaryOp, MatrixBase::cwiseMax, class VectorwiseOp, MatrixBase::maxCoeff()
-  */
-template<typename Scalar> struct scalar_max_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_max_op)
-  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { using std::max; return (max)(a, b); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::pmax(a,b); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
-  { return internal::predux_max(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_max_op<Scalar> > {
-  enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasMax
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the hypot of two scalars
-  *
-  * \sa MatrixBase::stableNorm(), class Redux
-  */
-template<typename Scalar> struct scalar_hypot_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_hypot_op)
-//   typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& _x, const Scalar& _y) const
-  {
-    using std::max;
-    using std::min;
-    using std::sqrt;
-    Scalar p = (max)(_x, _y);
-    Scalar q = (min)(_x, _y);
-    Scalar qp = q/p;
-    return p * sqrt(Scalar(1) + qp*qp);
-  }
-};
-template<typename Scalar>
-struct functor_traits<scalar_hypot_op<Scalar> > {
-  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess=0 };
-};
-
-/** \internal
-  * \brief Template functor to compute the pow of two scalars
-  */
-template<typename Scalar, typename OtherScalar> struct scalar_binary_pow_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_binary_pow_op)
-  inline Scalar operator() (const Scalar& a, const OtherScalar& b) const { return numext::pow(a, b); }
-};
-template<typename Scalar, typename OtherScalar>
-struct functor_traits<scalar_binary_pow_op<Scalar,OtherScalar> > {
-  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
-};
-
-// other binary functors:
-
-/** \internal
-  * \brief Template functor to compute the difference of two scalars
-  *
-  * \sa class CwiseBinaryOp, MatrixBase::operator-
-  */
-template<typename Scalar> struct scalar_difference_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
-  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a - b; }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::psub(a,b); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_difference_op<Scalar> > {
-  enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasSub
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the quotient of two scalars
-  *
-  * \sa class CwiseBinaryOp, Cwise::operator/()
-  */
-template<typename LhsScalar,typename RhsScalar> struct scalar_quotient_op {
-  enum {
-    // TODO vectorize mixed product
-    Vectorizable = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasDiv && packet_traits<RhsScalar>::HasDiv
-  };
-  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
-  EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a / b; }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::pdiv(a,b); }
-};
-template<typename LhsScalar,typename RhsScalar>
-struct functor_traits<scalar_quotient_op<LhsScalar,RhsScalar> > {
-  enum {
-    Cost = (NumTraits<LhsScalar>::MulCost + NumTraits<RhsScalar>::MulCost), // rough estimate!
-    PacketAccess = scalar_quotient_op<LhsScalar,RhsScalar>::Vectorizable
-  };
-};
-
-
-
-/** \internal
-  * \brief Template functor to compute the and of two booleans
-  *
-  * \sa class CwiseBinaryOp, ArrayBase::operator&&
-  */
-struct scalar_boolean_and_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_and_op)
-  EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a && b; }
-};
-template<> struct functor_traits<scalar_boolean_and_op> {
-  enum {
-    Cost = NumTraits<bool>::AddCost,
-    PacketAccess = false
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the or of two booleans
-  *
-  * \sa class CwiseBinaryOp, ArrayBase::operator||
-  */
-struct scalar_boolean_or_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_or_op)
-  EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a || b; }
-};
-template<> struct functor_traits<scalar_boolean_or_op> {
-  enum {
-    Cost = NumTraits<bool>::AddCost,
-    PacketAccess = false
-  };
-};
-
-// unary functors:
-
-/** \internal
-  * \brief Template functor to compute the opposite of a scalar
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::operator-
-  */
-template<typename Scalar> struct scalar_opposite_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_opposite_op)
-  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return -a; }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
-  { return internal::pnegate(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_opposite_op<Scalar> >
-{ enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasNegate };
-};
-
-/** \internal
-  * \brief Template functor to compute the absolute value of a scalar
-  *
-  * \sa class CwiseUnaryOp, Cwise::abs
-  */
-template<typename Scalar> struct scalar_abs_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_abs_op)
-  typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { using std::abs; return abs(a); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
-  { return internal::pabs(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_abs_op<Scalar> >
-{
-  enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasAbs
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the squared absolute value of a scalar
-  *
-  * \sa class CwiseUnaryOp, Cwise::abs2
-  */
-template<typename Scalar> struct scalar_abs2_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_abs2_op)
-  typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return numext::abs2(a); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
-  { return internal::pmul(a,a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_abs2_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasAbs2 }; };
-
-/** \internal
-  * \brief Template functor to compute the conjugate of a complex value
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::conjugate()
-  */
-template<typename Scalar> struct scalar_conjugate_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_conjugate_op)
-  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { using numext::conj; return conj(a); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pconj(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_conjugate_op<Scalar> >
-{
-  enum {
-    Cost = NumTraits<Scalar>::IsComplex ? NumTraits<Scalar>::AddCost : 0,
-    PacketAccess = packet_traits<Scalar>::HasConj
-  };
-};
-
-/** \internal
-  * \brief Template functor to cast a scalar to another type
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::cast()
-  */
-template<typename Scalar, typename NewType>
-struct scalar_cast_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
-  typedef NewType result_type;
-  EIGEN_STRONG_INLINE const NewType operator() (const Scalar& a) const { return cast<Scalar, NewType>(a); }
-};
-template<typename Scalar, typename NewType>
-struct functor_traits<scalar_cast_op<Scalar,NewType> >
-{ enum { Cost = is_same<Scalar, NewType>::value ? 0 : NumTraits<NewType>::AddCost, PacketAccess = false }; };
-
-/** \internal
-  * \brief Template functor to extract the real part of a complex
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::real()
-  */
-template<typename Scalar>
-struct scalar_real_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_real_op)
-  typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return numext::real(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_real_op<Scalar> >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-/** \internal
-  * \brief Template functor to extract the imaginary part of a complex
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::imag()
-  */
-template<typename Scalar>
-struct scalar_imag_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_imag_op)
-  typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return numext::imag(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_imag_op<Scalar> >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-/** \internal
-  * \brief Template functor to extract the real part of a complex as a reference
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::real()
-  */
-template<typename Scalar>
-struct scalar_real_ref_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_real_ref_op)
-  typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_STRONG_INLINE result_type& operator() (const Scalar& a) const { return numext::real_ref(*const_cast<Scalar*>(&a)); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_real_ref_op<Scalar> >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-/** \internal
-  * \brief Template functor to extract the imaginary part of a complex as a reference
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::imag()
-  */
-template<typename Scalar>
-struct scalar_imag_ref_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_imag_ref_op)
-  typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_STRONG_INLINE result_type& operator() (const Scalar& a) const { return numext::imag_ref(*const_cast<Scalar*>(&a)); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_imag_ref_op<Scalar> >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-/** \internal
-  *
-  * \brief Template functor to compute the exponential of a scalar
-  *
-  * \sa class CwiseUnaryOp, Cwise::exp()
-  */
-template<typename Scalar> struct scalar_exp_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_exp_op)
-  inline const Scalar operator() (const Scalar& a) const { using std::exp; return exp(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::pexp(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_exp_op<Scalar> >
-{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasExp }; };
-
-/** \internal
-  *
-  * \brief Template functor to compute the logarithm of a scalar
-  *
-  * \sa class CwiseUnaryOp, Cwise::log()
-  */
-template<typename Scalar> struct scalar_log_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_log_op)
-  inline const Scalar operator() (const Scalar& a) const { using std::log; return log(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::plog(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_log_op<Scalar> >
-{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasLog }; };
-
-/** \internal
-  * \brief Template functor to multiply a scalar by a fixed other one
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::operator*, MatrixBase::operator/
-  */
-/* NOTE why doing the pset1() in packetOp *is* an optimization ?
- * indeed it seems better to declare m_other as a Packet and do the pset1() once
- * in the constructor. However, in practice:
- *  - GCC does not like m_other as a Packet and generate a load every time it needs it
- *  - on the other hand GCC is able to moves the pset1() outside the loop :)
- *  - simpler code ;)
- * (ICC and gcc 4.4 seems to perform well in both cases, the issue is visible with y = a*x + b*y)
- */
-template<typename Scalar>
-struct scalar_multiple_op {
-  typedef typename packet_traits<Scalar>::type Packet;
-  // FIXME default copy constructors seems bugged with std::complex<>
-  EIGEN_STRONG_INLINE scalar_multiple_op(const scalar_multiple_op& other) : m_other(other.m_other) { }
-  EIGEN_STRONG_INLINE scalar_multiple_op(const Scalar& other) : m_other(other) { }
-  EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a * m_other; }
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
-  { return internal::pmul(a, pset1<Packet>(m_other)); }
-  typename add_const_on_value_type<typename NumTraits<Scalar>::Nested>::type m_other;
-};
-template<typename Scalar>
-struct functor_traits<scalar_multiple_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
-
-template<typename Scalar1, typename Scalar2>
-struct scalar_multiple2_op {
-  typedef typename scalar_product_traits<Scalar1,Scalar2>::ReturnType result_type;
-  EIGEN_STRONG_INLINE scalar_multiple2_op(const scalar_multiple2_op& other) : m_other(other.m_other) { }
-  EIGEN_STRONG_INLINE scalar_multiple2_op(const Scalar2& other) : m_other(other) { }
-  EIGEN_STRONG_INLINE result_type operator() (const Scalar1& a) const { return a * m_other; }
-  typename add_const_on_value_type<typename NumTraits<Scalar2>::Nested>::type m_other;
-};
-template<typename Scalar1,typename Scalar2>
-struct functor_traits<scalar_multiple2_op<Scalar1,Scalar2> >
-{ enum { Cost = NumTraits<Scalar1>::MulCost, PacketAccess = false }; };
-
-/** \internal
-  * \brief Template functor to divide a scalar by a fixed other one
-  *
-  * This functor is used to implement the quotient of a matrix by
-  * a scalar where the scalar type is not necessarily a floating point type.
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::operator/
-  */
-template<typename Scalar>
-struct scalar_quotient1_op {
-  typedef typename packet_traits<Scalar>::type Packet;
-  // FIXME default copy constructors seems bugged with std::complex<>
-  EIGEN_STRONG_INLINE scalar_quotient1_op(const scalar_quotient1_op& other) : m_other(other.m_other) { }
-  EIGEN_STRONG_INLINE scalar_quotient1_op(const Scalar& other) : m_other(other) {}
-  EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a / m_other; }
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
-  { return internal::pdiv(a, pset1<Packet>(m_other)); }
-  typename add_const_on_value_type<typename NumTraits<Scalar>::Nested>::type m_other;
-};
-template<typename Scalar>
-struct functor_traits<scalar_quotient1_op<Scalar> >
-{ enum { Cost = 2 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasDiv }; };
-
-// nullary functors
-
-template<typename Scalar>
-struct scalar_constant_op {
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_STRONG_INLINE scalar_constant_op(const scalar_constant_op& other) : m_other(other.m_other) { }
-  EIGEN_STRONG_INLINE scalar_constant_op(const Scalar& other) : m_other(other) { }
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Scalar operator() (Index, Index = 0) const { return m_other; }
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Packet packetOp(Index, Index = 0) const { return internal::pset1<Packet>(m_other); }
-  const Scalar m_other;
-};
-template<typename Scalar>
-struct functor_traits<scalar_constant_op<Scalar> >
-// FIXME replace this packet test by a safe one
-{ enum { Cost = 1, PacketAccess = packet_traits<Scalar>::Vectorizable, IsRepeatable = true }; };
-
-template<typename Scalar> struct scalar_identity_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_identity_op)
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Scalar operator() (Index row, Index col) const { return row==col ? Scalar(1) : Scalar(0); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_identity_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false, IsRepeatable = true }; };
-
-template <typename Scalar, bool RandomAccess> struct linspaced_op_impl;
-
-// linear access for packet ops:
-// 1) initialization
-//   base = [low, ..., low] + ([step, ..., step] * [-size, ..., 0])
-// 2) each step (where size is 1 for coeff access or PacketSize for packet access)
-//   base += [size*step, ..., size*step]
-//
-// TODO: Perhaps it's better to initialize lazily (so not in the constructor but in packetOp)
-//       in order to avoid the padd() in operator() ?
-template <typename Scalar>
-struct linspaced_op_impl<Scalar,false>
-{
-  typedef typename packet_traits<Scalar>::type Packet;
-
-  linspaced_op_impl(const Scalar& low, const Scalar& step) :
-  m_low(low), m_step(step),
-  m_packetStep(pset1<Packet>(packet_traits<Scalar>::size*step)),
-  m_base(padd(pset1<Packet>(low), pmul(pset1<Packet>(step),plset<Scalar>(-packet_traits<Scalar>::size)))) {}
-
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Scalar operator() (Index i) const 
-  { 
-    m_base = padd(m_base, pset1<Packet>(m_step));
-    return m_low+Scalar(i)*m_step; 
-  }
-
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Packet packetOp(Index) const { return m_base = padd(m_base,m_packetStep); }
-
-  const Scalar m_low;
-  const Scalar m_step;
-  const Packet m_packetStep;
-  mutable Packet m_base;
-};
-
-// random access for packet ops:
-// 1) each step
-//   [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) )
-template <typename Scalar>
-struct linspaced_op_impl<Scalar,true>
-{
-  typedef typename packet_traits<Scalar>::type Packet;
-
-  linspaced_op_impl(const Scalar& low, const Scalar& step) :
-  m_low(low), m_step(step),
-  m_lowPacket(pset1<Packet>(m_low)), m_stepPacket(pset1<Packet>(m_step)), m_interPacket(plset<Scalar>(0)) {}
-
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return m_low+i*m_step; }
-
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Packet packetOp(Index i) const
-  { return internal::padd(m_lowPacket, pmul(m_stepPacket, padd(pset1<Packet>(i),m_interPacket))); }
-
-  const Scalar m_low;
-  const Scalar m_step;
-  const Packet m_lowPacket;
-  const Packet m_stepPacket;
-  const Packet m_interPacket;
-};
-
-// ----- Linspace functor ----------------------------------------------------------------
-
-// Forward declaration (we default to random access which does not really give
-// us a speed gain when using packet access but it allows to use the functor in
-// nested expressions).
-template <typename Scalar, bool RandomAccess = true> struct linspaced_op;
-template <typename Scalar, bool RandomAccess> struct functor_traits< linspaced_op<Scalar,RandomAccess> >
-{ enum { Cost = 1, PacketAccess = packet_traits<Scalar>::HasSetLinear, IsRepeatable = true }; };
-template <typename Scalar, bool RandomAccess> struct linspaced_op
-{
-  typedef typename packet_traits<Scalar>::type Packet;
-  linspaced_op(const Scalar& low, const Scalar& high, DenseIndex num_steps) : impl((num_steps==1 ? high : low), (num_steps==1 ? Scalar() : (high-low)/(num_steps-1))) {}
-
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return impl(i); }
-
-  // We need this function when assigning e.g. a RowVectorXd to a MatrixXd since
-  // there row==0 and col is used for the actual iteration.
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Scalar operator() (Index row, Index col) const 
-  {
-    eigen_assert(col==0 || row==0);
-    return impl(col + row);
-  }
-
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Packet packetOp(Index i) const { return impl.packetOp(i); }
-
-  // We need this function when assigning e.g. a RowVectorXd to a MatrixXd since
-  // there row==0 and col is used for the actual iteration.
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Packet packetOp(Index row, Index col) const
-  {
-    eigen_assert(col==0 || row==0);
-    return impl.packetOp(col + row);
-  }
-
-  // This proxy object handles the actual required temporaries, the different
-  // implementations (random vs. sequential access) as well as the
-  // correct piping to size 2/4 packet operations.
-  const linspaced_op_impl<Scalar,RandomAccess> impl;
-};
-
-// all functors allow linear access, except scalar_identity_op. So we fix here a quick meta
-// to indicate whether a functor allows linear access, just always answering 'yes' except for
-// scalar_identity_op.
-// FIXME move this to functor_traits adding a functor_default
-template<typename Functor> struct functor_has_linear_access { enum { ret = 1 }; };
-template<typename Scalar> struct functor_has_linear_access<scalar_identity_op<Scalar> > { enum { ret = 0 }; };
-
-// In Eigen, any binary op (Product, CwiseBinaryOp) require the Lhs and Rhs to have the same scalar type, except for multiplication
-// where the mixing of different types is handled by scalar_product_traits
-// In particular, real * complex<real> is allowed.
-// FIXME move this to functor_traits adding a functor_default
-template<typename Functor> struct functor_is_product_like { enum { ret = 0 }; };
-template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_product_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
-template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_conj_product_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
-template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_quotient_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
-
-
-/** \internal
-  * \brief Template functor to add a scalar to a fixed other one
-  * \sa class CwiseUnaryOp, Array::operator+
-  */
-/* If you wonder why doing the pset1() in packetOp() is an optimization check scalar_multiple_op */
-template<typename Scalar>
-struct scalar_add_op {
-  typedef typename packet_traits<Scalar>::type Packet;
-  // FIXME default copy constructors seems bugged with std::complex<>
-  inline scalar_add_op(const scalar_add_op& other) : m_other(other.m_other) { }
-  inline scalar_add_op(const Scalar& other) : m_other(other) { }
-  inline Scalar operator() (const Scalar& a) const { return a + m_other; }
-  inline const Packet packetOp(const Packet& a) const
-  { return internal::padd(a, pset1<Packet>(m_other)); }
-  const Scalar m_other;
-};
-template<typename Scalar>
-struct functor_traits<scalar_add_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasAdd }; };
-
-/** \internal
-  * \brief Template functor to compute the square root of a scalar
-  * \sa class CwiseUnaryOp, Cwise::sqrt()
-  */
-template<typename Scalar> struct scalar_sqrt_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_sqrt_op)
-  inline const Scalar operator() (const Scalar& a) const { using std::sqrt; return sqrt(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::psqrt(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_sqrt_op<Scalar> >
-{ enum {
-    Cost = 5 * NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasSqrt
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the cosine of a scalar
-  * \sa class CwiseUnaryOp, ArrayBase::cos()
-  */
-template<typename Scalar> struct scalar_cos_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_cos_op)
-  inline Scalar operator() (const Scalar& a) const { using std::cos; return cos(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::pcos(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_cos_op<Scalar> >
-{
-  enum {
-    Cost = 5 * NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasCos
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the sine of a scalar
-  * \sa class CwiseUnaryOp, ArrayBase::sin()
-  */
-template<typename Scalar> struct scalar_sin_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_sin_op)
-  inline const Scalar operator() (const Scalar& a) const { using std::sin; return sin(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::psin(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_sin_op<Scalar> >
-{
-  enum {
-    Cost = 5 * NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasSin
-  };
-};
-
-
-/** \internal
-  * \brief Template functor to compute the tan of a scalar
-  * \sa class CwiseUnaryOp, ArrayBase::tan()
-  */
-template<typename Scalar> struct scalar_tan_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_tan_op)
-  inline const Scalar operator() (const Scalar& a) const { using std::tan; return tan(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::ptan(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_tan_op<Scalar> >
-{
-  enum {
-    Cost = 5 * NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasTan
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the arc cosine of a scalar
-  * \sa class CwiseUnaryOp, ArrayBase::acos()
-  */
-template<typename Scalar> struct scalar_acos_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_acos_op)
-  inline const Scalar operator() (const Scalar& a) const { using std::acos; return acos(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::pacos(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_acos_op<Scalar> >
-{
-  enum {
-    Cost = 5 * NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasACos
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the arc sine of a scalar
-  * \sa class CwiseUnaryOp, ArrayBase::asin()
-  */
-template<typename Scalar> struct scalar_asin_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_asin_op)
-  inline const Scalar operator() (const Scalar& a) const { using std::asin; return asin(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::pasin(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_asin_op<Scalar> >
-{
-  enum {
-    Cost = 5 * NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasASin
-  };
-};
-
-/** \internal
-  * \brief Template functor to raise a scalar to a power
-  * \sa class CwiseUnaryOp, Cwise::pow
-  */
-template<typename Scalar>
-struct scalar_pow_op {
-  // FIXME default copy constructors seems bugged with std::complex<>
-  inline scalar_pow_op(const scalar_pow_op& other) : m_exponent(other.m_exponent) { }
-  inline scalar_pow_op(const Scalar& exponent) : m_exponent(exponent) {}
-  inline Scalar operator() (const Scalar& a) const { return numext::pow(a, m_exponent); }
-  const Scalar m_exponent;
-};
-template<typename Scalar>
-struct functor_traits<scalar_pow_op<Scalar> >
-{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false }; };
-
-/** \internal
-  * \brief Template functor to compute the quotient between a scalar and array entries.
-  * \sa class CwiseUnaryOp, Cwise::inverse()
-  */
-template<typename Scalar>
-struct scalar_inverse_mult_op {
-  scalar_inverse_mult_op(const Scalar& other) : m_other(other) {}
-  inline Scalar operator() (const Scalar& a) const { return m_other / a; }
-  template<typename Packet>
-  inline const Packet packetOp(const Packet& a) const
-  { return internal::pdiv(pset1<Packet>(m_other),a); }
-  Scalar m_other;
-};
-
-/** \internal
-  * \brief Template functor to compute the inverse of a scalar
-  * \sa class CwiseUnaryOp, Cwise::inverse()
-  */
-template<typename Scalar>
-struct scalar_inverse_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_inverse_op)
-  inline Scalar operator() (const Scalar& a) const { return Scalar(1)/a; }
-  template<typename Packet>
-  inline const Packet packetOp(const Packet& a) const
-  { return internal::pdiv(pset1<Packet>(Scalar(1)),a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_inverse_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasDiv }; };
-
-/** \internal
-  * \brief Template functor to compute the square of a scalar
-  * \sa class CwiseUnaryOp, Cwise::square()
-  */
-template<typename Scalar>
-struct scalar_square_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_square_op)
-  inline Scalar operator() (const Scalar& a) const { return a*a; }
-  template<typename Packet>
-  inline const Packet packetOp(const Packet& a) const
-  { return internal::pmul(a,a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_square_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
-
-/** \internal
-  * \brief Template functor to compute the cube of a scalar
-  * \sa class CwiseUnaryOp, Cwise::cube()
-  */
-template<typename Scalar>
-struct scalar_cube_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_cube_op)
-  inline Scalar operator() (const Scalar& a) const { return a*a*a; }
-  template<typename Packet>
-  inline const Packet packetOp(const Packet& a) const
-  { return internal::pmul(a,pmul(a,a)); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_cube_op<Scalar> >
-{ enum { Cost = 2*NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
-
-// default functor traits for STL functors:
-
-template<typename T>
-struct functor_traits<std::multiplies<T> >
-{ enum { Cost = NumTraits<T>::MulCost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::divides<T> >
-{ enum { Cost = NumTraits<T>::MulCost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::plus<T> >
-{ enum { Cost = NumTraits<T>::AddCost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::minus<T> >
-{ enum { Cost = NumTraits<T>::AddCost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::negate<T> >
-{ enum { Cost = NumTraits<T>::AddCost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::logical_or<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::logical_and<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::logical_not<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::greater<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::less<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::greater_equal<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::less_equal<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::equal_to<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::not_equal_to<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::binder2nd<T> >
-{ enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::binder1st<T> >
-{ enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::unary_negate<T> >
-{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::binary_negate<T> >
-{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
-
-#ifdef EIGEN_STDEXT_SUPPORT
-
-template<typename T0,typename T1>
-struct functor_traits<std::project1st<T0,T1> >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-template<typename T0,typename T1>
-struct functor_traits<std::project2nd<T0,T1> >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-template<typename T0,typename T1>
-struct functor_traits<std::select2nd<std::pair<T0,T1> > >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-template<typename T0,typename T1>
-struct functor_traits<std::select1st<std::pair<T0,T1> > >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-template<typename T0,typename T1>
-struct functor_traits<std::unary_compose<T0,T1> >
-{ enum { Cost = functor_traits<T0>::Cost + functor_traits<T1>::Cost, PacketAccess = false }; };
-
-template<typename T0,typename T1,typename T2>
-struct functor_traits<std::binary_compose<T0,T1,T2> >
-{ enum { Cost = functor_traits<T0>::Cost + functor_traits<T1>::Cost + functor_traits<T2>::Cost, PacketAccess = false }; };
-
-#endif // EIGEN_STDEXT_SUPPORT
-
-// allow to add new functors and specializations of functor_traits from outside Eigen.
-// this macro is really needed because functor_traits must be specialized after it is declared but before it is used...
-#ifdef EIGEN_FUNCTORS_PLUGIN
-#include EIGEN_FUNCTORS_PLUGIN
-#endif
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_FUNCTORS_H
diff --git a/nuparu/include/Eigen/src/Core/Fuzzy.h b/nuparu/include/Eigen/src/Core/Fuzzy.h
index fe63bd29..3e403a09 100644
--- a/nuparu/include/Eigen/src/Core/Fuzzy.h
+++ b/nuparu/include/Eigen/src/Core/Fuzzy.h
@@ -19,18 +19,19 @@ namespace internal
 template<typename Derived, typename OtherDerived, bool is_integer = NumTraits<typename Derived::Scalar>::IsInteger>
 struct isApprox_selector
 {
+  EIGEN_DEVICE_FUNC
   static bool run(const Derived& x, const OtherDerived& y, const typename Derived::RealScalar& prec)
   {
-    using std::min;
-    typename internal::nested<Derived,2>::type nested(x);
-    typename internal::nested<OtherDerived,2>::type otherNested(y);
-    return (nested - otherNested).cwiseAbs2().sum() <= prec * prec * (min)(nested.cwiseAbs2().sum(), otherNested.cwiseAbs2().sum());
+    typename internal::nested_eval<Derived,2>::type nested(x);
+    typename internal::nested_eval<OtherDerived,2>::type otherNested(y);
+    return (nested - otherNested).cwiseAbs2().sum() <= prec * prec * numext::mini(nested.cwiseAbs2().sum(), otherNested.cwiseAbs2().sum());
   }
 };
 
 template<typename Derived, typename OtherDerived>
 struct isApprox_selector<Derived, OtherDerived, true>
 {
+  EIGEN_DEVICE_FUNC
   static bool run(const Derived& x, const OtherDerived& y, const typename Derived::RealScalar&)
   {
     return x.matrix() == y.matrix();
@@ -40,6 +41,7 @@ struct isApprox_selector<Derived, OtherDerived, true>
 template<typename Derived, typename OtherDerived, bool is_integer = NumTraits<typename Derived::Scalar>::IsInteger>
 struct isMuchSmallerThan_object_selector
 {
+  EIGEN_DEVICE_FUNC
   static bool run(const Derived& x, const OtherDerived& y, const typename Derived::RealScalar& prec)
   {
     return x.cwiseAbs2().sum() <= numext::abs2(prec) * y.cwiseAbs2().sum();
@@ -49,6 +51,7 @@ struct isMuchSmallerThan_object_selector
 template<typename Derived, typename OtherDerived>
 struct isMuchSmallerThan_object_selector<Derived, OtherDerived, true>
 {
+  EIGEN_DEVICE_FUNC
   static bool run(const Derived& x, const OtherDerived&, const typename Derived::RealScalar&)
   {
     return x.matrix() == Derived::Zero(x.rows(), x.cols()).matrix();
@@ -58,6 +61,7 @@ struct isMuchSmallerThan_object_selector<Derived, OtherDerived, true>
 template<typename Derived, bool is_integer = NumTraits<typename Derived::Scalar>::IsInteger>
 struct isMuchSmallerThan_scalar_selector
 {
+  EIGEN_DEVICE_FUNC
   static bool run(const Derived& x, const typename Derived::RealScalar& y, const typename Derived::RealScalar& prec)
   {
     return x.cwiseAbs2().sum() <= numext::abs2(prec * y);
@@ -67,6 +71,7 @@ struct isMuchSmallerThan_scalar_selector
 template<typename Derived>
 struct isMuchSmallerThan_scalar_selector<Derived, true>
 {
+  EIGEN_DEVICE_FUNC
   static bool run(const Derived& x, const typename Derived::RealScalar&, const typename Derived::RealScalar&)
   {
     return x.matrix() == Derived::Zero(x.rows(), x.cols()).matrix();
diff --git a/nuparu/include/Eigen/src/Core/GeneralProduct.h b/nuparu/include/Eigen/src/Core/GeneralProduct.h
index 2a59d946..fe8204ac 100644
--- a/nuparu/include/Eigen/src/Core/GeneralProduct.h
+++ b/nuparu/include/Eigen/src/Core/GeneralProduct.h
@@ -11,29 +11,7 @@
 #ifndef EIGEN_GENERAL_PRODUCT_H
 #define EIGEN_GENERAL_PRODUCT_H
 
-namespace Eigen { 
-
-/** \class GeneralProduct
-  * \ingroup Core_Module
-  *
-  * \brief Expression of the product of two general matrices or vectors
-  *
-  * \param LhsNested the type used to store the left-hand side
-  * \param RhsNested the type used to store the right-hand side
-  * \param ProductMode the type of the product
-  *
-  * This class represents an expression of the product of two general matrices.
-  * We call a general matrix, a dense matrix with full storage. For instance,
-  * This excludes triangular, selfadjoint, and sparse matrices.
-  * It is the return type of the operator* between general matrices. Its template
-  * arguments are determined automatically by ProductReturnType. Therefore,
-  * GeneralProduct should never be used direclty. To determine the result type of a
-  * function which involves a matrix product, use ProductReturnType::Type.
-  *
-  * \sa ProductReturnType, MatrixBase::operator*(const MatrixBase<OtherDerived>&)
-  */
-template<typename Lhs, typename Rhs, int ProductType = internal::product_type<Lhs,Rhs>::value>
-class GeneralProduct;
+namespace Eigen {
 
 enum {
   Large = 2,
@@ -59,15 +37,14 @@ template<typename Lhs, typename Rhs> struct product_type
   typedef typename remove_all<Lhs>::type _Lhs;
   typedef typename remove_all<Rhs>::type _Rhs;
   enum {
-    MaxRows  = _Lhs::MaxRowsAtCompileTime,
-    Rows  = _Lhs::RowsAtCompileTime,
-    MaxCols  = _Rhs::MaxColsAtCompileTime,
-    Cols  = _Rhs::ColsAtCompileTime,
-    MaxDepth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::MaxColsAtCompileTime,
-                                           _Rhs::MaxRowsAtCompileTime),
-    Depth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::ColsAtCompileTime,
-                                        _Rhs::RowsAtCompileTime),
-    LargeThreshold = EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+    MaxRows = traits<_Lhs>::MaxRowsAtCompileTime,
+    Rows    = traits<_Lhs>::RowsAtCompileTime,
+    MaxCols = traits<_Rhs>::MaxColsAtCompileTime,
+    Cols    = traits<_Rhs>::ColsAtCompileTime,
+    MaxDepth = EIGEN_SIZE_MIN_PREFER_FIXED(traits<_Lhs>::MaxColsAtCompileTime,
+                                           traits<_Rhs>::MaxRowsAtCompileTime),
+    Depth = EIGEN_SIZE_MIN_PREFER_FIXED(traits<_Lhs>::ColsAtCompileTime,
+                                        traits<_Rhs>::RowsAtCompileTime)
   };
 
   // the splitting into different lines of code here, introducing the _select enums and the typedef below,
@@ -82,7 +59,8 @@ template<typename Lhs, typename Rhs> struct product_type
 
 public:
   enum {
-    value = selector::ret
+    value = selector::ret,
+    ret = selector::ret
   };
 #ifdef EIGEN_DEBUG_PRODUCT
   static void debug()
@@ -98,6 +76,31 @@ template<typename Lhs, typename Rhs> struct product_type
 #endif
 };
 
+// template<typename Lhs, typename Rhs> struct product_tag
+// {
+// private:
+//   
+//   typedef typename remove_all<Lhs>::type _Lhs;
+//   typedef typename remove_all<Rhs>::type _Rhs;
+//   enum {
+//     Rows  = _Lhs::RowsAtCompileTime,
+//     Cols  = _Rhs::ColsAtCompileTime,
+//     Depth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::ColsAtCompileTime, _Rhs::RowsAtCompileTime)
+//   };
+// 
+//   enum {
+//     rows_select = Rows==1 ? int(Rows) : int(Large),
+//     cols_select = Cols==1 ? int(Cols) : int(Large),
+//     depth_select = Depth==1 ? int(Depth) : int(Large)
+//   };
+//   typedef product_type_selector<rows_select, cols_select, depth_select> selector;
+// 
+// public:
+//   enum {
+//     ret = selector::ret
+//   };
+// 
+// };
 
 /* The following allows to select the kind of product at compile time
  * based on the three dimensions of the product.
@@ -128,54 +131,6 @@ template<>              struct product_type_selector<Large,Large,Small>  { enum
 
 } // end namespace internal
 
-/** \class ProductReturnType
-  * \ingroup Core_Module
-  *
-  * \brief Helper class to get the correct and optimized returned type of operator*
-  *
-  * \param Lhs the type of the left-hand side
-  * \param Rhs the type of the right-hand side
-  * \param ProductMode the type of the product (determined automatically by internal::product_mode)
-  *
-  * This class defines the typename Type representing the optimized product expression
-  * between two matrix expressions. In practice, using ProductReturnType<Lhs,Rhs>::Type
-  * is the recommended way to define the result type of a function returning an expression
-  * which involve a matrix product. The class Product should never be
-  * used directly.
-  *
-  * \sa class Product, MatrixBase::operator*(const MatrixBase<OtherDerived>&)
-  */
-template<typename Lhs, typename Rhs, int ProductType>
-struct ProductReturnType
-{
-  // TODO use the nested type to reduce instanciations ????
-//   typedef typename internal::nested<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
-//   typedef typename internal::nested<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
-
-  typedef GeneralProduct<Lhs/*Nested*/, Rhs/*Nested*/, ProductType> Type;
-};
-
-template<typename Lhs, typename Rhs>
-struct ProductReturnType<Lhs,Rhs,CoeffBasedProductMode>
-{
-  typedef typename internal::nested<Lhs, Rhs::ColsAtCompileTime, typename internal::plain_matrix_type<Lhs>::type >::type LhsNested;
-  typedef typename internal::nested<Rhs, Lhs::RowsAtCompileTime, typename internal::plain_matrix_type<Rhs>::type >::type RhsNested;
-  typedef CoeffBasedProduct<LhsNested, RhsNested, EvalBeforeAssigningBit | EvalBeforeNestingBit> Type;
-};
-
-template<typename Lhs, typename Rhs>
-struct ProductReturnType<Lhs,Rhs,LazyCoeffBasedProductMode>
-{
-  typedef typename internal::nested<Lhs, Rhs::ColsAtCompileTime, typename internal::plain_matrix_type<Lhs>::type >::type LhsNested;
-  typedef typename internal::nested<Rhs, Lhs::RowsAtCompileTime, typename internal::plain_matrix_type<Rhs>::type >::type RhsNested;
-  typedef CoeffBasedProduct<LhsNested, RhsNested, NestByRefBit> Type;
-};
-
-// this is a workaround for sun CC
-template<typename Lhs, typename Rhs>
-struct LazyProductReturnType : public ProductReturnType<Lhs,Rhs,LazyCoeffBasedProductMode>
-{};
-
 /***********************************************************************
 *  Implementation of Inner Vector Vector Product
 ***********************************************************************/
@@ -187,119 +142,10 @@ struct LazyProductReturnType : public ProductReturnType<Lhs,Rhs,LazyCoeffBasedPr
 // product ends up to a row-vector times col-vector product... To tackle this use
 // case, we could have a specialization for Block<MatrixType,1,1> with: operator=(Scalar x);
 
-namespace internal {
-
-template<typename Lhs, typename Rhs>
-struct traits<GeneralProduct<Lhs,Rhs,InnerProduct> >
- : traits<Matrix<typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType,1,1> >
-{};
-
-}
-
-template<typename Lhs, typename Rhs>
-class GeneralProduct<Lhs, Rhs, InnerProduct>
-  : internal::no_assignment_operator,
-    public Matrix<typename internal::scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType,1,1>
-{
-    typedef Matrix<typename internal::scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType,1,1> Base;
-  public:
-    GeneralProduct(const Lhs& lhs, const Rhs& rhs)
-    {
-      EIGEN_STATIC_ASSERT((internal::is_same<typename Lhs::RealScalar, typename Rhs::RealScalar>::value),
-        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-
-      Base::coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum();
-    }
-
-    /** Convertion to scalar */
-    operator const typename Base::Scalar() const {
-      return Base::coeff(0,0);
-    }
-};
-
 /***********************************************************************
 *  Implementation of Outer Vector Vector Product
 ***********************************************************************/
 
-namespace internal {
-
-// Column major
-template<typename ProductType, typename Dest, typename Func>
-EIGEN_DONT_INLINE void outer_product_selector_run(const ProductType& prod, Dest& dest, const Func& func, const false_type&)
-{
-  typedef typename Dest::Index Index;
-  // FIXME make sure lhs is sequentially stored
-  // FIXME not very good if rhs is real and lhs complex while alpha is real too
-  const Index cols = dest.cols();
-  for (Index j=0; j<cols; ++j)
-    func(dest.col(j), prod.rhs().coeff(j) * prod.lhs());
-}
-
-// Row major
-template<typename ProductType, typename Dest, typename Func>
-EIGEN_DONT_INLINE void outer_product_selector_run(const ProductType& prod, Dest& dest, const Func& func, const true_type&) {
-  typedef typename Dest::Index Index;
-  // FIXME make sure rhs is sequentially stored
-  // FIXME not very good if lhs is real and rhs complex while alpha is real too
-  const Index rows = dest.rows();
-  for (Index i=0; i<rows; ++i)
-    func(dest.row(i), prod.lhs().coeff(i) * prod.rhs());
-}
-
-template<typename Lhs, typename Rhs>
-struct traits<GeneralProduct<Lhs,Rhs,OuterProduct> >
- : traits<ProductBase<GeneralProduct<Lhs,Rhs,OuterProduct>, Lhs, Rhs> >
-{};
-
-}
-
-template<typename Lhs, typename Rhs>
-class GeneralProduct<Lhs, Rhs, OuterProduct>
-  : public ProductBase<GeneralProduct<Lhs,Rhs,OuterProduct>, Lhs, Rhs>
-{
-    template<typename T> struct IsRowMajor : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {};
-    
-  public:
-    EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)
-
-    GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
-    {
-      EIGEN_STATIC_ASSERT((internal::is_same<typename Lhs::RealScalar, typename Rhs::RealScalar>::value),
-        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-    }
-    
-    struct set  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived()  = src; } };
-    struct add  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
-    struct sub  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
-    struct adds {
-      Scalar m_scale;
-      adds(const Scalar& s) : m_scale(s) {}
-      template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const {
-        dst.const_cast_derived() += m_scale * src;
-      }
-    };
-    
-    template<typename Dest>
-    inline void evalTo(Dest& dest) const {
-      internal::outer_product_selector_run(*this, dest, set(), IsRowMajor<Dest>());
-    }
-    
-    template<typename Dest>
-    inline void addTo(Dest& dest) const {
-      internal::outer_product_selector_run(*this, dest, add(), IsRowMajor<Dest>());
-    }
-
-    template<typename Dest>
-    inline void subTo(Dest& dest) const {
-      internal::outer_product_selector_run(*this, dest, sub(), IsRowMajor<Dest>());
-    }
-
-    template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
-    {
-      internal::outer_product_selector_run(*this, dest, adds(alpha), IsRowMajor<Dest>());
-    }
-};
-
 /***********************************************************************
 *  Implementation of General Matrix Vector Product
 ***********************************************************************/
@@ -313,60 +159,13 @@ class GeneralProduct<Lhs, Rhs, OuterProduct>
  */
 namespace internal {
 
-template<typename Lhs, typename Rhs>
-struct traits<GeneralProduct<Lhs,Rhs,GemvProduct> >
- : traits<ProductBase<GeneralProduct<Lhs,Rhs,GemvProduct>, Lhs, Rhs> >
-{};
-
 template<int Side, int StorageOrder, bool BlasCompatible>
-struct gemv_selector;
+struct gemv_dense_selector;
 
 } // end namespace internal
 
-template<typename Lhs, typename Rhs>
-class GeneralProduct<Lhs, Rhs, GemvProduct>
-  : public ProductBase<GeneralProduct<Lhs,Rhs,GemvProduct>, Lhs, Rhs>
-{
-  public:
-    EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)
-
-    typedef typename Lhs::Scalar LhsScalar;
-    typedef typename Rhs::Scalar RhsScalar;
-
-    GeneralProduct(const Lhs& a_lhs, const Rhs& a_rhs) : Base(a_lhs,a_rhs)
-    {
-//       EIGEN_STATIC_ASSERT((internal::is_same<typename Lhs::Scalar, typename Rhs::Scalar>::value),
-//         YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-    }
-
-    enum { Side = Lhs::IsVectorAtCompileTime ? OnTheLeft : OnTheRight };
-    typedef typename internal::conditional<int(Side)==OnTheRight,_LhsNested,_RhsNested>::type MatrixType;
-
-    template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
-    {
-      eigen_assert(m_lhs.rows() == dst.rows() && m_rhs.cols() == dst.cols());
-      internal::gemv_selector<Side,(int(MatrixType::Flags)&RowMajorBit) ? RowMajor : ColMajor,
-                       bool(internal::blas_traits<MatrixType>::HasUsableDirectAccess)>::run(*this, dst, alpha);
-    }
-};
-
 namespace internal {
 
-// The vector is on the left => transposition
-template<int StorageOrder, bool BlasCompatible>
-struct gemv_selector<OnTheLeft,StorageOrder,BlasCompatible>
-{
-  template<typename ProductType, typename Dest>
-  static void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
-  {
-    Transpose<Dest> destT(dest);
-    enum { OtherStorageOrder = StorageOrder == RowMajor ? ColMajor : RowMajor };
-    gemv_selector<OnTheRight,OtherStorageOrder,BlasCompatible>
-      ::run(GeneralProduct<Transpose<const typename ProductType::_RhsNested>,Transpose<const typename ProductType::_LhsNested>, GemvProduct>
-        (prod.rhs().transpose(), prod.lhs().transpose()), destT, alpha);
-  }
-};
-
 template<typename Scalar,int Size,int MaxSize,bool Cond> struct gemv_static_vector_if;
 
 template<typename Scalar,int Size,int MaxSize>
@@ -384,7 +183,7 @@ struct gemv_static_vector_if<Scalar,Size,Dynamic,true>
 template<typename Scalar,int Size,int MaxSize>
 struct gemv_static_vector_if<Scalar,Size,MaxSize,true>
 {
-  #if EIGEN_ALIGN_STATICALLY
+  #if EIGEN_MAX_STATIC_ALIGN_BYTES!=0
   internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize),0> m_data;
   EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; }
   #else
@@ -397,33 +196,48 @@ struct gemv_static_vector_if<Scalar,Size,MaxSize,true>
   internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?PacketSize:0),0> m_data;
   EIGEN_STRONG_INLINE Scalar* data() {
     return ForceAlignment
-            ? reinterpret_cast<Scalar*>((reinterpret_cast<size_t>(m_data.array) & ~(size_t(15))) + 16)
+            ? reinterpret_cast<Scalar*>((reinterpret_cast<size_t>(m_data.array) & ~(size_t(EIGEN_MAX_ALIGN_BYTES-1))) + EIGEN_MAX_ALIGN_BYTES)
             : m_data.array;
   }
   #endif
 };
 
-template<> struct gemv_selector<OnTheRight,ColMajor,true>
+// The vector is on the left => transposition
+template<int StorageOrder, bool BlasCompatible>
+struct gemv_dense_selector<OnTheLeft,StorageOrder,BlasCompatible>
+{
+  template<typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
+  {
+    Transpose<Dest> destT(dest);
+    enum { OtherStorageOrder = StorageOrder == RowMajor ? ColMajor : RowMajor };
+    gemv_dense_selector<OnTheRight,OtherStorageOrder,BlasCompatible>
+      ::run(rhs.transpose(), lhs.transpose(), destT, alpha);
+  }
+};
+
+template<> struct gemv_dense_selector<OnTheRight,ColMajor,true>
 {
-  template<typename ProductType, typename Dest>
-  static inline void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
+  template<typename Lhs, typename Rhs, typename Dest>
+  static inline void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
   {
-    typedef typename ProductType::Index Index;
-    typedef typename ProductType::LhsScalar   LhsScalar;
-    typedef typename ProductType::RhsScalar   RhsScalar;
-    typedef typename ProductType::Scalar      ResScalar;
-    typedef typename ProductType::RealScalar  RealScalar;
-    typedef typename ProductType::ActualLhsType ActualLhsType;
-    typedef typename ProductType::ActualRhsType ActualRhsType;
-    typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
-    typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
+    typedef typename Lhs::Scalar   LhsScalar;
+    typedef typename Rhs::Scalar   RhsScalar;
+    typedef typename Dest::Scalar  ResScalar;
+    typedef typename Dest::RealScalar  RealScalar;
+    
+    typedef internal::blas_traits<Lhs> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+    typedef internal::blas_traits<Rhs> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+  
     typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
 
-    ActualLhsType actualLhs = LhsBlasTraits::extract(prod.lhs());
-    ActualRhsType actualRhs = RhsBlasTraits::extract(prod.rhs());
+    ActualLhsType actualLhs = LhsBlasTraits::extract(lhs);
+    ActualRhsType actualRhs = RhsBlasTraits::extract(rhs);
 
-    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
-                                  * RhsBlasTraits::extractScalarFactor(prod.rhs());
+    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs)
+                                  * RhsBlasTraits::extractScalarFactor(rhs);
 
     enum {
       // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
@@ -435,18 +249,18 @@ template<> struct gemv_selector<OnTheRight,ColMajor,true>
 
     gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,MightCannotUseDest> static_dest;
 
-    bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
-    bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
-    
+    const bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
+    const bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
+
     RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);
 
     ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
                                                   evalToDest ? dest.data() : static_dest.data());
-    
+
     if(!evalToDest)
     {
       #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      int size = dest.size();
+      Index size = dest.size();
       EIGEN_DENSE_STORAGE_CTOR_PLUGIN
       #endif
       if(!alphaIsCompatible)
@@ -458,11 +272,13 @@ template<> struct gemv_selector<OnTheRight,ColMajor,true>
         MappedDest(actualDestPtr, dest.size()) = dest;
     }
 
+    typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
+    typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
     general_matrix_vector_product
-      <Index,LhsScalar,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run(
+        <Index,LhsScalar,LhsMapper,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
         actualLhs.rows(), actualLhs.cols(),
-        actualLhs.data(), actualLhs.outerStride(),
-        actualRhs.data(), actualRhs.innerStride(),
+        LhsMapper(actualLhs.data(), actualLhs.outerStride()),
+        RhsMapper(actualRhs.data(), actualRhs.innerStride()),
         actualDestPtr, 1,
         compatibleAlpha);
 
@@ -476,34 +292,34 @@ template<> struct gemv_selector<OnTheRight,ColMajor,true>
   }
 };
 
-template<> struct gemv_selector<OnTheRight,RowMajor,true>
+template<> struct gemv_dense_selector<OnTheRight,RowMajor,true>
 {
-  template<typename ProductType, typename Dest>
-  static void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
+  template<typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
   {
-    typedef typename ProductType::LhsScalar LhsScalar;
-    typedef typename ProductType::RhsScalar RhsScalar;
-    typedef typename ProductType::Scalar    ResScalar;
-    typedef typename ProductType::Index Index;
-    typedef typename ProductType::ActualLhsType ActualLhsType;
-    typedef typename ProductType::ActualRhsType ActualRhsType;
-    typedef typename ProductType::_ActualRhsType _ActualRhsType;
-    typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
-    typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
-
-    typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
-    typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
-
-    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
-                                  * RhsBlasTraits::extractScalarFactor(prod.rhs());
+    typedef typename Lhs::Scalar   LhsScalar;
+    typedef typename Rhs::Scalar   RhsScalar;
+    typedef typename Dest::Scalar  ResScalar;
+    
+    typedef internal::blas_traits<Lhs> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+    typedef internal::blas_traits<Rhs> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+    typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
+
+    typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
+    typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);
+
+    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs)
+                                  * RhsBlasTraits::extractScalarFactor(rhs);
 
     enum {
       // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
       // on, the other hand it is good for the cache to pack the vector anyways...
-      DirectlyUseRhs = _ActualRhsType::InnerStrideAtCompileTime==1
+      DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1
     };
 
-    gemv_static_vector_if<RhsScalar,_ActualRhsType::SizeAtCompileTime,_ActualRhsType::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
+    gemv_static_vector_if<RhsScalar,ActualRhsTypeCleaned::SizeAtCompileTime,ActualRhsTypeCleaned::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
 
     ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,actualRhs.size(),
         DirectlyUseRhs ? const_cast<RhsScalar*>(actualRhs.data()) : static_rhs.data());
@@ -511,45 +327,46 @@ template<> struct gemv_selector<OnTheRight,RowMajor,true>
     if(!DirectlyUseRhs)
     {
       #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      int size = actualRhs.size();
+      Index size = actualRhs.size();
       EIGEN_DENSE_STORAGE_CTOR_PLUGIN
       #endif
-      Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
+      Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
     }
 
+    typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
+    typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper;
     general_matrix_vector_product
-      <Index,LhsScalar,RowMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run(
+        <Index,LhsScalar,LhsMapper,RowMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
         actualLhs.rows(), actualLhs.cols(),
-        actualLhs.data(), actualLhs.outerStride(),
-        actualRhsPtr, 1,
+        LhsMapper(actualLhs.data(), actualLhs.outerStride()),
+        RhsMapper(actualRhsPtr, 1),
         dest.data(), dest.innerStride(),
         actualAlpha);
   }
 };
 
-template<> struct gemv_selector<OnTheRight,ColMajor,false>
+template<> struct gemv_dense_selector<OnTheRight,ColMajor,false>
 {
-  template<typename ProductType, typename Dest>
-  static void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
+  template<typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
   {
-    typedef typename Dest::Index Index;
-    // TODO makes sure dest is sequentially stored in memory, otherwise use a temp
-    const Index size = prod.rhs().rows();
+    // TODO if rhs is large enough it might be beneficial to make sure that dest is sequentially stored in memory, otherwise use a temp
+    typename nested_eval<Rhs,1>::type actual_rhs(rhs);
+    const Index size = rhs.rows();
     for(Index k=0; k<size; ++k)
-      dest += (alpha*prod.rhs().coeff(k)) * prod.lhs().col(k);
+      dest += (alpha*actual_rhs.coeff(k)) * lhs.col(k);
   }
 };
 
-template<> struct gemv_selector<OnTheRight,RowMajor,false>
+template<> struct gemv_dense_selector<OnTheRight,RowMajor,false>
 {
-  template<typename ProductType, typename Dest>
-  static void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
+  template<typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
   {
-    typedef typename Dest::Index Index;
-    // TODO makes sure rhs is sequentially stored in memory, otherwise use a temp
-    const Index rows = prod.rows();
+    typename nested_eval<Rhs,Lhs::RowsAtCompileTime>::type actual_rhs(rhs);
+    const Index rows = dest.rows();
     for(Index i=0; i<rows; ++i)
-      dest.coeffRef(i) += alpha * (prod.lhs().row(i).cwiseProduct(prod.rhs().transpose())).sum();
+      dest.coeffRef(i) += alpha * (lhs.row(i).cwiseProduct(actual_rhs.transpose())).sum();
   }
 };
 
@@ -565,9 +382,11 @@ template<> struct gemv_selector<OnTheRight,RowMajor,false>
   *
   * \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*()
   */
+#ifndef __CUDACC__
+
 template<typename Derived>
 template<typename OtherDerived>
-inline const typename ProductReturnType<Derived, OtherDerived>::Type
+inline const Product<Derived, OtherDerived>
 MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
 {
   // A note regarding the function declaration: In MSVC, this function will sometimes
@@ -592,9 +411,12 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
 #ifdef EIGEN_DEBUG_PRODUCT
   internal::product_type<Derived,OtherDerived>::debug();
 #endif
-  return typename ProductReturnType<Derived,OtherDerived>::Type(derived(), other.derived());
+
+  return Product<Derived, OtherDerived>(derived(), other.derived());
 }
 
+#endif // __CUDACC__
+
 /** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation.
   *
   * The returned product will behave like any other expressions: the coefficients of the product will be
@@ -608,7 +430,7 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
   */
 template<typename Derived>
 template<typename OtherDerived>
-const typename LazyProductReturnType<Derived,OtherDerived>::Type
+const Product<Derived,OtherDerived,LazyProduct>
 MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
 {
   enum {
@@ -627,7 +449,7 @@ MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
     INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION)
   EIGEN_STATIC_ASSERT(ProductIsValid || SameSizes, INVALID_MATRIX_PRODUCT)
 
-  return typename LazyProductReturnType<Derived,OtherDerived>::Type(derived(), other.derived());
+  return Product<Derived,OtherDerived,LazyProduct>(derived(), other.derived());
 }
 
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Core/GenericPacketMath.h b/nuparu/include/Eigen/src/Core/GenericPacketMath.h
index 5f783ebe..8ad51bad 100644
--- a/nuparu/include/Eigen/src/Core/GenericPacketMath.h
+++ b/nuparu/include/Eigen/src/Core/GenericPacketMath.h
@@ -42,21 +42,27 @@ namespace internal {
 struct default_packet_traits
 {
   enum {
+    HasHalfPacket = 0,
+
     HasAdd    = 1,
     HasSub    = 1,
     HasMul    = 1,
     HasNegate = 1,
     HasAbs    = 1,
+    HasArg    = 0,
     HasAbs2   = 1,
     HasMin    = 1,
     HasMax    = 1,
     HasConj   = 1,
     HasSetLinear = 1,
+    HasBlend  = 0,
 
     HasDiv    = 0,
     HasSqrt   = 0,
+    HasRsqrt  = 0,
     HasExp    = 0,
     HasLog    = 0,
+    HasLog10    = 0,
     HasPow    = 0,
 
     HasSin    = 0,
@@ -64,17 +70,31 @@ struct default_packet_traits
     HasTan    = 0,
     HasASin   = 0,
     HasACos   = 0,
-    HasATan   = 0
+    HasATan   = 0,
+    HasSinh    = 0,
+    HasCosh    = 0,
+    HasTanh    = 0,
+    HasLGamma = 0,
+    HasErf = 0,
+    HasErfc = 0,
+
+    HasRound  = 0,
+    HasFloor  = 0,
+    HasCeil   = 0,
+
+    HasSign   = 0
   };
 };
 
 template<typename T> struct packet_traits : default_packet_traits
 {
   typedef T type;
+  typedef T half;
   enum {
     Vectorizable = 0,
     size = 1,
-    AlignedOnScalar = 0
+    AlignedOnScalar = 0,
+    HasHalfPacket = 0
   };
   enum {
     HasAdd    = 0,
@@ -90,135 +110,250 @@ template<typename T> struct packet_traits : default_packet_traits
   };
 };
 
+template<typename T> struct packet_traits<const T> : packet_traits<T> { };
+
+template <typename Src, typename Tgt> struct type_casting_traits {
+  enum {
+    VectorizedCast = 0,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+
+/** \internal \returns static_cast<TgtType>(a) (coeff-wise) */
+template <typename SrcPacket, typename TgtPacket>
+EIGEN_DEVICE_FUNC inline TgtPacket
+pcast(const SrcPacket& a) {
+  return static_cast<TgtPacket>(a);
+}
+template <typename SrcPacket, typename TgtPacket>
+EIGEN_DEVICE_FUNC inline TgtPacket
+pcast(const SrcPacket& a, const SrcPacket& /*b*/) {
+  return static_cast<TgtPacket>(a);
+}
+
+
 /** \internal \returns a + b (coeff-wise) */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 padd(const Packet& a,
         const Packet& b) { return a+b; }
 
 /** \internal \returns a - b (coeff-wise) */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 psub(const Packet& a,
         const Packet& b) { return a-b; }
 
 /** \internal \returns -a (coeff-wise) */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pnegate(const Packet& a) { return -a; }
 
 /** \internal \returns conj(a) (coeff-wise) */
-template<typename Packet> inline Packet
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pconj(const Packet& a) { return numext::conj(a); }
 
 /** \internal \returns a * b (coeff-wise) */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pmul(const Packet& a,
         const Packet& b) { return a*b; }
 
 /** \internal \returns a / b (coeff-wise) */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pdiv(const Packet& a,
         const Packet& b) { return a/b; }
 
 /** \internal \returns the min of \a a and \a b  (coeff-wise) */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pmin(const Packet& a,
-        const Packet& b) { using std::min; return (min)(a, b); }
+        const Packet& b) { return numext::mini(a, b); }
 
 /** \internal \returns the max of \a a and \a b  (coeff-wise) */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pmax(const Packet& a,
-        const Packet& b) { using std::max; return (max)(a, b); }
+        const Packet& b) { return numext::maxi(a, b); }
 
 /** \internal \returns the absolute value of \a a */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pabs(const Packet& a) { using std::abs; return abs(a); }
 
+/** \internal \returns the phase angle of \a a */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+parg(const Packet& a) { using numext::arg; return arg(a); }
+
 /** \internal \returns the bitwise and of \a a and \a b */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pand(const Packet& a, const Packet& b) { return a & b; }
 
 /** \internal \returns the bitwise or of \a a and \a b */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 por(const Packet& a, const Packet& b) { return a | b; }
 
 /** \internal \returns the bitwise xor of \a a and \a b */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pxor(const Packet& a, const Packet& b) { return a ^ b; }
 
 /** \internal \returns the bitwise andnot of \a a and \a b */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pandnot(const Packet& a, const Packet& b) { return a & (!b); }
 
 /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pload(const typename unpacket_traits<Packet>::type* from) { return *from; }
 
 /** \internal \returns a packet version of \a *from, (un-aligned load) */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }
 
+/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pset1(const typename unpacket_traits<Packet>::type& a) { return a; }
+
+/** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pload1(const typename unpacket_traits<Packet>::type  *a) { return pset1<Packet>(*a); }
+
 /** \internal \returns a packet with elements of \a *from duplicated.
-  * For instance, for a packet of 8 elements, 4 scalar will be read from \a *from and
-  * duplicated to form: {from[0],from[0],from[1],from[1],,from[2],from[2],,from[3],from[3]}
+  * For instance, for a packet of 8 elements, 4 scalars will be read from \a *from and
+  * duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]}
   * Currently, this function is only used for scalar * complex products.
- */
-template<typename Packet> inline Packet
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }
 
-/** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
-template<typename Packet> inline Packet
-pset1(const typename unpacket_traits<Packet>::type& a) { return a; }
+/** \internal \returns a packet with elements of \a *from quadrupled.
+  * For instance, for a packet of 8 elements, 2 scalars will be read from \a *from and
+  * replicated to form: {from[0],from[0],from[0],from[0],from[1],from[1],from[1],from[1]}
+  * Currently, this function is only used in matrix products.
+  * For packet-size smaller than or equal to 4, this function is equivalent to pload1.
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+ploadquad(const typename unpacket_traits<Packet>::type* from)
+{ return pload1<Packet>(from); }
+
+/** \internal equivalent to
+  * \code
+  * a0 = pload1(a+0);
+  * a1 = pload1(a+1);
+  * a2 = pload1(a+2);
+  * a3 = pload1(a+3);
+  * \endcode
+  * \sa pset1, pload1, ploaddup, pbroadcast2
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC
+inline void pbroadcast4(const typename unpacket_traits<Packet>::type *a,
+                        Packet& a0, Packet& a1, Packet& a2, Packet& a3)
+{
+  a0 = pload1<Packet>(a+0);
+  a1 = pload1<Packet>(a+1);
+  a2 = pload1<Packet>(a+2);
+  a3 = pload1<Packet>(a+3);
+}
+
+/** \internal equivalent to
+  * \code
+  * a0 = pload1(a+0);
+  * a1 = pload1(a+1);
+  * \endcode
+  * \sa pset1, pload1, ploaddup, pbroadcast4
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC
+inline void pbroadcast2(const typename unpacket_traits<Packet>::type *a,
+                        Packet& a0, Packet& a1)
+{
+  a0 = pload1<Packet>(a+0);
+  a1 = pload1<Packet>(a+1);
+}
 
 /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */
-template<typename Scalar> inline typename packet_traits<Scalar>::type
-plset(const Scalar& a) { return a; }
+template<typename Packet> inline Packet
+plset(const typename unpacket_traits<Packet>::type& a) { return a; }
 
 /** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
-template<typename Scalar, typename Packet> inline void pstore(Scalar* to, const Packet& from)
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(Scalar* to, const Packet& from)
 { (*to) = from; }
 
 /** \internal copy the packet \a from to \a *to, (un-aligned store) */
-template<typename Scalar, typename Packet> inline void pstoreu(Scalar* to, const Packet& from)
-{ (*to) = from; }
+template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from)
+{  (*to) = from; }
+
+ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, Index /*stride*/)
+ { return ploadu<Packet>(from); }
+
+ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, Index /*stride*/)
+ { pstore(to, from); }
 
 /** \internal tries to do cache prefetching of \a addr */
 template<typename Scalar> inline void prefetch(const Scalar* addr)
 {
-#if !defined(_MSC_VER)
-__builtin_prefetch(addr);
+#ifdef __CUDA_ARCH__
+#if defined(__LP64__)
+  // 64-bit pointer operand constraint for inlined asm
+  asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
+#else
+  // 32-bit pointer operand constraint for inlined asm
+  asm(" prefetch.L1 [ %1 ];" : "=r"(addr) : "r"(addr));
+#endif
+#elif !EIGEN_COMP_MSVC
+  __builtin_prefetch(addr);
 #endif
 }
 
 /** \internal \returns the first element of a packet */
-template<typename Packet> inline typename unpacket_traits<Packet>::type pfirst(const Packet& a)
+template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type pfirst(const Packet& a)
 { return a; }
 
 /** \internal \returns a packet where the element i contains the sum of the packet of \a vec[i] */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 preduxp(const Packet* vecs) { return vecs[0]; }
 
 /** \internal \returns the sum of the elements of \a a*/
-template<typename Packet> inline typename unpacket_traits<Packet>::type predux(const Packet& a)
+template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux(const Packet& a)
+{ return a; }
+
+/** \internal \returns the sum of the elements of \a a by block of 4 elements.
+  * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7}
+  * For packet-size smaller than or equal to 4, this boils down to a noop.
+  */
+template<typename Packet> EIGEN_DEVICE_FUNC inline
+typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_traits<Packet>::half,Packet>::type
+predux4(const Packet& a)
 { return a; }
 
 /** \internal \returns the product of the elements of \a a*/
-template<typename Packet> inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a)
+template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a)
 { return a; }
 
 /** \internal \returns the min of the elements of \a a*/
-template<typename Packet> inline typename unpacket_traits<Packet>::type predux_min(const Packet& a)
+template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(const Packet& a)
 { return a; }
 
 /** \internal \returns the max of the elements of \a a*/
-template<typename Packet> inline typename unpacket_traits<Packet>::type predux_max(const Packet& a)
+template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(const Packet& a)
 { return a; }
 
 /** \internal \returns the reversed elements of \a a*/
-template<typename Packet> inline Packet preverse(const Packet& a)
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a)
 { return a; }
 
+template<size_t offset, typename Packet>
+struct protate_impl
+{
+  // Empty so attempts to use this unimplemented path will fail to compile.
+  // Only specializations of this template should be used.
+};
+
+/** \internal \returns a packet with the coefficients rotated to the right in little-endian convention,
+  * by the given offset, e.g. for offset == 1:
+  *     (packet[3], packet[2], packet[1], packet[0]) becomes (packet[0], packet[3], packet[2], packet[1])
+  */
+template<size_t offset, typename Packet> EIGEN_DEVICE_FUNC inline Packet protate(const Packet& a)
+{
+  return offset ? protate_impl<offset, Packet>::run(a) : a;
+}
 
 /** \internal \returns \a a with real and imaginary part flipped (for complex type only) */
-template<typename Packet> inline Packet pcplxflip(const Packet& a)
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a)
 {
   // FIXME: uncomment the following in case we drop the internal imag and real functions.
 //   using std::imag;
@@ -250,6 +385,22 @@ Packet pasin(const Packet& a) { using std::asin; return asin(a); }
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pacos(const Packet& a) { using std::acos; return acos(a); }
 
+/** \internal \returns the arc tangent of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet patan(const Packet& a) { using std::atan; return atan(a); }
+
+/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet psinh(const Packet& a) { using std::sinh; return sinh(a); }
+
+/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pcosh(const Packet& a) { using std::cosh; return cosh(a); }
+
+/** \internal \returns the hyperbolic tangent of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet ptanh(const Packet& a) { using std::tanh; return tanh(a); }
+
 /** \internal \returns the exp of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pexp(const Packet& a) { using std::exp; return exp(a); }
@@ -258,10 +409,44 @@ Packet pexp(const Packet& a) { using std::exp; return exp(a); }
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet plog(const Packet& a) { using std::log; return log(a); }
 
+/** \internal \returns the log10 of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet plog10(const Packet& a) { using std::log10; return log10(a); }
+
 /** \internal \returns the square-root of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet psqrt(const Packet& a) { using std::sqrt; return sqrt(a); }
 
+/** \internal \returns the reciprocal square-root of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet prsqrt(const Packet& a) {
+  return pdiv(pset1<Packet>(1), psqrt(a));
+}
+
+/** \internal \returns the rounded value of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pround(const Packet& a) { using numext::round; return round(a); }
+
+/** \internal \returns the floor of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pfloor(const Packet& a) { using numext::floor; return floor(a); }
+
+/** \internal \returns the ceil of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); }
+
+/** \internal \returns the ln(|gamma(\a a)|) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet plgamma(const Packet& a) { using numext::lgamma; return lgamma(a); }
+
+/** \internal \returns the erf(\a a) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet perf(const Packet& a) { using numext::erf; return erf(a); }
+
+/** \internal \returns the erfc(\a a) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet perfc(const Packet& a) { using numext::erfc; return erfc(a); }
+
 /***************************************************************************
 * The following functions might not have to be overwritten for vectorized types
 ***************************************************************************/
@@ -275,34 +460,45 @@ inline void pstore1(typename unpacket_traits<Packet>::type* to, const typename u
 }
 
 /** \internal \returns a * b + c (coeff-wise) */
-template<typename Packet> inline Packet
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pmadd(const Packet&  a,
          const Packet&  b,
          const Packet&  c)
 { return padd(pmul(a, b),c); }
 
 /** \internal \returns a packet version of \a *from.
-  * If LoadMode equals #Aligned, \a from must be 16 bytes aligned */
-template<typename Packet, int LoadMode>
-inline Packet ploadt(const typename unpacket_traits<Packet>::type* from)
+  * The pointer \a from must be aligned on a \a Alignment bytes boundary. */
+template<typename Packet, int Alignment>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt(const typename unpacket_traits<Packet>::type* from)
 {
-  if(LoadMode == Aligned)
+  if(Alignment >= unpacket_traits<Packet>::alignment)
     return pload<Packet>(from);
   else
     return ploadu<Packet>(from);
 }
 
 /** \internal copy the packet \a from to \a *to.
-  * If StoreMode equals #Aligned, \a to must be 16 bytes aligned */
-template<typename Scalar, typename Packet, int LoadMode>
-inline void pstoret(Scalar* to, const Packet& from)
+  * The pointer \a to must be aligned on a \a Alignment bytes boundary. */
+template<typename Scalar, typename Packet, int Alignment>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& from)
 {
-  if(LoadMode == Aligned)
+  if(Alignment >= unpacket_traits<Packet>::alignment)
     pstore(to, from);
   else
     pstoreu(to, from);
 }
 
+/** \internal \returns a packet version of \a *from.
+  * Unlike ploadt, ploadt_ro takes advantage of the read-only memory path on the
+  * hardware if available to speed up the loading of data that won't be modified
+  * by the current computation. This generic version simply forwards to ploadt.
+  */
+template<typename Packet, int LoadMode>
+EIGEN_DEVICE_FUNC inline Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
+{
+  return ploadt<Packet, LoadMode>(from);
+}
+
 /** \internal default implementation of palign() allowing partial specialization */
 template<int Offset,typename PacketType>
 struct palign_impl
@@ -336,15 +532,46 @@ inline void palign(PacketType& first, const PacketType& second)
 * Fast complex products (GCC generates a function call which is very slow)
 ***************************************************************************/
 
+// Eigen+CUDA does not support complexes.
+#ifndef __CUDACC__
+
 template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
 { return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
 
 template<> inline std::complex<double> pmul(const std::complex<double>& a, const std::complex<double>& b)
 { return std::complex<double>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
 
+#endif
+
+
+/***************************************************************************
+ * PacketBlock, that is a collection of N packets where the number of words
+ * in the packet is a multiple of N.
+***************************************************************************/
+template <typename Packet,int N=unpacket_traits<Packet>::size> struct PacketBlock {  // N defaults to unpacket_traits<Packet>::size
+  Packet packet[N];  // the N packets making up the block
+};
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet,1>& /*kernel*/) {
+  // Nothing to do in the scalar case, i.e. a 1x1 matrix.
+}
+
+/***************************************************************************
+ * Selector, i.e. vector of N boolean values used to select (i.e. blend)
+ * words from 2 packets.
+***************************************************************************/
+template <size_t N> struct Selector {
+  bool select[N];  // N boolean flags; pblend below is parameterized on unpacket_traits<Packet>::size of them
+};
+
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pblend(const Selector<unpacket_traits<Packet>::size>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
+  return ifPacket.select[0] ? thenPacket : elsePacket;  // generic version: only select[0] is consulted and a whole packet is returned
+}
+
 } // end namespace internal
 
 } // end namespace Eigen
 
 #endif // EIGEN_GENERIC_PACKET_MATH_H
-
diff --git a/nuparu/include/Eigen/src/Core/GlobalFunctions.h b/nuparu/include/Eigen/src/Core/GlobalFunctions.h
index 2acf9772..62fec700 100644
--- a/nuparu/include/Eigen/src/Core/GlobalFunctions.h
+++ b/nuparu/include/Eigen/src/Core/GlobalFunctions.h
@@ -14,8 +14,8 @@
 #define EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(NAME,FUNCTOR) \
   template<typename Derived> \
   inline const Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived> \
-  NAME(const Eigen::ArrayBase<Derived>& x) { \
-    return x.derived(); \
+  (NAME)(const Eigen::ArrayBase<Derived>& x) { \
+    return Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived>(x.derived()); \
   }
 
 #define EIGEN_ARRAY_DECLARE_GLOBAL_EIGEN_UNARY(NAME,FUNCTOR) \
@@ -30,25 +30,44 @@
   { \
     static inline typename NAME##_retval<ArrayBase<Derived> >::type run(const Eigen::ArrayBase<Derived>& x) \
     { \
-      return x.derived(); \
+      return typename NAME##_retval<ArrayBase<Derived> >::type(x.derived()); \
     } \
   };
 
-
 namespace Eigen
 {
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(real,scalar_real_op)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(imag,scalar_imag_op)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(conj,scalar_conjugate_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(inverse,scalar_inverse_op)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sin,scalar_sin_op)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cos,scalar_cos_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tan,scalar_tan_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atan,scalar_atan_op)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asin,scalar_asin_op)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acos,scalar_acos_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tan,scalar_tan_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erfc,scalar_erfc_op)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log10,scalar_log10_op)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs,scalar_abs_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs2,scalar_abs2_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(arg,scalar_arg_op)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sqrt,scalar_sqrt_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(square,scalar_square_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cube,scalar_cube_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(round,scalar_round_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(floor,scalar_floor_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ceil,scalar_ceil_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isnan,scalar_isnan_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isinf,scalar_isinf_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isfinite,scalar_isfinite_op)
+  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sign,scalar_sign_op)
   
   template<typename Derived>
   inline const Eigen::CwiseUnaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar>, const Derived>
@@ -56,16 +75,46 @@ namespace Eigen
     return x.derived().pow(exponent);
   }
 
-  template<typename Derived>
-  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename Derived::Scalar>, const Derived, const Derived>
-  pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<Derived>& exponents) 
+  /** \returns an expression of the coefficient-wise power of \a x to the given array of \a exponents.
+    *
+    * This function computes the coefficient-wise power.
+    *
+    * Example: \include Cwise_array_power_array.cpp
+    * Output: \verbinclude Cwise_array_power_array.out
+    * 
+    * \sa ArrayBase::pow()
+    */
+  template<typename Derived,typename ExponentDerived>
+  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>
+  pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<ExponentDerived>& exponents) 
   {
-    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename Derived::Scalar>, const Derived, const Derived>(
+    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>(
       x.derived(),
       exponents.derived()
     );
   }
   
+  /** \returns an expression of the coefficient-wise power of the scalar \a x to the given array of \a exponents.
+    *
+    * This function computes the coefficient-wise power between a scalar and an array of exponents.
+    * Be aware that the scalar type of the input scalar \a x and the exponents \a exponents must be the same.
+    *
+    * Example: \include Cwise_scalar_power_array.cpp
+    * Output: \verbinclude Cwise_scalar_power_array.out
+    * 
+    * \sa ArrayBase::pow()
+    */
+  template<typename Derived>
+  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename Derived::Scalar>, const typename Derived::ConstantReturnType, const Derived>
+  pow(const typename Derived::Scalar& x, const Eigen::ArrayBase<Derived>& exponents) 
+  {
+    typename Derived::ConstantReturnType constant_x(exponents.rows(), exponents.cols(), x);  // built from the rows/cols of exponents and the value x
+    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename Derived::Scalar>, const typename Derived::ConstantReturnType, const Derived>(
+      constant_x,
+      exponents.derived()
+    );
+  }
+  
   /**
   * \brief Component-wise division of a scalar by array elements.
   **/
diff --git a/nuparu/include/Eigen/src/Core/IO.h b/nuparu/include/Eigen/src/Core/IO.h
index c8d5f637..9ae37bb5 100644
--- a/nuparu/include/Eigen/src/Core/IO.h
+++ b/nuparu/include/Eigen/src/Core/IO.h
@@ -49,7 +49,7 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
   */
 struct IOFormat
 {
-  /** Default contructor, see class IOFormat for the meaning of the parameters */
+  /** Default constructor, see class IOFormat for the meaning of the parameters */
   IOFormat(int _precision = StreamPrecision, int _flags = 0,
     const std::string& _coeffSeparator = " ",
     const std::string& _rowSeparator = "\n", const std::string& _rowPrefix="", const std::string& _rowSuffix="",
@@ -57,6 +57,10 @@ struct IOFormat
   : matPrefix(_matPrefix), matSuffix(_matSuffix), rowPrefix(_rowPrefix), rowSuffix(_rowSuffix), rowSeparator(_rowSeparator),
     rowSpacer(""), coeffSeparator(_coeffSeparator), precision(_precision), flags(_flags)
   {
+    // TODO check if rowPrefix, rowSuffix or rowSeparator contains a newline
+    // don't add rowSpacer if columns are not to be aligned
+    if((flags & DontAlignCols))
+      return;
     int i = int(matSuffix.length())-1;
     while (i>=0 && matSuffix[i]!='\n')
     {
@@ -160,7 +164,6 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
   
   typename Derived::Nested m = _m;
   typedef typename Derived::Scalar Scalar;
-  typedef typename Derived::Index Index;
 
   Index width = 0;
 
@@ -185,21 +188,22 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
     explicit_precision = fmt.precision;
   }
 
+  std::streamsize old_precision = 0;
+  if(explicit_precision) old_precision = s.precision(explicit_precision);
+
   bool align_cols = !(fmt.flags & DontAlignCols);
   if(align_cols)
   {
     // compute the largest width
-    for(Index j = 1; j < m.cols(); ++j)
+    for(Index j = 0; j < m.cols(); ++j)
       for(Index i = 0; i < m.rows(); ++i)
       {
         std::stringstream sstr;
-        if(explicit_precision) sstr.precision(explicit_precision);
+        sstr.copyfmt(s);
         sstr << m.coeff(i,j);
         width = std::max<Index>(width, Index(sstr.str().length()));
       }
   }
-  std::streamsize old_precision = 0;
-  if(explicit_precision) old_precision = s.precision(explicit_precision);
   s << fmt.matPrefix;
   for(Index i = 0; i < m.rows(); ++i)
   {
diff --git a/nuparu/include/Eigen/src/Core/Inverse.h b/nuparu/include/Eigen/src/Core/Inverse.h
new file mode 100644
index 00000000..f3ec8499
--- /dev/null
+++ b/nuparu/include/Eigen/src/Core/Inverse.h
@@ -0,0 +1,117 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_INVERSE_H
+#define EIGEN_INVERSE_H
+
+namespace Eigen { 
+
+template<typename XprType,typename StorageKind> class InverseImpl;
+
+namespace internal {
+
+template<typename XprType>
+struct traits<Inverse<XprType> >
+  : traits<typename XprType::PlainObject>  // start from the traits of XprType's plain object type
+{
+  typedef typename XprType::PlainObject PlainObject;
+  typedef traits<PlainObject> BaseTraits;
+  enum {
+    Flags = BaseTraits::Flags & RowMajorBit  // keep only the RowMajorBit of the inherited flags
+  };
+};
+
+} // end namespace internal
+
+/** \class Inverse
+  *
+  * \brief Expression of the inverse of another expression
+  *
+  * \tparam XprType the type of the expression whose inverse is taken
+  *
+  * This class represents an abstract expression of A.inverse()
+  * and most of the time this is the only way it is used.
+  *
+  */
+template<typename XprType>
+class Inverse : public InverseImpl<XprType,typename internal::traits<XprType>::StorageKind>
+{
+public:
+  typedef typename XprType::StorageIndex StorageIndex;
+  typedef typename XprType::PlainObject                       PlainObject;
+  typedef typename internal::ref_selector<XprType>::type      XprTypeNested;
+  typedef typename internal::remove_all<XprTypeNested>::type  XprTypeNestedCleaned;
+  typedef typename internal::ref_selector<Inverse>::type Nested;
+  typedef typename internal::remove_all<XprType>::type NestedExpression;
+  
+  explicit Inverse(const XprType &xpr)  // wraps xpr; nothing is computed here
+    : m_xpr(xpr)
+  {}
+
+  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }  // dimensions are those of the wrapped expression
+  EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
+
+  EIGEN_DEVICE_FUNC const XprTypeNestedCleaned& nestedExpression() const { return m_xpr; }
+
+protected:
+  XprTypeNested m_xpr;  // the wrapped expression, held with the type chosen by internal::ref_selector
+};
+
+// Generic API dispatcher
+template<typename XprType, typename StorageKind>
+class InverseImpl
+  : public internal::generic_xpr_base<Inverse<XprType> >::type
+{
+public:
+  typedef typename internal::generic_xpr_base<Inverse<XprType> >::type Base;
+  typedef typename XprType::Scalar Scalar;
+private:
+  // Declared private with no definition in this file; presumably this disables coefficient access (TODO confirm).
+  Scalar coeff(Index row, Index col) const;
+  Scalar coeff(Index i) const;
+};
+
+namespace internal {
+
+/** \internal
+  * \brief Default evaluator for Inverse expression.
+  * 
+  * This default evaluator for Inverse expression simply evaluates the inverse into a temporary
+  * by a call to internal::call_assignment_no_alias.
+  * Therefore, inverse implementers only have to specialize Assignment<Dst,Inverse<...>, ...> for
+  * their own nested expression.
+  *
+  * \sa class Inverse
+  */
+template<typename ArgType>
+struct unary_evaluator<Inverse<ArgType> >
+  : public evaluator<typename Inverse<ArgType>::PlainObject>
+{
+  typedef Inverse<ArgType> InverseType;
+  typedef typename InverseType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
+  
+  enum { Flags = Base::Flags | EvalBeforeNestingBit };
+
+  unary_evaluator(const InverseType& inv_xpr)
+    : m_result(inv_xpr.rows(), inv_xpr.cols())  // temporary sized like the inverse expression
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);  // construct the base evaluator in place on m_result
+    internal::call_assignment_no_alias(m_result, inv_xpr);  // evaluate the inverse into the temporary
+  }
+  
+protected:
+  PlainObject m_result;
+};
+  
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_INVERSE_H
diff --git a/nuparu/include/Eigen/src/Core/Map.h b/nuparu/include/Eigen/src/Core/Map.h
index f804c89d..3a8375da 100644
--- a/nuparu/include/Eigen/src/Core/Map.h
+++ b/nuparu/include/Eigen/src/Core/Map.h
@@ -19,7 +19,7 @@ namespace Eigen {
   * \brief A matrix or vector expression mapping an existing array of data.
   *
   * \tparam PlainObjectType the equivalent matrix type of the mapped data
-  * \tparam MapOptions specifies whether the pointer is \c #Aligned, or \c #Unaligned.
+  * \tparam MapOptions specifies the pointer alignment in bytes. It can be: \c #Aligned128, \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned.
   *                The default is \c #Unaligned.
   * \tparam StrideType optionally specifies strides. By default, Map assumes the memory layout
   *                   of an ordinary, contiguous array. This can be overridden by specifying strides.
@@ -70,8 +70,6 @@ struct traits<Map<PlainObjectType, MapOptions, StrideType> >
   : public traits<PlainObjectType>
 {
   typedef traits<PlainObjectType> TraitsBase;
-  typedef typename PlainObjectType::Index Index;
-  typedef typename PlainObjectType::Scalar Scalar;
   enum {
     InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
                              ? int(PlainObjectType::InnerStrideAtCompileTime)
@@ -79,22 +77,9 @@ struct traits<Map<PlainObjectType, MapOptions, StrideType> >
     OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
                              ? int(PlainObjectType::OuterStrideAtCompileTime)
                              : int(StrideType::OuterStrideAtCompileTime),
-    HasNoInnerStride = InnerStrideAtCompileTime == 1,
-    HasNoOuterStride = StrideType::OuterStrideAtCompileTime == 0,
-    HasNoStride = HasNoInnerStride && HasNoOuterStride,
-    IsAligned = bool(EIGEN_ALIGN) && ((int(MapOptions)&Aligned)==Aligned),
-    IsDynamicSize = PlainObjectType::SizeAtCompileTime==Dynamic,
-    KeepsPacketAccess = bool(HasNoInnerStride)
-                        && ( bool(IsDynamicSize)
-                           || HasNoOuterStride
-                           || ( OuterStrideAtCompileTime!=Dynamic
-                           && ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime)%16)==0 ) ),
+    Alignment = int(MapOptions)&int(AlignedMask),
     Flags0 = TraitsBase::Flags & (~NestByRefBit),
-    Flags1 = IsAligned ? (int(Flags0) | AlignedBit) : (int(Flags0) & ~AlignedBit),
-    Flags2 = (bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime))
-           ? int(Flags1) : int(Flags1 & ~LinearAccessBit),
-    Flags3 = is_lvalue<PlainObjectType>::value ? int(Flags2) : (int(Flags2) & ~LvalueBit),
-    Flags = KeepsPacketAccess ? int(Flags3) : (int(Flags3) & ~PacketAccessBit)
+    Flags = is_lvalue<PlainObjectType>::value ? int(Flags0) : (int(Flags0) & ~LvalueBit)
   };
 private:
   enum { Options }; // Expressions don't have Options
@@ -110,19 +95,17 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
     EIGEN_DENSE_PUBLIC_INTERFACE(Map)
 
     typedef typename Base::PointerType PointerType;
-#if EIGEN2_SUPPORT_STAGE <= STAGE30_FULL_EIGEN3_API
-    typedef const Scalar* PointerArgType;
-    inline PointerType cast_to_pointer_type(PointerArgType ptr) { return const_cast<PointerType>(ptr); }
-#else
     typedef PointerType PointerArgType;
+    EIGEN_DEVICE_FUNC
     inline PointerType cast_to_pointer_type(PointerArgType ptr) { return ptr; }
-#endif
 
+    EIGEN_DEVICE_FUNC
     inline Index innerStride() const
     {
       return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
     }
 
+    EIGEN_DEVICE_FUNC
     inline Index outerStride() const
     {
       return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
@@ -134,10 +117,11 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
     /** Constructor in the fixed-size case.
       *
       * \param dataPtr pointer to the array to map
-      * \param a_stride optional Stride object, passing the strides.
+      * \param stride optional Stride object, passing the strides.
       */
-    inline Map(PointerArgType dataPtr, const StrideType& a_stride = StrideType())
-      : Base(cast_to_pointer_type(dataPtr)), m_stride(a_stride)
+    EIGEN_DEVICE_FUNC
+    explicit inline Map(PointerArgType dataPtr, const StrideType& stride = StrideType())
+      : Base(cast_to_pointer_type(dataPtr)), m_stride(stride)
     {
       PlainObjectType::Base::_check_template_params();
     }
@@ -145,11 +129,12 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
     /** Constructor in the dynamic-size vector case.
       *
       * \param dataPtr pointer to the array to map
-      * \param a_size the size of the vector expression
-      * \param a_stride optional Stride object, passing the strides.
+      * \param size the size of the vector expression
+      * \param stride optional Stride object, passing the strides.
       */
-    inline Map(PointerArgType dataPtr, Index a_size, const StrideType& a_stride = StrideType())
-      : Base(cast_to_pointer_type(dataPtr), a_size), m_stride(a_stride)
+    EIGEN_DEVICE_FUNC
+    inline Map(PointerArgType dataPtr, Index size, const StrideType& stride = StrideType())
+      : Base(cast_to_pointer_type(dataPtr), size), m_stride(stride)
     {
       PlainObjectType::Base::_check_template_params();
     }
@@ -157,12 +142,13 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
     /** Constructor in the dynamic-size matrix case.
       *
       * \param dataPtr pointer to the array to map
-      * \param nbRows the number of rows of the matrix expression
-      * \param nbCols the number of columns of the matrix expression
-      * \param a_stride optional Stride object, passing the strides.
+      * \param rows the number of rows of the matrix expression
+      * \param cols the number of columns of the matrix expression
+      * \param stride optional Stride object, passing the strides.
       */
-    inline Map(PointerArgType dataPtr, Index nbRows, Index nbCols, const StrideType& a_stride = StrideType())
-      : Base(cast_to_pointer_type(dataPtr), nbRows, nbCols), m_stride(a_stride)
+    EIGEN_DEVICE_FUNC
+    inline Map(PointerArgType dataPtr, Index rows, Index cols, const StrideType& stride = StrideType())
+      : Base(cast_to_pointer_type(dataPtr), rows, cols), m_stride(stride)
     {
       PlainObjectType::Base::_check_template_params();
     }
@@ -173,19 +159,6 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
     StrideType m_stride;
 };
 
-template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-inline Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>
-  ::Array(const Scalar *data)
-{
-  this->_set_noalias(Eigen::Map<const Array>(data));
-}
-
-template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-inline Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>
-  ::Matrix(const Scalar *data)
-{
-  this->_set_noalias(Eigen::Map<const Matrix>(data));
-}
 
 } // end namespace Eigen
 
diff --git a/nuparu/include/Eigen/src/Core/MapBase.h b/nuparu/include/Eigen/src/Core/MapBase.h
index 6876de58..75a80daa 100644
--- a/nuparu/include/Eigen/src/Core/MapBase.h
+++ b/nuparu/include/Eigen/src/Core/MapBase.h
@@ -12,7 +12,7 @@
 #define EIGEN_MAPBASE_H
 
 #define EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived) \
-      EIGEN_STATIC_ASSERT((int(internal::traits<Derived>::Flags) & LinearAccessBit) || Derived::IsVectorAtCompileTime, \
+      EIGEN_STATIC_ASSERT((int(internal::evaluator<Derived>::Flags) & LinearAccessBit) || Derived::IsVectorAtCompileTime, \
                           YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT)
 
 namespace Eigen { 
@@ -37,7 +37,6 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
     };
 
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef typename internal::packet_traits<Scalar>::type PacketScalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
@@ -76,8 +75,8 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
 
     typedef typename Base::CoeffReturnType CoeffReturnType;
 
-    inline Index rows() const { return m_rows.value(); }
-    inline Index cols() const { return m_cols.value(); }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_rows.value(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_cols.value(); }
 
     /** Returns a pointer to the first coefficient of the matrix or vector.
       *
@@ -85,24 +84,28 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
       *
       * \sa innerStride(), outerStride()
       */
-    inline const Scalar* data() const { return m_data; }
+    EIGEN_DEVICE_FUNC inline const Scalar* data() const { return m_data; }
 
+    EIGEN_DEVICE_FUNC
     inline const Scalar& coeff(Index rowId, Index colId) const
     {
       return m_data[colId * colStride() + rowId * rowStride()];
     }
 
+    EIGEN_DEVICE_FUNC
     inline const Scalar& coeff(Index index) const
     {
       EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
       return m_data[index * innerStride()];
     }
 
+    EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index rowId, Index colId) const
     {
       return this->m_data[colId * colStride() + rowId * rowStride()];
     }
 
+    EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index index) const
     {
       EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
@@ -123,12 +126,14 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
       return internal::ploadt<PacketScalar, LoadMode>(m_data + index * innerStride());
     }
 
-    inline MapBase(PointerType dataPtr) : m_data(dataPtr), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime)
+    EIGEN_DEVICE_FUNC
+    explicit inline MapBase(PointerType dataPtr) : m_data(dataPtr), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime)
     {
       EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
       checkSanity();
     }
 
+    EIGEN_DEVICE_FUNC
     inline MapBase(PointerType dataPtr, Index vecSize)
             : m_data(dataPtr),
               m_rows(RowsAtCompileTime == Dynamic ? vecSize : Index(RowsAtCompileTime)),
@@ -140,24 +145,28 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
       checkSanity();
     }
 
-    inline MapBase(PointerType dataPtr, Index nbRows, Index nbCols)
-            : m_data(dataPtr), m_rows(nbRows), m_cols(nbCols)
+    EIGEN_DEVICE_FUNC
+    inline MapBase(PointerType dataPtr, Index rows, Index cols)
+            : m_data(dataPtr), m_rows(rows), m_cols(cols)
     {
       eigen_assert( (dataPtr == 0)
-              || (   nbRows >= 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == nbRows)
-                  && nbCols >= 0 && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == nbCols)));
+              || (   rows >= 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows)
+                  && cols >= 0 && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols)));
       checkSanity();
     }
 
+    #ifdef EIGEN_MAPBASE_PLUGIN
+    #include EIGEN_MAPBASE_PLUGIN
+    #endif
+
   protected:
 
+    EIGEN_DEVICE_FUNC
     void checkSanity() const
     {
-      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(internal::traits<Derived>::Flags&PacketAccessBit,
-                                        internal::inner_stride_at_compile_time<Derived>::ret==1),
-                          PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
-      eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::Flags&AlignedBit, (size_t(m_data) % 16) == 0)
-                   && "data is not aligned");
+#if EIGEN_MAX_ALIGN_BYTES>0
+      eigen_assert(((size_t(m_data) % EIGEN_PLAIN_ENUM_MAX(1,internal::traits<Derived>::Alignment)) == 0) && "data is not aligned");
+#endif
     }
 
     PointerType m_data;
@@ -168,13 +177,14 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
 template<typename Derived> class MapBase<Derived, WriteAccessors>
   : public MapBase<Derived, ReadOnlyAccessors>
 {
+    typedef MapBase<Derived, ReadOnlyAccessors> ReadOnlyMapBase;
   public:
 
     typedef MapBase<Derived, ReadOnlyAccessors> Base;
 
     typedef typename Base::Scalar Scalar;
     typedef typename Base::PacketScalar PacketScalar;
-    typedef typename Base::Index Index;
+    typedef typename Base::StorageIndex StorageIndex;
     typedef typename Base::PointerType PointerType;
 
     using Base::derived;
@@ -195,14 +205,18 @@ template<typename Derived> class MapBase<Derived, WriteAccessors>
                     const Scalar
                   >::type ScalarWithConstIfNotLvalue;
 
+    EIGEN_DEVICE_FUNC
     inline const Scalar* data() const { return this->m_data; }
+    EIGEN_DEVICE_FUNC
     inline ScalarWithConstIfNotLvalue* data() { return this->m_data; } // no const-cast here so non-const-correct code will give a compile error
 
+    EIGEN_DEVICE_FUNC
     inline ScalarWithConstIfNotLvalue& coeffRef(Index row, Index col)
     {
       return this->m_data[col * colStride() + row * rowStride()];
     }
 
+    EIGEN_DEVICE_FUNC
     inline ScalarWithConstIfNotLvalue& coeffRef(Index index)
     {
       EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
@@ -224,19 +238,24 @@ template<typename Derived> class MapBase<Derived, WriteAccessors>
                 (this->m_data + index * innerStride(), val);
     }
 
-    explicit inline MapBase(PointerType dataPtr) : Base(dataPtr) {}
-    inline MapBase(PointerType dataPtr, Index vecSize) : Base(dataPtr, vecSize) {}
-    inline MapBase(PointerType dataPtr, Index nbRows, Index nbCols) : Base(dataPtr, nbRows, nbCols) {}
+    EIGEN_DEVICE_FUNC explicit inline MapBase(PointerType dataPtr) : Base(dataPtr) {}
+    EIGEN_DEVICE_FUNC inline MapBase(PointerType dataPtr, Index vecSize) : Base(dataPtr, vecSize) {}
+    EIGEN_DEVICE_FUNC inline MapBase(PointerType dataPtr, Index rows, Index cols) : Base(dataPtr, rows, cols) {}
 
+    EIGEN_DEVICE_FUNC
     Derived& operator=(const MapBase& other)
     {
-      Base::Base::operator=(other);
+      ReadOnlyMapBase::Base::operator=(other);
       return derived();
     }
 
-    using Base::Base::operator=;
+    // In theory we could simply refer to Base::Base::operator=, but MSVC does not like Base::Base,
+    // see bugs 821 and 920.
+    using ReadOnlyMapBase::Base::operator=;
 };
 
+#undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS
+
 } // end namespace Eigen
 
 #endif // EIGEN_MAPBASE_H
diff --git a/nuparu/include/Eigen/src/Core/MathFunctions.h b/nuparu/include/Eigen/src/Core/MathFunctions.h
index 2bfc5ebd..48cf565f 100644
--- a/nuparu/include/Eigen/src/Core/MathFunctions.h
+++ b/nuparu/include/Eigen/src/Core/MathFunctions.h
@@ -10,8 +10,20 @@
 #ifndef EIGEN_MATHFUNCTIONS_H
 #define EIGEN_MATHFUNCTIONS_H
 
+// source: http://www.geom.uiuc.edu/~huberty/math5337/groupe/digits.html
+#define EIGEN_PI 3.141592653589793238462643383279502884197169399375105820974944592307816406
+
 namespace Eigen {
 
+// On WINCE, std::abs is defined for int only, so let's define our own overloads:
+// This issue has been confirmed with MSVC 2008 only, but the issue might exist for more recent versions too.
+#if EIGEN_OS_WINCE && EIGEN_COMP_MSVC && EIGEN_COMP_MSVC<=1500
+inline long        abs(long        x) { return (labs(x));  } // inline: defined in a header, so must not emit one definition per translation unit
+inline double      abs(double      x) { return (fabs(x));  }
+inline float       abs(float       x) { return (fabsf(x)); }
+inline long double abs(long double x) { return (fabsl(x)); }
+#endif
+  
 namespace internal {
 
 /** \internal \struct global_math_functions_filtering_base
@@ -62,6 +74,7 @@ template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
 struct real_default_impl
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
   static inline RealScalar run(const Scalar& x)
   {
     return x;
@@ -72,6 +85,7 @@ template<typename Scalar>
 struct real_default_impl<Scalar,true>
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
   static inline RealScalar run(const Scalar& x)
   {
     using std::real;
@@ -87,7 +101,6 @@ struct real_retval
   typedef typename NumTraits<Scalar>::Real type;
 };
 
-
 /****************************************************************************
 * Implementation of imag                                                 *
 ****************************************************************************/
@@ -96,6 +109,7 @@ template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
 struct imag_default_impl
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
   static inline RealScalar run(const Scalar&)
   {
     return RealScalar(0);
@@ -106,6 +120,7 @@ template<typename Scalar>
 struct imag_default_impl<Scalar,true>
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
   static inline RealScalar run(const Scalar& x)
   {
     using std::imag;
@@ -129,10 +144,12 @@ template<typename Scalar>
 struct real_ref_impl
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
   static inline RealScalar& run(Scalar& x)
   {
     return reinterpret_cast<RealScalar*>(&x)[0];
   }
+  EIGEN_DEVICE_FUNC
   static inline const RealScalar& run(const Scalar& x)
   {
     return reinterpret_cast<const RealScalar*>(&x)[0];
@@ -153,10 +170,12 @@ template<typename Scalar, bool IsComplex>
 struct imag_ref_default_impl
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
   static inline RealScalar& run(Scalar& x)
   {
     return reinterpret_cast<RealScalar*>(&x)[1];
   }
+  EIGEN_DEVICE_FUNC
   static inline const RealScalar& run(const Scalar& x)
   {
     return reinterpret_cast<RealScalar*>(&x)[1];
@@ -166,10 +185,12 @@ struct imag_ref_default_impl
 template<typename Scalar>
 struct imag_ref_default_impl<Scalar, false>
 {
+  EIGEN_DEVICE_FUNC
   static inline Scalar run(Scalar&)
   {
     return Scalar(0);
   }
+  EIGEN_DEVICE_FUNC
   static inline const Scalar run(const Scalar&)
   {
     return Scalar(0);
@@ -192,6 +213,7 @@ struct imag_ref_retval
 template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
 struct conj_impl
 {
+  EIGEN_DEVICE_FUNC
   static inline Scalar run(const Scalar& x)
   {
     return x;
@@ -201,6 +223,7 @@ struct conj_impl
 template<typename Scalar>
 struct conj_impl<Scalar,true>
 {
+  EIGEN_DEVICE_FUNC
   static inline Scalar run(const Scalar& x)
   {
     using std::conj;
@@ -218,25 +241,39 @@ struct conj_retval
 * Implementation of abs2                                                 *
 ****************************************************************************/
 
-template<typename Scalar>
-struct abs2_impl
+template<typename Scalar,bool IsComplex>
+struct abs2_impl_default
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
   static inline RealScalar run(const Scalar& x)
   {
     return x*x;
   }
 };
 
-template<typename RealScalar>
-struct abs2_impl<std::complex<RealScalar> >
+template<typename Scalar>
+struct abs2_impl_default<Scalar, true> // IsComplex
 {
-  static inline RealScalar run(const std::complex<RealScalar>& x)
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x)
   {
     return real(x)*real(x) + imag(x)*imag(x);
   }
 };
 
+template<typename Scalar>
+struct abs2_impl
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x)
+  {
+    return abs2_impl_default<Scalar,NumTraits<Scalar>::IsComplex>::run(x);
+  }
+};
+
 template<typename Scalar>
 struct abs2_retval
 {
@@ -251,9 +288,10 @@ template<typename Scalar, bool IsComplex>
 struct norm1_default_impl
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
   static inline RealScalar run(const Scalar& x)
   {
-    using std::abs;
+    EIGEN_USING_STD_MATH(abs);
     return abs(real(x)) + abs(imag(x));
   }
 };
@@ -261,9 +299,10 @@ struct norm1_default_impl
 template<typename Scalar>
 struct norm1_default_impl<Scalar, false>
 {
+  EIGEN_DEVICE_FUNC
   static inline Scalar run(const Scalar& x)
   {
-    using std::abs;
+    EIGEN_USING_STD_MATH(abs);
     return abs(x);
   }
 };
@@ -287,16 +326,22 @@ struct hypot_impl
   typedef typename NumTraits<Scalar>::Real RealScalar;
   static inline RealScalar run(const Scalar& x, const Scalar& y)
   {
-    using std::max;
-    using std::min;
-    using std::abs;
-    using std::sqrt;
+    EIGEN_USING_STD_MATH(abs);
+    EIGEN_USING_STD_MATH(sqrt);
     RealScalar _x = abs(x);
     RealScalar _y = abs(y);
-    RealScalar p = (max)(_x, _y);
-    if(p==RealScalar(0)) return 0;
-    RealScalar q = (min)(_x, _y);
-    RealScalar qp = q/p;
+    Scalar p, qp;
+    if(_x>_y)
+    {
+      p = _x;
+      qp = _y / p;
+    }
+    else
+    {
+      p = _y;
+      qp = _x / p;
+    }
+    if(p==RealScalar(0)) return RealScalar(0);
     return p * sqrt(RealScalar(1) + qp*qp);
   }
 };
@@ -314,6 +359,7 @@ struct hypot_retval
 template<typename OldType, typename NewType>
 struct cast_impl
 {
+  EIGEN_DEVICE_FUNC
   static inline NewType run(const OldType& x)
   {
     return static_cast<NewType>(x);
@@ -323,48 +369,121 @@ struct cast_impl
 // here, for once, we're plainly returning NewType: we don't want cast to do weird things.
 
 template<typename OldType, typename NewType>
+EIGEN_DEVICE_FUNC
 inline NewType cast(const OldType& x)
 {
   return cast_impl<OldType, NewType>::run(x);
 }
 
 /****************************************************************************
-* Implementation of atanh2                                                *
+* Implementation of round                                                   *
 ****************************************************************************/
 
-template<typename Scalar, bool IsInteger>
-struct atanh2_default_impl
-{
-  typedef Scalar retval;
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  static inline Scalar run(const Scalar& x, const Scalar& y)
+#if EIGEN_HAS_CXX11_MATH
+  template<typename Scalar>
+  struct round_impl {
+    static inline Scalar run(const Scalar& x)
+    {
+      EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex), NUMERIC_TYPE_MUST_BE_REAL)
+      using std::round;
+      return round(x);
+    }
+  };
+#else
+  template<typename Scalar>
+  struct round_impl
   {
-    using std::abs;
-    using std::log;
-    using std::sqrt;
-    Scalar z = x / y;
-    if (y == Scalar(0) || abs(z) > sqrt(NumTraits<RealScalar>::epsilon()))
-      return RealScalar(0.5) * log((y + x) / (y - x));
-    else
-      return z + z*z*z / RealScalar(3);
-  }
+    static inline Scalar run(const Scalar& x)
+    {
+      EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex), NUMERIC_TYPE_MUST_BE_REAL)
+      EIGEN_USING_STD_MATH(floor);
+      EIGEN_USING_STD_MATH(ceil);
+      return (x > Scalar(0)) ? floor(x + Scalar(0.5)) : ceil(x - Scalar(0.5));
+    }
+  };
+#endif
+
+template<typename Scalar>
+struct round_retval
+{
+  typedef Scalar type;
 };
 
+/****************************************************************************
+* Implementation of arg                                                     *
+****************************************************************************/
+
+#if EIGEN_HAS_CXX11_MATH
+  template<typename Scalar>
+  struct arg_impl {
+    static inline Scalar run(const Scalar& x)
+    {
+      EIGEN_USING_STD_MATH(arg);
+      return arg(x);
+    }
+  };
+#else
+  template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
+  struct arg_default_impl
+  {
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    EIGEN_DEVICE_FUNC
+    static inline RealScalar run(const Scalar& x)
+    {
+      return (x < Scalar(0)) ? Scalar(EIGEN_PI) : Scalar(0); }
+  };
+
+  template<typename Scalar>
+  struct arg_default_impl<Scalar,true>
+  {
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    EIGEN_DEVICE_FUNC
+    static inline RealScalar run(const Scalar& x)
+    {
+      EIGEN_USING_STD_MATH(arg);
+      return arg(x);
+    }
+  };
+
+  template<typename Scalar> struct arg_impl : arg_default_impl<Scalar> {};
+#endif
+
 template<typename Scalar>
-struct atanh2_default_impl<Scalar, true>
+struct arg_retval
+{
+  typedef typename NumTraits<Scalar>::Real type;
+};
+
+/****************************************************************************
+* Implementation of log1p                                                   *
+****************************************************************************/
+template<typename Scalar, bool isComplex = NumTraits<Scalar>::IsComplex >
+struct log1p_impl
 {
-  static inline Scalar run(const Scalar&, const Scalar&)
+  static inline Scalar run(const Scalar& x)
   {
     EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
-    return Scalar(0);
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    EIGEN_USING_STD_MATH(log);
+    Scalar x1p = RealScalar(1) + x;
+    return ( x1p == Scalar(1) ) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
   }
 };
 
+#if EIGEN_HAS_CXX11_MATH
 template<typename Scalar>
-struct atanh2_impl : atanh2_default_impl<Scalar, NumTraits<Scalar>::IsInteger> {};
+struct log1p_impl<Scalar, false> {
+  static inline Scalar run(const Scalar& x)
+  {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+    using std::log1p;
+    return log1p(x);
+  }
+};
+#endif
 
 template<typename Scalar>
-struct atanh2_retval
+struct log1p_retval
 {
   typedef Scalar type;
 };
@@ -379,7 +498,7 @@ struct pow_default_impl
   typedef Scalar retval;
   static inline Scalar run(const Scalar& x, const Scalar& y)
   {
-    using std::pow;
+    EIGEN_USING_STD_MATH(pow);
     return pow(x, y);
   }
 };
@@ -447,48 +566,48 @@ struct random_default_impl<Scalar, false, false>
 };
 
 enum {
-  floor_log2_terminate,
-  floor_log2_move_up,
-  floor_log2_move_down,
-  floor_log2_bogus
+  meta_floor_log2_terminate,
+  meta_floor_log2_move_up,
+  meta_floor_log2_move_down,
+  meta_floor_log2_bogus
 };
 
-template<unsigned int n, int lower, int upper> struct floor_log2_selector
+template<unsigned int n, int lower, int upper> struct meta_floor_log2_selector
 {
   enum { middle = (lower + upper) / 2,
-         value = (upper <= lower + 1) ? int(floor_log2_terminate)
-               : (n < (1 << middle)) ? int(floor_log2_move_down)
-               : (n==0) ? int(floor_log2_bogus)
-               : int(floor_log2_move_up)
+         value = (upper <= lower + 1) ? int(meta_floor_log2_terminate)
+               : (n < (1 << middle)) ? int(meta_floor_log2_move_down)
+               : (n==0) ? int(meta_floor_log2_bogus)
+               : int(meta_floor_log2_move_up)
   };
 };
 
 template<unsigned int n,
          int lower = 0,
          int upper = sizeof(unsigned int) * CHAR_BIT - 1,
-         int selector = floor_log2_selector<n, lower, upper>::value>
-struct floor_log2 {};
+         int selector = meta_floor_log2_selector<n, lower, upper>::value>
+struct meta_floor_log2 {};
 
 template<unsigned int n, int lower, int upper>
-struct floor_log2<n, lower, upper, floor_log2_move_down>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_move_down>
 {
-  enum { value = floor_log2<n, lower, floor_log2_selector<n, lower, upper>::middle>::value };
+  enum { value = meta_floor_log2<n, lower, meta_floor_log2_selector<n, lower, upper>::middle>::value };
 };
 
 template<unsigned int n, int lower, int upper>
-struct floor_log2<n, lower, upper, floor_log2_move_up>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_move_up>
 {
-  enum { value = floor_log2<n, floor_log2_selector<n, lower, upper>::middle, upper>::value };
+  enum { value = meta_floor_log2<n, meta_floor_log2_selector<n, lower, upper>::middle, upper>::value };
 };
 
 template<unsigned int n, int lower, int upper>
-struct floor_log2<n, lower, upper, floor_log2_terminate>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_terminate>
 {
   enum { value = (n >= ((unsigned int)(1) << (lower+1))) ? lower+1 : lower };
 };
 
 template<unsigned int n, int lower, int upper>
-struct floor_log2<n, lower, upper, floor_log2_bogus>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_bogus>
 {
   // no value, error at compile time
 };
@@ -496,11 +615,22 @@ struct floor_log2<n, lower, upper, floor_log2_bogus>
 template<typename Scalar>
 struct random_default_impl<Scalar, false, true>
 {
-  typedef typename NumTraits<Scalar>::NonInteger NonInteger;
-
   static inline Scalar run(const Scalar& x, const Scalar& y)
-  {
-    return x + Scalar((NonInteger(y)-x+1) * std::rand() / (RAND_MAX + NonInteger(1)));
+  { 
+    typedef typename conditional<NumTraits<Scalar>::IsSigned,std::ptrdiff_t,std::size_t>::type ScalarX;
+    if(y<x)
+      return x;
+    std::size_t range = ScalarX(y)-ScalarX(x);
+    std::size_t offset = 0;
+    // rejection sampling
+    std::size_t divisor    = (range+RAND_MAX-1)/(range+1);
+    std::size_t multiplier = (range+RAND_MAX-1)/std::size_t(RAND_MAX);
+
+    do {
+      offset = ( (std::size_t(std::rand()) * multiplier) / divisor );
+    } while (offset > range);
+
+    return Scalar(ScalarX(x) + offset);
   }
 
   static inline Scalar run()
@@ -508,7 +638,7 @@ struct random_default_impl<Scalar, false, true>
 #ifdef EIGEN_MAKING_DOCS
     return run(Scalar(NumTraits<Scalar>::IsSigned ? -10 : 0), Scalar(10));
 #else
-    enum { rand_bits = floor_log2<(unsigned int)(RAND_MAX)+1>::value,
+    enum { rand_bits = meta_floor_log2<(unsigned int)(RAND_MAX)+1>::value,
            scalar_bits = sizeof(Scalar) * CHAR_BIT,
            shift = EIGEN_PLAIN_ENUM_MAX(0, int(rand_bits) - int(scalar_bits)),
            offset = NumTraits<Scalar>::IsSigned ? (1 << (EIGEN_PLAIN_ENUM_MIN(rand_bits,scalar_bits)-1)) : 0
@@ -545,97 +675,322 @@ inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random()
   return EIGEN_MATHFUNC_IMPL(random, Scalar)::run();
 }
 
+// Implementation of is* functions
+
+// std::is* do not work with fast-math and gcc, std::is* are available on MSVC 2013 and newer, as well as in clang.
+#if (EIGEN_HAS_CXX11_MATH && !(EIGEN_COMP_GNUC_STRICT && __FINITE_MATH_ONLY__)) || (EIGEN_COMP_MSVC>=1800) || (EIGEN_COMP_CLANG)
+#define EIGEN_USE_STD_FPCLASSIFY 1
+#else
+#define EIGEN_USE_STD_FPCLASSIFY 0
+#endif
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+typename internal::enable_if<internal::is_integral<T>::value,bool>::type
+isnan_impl(const T&) { return false; }
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+typename internal::enable_if<internal::is_integral<T>::value,bool>::type
+isinf_impl(const T&) { return false; }
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+typename internal::enable_if<internal::is_integral<T>::value,bool>::type
+isfinite_impl(const T&) { return true; }
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
+isfinite_impl(const T& x)
+{
+  #if EIGEN_USE_STD_FPCLASSIFY
+    using std::isfinite;
+    return isfinite EIGEN_NOT_A_MACRO (x);
+  #else
+    return x<NumTraits<T>::highest() && x>NumTraits<T>::lowest();
+  #endif
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
+isinf_impl(const T& x)
+{
+  #if EIGEN_USE_STD_FPCLASSIFY
+    using std::isinf;
+    return isinf EIGEN_NOT_A_MACRO (x);
+  #else
+    return x>NumTraits<T>::highest() || x<NumTraits<T>::lowest();
+  #endif
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
+isnan_impl(const T& x)
+{
+  #if EIGEN_USE_STD_FPCLASSIFY
+    using std::isnan;
+    return isnan EIGEN_NOT_A_MACRO (x);
+  #else
+    return x != x;
+  #endif
+}
+
+#if (!EIGEN_USE_STD_FPCLASSIFY)
+
+#if EIGEN_COMP_MSVC
+
+template<typename T> EIGEN_DEVICE_FUNC bool isinf_msvc_helper(T x)
+{
+  return _fpclass(x)==_FPCLASS_NINF || _fpclass(x)==_FPCLASS_PINF;
+}
+
+//MSVC defines a _isnan builtin function, but for double only
+EIGEN_DEVICE_FUNC inline bool isnan_impl(const long double& x) { return _isnan(x); }
+EIGEN_DEVICE_FUNC inline bool isnan_impl(const double& x)      { return _isnan(x); }
+EIGEN_DEVICE_FUNC inline bool isnan_impl(const float& x)       { return _isnan(x); }
+
+EIGEN_DEVICE_FUNC inline bool isinf_impl(const long double& x) { return isinf_msvc_helper(x); }
+EIGEN_DEVICE_FUNC inline bool isinf_impl(const double& x)      { return isinf_msvc_helper(x); }
+EIGEN_DEVICE_FUNC inline bool isinf_impl(const float& x)       { return isinf_msvc_helper(x); }
+
+#elif (defined __FINITE_MATH_ONLY__ && __FINITE_MATH_ONLY__ && EIGEN_COMP_GNUC)
+
+#if EIGEN_GNUC_AT_LEAST(5,0)
+  #define EIGEN_TMP_NOOPT_ATTRIB EIGEN_DEVICE_FUNC inline __attribute__((optimize("no-finite-math-only")))
+#else
+  // NOTE the inline qualifier and noinline attribute are both needed: the former is to avoid linking issues (duplicate symbol),
+  //      while the latter prevents overly aggressive optimizations in fast-math mode:
+  #define EIGEN_TMP_NOOPT_ATTRIB EIGEN_DEVICE_FUNC inline __attribute__((noinline,optimize("no-finite-math-only")))
+#endif
+
+template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const long double& x) { return __builtin_isnan(x); }
+template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const double& x)      { return __builtin_isnan(x); }
+template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const float& x)       { return __builtin_isnan(x); }
+template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const double& x)      { return __builtin_isinf(x); }
+template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const float& x)       { return __builtin_isinf(x); }
+template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const long double& x) { return __builtin_isinf(x); }
+
+#undef EIGEN_TMP_NOOPT_ATTRIB
+
+#endif
+
+#endif
+
+// The following overloads are defined at the end of this file
+template<typename T> bool isfinite_impl(const std::complex<T>& x);
+template<typename T> bool isnan_impl(const std::complex<T>& x);
+template<typename T> bool isinf_impl(const std::complex<T>& x);
+
 } // end namespace internal
 
 /****************************************************************************
-* Generic math function                                                    *
+* Generic math functions                                                    *
 ****************************************************************************/
 
 namespace numext {
 
+#ifndef __CUDA_ARCH__
+template<typename T>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
+{
+  EIGEN_USING_STD_MATH(min);
+  return min EIGEN_NOT_A_MACRO (x,y);
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
+{
+  EIGEN_USING_STD_MATH(max);
+  return max EIGEN_NOT_A_MACRO (x,y);
+}
+#else
+template<typename T>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
+{
+  return y < x ? y : x;
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
+{
+  return fmin(x, y);
+}
+template<typename T>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
+{
+  return x < y ? y : x;
+}
+template<>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y)
+{
+  return fmax(x, y);
+}
+#endif
+
+
 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(real, Scalar) real(const Scalar& x)
 {
   return EIGEN_MATHFUNC_IMPL(real, Scalar)::run(x);
 }  
 
 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline typename internal::add_const_on_value_type< EIGEN_MATHFUNC_RETVAL(real_ref, Scalar) >::type real_ref(const Scalar& x)
 {
   return internal::real_ref_impl<Scalar>::run(x);
 }
 
 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(real_ref, Scalar) real_ref(Scalar& x)
 {
   return EIGEN_MATHFUNC_IMPL(real_ref, Scalar)::run(x);
 }
 
 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(imag, Scalar) imag(const Scalar& x)
 {
   return EIGEN_MATHFUNC_IMPL(imag, Scalar)::run(x);
 }
 
 template<typename Scalar>
+EIGEN_DEVICE_FUNC
+inline EIGEN_MATHFUNC_RETVAL(arg, Scalar) arg(const Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(arg, Scalar)::run(x);
+}
+
+template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline typename internal::add_const_on_value_type< EIGEN_MATHFUNC_RETVAL(imag_ref, Scalar) >::type imag_ref(const Scalar& x)
 {
   return internal::imag_ref_impl<Scalar>::run(x);
 }
 
 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(imag_ref, Scalar) imag_ref(Scalar& x)
 {
   return EIGEN_MATHFUNC_IMPL(imag_ref, Scalar)::run(x);
 }
 
 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(conj, Scalar) conj(const Scalar& x)
 {
   return EIGEN_MATHFUNC_IMPL(conj, Scalar)::run(x);
 }
 
 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x)
 {
   return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x);
 }
 
 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x)
 {
   return EIGEN_MATHFUNC_IMPL(norm1, Scalar)::run(x);
 }
 
 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar& y)
 {
   return EIGEN_MATHFUNC_IMPL(hypot, Scalar)::run(x, y);
 }
 
 template<typename Scalar>
-inline EIGEN_MATHFUNC_RETVAL(atanh2, Scalar) atanh2(const Scalar& x, const Scalar& y)
+EIGEN_DEVICE_FUNC
+inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x)
 {
-  return EIGEN_MATHFUNC_IMPL(atanh2, Scalar)::run(x, y);
+  return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x);
 }
 
 template<typename Scalar>
+EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(pow, Scalar) pow(const Scalar& x, const Scalar& y)
 {
   return EIGEN_MATHFUNC_IMPL(pow, Scalar)::run(x, y);
 }
 
-// std::isfinite is non standard, so let's define our own version,
-// even though it is not very efficient.
-template<typename T> bool (isfinite)(const T& x)
+template<typename T> EIGEN_DEVICE_FUNC bool (isnan)   (const T &x) { return internal::isnan_impl(x); }
+template<typename T> EIGEN_DEVICE_FUNC bool (isinf)   (const T &x) { return internal::isinf_impl(x); }
+template<typename T> EIGEN_DEVICE_FUNC bool (isfinite)(const T &x) { return internal::isfinite_impl(x); }
+
+template<typename Scalar>
+EIGEN_DEVICE_FUNC
+inline EIGEN_MATHFUNC_RETVAL(round, Scalar) round(const Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(round, Scalar)::run(x);
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+T (floor)(const T& x)
+{
+  EIGEN_USING_STD_MATH(floor);
+  return floor(x);
+}
+
+template<typename T>
+EIGEN_DEVICE_FUNC
+T (ceil)(const T& x)
 {
-  return x<NumTraits<T>::highest() && x>NumTraits<T>::lowest();
+  EIGEN_USING_STD_MATH(ceil);
+  return ceil(x);
+}
+
+// Log base 2 for 32 bits positive integers.
+// Conveniently returns 0 for x==0.
+inline int log2(int x)
+{
+  eigen_assert(x>=0);
+  unsigned int v(x);
+  static const int table[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
+  v |= v >> 1;
+  v |= v >> 2;
+  v |= v >> 4;
+  v |= v >> 8;
+  v |= v >> 16;
+  return table[(v * 0x07C4ACDDU) >> 27];
 }
 
 } // end namespace numext
 
 namespace internal {
 
+template<typename T>
+bool isfinite_impl(const std::complex<T>& x)
+{
+  return (numext::isfinite)(numext::real(x)) && (numext::isfinite)(numext::imag(x));
+}
+
+template<typename T>
+bool isnan_impl(const std::complex<T>& x)
+{
+  return (numext::isnan)(numext::real(x)) || (numext::isnan)(numext::imag(x));
+}
+
+template<typename T>
+bool isinf_impl(const std::complex<T>& x)
+{
+  return ((numext::isinf)(numext::real(x)) || (numext::isinf)(numext::imag(x))) && (!(numext::isnan)(x));
+}
+
 /****************************************************************************
 * Implementation of fuzzy comparisons                                       *
 ****************************************************************************/
@@ -649,18 +1004,19 @@ template<typename Scalar>
 struct scalar_fuzzy_default_impl<Scalar, false, false>
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  template<typename OtherScalar>
+  template<typename OtherScalar> EIGEN_DEVICE_FUNC
   static inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y, const RealScalar& prec)
   {
-    using std::abs;
+    EIGEN_USING_STD_MATH(abs);
     return abs(x) <= abs(y) * prec;
   }
+  EIGEN_DEVICE_FUNC
   static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec)
   {
-    using std::min;
-    using std::abs;
-    return abs(x - y) <= (min)(abs(x), abs(y)) * prec;
+    EIGEN_USING_STD_MATH(abs);
+    return abs(x - y) <= numext::mini(abs(x), abs(y)) * prec;
   }
+  EIGEN_DEVICE_FUNC
   static inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y, const RealScalar& prec)
   {
     return x <= y || isApprox(x, y, prec);
@@ -671,15 +1027,17 @@ template<typename Scalar>
 struct scalar_fuzzy_default_impl<Scalar, false, true>
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  template<typename OtherScalar>
+  template<typename OtherScalar> EIGEN_DEVICE_FUNC
   static inline bool isMuchSmallerThan(const Scalar& x, const Scalar&, const RealScalar&)
   {
     return x == Scalar(0);
   }
+  EIGEN_DEVICE_FUNC
   static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar&)
   {
     return x == y;
   }
+  EIGEN_DEVICE_FUNC
   static inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y, const RealScalar&)
   {
     return x <= y;
@@ -697,29 +1055,28 @@ struct scalar_fuzzy_default_impl<Scalar, true, false>
   }
   static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec)
   {
-    using std::min;
-    return numext::abs2(x - y) <= (min)(numext::abs2(x), numext::abs2(y)) * prec * prec;
+    return numext::abs2(x - y) <= numext::mini(numext::abs2(x), numext::abs2(y)) * prec * prec;
   }
 };
 
 template<typename Scalar>
 struct scalar_fuzzy_impl : scalar_fuzzy_default_impl<Scalar, NumTraits<Scalar>::IsComplex, NumTraits<Scalar>::IsInteger> {};
 
-template<typename Scalar, typename OtherScalar>
+template<typename Scalar, typename OtherScalar> EIGEN_DEVICE_FUNC
 inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y,
                                    typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
 {
   return scalar_fuzzy_impl<Scalar>::template isMuchSmallerThan<OtherScalar>(x, y, precision);
 }
 
-template<typename Scalar>
+template<typename Scalar> EIGEN_DEVICE_FUNC
 inline bool isApprox(const Scalar& x, const Scalar& y,
                           typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
 {
   return scalar_fuzzy_impl<Scalar>::isApprox(x, y, precision);
 }
 
-template<typename Scalar>
+template<typename Scalar> EIGEN_DEVICE_FUNC
 inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y,
                                     typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
 {
@@ -742,17 +1099,19 @@ template<> struct scalar_fuzzy_impl<bool>
 {
   typedef bool RealScalar;
   
-  template<typename OtherScalar>
+  template<typename OtherScalar> EIGEN_DEVICE_FUNC
   static inline bool isMuchSmallerThan(const bool& x, const bool&, const bool&)
   {
     return !x;
   }
   
+  EIGEN_DEVICE_FUNC
   static inline bool isApprox(bool x, bool y, bool)
   {
     return x == y;
   }
 
+  EIGEN_DEVICE_FUNC
   static inline bool isApproxOrLessThan(const bool& x, const bool& y, const bool&)
   {
     return (!x) || y;
diff --git a/nuparu/include/Eigen/src/Core/Matrix.h b/nuparu/include/Eigen/src/Core/Matrix.h
index 0ba5d90c..ce1b70d2 100644
--- a/nuparu/include/Eigen/src/Core/Matrix.h
+++ b/nuparu/include/Eigen/src/Core/Matrix.h
@@ -24,13 +24,13 @@ namespace Eigen {
   * The %Matrix class encompasses \em both fixed-size and dynamic-size objects (\ref fixedsize "note").
   *
   * The first three template parameters are required:
-  * \tparam _Scalar \anchor matrix_tparam_scalar Numeric type, e.g. float, double, int or std::complex<float>.
-  *                 User defined sclar types are supported as well (see \ref user_defined_scalars "here").
+  * \tparam _Scalar Numeric type, e.g. float, double, int or std::complex<float>.
+  *                 User defined scalar types are supported as well (see \ref user_defined_scalars "here").
   * \tparam _Rows Number of rows, or \b Dynamic
   * \tparam _Cols Number of columns, or \b Dynamic
   *
   * The remaining template parameters are optional -- in most cases you don't have to worry about them.
-  * \tparam _Options \anchor matrix_tparam_options A combination of either \b #RowMajor or \b #ColMajor, and of either
+  * \tparam _Options A combination of either \b #RowMajor or \b #ColMajor, and of either
   *                 \b #AutoAlign or \b #DontAlign.
   *                 The former controls \ref TopicStorageOrders "storage order", and defaults to column-major. The latter controls alignment, which is required
   *                 for vectorization. It defaults to aligning matrices except for fixed sizes that aren't a multiple of the packet size.
@@ -97,6 +97,40 @@ namespace Eigen {
   * are the dimensions of the original matrix, while _Rows and _Cols are Dynamic.</dd>
   * </dl>
   *
+  * <i><b>ABI and storage layout</b></i>
+  * 
+  * The table below summarizes the ABI of some possible Matrix instances which is fixed throughout the lifetime of Eigen 3.
+  * <table  class="manual">
+  * <tr><th>Matrix type</th><th>Equivalent C structure</th></tr>
+  * <tr><td>\code Matrix<T,Dynamic,Dynamic> \endcode</td><td>\code
+  * struct {
+  *   T *data;                  // with (size_t(data)%EIGEN_MAX_ALIGN_BYTES)==0
+  *   Eigen::Index rows, cols;
+  *  };
+  * \endcode</td></tr>
+  * <tr class="alt"><td>\code
+  * Matrix<T,Dynamic,1>
+  * Matrix<T,1,Dynamic> \endcode</td><td>\code
+  * struct {
+  *   T *data;                  // with (size_t(data)%EIGEN_MAX_ALIGN_BYTES)==0
+  *   Eigen::Index size;
+  *  };
+  * \endcode</td></tr>
+  * <tr><td>\code Matrix<T,Rows,Cols> \endcode</td><td>\code
+  * struct {
+  *   T data[Rows*Cols];        // with (size_t(data)%A(Rows*Cols*sizeof(T)))==0
+  *  };
+  * \endcode</td></tr>
+  * <tr class="alt"><td>\code Matrix<T,Dynamic,Dynamic,0,MaxRows,MaxCols> \endcode</td><td>\code
+  * struct {
+  *   T data[MaxRows*MaxCols];  // with (size_t(data)%A(MaxRows*MaxCols*sizeof(T)))==0
+  *   Eigen::Index rows, cols;
+  *  };
+  * \endcode</td></tr>
+  * </table>
+  * Note that in this table Rows, Cols, MaxRows and MaxCols are all positive integers. A(S) is defined as the largest possible power-of-two
+  * smaller than EIGEN_MAX_STATIC_ALIGN_BYTES.
+  * 
   * \see MatrixBase for the majority of the API methods for matrices, \ref TopicClassHierarchy, 
   * \ref TopicStorageOrders 
   */
@@ -105,9 +139,23 @@ namespace internal {
 template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
 struct traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
 {
+private:
+  enum { size = internal::size_at_compile_time<_Rows,_Cols>::ret };
+  typedef typename find_best_packet<_Scalar,size>::type PacketScalar;
+  enum {
+      row_major_bit = _Options&RowMajor ? RowMajorBit : 0,
+      is_dynamic_size_storage = _MaxRows==Dynamic || _MaxCols==Dynamic,
+      max_size = is_dynamic_size_storage ? Dynamic : _MaxRows*_MaxCols,
+      default_alignment = compute_default_alignment<_Scalar,max_size>::value,
+      actual_alignment = ((_Options&DontAlign)==0) ? default_alignment : 0,
+      required_alignment = unpacket_traits<PacketScalar>::alignment,
+      packet_access_bit = packet_traits<_Scalar>::Vectorizable && (actual_alignment>=required_alignment) ? PacketAccessBit : 0
+    };
+    
+public:
   typedef _Scalar Scalar;
   typedef Dense StorageKind;
-  typedef DenseIndex Index;
+  typedef Eigen::Index StorageIndex;
   typedef MatrixXpr XprKind;
   enum {
     RowsAtCompileTime = _Rows,
@@ -115,10 +163,13 @@ struct traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
     MaxRowsAtCompileTime = _MaxRows,
     MaxColsAtCompileTime = _MaxCols,
     Flags = compute_matrix_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret,
-    CoeffReadCost = NumTraits<Scalar>::ReadCost,
     Options = _Options,
     InnerStrideAtCompileTime = 1,
-    OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime
+    OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime,
+    
+    // FIXME, the following flag is only used to define NeedsToAlign in PlainObjectBase
+    EvaluatorFlags = LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit,
+    Alignment = actual_alignment
   };
 };
 }
@@ -151,6 +202,7 @@ class Matrix
       *
       * \callgraph
       */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Matrix& operator=(const Matrix& other)
     {
       return Base::_set(other);
@@ -167,7 +219,8 @@ class Matrix
       * remain row-vectors and vectors remain vectors.
       */
     template<typename OtherDerived>
-    EIGEN_STRONG_INLINE Matrix& operator=(const MatrixBase<OtherDerived>& other)
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Matrix& operator=(const DenseBase<OtherDerived>& other)
     {
       return Base::_set(other);
     }
@@ -179,12 +232,14 @@ class Matrix
       * \copydetails DenseBase::operator=(const EigenBase<OtherDerived> &other)
       */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Matrix& operator=(const EigenBase<OtherDerived> &other)
     {
       return Base::operator=(other);
     }
 
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Matrix& operator=(const ReturnByValue<OtherDerived>& func)
     {
       return Base::operator=(func);
@@ -200,6 +255,7 @@ class Matrix
       *
       * \sa resize(Index,Index)
       */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Matrix() : Base()
     {
       Base::_check_template_params();
@@ -207,45 +263,87 @@ class Matrix
     }
 
     // FIXME is it still needed
-    Matrix(internal::constructor_without_unaligned_array_assert)
+    EIGEN_DEVICE_FUNC
+    explicit Matrix(internal::constructor_without_unaligned_array_assert)
       : Base(internal::constructor_without_unaligned_array_assert())
     { Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED }
 
-    /** \brief Constructs a vector or row-vector with given dimension. \only_for_vectors
-      *
-      * Note that this is only useful for dynamic-size vectors. For fixed-size vectors,
-      * it is redundant to pass the dimension here, so it makes more sense to use the default
-      * constructor Matrix() instead.
-      */
-    EIGEN_STRONG_INLINE explicit Matrix(Index dim)
-      : Base(dim, RowsAtCompileTime == 1 ? 1 : dim, ColsAtCompileTime == 1 ? 1 : dim)
+#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC
+    Matrix(Matrix&& other)
+      : Base(std::move(other))
     {
       Base::_check_template_params();
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Matrix)
-      eigen_assert(dim >= 0);
-      eigen_assert(SizeAtCompileTime == Dynamic || SizeAtCompileTime == dim);
-      EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+      if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic)
+        Base::_set_noalias(other);
     }
+    EIGEN_DEVICE_FUNC
+    Matrix& operator=(Matrix&& other)
+    {
+      other.swap(*this);
+      return *this;
+    }
+#endif
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
+
+    // This constructor is for both 1x1 matrices and dynamic vectors
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE explicit Matrix(const T& x)
+    {
+      Base::_check_template_params();
+      Base::template _init1<T>(x);
+    }
+
     template<typename T0, typename T1>
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Matrix(const T0& x, const T1& y)
     {
       Base::_check_template_params();
       Base::template _init2<T0,T1>(x, y);
     }
     #else
+    /** \brief Constructs a fixed-sized matrix initialized with coefficients starting at \a data */
+    EIGEN_DEVICE_FUNC
+    explicit Matrix(const Scalar *data);
+
+    /** \brief Constructs a vector or row-vector with given dimension. \only_for_vectors
+      *
+      * This is useful for dynamic-size vectors. For fixed-size vectors,
+      * it is redundant to pass these parameters, so one should use the default constructor
+      * Matrix() instead.
+      * 
+      * \warning This constructor is disabled for fixed-size \c 1x1 matrices. For instance,
+      * calling Matrix<double,1,1>(1) will call the initialization constructor: Matrix(const Scalar&).
+      * For fixed-size \c 1x1 matrices it is therefore recommended to use the default
+      * constructor Matrix() instead, especially when using one of the non-standard
+      * \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives).
+      */
+    EIGEN_STRONG_INLINE explicit Matrix(Index dim);
+    /** \brief Constructs an initialized 1x1 matrix with the given coefficient */
+    Matrix(const Scalar& x);
     /** \brief Constructs an uninitialized matrix with \a rows rows and \a cols columns.
       *
       * This is useful for dynamic-size matrices. For fixed-size matrices,
       * it is redundant to pass these parameters, so one should use the default constructor
-      * Matrix() instead. */
+      * Matrix() instead.
+      * 
+      * \warning This constructor is disabled for fixed-size \c 1x2 and \c 2x1 vectors. For instance,
+      * calling Matrix2f(2,1) will call the initialization constructor: Matrix(const Scalar& x, const Scalar& y).
+      * For fixed-size \c 1x2 or \c 2x1 vectors it is therefore recommended to use the default
+      * constructor Matrix() instead, especially when using one of the non-standard
+      * \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives).
+      */
+    EIGEN_DEVICE_FUNC
     Matrix(Index rows, Index cols);
+    
     /** \brief Constructs an initialized 2D vector with given coefficients */
     Matrix(const Scalar& x, const Scalar& y);
     #endif
 
     /** \brief Constructs an initialized 3D vector with given coefficients */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z)
     {
       Base::_check_template_params();
@@ -255,6 +353,7 @@ class Matrix
       m_storage.data()[2] = z;
     }
     /** \brief Constructs an initialized 4D vector with given coefficients */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z, const Scalar& w)
     {
       Base::_check_template_params();
@@ -265,76 +364,33 @@ class Matrix
       m_storage.data()[3] = w;
     }
 
-    explicit Matrix(const Scalar *data);
 
-    /** \brief Constructor copying the value of the expression \a other */
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE Matrix(const MatrixBase<OtherDerived>& other)
-             : Base(other.rows() * other.cols(), other.rows(), other.cols())
-    {
-      // This test resides here, to bring the error messages closer to the user. Normally, these checks
-      // are performed deeply within the library, thus causing long and scary error traces.
-      EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
-        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-
-      Base::_check_template_params();
-      Base::_set_noalias(other);
-    }
     /** \brief Copy constructor */
-    EIGEN_STRONG_INLINE Matrix(const Matrix& other)
-            : Base(other.rows() * other.cols(), other.rows(), other.cols())
-    {
-      Base::_check_template_params();
-      Base::_set_noalias(other);
-    }
-    /** \brief Copy constructor with in-place evaluation */
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE Matrix(const ReturnByValue<OtherDerived>& other)
-    {
-      Base::_check_template_params();
-      Base::resize(other.rows(), other.cols());
-      other.evalTo(*this);
-    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Matrix(const Matrix& other) : Base(other)
+    { }
 
     /** \brief Copy constructor for generic expressions.
       * \sa MatrixBase::operator=(const EigenBase<OtherDerived>&)
       */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Matrix(const EigenBase<OtherDerived> &other)
-      : Base(other.derived().rows() * other.derived().cols(), other.derived().rows(), other.derived().cols())
-    {
-      Base::_check_template_params();
-      Base::resize(other.rows(), other.cols());
-      // FIXME/CHECK: isn't *this = other.derived() more efficient. it allows to
-      //              go for pure _set() implementations, right?
-      *this = other;
-    }
-
-    /** \internal
-      * \brief Override MatrixBase::swap() since for dynamic-sized matrices
-      * of same type it is enough to swap the data pointers.
-      */
-    template<typename OtherDerived>
-    void swap(MatrixBase<OtherDerived> const & other)
-    { this->_swap(other.derived()); }
+      : Base(other.derived())
+    { }
 
-    inline Index innerStride() const { return 1; }
-    inline Index outerStride() const { return this->innerSize(); }
+    EIGEN_DEVICE_FUNC inline Index innerStride() const { return 1; }
+    EIGEN_DEVICE_FUNC inline Index outerStride() const { return this->innerSize(); }
 
     /////////// Geometry module ///////////
 
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     explicit Matrix(const RotationBase<OtherDerived,ColsAtCompileTime>& r);
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     Matrix& operator=(const RotationBase<OtherDerived,ColsAtCompileTime>& r);
 
-    #ifdef EIGEN2_SUPPORT
-    template<typename OtherDerived>
-    explicit Matrix(const eigen2_RotationBase<OtherDerived,ColsAtCompileTime>& r);
-    template<typename OtherDerived>
-    Matrix& operator=(const eigen2_RotationBase<OtherDerived,ColsAtCompileTime>& r);
-    #endif
-
     // allow to extend Matrix outside Eigen
     #ifdef EIGEN_MATRIX_PLUGIN
     #include EIGEN_MATRIX_PLUGIN
diff --git a/nuparu/include/Eigen/src/Core/MatrixBase.h b/nuparu/include/Eigen/src/Core/MatrixBase.h
index 9193b6ab..9d612c85 100644
--- a/nuparu/include/Eigen/src/Core/MatrixBase.h
+++ b/nuparu/include/Eigen/src/Core/MatrixBase.h
@@ -52,7 +52,7 @@ template<typename Derived> class MatrixBase
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     typedef MatrixBase StorageBaseType;
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
+    typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef typename internal::packet_traits<Scalar>::type PacketScalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
@@ -66,8 +66,7 @@ template<typename Derived> class MatrixBase
     using Base::MaxSizeAtCompileTime;
     using Base::IsVectorAtCompileTime;
     using Base::Flags;
-    using Base::CoeffReadCost;
-
+    
     using Base::derived;
     using Base::const_cast_derived;
     using Base::rows;
@@ -81,6 +80,8 @@ template<typename Derived> class MatrixBase
     using Base::operator-=;
     using Base::operator*=;
     using Base::operator/=;
+    using Base::operator*;
+    using Base::operator/;
 
     typedef typename Base::CoeffReturnType CoeffReturnType;
     typedef typename Base::ConstTransposeReturnType ConstTransposeReturnType;
@@ -98,25 +99,14 @@ template<typename Derived> class MatrixBase
 
     /** \returns the size of the main diagonal, which is min(rows(),cols()).
       * \sa rows(), cols(), SizeAtCompileTime. */
+    EIGEN_DEVICE_FUNC
     inline Index diagonalSize() const { return (std::min)(rows(),cols()); }
 
-    /** \brief The plain matrix type corresponding to this expression.
-      *
-      * This is not necessarily exactly the return type of eval(). In the case of plain matrices,
-      * the return type of eval() is a const reference to a matrix, not a matrix! It is however guaranteed
-      * that the return type of eval() is either PlainObject or const PlainObject&.
-      */
-    typedef Matrix<typename internal::traits<Derived>::Scalar,
-                internal::traits<Derived>::RowsAtCompileTime,
-                internal::traits<Derived>::ColsAtCompileTime,
-                AutoAlign | (internal::traits<Derived>::Flags&RowMajorBit ? RowMajor : ColMajor),
-                internal::traits<Derived>::MaxRowsAtCompileTime,
-                internal::traits<Derived>::MaxColsAtCompileTime
-          > PlainObject;
+    typedef typename Base::PlainObject PlainObject;
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     /** \internal Represents a matrix with all coefficients equal to one another*/
-    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,Derived> ConstantReturnType;
+    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,PlainObject> ConstantReturnType;
     /** \internal the return type of MatrixBase::adjoint() */
     typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
                         CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, ConstTransposeReturnType>,
@@ -125,7 +115,7 @@ template<typename Derived> class MatrixBase
     /** \internal Return type of eigenvalues() */
     typedef Matrix<std::complex<RealScalar>, internal::traits<Derived>::ColsAtCompileTime, 1, ColMajor> EigenvaluesReturnType;
     /** \internal the return type of identity */
-    typedef CwiseNullaryOp<internal::scalar_identity_op<Scalar>,Derived> IdentityReturnType;
+    typedef CwiseNullaryOp<internal::scalar_identity_op<Scalar>,PlainObject> IdentityReturnType;
     /** \internal the return type of unit vectors */
     typedef Block<const CwiseNullaryOp<internal::scalar_identity_op<Scalar>, SquareMatrixType>,
                   internal::traits<Derived>::RowsAtCompileTime,
@@ -145,39 +135,48 @@ template<typename Derived> class MatrixBase
     /** Special case of the template operator=, in order to prevent the compiler
       * from generating a default operator= (issue hit with g++ 4.1)
       */
+    EIGEN_DEVICE_FUNC
     Derived& operator=(const MatrixBase& other);
 
     // We cannot inherit here via Base::operator= since it is causing
     // trouble with MSVC.
 
     template <typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     Derived& operator=(const DenseBase<OtherDerived>& other);
 
     template <typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     Derived& operator=(const EigenBase<OtherDerived>& other);
 
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     Derived& operator=(const ReturnByValue<OtherDerived>& other);
 
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    Derived& lazyAssign(const ProductBase<ProductDerived, Lhs,Rhs>& other);
-
-    template<typename MatrixPower, typename Lhs, typename Rhs>
-    Derived& lazyAssign(const MatrixPowerProduct<MatrixPower, Lhs,Rhs>& other);
-#endif // not EIGEN_PARSED_BY_DOXYGEN
-
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     Derived& operator+=(const MatrixBase<OtherDerived>& other);
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     Derived& operator-=(const MatrixBase<OtherDerived>& other);
 
+#ifdef __CUDACC__
     template<typename OtherDerived>
-    const typename ProductReturnType<Derived,OtherDerived>::Type
+    EIGEN_DEVICE_FUNC
+    const Product<Derived,OtherDerived,LazyProduct>
+    operator*(const MatrixBase<OtherDerived> &other) const
+    { return this->lazyProduct(other); }
+#else
+
+    template<typename OtherDerived>
+    const Product<Derived,OtherDerived>
     operator*(const MatrixBase<OtherDerived> &other) const;
 
+#endif
+
     template<typename OtherDerived>
-    const typename LazyProductReturnType<Derived,OtherDerived>::Type
+    EIGEN_DEVICE_FUNC 
+    const Product<Derived,OtherDerived,LazyProduct>
     lazyProduct(const MatrixBase<OtherDerived> &other) const;
 
     template<typename OtherDerived>
@@ -190,88 +189,91 @@ template<typename Derived> class MatrixBase
     void applyOnTheRight(const EigenBase<OtherDerived>& other);
 
     template<typename DiagonalDerived>
-    const DiagonalProduct<Derived, DiagonalDerived, OnTheRight>
+    EIGEN_DEVICE_FUNC
+    const Product<Derived, DiagonalDerived, LazyProduct>
     operator*(const DiagonalBase<DiagonalDerived> &diagonal) const;
 
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     typename internal::scalar_product_traits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
     dot(const MatrixBase<OtherDerived>& other) const;
 
-    #ifdef EIGEN2_SUPPORT
-      template<typename OtherDerived>
-      Scalar eigen2_dot(const MatrixBase<OtherDerived>& other) const;
-    #endif
-
-    RealScalar squaredNorm() const;
-    RealScalar norm() const;
+    EIGEN_DEVICE_FUNC RealScalar squaredNorm() const;
+    EIGEN_DEVICE_FUNC RealScalar norm() const;
     RealScalar stableNorm() const;
     RealScalar blueNorm() const;
     RealScalar hypotNorm() const;
-    const PlainObject normalized() const;
-    void normalize();
+    EIGEN_DEVICE_FUNC const PlainObject normalized() const;
+    EIGEN_DEVICE_FUNC void normalize();
 
-    const AdjointReturnType adjoint() const;
-    void adjointInPlace();
+    EIGEN_DEVICE_FUNC const AdjointReturnType adjoint() const;
+    EIGEN_DEVICE_FUNC void adjointInPlace();
 
     typedef Diagonal<Derived> DiagonalReturnType;
+    EIGEN_DEVICE_FUNC
     DiagonalReturnType diagonal();
-	typedef typename internal::add_const<Diagonal<const Derived> >::type ConstDiagonalReturnType;
+    
+    typedef typename internal::add_const<Diagonal<const Derived> >::type ConstDiagonalReturnType;
+    EIGEN_DEVICE_FUNC
     ConstDiagonalReturnType diagonal() const;
 
     template<int Index> struct DiagonalIndexReturnType { typedef Diagonal<Derived,Index> Type; };
     template<int Index> struct ConstDiagonalIndexReturnType { typedef const Diagonal<const Derived,Index> Type; };
 
-    template<int Index> typename DiagonalIndexReturnType<Index>::Type diagonal();
-    template<int Index> typename ConstDiagonalIndexReturnType<Index>::Type diagonal() const;
-
-    // Note: The "MatrixBase::" prefixes are added to help MSVC9 to match these declarations with the later implementations.
-    // On the other hand they confuse MSVC8...
-    #if (defined _MSC_VER) && (_MSC_VER >= 1500) // 2008 or later
-    typename MatrixBase::template DiagonalIndexReturnType<DynamicIndex>::Type diagonal(Index index);
-    typename MatrixBase::template ConstDiagonalIndexReturnType<DynamicIndex>::Type diagonal(Index index) const;
-    #else
-    typename DiagonalIndexReturnType<DynamicIndex>::Type diagonal(Index index);
-    typename ConstDiagonalIndexReturnType<DynamicIndex>::Type diagonal(Index index) const;
-    #endif
-
-    #ifdef EIGEN2_SUPPORT
-    template<unsigned int Mode> typename internal::eigen2_part_return_type<Derived, Mode>::type part();
-    template<unsigned int Mode> const typename internal::eigen2_part_return_type<Derived, Mode>::type part() const;
+    template<int Index> 
+    EIGEN_DEVICE_FUNC
+    typename DiagonalIndexReturnType<Index>::Type diagonal();
+
+    template<int Index>
+    EIGEN_DEVICE_FUNC
+    typename ConstDiagonalIndexReturnType<Index>::Type diagonal() const;
     
-    // huuuge hack. make Eigen2's matrix.part<Diagonal>() work in eigen3. Problem: Diagonal is now a class template instead
-    // of an integer constant. Solution: overload the part() method template wrt template parameters list.
-    template<template<typename T, int N> class U>
-    const DiagonalWrapper<ConstDiagonalReturnType> part() const
-    { return diagonal().asDiagonal(); }
-    #endif // EIGEN2_SUPPORT
+    typedef Diagonal<Derived,DynamicIndex> DiagonalDynamicIndexReturnType;
+    typedef typename internal::add_const<Diagonal<const Derived,DynamicIndex> >::type ConstDiagonalDynamicIndexReturnType;
+
+    EIGEN_DEVICE_FUNC
+    DiagonalDynamicIndexReturnType diagonal(Index index);
+    EIGEN_DEVICE_FUNC
+    ConstDiagonalDynamicIndexReturnType diagonal(Index index) const;
 
     template<unsigned int Mode> struct TriangularViewReturnType { typedef TriangularView<Derived, Mode> Type; };
     template<unsigned int Mode> struct ConstTriangularViewReturnType { typedef const TriangularView<const Derived, Mode> Type; };
 
-    template<unsigned int Mode> typename TriangularViewReturnType<Mode>::Type triangularView();
-    template<unsigned int Mode> typename ConstTriangularViewReturnType<Mode>::Type triangularView() const;
+    template<unsigned int Mode>
+    EIGEN_DEVICE_FUNC
+    typename TriangularViewReturnType<Mode>::Type triangularView();
+    template<unsigned int Mode>
+    EIGEN_DEVICE_FUNC
+    typename ConstTriangularViewReturnType<Mode>::Type triangularView() const;
 
     template<unsigned int UpLo> struct SelfAdjointViewReturnType { typedef SelfAdjointView<Derived, UpLo> Type; };
     template<unsigned int UpLo> struct ConstSelfAdjointViewReturnType { typedef const SelfAdjointView<const Derived, UpLo> Type; };
 
-    template<unsigned int UpLo> typename SelfAdjointViewReturnType<UpLo>::Type selfadjointView();
-    template<unsigned int UpLo> typename ConstSelfAdjointViewReturnType<UpLo>::Type selfadjointView() const;
+    template<unsigned int UpLo> 
+    EIGEN_DEVICE_FUNC
+    typename SelfAdjointViewReturnType<UpLo>::Type selfadjointView();
+    template<unsigned int UpLo>
+    EIGEN_DEVICE_FUNC
+    typename ConstSelfAdjointViewReturnType<UpLo>::Type selfadjointView() const;
 
     const SparseView<Derived> sparseView(const Scalar& m_reference = Scalar(0),
                                          const typename NumTraits<Scalar>::Real& m_epsilon = NumTraits<Scalar>::dummy_precision()) const;
-    static const IdentityReturnType Identity();
-    static const IdentityReturnType Identity(Index rows, Index cols);
-    static const BasisReturnType Unit(Index size, Index i);
-    static const BasisReturnType Unit(Index i);
-    static const BasisReturnType UnitX();
-    static const BasisReturnType UnitY();
-    static const BasisReturnType UnitZ();
-    static const BasisReturnType UnitW();
-
+    EIGEN_DEVICE_FUNC static const IdentityReturnType Identity();
+    EIGEN_DEVICE_FUNC static const IdentityReturnType Identity(Index rows, Index cols);
+    EIGEN_DEVICE_FUNC static const BasisReturnType Unit(Index size, Index i);
+    EIGEN_DEVICE_FUNC static const BasisReturnType Unit(Index i);
+    EIGEN_DEVICE_FUNC static const BasisReturnType UnitX();
+    EIGEN_DEVICE_FUNC static const BasisReturnType UnitY();
+    EIGEN_DEVICE_FUNC static const BasisReturnType UnitZ();
+    EIGEN_DEVICE_FUNC static const BasisReturnType UnitW();
+
+    EIGEN_DEVICE_FUNC
     const DiagonalWrapper<const Derived> asDiagonal() const;
     const PermutationWrapper<const Derived> asPermutation() const;
 
+    EIGEN_DEVICE_FUNC
     Derived& setIdentity();
+    EIGEN_DEVICE_FUNC
     Derived& setIdentity(Index rows, Index cols);
 
     bool isIdentity(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
@@ -303,59 +305,49 @@ template<typename Derived> class MatrixBase
 
     NoAlias<Derived,Eigen::MatrixBase > noalias();
 
-    inline const ForceAlignedAccess<Derived> forceAlignedAccess() const;
-    inline ForceAlignedAccess<Derived> forceAlignedAccess();
-    template<bool Enable> inline typename internal::add_const_on_value_type<typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type>::type forceAlignedAccessIf() const;
-    template<bool Enable> inline typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf();
+    // TODO forceAlignedAccess is temporarily disabled
+    // Need to find a nicer workaround.
+    inline const Derived& forceAlignedAccess() const { return derived(); }
+    inline Derived& forceAlignedAccess() { return derived(); }
+    template<bool Enable> inline const Derived& forceAlignedAccessIf() const { return derived(); }
+    template<bool Enable> inline Derived& forceAlignedAccessIf() { return derived(); }
 
-    Scalar trace() const;
+    EIGEN_DEVICE_FUNC Scalar trace() const;
 
-/////////// Array module ///////////
+    template<int p> EIGEN_DEVICE_FUNC RealScalar lpNorm() const;
 
-    template<int p> RealScalar lpNorm() const;
-
-    MatrixBase<Derived>& matrix() { return *this; }
-    const MatrixBase<Derived>& matrix() const { return *this; }
+    EIGEN_DEVICE_FUNC MatrixBase<Derived>& matrix() { return *this; }
+    EIGEN_DEVICE_FUNC const MatrixBase<Derived>& matrix() const { return *this; }
 
     /** \returns an \link Eigen::ArrayBase Array \endlink expression of this matrix
       * \sa ArrayBase::matrix() */
-    ArrayWrapper<Derived> array() { return derived(); }
-    const ArrayWrapper<const Derived> array() const { return derived(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ArrayWrapper<Derived> array() { return ArrayWrapper<Derived>(derived()); }
+    /** \returns a const \link Eigen::ArrayBase Array \endlink expression of this matrix
+      * \sa ArrayBase::matrix() */
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const ArrayWrapper<const Derived> array() const { return ArrayWrapper<const Derived>(derived()); }
 
 /////////// LU module ///////////
 
-    const FullPivLU<PlainObject> fullPivLu() const;
-    const PartialPivLU<PlainObject> partialPivLu() const;
+    EIGEN_DEVICE_FUNC
+    inline const FullPivLU<PlainObject> fullPivLu() const;
+    EIGEN_DEVICE_FUNC
+    inline const PartialPivLU<PlainObject> partialPivLu() const;
 
-    #if EIGEN2_SUPPORT_STAGE < STAGE20_RESOLVE_API_CONFLICTS
-    const LU<PlainObject> lu() const;
-    #endif
+    EIGEN_DEVICE_FUNC
+    inline const PartialPivLU<PlainObject> lu() const;
 
-    #ifdef EIGEN2_SUPPORT
-    const LU<PlainObject> eigen2_lu() const;
-    #endif
-
-    #if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
-    const PartialPivLU<PlainObject> lu() const;
-    #endif
+    EIGEN_DEVICE_FUNC
+    inline const Inverse<Derived> inverse() const;
     
-    #ifdef EIGEN2_SUPPORT
     template<typename ResultType>
-    void computeInverse(MatrixBase<ResultType> *result) const {
-      *result = this->inverse();
-    }
-    #endif
-
-    const internal::inverse_impl<Derived> inverse() const;
-    template<typename ResultType>
-    void computeInverseAndDetWithCheck(
+    inline void computeInverseAndDetWithCheck(
       ResultType& inverse,
       typename ResultType::Scalar& determinant,
       bool& invertible,
       const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
     ) const;
     template<typename ResultType>
-    void computeInverseWithCheck(
+    inline void computeInverseWithCheck(
       ResultType& inverse,
       bool& invertible,
       const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
@@ -364,29 +356,24 @@ template<typename Derived> class MatrixBase
 
 /////////// Cholesky module ///////////
 
-    const LLT<PlainObject>  llt() const;
-    const LDLT<PlainObject> ldlt() const;
+    inline const LLT<PlainObject>  llt() const;
+    inline const LDLT<PlainObject> ldlt() const;
 
 /////////// QR module ///////////
 
-    const HouseholderQR<PlainObject> householderQr() const;
-    const ColPivHouseholderQR<PlainObject> colPivHouseholderQr() const;
-    const FullPivHouseholderQR<PlainObject> fullPivHouseholderQr() const;
-    
-    #ifdef EIGEN2_SUPPORT
-    const QR<PlainObject> qr() const;
-    #endif
+    inline const HouseholderQR<PlainObject> householderQr() const;
+    inline const ColPivHouseholderQR<PlainObject> colPivHouseholderQr() const;
+    inline const FullPivHouseholderQR<PlainObject> fullPivHouseholderQr() const;
 
-    EigenvaluesReturnType eigenvalues() const;
-    RealScalar operatorNorm() const;
+/////////// Eigenvalues module ///////////
 
-/////////// SVD module ///////////
+    inline EigenvaluesReturnType eigenvalues() const;
+    inline RealScalar operatorNorm() const;
 
-    JacobiSVD<PlainObject> jacobiSvd(unsigned int computationOptions = 0) const;
+/////////// SVD module ///////////
 
-    #ifdef EIGEN2_SUPPORT
-    SVD<PlainObject> svd() const;
-    #endif
+    inline JacobiSVD<PlainObject> jacobiSvd(unsigned int computationOptions = 0) const;
+    inline BDCSVD<PlainObject>    bdcSvd(unsigned int computationOptions = 0) const;
 
 /////////// Geometry module ///////////
 
@@ -398,20 +385,25 @@ template<typename Derived> class MatrixBase
     };
     #endif // EIGEN_PARSED_BY_DOXYGEN
     template<typename OtherDerived>
-    typename cross_product_return_type<OtherDerived>::type
+    EIGEN_DEVICE_FUNC
+    inline typename cross_product_return_type<OtherDerived>::type
     cross(const MatrixBase<OtherDerived>& other) const;
+    
     template<typename OtherDerived>
-    PlainObject cross3(const MatrixBase<OtherDerived>& other) const;
-    PlainObject unitOrthogonal(void) const;
-    Matrix<Scalar,3,1> eulerAngles(Index a0, Index a1, Index a2) const;
+    EIGEN_DEVICE_FUNC
+    inline PlainObject cross3(const MatrixBase<OtherDerived>& other) const;
+    
+    EIGEN_DEVICE_FUNC
+    inline PlainObject unitOrthogonal(void) const;
     
-    #if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
-    ScalarMultipleReturnType operator*(const UniformScaling<Scalar>& s) const;
+    inline Matrix<Scalar,3,1> eulerAngles(Index a0, Index a1, Index a2) const;
+    
+    inline ScalarMultipleReturnType operator*(const UniformScaling<Scalar>& s) const;
     // put this as separate enum value to work around possible GCC 4.3 bug (?)
-    enum { HomogeneousReturnTypeDirection = ColsAtCompileTime==1?Vertical:Horizontal };
+    enum { HomogeneousReturnTypeDirection = ColsAtCompileTime==1&&RowsAtCompileTime==1 ? ((internal::traits<Derived>::Flags&RowMajorBit)==RowMajorBit ? Horizontal : Vertical)
+                                          : ColsAtCompileTime==1 ? Vertical : Horizontal };
     typedef Homogeneous<Derived, HomogeneousReturnTypeDirection> HomogeneousReturnType;
-    HomogeneousReturnType homogeneous() const;
-    #endif
+    inline HomogeneousReturnType homogeneous() const;
     
     enum {
       SizeMinusOne = SizeAtCompileTime==Dynamic ? Dynamic : SizeAtCompileTime-1
@@ -422,7 +414,7 @@ template<typename Derived> class MatrixBase
     typedef CwiseUnaryOp<internal::scalar_quotient1_op<typename internal::traits<Derived>::Scalar>,
                 const ConstStartMinusOne > HNormalizedReturnType;
 
-    const HNormalizedReturnType hnormalized() const;
+    inline const HNormalizedReturnType hnormalized() const;
 
 ////////// Householder module ///////////
 
@@ -446,6 +438,15 @@ template<typename Derived> class MatrixBase
     template<typename OtherScalar>
     void applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j);
 
+///////// SparseCore module /////////
+
+    template<typename OtherDerived>
+    EIGEN_STRONG_INLINE const typename SparseMatrixBase<OtherDerived>::template CwiseProductDenseReturnType<Derived>::Type
+    cwiseProduct(const SparseMatrixBase<OtherDerived> &other) const
+    {
+      return other.cwiseProduct(derived()); // forwarded to the sparse operand's cwiseProduct(), with the operands swapped
+    }
+
 ///////// MatrixFunctions module /////////
 
     typedef typename internal::stem_function<Scalar>::type StemFunction;
@@ -458,49 +459,15 @@ template<typename Derived> class MatrixBase
     const MatrixSquareRootReturnValue<Derived> sqrt() const;
     const MatrixLogarithmReturnValue<Derived> log() const;
     const MatrixPowerReturnValue<Derived> pow(const RealScalar& p) const;
-
-#ifdef EIGEN2_SUPPORT
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    Derived& operator+=(const Flagged<ProductBase<ProductDerived, Lhs,Rhs>, 0,
-                                      EvalBeforeAssigningBit>& other);
-
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    Derived& operator-=(const Flagged<ProductBase<ProductDerived, Lhs,Rhs>, 0,
-                                      EvalBeforeAssigningBit>& other);
-
-    /** \deprecated because .lazy() is deprecated
-      * Overloaded for cache friendly product evaluation */
-    template<typename OtherDerived>
-    Derived& lazyAssign(const Flagged<OtherDerived, 0, EvalBeforeAssigningBit>& other)
-    { return lazyAssign(other._expression()); }
-
-    template<unsigned int Added>
-    const Flagged<Derived, Added, 0> marked() const;
-    const Flagged<Derived, 0, EvalBeforeAssigningBit> lazy() const;
-
-    inline const Cwise<Derived> cwise() const;
-    inline Cwise<Derived> cwise();
-
-    VectorBlock<Derived> start(Index size);
-    const VectorBlock<const Derived> start(Index size) const;
-    VectorBlock<Derived> end(Index size);
-    const VectorBlock<const Derived> end(Index size) const;
-    template<int Size> VectorBlock<Derived,Size> start();
-    template<int Size> const VectorBlock<const Derived,Size> start() const;
-    template<int Size> VectorBlock<Derived,Size> end();
-    template<int Size> const VectorBlock<const Derived,Size> end() const;
-
-    Minor<Derived> minor(Index row, Index col);
-    const Minor<Derived> minor(Index row, Index col) const;
-#endif
+    const MatrixComplexPowerReturnValue<Derived> pow(const std::complex<RealScalar>& p) const;
 
   protected:
-    MatrixBase() : Base() {}
+    EIGEN_DEVICE_FUNC MatrixBase() : Base() {}
 
   private:
-    explicit MatrixBase(int);
-    MatrixBase(int,int);
-    template<typename OtherDerived> explicit MatrixBase(const MatrixBase<OtherDerived>&);
+    EIGEN_DEVICE_FUNC explicit MatrixBase(int);
+    EIGEN_DEVICE_FUNC MatrixBase(int,int);
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC explicit MatrixBase(const MatrixBase<OtherDerived>&);
   protected:
     // mixing arrays and matrices is not legal
     template<typename OtherDerived> Derived& operator+=(const ArrayBase<OtherDerived>& )
@@ -510,6 +477,51 @@ template<typename Derived> class MatrixBase
     {EIGEN_STATIC_ASSERT(std::ptrdiff_t(sizeof(typename OtherDerived::Scalar))==-1,YOU_CANNOT_MIX_ARRAYS_AND_MATRICES); return *this;}
 };
 
+
+/***************************************************************************
+* Implementation of matrix base methods
+***************************************************************************/
+
+/** Replaces \c *this by \c *this * \a other; the work is delegated to \a other's applyThisOnTheRight().
+  *
+  * \returns a reference to \c *this (as returned by derived())
+  *
+  * Example: \include MatrixBase_applyOnTheRight.cpp
+  * Output: \verbinclude MatrixBase_applyOnTheRight.out
+  */
+template<typename Derived>
+template<typename OtherDerived>
+inline Derived&
+MatrixBase<Derived>::operator*=(const EigenBase<OtherDerived> &other)
+{
+  other.derived().applyThisOnTheRight(derived()); // the right-hand operand performs the update on *this
+  return derived();
+}
+
+/** Replaces \c *this by \c *this * \a other. Same operation as MatrixBase::operator*=(), but returns nothing.
+  *
+  * Example: \include MatrixBase_applyOnTheRight.cpp
+  * Output: \verbinclude MatrixBase_applyOnTheRight.out
+  */
+template<typename Derived>
+template<typename OtherDerived>
+inline void MatrixBase<Derived>::applyOnTheRight(const EigenBase<OtherDerived> &other)
+{
+  other.derived().applyThisOnTheRight(derived()); // the right-hand operand performs the update on *this
+}
+
+/** Replaces \c *this by \a other * \c *this; the work is delegated to \a other's applyThisOnTheLeft().
+  *
+  * Example: \include MatrixBase_applyOnTheLeft.cpp
+  * Output: \verbinclude MatrixBase_applyOnTheLeft.out
+  */
+template<typename Derived>
+template<typename OtherDerived>
+inline void MatrixBase<Derived>::applyOnTheLeft(const EigenBase<OtherDerived> &other)
+{
+  other.derived().applyThisOnTheLeft(derived()); // the left-hand operand performs the update on *this
+}
+
 } // end namespace Eigen
 
 #endif // EIGEN_MATRIXBASE_H
diff --git a/nuparu/include/Eigen/src/Core/NestByValue.h b/nuparu/include/Eigen/src/Core/NestByValue.h
index a893b176..9aeaf8d1 100644
--- a/nuparu/include/Eigen/src/Core/NestByValue.h
+++ b/nuparu/include/Eigen/src/Core/NestByValue.h
@@ -40,29 +40,29 @@ template<typename ExpressionType> class NestByValue
     typedef typename internal::dense_xpr_base<NestByValue>::type Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(NestByValue)
 
-    inline NestByValue(const ExpressionType& matrix) : m_expression(matrix) {}
+    EIGEN_DEVICE_FUNC explicit inline NestByValue(const ExpressionType& matrix) : m_expression(matrix) {}
 
-    inline Index rows() const { return m_expression.rows(); }
-    inline Index cols() const { return m_expression.cols(); }
-    inline Index outerStride() const { return m_expression.outerStride(); }
-    inline Index innerStride() const { return m_expression.innerStride(); }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_expression.rows(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_expression.cols(); }
+    EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_expression.outerStride(); }
+    EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_expression.innerStride(); }
 
-    inline const CoeffReturnType coeff(Index row, Index col) const
+    EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const
     {
       return m_expression.coeff(row, col);
     }
 
-    inline Scalar& coeffRef(Index row, Index col)
+    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col)
     {
       return m_expression.const_cast_derived().coeffRef(row, col);
     }
 
-    inline const CoeffReturnType coeff(Index index) const
+    EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index index) const
     {
       return m_expression.coeff(index);
     }
 
-    inline Scalar& coeffRef(Index index)
+    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index)
     {
       return m_expression.const_cast_derived().coeffRef(index);
     }
@@ -91,7 +91,7 @@ template<typename ExpressionType> class NestByValue
       m_expression.const_cast_derived().template writePacket<LoadMode>(index, x);
     }
 
-    operator const ExpressionType&() const { return m_expression; }
+    EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; }
 
   protected:
     const ExpressionType m_expression;
diff --git a/nuparu/include/Eigen/src/Core/NoAlias.h b/nuparu/include/Eigen/src/Core/NoAlias.h
index 768bfb18..0ade7525 100644
--- a/nuparu/include/Eigen/src/Core/NoAlias.h
+++ b/nuparu/include/Eigen/src/Core/NoAlias.h
@@ -30,62 +30,36 @@ namespace Eigen {
 template<typename ExpressionType, template <typename> class StorageBase>
 class NoAlias
 {
-    typedef typename ExpressionType::Scalar Scalar;
   public:
-    NoAlias(ExpressionType& expression) : m_expression(expression) {}
-
-    /** Behaves like MatrixBase::lazyAssign(other)
-      * \sa MatrixBase::lazyAssign() */
+    typedef typename ExpressionType::Scalar Scalar;
+    
+    EIGEN_DEVICE_FUNC explicit NoAlias(ExpressionType& expression) : m_expression(expression) {} // device-callable, like the assignment operators and expression() of this class
+    
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE ExpressionType& operator=(const StorageBase<OtherDerived>& other)
-    { return internal::assign_selector<ExpressionType,OtherDerived,false>::run(m_expression,other.derived()); }
-
-    /** \sa MatrixBase::operator+= */
+    {
+      call_assignment_no_alias(m_expression, other.derived(), internal::assign_op<Scalar>());
+      return m_expression;
+    }
+    
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE ExpressionType& operator+=(const StorageBase<OtherDerived>& other)
     {
-      typedef SelfCwiseBinaryOp<internal::scalar_sum_op<Scalar>, ExpressionType, OtherDerived> SelfAdder;
-      SelfAdder tmp(m_expression);
-      typedef typename internal::nested<OtherDerived>::type OtherDerivedNested;
-      typedef typename internal::remove_all<OtherDerivedNested>::type _OtherDerivedNested;
-      internal::assign_selector<SelfAdder,_OtherDerivedNested,false>::run(tmp,OtherDerivedNested(other.derived()));
+      call_assignment_no_alias(m_expression, other.derived(), internal::add_assign_op<Scalar>());
       return m_expression;
     }
-
-    /** \sa MatrixBase::operator-= */
+    
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE ExpressionType& operator-=(const StorageBase<OtherDerived>& other)
     {
-      typedef SelfCwiseBinaryOp<internal::scalar_difference_op<Scalar>, ExpressionType, OtherDerived> SelfAdder;
-      SelfAdder tmp(m_expression);
-      typedef typename internal::nested<OtherDerived>::type OtherDerivedNested;
-      typedef typename internal::remove_all<OtherDerivedNested>::type _OtherDerivedNested;
-      internal::assign_selector<SelfAdder,_OtherDerivedNested,false>::run(tmp,OtherDerivedNested(other.derived()));
+      call_assignment_no_alias(m_expression, other.derived(), internal::sub_assign_op<Scalar>());
       return m_expression;
     }
 
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    EIGEN_STRONG_INLINE ExpressionType& operator+=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
-    { other.derived().addTo(m_expression); return m_expression; }
-
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    EIGEN_STRONG_INLINE ExpressionType& operator-=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
-    { other.derived().subTo(m_expression); return m_expression; }
-
-    template<typename Lhs, typename Rhs, int NestingFlags>
-    EIGEN_STRONG_INLINE ExpressionType& operator+=(const CoeffBasedProduct<Lhs,Rhs,NestingFlags>& other)
-    { return m_expression.derived() += CoeffBasedProduct<Lhs,Rhs,NestByRefBit>(other.lhs(), other.rhs()); }
-
-    template<typename Lhs, typename Rhs, int NestingFlags>
-    EIGEN_STRONG_INLINE ExpressionType& operator-=(const CoeffBasedProduct<Lhs,Rhs,NestingFlags>& other)
-    { return m_expression.derived() -= CoeffBasedProduct<Lhs,Rhs,NestByRefBit>(other.lhs(), other.rhs()); }
-    
-    template<typename OtherDerived>
-    ExpressionType& operator=(const ReturnByValue<OtherDerived>& func)
-    { return m_expression = func; }
-#endif
-
+    EIGEN_DEVICE_FUNC
     ExpressionType& expression() const
     {
       return m_expression;
@@ -126,7 +100,7 @@ class NoAlias
 template<typename Derived>
 NoAlias<Derived,MatrixBase> MatrixBase<Derived>::noalias()
 {
-  return derived();
+  return NoAlias<Derived, Eigen::MatrixBase >(derived());
 }
 
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Core/NumTraits.h b/nuparu/include/Eigen/src/Core/NumTraits.h
index bac9e50b..1d85dec7 100644
--- a/nuparu/include/Eigen/src/Core/NumTraits.h
+++ b/nuparu/include/Eigen/src/Core/NumTraits.h
@@ -68,21 +68,40 @@ template<typename T> struct GenericNumTraits
                    >::type NonInteger;
   typedef T Nested;
 
-  static inline Real epsilon() { return std::numeric_limits<T>::epsilon(); }
+  EIGEN_DEVICE_FUNC
+  static inline Real epsilon()
+  {
+#ifdef __CUDA_ARCH__  // device compilation pass: use Eigen's device-side numeric_limits
+    return internal::device::numeric_limits<T>::epsilon();
+#else                 // host compilation: take the value from the standard library
+    return std::numeric_limits<T>::epsilon();
+#endif
+  }
+  EIGEN_DEVICE_FUNC
   static inline Real dummy_precision()
   {
     // make sure to override this for floating-point types
     return Real(0);
   }
-  static inline T highest() { return (std::numeric_limits<T>::max)(); }
-  static inline T lowest()  { return IsInteger ? (std::numeric_limits<T>::min)() : (-(std::numeric_limits<T>::max)()); }
-  
-#ifdef EIGEN2_SUPPORT
-  enum {
-    HasFloatingPoint = !IsInteger
-  };
-  typedef NonInteger FloatingPoint;
+
+
+  EIGEN_DEVICE_FUNC
+  static inline T highest() { // largest value of T as reported by numeric_limits<T>::max()
+#if defined(__CUDA_ARCH__) // device compilation pass: use Eigen's device-side numeric_limits
+    return (internal::device::numeric_limits<T>::max)();
+#else
+    return (std::numeric_limits<T>::max)(); // parentheses around the name keep a function-like max() macro from expanding
 #endif
+  }
+
+  EIGEN_DEVICE_FUNC
+  static inline T lowest()  { // numeric_limits<T>::min() when IsInteger, otherwise the negated numeric_limits<T>::max()
+#if defined(__CUDA_ARCH__) // device compilation pass: use Eigen's device-side numeric_limits
+    return IsInteger ? (internal::device::numeric_limits<T>::min)() : (-(internal::device::numeric_limits<T>::max)());
+#else
+    return IsInteger ? (std::numeric_limits<T>::min)() : (-(std::numeric_limits<T>::max)());
+#endif
+  }
 };
 
 template<typename T> struct NumTraits : GenericNumTraits<T>
@@ -91,11 +110,13 @@ template<typename T> struct NumTraits : GenericNumTraits<T>
 template<> struct NumTraits<float>
   : GenericNumTraits<float>
 {
+  EIGEN_DEVICE_FUNC
   static inline float dummy_precision() { return 1e-5f; }
 };
 
 template<> struct NumTraits<double> : GenericNumTraits<double>
 {
+  EIGEN_DEVICE_FUNC
   static inline double dummy_precision() { return 1e-12; }
 };
 
@@ -136,9 +157,9 @@ struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
     IsInteger = NumTraits<Scalar>::IsInteger,
     IsSigned  = NumTraits<Scalar>::IsSigned,
     RequireInitialization = 1,
-    ReadCost = ArrayType::SizeAtCompileTime==Dynamic ? Dynamic : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::ReadCost,
-    AddCost  = ArrayType::SizeAtCompileTime==Dynamic ? Dynamic : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::AddCost,
-    MulCost  = ArrayType::SizeAtCompileTime==Dynamic ? Dynamic : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::MulCost
+    ReadCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::ReadCost,
+    AddCost  = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::AddCost,
+    MulCost  = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::MulCost
   };
   
   static inline RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); }
diff --git a/nuparu/include/Eigen/src/Core/PermutationMatrix.h b/nuparu/include/Eigen/src/Core/PermutationMatrix.h
index 4fc5dd31..90e1df23 100644
--- a/nuparu/include/Eigen/src/Core/PermutationMatrix.h
+++ b/nuparu/include/Eigen/src/Core/PermutationMatrix.h
@@ -2,7 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-// Copyright (C) 2009-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -13,8 +13,6 @@
 
 namespace Eigen { 
 
-template<int RowCol,typename IndicesType,typename MatrixType, typename StorageKind> class PermutedImpl;
-
 /** \class PermutationBase
   * \ingroup Core_Module
   *
@@ -41,10 +39,6 @@ template<int RowCol,typename IndicesType,typename MatrixType, typename StorageKi
 
 namespace internal {
 
-template<typename PermutationType, typename MatrixType, int Side, bool Transposed=false>
-struct permut_matrix_product_retval;
-template<typename PermutationType, typename MatrixType, int Side, bool Transposed=false>
-struct permut_sparsematrix_product_retval;
 enum PermPermProduct_t {PermPermProduct};
 
 } // end namespace internal
@@ -60,19 +54,20 @@ class PermutationBase : public EigenBase<Derived>
     typedef typename Traits::IndicesType IndicesType;
     enum {
       Flags = Traits::Flags,
-      CoeffReadCost = Traits::CoeffReadCost,
       RowsAtCompileTime = Traits::RowsAtCompileTime,
       ColsAtCompileTime = Traits::ColsAtCompileTime,
       MaxRowsAtCompileTime = Traits::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = Traits::MaxColsAtCompileTime
     };
-    typedef typename Traits::Scalar Scalar;
-    typedef typename Traits::Index Index;
-    typedef Matrix<Scalar,RowsAtCompileTime,ColsAtCompileTime,0,MaxRowsAtCompileTime,MaxColsAtCompileTime>
+    typedef typename Traits::StorageIndex StorageIndex;
+    typedef Matrix<StorageIndex,RowsAtCompileTime,ColsAtCompileTime,0,MaxRowsAtCompileTime,MaxColsAtCompileTime>
             DenseMatrixType;
-    typedef PermutationMatrix<IndicesType::SizeAtCompileTime,IndicesType::MaxSizeAtCompileTime,Index>
+    typedef PermutationMatrix<IndicesType::SizeAtCompileTime,IndicesType::MaxSizeAtCompileTime,StorageIndex>
             PlainPermutationType;
+    typedef PlainPermutationType PlainObject;
     using Base::derived;
+    typedef Inverse<Derived> InverseReturnType;
+    typedef void Scalar;
     #endif
 
     /** Copies the other permutation into *this */
@@ -118,7 +113,7 @@ class PermutationBase : public EigenBase<Derived>
     void evalTo(MatrixBase<DenseDerived>& other) const
     {
       other.setZero();
-      for (int i=0; i<rows();++i)
+      for (Index i=0; i<rows(); ++i)
         other.coeffRef(indices().coeff(i),i) = typename DenseDerived::Scalar(1);
     }
     #endif
@@ -147,7 +142,8 @@ class PermutationBase : public EigenBase<Derived>
     /** Sets *this to be the identity permutation matrix */
     void setIdentity()
     {
-      for(Index i = 0; i < size(); ++i)
+      StorageIndex n = StorageIndex(size());
+      for(StorageIndex i = 0; i < n; ++i)
         indices().coeffRef(i) = i;
     }
 
@@ -163,18 +159,18 @@ class PermutationBase : public EigenBase<Derived>
       *
       * \returns a reference to *this.
       *
-      * \warning This is much slower than applyTranspositionOnTheRight(int,int):
+      * \warning This is much slower than applyTranspositionOnTheRight(Index,Index):
       * this has linear complexity and requires a lot of branching.
       *
-      * \sa applyTranspositionOnTheRight(int,int)
+      * \sa applyTranspositionOnTheRight(Index,Index)
       */
     Derived& applyTranspositionOnTheLeft(Index i, Index j)
     {
       eigen_assert(i>=0 && j>=0 && i<size() && j<size());
       for(Index k = 0; k < size(); ++k)
       {
-        if(indices().coeff(k) == i) indices().coeffRef(k) = j;
-        else if(indices().coeff(k) == j) indices().coeffRef(k) = i;
+        if(indices().coeff(k) == i) indices().coeffRef(k) = StorageIndex(j);
+        else if(indices().coeff(k) == j) indices().coeffRef(k) = StorageIndex(i);
       }
       return derived();
     }
@@ -185,7 +181,7 @@ class PermutationBase : public EigenBase<Derived>
       *
       * This is a fast operation, it only consists in swapping two indices.
       *
-      * \sa applyTranspositionOnTheLeft(int,int)
+      * \sa applyTranspositionOnTheLeft(Index,Index)
       */
     Derived& applyTranspositionOnTheRight(Index i, Index j)
     {
@@ -198,14 +194,14 @@ class PermutationBase : public EigenBase<Derived>
       *
       * \note \note_try_to_help_rvo
       */
-    inline Transpose<PermutationBase> inverse() const
-    { return derived(); }
+    inline InverseReturnType inverse() const
+    { return InverseReturnType(derived()); }
     /** \returns the tranpose permutation matrix.
       *
       * \note \note_try_to_help_rvo
       */
-    inline Transpose<PermutationBase> transpose() const
-    { return derived(); }
+    inline InverseReturnType transpose() const
+    { return InverseReturnType(derived()); }
 
     /**** multiplication helpers to hopefully get RVO ****/
 
@@ -215,13 +211,13 @@ class PermutationBase : public EigenBase<Derived>
     template<typename OtherDerived>
     void assignTranspose(const PermutationBase<OtherDerived>& other)
     {
-      for (int i=0; i<rows();++i) indices().coeffRef(other.indices().coeff(i)) = i;
+      for (Index i=0; i<rows();++i) indices().coeffRef(other.indices().coeff(i)) = i;
     }
     template<typename Lhs,typename Rhs>
     void assignProduct(const Lhs& lhs, const Rhs& rhs)
     {
       eigen_assert(lhs.cols() == rhs.rows());
-      for (int i=0; i<rows();++i) indices().coeffRef(i) = lhs.indices().coeff(rhs.indices().coeff(i));
+      for (Index i=0; i<rows();++i) indices().coeffRef(i) = lhs.indices().coeff(rhs.indices().coeff(i));
     }
 #endif
 
@@ -240,7 +236,7 @@ class PermutationBase : public EigenBase<Derived>
       * \note \note_try_to_help_rvo
       */
     template<typename Other>
-    inline PlainPermutationType operator*(const Transpose<PermutationBase<Other> >& other) const
+    inline PlainPermutationType operator*(const InverseImpl<Other,PermutationStorage>& other) const
     { return PlainPermutationType(internal::PermPermProduct, *this, other.eval()); }
 
     /** \returns the product of an inverse permutation with another permutation.
@@ -248,8 +244,37 @@ class PermutationBase : public EigenBase<Derived>
       * \note \note_try_to_help_rvo
       */
     template<typename Other> friend
-    inline PlainPermutationType operator*(const Transpose<PermutationBase<Other> >& other, const PermutationBase& perm)
+    inline PlainPermutationType operator*(const InverseImpl<Other, PermutationStorage>& other, const PermutationBase& perm)
     { return PlainPermutationType(internal::PermPermProduct, other.eval(), perm); }
+    
+    /** \returns the determinant of the permutation matrix: 1 for an even permutation, -1 for an odd one.
+      *
+      * This function is an O(\c n) procedure that allocates a buffer of \c n booleans.
+      */
+    Index determinant() const
+    {
+      const Index count = size();
+      Index sign = 1;
+      Matrix<bool,RowsAtCompileTime,1,0,MaxRowsAtCompileTime> visited(count);
+      visited.fill(false);
+      // Each not-yet-visited index seeds one cycle; every further element
+      // of that cycle flips the sign once.
+      for(Index seed = 0; seed < count; ++seed)
+      {
+        if(visited[seed])
+          continue; // already consumed while walking an earlier cycle
+        visited.coeffRef(seed) = true;
+        // follow the cycle until we are back at the seed
+        Index cur = indices().coeff(seed);
+        while(cur != seed)
+        {
+          visited.coeffRef(cur) = true;
+          sign = -sign;
+          cur = indices().coeff(cur);
+        }
+      }
+      return sign;
+    }
 
   protected:
 
@@ -262,7 +287,7 @@ class PermutationBase : public EigenBase<Derived>
   *
   * \param SizeAtCompileTime the number of rows/cols, or Dynamic
   * \param MaxSizeAtCompileTime the maximum number of rows/cols, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it.
-  * \param IndexType the interger type of the indices
+  * \param StorageIndex the integer type of the indices
   *
   * This class represents a permutation matrix, internally stored as a vector of integers.
   *
@@ -270,24 +295,29 @@ class PermutationBase : public EigenBase<Derived>
   */
 
 namespace internal {
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
-struct traits<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType> >
- : traits<Matrix<IndexType,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex>
+struct traits<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndex> >
+ : traits<Matrix<_StorageIndex,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
 {
-  typedef IndexType Index;
-  typedef Matrix<IndexType, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
+  typedef PermutationStorage StorageKind;
+  typedef Matrix<_StorageIndex, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
+  typedef _StorageIndex StorageIndex;
+  typedef void Scalar;
 };
 }
 
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
-class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex>
+class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndex> >
 {
     typedef PermutationBase<PermutationMatrix> Base;
     typedef internal::traits<PermutationMatrix> Traits;
   public:
 
+    typedef const PermutationMatrix& Nested;
+
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     typedef typename Traits::IndicesType IndicesType;
+    typedef typename Traits::StorageIndex StorageIndex;
     #endif
 
     inline PermutationMatrix()
@@ -295,8 +325,10 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
 
     /** Constructs an uninitialized permutation matrix of given size.
       */
-    inline PermutationMatrix(int size) : m_indices(size)
-    {}
+    explicit inline PermutationMatrix(Index size) : m_indices(size) // only sizes the index vector; no entries are written here
+    {
+      eigen_internal_assert(size <= NumTraits<StorageIndex>::highest()); // size must not exceed the largest StorageIndex value
+    }
 
     /** Copy constructor. */
     template<typename OtherDerived>
@@ -317,7 +349,7 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
       * array's size.
       */
     template<typename Other>
-    explicit inline PermutationMatrix(const MatrixBase<Other>& a_indices) : m_indices(a_indices)
+    explicit inline PermutationMatrix(const MatrixBase<Other>& indices) : m_indices(indices)
     {}
 
     /** Convert the Transpositions \a tr to a permutation matrix */
@@ -364,10 +396,13 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename Other>
-    PermutationMatrix(const Transpose<PermutationBase<Other> >& other)
-      : m_indices(other.nestedPermutation().size())
+    PermutationMatrix(const InverseImpl<Other,PermutationStorage>& other)
+      : m_indices(other.derived().nestedExpression().size())
     {
-      for (int i=0; i<m_indices.size();++i) m_indices.coeffRef(other.nestedPermutation().indices().coeff(i)) = i;
+      eigen_internal_assert(m_indices.size() <= NumTraits<StorageIndex>::highest());
+      StorageIndex end = StorageIndex(m_indices.size());
+      for (StorageIndex i=0; i<end;++i)
+        m_indices.coeffRef(other.derived().nestedExpression().indices().coeff(i)) = i;
     }
     template<typename Lhs,typename Rhs>
     PermutationMatrix(internal::PermPermProduct_t, const Lhs& lhs, const Rhs& rhs)
@@ -384,18 +419,20 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
 
 
 namespace internal {
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int _PacketAccess>
-struct traits<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType>,_PacketAccess> >
- : traits<Matrix<IndexType,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex, int _PacketAccess>
+struct traits<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndex>,_PacketAccess> >
+ : traits<Matrix<_StorageIndex,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
 {
-  typedef IndexType Index;
-  typedef Map<const Matrix<IndexType, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1>, _PacketAccess> IndicesType;
+  typedef PermutationStorage StorageKind;
+  typedef Map<const Matrix<_StorageIndex, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1>, _PacketAccess> IndicesType;
+  typedef _StorageIndex StorageIndex;
+  typedef void Scalar;
 };
 }
 
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int _PacketAccess>
-class Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType>,_PacketAccess>
-  : public PermutationBase<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType>,_PacketAccess> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex, int _PacketAccess>
+class Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndex>,_PacketAccess>
+  : public PermutationBase<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, _StorageIndex>,_PacketAccess> >
 {
     typedef PermutationBase<Map> Base;
     typedef internal::traits<Map> Traits;
@@ -403,14 +440,14 @@ class Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType>,
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     typedef typename Traits::IndicesType IndicesType;
-    typedef typename IndicesType::Scalar Index;
+    typedef typename IndicesType::Scalar StorageIndex;
     #endif
 
-    inline Map(const Index* indicesPtr)
+    inline Map(const StorageIndex* indicesPtr)
       : m_indices(indicesPtr)
     {}
 
-    inline Map(const Index* indicesPtr, Index size)
+    inline Map(const StorageIndex* indicesPtr, Index size)
       : m_indices(indicesPtr,size)
     {}
 
@@ -457,24 +494,21 @@ class Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType>,
   * \sa class PermutationBase, class PermutationMatrix
   */
 
-struct PermutationStorage {};
-
 template<typename _IndicesType> class TranspositionsWrapper;
 namespace internal {
 template<typename _IndicesType>
 struct traits<PermutationWrapper<_IndicesType> >
 {
   typedef PermutationStorage StorageKind;
-  typedef typename _IndicesType::Scalar Scalar;
-  typedef typename _IndicesType::Scalar Index;
+  typedef void Scalar;
+  typedef typename _IndicesType::Scalar StorageIndex;
   typedef _IndicesType IndicesType;
   enum {
     RowsAtCompileTime = _IndicesType::SizeAtCompileTime,
     ColsAtCompileTime = _IndicesType::SizeAtCompileTime,
-    MaxRowsAtCompileTime = IndicesType::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = IndicesType::MaxColsAtCompileTime,
-    Flags = 0,
-    CoeffReadCost = _IndicesType::CoeffReadCost
+    MaxRowsAtCompileTime = IndicesType::MaxSizeAtCompileTime,
+    MaxColsAtCompileTime = IndicesType::MaxSizeAtCompileTime,
+    Flags = 0
   };
 };
 }
@@ -490,8 +524,8 @@ class PermutationWrapper : public PermutationBase<PermutationWrapper<_IndicesTyp
     typedef typename Traits::IndicesType IndicesType;
     #endif
 
-    inline PermutationWrapper(const IndicesType& a_indices)
-      : m_indices(a_indices)
+    inline PermutationWrapper(const IndicesType& indices)
+      : m_indices(indices)
     {}
 
     /** const version of indices(). */
@@ -503,178 +537,86 @@ class PermutationWrapper : public PermutationBase<PermutationWrapper<_IndicesTyp
     typename IndicesType::Nested m_indices;
 };
 
+
 /** \returns the matrix with the permutation applied to the columns.
   */
-template<typename Derived, typename PermutationDerived>
-inline const internal::permut_matrix_product_retval<PermutationDerived, Derived, OnTheRight>
-operator*(const MatrixBase<Derived>& matrix,
-          const PermutationBase<PermutationDerived> &permutation)
+template<typename MatrixDerived, typename PermutationDerived>
+EIGEN_DEVICE_FUNC
+const Product<MatrixDerived, PermutationDerived, AliasFreeProduct>
+operator*(const MatrixBase<MatrixDerived> &matrix,
+          const PermutationBase<PermutationDerived>& permutation)
 {
-  return internal::permut_matrix_product_retval
-           <PermutationDerived, Derived, OnTheRight>
-           (permutation.derived(), matrix.derived());
+  return Product<MatrixDerived, PermutationDerived, AliasFreeProduct>
+            (matrix.derived(), permutation.derived());
 }
 
 /** \returns the matrix with the permutation applied to the rows.
   */
-template<typename Derived, typename PermutationDerived>
-inline const internal::permut_matrix_product_retval
-               <PermutationDerived, Derived, OnTheLeft>
+template<typename PermutationDerived, typename MatrixDerived>
+EIGEN_DEVICE_FUNC
+const Product<PermutationDerived, MatrixDerived, AliasFreeProduct>
 operator*(const PermutationBase<PermutationDerived> &permutation,
-          const MatrixBase<Derived>& matrix)
+          const MatrixBase<MatrixDerived>& matrix)
 {
-  return internal::permut_matrix_product_retval
-           <PermutationDerived, Derived, OnTheLeft>
-           (permutation.derived(), matrix.derived());
+  return Product<PermutationDerived, MatrixDerived, AliasFreeProduct>
+            (permutation.derived(), matrix.derived());
 }
 
-namespace internal {
-
-template<typename PermutationType, typename MatrixType, int Side, bool Transposed>
-struct traits<permut_matrix_product_retval<PermutationType, MatrixType, Side, Transposed> >
-{
-  typedef typename MatrixType::PlainObject ReturnType;
-};
 
-template<typename PermutationType, typename MatrixType, int Side, bool Transposed>
-struct permut_matrix_product_retval
- : public ReturnByValue<permut_matrix_product_retval<PermutationType, MatrixType, Side, Transposed> >
+template<typename PermutationType>
+class InverseImpl<PermutationType, PermutationStorage>
+  : public EigenBase<Inverse<PermutationType> >
 {
-    typedef typename remove_all<typename MatrixType::Nested>::type MatrixTypeNestedCleaned;
-    typedef typename MatrixType::Index Index;
-
-    permut_matrix_product_retval(const PermutationType& perm, const MatrixType& matrix)
-      : m_permutation(perm), m_matrix(matrix)
-    {}
-
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
-
-    template<typename Dest> inline void evalTo(Dest& dst) const
-    {
-      const Index n = Side==OnTheLeft ? rows() : cols();
-
-      if(is_same<MatrixTypeNestedCleaned,Dest>::value && extract_data(dst) == extract_data(m_matrix))
-      {
-        // apply the permutation inplace
-        Matrix<bool,PermutationType::RowsAtCompileTime,1,0,PermutationType::MaxRowsAtCompileTime> mask(m_permutation.size());
-        mask.fill(false);
-        Index r = 0;
-        while(r < m_permutation.size())
-        {
-          // search for the next seed
-          while(r<m_permutation.size() && mask[r]) r++;
-          if(r>=m_permutation.size())
-            break;
-          // we got one, let's follow it until we are back to the seed
-          Index k0 = r++;
-          Index kPrev = k0;
-          mask.coeffRef(k0) = true;
-          for(Index k=m_permutation.indices().coeff(k0); k!=k0; k=m_permutation.indices().coeff(k))
-          {
-                  Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>(dst, k)
-            .swap(Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>
-                       (dst,((Side==OnTheLeft) ^ Transposed) ? k0 : kPrev));
-
-            mask.coeffRef(k) = true;
-            kPrev = k;
-          }
-        }
-      }
-      else
-      {
-        for(int i = 0; i < n; ++i)
-        {
-          Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>
-               (dst, ((Side==OnTheLeft) ^ Transposed) ? m_permutation.indices().coeff(i) : i)
-
-          =
-
-          Block<const MatrixTypeNestedCleaned,Side==OnTheLeft ? 1 : MatrixType::RowsAtCompileTime,Side==OnTheRight ? 1 : MatrixType::ColsAtCompileTime>
-               (m_matrix, ((Side==OnTheRight) ^ Transposed) ? m_permutation.indices().coeff(i) : i);
-        }
-      }
-    }
-
-  protected:
-    const PermutationType& m_permutation;
-    typename MatrixType::Nested m_matrix;
-};
-
-/* Template partial specialization for transposed/inverse permutations */
-
-template<typename Derived>
-struct traits<Transpose<PermutationBase<Derived> > >
- : traits<Derived>
-{};
-
-} // end namespace internal
-
-template<typename Derived>
-class Transpose<PermutationBase<Derived> >
-  : public EigenBase<Transpose<PermutationBase<Derived> > >
-{
-    typedef Derived PermutationType;
-    typedef typename PermutationType::IndicesType IndicesType;
     typedef typename PermutationType::PlainPermutationType PlainPermutationType;
+    typedef internal::traits<PermutationType> PermTraits;
+  protected:
+    InverseImpl() {}
   public:
+    typedef Inverse<PermutationType> InverseType;
+    using EigenBase<Inverse<PermutationType> >::derived;
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
-    typedef internal::traits<PermutationType> Traits;
-    typedef typename Derived::DenseMatrixType DenseMatrixType;
+    typedef typename PermutationType::DenseMatrixType DenseMatrixType;
     enum {
-      Flags = Traits::Flags,
-      CoeffReadCost = Traits::CoeffReadCost,
-      RowsAtCompileTime = Traits::RowsAtCompileTime,
-      ColsAtCompileTime = Traits::ColsAtCompileTime,
-      MaxRowsAtCompileTime = Traits::MaxRowsAtCompileTime,
-      MaxColsAtCompileTime = Traits::MaxColsAtCompileTime
+      RowsAtCompileTime = PermTraits::RowsAtCompileTime,
+      ColsAtCompileTime = PermTraits::ColsAtCompileTime,
+      MaxRowsAtCompileTime = PermTraits::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = PermTraits::MaxColsAtCompileTime
     };
-    typedef typename Traits::Scalar Scalar;
     #endif
 
-    Transpose(const PermutationType& p) : m_permutation(p) {}
-
-    inline int rows() const { return m_permutation.rows(); }
-    inline int cols() const { return m_permutation.cols(); }
-
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename DenseDerived>
     void evalTo(MatrixBase<DenseDerived>& other) const
     {
       other.setZero();
-      for (int i=0; i<rows();++i)
-        other.coeffRef(i, m_permutation.indices().coeff(i)) = typename DenseDerived::Scalar(1);
+      for (Index i=0; i<derived().rows();++i)
+        other.coeffRef(i, derived().nestedExpression().indices().coeff(i)) = typename DenseDerived::Scalar(1);
     }
     #endif
 
     /** \return the equivalent permutation matrix */
-    PlainPermutationType eval() const { return *this; }
+    PlainPermutationType eval() const { return derived(); }
 
-    DenseMatrixType toDenseMatrix() const { return *this; }
+    DenseMatrixType toDenseMatrix() const { return derived(); }
 
     /** \returns the matrix with the inverse permutation applied to the columns.
       */
     template<typename OtherDerived> friend
-    inline const internal::permut_matrix_product_retval<PermutationType, OtherDerived, OnTheRight, true>
-    operator*(const MatrixBase<OtherDerived>& matrix, const Transpose& trPerm)
+    const Product<OtherDerived, InverseType, AliasFreeProduct>
+    operator*(const MatrixBase<OtherDerived>& matrix, const InverseType& trPerm)
     {
-      return internal::permut_matrix_product_retval<PermutationType, OtherDerived, OnTheRight, true>(trPerm.m_permutation, matrix.derived());
+      return Product<OtherDerived, InverseType, AliasFreeProduct>(matrix.derived(), trPerm.derived());
     }
 
     /** \returns the matrix with the inverse permutation applied to the rows.
       */
     template<typename OtherDerived>
-    inline const internal::permut_matrix_product_retval<PermutationType, OtherDerived, OnTheLeft, true>
+    const Product<InverseType, OtherDerived, AliasFreeProduct>
     operator*(const MatrixBase<OtherDerived>& matrix) const
     {
-      return internal::permut_matrix_product_retval<PermutationType, OtherDerived, OnTheLeft, true>(m_permutation, matrix.derived());
+      return Product<InverseType, OtherDerived, AliasFreeProduct>(derived(), matrix.derived());
     }
-
-    const PermutationType& nestedPermutation() const { return m_permutation; }
-
-  protected:
-    const PermutationType& m_permutation;
 };
 
 template<typename Derived>
@@ -683,6 +625,12 @@ const PermutationWrapper<const Derived> MatrixBase<Derived>::asPermutation() con
   return derived();
 }
 
+namespace internal {
+
+template<> struct AssignmentKind<DenseShape,PermutationShape> { typedef EigenBase2EigenBase Kind; };
+
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_PERMUTATIONMATRIX_H
diff --git a/nuparu/include/Eigen/src/Core/PlainObjectBase.h b/nuparu/include/Eigen/src/Core/PlainObjectBase.h
index af0a479c..1225e85b 100644
--- a/nuparu/include/Eigen/src/Core/PlainObjectBase.h
+++ b/nuparu/include/Eigen/src/Core/PlainObjectBase.h
@@ -28,6 +28,7 @@ namespace internal {
 
 template<int MaxSizeAtCompileTime> struct check_rows_cols_for_overflow {
   template<typename Index>
+  EIGEN_DEVICE_FUNC
   static EIGEN_ALWAYS_INLINE void run(Index, Index)
   {
   }
@@ -35,6 +36,7 @@ template<int MaxSizeAtCompileTime> struct check_rows_cols_for_overflow {
 
 template<> struct check_rows_cols_for_overflow<Dynamic> {
   template<typename Index>
+  EIGEN_DEVICE_FUNC
   static EIGEN_ALWAYS_INLINE void run(Index rows, Index cols)
   {
     // http://hg.mozilla.org/mozilla-central/file/6c8a909977d3/xpcom/ds/CheckedInt.h#l242
@@ -47,7 +49,10 @@ template<> struct check_rows_cols_for_overflow<Dynamic> {
   }
 };
 
-template <typename Derived, typename OtherDerived = Derived, bool IsVector = bool(Derived::IsVectorAtCompileTime)> struct conservative_resize_like_impl;
+template <typename Derived,
+          typename OtherDerived = Derived,
+          bool IsVector = bool(Derived::IsVectorAtCompileTime) && bool(OtherDerived::IsVectorAtCompileTime)>
+struct conservative_resize_like_impl;
 
 template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers> struct matrix_swap_impl;
 
@@ -64,8 +69,9 @@ template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers> struct m
 #ifdef EIGEN_PARSED_BY_DOXYGEN
 namespace internal {
 
-// this is a warkaround to doxygen not being able to understand the inheritence logic
+// this is a workaround to doxygen not being able to understand the inheritance logic
 // when it is hidden by the dense_xpr_base helper struct.
+/** This class is just a workaround for Doxygen and it does not actually exist. */
 template<typename Derived> struct dense_xpr_base_dispatcher_for_doxygen;// : public MatrixBase<Derived> {};
 /** This class is just a workaround for Doxygen and it does not not actually exist. */
 template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
@@ -90,8 +96,8 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
     typedef typename internal::dense_xpr_base<Derived>::type Base;
 
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
     typedef typename internal::traits<Derived>::Scalar Scalar;
+    
     typedef typename internal::packet_traits<Scalar>::type PacketScalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
     typedef Derived DenseType;
@@ -110,28 +116,36 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
     typedef Eigen::Map<Derived, Unaligned>  MapType;
     friend  class Eigen::Map<const Derived, Unaligned>;
     typedef const Eigen::Map<const Derived, Unaligned> ConstMapType;
-    friend  class Eigen::Map<Derived, Aligned>;
-    typedef Eigen::Map<Derived, Aligned> AlignedMapType;
-    friend  class Eigen::Map<const Derived, Aligned>;
-    typedef const Eigen::Map<const Derived, Aligned> ConstAlignedMapType;
+#if EIGEN_MAX_ALIGN_BYTES>0
+    // for EIGEN_MAX_ALIGN_BYTES==0, AlignedMax==Unaligned, and many compilers generate warnings for friend-ing a class twice.
+    friend  class Eigen::Map<Derived, AlignedMax>;
+    friend  class Eigen::Map<const Derived, AlignedMax>;
+#endif
+    typedef Eigen::Map<Derived, AlignedMax> AlignedMapType;
+    typedef const Eigen::Map<const Derived, AlignedMax> ConstAlignedMapType;
     template<typename StrideType> struct StridedMapType { typedef Eigen::Map<Derived, Unaligned, StrideType> type; };
     template<typename StrideType> struct StridedConstMapType { typedef Eigen::Map<const Derived, Unaligned, StrideType> type; };
-    template<typename StrideType> struct StridedAlignedMapType { typedef Eigen::Map<Derived, Aligned, StrideType> type; };
-    template<typename StrideType> struct StridedConstAlignedMapType { typedef Eigen::Map<const Derived, Aligned, StrideType> type; };
+    template<typename StrideType> struct StridedAlignedMapType { typedef Eigen::Map<Derived, AlignedMax, StrideType> type; };
+    template<typename StrideType> struct StridedConstAlignedMapType { typedef Eigen::Map<const Derived, AlignedMax, StrideType> type; };
 
   protected:
     DenseStorage<Scalar, Base::MaxSizeAtCompileTime, Base::RowsAtCompileTime, Base::ColsAtCompileTime, Options> m_storage;
 
   public:
-    enum { NeedsToAlign = SizeAtCompileTime != Dynamic && (internal::traits<Derived>::Flags & AlignedBit) != 0 };
+    enum { NeedsToAlign = (SizeAtCompileTime != Dynamic) && (internal::traits<Derived>::Alignment>0) };
     EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)
 
+    EIGEN_DEVICE_FUNC
     Base& base() { return *static_cast<Base*>(this); }
+    EIGEN_DEVICE_FUNC
     const Base& base() const { return *static_cast<const Base*>(this); }
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Index rows() const { return m_storage.rows(); }
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Index cols() const { return m_storage.cols(); }
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Scalar& coeff(Index rowId, Index colId) const
     {
       if(Flags & RowMajorBit)
@@ -140,11 +154,13 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
         return m_storage.data()[rowId + colId * m_storage.rows()];
     }
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const
     {
       return m_storage.data()[index];
     }
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar& coeffRef(Index rowId, Index colId)
     {
       if(Flags & RowMajorBit)
@@ -153,11 +169,13 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
         return m_storage.data()[rowId + colId * m_storage.rows()];
     }
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
     {
       return m_storage.data()[index];
     }
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Scalar& coeffRef(Index rowId, Index colId) const
     {
       if(Flags & RowMajorBit)
@@ -166,6 +184,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
         return m_storage.data()[rowId + colId * m_storage.rows()];
     }
 
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Scalar& coeffRef(Index index) const
     {
       return m_storage.data()[index];
@@ -206,11 +225,11 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
     }
 
     /** \returns a const pointer to the data array of this matrix */
-    EIGEN_STRONG_INLINE const Scalar *data() const
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const
     { return m_storage.data(); }
 
     /** \returns a pointer to the data array of this matrix */
-    EIGEN_STRONG_INLINE Scalar *data()
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data()
     { return m_storage.data(); }
 
     /** Resizes \c *this to a \a rows x \a cols matrix.
@@ -229,22 +248,22 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       *
       * \sa resize(Index) for vectors, resize(NoChange_t, Index), resize(Index, NoChange_t)
       */
-    EIGEN_STRONG_INLINE void resize(Index nbRows, Index nbCols)
-    {
-      eigen_assert(   EIGEN_IMPLIES(RowsAtCompileTime!=Dynamic,nbRows==RowsAtCompileTime)
-                   && EIGEN_IMPLIES(ColsAtCompileTime!=Dynamic,nbCols==ColsAtCompileTime)
-                   && EIGEN_IMPLIES(RowsAtCompileTime==Dynamic && MaxRowsAtCompileTime!=Dynamic,nbRows<=MaxRowsAtCompileTime)
-                   && EIGEN_IMPLIES(ColsAtCompileTime==Dynamic && MaxColsAtCompileTime!=Dynamic,nbCols<=MaxColsAtCompileTime)
-                   && nbRows>=0 && nbCols>=0 && "Invalid sizes when resizing a matrix or array.");
-      internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(nbRows, nbCols);
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void resize(Index rows, Index cols)
+    {
+      eigen_assert(   EIGEN_IMPLIES(RowsAtCompileTime!=Dynamic,rows==RowsAtCompileTime)
+                   && EIGEN_IMPLIES(ColsAtCompileTime!=Dynamic,cols==ColsAtCompileTime)
+                   && EIGEN_IMPLIES(RowsAtCompileTime==Dynamic && MaxRowsAtCompileTime!=Dynamic,rows<=MaxRowsAtCompileTime)
+                   && EIGEN_IMPLIES(ColsAtCompileTime==Dynamic && MaxColsAtCompileTime!=Dynamic,cols<=MaxColsAtCompileTime)
+                   && rows>=0 && cols>=0 && "Invalid sizes when resizing a matrix or array.");
+      internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(rows, cols);
       #ifdef EIGEN_INITIALIZE_COEFFS
-        Index size = nbRows*nbCols;
+        Index size = rows*cols;
         bool size_changed = size != this->size();
-        m_storage.resize(size, nbRows, nbCols);
+        m_storage.resize(size, rows, cols);
         if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
       #else
-        internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(nbRows, nbCols);
-        m_storage.resize(nbRows*nbCols, nbRows, nbCols);
+        m_storage.resize(rows*cols, rows, cols);
       #endif
     }
 
@@ -259,6 +278,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       *
       * \sa resize(Index,Index), resize(NoChange_t, Index), resize(Index, NoChange_t)
       */
+    EIGEN_DEVICE_FUNC
     inline void resize(Index size)
     {
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(PlainObjectBase)
@@ -283,9 +303,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       *
       * \sa resize(Index,Index)
       */
-    inline void resize(NoChange_t, Index nbCols)
+    EIGEN_DEVICE_FUNC
+    inline void resize(NoChange_t, Index cols)
     {
-      resize(rows(), nbCols);
+      resize(rows(), cols);
     }
 
     /** Resizes the matrix, changing only the number of rows. For the parameter of type NoChange_t, just pass the special value \c NoChange
@@ -296,9 +317,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       *
       * \sa resize(Index,Index)
       */
-    inline void resize(Index nbRows, NoChange_t)
+    EIGEN_DEVICE_FUNC
+    inline void resize(Index rows, NoChange_t)
     {
-      resize(nbRows, cols());
+      resize(rows, cols());
     }
 
     /** Resizes \c *this to have the same dimensions as \a other.
@@ -309,6 +331,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       * remain row-vectors and vectors remain vectors.
       */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC 
     EIGEN_STRONG_INLINE void resizeLike(const EigenBase<OtherDerived>& _other)
     {
       const OtherDerived& other = _other.derived();
@@ -336,9 +359,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       * Matrices are resized relative to the top-left element. In case values need to be 
       * appended to the matrix they will be uninitialized.
       */
-    EIGEN_STRONG_INLINE void conservativeResize(Index nbRows, Index nbCols)
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void conservativeResize(Index rows, Index cols)
     {
-      internal::conservative_resize_like_impl<Derived>::run(*this, nbRows, nbCols);
+      internal::conservative_resize_like_impl<Derived>::run(*this, rows, cols);
     }
 
     /** Resizes the matrix to \a rows x \a cols while leaving old values untouched.
@@ -348,10 +372,11 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       *
       * In case the matrix is growing, new rows will be uninitialized.
       */
-    EIGEN_STRONG_INLINE void conservativeResize(Index nbRows, NoChange_t)
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void conservativeResize(Index rows, NoChange_t)
     {
       // Note: see the comment in conservativeResize(Index,Index)
-      conservativeResize(nbRows, cols());
+      conservativeResize(rows, cols());
     }
 
     /** Resizes the matrix to \a rows x \a cols while leaving old values untouched.
@@ -361,10 +386,11 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       *
       * In case the matrix is growing, new columns will be uninitialized.
       */
-    EIGEN_STRONG_INLINE void conservativeResize(NoChange_t, Index nbCols)
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void conservativeResize(NoChange_t, Index cols)
     {
       // Note: see the comment in conservativeResize(Index,Index)
-      conservativeResize(rows(), nbCols);
+      conservativeResize(rows(), cols);
     }
 
     /** Resizes the vector to \a size while retaining old values.
@@ -375,6 +401,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       *
       * When values are appended, they will be uninitialized.
       */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE void conservativeResize(Index size)
     {
       internal::conservative_resize_like_impl<Derived>::run(*this, size);
@@ -390,6 +417,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       * appended to the matrix they will copied from \c other.
       */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE void conservativeResizeLike(const DenseBase<OtherDerived>& other)
     {
       internal::conservative_resize_like_impl<Derived,OtherDerived>::run(*this, other);
@@ -398,6 +426,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
     /** This is a special case of the templated operator=. Its purpose is to
       * prevent a default operator= from hiding the templated operator=.
       */
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Derived& operator=(const PlainObjectBase& other)
     {
       return _set(other);
@@ -405,6 +434,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
 
     /** \sa MatrixBase::lazyAssign() */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Derived& lazyAssign(const DenseBase<OtherDerived>& other)
     {
       _resize_to_match(other);
@@ -412,12 +442,18 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
     }
 
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Derived& operator=(const ReturnByValue<OtherDerived>& func)
     {
       resize(func.rows(), func.cols());
       return Base::operator=(func);
     }
 
+    // Prevent users from trying to instantiate PlainObjectBase objects
+    // by making all its constructors protected. See bug 1074.
+  protected:
+
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE PlainObjectBase() : m_storage()
     {
 //       _check_template_params();
@@ -427,38 +463,85 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     // FIXME is it still needed ?
     /** \internal */
-    PlainObjectBase(internal::constructor_without_unaligned_array_assert)
+    EIGEN_DEVICE_FUNC
+    explicit PlainObjectBase(internal::constructor_without_unaligned_array_assert)
       : m_storage(internal::constructor_without_unaligned_array_assert())
     {
 //       _check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
     }
 #endif
 
-    EIGEN_STRONG_INLINE PlainObjectBase(Index a_size, Index nbRows, Index nbCols)
-      : m_storage(a_size, nbRows, nbCols)
+#ifdef EIGEN_HAVE_RVALUE_REFERENCES
+    EIGEN_DEVICE_FUNC
+    PlainObjectBase(PlainObjectBase&& other)
+      : m_storage( std::move(other.m_storage) )
+    {
+    }
+
+    EIGEN_DEVICE_FUNC
+    PlainObjectBase& operator=(PlainObjectBase&& other)
+    {
+      using std::swap;
+      swap(m_storage, other.m_storage);
+      return *this;
+    }
+#endif
+
+    /** Copy constructor */
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE PlainObjectBase(const PlainObjectBase& other)
+      : Base(), m_storage(other.m_storage) { }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE PlainObjectBase(Index size, Index rows, Index cols)
+      : m_storage(size, rows, cols)
     {
 //       _check_template_params();
 //       EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
     }
 
-    /** \copydoc MatrixBase::operator=(const EigenBase<OtherDerived>&)
-      */
+    /** \sa PlainObjectBase::operator=(const EigenBase<OtherDerived>&) */
     template<typename OtherDerived>
-    EIGEN_STRONG_INLINE Derived& operator=(const EigenBase<OtherDerived> &other)
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE PlainObjectBase(const DenseBase<OtherDerived> &other)
+      : m_storage()
     {
-      _resize_to_match(other);
-      Base::operator=(other.derived());
-      return this->derived();
+      _check_template_params();
+      resizeLike(other);
+      _set_noalias(other);
     }
 
-    /** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
+    /** \sa PlainObjectBase::operator=(const EigenBase<OtherDerived>&) */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE PlainObjectBase(const EigenBase<OtherDerived> &other)
-      : m_storage(other.derived().rows() * other.derived().cols(), other.derived().rows(), other.derived().cols())
+      : m_storage()
     {
       _check_template_params();
-      internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(other.derived().rows(), other.derived().cols());
+      resizeLike(other);
+      *this = other.derived();
+    }
+    /** \brief Copy constructor with in-place evaluation */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE PlainObjectBase(const ReturnByValue<OtherDerived>& other)
+    {
+      _check_template_params();
+      // FIXME this does not automatically transpose vectors if necessary
+      resize(other.rows(), other.cols());
+      other.evalTo(this->derived());
+    }
+
+  public:
+
+    /** \copydoc MatrixBase::operator=(const EigenBase<OtherDerived>&)
+      */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC 
+    EIGEN_STRONG_INLINE Derived& operator=(const EigenBase<OtherDerived> &other)
+    {
+      _resize_to_match(other);
       Base::operator=(other.derived());
+      return this->derived();
     }
 
     /** \name Map
@@ -535,16 +618,16 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
     //@}
 
     using Base::setConstant;
-    Derived& setConstant(Index size, const Scalar& value);
-    Derived& setConstant(Index rows, Index cols, const Scalar& value);
+    EIGEN_DEVICE_FUNC Derived& setConstant(Index size, const Scalar& value);
+    EIGEN_DEVICE_FUNC Derived& setConstant(Index rows, Index cols, const Scalar& value);
 
     using Base::setZero;
-    Derived& setZero(Index size);
-    Derived& setZero(Index rows, Index cols);
+    EIGEN_DEVICE_FUNC Derived& setZero(Index size);
+    EIGEN_DEVICE_FUNC Derived& setZero(Index rows, Index cols);
 
     using Base::setOnes;
-    Derived& setOnes(Index size);
-    Derived& setOnes(Index rows, Index cols);
+    EIGEN_DEVICE_FUNC Derived& setOnes(Index size);
+    EIGEN_DEVICE_FUNC Derived& setOnes(Index rows, Index cols);
 
     using Base::setRandom;
     Derived& setRandom(Index size);
@@ -563,6 +646,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       * remain row-vectors and vectors remain vectors.
       */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC 
     EIGEN_STRONG_INLINE void _resize_to_match(const EigenBase<OtherDerived>& other)
     {
       #ifdef EIGEN_NO_AUTOMATIC_RESIZING
@@ -589,25 +673,23 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       *
       * \internal
       */
+    // aliasing is dealt with once in internal::call_assignment
+    // so at this stage we have to assume aliasing... and resizing has to be done later.
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC 
     EIGEN_STRONG_INLINE Derived& _set(const DenseBase<OtherDerived>& other)
     {
-      _set_selector(other.derived(), typename internal::conditional<static_cast<bool>(int(OtherDerived::Flags) & EvalBeforeAssigningBit), internal::true_type, internal::false_type>::type());
+      internal::call_assignment(this->derived(), other.derived());
       return this->derived();
     }
 
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE void _set_selector(const OtherDerived& other, const internal::true_type&) { _set_noalias(other.eval()); }
-
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE void _set_selector(const OtherDerived& other, const internal::false_type&) { _set_noalias(other); }
-
     /** \internal Like _set() but additionally makes the assumption that no aliasing effect can happen (which
       * is the case when creating a new matrix) so one can enforce lazy evaluation.
       *
       * \sa operator=(const MatrixBase<OtherDerived>&), _set()
       */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC 
     EIGEN_STRONG_INLINE Derived& _set_noalias(const DenseBase<OtherDerived>& other)
     {
       // I don't think we need this resize call since the lazyAssign will anyways resize
@@ -615,40 +697,166 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       //_resize_to_match(other);
       // the 'false' below means to enforce lazy evaluation. We don't use lazyAssign() because
       // it wouldn't allow to copy a row-vector into a column-vector.
-      return internal::assign_selector<Derived,OtherDerived,false>::run(this->derived(), other.derived());
+      internal::call_assignment_no_alias(this->derived(), other.derived(), internal::assign_op<Scalar>());
+      return this->derived();
     }
 
     template<typename T0, typename T1>
-    EIGEN_STRONG_INLINE void _init2(Index nbRows, Index nbCols, typename internal::enable_if<Base::SizeAtCompileTime!=2,T0>::type* = 0)
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init2(Index rows, Index cols, typename internal::enable_if<Base::SizeAtCompileTime!=2,T0>::type* = 0)
     {
       EIGEN_STATIC_ASSERT(bool(NumTraits<T0>::IsInteger) &&
                           bool(NumTraits<T1>::IsInteger),
                           FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
-      resize(nbRows,nbCols);
+      resize(rows,cols);
     }
+    
     template<typename T0, typename T1>
+    EIGEN_DEVICE_FUNC 
     EIGEN_STRONG_INLINE void _init2(const Scalar& val0, const Scalar& val1, typename internal::enable_if<Base::SizeAtCompileTime==2,T0>::type* = 0)
     {
       EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2)
       m_storage.data()[0] = val0;
       m_storage.data()[1] = val1;
     }
+    
+    template<typename T0, typename T1>
+    EIGEN_DEVICE_FUNC 
+    EIGEN_STRONG_INLINE void _init2(const Index& val0, const Index& val1,
+                                    typename internal::enable_if<    (!internal::is_same<Index,Scalar>::value)
+                                                                  && (internal::is_same<T0,Index>::value)
+                                                                  && (internal::is_same<T1,Index>::value)
+                                                                  && Base::SizeAtCompileTime==2,T1>::type* = 0)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2)
+      m_storage.data()[0] = Scalar(val0);
+      m_storage.data()[1] = Scalar(val1);
+    }
+
+    // The argument is convertible to the Index type and we either have a non 1x1 Matrix, or a dynamic-sized Array,
+    // then the argument is meant to be the size of the object.
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(Index size, typename internal::enable_if<    (Base::SizeAtCompileTime!=1 || !internal::is_convertible<T, Scalar>::value)
+                                                                              && ((!internal::is_same<typename internal::traits<Derived>::XprKind,ArrayXpr>::value || Base::SizeAtCompileTime==Dynamic)),T>::type* = 0)
+    {
+      // NOTE MSVC 2008 complains if we directly put bool(NumTraits<T>::IsInteger) as the EIGEN_STATIC_ASSERT argument.
+      const bool is_integer = NumTraits<T>::IsInteger;
+      EIGEN_STATIC_ASSERT(is_integer,
+                          FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
+      resize(size);
+    }
+    
+    // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitly converted)
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const Scalar& val0, typename internal::enable_if<Base::SizeAtCompileTime==1 && internal::is_convertible<T, Scalar>::value,T>::type* = 0)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 1)
+      m_storage.data()[0] = val0;
+    }
+    
+    // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where the argument type is Index and the scalar type is a different type)
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const Index& val0,
+                                    typename internal::enable_if<    (!internal::is_same<Index,Scalar>::value)
+                                                                  && (internal::is_same<Index,T>::value)
+                                                                  && Base::SizeAtCompileTime==1
+                                                                  && internal::is_convertible<T, Scalar>::value,T*>::type* = 0)
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 1)
+      m_storage.data()[0] = Scalar(val0);
+    }
+
+    // Initialize a fixed size matrix from a pointer to raw data
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const Scalar* data){
+      this->_set_noalias(ConstMapType(data));
+    }
+
+    // Initialize an arbitrary matrix from a dense expression
+    template<typename T, typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const DenseBase<OtherDerived>& other){
+      this->_set_noalias(other);
+    }
+
+    // Initialize an arbitrary matrix from a generic Eigen expression
+    template<typename T, typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const EigenBase<OtherDerived>& other){
+      this->derived() = other;
+    }
+
+    template<typename T, typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const ReturnByValue<OtherDerived>& other)
+    {
+      resize(other.rows(), other.cols());
+      other.evalTo(this->derived());
+    }
 
+    template<typename T, typename OtherDerived, int ColsAtCompileTime>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const RotationBase<OtherDerived,ColsAtCompileTime>& r)
+    {
+      this->derived() = r;
+    }
+    
+    // For fixed-size arrays:
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const Scalar& val0,
+                                    typename internal::enable_if<    Base::SizeAtCompileTime!=Dynamic
+                                                                  && Base::SizeAtCompileTime!=1
+                                                                  && internal::is_convertible<T, Scalar>::value
+                                                                  && internal::is_same<typename internal::traits<Derived>::XprKind,ArrayXpr>::value,T>::type* = 0)
+    {
+      Base::setConstant(val0);
+    }
+    
+    template<typename T>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _init1(const Index& val0,
+                                    typename internal::enable_if<    (!internal::is_same<Index,Scalar>::value)
+                                                                  && (internal::is_same<Index,T>::value)
+                                                                  && Base::SizeAtCompileTime!=Dynamic
+                                                                  && Base::SizeAtCompileTime!=1
+                                                                  && internal::is_convertible<T, Scalar>::value
+                                                                  && internal::is_same<typename internal::traits<Derived>::XprKind,ArrayXpr>::value,T*>::type* = 0)
+    {
+      Base::setConstant(val0);
+    }
+    
     template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers>
     friend struct internal::matrix_swap_impl;
 
-    /** \internal generic implementation of swap for dense storage since for dynamic-sized matrices of same type it is enough to swap the
-      * data pointers.
+  public:
+    
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** \internal
+      * \brief Override DenseBase::swap() since for dynamic-sized matrices
+      * of same type it is enough to swap the data pointers.
       */
     template<typename OtherDerived>
-    void _swap(DenseBase<OtherDerived> const & other)
+    EIGEN_DEVICE_FUNC
+    void swap(DenseBase<OtherDerived> & other)
     {
       enum { SwapPointers = internal::is_same<Derived, OtherDerived>::value && Base::SizeAtCompileTime==Dynamic };
-      internal::matrix_swap_impl<Derived, OtherDerived, bool(SwapPointers)>::run(this->derived(), other.const_cast_derived());
+      internal::matrix_swap_impl<Derived, OtherDerived, bool(SwapPointers)>::run(this->derived(), other.derived());
     }
-
-  public:
-#ifndef EIGEN_PARSED_BY_DOXYGEN
+    
+    /** \internal
+      * \brief const version forwarded to DenseBase::swap
+      */
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+    void swap(DenseBase<OtherDerived> const & other)
+    { Base::swap(other.derived()); }
+    
+    EIGEN_DEVICE_FUNC 
     static EIGEN_STRONG_INLINE void _check_template_params()
     {
       EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, (Options&RowMajor)==RowMajor)
@@ -662,16 +870,16 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
                         && (Options & (DontAlign|RowMajor)) == Options),
         INVALID_MATRIX_TEMPLATE_PARAMETERS)
     }
-#endif
 
-private:
-    enum { ThisConstantIsPrivateInPlainObjectBase };
+    enum { IsPlainObjectBase = 1 };
+#endif
 };
 
+namespace internal {
+
 template <typename Derived, typename OtherDerived, bool IsVector>
-struct internal::conservative_resize_like_impl
+struct conservative_resize_like_impl
 {
-  typedef typename Derived::Index Index;
   static void run(DenseBase<Derived>& _this, Index rows, Index cols)
   {
     if (_this.rows() == rows && _this.cols() == cols) return;
@@ -729,12 +937,14 @@ struct internal::conservative_resize_like_impl
   }
 };
 
-namespace internal {
-
+// Here, the specialization for vectors inherits from the general matrix case
+// to allow calling .conservativeResize(rows,cols) on vectors.
 template <typename Derived, typename OtherDerived>
 struct conservative_resize_like_impl<Derived,OtherDerived,true>
+  : conservative_resize_like_impl<Derived,OtherDerived,false>
 {
-  typedef typename Derived::Index Index;
+  using conservative_resize_like_impl<Derived,OtherDerived,false>::run;
+  
   static void run(DenseBase<Derived>& _this, Index size)
   {
     const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : size;
@@ -760,6 +970,7 @@ struct conservative_resize_like_impl<Derived,OtherDerived,true>
 template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers>
 struct matrix_swap_impl
 {
+  EIGEN_DEVICE_FUNC
   static inline void run(MatrixTypeA& a, MatrixTypeB& b)
   {
     a.base().swap(b);
@@ -769,6 +980,7 @@ struct matrix_swap_impl
 template<typename MatrixTypeA, typename MatrixTypeB>
 struct matrix_swap_impl<MatrixTypeA, MatrixTypeB, true>
 {
+  EIGEN_DEVICE_FUNC
   static inline void run(MatrixTypeA& a, MatrixTypeB& b)
   {
     static_cast<typename MatrixTypeA::Base&>(a).m_storage.swap(static_cast<typename MatrixTypeB::Base&>(b).m_storage);
diff --git a/nuparu/include/Eigen/src/Core/Product.h b/nuparu/include/Eigen/src/Core/Product.h
new file mode 100644
index 00000000..fdd2fed3
--- /dev/null
+++ b/nuparu/include/Eigen/src/Core/Product.h
@@ -0,0 +1,222 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PRODUCT_H
+#define EIGEN_PRODUCT_H
+
+namespace Eigen {
+
+template<typename Lhs, typename Rhs, int Option, typename StorageKind> class ProductImpl;
+
+/** \class Product
+  * \ingroup Core_Module
+  *
+  * \brief Expression of the product of two arbitrary matrices or vectors
+  *
+  * \param Lhs the type of the left-hand side expression
+  * \param Rhs the type of the right-hand side expression
+  *
+  * This class represents an expression of the product of two arbitrary matrices.
+  * 
+  * The other template parameters are:
+  * \tparam Option     can be DefaultProduct, AliasFreeProduct, or LazyProduct
+  *
+  */
+
+
+namespace internal {
+
+// Determine the scalar of Product<Lhs, Rhs>. This is normally the same as Lhs::Scalar times
+// Rhs::Scalar, but products with permutations or transpositions inherit the scalar of the other factor.
+template<typename Lhs, typename Rhs, typename LhsShape = typename evaluator_traits<Lhs>::Shape, 
+         typename RhsShape = typename evaluator_traits<Rhs>::Shape >
+struct product_result_scalar
+{
+  typedef typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
+};
+
+template<typename Lhs, typename Rhs, typename RhsShape>
+struct product_result_scalar<Lhs, Rhs, PermutationShape, RhsShape>
+{
+  typedef typename Rhs::Scalar Scalar;
+};
+
+template<typename Lhs, typename Rhs, typename LhsShape>
+  struct product_result_scalar<Lhs, Rhs, LhsShape, PermutationShape>
+{
+  typedef typename Lhs::Scalar Scalar;
+};
+
+template<typename Lhs, typename Rhs, typename RhsShape>
+struct product_result_scalar<Lhs, Rhs, TranspositionsShape, RhsShape>
+{
+  typedef typename Rhs::Scalar Scalar;
+};
+
+template<typename Lhs, typename Rhs, typename LhsShape>
+  struct product_result_scalar<Lhs, Rhs, LhsShape, TranspositionsShape>
+{
+  typedef typename Lhs::Scalar Scalar;
+};
+
+template<typename Lhs, typename Rhs, int Option>
+struct traits<Product<Lhs, Rhs, Option> >
+{
+  typedef typename remove_all<Lhs>::type LhsCleaned;
+  typedef typename remove_all<Rhs>::type RhsCleaned;
+  typedef traits<LhsCleaned> LhsTraits;
+  typedef traits<RhsCleaned> RhsTraits;
+  
+  typedef MatrixXpr XprKind;
+  
+  typedef typename product_result_scalar<LhsCleaned,RhsCleaned>::Scalar Scalar;
+  typedef typename product_promote_storage_type<typename LhsTraits::StorageKind,
+                                                typename RhsTraits::StorageKind,
+                                                internal::product_type<Lhs,Rhs>::ret>::ret StorageKind;
+  typedef typename promote_index_type<typename LhsTraits::StorageIndex,
+                                      typename RhsTraits::StorageIndex>::type StorageIndex;
+  
+  enum {
+    RowsAtCompileTime    = LhsTraits::RowsAtCompileTime,
+    ColsAtCompileTime    = RhsTraits::ColsAtCompileTime,
+    MaxRowsAtCompileTime = LhsTraits::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = RhsTraits::MaxColsAtCompileTime,
+    
+    // FIXME: only needed by GeneralMatrixMatrixTriangular
+    InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(LhsTraits::ColsAtCompileTime, RhsTraits::RowsAtCompileTime),
+    
+    // The storage order is somewhat arbitrary here. The correct one will be determined through the evaluator.
+    Flags = (MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1) ? RowMajorBit
+          : (MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1) ? 0
+          : (   ((LhsTraits::Flags&NoPreferredStorageOrderBit) && (RhsTraits::Flags&RowMajorBit))
+             || ((RhsTraits::Flags&NoPreferredStorageOrderBit) && (LhsTraits::Flags&RowMajorBit)) ) ? RowMajorBit
+          : NoPreferredStorageOrderBit
+  };
+};
+
+} // end namespace internal
+
+
+template<typename _Lhs, typename _Rhs, int Option>
+class Product : public ProductImpl<_Lhs,_Rhs,Option,
+                                   typename internal::product_promote_storage_type<typename internal::traits<_Lhs>::StorageKind,
+                                                                                   typename internal::traits<_Rhs>::StorageKind,
+                                                                                   internal::product_type<_Lhs,_Rhs>::ret>::ret>
+{
+  public:
+    
+    typedef _Lhs Lhs;
+    typedef _Rhs Rhs;
+    
+    typedef typename ProductImpl<
+        Lhs, Rhs, Option,
+        typename internal::product_promote_storage_type<typename internal::traits<Lhs>::StorageKind,
+                                                        typename internal::traits<Rhs>::StorageKind,
+                                                        internal::product_type<Lhs,Rhs>::ret>::ret>::Base Base;
+    EIGEN_GENERIC_PUBLIC_INTERFACE(Product)
+
+    typedef typename internal::ref_selector<Lhs>::type LhsNested;
+    typedef typename internal::ref_selector<Rhs>::type RhsNested;
+    typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
+    typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
+
+    EIGEN_DEVICE_FUNC Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs)
+    {
+      eigen_assert(lhs.cols() == rhs.rows()
+        && "invalid matrix product"
+        && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
+    }
+
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); }
+
+    EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; }
+    EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; }
+
+  protected:
+
+    LhsNested m_lhs;
+    RhsNested m_rhs;
+};
+
+namespace internal {
+  
+template<typename Lhs, typename Rhs, int Option, int ProductTag = internal::product_type<Lhs,Rhs>::ret>
+class dense_product_base
+ : public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
+{};
+
+/** Conversion to scalar for inner-products */
+template<typename Lhs, typename Rhs, int Option>
+class dense_product_base<Lhs, Rhs, Option, InnerProduct>
+ : public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
+{
+  typedef Product<Lhs,Rhs,Option> ProductXpr;
+  typedef typename internal::dense_xpr_base<ProductXpr>::type Base;
+public:
+  using Base::derived;
+  typedef typename Base::Scalar Scalar;
+  
+  operator const Scalar() const
+  {
+    return internal::evaluator<ProductXpr>(derived()).coeff(0,0);
+  }
+};
+
+} // namespace internal
+
+// Generic API dispatcher
+template<typename Lhs, typename Rhs, int Option, typename StorageKind>
+class ProductImpl : public internal::generic_xpr_base<Product<Lhs,Rhs,Option>, MatrixXpr, StorageKind>::type
+{
+  public:
+    typedef typename internal::generic_xpr_base<Product<Lhs,Rhs,Option>, MatrixXpr, StorageKind>::type Base;
+};
+
+template<typename Lhs, typename Rhs, int Option>
+class ProductImpl<Lhs,Rhs,Option,Dense>
+  : public internal::dense_product_base<Lhs,Rhs,Option>
+{
+    typedef Product<Lhs, Rhs, Option> Derived;
+    
+  public:
+    
+    typedef typename internal::dense_product_base<Lhs, Rhs, Option> Base;
+    EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
+  protected:
+    enum {
+      IsOneByOne = (RowsAtCompileTime == 1 || RowsAtCompileTime == Dynamic) && 
+                   (ColsAtCompileTime == 1 || ColsAtCompileTime == Dynamic),
+      EnableCoeff = IsOneByOne || Option==LazyProduct
+    };
+    
+  public:
+  
+    EIGEN_DEVICE_FUNC Scalar coeff(Index row, Index col) const
+    {
+      EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
+      eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
+      
+      return internal::evaluator<Derived>(derived()).coeff(row,col);
+    }
+
+    EIGEN_DEVICE_FUNC Scalar coeff(Index i) const
+    {
+      EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
+      eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
+      
+      return internal::evaluator<Derived>(derived()).coeff(i);
+    }
+    
+  
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_PRODUCT_H
diff --git a/nuparu/include/Eigen/src/Core/ProductBase.h b/nuparu/include/Eigen/src/Core/ProductBase.h
deleted file mode 100644
index a494b5f8..00000000
--- a/nuparu/include/Eigen/src/Core/ProductBase.h
+++ /dev/null
@@ -1,278 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_PRODUCTBASE_H
-#define EIGEN_PRODUCTBASE_H
-
-namespace Eigen { 
-
-/** \class ProductBase
-  * \ingroup Core_Module
-  *
-  */
-
-namespace internal {
-template<typename Derived, typename _Lhs, typename _Rhs>
-struct traits<ProductBase<Derived,_Lhs,_Rhs> >
-{
-  typedef MatrixXpr XprKind;
-  typedef typename remove_all<_Lhs>::type Lhs;
-  typedef typename remove_all<_Rhs>::type Rhs;
-  typedef typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
-  typedef typename promote_storage_type<typename traits<Lhs>::StorageKind,
-                                           typename traits<Rhs>::StorageKind>::ret StorageKind;
-  typedef typename promote_index_type<typename traits<Lhs>::Index,
-                                         typename traits<Rhs>::Index>::type Index;
-  enum {
-    RowsAtCompileTime = traits<Lhs>::RowsAtCompileTime,
-    ColsAtCompileTime = traits<Rhs>::ColsAtCompileTime,
-    MaxRowsAtCompileTime = traits<Lhs>::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = traits<Rhs>::MaxColsAtCompileTime,
-    Flags = (MaxRowsAtCompileTime==1 ? RowMajorBit : 0)
-          | EvalBeforeNestingBit | EvalBeforeAssigningBit | NestByRefBit,
-                  // Note that EvalBeforeNestingBit and NestByRefBit
-                  // are not used in practice because nested is overloaded for products
-    CoeffReadCost = 0 // FIXME why is it needed ?
-  };
-};
-}
-
-#define EIGEN_PRODUCT_PUBLIC_INTERFACE(Derived) \
-  typedef ProductBase<Derived, Lhs, Rhs > Base; \
-  EIGEN_DENSE_PUBLIC_INTERFACE(Derived) \
-  typedef typename Base::LhsNested LhsNested; \
-  typedef typename Base::_LhsNested _LhsNested; \
-  typedef typename Base::LhsBlasTraits LhsBlasTraits; \
-  typedef typename Base::ActualLhsType ActualLhsType; \
-  typedef typename Base::_ActualLhsType _ActualLhsType; \
-  typedef typename Base::RhsNested RhsNested; \
-  typedef typename Base::_RhsNested _RhsNested; \
-  typedef typename Base::RhsBlasTraits RhsBlasTraits; \
-  typedef typename Base::ActualRhsType ActualRhsType; \
-  typedef typename Base::_ActualRhsType _ActualRhsType; \
-  using Base::m_lhs; \
-  using Base::m_rhs;
-
-template<typename Derived, typename Lhs, typename Rhs>
-class ProductBase : public MatrixBase<Derived>
-{
-  public:
-    typedef MatrixBase<Derived> Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(ProductBase)
-    
-    typedef typename Lhs::Nested LhsNested;
-    typedef typename internal::remove_all<LhsNested>::type _LhsNested;
-    typedef internal::blas_traits<_LhsNested> LhsBlasTraits;
-    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
-    typedef typename internal::remove_all<ActualLhsType>::type _ActualLhsType;
-    typedef typename internal::traits<Lhs>::Scalar LhsScalar;
-
-    typedef typename Rhs::Nested RhsNested;
-    typedef typename internal::remove_all<RhsNested>::type _RhsNested;
-    typedef internal::blas_traits<_RhsNested> RhsBlasTraits;
-    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
-    typedef typename internal::remove_all<ActualRhsType>::type _ActualRhsType;
-    typedef typename internal::traits<Rhs>::Scalar RhsScalar;
-
-    // Diagonal of a product: no need to evaluate the arguments because they are going to be evaluated only once
-    typedef CoeffBasedProduct<LhsNested, RhsNested, 0> FullyLazyCoeffBaseProductType;
-
-  public:
-
-    typedef typename Base::PlainObject PlainObject;
-
-    ProductBase(const Lhs& a_lhs, const Rhs& a_rhs)
-      : m_lhs(a_lhs), m_rhs(a_rhs)
-    {
-      eigen_assert(a_lhs.cols() == a_rhs.rows()
-        && "invalid matrix product"
-        && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
-    }
-
-    inline Index rows() const { return m_lhs.rows(); }
-    inline Index cols() const { return m_rhs.cols(); }
-
-    template<typename Dest>
-    inline void evalTo(Dest& dst) const { dst.setZero(); scaleAndAddTo(dst,Scalar(1)); }
-
-    template<typename Dest>
-    inline void addTo(Dest& dst) const { scaleAndAddTo(dst,Scalar(1)); }
-
-    template<typename Dest>
-    inline void subTo(Dest& dst) const { scaleAndAddTo(dst,Scalar(-1)); }
-
-    template<typename Dest>
-    inline void scaleAndAddTo(Dest& dst, const Scalar& alpha) const { derived().scaleAndAddTo(dst,alpha); }
-
-    const _LhsNested& lhs() const { return m_lhs; }
-    const _RhsNested& rhs() const { return m_rhs; }
-
-    // Implicit conversion to the nested type (trigger the evaluation of the product)
-    operator const PlainObject& () const
-    {
-      m_result.resize(m_lhs.rows(), m_rhs.cols());
-      derived().evalTo(m_result);
-      return m_result;
-    }
-
-    const Diagonal<const FullyLazyCoeffBaseProductType,0> diagonal() const
-    { return FullyLazyCoeffBaseProductType(m_lhs, m_rhs); }
-
-    template<int Index>
-    const Diagonal<FullyLazyCoeffBaseProductType,Index> diagonal() const
-    { return FullyLazyCoeffBaseProductType(m_lhs, m_rhs); }
-
-    const Diagonal<FullyLazyCoeffBaseProductType,Dynamic> diagonal(Index index) const
-    { return FullyLazyCoeffBaseProductType(m_lhs, m_rhs).diagonal(index); }
-
-    // restrict coeff accessors to 1x1 expressions. No need to care about mutators here since this isnt a Lvalue expression
-    typename Base::CoeffReturnType coeff(Index row, Index col) const
-    {
-#ifdef EIGEN2_SUPPORT
-      return lhs().row(row).cwiseProduct(rhs().col(col).transpose()).sum();
-#else
-      EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
-      eigen_assert(this->rows() == 1 && this->cols() == 1);
-      Matrix<Scalar,1,1> result = *this;
-      return result.coeff(row,col);
-#endif
-    }
-
-    typename Base::CoeffReturnType coeff(Index i) const
-    {
-      EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
-      eigen_assert(this->rows() == 1 && this->cols() == 1);
-      Matrix<Scalar,1,1> result = *this;
-      return result.coeff(i);
-    }
-
-    const Scalar& coeffRef(Index row, Index col) const
-    {
-      EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
-      eigen_assert(this->rows() == 1 && this->cols() == 1);
-      return derived().coeffRef(row,col);
-    }
-
-    const Scalar& coeffRef(Index i) const
-    {
-      EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
-      eigen_assert(this->rows() == 1 && this->cols() == 1);
-      return derived().coeffRef(i);
-    }
-
-  protected:
-
-    LhsNested m_lhs;
-    RhsNested m_rhs;
-
-    mutable PlainObject m_result;
-};
-
-// here we need to overload the nested rule for products
-// such that the nested type is a const reference to a plain matrix
-namespace internal {
-template<typename Lhs, typename Rhs, int Mode, int N, typename PlainObject>
-struct nested<GeneralProduct<Lhs,Rhs,Mode>, N, PlainObject>
-{
-  typedef PlainObject const& type;
-};
-}
-
-template<typename NestedProduct>
-class ScaledProduct;
-
-// Note that these two operator* functions are not defined as member
-// functions of ProductBase, because, otherwise we would have to
-// define all overloads defined in MatrixBase. Furthermore, Using
-// "using Base::operator*" would not work with MSVC.
-//
-// Also note that here we accept any compatible scalar types
-template<typename Derived,typename Lhs,typename Rhs>
-const ScaledProduct<Derived>
-operator*(const ProductBase<Derived,Lhs,Rhs>& prod, const typename Derived::Scalar& x)
-{ return ScaledProduct<Derived>(prod.derived(), x); }
-
-template<typename Derived,typename Lhs,typename Rhs>
-typename internal::enable_if<!internal::is_same<typename Derived::Scalar,typename Derived::RealScalar>::value,
-                      const ScaledProduct<Derived> >::type
-operator*(const ProductBase<Derived,Lhs,Rhs>& prod, const typename Derived::RealScalar& x)
-{ return ScaledProduct<Derived>(prod.derived(), x); }
-
-
-template<typename Derived,typename Lhs,typename Rhs>
-const ScaledProduct<Derived>
-operator*(const typename Derived::Scalar& x,const ProductBase<Derived,Lhs,Rhs>& prod)
-{ return ScaledProduct<Derived>(prod.derived(), x); }
-
-template<typename Derived,typename Lhs,typename Rhs>
-typename internal::enable_if<!internal::is_same<typename Derived::Scalar,typename Derived::RealScalar>::value,
-                      const ScaledProduct<Derived> >::type
-operator*(const typename Derived::RealScalar& x,const ProductBase<Derived,Lhs,Rhs>& prod)
-{ return ScaledProduct<Derived>(prod.derived(), x); }
-
-namespace internal {
-template<typename NestedProduct>
-struct traits<ScaledProduct<NestedProduct> >
- : traits<ProductBase<ScaledProduct<NestedProduct>,
-                         typename NestedProduct::_LhsNested,
-                         typename NestedProduct::_RhsNested> >
-{
-  typedef typename traits<NestedProduct>::StorageKind StorageKind;
-};
-}
-
-template<typename NestedProduct>
-class ScaledProduct
-  : public ProductBase<ScaledProduct<NestedProduct>,
-                       typename NestedProduct::_LhsNested,
-                       typename NestedProduct::_RhsNested>
-{
-  public:
-    typedef ProductBase<ScaledProduct<NestedProduct>,
-                       typename NestedProduct::_LhsNested,
-                       typename NestedProduct::_RhsNested> Base;
-    typedef typename Base::Scalar Scalar;
-    typedef typename Base::PlainObject PlainObject;
-//     EIGEN_PRODUCT_PUBLIC_INTERFACE(ScaledProduct)
-
-    ScaledProduct(const NestedProduct& prod, const Scalar& x)
-    : Base(prod.lhs(),prod.rhs()), m_prod(prod), m_alpha(x) {}
-
-    template<typename Dest>
-    inline void evalTo(Dest& dst) const { dst.setZero(); scaleAndAddTo(dst, Scalar(1)); }
-
-    template<typename Dest>
-    inline void addTo(Dest& dst) const { scaleAndAddTo(dst, Scalar(1)); }
-
-    template<typename Dest>
-    inline void subTo(Dest& dst) const { scaleAndAddTo(dst, Scalar(-1)); }
-
-    template<typename Dest>
-    inline void scaleAndAddTo(Dest& dst, const Scalar& a_alpha) const { m_prod.derived().scaleAndAddTo(dst,a_alpha * m_alpha); }
-
-    const Scalar& alpha() const { return m_alpha; }
-    
-  protected:
-    const NestedProduct& m_prod;
-    Scalar m_alpha;
-};
-
-/** \internal
-  * Overloaded to perform an efficient C = (A*B).lazy() */
-template<typename Derived>
-template<typename ProductDerived, typename Lhs, typename Rhs>
-Derived& MatrixBase<Derived>::lazyAssign(const ProductBase<ProductDerived, Lhs,Rhs>& other)
-{
-  other.derived().evalTo(derived());
-  return derived();
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_PRODUCTBASE_H
diff --git a/nuparu/include/Eigen/src/Core/ProductEvaluators.h b/nuparu/include/Eigen/src/Core/ProductEvaluators.h
new file mode 100644
index 00000000..794038a2
--- /dev/null
+++ b/nuparu/include/Eigen/src/Core/ProductEvaluators.h
@@ -0,0 +1,1061 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_PRODUCTEVALUATORS_H
+#define EIGEN_PRODUCTEVALUATORS_H
+
+namespace Eigen {
+  
+namespace internal {
+
+/** \internal
+  * Evaluator of a product expression.
+  * Since products require special treatments to handle all possible cases,
+  * we simply defer the evaluation logic to a product_evaluator class
+  * which offers more partial specialization possibilities.
+  * 
+  * \sa class product_evaluator
+  */
+template<typename Lhs, typename Rhs, int Options>
+struct evaluator<Product<Lhs, Rhs, Options> > 
+ : public product_evaluator<Product<Lhs, Rhs, Options> >
+{
+  typedef Product<Lhs, Rhs, Options> XprType;
+  typedef product_evaluator<XprType> Base;
+  
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
+};
+ 
+// Catch scalar * ( A * B ) and transform it to (A*scalar) * B
+// TODO we should apply that rule only if that's really helpful
+template<typename Lhs, typename Rhs, typename Scalar>
+struct evaluator_traits<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,  const Product<Lhs, Rhs, DefaultProduct>  > >
+ : evaluator_traits_base<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,  const Product<Lhs, Rhs, DefaultProduct>  > >
+{
+  enum { AssumeAliasing = 1 };
+};
+template<typename Lhs, typename Rhs, typename Scalar>
+struct evaluator<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,  const Product<Lhs, Rhs, DefaultProduct>  > > 
+ : public evaluator<Product<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,const Lhs>, Rhs, DefaultProduct> >
+{
+  typedef CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Product<Lhs, Rhs, DefaultProduct> > XprType;
+  typedef evaluator<Product<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,const Lhs>, Rhs, DefaultProduct> > Base;
+  
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
+    : Base(xpr.functor().m_other * xpr.nestedExpression().lhs() * xpr.nestedExpression().rhs())
+  {}
+};
+
+
+template<typename Lhs, typename Rhs, int DiagIndex>
+struct evaluator<Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> > 
+ : public evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> >
+{
+  typedef Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> XprType;
+  typedef evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> > Base;
+  
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
+    : Base(Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>(
+        Product<Lhs, Rhs, LazyProduct>(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()),
+        xpr.index() ))
+  {}
+};
+
+
+// Helper class to perform a matrix product with the destination at hand.
+// Depending on the sizes of the factors, there are different evaluation strategies
+// as controlled by internal::product_type.
+template< typename Lhs, typename Rhs,
+          typename LhsShape = typename evaluator_traits<Lhs>::Shape,
+          typename RhsShape = typename evaluator_traits<Rhs>::Shape,
+          int ProductType = internal::product_type<Lhs,Rhs>::value>
+struct generic_product_impl;
+
+template<typename Lhs, typename Rhs>
+struct evaluator_traits<Product<Lhs, Rhs, DefaultProduct> > 
+ : evaluator_traits_base<Product<Lhs, Rhs, DefaultProduct> >
+{
+  enum { AssumeAliasing = 1 };
+};
+
+template<typename Lhs, typename Rhs>
+struct evaluator_traits<Product<Lhs, Rhs, AliasFreeProduct> > 
+ : evaluator_traits_base<Product<Lhs, Rhs, AliasFreeProduct> >
+{
+  enum { AssumeAliasing = 0 };
+};
+
+// This is the default evaluator implementation for products:
+// It creates a temporary and calls generic_product_impl
+template<typename Lhs, typename Rhs, int Options, int ProductTag, typename LhsShape, typename RhsShape>
+struct product_evaluator<Product<Lhs, Rhs, Options>, ProductTag, LhsShape, RhsShape>
+  : public evaluator<typename Product<Lhs, Rhs, Options>::PlainObject>
+{
+  typedef Product<Lhs, Rhs, Options> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
+  enum {
+    Flags = Base::Flags | EvalBeforeNestingBit
+  };
+
+  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
+    : m_result(xpr.rows(), xpr.cols())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+    
+// FIXME shall we handle nested_eval here?,
+// if so, then we must take care to remove the call to nested_eval in the specializations (e.g., in permutation_matrix_product, transposition_matrix_product, etc.)
+//     typedef typename internal::nested_eval<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
+//     typedef typename internal::nested_eval<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
+//     typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
+//     typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
+//     
+//     const LhsNested lhs(xpr.lhs());
+//     const RhsNested rhs(xpr.rhs());
+//   
+//     generic_product_impl<LhsNestedCleaned, RhsNestedCleaned>::evalTo(m_result, lhs, rhs);
+
+    generic_product_impl<Lhs, Rhs, LhsShape, RhsShape, ProductTag>::evalTo(m_result, xpr.lhs(), xpr.rhs());
+  }
+  
+protected:  
+  PlainObject m_result;
+};
+
+// Dense = Product
+template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scalar>, Dense2Dense,
+  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct),Scalar>::type>
+{
+  typedef Product<Lhs,Rhs,Options> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  {
+    // FIXME shall we handle nested_eval here?
+    generic_product_impl<Lhs, Rhs>::evalTo(dst, src.lhs(), src.rhs());
+  }
+};
+
+// Dense += Product
+template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<Scalar>, Dense2Dense,
+  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct),Scalar>::type>
+{
+  typedef Product<Lhs,Rhs,Options> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar> &)
+  {
+    // FIXME shall we handle nested_eval here?
+    generic_product_impl<Lhs, Rhs>::addTo(dst, src.lhs(), src.rhs());
+  }
+};
+
+// Dense -= Product
+template< typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::sub_assign_op<Scalar>, Dense2Dense,
+  typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct),Scalar>::type>
+{
+  typedef Product<Lhs,Rhs,Options> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar> &)
+  {
+    // FIXME shall we handle nested_eval here?
+    generic_product_impl<Lhs, Rhs>::subTo(dst, src.lhs(), src.rhs());
+  }
+};
+
+
+// Dense ?= scalar * Product
+// TODO we should apply that rule only if that's really helpful
+// for instance, this is not good for inner products
+template< typename DstXprType, typename Lhs, typename Rhs, typename AssignFunc, typename Scalar, typename ScalarBis>
+struct Assignment<DstXprType, CwiseUnaryOp<internal::scalar_multiple_op<ScalarBis>,
+                                           const Product<Lhs,Rhs,DefaultProduct> >, AssignFunc, Dense2Dense, Scalar>
+{
+  typedef CwiseUnaryOp<internal::scalar_multiple_op<ScalarBis>,
+                       const Product<Lhs,Rhs,DefaultProduct> > SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func)
+  {
+    call_assignment_no_alias(dst, (src.functor().m_other * src.nestedExpression().lhs())*src.nestedExpression().rhs(), func);
+  }
+};
+
+//----------------------------------------
+// Catch "Dense ?= xpr + Product<>" expression to save one temporary
+// FIXME we could probably enable these rules for any product, i.e., not only Dense and DefaultProduct
+
+template<typename DstXprType, typename OtherXpr, typename ProductType, typename Scalar, typename Func1, typename Func2>
+struct assignment_from_xpr_plus_product
+{
+  typedef CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const OtherXpr, const ProductType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const Func1& func)
+  {
+    call_assignment_no_alias(dst, src.lhs(), func);
+    call_assignment_no_alias(dst, src.rhs(), Func2());
+  }
+};
+
+template< typename DstXprType, typename OtherXpr, typename Lhs, typename Rhs, typename Scalar>
+struct Assignment<DstXprType, CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const OtherXpr,
+                                           const Product<Lhs,Rhs,DefaultProduct> >, internal::assign_op<Scalar>, Dense2Dense>
+  : assignment_from_xpr_plus_product<DstXprType, OtherXpr, Product<Lhs,Rhs,DefaultProduct>, Scalar, internal::assign_op<Scalar>, internal::add_assign_op<Scalar> >
+{};
+template< typename DstXprType, typename OtherXpr, typename Lhs, typename Rhs, typename Scalar>
+struct Assignment<DstXprType, CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const OtherXpr,
+                                           const Product<Lhs,Rhs,DefaultProduct> >, internal::add_assign_op<Scalar>, Dense2Dense>
+  : assignment_from_xpr_plus_product<DstXprType, OtherXpr, Product<Lhs,Rhs,DefaultProduct>, Scalar, internal::add_assign_op<Scalar>, internal::add_assign_op<Scalar> >
+{};
+template< typename DstXprType, typename OtherXpr, typename Lhs, typename Rhs, typename Scalar>
+struct Assignment<DstXprType, CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const OtherXpr,
+                                           const Product<Lhs,Rhs,DefaultProduct> >, internal::sub_assign_op<Scalar>, Dense2Dense>
+  : assignment_from_xpr_plus_product<DstXprType, OtherXpr, Product<Lhs,Rhs,DefaultProduct>, Scalar, internal::sub_assign_op<Scalar>, internal::sub_assign_op<Scalar> >
+{};
+//----------------------------------------
+
+template<typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
+{
+  template<typename Dst>
+  static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum();
+  }
+  
+  template<typename Dst>
+  static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum();
+  }
+  
+  template<typename Dst>
+  static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  { dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); }
+};
+
+
+/***********************************************************************
+*  Implementation of outer dense * dense vector product
+***********************************************************************/
+
+// Column major result
+template<typename Dst, typename Lhs, typename Rhs, typename Func>
+EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
+{
+  evaluator<Rhs> rhsEval(rhs);
+  typename nested_eval<Lhs,Rhs::SizeAtCompileTime>::type actual_lhs(lhs);
+  // FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored
+  // FIXME not very good if rhs is real and lhs complex while alpha is real too
+  const Index cols = dst.cols();
+  for (Index j=0; j<cols; ++j)
+    func(dst.col(j), rhsEval.coeff(0,j) * actual_lhs);
+}
+
+// Row major result
+template<typename Dst, typename Lhs, typename Rhs, typename Func>
+EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
+{
+  evaluator<Lhs> lhsEval(lhs);
+  typename nested_eval<Rhs,Lhs::SizeAtCompileTime>::type actual_rhs(rhs);
+  // FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored
+  // FIXME not very good if lhs is real and rhs complex while alpha is real too
+  const Index rows = dst.rows();
+  for (Index i=0; i<rows; ++i)
+    func(dst.row(i), lhsEval.coeff(i,0) * actual_rhs);
+}
+
+template<typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,OuterProduct>
+{
+  template<typename T> struct is_row_major : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {};
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  
+  // TODO it would be nice to be able to exploit our *_assign_op functors for that purpose
+  struct set  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived()  = src; } };
+  struct add  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
+  struct sub  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
+  struct adds {
+    Scalar m_scale;
+    explicit adds(const Scalar& s) : m_scale(s) {}
+    template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const {
+      dst.const_cast_derived() += m_scale * src;
+    }
+  };
+  
+  template<typename Dst>
+  static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major<Dst>());
+  }
+  
+  template<typename Dst>
+  static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>());
+  }
+  
+  template<typename Dst>
+  static inline void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>());
+  }
+  
+  template<typename Dst>
+  static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  {
+    internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>());
+  }
+  
+};
+
+
+// This base class provides default implementations for evalTo, addTo, subTo, in terms of scaleAndAddTo
+template<typename Lhs, typename Rhs, typename Derived>
+struct generic_product_impl_base
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  
+  template<typename Dst>
+  static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  { dst.setZero(); scaleAndAddTo(dst, lhs, rhs, Scalar(1)); }
+
+  template<typename Dst>
+  static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  { scaleAndAddTo(dst,lhs, rhs, Scalar(1)); }
+
+  template<typename Dst>
+  static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  { scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); }
+  
+  template<typename Dst>
+  static void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  { Derived::scaleAndAddTo(dst,lhs,rhs,alpha); }
+
+};
+
+template<typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct>
+  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct> >
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  enum { Side = Lhs::IsVectorAtCompileTime ? OnTheLeft : OnTheRight };
+  typedef typename internal::conditional<int(Side)==OnTheRight,Lhs,Rhs>::type MatrixType;
+
+  template<typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  {
+    internal::gemv_dense_selector<Side,
+                            (int(MatrixType::Flags)&RowMajorBit) ? RowMajor : ColMajor,
+                            bool(internal::blas_traits<MatrixType>::HasUsableDirectAccess)
+                           >::run(lhs, rhs, dst, alpha);
+  }
+};
+
+template<typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode> 
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  
+  template<typename Dst>
+  static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    // Same as: dst.noalias() = lhs.lazyProduct(rhs);
+    // but easier on the compiler side
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<Scalar>());
+  }
+  
+  template<typename Dst>
+  static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    // dst.noalias() += lhs.lazyProduct(rhs);
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op<Scalar>());
+  }
+  
+  template<typename Dst>
+  static inline void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    // dst.noalias() -= lhs.lazyProduct(rhs);
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<Scalar>());
+  }
+  
+//   template<typename Dst>
+//   static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+//   { dst.noalias() += alpha * lhs.lazyProduct(rhs); }
+};
+
+// This specialization enforces the use of a coefficient-based evaluation strategy
+template<typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,LazyCoeffBasedProductMode>
+  : generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode> {};
+
+// Case 2: Evaluate coeff by coeff
+//
+// This is mostly taken from CoeffBasedProduct.h
+// The main difference is that we add an extra argument to the etor_product_*_impl::run() function
+// for the inner dimension of the product, because evaluator objects do not know their size.
+
+template<int Traversal, int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
+struct etor_product_coeff_impl;
+
+template<int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl;
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape, DenseShape>
+    : evaluator_base<Product<Lhs, Rhs, LazyProduct> >
+{
+  typedef Product<Lhs, Rhs, LazyProduct> XprType;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename XprType::PacketScalar PacketScalar;
+  typedef typename XprType::PacketReturnType PacketReturnType;
+
+  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
+    : m_lhs(xpr.lhs()),
+      m_rhs(xpr.rhs()),
+      m_lhsImpl(m_lhs),     // FIXME the creation of the evaluator objects should result in a no-op, but check that!
+      m_rhsImpl(m_rhs),     //       Moreover, they are only useful for the packet path, so we could completely disable them when not needed,
+                            //       or perhaps declare them on the fly on the packet method... We have to experiment to check what's best.
+      m_innerDim(xpr.lhs().cols())
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::MulCost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::AddCost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  // Everything below here is taken from CoeffBasedProduct.h
+
+  typedef typename internal::nested_eval<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
+  typedef typename internal::nested_eval<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
+  
+  typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
+  typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
+
+  typedef evaluator<LhsNestedCleaned> LhsEtorType;
+  typedef evaluator<RhsNestedCleaned> RhsEtorType;
+  
+  enum {
+    RowsAtCompileTime = LhsNestedCleaned::RowsAtCompileTime,
+    ColsAtCompileTime = RhsNestedCleaned::ColsAtCompileTime,
+    InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(LhsNestedCleaned::ColsAtCompileTime, RhsNestedCleaned::RowsAtCompileTime),
+    MaxRowsAtCompileTime = LhsNestedCleaned::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = RhsNestedCleaned::MaxColsAtCompileTime,
+      
+    PacketSize = packet_traits<Scalar>::size,
+
+    LhsCoeffReadCost = LhsEtorType::CoeffReadCost,
+    RhsCoeffReadCost = RhsEtorType::CoeffReadCost,
+    CoeffReadCost = InnerSize==0 ? NumTraits<Scalar>::ReadCost
+                  : InnerSize == Dynamic ? HugeCost
+                  : InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost)
+                    + (InnerSize - 1) * NumTraits<Scalar>::AddCost,
+
+    Unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
+    
+    LhsFlags = LhsEtorType::Flags,
+    RhsFlags = RhsEtorType::Flags,
+    
+    LhsAlignment = LhsEtorType::Alignment,
+    RhsAlignment = RhsEtorType::Alignment,
+    
+    LhsRowMajor = LhsFlags & RowMajorBit,
+    RhsRowMajor = RhsFlags & RowMajorBit,
+      
+    SameType = is_same<typename LhsNestedCleaned::Scalar,typename RhsNestedCleaned::Scalar>::value,
+
+    CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit)
+                    && (ColsAtCompileTime == Dynamic || ((ColsAtCompileTime % PacketSize) == 0) ),
+
+    CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit)
+                    && (RowsAtCompileTime == Dynamic || ((RowsAtCompileTime % PacketSize) == 0) ),
+
+    EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
+                    : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
+                    : (RhsRowMajor && !CanVectorizeLhs),
+
+    Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit)
+          | (EvalToRowMajor ? RowMajorBit : 0)
+          // TODO enable vectorization for mixed types
+          | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0)
+          | (XprType::IsVectorAtCompileTime ? LinearAccessBit : 0),
+          
+    LhsOuterStrideBytes = int(LhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename LhsNestedCleaned::Scalar)),
+    RhsOuterStrideBytes = int(RhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename RhsNestedCleaned::Scalar)),
+
+    Alignment = CanVectorizeLhs ? (LhsOuterStrideBytes<0 || (int(LhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,LhsAlignment))!=0 ? 0 : LhsAlignment)
+              : CanVectorizeRhs ? (RhsOuterStrideBytes<0 || (int(RhsOuterStrideBytes) % EIGEN_PLAIN_ENUM_MAX(1,RhsAlignment))!=0 ? 0 : RhsAlignment)
+              : 0,
+
+    /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
+    * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
+    * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
+    * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI.
+    */
+    CanVectorizeInner =    SameType
+                        && LhsRowMajor
+                        && (!RhsRowMajor)
+                        && (LhsFlags & RhsFlags & ActualPacketAccessBit)
+                        && (InnerSize % packet_traits<Scalar>::size == 0)
+  };
+  
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index row, Index col) const
+  {
+    return (m_lhs.row(row).transpose().cwiseProduct( m_rhs.col(col) )).sum();
+  }
+
+  /* Allow index-based non-packet access. It is impossible though to allow index-based packet access,
+   * which is why we don't set the LinearAccessBit.
+   * TODO: this seems possible when the result is a vector
+   */
+  EIGEN_DEVICE_FUNC const CoeffReturnType coeff(Index index) const
+  {
+    const Index row = RowsAtCompileTime == 1 ? 0 : index;
+    const Index col = RowsAtCompileTime == 1 ? index : 0;
+    return (m_lhs.row(row).transpose().cwiseProduct( m_rhs.col(col) )).sum();
+  }
+
+  template<int LoadMode, typename PacketType>
+  const PacketType packet(Index row, Index col) const
+  {
+    PacketType res;
+    typedef etor_product_packet_impl<bool(int(Flags)&RowMajorBit) ? RowMajor : ColMajor,
+                                     Unroll ? int(InnerSize) : Dynamic,
+                                     LhsEtorType, RhsEtorType, PacketType, LoadMode> PacketImpl;
+    PacketImpl::run(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res);
+    return res;
+  }
+
+  template<int LoadMode, typename PacketType>
+  const PacketType packet(Index index) const
+  {
+    const Index row = RowsAtCompileTime == 1 ? 0 : index;
+    const Index col = RowsAtCompileTime == 1 ? index : 0;
+    return packet<LoadMode,PacketType>(row,col);
+  }
+
+protected:
+  const LhsNested m_lhs;
+  const RhsNested m_rhs;
+  
+  LhsEtorType m_lhsImpl;
+  RhsEtorType m_rhsImpl;
+
+  // TODO: Get rid of m_innerDim if known at compile time
+  Index m_innerDim;
+};
+
+template<typename Lhs, typename Rhs>
+struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, LazyCoeffBasedProductMode, DenseShape, DenseShape>
+  : product_evaluator<Product<Lhs, Rhs, LazyProduct>, CoeffBasedProductMode, DenseShape, DenseShape>
+{
+  typedef Product<Lhs, Rhs, DefaultProduct> XprType;
+  typedef Product<Lhs, Rhs, LazyProduct> BaseProduct;
+  typedef product_evaluator<BaseProduct, CoeffBasedProductMode, DenseShape, DenseShape> Base;
+  enum {
+    Flags = Base::Flags | EvalBeforeNestingBit
+  };
+  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
+    : Base(BaseProduct(xpr.lhs(),xpr.rhs()))
+  {}
+};
+
+/****************************************
+*** Coeff based product, Packet path  ***
+****************************************/
+
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
+{
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
+  {
+    etor_product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
+    res =  pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex-1)), rhs.template packet<LoadMode,Packet>(UnrollingIndex-1, col), res);
+  }
+};
+
+template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
+{
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet &res)
+  {
+    etor_product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, innerDim, res);
+    res =  pmadd(lhs.template packet<LoadMode,Packet>(row, UnrollingIndex-1), pset1<Packet>(rhs.coeff(UnrollingIndex-1, col)), res);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode>
+{
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
+  {
+    res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode,Packet>(0, col));
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode>
+{
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index /*innerDim*/, Packet &res)
+  {
+    res = pmul(lhs.template packet<LoadMode,Packet>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
+{
+  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
+  {
+    res = pset1<Packet>(0);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
+{
+  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet &res)
+  {
+    res = pset1<Packet>(0);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
+{
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
+  {
+    res = pset1<Packet>(0);
+    for(Index i = 0; i < innerDim; ++i)
+      res =  pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode,Packet>(i, col), res);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
+{
+  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res)
+  {
+    res = pset1<Packet>(0);
+    for(Index i = 0; i < innerDim; ++i)
+      res =  pmadd(lhs.template packet<LoadMode,Packet>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
+  }
+};
+
+
+/***************************************************************************
+* Triangular products
+***************************************************************************/
+template<int Mode, bool LhsIsTriangular,
+         typename Lhs, bool LhsIsVector,
+         typename Rhs, bool RhsIsVector>
+struct triangular_product_impl;
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs,Rhs,TriangularShape,DenseShape,ProductTag>
+  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,TriangularShape,DenseShape,ProductTag> >
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  
+  template<typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  {
+    triangular_product_impl<Lhs::Mode,true,typename Lhs::MatrixType,false,Rhs, Rhs::ColsAtCompileTime==1>
+        ::run(dst, lhs.nestedExpression(), rhs, alpha);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs,Rhs,DenseShape,TriangularShape,ProductTag>
+: generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,TriangularShape,ProductTag> >
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  
+  template<typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  {
+    triangular_product_impl<Rhs::Mode,false,Lhs,Lhs::RowsAtCompileTime==1, typename Rhs::MatrixType, false>::run(dst, lhs, rhs.nestedExpression(), alpha);
+  }
+};
+
+
+/***************************************************************************
+* SelfAdjoint products
+***************************************************************************/
+template <typename Lhs, int LhsMode, bool LhsIsVector,
+          typename Rhs, int RhsMode, bool RhsIsVector>
+struct selfadjoint_product_impl;
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag>
+  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag> >
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  
+  template<typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  {
+    selfadjoint_product_impl<typename Lhs::MatrixType,Lhs::Mode,false,Rhs,0,Rhs::IsVectorAtCompileTime>::run(dst, lhs.nestedExpression(), rhs, alpha);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs,Rhs,DenseShape,SelfAdjointShape,ProductTag> // dispatcher for dense * self-adjoint products
+: generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,SelfAdjointShape,ProductTag> >
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  // Forwards to selfadjoint_product_impl::run(): the dense lhs is passed with mode 0 and its IsVectorAtCompileTime flag, the rhs contributes its wrapped expression (rhs.nestedExpression()) and Rhs::Mode.
+  template<typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) // NOTE(review): presumably accumulates alpha * (lhs * rhs) into dst - confirm in selfadjoint_product_impl
+  {
+    selfadjoint_product_impl<Lhs,0,Lhs::IsVectorAtCompileTime,typename Rhs::MatrixType,Rhs::Mode,false>::run(dst, lhs, rhs.nestedExpression(), alpha);
+  }
+};
+
+
+/***************************************************************************
+* Diagonal products
+***************************************************************************/
+  
+template<typename MatrixType, typename DiagonalType, typename Derived, int ProductOrder>
+struct diagonal_product_evaluator_base // shared evaluator base for products of a dense expression (MatrixType) with diagonal coefficients (DiagonalType); ProductOrder is OnTheLeft or OnTheRight
+  : evaluator_base<Derived>
+{
+  typedef typename scalar_product_traits<typename MatrixType::Scalar, typename DiagonalType::Scalar>::ReturnType Scalar;
+public:
+  enum {
+    CoeffReadCost = NumTraits<Scalar>::MulCost + evaluator<MatrixType>::CoeffReadCost + evaluator<DiagonalType>::CoeffReadCost,
+    // Evaluator flags of both operands; the storage order is taken from the dense operand.
+    MatrixFlags = evaluator<MatrixType>::Flags,
+    DiagFlags = evaluator<DiagonalType>::Flags,
+    _StorageOrder = MatrixFlags & RowMajorBit ? RowMajor : ColMajor,
+    _ScalarAccessOnDiag =  !((int(_StorageOrder) == ColMajor && int(ProductOrder) == OnTheLeft)
+                           ||(int(_StorageOrder) == RowMajor && int(ProductOrder) == OnTheRight)),
+    _SameTypes = is_same<typename MatrixType::Scalar, typename DiagonalType::Scalar>::value,
+    // FIXME currently we need same types, but in the future the next rule should be the one
+    //_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagFlags)&PacketAccessBit))),
+    _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
+    _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0,
+    Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0),
+    Alignment = evaluator<MatrixType>::Alignment
+  };
+  // Builds the evaluators of both operands. Marked EIGEN_DEVICE_FUNC because the derived evaluators' EIGEN_DEVICE_FUNC constructors call it.
+  EIGEN_DEVICE_FUNC diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
+    : m_diagImpl(diag), m_matImpl(mat)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::MulCost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+  // Linear-index access: product of the idx-th diagonal coefficient and the idx-th coefficient of the dense operand.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
+  {
+    return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
+  }
+  // Packet helpers for the derived evaluators; the last (tag) argument selects how the diagonal is read.
+protected:
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::true_type) const
+  {
+    return internal::pmul(m_matImpl.template packet<LoadMode,PacketType>(row, col),
+                          internal::pset1<PacketType>(m_diagImpl.coeff(id)));
+  }
+  // false_type variant: instead of broadcasting one coefficient (pset1 above), a whole packet is loaded from the diagonal starting at id.
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::false_type) const
+  {
+    enum {
+      InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime,
+      DiagonalPacketLoadMode = EIGEN_PLAIN_ENUM_MIN(LoadMode,((InnerSize%16) == 0) ? int(Aligned16) : int(evaluator<DiagonalType>::Alignment)) // FIXME hardcoded 16!!
+    };
+    return internal::pmul(m_matImpl.template packet<LoadMode,PacketType>(row, col),
+                          m_diagImpl.template packet<DiagonalPacketLoadMode,PacketType>(id));
+  }
+  // Evaluators of the diagonal coefficients and of the dense operand.
+  evaluator<DiagonalType> m_diagImpl;
+  evaluator<MatrixType>   m_matImpl;
+};
+
+// diagonal * dense: coefficient (row,col) of the result is diag(row) * rhs(row,col)
+template<typename Lhs, typename Rhs, int ProductKind, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalShape, DenseShape>
+  : diagonal_product_evaluator_base<Rhs, typename Lhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheLeft>
+{
+  typedef diagonal_product_evaluator_base<Rhs, typename Lhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheLeft> Base;
+  using Base::m_diagImpl;
+  using Base::m_matImpl;
+  using Base::coeff;
+  typedef typename Base::Scalar Scalar;
+  
+  typedef Product<Lhs, Rhs, ProductKind> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+  // Storage order of the dense operand (rhs); it selects the packet_impl overload used by packet(row,col).
+  enum {
+    StorageOrder = int(Rhs::Flags) & RowMajorBit ? RowMajor : ColMajor
+  };
+  // The base class receives the dense rhs and the diagonal vector of the lhs.
+  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
+    : Base(xpr.rhs(), xpr.lhs().diagonal())
+  {
+  }
+  // Two-index coefficient access; the one-index coeff(Index) overload is inherited through "using Base::coeff".
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
+  {
+    return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
+  }
+  // Packet access is compiled out when __CUDACC__ is defined. Row-major storage selects the true_type overload of packet_impl (diag(row) is broadcast), column-major the false_type one (a packet is loaded from the diagonal at row).
+#ifndef __CUDACC__
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
+  {
+    // FIXME: NVCC used to complain about the template keyword, but we have to check whether this is still the case.
+    // See also similar calls below.
+    return this->template packet_impl<LoadMode,PacketType>(row,col, row,
+                                 typename internal::conditional<int(StorageOrder)==RowMajor, internal::true_type, internal::false_type>::type());
+  }
+  // Linear-index packet access: idx is mapped to (idx,0) for column-major storage and to (0,idx) otherwise.
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet(Index idx) const
+  {
+    return packet<LoadMode,PacketType>(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
+  }
+#endif
+};
+
+// dense * diagonal: coefficient (row,col) of the result is lhs(row,col) * diag(col)
+template<typename Lhs, typename Rhs, int ProductKind, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape, DiagonalShape>
+  : diagonal_product_evaluator_base<Lhs, typename Rhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheRight>
+{
+  typedef diagonal_product_evaluator_base<Lhs, typename Rhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>, OnTheRight> Base;
+  using Base::m_diagImpl;
+  using Base::m_matImpl;
+  using Base::coeff;
+  typedef typename Base::Scalar Scalar;
+  
+  typedef Product<Lhs, Rhs, ProductKind> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+  // Storage order of the dense operand (lhs); it selects the packet_impl overload used by packet(row,col).
+  enum { StorageOrder = int(Lhs::Flags) & RowMajorBit ? RowMajor : ColMajor };
+  // The base class receives the dense lhs and the diagonal vector of the rhs.
+  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
+    : Base(xpr.lhs(), xpr.rhs().diagonal())
+  {
+  }
+  // Two-index coefficient access; the one-index coeff(Index) overload is inherited through "using Base::coeff".
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
+  {
+    return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
+  }
+  // Packet access is compiled out when __CUDACC__ is defined. Column-major storage selects the true_type overload of packet_impl (diag(col) is broadcast), row-major the false_type one (a packet is loaded from the diagonal at col).
+#ifndef __CUDACC__
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
+  {
+    return this->template packet_impl<LoadMode,PacketType>(row,col, col,
+                                 typename internal::conditional<int(StorageOrder)==ColMajor, internal::true_type, internal::false_type>::type());
+  }
+  // Linear-index packet access: idx is mapped to (idx,0) for column-major storage and to (0,idx) otherwise.
+  template<int LoadMode,typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet(Index idx) const
+  {
+    return packet<LoadMode,PacketType>(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
+  }
+#endif
+};
+
+/***************************************************************************
+* Products with permutation matrices
+***************************************************************************/
+
+/** \internal
+  * \class permutation_matrix_product
+  * Internal helper class implementing the product between a permutation matrix and a matrix; Side (OnTheLeft/OnTheRight) is the side the permutation stands on, and Transposed is set to true by the dispatchers handling Inverse<> permutations.
+  * This class is specialized for DenseShape below and for SparseShape in SparseCore/SparsePermutation.h
+  */
+template<typename ExpressionType, int Side, bool Transposed, typename ExpressionShape>
+struct permutation_matrix_product;
+
+template<typename ExpressionType, int Side, bool Transposed>
+struct permutation_matrix_product<ExpressionType, Side, Transposed, DenseShape>
+{
+    typedef typename nested_eval<ExpressionType, 1>::type MatrixType;
+    typedef typename remove_all<MatrixType>::type MatrixTypeCleaned;
+    // Fills dst with the rows (Side==OnTheLeft) or columns (Side==OnTheRight) of xpr reordered according to perm; when is_same_dense(dst, mat) holds the reordering is performed in place.
+    template<typename Dest, typename PermutationType>
+    static inline void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr)
+    {
+      MatrixType mat(xpr);
+      const Index n = Side==OnTheLeft ? mat.rows() : mat.cols(); // number of rows (resp. columns) handled by the out-of-place branch
+      // FIXME we need an is_same for expression that is not sensitive to constness. For instance
+      // is_same_xpr<Block<const Matrix>, Block<Matrix> >::value should be true.
+      //if(is_same<MatrixTypeCleaned,Dest>::value && extract_data(dst) == extract_data(mat))
+      if(is_same_dense(dst, mat))
+      {
+        // apply the permutation inplace
+        Matrix<bool,PermutationType::RowsAtCompileTime,1,0,PermutationType::MaxRowsAtCompileTime> mask(perm.size()); // mask[i] becomes true once index i has been processed
+        mask.fill(false);
+        Index r = 0;
+        while(r < perm.size())
+        {
+          // search for the next seed
+          while(r<perm.size() && mask[r]) r++;
+          if(r>=perm.size())
+            break;
+          // we got one, let's follow it until we are back to the seed
+          Index k0 = r++;
+          Index kPrev = k0;
+          mask.coeffRef(k0) = true;
+          for(Index k=perm.indices().coeff(k0); k!=k0; k=perm.indices().coeff(k))
+          {
+                  Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>(dst, k) // row/column k is swapped with the seed k0 or with the previously visited index kPrev
+            .swap(Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>
+                       (dst,((Side==OnTheLeft) ^ Transposed) ? k0 : kPrev));
+
+            mask.coeffRef(k) = true;
+            kPrev = k;
+          }
+        }
+      }
+      else
+      {
+        for(Index i = 0; i < n; ++i) // out-of-place copy: the permuted index is applied either to dst or to mat, depending on Side and Transposed
+        {
+          Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>
+               (dst, ((Side==OnTheLeft) ^ Transposed) ? perm.indices().coeff(i) : i)
+
+          =
+
+          Block<const MatrixTypeCleaned,Side==OnTheLeft ? 1 : MatrixTypeCleaned::RowsAtCompileTime,Side==OnTheRight ? 1 : MatrixTypeCleaned::ColsAtCompileTime>
+               (mat, ((Side==OnTheRight) ^ Transposed) ? perm.indices().coeff(i) : i);
+        }
+      }
+    }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, PermutationShape, MatrixShape, ProductTag> // dispatcher for permutation * matrix
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) // lhs is the permutation, rhs the expression being permuted
+  {
+    permutation_matrix_product<Rhs, OnTheLeft, false, MatrixShape>::run(dst, lhs, rhs);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, MatrixShape, PermutationShape, ProductTag> // dispatcher for matrix * permutation
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) // rhs is the permutation, lhs the expression being permuted; note the swapped argument order in run()
+  {
+    permutation_matrix_product<Lhs, OnTheRight, false, MatrixShape>::run(dst, rhs, lhs);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Inverse<Lhs>, Rhs, PermutationShape, MatrixShape, ProductTag> // dispatcher for inverse(permutation) * matrix
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Inverse<Lhs>& lhs, const Rhs& rhs) // unwraps the Inverse<> and requests the Transposed=true variant of the helper
+  {
+    permutation_matrix_product<Rhs, OnTheLeft, true, MatrixShape>::run(dst, lhs.nestedExpression(), rhs);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Inverse<Rhs>, MatrixShape, PermutationShape, ProductTag> // dispatcher for matrix * inverse(permutation)
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Inverse<Rhs>& rhs) // unwraps the Inverse<> and requests the Transposed=true variant of the helper
+  {
+    permutation_matrix_product<Lhs, OnTheRight, true, MatrixShape>::run(dst, rhs.nestedExpression(), lhs);
+  }
+};
+
+
+/***************************************************************************
+* Products with transpositions matrices
+***************************************************************************/
+
+// FIXME could we unify Transpositions and Permutation into a single "shape"??
+
+/** \internal
+  * \class transposition_matrix_product
+  * Internal helper class implementing the product between a sequence of transpositions and a matrix.
+  */
+template<typename ExpressionType, int Side, bool Transposed, typename ExpressionShape>
+struct transposition_matrix_product
+{
+  typedef typename nested_eval<ExpressionType, 1>::type MatrixType;
+  typedef typename remove_all<MatrixType>::type MatrixTypeCleaned;
+  // Copies xpr into dst (unless both already are the same dense object) and then applies the swaps stored in tr to the rows (Side==OnTheLeft) or columns (Side==OnTheRight) of dst.
+  template<typename Dest, typename TranspositionType>
+  static inline void run(Dest& dst, const TranspositionType& tr, const ExpressionType& xpr)
+  {
+    MatrixType mat(xpr);
+    typedef typename TranspositionType::StorageIndex StorageIndex;
+    const Index size = tr.size();
+    StorageIndex j = 0;
+    // Same aliasing test as permutation_matrix_product: is_same_dense() is not sensitive to the constness of the expression types, unlike is_same<> on the types.
+    if(!is_same_dense(dst, mat))
+      dst = mat;
+    // The swaps are applied in storage order, or in reverse order when Transposed is true; entries with tr.coeff(k)==k are skipped.
+    for(Index k=(Transposed?size-1:0) ; Transposed?k>=0:k<size ; Transposed?--k:++k)
+      if(Index(j=tr.coeff(k))!=k)
+      {
+        if(Side==OnTheLeft)        dst.row(k).swap(dst.row(j));
+        else if(Side==OnTheRight)  dst.col(k).swap(dst.col(j));
+      }
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, TranspositionsShape, MatrixShape, ProductTag> // dispatcher for transpositions * matrix
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) // lhs holds the transpositions, rhs is the expression they are applied to
+  {
+    transposition_matrix_product<Rhs, OnTheLeft, false, MatrixShape>::run(dst, lhs, rhs);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, MatrixShape, TranspositionsShape, ProductTag> // dispatcher for matrix * transpositions
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) // rhs holds the transpositions, lhs is the expression they are applied to; note the swapped argument order in run()
+  {
+    transposition_matrix_product<Lhs, OnTheRight, false, MatrixShape>::run(dst, rhs, lhs);
+  }
+};
+
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Transpose<Lhs>, Rhs, TranspositionsShape, MatrixShape, ProductTag> // dispatcher for transpose(transpositions) * matrix
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Transpose<Lhs>& lhs, const Rhs& rhs) // unwraps the Transpose<> and requests the Transposed=true variant of the helper
+  {
+    transposition_matrix_product<Rhs, OnTheLeft, true, MatrixShape>::run(dst, lhs.nestedExpression(), rhs);
+  }
+};
+
+template<typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Transpose<Rhs>, MatrixShape, TranspositionsShape, ProductTag> // dispatcher for matrix * transpose(transpositions)
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Transpose<Rhs>& rhs) // unwraps the Transpose<> and requests the Transposed=true variant of the helper
+  {
+    transposition_matrix_product<Lhs, OnTheRight, true, MatrixShape>::run(dst, rhs.nestedExpression(), lhs);
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PRODUCT_EVALUATORS_H
diff --git a/nuparu/include/Eigen/src/Core/Random.h b/nuparu/include/Eigen/src/Core/Random.h
index 480fea40..02038e9e 100644
--- a/nuparu/include/Eigen/src/Core/Random.h
+++ b/nuparu/include/Eigen/src/Core/Random.h
@@ -28,12 +28,18 @@ struct functor_traits<scalar_random_op<Scalar> >
 
 /** \returns a random matrix expression
   *
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
+  * 
   * The parameters \a rows and \a cols are the number of rows and of columns of
   * the returned matrix. Must be compatible with this MatrixBase type.
   *
+  * \not_reentrant
+  * 
   * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
   * it is redundant to pass \a rows and \a cols as arguments, so Random() should be used
   * instead.
+  * 
   *
   * Example: \include MatrixBase_random_int_int.cpp
   * Output: \verbinclude MatrixBase_random_int_int.out
@@ -41,22 +47,28 @@ struct functor_traits<scalar_random_op<Scalar> >
   * This expression has the "evaluate before nesting" flag so that it will be evaluated into
   * a temporary matrix whenever it is nested in a larger expression. This prevents unexpected
   * behavior with expressions involving random matrices.
+  * 
+  * See DenseBase::NullaryExpr(Index, const CustomNullaryOp&) for an example using C++11 random generators.
   *
-  * \sa MatrixBase::setRandom(), MatrixBase::Random(Index), MatrixBase::Random()
+  * \sa DenseBase::setRandom(), DenseBase::Random(Index), DenseBase::Random()
   */
 template<typename Derived>
-inline const CwiseNullaryOp<internal::scalar_random_op<typename internal::traits<Derived>::Scalar>, Derived>
+inline const typename DenseBase<Derived>::RandomReturnType
 DenseBase<Derived>::Random(Index rows, Index cols)
 {
   return NullaryExpr(rows, cols, internal::scalar_random_op<Scalar>());
 }
 
 /** \returns a random vector expression
+  *
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
   *
   * The parameter \a size is the size of the returned vector.
   * Must be compatible with this MatrixBase type.
   *
   * \only_for_vectors
+  * \not_reentrant
   *
   * This variant is meant to be used for dynamic-size vector types. For fixed-size types,
   * it is redundant to pass \a size as argument, so Random() should be used
@@ -69,10 +81,10 @@ DenseBase<Derived>::Random(Index rows, Index cols)
   * a temporary vector whenever it is nested in a larger expression. This prevents unexpected
   * behavior with expressions involving random matrices.
   *
-  * \sa MatrixBase::setRandom(), MatrixBase::Random(Index,Index), MatrixBase::Random()
+  * \sa DenseBase::setRandom(), DenseBase::Random(Index,Index), DenseBase::Random()
   */
 template<typename Derived>
-inline const CwiseNullaryOp<internal::scalar_random_op<typename internal::traits<Derived>::Scalar>, Derived>
+inline const typename DenseBase<Derived>::RandomReturnType
 DenseBase<Derived>::Random(Index size)
 {
   return NullaryExpr(size, internal::scalar_random_op<Scalar>());
@@ -80,6 +92,9 @@ DenseBase<Derived>::Random(Index size)
 
 /** \returns a fixed-size random matrix or vector expression
   *
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
+  * 
   * This variant is only for fixed-size MatrixBase types. For dynamic-size types, you
   * need to use the variants taking size arguments.
   *
@@ -89,11 +104,13 @@ DenseBase<Derived>::Random(Index size)
   * This expression has the "evaluate before nesting" flag so that it will be evaluated into
   * a temporary matrix whenever it is nested in a larger expression. This prevents unexpected
   * behavior with expressions involving random matrices.
+  * 
+  * \not_reentrant
   *
-  * \sa MatrixBase::setRandom(), MatrixBase::Random(Index,Index), MatrixBase::Random(Index)
+  * \sa DenseBase::setRandom(), DenseBase::Random(Index,Index), DenseBase::Random(Index)
   */
 template<typename Derived>
-inline const CwiseNullaryOp<internal::scalar_random_op<typename internal::traits<Derived>::Scalar>, Derived>
+inline const typename DenseBase<Derived>::RandomReturnType
 DenseBase<Derived>::Random()
 {
   return NullaryExpr(RowsAtCompileTime, ColsAtCompileTime, internal::scalar_random_op<Scalar>());
@@ -101,6 +118,11 @@ DenseBase<Derived>::Random()
 
 /** Sets all coefficients in this expression to random values.
   *
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
+  * 
+  * \not_reentrant
+  * 
   * Example: \include MatrixBase_setRandom.cpp
   * Output: \verbinclude MatrixBase_setRandom.out
   *
@@ -114,12 +136,16 @@ inline Derived& DenseBase<Derived>::setRandom()
 
 /** Resizes to the given \a newSize, and sets all coefficients in this expression to random values.
   *
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
+  * 
   * \only_for_vectors
+  * \not_reentrant
   *
   * Example: \include Matrix_setRandom_int.cpp
   * Output: \verbinclude Matrix_setRandom_int.out
   *
-  * \sa MatrixBase::setRandom(), setRandom(Index,Index), class CwiseNullaryOp, MatrixBase::Random()
+  * \sa DenseBase::setRandom(), setRandom(Index,Index), class CwiseNullaryOp, DenseBase::Random()
   */
 template<typename Derived>
 EIGEN_STRONG_INLINE Derived&
@@ -131,19 +157,24 @@ PlainObjectBase<Derived>::setRandom(Index newSize)
 
 /** Resizes to the given size, and sets all coefficients in this expression to random values.
   *
-  * \param nbRows the new number of rows
-  * \param nbCols the new number of columns
+  * Numbers are uniformly spread through their whole definition range for integer types,
+  * and in the [-1:1] range for floating point scalar types.
+  *
+  * \not_reentrant
+  * 
+  * \param rows the new number of rows
+  * \param cols the new number of columns
   *
   * Example: \include Matrix_setRandom_int_int.cpp
   * Output: \verbinclude Matrix_setRandom_int_int.out
   *
-  * \sa MatrixBase::setRandom(), setRandom(Index), class CwiseNullaryOp, MatrixBase::Random()
+  * \sa DenseBase::setRandom(), setRandom(Index), class CwiseNullaryOp, DenseBase::Random()
   */
 template<typename Derived>
 EIGEN_STRONG_INLINE Derived&
-PlainObjectBase<Derived>::setRandom(Index nbRows, Index nbCols)
+PlainObjectBase<Derived>::setRandom(Index rows, Index cols)
 {
-  resize(nbRows, nbCols);
+  resize(rows, cols);
   return setRandom();
 }
 
diff --git a/nuparu/include/Eigen/src/Core/Redux.h b/nuparu/include/Eigen/src/Core/Redux.h
index 50548fa9..d170cae2 100644
--- a/nuparu/include/Eigen/src/Core/Redux.h
+++ b/nuparu/include/Eigen/src/Core/Redux.h
@@ -50,21 +50,34 @@ struct redux_traits
 
 public:
   enum {
-    Cost = (  Derived::SizeAtCompileTime == Dynamic
-           || Derived::CoeffReadCost == Dynamic
-           || (Derived::SizeAtCompileTime!=1 && functor_traits<Func>::Cost == Dynamic)
-           ) ? Dynamic
-           : Derived::SizeAtCompileTime * Derived::CoeffReadCost
-               + (Derived::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
+    Cost = Derived::SizeAtCompileTime == Dynamic ? HugeCost
+         : Derived::SizeAtCompileTime * Derived::CoeffReadCost + (Derived::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
     UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize))
   };
 
 public:
   enum {
-    Unrolling = Cost != Dynamic && Cost <= UnrollingLimit
-              ? CompleteUnrolling
-              : NoUnrolling
+    Unrolling = Cost <= UnrollingLimit ? CompleteUnrolling : NoUnrolling
   };
+  
+#ifdef EIGEN_DEBUG_ASSIGN
+  static void debug()
+  {
+    std::cerr << "Xpr: " << typeid(typename Derived::XprType).name() << std::endl;
+    std::cerr.setf(std::ios::hex, std::ios::basefield);
+    EIGEN_DEBUG_VAR(Derived::Flags)
+    std::cerr.unsetf(std::ios::hex);
+    EIGEN_DEBUG_VAR(InnerMaxSize)
+    EIGEN_DEBUG_VAR(PacketSize)
+    EIGEN_DEBUG_VAR(MightVectorize)
+    EIGEN_DEBUG_VAR(MayLinearVectorize)
+    EIGEN_DEBUG_VAR(MaySliceVectorize)
+    EIGEN_DEBUG_VAR(Traversal)
+    EIGEN_DEBUG_VAR(UnrollingLimit)
+    EIGEN_DEBUG_VAR(Unrolling)
+    std::cerr << std::endl;
+  }
+#endif
 };
 
 /***************************************************************************
@@ -82,6 +95,7 @@ struct redux_novec_unroller
 
   typedef typename Derived::Scalar Scalar;
 
+  EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
   {
     return func(redux_novec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
@@ -99,6 +113,7 @@ struct redux_novec_unroller<Func, Derived, Start, 1>
 
   typedef typename Derived::Scalar Scalar;
 
+  EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func&)
   {
     return mat.coeffByOuterInner(outer, inner);
@@ -112,6 +127,7 @@ template<typename Func, typename Derived, int Start>
 struct redux_novec_unroller<Func, Derived, Start, 0>
 {
   typedef typename Derived::Scalar Scalar;
+  EIGEN_DEVICE_FUNC 
   static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); }
 };
 
@@ -143,7 +159,7 @@ struct redux_vec_unroller<Func, Derived, Start, 1>
     index = Start * packet_traits<typename Derived::Scalar>::size,
     outer = index / int(Derived::InnerSizeAtCompileTime),
     inner = index % int(Derived::InnerSizeAtCompileTime),
-    alignment = (Derived::Flags & AlignedBit) ? Aligned : Unaligned
+    alignment = Derived::Alignment
   };
 
   typedef typename Derived::Scalar Scalar;
@@ -151,7 +167,7 @@ struct redux_vec_unroller<Func, Derived, Start, 1>
 
   static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&)
   {
-    return mat.template packetByOuterInner<alignment>(outer, inner);
+    return mat.template packetByOuterInner<alignment,PacketScalar>(outer, inner);
   }
 };
 
@@ -169,8 +185,8 @@ template<typename Func, typename Derived>
 struct redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>
 {
   typedef typename Derived::Scalar Scalar;
-  typedef typename Derived::Index Index;
-  static EIGEN_STRONG_INLINE Scalar run(const Derived& mat, const Func& func)
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
   {
     eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
     Scalar res;
@@ -194,18 +210,18 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
 {
   typedef typename Derived::Scalar Scalar;
   typedef typename packet_traits<Scalar>::type PacketScalar;
-  typedef typename Derived::Index Index;
 
-  static Scalar run(const Derived& mat, const Func& func)
+  static Scalar run(const Derived &mat, const Func& func)
   {
     const Index size = mat.size();
-    eigen_assert(size && "you are using an empty matrix");
+    
     const Index packetSize = packet_traits<Scalar>::size;
-    const Index alignedStart = internal::first_aligned(mat);
+    const int packetAlignment = unpacket_traits<PacketScalar>::alignment;
     enum {
-      alignment = bool(Derived::Flags & DirectAccessBit) || bool(Derived::Flags & AlignedBit)
-                ? Aligned : Unaligned
+      alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
+      alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment)
     };
+    const Index alignedStart = internal::first_default_aligned(mat.nestedExpression());
     const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
     const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize);
     const Index alignedEnd2 = alignedStart + alignedSize2;
@@ -213,19 +229,19 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
     Scalar res;
     if(alignedSize)
     {
-      PacketScalar packet_res0 = mat.template packet<alignment>(alignedStart);
+      PacketScalar packet_res0 = mat.template packet<alignment,PacketScalar>(alignedStart);
       if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop
       {
-        PacketScalar packet_res1 = mat.template packet<alignment>(alignedStart+packetSize);
+        PacketScalar packet_res1 = mat.template packet<alignment,PacketScalar>(alignedStart+packetSize);
         for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize)
         {
-          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment>(index));
-          packet_res1 = func.packetOp(packet_res1, mat.template packet<alignment>(index+packetSize));
+          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(index));
+          packet_res1 = func.packetOp(packet_res1, mat.template packet<alignment,PacketScalar>(index+packetSize));
         }
 
         packet_res0 = func.packetOp(packet_res0,packet_res1);
         if(alignedEnd>alignedEnd2)
-          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment>(alignedEnd2));
+          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(alignedEnd2));
       }
       res = func.predux(packet_res0);
 
@@ -247,14 +263,14 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
   }
 };
 
-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, SliceVectorizedTraversal, NoUnrolling>
+// NOTE: for SliceVectorizedTraversal we simply bypass unrolling
+template<typename Func, typename Derived, int Unrolling>
+struct redux_impl<Func, Derived, SliceVectorizedTraversal, Unrolling>
 {
   typedef typename Derived::Scalar Scalar;
-  typedef typename packet_traits<Scalar>::type PacketScalar;
-  typedef typename Derived::Index Index;
+  typedef typename packet_traits<Scalar>::type PacketType;
 
-  static Scalar run(const Derived& mat, const Func& func)
+  EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func)
   {
     eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
     const Index innerSize = mat.innerSize();
@@ -266,10 +282,10 @@ struct redux_impl<Func, Derived, SliceVectorizedTraversal, NoUnrolling>
     Scalar res;
     if(packetedInnerSize)
     {
-      PacketScalar packet_res = mat.template packet<Unaligned>(0,0);
+      PacketType packet_res = mat.template packet<Unaligned,PacketType>(0,0);
       for(Index j=0; j<outerSize; ++j)
         for(Index i=(j==0?packetSize:0); i<packetedInnerSize; i+=Index(packetSize))
-          packet_res = func.packetOp(packet_res, mat.template packetByOuterInner<Unaligned>(j,i));
+          packet_res = func.packetOp(packet_res, mat.template packetByOuterInner<Unaligned,PacketType>(j,i));
 
       res = func.predux(packet_res);
       for(Index j=0; j<outerSize; ++j)
@@ -296,16 +312,83 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
     Size = Derived::SizeAtCompileTime,
     VectorizedSize = (Size / PacketSize) * PacketSize
   };
-  static EIGEN_STRONG_INLINE Scalar run(const Derived& mat, const Func& func)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
   {
     eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
-    Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
-    if (VectorizedSize != Size)
-      res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
-    return res;
+    if (VectorizedSize > 0) {
+      Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
+      if (VectorizedSize != Size)
+        res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
+      return res;
+    }
+    else {
+      return redux_novec_unroller<Func, Derived, 0, Size>::run(mat,func);
+    }
   }
 };
 
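+// Evaluator adaptor: pairs an expression with its evaluator so that redux_impl can query sizes and read coefficients/packets through a single object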
+template<typename _XprType>
+class redux_evaluator
+{
+public:
+  typedef _XprType XprType;
+  EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
+  
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename XprType::PacketScalar PacketScalar;
+  typedef typename XprType::PacketReturnType PacketReturnType;
+  
+  enum {
+    MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = XprType::MaxColsAtCompileTime,
+    // TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime from the evaluator
+    Flags = evaluator<XprType>::Flags & ~DirectAccessBit,
+    IsRowMajor = XprType::IsRowMajor,
+    SizeAtCompileTime = XprType::SizeAtCompileTime,
+    InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime,
+    CoeffReadCost = evaluator<XprType>::CoeffReadCost,
+    Alignment = evaluator<XprType>::Alignment
+  };
+  
+  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
+  EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
+  EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }
+  EIGEN_DEVICE_FUNC Index innerSize() const { return m_xpr.innerSize(); }
+  EIGEN_DEVICE_FUNC Index outerSize() const { return m_xpr.outerSize(); }
+
+  EIGEN_DEVICE_FUNC
+  CoeffReturnType coeff(Index row, Index col) const
+  { return m_evaluator.coeff(row, col); }
+
+  EIGEN_DEVICE_FUNC
+  CoeffReturnType coeff(Index index) const
+  { return m_evaluator.coeff(index); }
+
+  template<int LoadMode, typename PacketType>
+  PacketReturnType packet(Index row, Index col) const
+  { return m_evaluator.template packet<LoadMode,PacketType>(row, col); }
+
+  template<int LoadMode, typename PacketType>
+  PacketReturnType packet(Index index) const
+  { return m_evaluator.template packet<LoadMode,PacketType>(index); }
+  
+  EIGEN_DEVICE_FUNC
+  CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
+  { return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
+  
+  template<int LoadMode, typename PacketType>
+  PacketReturnType packetByOuterInner(Index outer, Index inner) const
+  { return m_evaluator.template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
+  
+  const XprType & nestedExpression() const { return m_xpr; }
+  
+protected:
+  internal::evaluator<XprType> m_evaluator;
+  const XprType &m_xpr;
+};
+
 } // end namespace internal
 
 /***************************************************************************
@@ -316,18 +399,21 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
 /** \returns the result of a full redux operation on the whole matrix or vector using \a func
   *
   * The template parameter \a BinaryOp is the type of the functor \a func which must be
-  * an associative operator. Both current STL and TR1 functor styles are handled.
+  * an associative operator. Both current C++98 and C++11 functor styles are handled.
   *
   * \sa DenseBase::sum(), DenseBase::minCoeff(), DenseBase::maxCoeff(), MatrixBase::colwise(), MatrixBase::rowwise()
   */
 template<typename Derived>
 template<typename Func>
-EIGEN_STRONG_INLINE typename internal::result_of<Func(typename internal::traits<Derived>::Scalar)>::type
+typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::redux(const Func& func) const
 {
-  typedef typename internal::remove_all<typename Derived::Nested>::type ThisNested;
-  return internal::redux_impl<Func, ThisNested>
-            ::run(derived(), func);
+  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
+
+  typedef typename internal::redux_evaluator<Derived> ThisEvaluator;
+  ThisEvaluator thisEval(derived());
+  
+  return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func);
 }
 
 /** \returns the minimum of all coefficients of \c *this.
@@ -337,7 +423,7 @@ template<typename Derived>
 EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff() const
 {
-  return this->redux(Eigen::internal::scalar_min_op<Scalar>());
+  return derived().redux(Eigen::internal::scalar_min_op<Scalar>());
 }
 
 /** \returns the maximum of all coefficients of \c *this.
@@ -347,7 +433,7 @@ template<typename Derived>
 EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff() const
 {
-  return this->redux(Eigen::internal::scalar_max_op<Scalar>());
+  return derived().redux(Eigen::internal::scalar_max_op<Scalar>());
 }
 
 /** \returns the sum of all coefficients of *this
@@ -360,7 +446,7 @@ DenseBase<Derived>::sum() const
 {
   if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
     return Scalar(0);
-  return this->redux(Eigen::internal::scalar_sum_op<Scalar>());
+  return derived().redux(Eigen::internal::scalar_sum_op<Scalar>());
 }
 
 /** \returns the mean of all coefficients of *this
@@ -371,7 +457,7 @@ template<typename Derived>
 EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::mean() const
 {
-  return Scalar(this->redux(Eigen::internal::scalar_sum_op<Scalar>())) / Scalar(this->size());
+  return Scalar(derived().redux(Eigen::internal::scalar_sum_op<Scalar>())) / Scalar(this->size());
 }
 
 /** \returns the product of all coefficients of *this
@@ -387,7 +473,7 @@ DenseBase<Derived>::prod() const
 {
   if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
     return Scalar(1);
-  return this->redux(Eigen::internal::scalar_product_op<Scalar>());
+  return derived().redux(Eigen::internal::scalar_product_op<Scalar>());
 }
 
 /** \returns the trace of \c *this, i.e. the sum of the coefficients on the main diagonal.
diff --git a/nuparu/include/Eigen/src/Core/Ref.h b/nuparu/include/Eigen/src/Core/Ref.h
index aba795bd..61de5ed1 100644
--- a/nuparu/include/Eigen/src/Core/Ref.h
+++ b/nuparu/include/Eigen/src/Core/Ref.h
@@ -12,24 +12,20 @@
 
 namespace Eigen { 
 
-template<typename Derived> class RefBase;
-template<typename PlainObjectType, int Options = 0,
-         typename StrideType = typename internal::conditional<PlainObjectType::IsVectorAtCompileTime,InnerStride<1>,OuterStride<> >::type > class Ref;
-
 /** \class Ref
   * \ingroup Core_Module
   *
-  * \brief A matrix or vector expression mapping an existing expressions
+  * \brief A matrix or vector expression mapping an existing expression
   *
   * \tparam PlainObjectType the equivalent matrix type of the mapped data
-  * \tparam Options specifies whether the pointer is \c #Aligned, or \c #Unaligned.
+  * \tparam Options specifies the pointer alignment in bytes. It can be: \c #Aligned128, \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned.
   *                The default is \c #Unaligned.
   * \tparam StrideType optionally specifies strides. By default, Ref implies a contiguous storage along the inner dimension (inner stride==1),
-  *                   but accept a variable outer stride (leading dimension).
+  *                   but accepts a variable outer stride (leading dimension).
   *                   This can be overridden by specifying strides.
   *                   The type passed here must be a specialization of the Stride template, see examples below.
   *
-  * This class permits to write non template functions taking Eigen's object as parameters while limiting the number of copies.
+  * This class provides a way to write non-template functions taking Eigen objects as parameters while limiting the number of copies.
   * A Ref<> object can represent either a const expression or a l-value:
   * \code
   * // in-out argument:
@@ -39,10 +35,10 @@ template<typename PlainObjectType, int Options = 0,
   * void foo2(const Ref<const VectorXf>& x);
   * \endcode
   *
-  * In the in-out case, the input argument must satisfies the constraints of the actual Ref<> type, otherwise a compilation issue will be triggered.
+  * In the in-out case, the input argument must satisfy the constraints of the actual Ref<> type, otherwise a compilation issue will be triggered.
   * By default, a Ref<VectorXf> can reference any dense vector expression of float having a contiguous memory layout.
-  * Likewise, a Ref<MatrixXf> can reference any column major dense matrix expression of float whose column's elements are contiguously stored with
-  * the possibility to have a constant space inbetween each column, i.e.: the inner stride mmust be equal to 1, but the outer-stride (or leading dimension),
+  * Likewise, a Ref<MatrixXf> can reference any column-major dense matrix expression of float whose column's elements are contiguously stored with
+  * the possibility to have a constant space in-between each column, i.e. the inner stride must be equal to 1, but the outer stride (or leading dimension)
   * can be greater than the number of rows.
   *
   * In the const case, if the input expression does not match the above requirement, then it is evaluated into a temporary before being passed to the function.
@@ -52,21 +48,22 @@ template<typename PlainObjectType, int Options = 0,
   * VectorXf a;
   * foo1(a.head());             // OK
   * foo1(A.col());              // OK
-  * foo1(A.row());              // compilation error because here innerstride!=1
-  * foo2(A.row());              // The row is copied into a contiguous temporary
+  * foo1(A.row());              // Compilation error because here innerstride!=1
+  * foo2(A.row());              // Compilation error because A.row() is a 1xN object while foo2 is expecting an Nx1 object
+  * foo2(A.row().transpose());  // The row is copied into a contiguous temporary
   * foo2(2*a);                  // The expression is evaluated into a temporary
   * foo2(A.col().segment(2,4)); // No temporary
   * \endcode
   *
-  * The range of inputs that can be referenced without temporary can be enlarged using the last two template parameter.
+  * The range of inputs that can be referenced without temporary can be enlarged using the last two template parameters.
   * Here is an example accepting an innerstride!=1:
   * \code
   * // in-out argument:
   * void foo3(Ref<VectorXf,0,InnerStride<> > x);
   * foo3(A.row());              // OK
   * \endcode
-  * The downside here is that the function foo3 might be significantly slower than foo1 because it won't be able to exploit vectorization, and will involved more
-  * expensive address computations even if the input is contiguously stored in memory. To overcome this issue, one might propose to overloads internally calling a
+  * The downside here is that the function foo3 might be significantly slower than foo1 because it won't be able to exploit vectorization, and will involve more
+  * expensive address computations even if the input is contiguously stored in memory. To overcome this issue, one can provide overloads that internally call a
   * template function, e.g.:
   * \code
   * // in the .h:
@@ -94,24 +91,27 @@ struct traits<Ref<_PlainObjectType, _Options, _StrideType> >
   typedef _PlainObjectType PlainObjectType;
   typedef _StrideType StrideType;
   enum {
-    Options = _Options
+    Options = _Options,
+    Flags = traits<Map<_PlainObjectType, _Options, _StrideType> >::Flags | NestByRefBit,
+    Alignment = traits<Map<_PlainObjectType, _Options, _StrideType> >::Alignment
   };
 
   template<typename Derived> struct match {
     enum {
       HasDirectAccess = internal::has_direct_access<Derived>::ret,
-      StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
+      StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
       InnerStrideMatch = int(StrideType::InnerStrideAtCompileTime)==int(Dynamic)
                       || int(StrideType::InnerStrideAtCompileTime)==int(Derived::InnerStrideAtCompileTime)
                       || (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1),
       OuterStrideMatch = Derived::IsVectorAtCompileTime
                       || int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime),
-      AlignmentMatch = (_Options!=Aligned) || ((PlainObjectType::Flags&AlignedBit)==0) || ((traits<Derived>::Flags&AlignedBit)==AlignedBit),
-      MatchAtCompileTime = HasDirectAccess && StorageOrderMatch && InnerStrideMatch && OuterStrideMatch && AlignmentMatch
+      AlignmentMatch = (int(traits<PlainObjectType>::Alignment)==int(Unaligned)) || (int(evaluator<Derived>::Alignment) >= int(Alignment)), // FIXME the first condition is not very clear, it should be replaced by the required alignment
+      ScalarTypeMatch = internal::is_same<typename PlainObjectType::Scalar, typename Derived::Scalar>::value,
+      MatchAtCompileTime = HasDirectAccess && StorageOrderMatch && InnerStrideMatch && OuterStrideMatch && AlignmentMatch && ScalarTypeMatch
     };
     typedef typename internal::conditional<MatchAtCompileTime,internal::true_type,internal::false_type>::type type;
   };
-
+  
 };
 
 template<typename Derived>
@@ -130,12 +130,12 @@ template<typename Derived> class RefBase
   typedef MapBase<Derived> Base;
   EIGEN_DENSE_PUBLIC_INTERFACE(RefBase)
 
-  inline Index innerStride() const
+  EIGEN_DEVICE_FUNC inline Index innerStride() const
   {
     return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
   }
 
-  inline Index outerStride() const
+  EIGEN_DEVICE_FUNC inline Index outerStride() const
   {
     return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
          : IsVectorAtCompileTime ? this->size()
@@ -143,7 +143,7 @@ template<typename Derived> class RefBase
          : this->rows();
   }
 
-  RefBase()
+  EIGEN_DEVICE_FUNC RefBase()
     : Base(0,RowsAtCompileTime==Dynamic?0:RowsAtCompileTime,ColsAtCompileTime==Dynamic?0:ColsAtCompileTime),
       // Stride<> does not allow default ctor for Dynamic strides, so let' initialize it with dummy values:
       m_stride(StrideType::OuterStrideAtCompileTime==Dynamic?0:StrideType::OuterStrideAtCompileTime,
@@ -157,7 +157,7 @@ template<typename Derived> class RefBase
   typedef Stride<StrideType::OuterStrideAtCompileTime,StrideType::InnerStrideAtCompileTime> StrideBase;
 
   template<typename Expression>
-  void construct(Expression& expr)
+  EIGEN_DEVICE_FUNC void construct(Expression& expr)
   {
     if(PlainObjectType::RowsAtCompileTime==1)
     {
@@ -171,8 +171,12 @@ template<typename Derived> class RefBase
     }
     else
       ::new (static_cast<Base*>(this)) Base(expr.data(), expr.rows(), expr.cols());
-    ::new (&m_stride) StrideBase(StrideType::OuterStrideAtCompileTime==0?0:expr.outerStride(),
-                                 StrideType::InnerStrideAtCompileTime==0?0:expr.innerStride());    
+    
+    if(Expression::IsVectorAtCompileTime && (!PlainObjectType::IsVectorAtCompileTime) && ((Expression::Flags&RowMajorBit)!=(PlainObjectType::Flags&RowMajorBit)))
+      ::new (&m_stride) StrideBase(expr.innerStride(), StrideType::InnerStrideAtCompileTime==0?0:1);
+    else
+      ::new (&m_stride) StrideBase(StrideType::OuterStrideAtCompileTime==0?0:expr.outerStride(),
+                                   StrideType::InnerStrideAtCompileTime==0?0:expr.innerStride());    
   }
 
   StrideBase m_stride;
@@ -182,7 +186,11 @@ template<typename Derived> class RefBase
 template<typename PlainObjectType, int Options, typename StrideType> class Ref
   : public RefBase<Ref<PlainObjectType, Options, StrideType> >
 {
+  private:
     typedef internal::traits<Ref> Traits;
+    template<typename Derived>
+    EIGEN_DEVICE_FUNC inline Ref(const PlainObjectBase<Derived>& expr,
+                                 typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0);
   public:
 
     typedef RefBase<Ref> Base;
@@ -191,20 +199,23 @@ template<typename PlainObjectType, int Options, typename StrideType> class Ref
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename Derived>
-    inline Ref(PlainObjectBase<Derived>& expr,
-               typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0)
+    EIGEN_DEVICE_FUNC inline Ref(PlainObjectBase<Derived>& expr,
+                                 typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0)
     {
-      Base::construct(expr);
+      EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+      Base::construct(expr.derived());
     }
     template<typename Derived>
-    inline Ref(const DenseBase<Derived>& expr,
-               typename internal::enable_if<bool(internal::is_lvalue<Derived>::value&&bool(Traits::template match<Derived>::MatchAtCompileTime)),Derived>::type* = 0,
-               int = Derived::ThisConstantIsPrivateInPlainObjectBase)
+    EIGEN_DEVICE_FUNC inline Ref(const DenseBase<Derived>& expr,
+                                 typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0)
     #else
     template<typename Derived>
     inline Ref(DenseBase<Derived>& expr)
     #endif
     {
+      EIGEN_STATIC_ASSERT(bool(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
+      EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+      EIGEN_STATIC_ASSERT(!Derived::IsPlainObjectBase,THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
       Base::construct(expr.const_cast_derived());
     }
 
@@ -223,7 +234,8 @@ template<typename TPlainObjectType, int Options, typename StrideType> class Ref<
     EIGEN_DENSE_PUBLIC_INTERFACE(Ref)
 
     template<typename Derived>
-    inline Ref(const DenseBase<Derived>& expr)
+    EIGEN_DEVICE_FUNC inline Ref(const DenseBase<Derived>& expr,
+                                 typename internal::enable_if<bool(Traits::template match<Derived>::ScalarTypeMatch),Derived>::type* = 0)
     {
 //      std::cout << match_helper<Derived>::HasDirectAccess << "," << match_helper<Derived>::OuterStrideMatch << "," << match_helper<Derived>::InnerStrideMatch << "\n";
 //      std::cout << int(StrideType::OuterStrideAtCompileTime) << " - " << int(Derived::OuterStrideAtCompileTime) << "\n";
@@ -231,18 +243,27 @@ template<typename TPlainObjectType, int Options, typename StrideType> class Ref<
       construct(expr.derived(), typename Traits::template match<Derived>::type());
     }
 
+    EIGEN_DEVICE_FUNC inline Ref(const Ref& other) : Base(other) {
+      // copy constructor shall not copy the m_object, to avoid unnecessary malloc and copy
+    }
+
+    template<typename OtherRef>
+    EIGEN_DEVICE_FUNC inline Ref(const RefBase<OtherRef>& other) {
+      construct(other.derived(), typename Traits::template match<OtherRef>::type());
+    }
+
   protected:
 
     template<typename Expression>
-    void construct(const Expression& expr,internal::true_type)
+    EIGEN_DEVICE_FUNC void construct(const Expression& expr,internal::true_type)
     {
       Base::construct(expr);
     }
 
     template<typename Expression>
-    void construct(const Expression& expr, internal::false_type)
+    EIGEN_DEVICE_FUNC void construct(const Expression& expr, internal::false_type)
     {
-      m_object.lazyAssign(expr);
+      internal::call_assignment_no_alias(m_object,expr,internal::assign_op<Scalar>());
       Base::construct(m_object);
     }
 
diff --git a/nuparu/include/Eigen/src/Core/Replicate.h b/nuparu/include/Eigen/src/Core/Replicate.h
index dde86a83..bec59831 100644
--- a/nuparu/include/Eigen/src/Core/Replicate.h
+++ b/nuparu/include/Eigen/src/Core/Replicate.h
@@ -35,10 +35,7 @@ struct traits<Replicate<MatrixType,RowFactor,ColFactor> >
   typedef typename MatrixType::Scalar Scalar;
   typedef typename traits<MatrixType>::StorageKind StorageKind;
   typedef typename traits<MatrixType>::XprKind XprKind;
-  enum {
-    Factor = (RowFactor==Dynamic || ColFactor==Dynamic) ? Dynamic : RowFactor*ColFactor
-  };
-  typedef typename nested<MatrixType,Factor>::type MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
   typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
   enum {
     RowsAtCompileTime = RowFactor==Dynamic || int(MatrixType::RowsAtCompileTime)==Dynamic
@@ -53,8 +50,9 @@ struct traits<Replicate<MatrixType,RowFactor,ColFactor> >
     IsRowMajor = MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1 ? 1
                : MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1 ? 0
                : (MatrixType::Flags & RowMajorBit) ? 1 : 0,
-    Flags = (_MatrixTypeNested::Flags & HereditaryBits & ~RowMajorBit) | (IsRowMajor ? RowMajorBit : 0),
-    CoeffReadCost = _MatrixTypeNested::CoeffReadCost
+    
+    // FIXME enable DirectAccess with negative strides?
+    Flags = IsRowMajor ? RowMajorBit : 0
   };
 };
 }
@@ -68,10 +66,12 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
 
     typedef typename internal::dense_xpr_base<Replicate>::type Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(Replicate)
+    typedef typename internal::remove_all<MatrixType>::type NestedExpression;
 
     template<typename OriginalMatrixType>
-    inline explicit Replicate(const OriginalMatrixType& a_matrix)
-      : m_matrix(a_matrix), m_rowFactor(RowFactor), m_colFactor(ColFactor)
+    EIGEN_DEVICE_FUNC
+    inline explicit Replicate(const OriginalMatrixType& matrix)
+      : m_matrix(matrix), m_rowFactor(RowFactor), m_colFactor(ColFactor)
     {
       EIGEN_STATIC_ASSERT((internal::is_same<typename internal::remove_const<MatrixType>::type,OriginalMatrixType>::value),
                           THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE)
@@ -79,41 +79,20 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
     }
 
     template<typename OriginalMatrixType>
-    inline Replicate(const OriginalMatrixType& a_matrix, Index rowFactor, Index colFactor)
-      : m_matrix(a_matrix), m_rowFactor(rowFactor), m_colFactor(colFactor)
+    EIGEN_DEVICE_FUNC
+    inline Replicate(const OriginalMatrixType& matrix, Index rowFactor, Index colFactor)
+      : m_matrix(matrix), m_rowFactor(rowFactor), m_colFactor(colFactor)
     {
       EIGEN_STATIC_ASSERT((internal::is_same<typename internal::remove_const<MatrixType>::type,OriginalMatrixType>::value),
                           THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE)
     }
 
+    EIGEN_DEVICE_FUNC
     inline Index rows() const { return m_matrix.rows() * m_rowFactor.value(); }
+    EIGEN_DEVICE_FUNC
     inline Index cols() const { return m_matrix.cols() * m_colFactor.value(); }
 
-    inline Scalar coeff(Index rowId, Index colId) const
-    {
-      // try to avoid using modulo; this is a pure optimization strategy
-      const Index actual_row  = internal::traits<MatrixType>::RowsAtCompileTime==1 ? 0
-                            : RowFactor==1 ? rowId
-                            : rowId%m_matrix.rows();
-      const Index actual_col  = internal::traits<MatrixType>::ColsAtCompileTime==1 ? 0
-                            : ColFactor==1 ? colId
-                            : colId%m_matrix.cols();
-
-      return m_matrix.coeff(actual_row, actual_col);
-    }
-    template<int LoadMode>
-    inline PacketScalar packet(Index rowId, Index colId) const
-    {
-      const Index actual_row  = internal::traits<MatrixType>::RowsAtCompileTime==1 ? 0
-                            : RowFactor==1 ? rowId
-                            : rowId%m_matrix.rows();
-      const Index actual_col  = internal::traits<MatrixType>::ColsAtCompileTime==1 ? 0
-                            : ColFactor==1 ? colId
-                            : colId%m_matrix.cols();
-
-      return m_matrix.template packet<LoadMode>(actual_row, actual_col);
-    }
-
+    EIGEN_DEVICE_FUNC
     const _MatrixTypeNested& nestedExpression() const
     { 
       return m_matrix; 
@@ -135,27 +114,12 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
   */
 template<typename Derived>
 template<int RowFactor, int ColFactor>
-inline const Replicate<Derived,RowFactor,ColFactor>
+const Replicate<Derived,RowFactor,ColFactor>
 DenseBase<Derived>::replicate() const
 {
   return Replicate<Derived,RowFactor,ColFactor>(derived());
 }
 
-/**
-  * \return an expression of the replication of \c *this
-  *
-  * Example: \include MatrixBase_replicate_int_int.cpp
-  * Output: \verbinclude MatrixBase_replicate_int_int.out
-  *
-  * \sa VectorwiseOp::replicate(), DenseBase::replicate<int,int>(), class Replicate
-  */
-template<typename Derived>
-inline const Replicate<Derived,Dynamic,Dynamic>
-DenseBase<Derived>::replicate(Index rowFactor,Index colFactor) const
-{
-  return Replicate<Derived,Dynamic,Dynamic>(derived(),rowFactor,colFactor);
-}
-
 /**
   * \return an expression of the replication of each column (or row) of \c *this
   *
diff --git a/nuparu/include/Eigen/src/Core/ReturnByValue.h b/nuparu/include/Eigen/src/Core/ReturnByValue.h
index d66c24ba..7feb6e01 100644
--- a/nuparu/include/Eigen/src/Core/ReturnByValue.h
+++ b/nuparu/include/Eigen/src/Core/ReturnByValue.h
@@ -38,9 +38,10 @@ struct traits<ReturnByValue<Derived> >
  * So internal::nested always gives the plain return matrix type.
  *
  * FIXME: I don't understand why we need this specialization: isn't this taken care of by the EvalBeforeNestingBit ??
+ * Answer: EvalBeforeNestingBit should be deprecated since we have the evaluators
  */
 template<typename Derived,int n,typename PlainObject>
-struct nested<ReturnByValue<Derived>, n, PlainObject>
+struct nested_eval<ReturnByValue<Derived>, n, PlainObject>
 {
   typedef typename traits<Derived>::ReturnType type;
 };
@@ -48,7 +49,7 @@ struct nested<ReturnByValue<Derived>, n, PlainObject>
 } // end namespace internal
 
 template<typename Derived> class ReturnByValue
-  : internal::no_assignment_operator, public internal::dense_xpr_base< ReturnByValue<Derived> >::type
+  : public internal::dense_xpr_base< ReturnByValue<Derived> >::type, internal::no_assignment_operator
 {
   public:
     typedef typename internal::traits<Derived>::ReturnType ReturnType;
@@ -57,10 +58,11 @@ template<typename Derived> class ReturnByValue
     EIGEN_DENSE_PUBLIC_INTERFACE(ReturnByValue)
 
     template<typename Dest>
+    EIGEN_DEVICE_FUNC
     inline void evalTo(Dest& dst) const
     { static_cast<const Derived*>(this)->evalTo(dst); }
-    inline Index rows() const { return static_cast<const Derived*>(this)->rows(); }
-    inline Index cols() const { return static_cast<const Derived*>(this)->cols(); }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return static_cast<const Derived*>(this)->rows(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return static_cast<const Derived*>(this)->cols(); }
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 #define Unusable YOU_ARE_TRYING_TO_ACCESS_A_SINGLE_COEFFICIENT_IN_A_SPECIAL_EXPRESSION_WHERE_THAT_IS_NOT_ALLOWED_BECAUSE_THAT_WOULD_BE_INEFFICIENT
@@ -72,6 +74,7 @@ template<typename Derived> class ReturnByValue
     const Unusable& coeff(Index,Index) const { return *reinterpret_cast<const Unusable*>(this); }
     Unusable& coeffRef(Index) { return *reinterpret_cast<Unusable*>(this); }
     Unusable& coeffRef(Index,Index) { return *reinterpret_cast<Unusable*>(this); }
+#undef Unusable
 #endif
 };
 
@@ -83,6 +86,33 @@ Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
   return derived();
 }
 
+namespace internal {
+
+// Expression is evaluated in a temporary; default implementation of Assignment is bypassed so that
+// when a ReturnByValue expression is assigned, the evaluator is not constructed.
+// TODO: Finalize port to new regime; ReturnByValue should not exist in the expression world
+  
+template<typename Derived>
+struct evaluator<ReturnByValue<Derived> >
+  : public evaluator<typename internal::traits<Derived>::ReturnType>
+{
+  typedef ReturnByValue<Derived> XprType;
+  typedef typename internal::traits<Derived>::ReturnType PlainObject;
+  typedef evaluator<PlainObject> Base;
+  
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
+    : m_result(xpr.rows(), xpr.cols())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+    xpr.evalTo(m_result);
+  }
+
+protected:
+  PlainObject m_result;
+};
+
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_RETURNBYVALUE_H
diff --git a/nuparu/include/Eigen/src/Core/Reverse.h b/nuparu/include/Eigen/src/Core/Reverse.h
index e30ae3d2..d7c380c7 100644
--- a/nuparu/include/Eigen/src/Core/Reverse.h
+++ b/nuparu/include/Eigen/src/Core/Reverse.h
@@ -37,32 +37,25 @@ struct traits<Reverse<MatrixType, Direction> >
   typedef typename MatrixType::Scalar Scalar;
   typedef typename traits<MatrixType>::StorageKind StorageKind;
   typedef typename traits<MatrixType>::XprKind XprKind;
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
   typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
   enum {
     RowsAtCompileTime = MatrixType::RowsAtCompileTime,
     ColsAtCompileTime = MatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-
-    // let's enable LinearAccess only with vectorization because of the product overhead
-    LinearAccess = ( (Direction==BothDirections) && (int(_MatrixTypeNested::Flags)&PacketAccessBit) )
-                 ? LinearAccessBit : 0,
-
-    Flags = int(_MatrixTypeNested::Flags) & (HereditaryBits | LvalueBit | PacketAccessBit | LinearAccess),
-
-    CoeffReadCost = _MatrixTypeNested::CoeffReadCost
+    Flags = _MatrixTypeNested::Flags & (RowMajorBit | LvalueBit)
   };
 };
 
-template<typename PacketScalar, bool ReversePacket> struct reverse_packet_cond
+template<typename PacketType, bool ReversePacket> struct reverse_packet_cond
 {
-  static inline PacketScalar run(const PacketScalar& x) { return preverse(x); }
+  static inline PacketType run(const PacketType& x) { return preverse(x); }
 };
 
-template<typename PacketScalar> struct reverse_packet_cond<PacketScalar,false>
+template<typename PacketType> struct reverse_packet_cond<PacketType,false>
 {
-  static inline PacketScalar run(const PacketScalar& x) { return x; }
+  static inline PacketType run(const PacketType& x) { return x; }
 };
 
 } // end namespace internal 
@@ -74,12 +67,9 @@ template<typename MatrixType, int Direction> class Reverse
 
     typedef typename internal::dense_xpr_base<Reverse>::type Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(Reverse)
+    typedef typename internal::remove_all<MatrixType>::type NestedExpression;
     using Base::IsRowMajor;
 
-    // next line is necessary because otherwise const version of operator()
-    // is hidden by non-const version defined in this file
-    using Base::operator(); 
-
   protected:
     enum {
       PacketSize = internal::packet_traits<Scalar>::size,
@@ -95,82 +85,19 @@ template<typename MatrixType, int Direction> class Reverse
     typedef internal::reverse_packet_cond<PacketScalar,ReversePacket> reverse_packet;
   public:
 
-    inline Reverse(const MatrixType& matrix) : m_matrix(matrix) { }
+    EIGEN_DEVICE_FUNC explicit inline Reverse(const MatrixType& matrix) : m_matrix(matrix) { }
 
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reverse)
 
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.cols(); }
 
-    inline Index innerStride() const
+    EIGEN_DEVICE_FUNC inline Index innerStride() const
     {
       return -m_matrix.innerStride();
     }
 
-    inline Scalar& operator()(Index row, Index col)
-    {
-      eigen_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
-      return coeffRef(row, col);
-    }
-
-    inline Scalar& coeffRef(Index row, Index col)
-    {
-      return m_matrix.const_cast_derived().coeffRef(ReverseRow ? m_matrix.rows() - row - 1 : row,
-                                                    ReverseCol ? m_matrix.cols() - col - 1 : col);
-    }
-
-    inline CoeffReturnType coeff(Index row, Index col) const
-    {
-      return m_matrix.coeff(ReverseRow ? m_matrix.rows() - row - 1 : row,
-                            ReverseCol ? m_matrix.cols() - col - 1 : col);
-    }
-
-    inline CoeffReturnType coeff(Index index) const
-    {
-      return m_matrix.coeff(m_matrix.size() - index - 1);
-    }
-
-    inline Scalar& coeffRef(Index index)
-    {
-      return m_matrix.const_cast_derived().coeffRef(m_matrix.size() - index - 1);
-    }
-
-    inline Scalar& operator()(Index index)
-    {
-      eigen_assert(index >= 0 && index < m_matrix.size());
-      return coeffRef(index);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index row, Index col) const
-    {
-      return reverse_packet::run(m_matrix.template packet<LoadMode>(
-                                    ReverseRow ? m_matrix.rows() - row - OffsetRow : row,
-                                    ReverseCol ? m_matrix.cols() - col - OffsetCol : col));
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index row, Index col, const PacketScalar& x)
-    {
-      m_matrix.const_cast_derived().template writePacket<LoadMode>(
-                                      ReverseRow ? m_matrix.rows() - row - OffsetRow : row,
-                                      ReverseCol ? m_matrix.cols() - col - OffsetCol : col,
-                                      reverse_packet::run(x));
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index index) const
-    {
-      return internal::preverse(m_matrix.template packet<LoadMode>( m_matrix.size() - index - PacketSize ));
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& x)
-    {
-      m_matrix.const_cast_derived().template writePacket<LoadMode>(m_matrix.size() - index - PacketSize, internal::preverse(x));
-    }
-
-    const typename internal::remove_all<typename MatrixType::Nested>::type& 
+    EIGEN_DEVICE_FUNC const typename internal::remove_all<typename MatrixType::Nested>::type&
     nestedExpression() const 
     {
       return m_matrix;
@@ -190,33 +117,93 @@ template<typename Derived>
 inline typename DenseBase<Derived>::ReverseReturnType
 DenseBase<Derived>::reverse()
 {
-  return derived();
+  return ReverseReturnType(derived());
 }
 
-/** This is the const version of reverse(). */
-template<typename Derived>
-inline const typename DenseBase<Derived>::ConstReverseReturnType
-DenseBase<Derived>::reverse() const
-{
-  return derived();
-}
+
+// The const overload of reverse() was moved to DenseBase.h due to a CUDA compiler bug
 
 /** This is the "in place" version of reverse: it reverses \c *this.
   *
   * In most cases it is probably better to simply use the reversed expression
   * of a matrix. However, when reversing the matrix data itself is really needed,
   * then this "in-place" version is probably the right choice because it provides
-  * the following additional features:
+  * the following additional benefits:
   *  - less error prone: doing the same operation with .reverse() requires special care:
   *    \code m = m.reverse().eval(); \endcode
-  *  - this API allows to avoid creating a temporary (the current implementation creates a temporary, but that could be avoided using swap)
+  *  - this API enables reverse operations without the need for a temporary
   *  - it allows future optimizations (cache friendliness, etc.)
   *
-  * \sa reverse() */
+  * \sa VectorwiseOp::reverseInPlace(), reverse() */
 template<typename Derived>
 inline void DenseBase<Derived>::reverseInPlace()
 {
-  derived() = derived().reverse().eval();
+  if(cols()>rows())
+  {
+    Index half = cols()/2;
+    leftCols(half).swap(rightCols(half).reverse());
+    if((cols()%2)==1)
+    {
+      Index half2 = rows()/2;
+      col(half).head(half2).swap(col(half).tail(half2).reverse());
+    }
+  }
+  else
+  {
+    Index half = rows()/2;
+    topRows(half).swap(bottomRows(half).reverse());
+    if((rows()%2)==1)
+    {
+      Index half2 = cols()/2;
+      row(half).head(half2).swap(row(half).tail(half2).reverse());
+    }
+  }
+}
+
+namespace internal {
+  
+template<int Direction>
+struct vectorwise_reverse_inplace_impl;
+
+template<>
+struct vectorwise_reverse_inplace_impl<Vertical>
+{
+  template<typename ExpressionType>
+  static void run(ExpressionType &xpr)
+  {
+    Index half = xpr.rows()/2;
+    xpr.topRows(half).swap(xpr.bottomRows(half).colwise().reverse());
+  }
+};
+
+template<>
+struct vectorwise_reverse_inplace_impl<Horizontal>
+{
+  template<typename ExpressionType>
+  static void run(ExpressionType &xpr)
+  {
+    Index half = xpr.cols()/2;
+    xpr.leftCols(half).swap(xpr.rightCols(half).rowwise().reverse());
+  }
+};
+
+} // end namespace internal
+
+/** This is the "in place" version of VectorwiseOp::reverse: it reverses each column or row of \c *this.
+  *
+  * In most cases it is probably better to simply use the reversed expression
+  * of a matrix. However, when reversing the matrix data itself is really needed,
+  * then this "in-place" version is probably the right choice because it provides
+  * the following additional benefits:
+  *  - less error prone: doing the same operation with .reverse() requires special care:
+  *    \code m = m.reverse().eval(); \endcode
+  *  - this API enables reverse operations without the need for a temporary
+  *
+  * \sa DenseBase::reverseInPlace(), reverse() */
+template<typename ExpressionType, int Direction>
+void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
+{
+  internal::vectorwise_reverse_inplace_impl<Direction>::run(_expression().const_cast_derived());
 }
 
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Core/Select.h b/nuparu/include/Eigen/src/Core/Select.h
index 87993bbb..79eec1b5 100644
--- a/nuparu/include/Eigen/src/Core/Select.h
+++ b/nuparu/include/Eigen/src/Core/Select.h
@@ -43,23 +43,21 @@ struct traits<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
     ColsAtCompileTime = ConditionMatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = ConditionMatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = ConditionMatrixType::MaxColsAtCompileTime,
-    Flags = (unsigned int)ThenMatrixType::Flags & ElseMatrixType::Flags & HereditaryBits,
-    CoeffReadCost = traits<typename remove_all<ConditionMatrixNested>::type>::CoeffReadCost
-                  + EIGEN_SIZE_MAX(traits<typename remove_all<ThenMatrixNested>::type>::CoeffReadCost,
-                                   traits<typename remove_all<ElseMatrixNested>::type>::CoeffReadCost)
+    Flags = (unsigned int)ThenMatrixType::Flags & ElseMatrixType::Flags & RowMajorBit
   };
 };
 }
 
 template<typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType>
-class Select : internal::no_assignment_operator,
-  public internal::dense_xpr_base< Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >::type
+class Select : public internal::dense_xpr_base< Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >::type,
+               internal::no_assignment_operator
 {
   public:
 
     typedef typename internal::dense_xpr_base<Select>::type Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(Select)
 
+    inline EIGEN_DEVICE_FUNC
     Select(const ConditionMatrixType& a_conditionMatrix,
            const ThenMatrixType& a_thenMatrix,
            const ElseMatrixType& a_elseMatrix)
@@ -69,9 +67,10 @@ class Select : internal::no_assignment_operator,
       eigen_assert(m_condition.cols() == m_then.cols() && m_condition.cols() == m_else.cols());
     }
 
-    Index rows() const { return m_condition.rows(); }
-    Index cols() const { return m_condition.cols(); }
+    inline EIGEN_DEVICE_FUNC Index rows() const { return m_condition.rows(); }
+    inline EIGEN_DEVICE_FUNC Index cols() const { return m_condition.cols(); }
 
+    inline EIGEN_DEVICE_FUNC
     const Scalar coeff(Index i, Index j) const
     {
       if (m_condition.coeff(i,j))
@@ -80,6 +79,7 @@ class Select : internal::no_assignment_operator,
         return m_else.coeff(i,j);
     }
 
+    inline EIGEN_DEVICE_FUNC
     const Scalar coeff(Index i) const
     {
       if (m_condition.coeff(i))
@@ -88,17 +88,17 @@ class Select : internal::no_assignment_operator,
         return m_else.coeff(i);
     }
 
-    const ConditionMatrixType& conditionMatrix() const
+    inline EIGEN_DEVICE_FUNC const ConditionMatrixType& conditionMatrix() const
     {
       return m_condition;
     }
 
-    const ThenMatrixType& thenMatrix() const
+    inline EIGEN_DEVICE_FUNC const ThenMatrixType& thenMatrix() const
     {
       return m_then;
     }
 
-    const ElseMatrixType& elseMatrix() const
+    inline EIGEN_DEVICE_FUNC const ElseMatrixType& elseMatrix() const
     {
       return m_else;
     }
diff --git a/nuparu/include/Eigen/src/Core/SelfAdjointView.h b/nuparu/include/Eigen/src/Core/SelfAdjointView.h
index 6fa7cd15..87e87ab3 100644
--- a/nuparu/include/Eigen/src/Core/SelfAdjointView.h
+++ b/nuparu/include/Eigen/src/Core/SelfAdjointView.h
@@ -32,54 +32,57 @@ namespace internal {
 template<typename MatrixType, unsigned int UpLo>
 struct traits<SelfAdjointView<MatrixType, UpLo> > : traits<MatrixType>
 {
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
   typedef typename remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
   typedef MatrixType ExpressionType;
-  typedef typename MatrixType::PlainObject DenseMatrixType;
+  typedef typename MatrixType::PlainObject FullMatrixType;
   enum {
     Mode = UpLo | SelfAdjoint,
-    Flags =  MatrixTypeNestedCleaned::Flags & (HereditaryBits)
-           & (~(PacketAccessBit | DirectAccessBit | LinearAccessBit)), // FIXME these flags should be preserved
-    CoeffReadCost = MatrixTypeNestedCleaned::CoeffReadCost
+    FlagsLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
+    Flags =  MatrixTypeNestedCleaned::Flags & (HereditaryBits|FlagsLvalueBit)
+           & (~(PacketAccessBit | DirectAccessBit | LinearAccessBit)) // FIXME these flags should be preserved
   };
 };
 }
 
-template <typename Lhs, int LhsMode, bool LhsIsVector,
-          typename Rhs, int RhsMode, bool RhsIsVector>
-struct SelfadjointProductMatrix;
-
 // FIXME could also be called SelfAdjointWrapper to be consistent with DiagonalWrapper ??
-template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
-  : public TriangularBase<SelfAdjointView<MatrixType, UpLo> >
+template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
+  : public TriangularBase<SelfAdjointView<_MatrixType, UpLo> >
 {
   public:
 
+    typedef _MatrixType MatrixType;
     typedef TriangularBase<SelfAdjointView> Base;
     typedef typename internal::traits<SelfAdjointView>::MatrixTypeNested MatrixTypeNested;
     typedef typename internal::traits<SelfAdjointView>::MatrixTypeNestedCleaned MatrixTypeNestedCleaned;
 
     /** \brief The type of coefficients in this matrix */
     typedef typename internal::traits<SelfAdjointView>::Scalar Scalar; 
-
-    typedef typename MatrixType::Index Index;
+    typedef typename MatrixType::StorageIndex StorageIndex;
 
     enum {
-      Mode = internal::traits<SelfAdjointView>::Mode
+      Mode = internal::traits<SelfAdjointView>::Mode,
+      Flags = internal::traits<SelfAdjointView>::Flags
     };
     typedef typename MatrixType::PlainObject PlainObject;
 
-    inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
+    EIGEN_DEVICE_FUNC
+    explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
     {}
 
+    EIGEN_DEVICE_FUNC
     inline Index rows() const { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC
     inline Index cols() const { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC
     inline Index outerStride() const { return m_matrix.outerStride(); }
+    EIGEN_DEVICE_FUNC
     inline Index innerStride() const { return m_matrix.innerStride(); }
 
     /** \sa MatrixBase::coeff()
       * \warning the coordinates must fit into the referenced triangular part
       */
+    EIGEN_DEVICE_FUNC
     inline Scalar coeff(Index row, Index col) const
     {
       Base::check_coordinates_internal(row, col);
@@ -89,36 +92,46 @@ template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
     /** \sa MatrixBase::coeffRef()
       * \warning the coordinates must fit into the referenced triangular part
       */
+    EIGEN_DEVICE_FUNC
     inline Scalar& coeffRef(Index row, Index col)
     {
+      EIGEN_STATIC_ASSERT_LVALUE(SelfAdjointView);
       Base::check_coordinates_internal(row, col);
       return m_matrix.const_cast_derived().coeffRef(row, col);
     }
 
     /** \internal */
+    EIGEN_DEVICE_FUNC
     const MatrixTypeNestedCleaned& _expression() const { return m_matrix; }
 
+    EIGEN_DEVICE_FUNC
     const MatrixTypeNestedCleaned& nestedExpression() const { return m_matrix; }
+    EIGEN_DEVICE_FUNC
     MatrixTypeNestedCleaned& nestedExpression() { return *const_cast<MatrixTypeNestedCleaned*>(&m_matrix); }
 
-    /** Efficient self-adjoint matrix times vector/matrix product */
+    /** Efficient self-adjoint matrix times vector/matrix product */
     template<typename OtherDerived>
-    SelfadjointProductMatrix<MatrixType,Mode,false,OtherDerived,0,OtherDerived::IsVectorAtCompileTime>
+    EIGEN_DEVICE_FUNC
+    const Product<SelfAdjointView,OtherDerived>
     operator*(const MatrixBase<OtherDerived>& rhs) const
     {
-      return SelfadjointProductMatrix
-              <MatrixType,Mode,false,OtherDerived,0,OtherDerived::IsVectorAtCompileTime>
-              (m_matrix, rhs.derived());
+      return Product<SelfAdjointView,OtherDerived>(*this, rhs.derived());
     }
 
-    /** Efficient vector/matrix times self-adjoint matrix product */
+    /** Efficient vector/matrix times self-adjoint matrix product */
     template<typename OtherDerived> friend
-    SelfadjointProductMatrix<OtherDerived,0,OtherDerived::IsVectorAtCompileTime,MatrixType,Mode,false>
+    EIGEN_DEVICE_FUNC
+    const Product<OtherDerived,SelfAdjointView>
     operator*(const MatrixBase<OtherDerived>& lhs, const SelfAdjointView& rhs)
     {
-      return SelfadjointProductMatrix
-              <OtherDerived,0,OtherDerived::IsVectorAtCompileTime,MatrixType,Mode,false>
-              (lhs.derived(),rhs.m_matrix);
+      return Product<OtherDerived,SelfAdjointView>(lhs.derived(),rhs);
+    }
+    
+    friend EIGEN_DEVICE_FUNC
+    const SelfAdjointView<const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,MatrixType>,UpLo>
+    operator*(const Scalar& s, const SelfAdjointView& mat)
+    {
+      return (s*mat.nestedExpression()).template selfadjointView<UpLo>();
     }
 
     /** Perform a symmetric rank 2 update of the selfadjoint matrix \c *this:
@@ -132,6 +145,7 @@ template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
       * \sa rankUpdate(const MatrixBase<DerivedU>&, Scalar)
       */
     template<typename DerivedU, typename DerivedV>
+    EIGEN_DEVICE_FUNC
     SelfAdjointView& rankUpdate(const MatrixBase<DerivedU>& u, const MatrixBase<DerivedV>& v, const Scalar& alpha = Scalar(1));
 
     /** Perform a symmetric rank K update of the selfadjoint matrix \c *this:
@@ -145,6 +159,7 @@ template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
       * \sa rankUpdate(const MatrixBase<DerivedU>&, const MatrixBase<DerivedV>&, Scalar)
       */
     template<typename DerivedU>
+    EIGEN_DEVICE_FUNC
     SelfAdjointView& rankUpdate(const MatrixBase<DerivedU>& u, const Scalar& alpha = Scalar(1));
 
 /////////// Cholesky module ///////////
@@ -159,31 +174,10 @@ template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
     /** Return type of eigenvalues() */
     typedef Matrix<RealScalar, internal::traits<MatrixType>::ColsAtCompileTime, 1> EigenvaluesReturnType;
 
+    EIGEN_DEVICE_FUNC
     EigenvaluesReturnType eigenvalues() const;
+    EIGEN_DEVICE_FUNC
     RealScalar operatorNorm() const;
-    
-    #ifdef EIGEN2_SUPPORT
-    template<typename OtherDerived>
-    SelfAdjointView& operator=(const MatrixBase<OtherDerived>& other)
-    {
-      enum {
-        OtherPart = UpLo == Upper ? StrictlyLower : StrictlyUpper
-      };
-      m_matrix.const_cast_derived().template triangularView<UpLo>() = other;
-      m_matrix.const_cast_derived().template triangularView<OtherPart>() = other.adjoint();
-      return *this;
-    }
-    template<typename OtherMatrixType, unsigned int OtherMode>
-    SelfAdjointView& operator=(const TriangularView<OtherMatrixType, OtherMode>& other)
-    {
-      enum {
-        OtherPart = UpLo == Upper ? StrictlyLower : StrictlyUpper
-      };
-      m_matrix.const_cast_derived().template triangularView<UpLo>() = other.toDenseMatrix();
-      m_matrix.const_cast_derived().template triangularView<OtherPart>() = other.toDenseMatrix().adjoint();
-      return *this;
-    }
-    #endif
 
   protected:
     MatrixTypeNested m_matrix;
@@ -201,90 +195,56 @@ template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
 
 namespace internal {
 
-template<typename Derived1, typename Derived2, int UnrollCount, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Upper), UnrollCount, ClearOpposite>
-{
-  enum {
-    col = (UnrollCount-1) / Derived1::RowsAtCompileTime,
-    row = (UnrollCount-1) % Derived1::RowsAtCompileTime
-  };
-
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Upper), UnrollCount-1, ClearOpposite>::run(dst, src);
-
-    if(row == col)
-      dst.coeffRef(row, col) = numext::real(src.coeff(row, col));
-    else if(row < col)
-      dst.coeffRef(col, row) = numext::conj(dst.coeffRef(row, col) = src.coeff(row, col));
-  }
-};
-
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Upper, 0, ClearOpposite>
+// TODO currently a selfadjoint expression has the form SelfAdjointView<.,.>
+//      in the future selfadjoint-ness should be defined by the expression traits
+//      such that Transpose<SelfAdjointView<.,.> > is valid. (currently TriangularBase::transpose() is overloaded to make it work)
+template<typename MatrixType, unsigned int Mode>
+struct evaluator_traits<SelfAdjointView<MatrixType,Mode> >
 {
-  static inline void run(Derived1 &, const Derived2 &) {}
+  typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind;
+  typedef SelfAdjointShape Shape;
+  
+  static const int AssumeAliasing = 0;
 };
 
-template<typename Derived1, typename Derived2, int UnrollCount, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Lower), UnrollCount, ClearOpposite>
+template<int UpLo, int SetOpposite, typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor, int Version>
+class triangular_dense_assignment_kernel<UpLo,SelfAdjoint,SetOpposite,DstEvaluatorTypeT,SrcEvaluatorTypeT,Functor,Version>
+  : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, Version>
 {
-  enum {
-    col = (UnrollCount-1) / Derived1::RowsAtCompileTime,
-    row = (UnrollCount-1) % Derived1::RowsAtCompileTime
-  };
-
-  static inline void run(Derived1 &dst, const Derived2 &src)
+protected:
+  typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, Version> Base;
+  typedef typename Base::DstXprType DstXprType;
+  typedef typename Base::SrcXprType SrcXprType;
+  using Base::m_dst;
+  using Base::m_src;
+  using Base::m_functor;
+public:
+  
+  typedef typename Base::DstEvaluatorType DstEvaluatorType;
+  typedef typename Base::SrcEvaluatorType SrcEvaluatorType;
+  typedef typename Base::Scalar Scalar;
+  typedef typename Base::AssignmentTraits AssignmentTraits;
+  
+  
+  EIGEN_DEVICE_FUNC triangular_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
+    : Base(dst, src, func, dstExpr)
+  {}
+  
+  EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col)
   {
-    triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Lower), UnrollCount-1, ClearOpposite>::run(dst, src);
-
-    if(row == col)
-      dst.coeffRef(row, col) = numext::real(src.coeff(row, col));
-    else if(row > col)
-      dst.coeffRef(col, row) = numext::conj(dst.coeffRef(row, col) = src.coeff(row, col));
+    eigen_internal_assert(row!=col);
+    Scalar tmp = m_src.coeff(row,col);
+    m_functor.assignCoeff(m_dst.coeffRef(row,col), tmp);
+    m_functor.assignCoeff(m_dst.coeffRef(col,row), numext::conj(tmp));
   }
-};
-
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Lower, 0, ClearOpposite>
-{
-  static inline void run(Derived1 &, const Derived2 &) {}
-};
-
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Upper, Dynamic, ClearOpposite>
-{
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1 &dst, const Derived2 &src)
+  
+  EIGEN_DEVICE_FUNC void assignDiagonalCoeff(Index id)
   {
-    for(Index j = 0; j < dst.cols(); ++j)
-    {
-      for(Index i = 0; i < j; ++i)
-      {
-        dst.copyCoeff(i, j, src);
-        dst.coeffRef(j,i) = numext::conj(dst.coeff(i,j));
-      }
-      dst.copyCoeff(j, j, src);
-    }
-  }
-};
-
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Lower, Dynamic, ClearOpposite>
-{
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-  typedef typename Derived1::Index Index;
-    for(Index i = 0; i < dst.rows(); ++i)
-    {
-      for(Index j = 0; j < i; ++j)
-      {
-        dst.copyCoeff(i, j, src);
-        dst.coeffRef(j,i) = numext::conj(dst.coeff(i,j));
-      }
-      dst.copyCoeff(i, i, src);
-    }
+    Base::assignCoeff(id,id);
   }
+  
+  EIGEN_DEVICE_FUNC void assignOppositeCoeff(Index, Index)
+  { eigen_internal_assert(false && "should never be called"); }
 };
 
 } // end namespace internal
@@ -298,7 +258,7 @@ template<unsigned int UpLo>
 typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
 MatrixBase<Derived>::selfadjointView() const
 {
-  return derived();
+  return typename ConstSelfAdjointViewReturnType<UpLo>::Type(derived());
 }
 
 template<typename Derived>
@@ -306,7 +266,7 @@ template<unsigned int UpLo>
 typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
 MatrixBase<Derived>::selfadjointView()
 {
-  return derived();
+  return typename SelfAdjointViewReturnType<UpLo>::Type(derived());
 }
 
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Core/SelfCwiseBinaryOp.h b/nuparu/include/Eigen/src/Core/SelfCwiseBinaryOp.h
index 22f3047b..38185d9d 100644
--- a/nuparu/include/Eigen/src/Core/SelfCwiseBinaryOp.h
+++ b/nuparu/include/Eigen/src/Core/SelfCwiseBinaryOp.h
@@ -12,183 +12,35 @@
 
 namespace Eigen { 
 
-/** \class SelfCwiseBinaryOp
-  * \ingroup Core_Module
-  *
-  * \internal
-  *
-  * \brief Internal helper class for optimizing operators like +=, -=
-  *
-  * This is a pseudo expression class re-implementing the copyCoeff/copyPacket
-  * method to directly performs a +=/-= operations in an optimal way. In particular,
-  * this allows to make sure that the input/output data are loaded only once using
-  * aligned packet loads.
-  *
-  * \sa class SwapWrapper for a similar trick.
-  */
-
-namespace internal {
-template<typename BinaryOp, typename Lhs, typename Rhs>
-struct traits<SelfCwiseBinaryOp<BinaryOp,Lhs,Rhs> >
-  : traits<CwiseBinaryOp<BinaryOp,Lhs,Rhs> >
+template<typename Derived>
+inline Derived& DenseBase<Derived>::operator*=(const Scalar& other)
 {
-  enum {
-    // Note that it is still a good idea to preserve the DirectAccessBit
-    // so that assign can correctly align the data.
-    Flags = traits<CwiseBinaryOp<BinaryOp,Lhs,Rhs> >::Flags | (Lhs::Flags&DirectAccessBit) | (Lhs::Flags&LvalueBit),
-    OuterStrideAtCompileTime = Lhs::OuterStrideAtCompileTime,
-    InnerStrideAtCompileTime = Lhs::InnerStrideAtCompileTime
-  };
-};
+  typedef typename Derived::PlainObject PlainObject;
+  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op<Scalar>());
+  return derived();
 }
 
-template<typename BinaryOp, typename Lhs, typename Rhs> class SelfCwiseBinaryOp
-  : public internal::dense_xpr_base< SelfCwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type
+template<typename Derived>
+inline Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
 {
-  public:
-
-    typedef typename internal::dense_xpr_base<SelfCwiseBinaryOp>::type Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(SelfCwiseBinaryOp)
-
-    typedef typename internal::packet_traits<Scalar>::type Packet;
-
-    inline SelfCwiseBinaryOp(Lhs& xpr, const BinaryOp& func = BinaryOp()) : m_matrix(xpr), m_functor(func) {}
-
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
-    inline Index outerStride() const { return m_matrix.outerStride(); }
-    inline Index innerStride() const { return m_matrix.innerStride(); }
-    inline const Scalar* data() const { return m_matrix.data(); }
-
-    // note that this function is needed by assign to correctly align loads/stores
-    // TODO make Assign use .data()
-    inline Scalar& coeffRef(Index row, Index col)
-    {
-      EIGEN_STATIC_ASSERT_LVALUE(Lhs)
-      return m_matrix.const_cast_derived().coeffRef(row, col);
-    }
-    inline const Scalar& coeffRef(Index row, Index col) const
-    {
-      return m_matrix.coeffRef(row, col);
-    }
-
-    // note that this function is needed by assign to correctly align loads/stores
-    // TODO make Assign use .data()
-    inline Scalar& coeffRef(Index index)
-    {
-      EIGEN_STATIC_ASSERT_LVALUE(Lhs)
-      return m_matrix.const_cast_derived().coeffRef(index);
-    }
-    inline const Scalar& coeffRef(Index index) const
-    {
-      return m_matrix.const_cast_derived().coeffRef(index);
-    }
-
-    template<typename OtherDerived>
-    void copyCoeff(Index row, Index col, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(row >= 0 && row < rows()
-                         && col >= 0 && col < cols());
-      Scalar& tmp = m_matrix.coeffRef(row,col);
-      tmp = m_functor(tmp, _other.coeff(row,col));
-    }
-
-    template<typename OtherDerived>
-    void copyCoeff(Index index, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(index >= 0 && index < m_matrix.size());
-      Scalar& tmp = m_matrix.coeffRef(index);
-      tmp = m_functor(tmp, _other.coeff(index));
-    }
-
-    template<typename OtherDerived, int StoreMode, int LoadMode>
-    void copyPacket(Index row, Index col, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(row >= 0 && row < rows()
-                        && col >= 0 && col < cols());
-      m_matrix.template writePacket<StoreMode>(row, col,
-        m_functor.packetOp(m_matrix.template packet<StoreMode>(row, col),_other.template packet<LoadMode>(row, col)) );
-    }
-
-    template<typename OtherDerived, int StoreMode, int LoadMode>
-    void copyPacket(Index index, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(index >= 0 && index < m_matrix.size());
-      m_matrix.template writePacket<StoreMode>(index,
-        m_functor.packetOp(m_matrix.template packet<StoreMode>(index),_other.template packet<LoadMode>(index)) );
-    }
-
-    // reimplement lazyAssign to handle complex *= real
-    // see CwiseBinaryOp ctor for details
-    template<typename RhsDerived>
-    EIGEN_STRONG_INLINE SelfCwiseBinaryOp& lazyAssign(const DenseBase<RhsDerived>& rhs)
-    {
-      EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Lhs,RhsDerived)
-      EIGEN_CHECK_BINARY_COMPATIBILIY(BinaryOp,typename Lhs::Scalar,typename RhsDerived::Scalar);
-      
-    #ifdef EIGEN_DEBUG_ASSIGN
-      internal::assign_traits<SelfCwiseBinaryOp, RhsDerived>::debug();
-    #endif
-      eigen_assert(rows() == rhs.rows() && cols() == rhs.cols());
-      internal::assign_impl<SelfCwiseBinaryOp, RhsDerived>::run(*this,rhs.derived());
-    #ifndef EIGEN_NO_DEBUG
-      this->checkTransposeAliasing(rhs.derived());
-    #endif
-      return *this;
-    }
-    
-    // overloaded to honor evaluation of special matrices
-    // maybe another solution would be to not use SelfCwiseBinaryOp
-    // at first...
-    SelfCwiseBinaryOp& operator=(const Rhs& _rhs)
-    {
-      typename internal::nested<Rhs>::type rhs(_rhs);
-      return Base::operator=(rhs);
-    }
-
-    Lhs& expression() const 
-    { 
-      return m_matrix;
-    }
-
-    const BinaryOp& functor() const 
-    { 
-      return m_functor;
-    }
-
-  protected:
-    Lhs& m_matrix;
-    const BinaryOp& m_functor;
-
-  private:
-    SelfCwiseBinaryOp& operator=(const SelfCwiseBinaryOp&);
-};
+  typedef typename Derived::PlainObject PlainObject;
+  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op<Scalar>());
+  return derived();
+}
 
 template<typename Derived>
-inline Derived& DenseBase<Derived>::operator*=(const Scalar& other)
+inline Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
 {
   typedef typename Derived::PlainObject PlainObject;
-  SelfCwiseBinaryOp<internal::scalar_product_op<Scalar>, Derived, typename PlainObject::ConstantReturnType> tmp(derived());
-  tmp = PlainObject::Constant(rows(),cols(),other);
+  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op<Scalar>());
   return derived();
 }
 
 template<typename Derived>
 inline Derived& DenseBase<Derived>::operator/=(const Scalar& other)
 {
-  typedef typename internal::conditional<NumTraits<Scalar>::IsInteger,
-                                        internal::scalar_quotient_op<Scalar>,
-                                        internal::scalar_product_op<Scalar> >::type BinOp;
   typedef typename Derived::PlainObject PlainObject;
-  SelfCwiseBinaryOp<BinOp, Derived, typename PlainObject::ConstantReturnType> tmp(derived());
-  Scalar actual_other;
-  if(NumTraits<Scalar>::IsInteger)  actual_other = other;
-  else                              actual_other = Scalar(1)/other;
-  tmp = PlainObject::Constant(rows(),cols(), actual_other);
+  internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op<Scalar>());
   return derived();
 }
 
diff --git a/nuparu/include/Eigen/src/Core/Solve.h b/nuparu/include/Eigen/src/Core/Solve.h
new file mode 100644
index 00000000..ba2ee53b
--- /dev/null
+++ b/nuparu/include/Eigen/src/Core/Solve.h
@@ -0,0 +1,173 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SOLVE_H
+#define EIGEN_SOLVE_H
+
+namespace Eigen {
+
+template<typename Decomposition, typename RhsType, typename StorageKind> class SolveImpl;
+  
+/** \class Solve
+  * \ingroup Core_Module
+  *
+  * \brief Pseudo expression representing a solving operation
+  *
+  * \tparam Decomposition the type of the matrix or decomposition object
+  * \tparam RhsType the type of the right-hand side
+  *
+  * This class represents an expression of A.solve(B)
+  * and most of the time this is the only way it is used.
+  *
+  */
+namespace internal {
+
+// this solve_traits class determines the evaluation type with respect to the storage kind (Dense vs Sparse)
+template<typename Decomposition, typename RhsType,typename StorageKind> struct solve_traits;
+
+template<typename Decomposition, typename RhsType>
+struct solve_traits<Decomposition,RhsType,Dense>
+{
+  typedef Matrix<typename RhsType::Scalar,
+                 Decomposition::ColsAtCompileTime,
+                 RhsType::ColsAtCompileTime,
+                 RhsType::PlainObject::Options,
+                 Decomposition::MaxColsAtCompileTime,
+                 RhsType::MaxColsAtCompileTime> PlainObject;  
+};
+
+template<typename Decomposition, typename RhsType>
+struct traits<Solve<Decomposition, RhsType> >
+  : traits<typename solve_traits<Decomposition,RhsType,typename internal::traits<RhsType>::StorageKind>::PlainObject>
+{
+  typedef typename solve_traits<Decomposition,RhsType,typename internal::traits<RhsType>::StorageKind>::PlainObject PlainObject;
+  typedef typename promote_index_type<typename Decomposition::StorageIndex, typename RhsType::StorageIndex>::type StorageIndex;
+  typedef traits<PlainObject> BaseTraits;
+  enum {
+    Flags = BaseTraits::Flags & RowMajorBit,
+    CoeffReadCost = HugeCost
+  };
+};
+
+}
+
+
+template<typename Decomposition, typename RhsType>
+class Solve : public SolveImpl<Decomposition,RhsType,typename internal::traits<RhsType>::StorageKind>
+{
+public:
+  typedef typename internal::traits<Solve>::PlainObject PlainObject;
+  typedef typename internal::traits<Solve>::StorageIndex StorageIndex;
+  
+  Solve(const Decomposition &dec, const RhsType &rhs)
+    : m_dec(dec), m_rhs(rhs)
+  {}
+  
+  EIGEN_DEVICE_FUNC Index rows() const { return m_dec.cols(); }
+  EIGEN_DEVICE_FUNC Index cols() const { return m_rhs.cols(); }
+
+  EIGEN_DEVICE_FUNC const Decomposition& dec() const { return m_dec; }
+  EIGEN_DEVICE_FUNC const RhsType&       rhs() const { return m_rhs; }
+
+protected:
+  const Decomposition &m_dec;
+  const RhsType       &m_rhs;
+};
+
+
+// Specialization of the Solve expression for dense results
+template<typename Decomposition, typename RhsType>
+class SolveImpl<Decomposition,RhsType,Dense>
+  : public MatrixBase<Solve<Decomposition,RhsType> >
+{
+  typedef Solve<Decomposition,RhsType> Derived;
+  
+public:
+  
+  typedef MatrixBase<Solve<Decomposition,RhsType> > Base;
+  EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
+
+private:
+  
+  Scalar coeff(Index row, Index col) const;
+  Scalar coeff(Index i) const;
+};
+
+// Generic API dispatcher
+template<typename Decomposition, typename RhsType, typename StorageKind>
+class SolveImpl : public internal::generic_xpr_base<Solve<Decomposition,RhsType>, MatrixXpr, StorageKind>::type
+{
+  public:
+    typedef typename internal::generic_xpr_base<Solve<Decomposition,RhsType>, MatrixXpr, StorageKind>::type Base;
+};
+
+namespace internal {
+
+// Evaluator of Solve -> eval into a temporary
+template<typename Decomposition, typename RhsType>
+struct evaluator<Solve<Decomposition,RhsType> >
+  : public evaluator<typename Solve<Decomposition,RhsType>::PlainObject>
+{
+  typedef Solve<Decomposition,RhsType> SolveType;
+  typedef typename SolveType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  enum { Flags = Base::Flags | EvalBeforeNestingBit };
+  
+  EIGEN_DEVICE_FUNC explicit evaluator(const SolveType& solve)
+    : m_result(solve.rows(), solve.cols())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+    solve.dec()._solve_impl(solve.rhs(), m_result);
+  }
+  
+protected:  
+  PlainObject m_result;
+};
+
+// Specialization for "dst = dec.solve(rhs)"
+// NOTE we need to specialize it for Dense2Dense to avoid ambiguous specialization error and a Sparse2Sparse specialization must exist somewhere
+template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
+struct Assignment<DstXprType, Solve<DecType,RhsType>, internal::assign_op<Scalar>, Dense2Dense, Scalar>
+{
+  typedef Solve<DecType,RhsType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  {
+    // FIXME shall we resize dst here?
+    src.dec()._solve_impl(src.rhs(), dst);
+  }
+};
+
+// Specialization for "dst = dec.transpose().solve(rhs)"
+template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
+struct Assignment<DstXprType, Solve<Transpose<const DecType>,RhsType>, internal::assign_op<Scalar>, Dense2Dense, Scalar>
+{
+  typedef Solve<Transpose<const DecType>,RhsType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  {
+    src.dec().nestedExpression().template _solve_impl_transposed<false>(src.rhs(), dst);
+  }
+};
+
+// Specialization for "dst = dec.adjoint().solve(rhs)"
+template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
+struct Assignment<DstXprType, Solve<CwiseUnaryOp<internal::scalar_conjugate_op<typename DecType::Scalar>, const Transpose<const DecType> >,RhsType>, internal::assign_op<Scalar>, Dense2Dense, Scalar>
+{
+  typedef Solve<CwiseUnaryOp<internal::scalar_conjugate_op<typename DecType::Scalar>, const Transpose<const DecType> >,RhsType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  {
+    src.dec().nestedExpression().nestedExpression().template _solve_impl_transposed<true>(src.rhs(), dst);
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SOLVE_H
diff --git a/nuparu/include/Eigen/src/Core/SolveTriangular.h b/nuparu/include/Eigen/src/Core/SolveTriangular.h
index ef17f288..5a201044 100644
--- a/nuparu/include/Eigen/src/Core/SolveTriangular.h
+++ b/nuparu/include/Eigen/src/Core/SolveTriangular.h
@@ -68,7 +68,7 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,1>
     if(!useRhsDirectly)
       MappedRhs(actualRhs,rhs.size()) = rhs;
 
-    triangular_solve_vector<LhsScalar, RhsScalar, typename Lhs::Index, Side, Mode, LhsProductTraits::NeedToConjugate,
+    triangular_solve_vector<LhsScalar, RhsScalar, Index, Side, Mode, LhsProductTraits::NeedToConjugate,
                             (int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor>
       ::run(actualLhs.cols(), actualLhs.data(), actualLhs.outerStride(), actualRhs);
 
@@ -82,7 +82,6 @@ template<typename Lhs, typename Rhs, int Side, int Mode>
 struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic>
 {
   typedef typename Rhs::Scalar Scalar;
-  typedef typename Rhs::Index Index;
   typedef blas_traits<Lhs> LhsProductTraits;
   typedef typename LhsProductTraits::DirectLinearAccessType ActualLhsType;
 
@@ -96,7 +95,7 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic>
     typedef internal::gemm_blocking_space<(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
               Rhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxRowsAtCompileTime,4> BlockingType;
 
-    BlockingType blocking(rhs.rows(), rhs.cols(), size);
+    BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false);
 
     triangular_solve_matrix<Scalar,Index,Side,Mode,LhsProductTraits::NeedToConjugate,(int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor,
                                (Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor>
@@ -108,32 +107,32 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic>
 * meta-unrolling implementation
 ***************************************************************************/
 
-template<typename Lhs, typename Rhs, int Mode, int Index, int Size,
-         bool Stop = Index==Size>
+template<typename Lhs, typename Rhs, int Mode, int LoopIndex, int Size,
+         bool Stop = LoopIndex==Size>
 struct triangular_solver_unroller;
 
-template<typename Lhs, typename Rhs, int Mode, int Index, int Size>
-struct triangular_solver_unroller<Lhs,Rhs,Mode,Index,Size,false> {
+template<typename Lhs, typename Rhs, int Mode, int LoopIndex, int Size>
+struct triangular_solver_unroller<Lhs,Rhs,Mode,LoopIndex,Size,false> {
   enum {
     IsLower = ((Mode&Lower)==Lower),
-    I = IsLower ? Index : Size - Index - 1,
-    S = IsLower ? 0     : I+1
+    DiagIndex  = IsLower ? LoopIndex : Size - LoopIndex - 1,
+    StartIndex = IsLower ? 0         : DiagIndex+1
   };
   static void run(const Lhs& lhs, Rhs& rhs)
   {
-    if (Index>0)
-      rhs.coeffRef(I) -= lhs.row(I).template segment<Index>(S).transpose()
-                         .cwiseProduct(rhs.template segment<Index>(S)).sum();
+    if (LoopIndex>0)
+      rhs.coeffRef(DiagIndex) -= lhs.row(DiagIndex).template segment<LoopIndex>(StartIndex).transpose()
+                                .cwiseProduct(rhs.template segment<LoopIndex>(StartIndex)).sum();
 
     if(!(Mode & UnitDiag))
-      rhs.coeffRef(I) /= lhs.coeff(I,I);
+      rhs.coeffRef(DiagIndex) /= lhs.coeff(DiagIndex,DiagIndex);
 
-    triangular_solver_unroller<Lhs,Rhs,Mode,Index+1,Size>::run(lhs,rhs);
+    triangular_solver_unroller<Lhs,Rhs,Mode,LoopIndex+1,Size>::run(lhs,rhs);
   }
 };
 
-template<typename Lhs, typename Rhs, int Mode, int Index, int Size>
-struct triangular_solver_unroller<Lhs,Rhs,Mode,Index,Size,true> {
+template<typename Lhs, typename Rhs, int Mode, int LoopIndex, int Size>
+struct triangular_solver_unroller<Lhs,Rhs,Mode,LoopIndex,Size,true> {
   static void run(const Lhs&, Rhs&) {}
 };
 
@@ -162,19 +161,12 @@ struct triangular_solver_selector<Lhs,Rhs,OnTheRight,Mode,CompleteUnrolling,1> {
 * TriangularView methods
 ***************************************************************************/
 
-/** "in-place" version of TriangularView::solve() where the result is written in \a other
-  *
-  * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
-  * This function will const_cast it, so constness isn't honored here.
-  *
-  * See TriangularView:solve() for the details.
-  */
 template<typename MatrixType, unsigned int Mode>
 template<int Side, typename OtherDerived>
-void TriangularView<MatrixType,Mode>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
+void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
 {
   OtherDerived& other = _other.const_cast_derived();
-  eigen_assert( cols() == rows() && ((Side==OnTheLeft && cols() == other.rows()) || (Side==OnTheRight && cols() == other.cols())) );
+  eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) );
   eigen_assert((!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower)));
 
   enum { copy = internal::traits<OtherDerived>::Flags & RowMajorBit  && OtherDerived::IsVectorAtCompileTime };
@@ -183,39 +175,18 @@ void TriangularView<MatrixType,Mode>::solveInPlace(const MatrixBase<OtherDerived
   OtherCopy otherCopy(other);
 
   internal::triangular_solver_selector<MatrixType, typename internal::remove_reference<OtherCopy>::type,
-    Side, Mode>::run(nestedExpression(), otherCopy);
+    Side, Mode>::run(derived().nestedExpression(), otherCopy);
 
   if (copy)
     other = otherCopy;
 }
 
-/** \returns the product of the inverse of \c *this with \a other, \a *this being triangular.
-  *
-  * This function computes the inverse-matrix matrix product inverse(\c *this) * \a other if
-  * \a Side==OnTheLeft (the default), or the right-inverse-multiply  \a other * inverse(\c *this) if
-  * \a Side==OnTheRight.
-  *
-  * The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the
-  * diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this
-  * is an upper (resp. lower) triangular matrix.
-  *
-  * Example: \include MatrixBase_marked.cpp
-  * Output: \verbinclude MatrixBase_marked.out
-  *
-  * This function returns an expression of the inverse-multiply and can works in-place if it is assigned
-  * to the same matrix or vector \a other.
-  *
-  * For users coming from BLAS, this function (and more specifically solveInPlace()) offer
-  * all the operations supported by the \c *TRSV and \c *TRSM BLAS routines.
-  *
-  * \sa TriangularView::solveInPlace()
-  */
 template<typename Derived, unsigned int Mode>
 template<int Side, typename Other>
 const internal::triangular_solve_retval<Side,TriangularView<Derived,Mode>,Other>
-TriangularView<Derived,Mode>::solve(const MatrixBase<Other>& other) const
+TriangularViewImpl<Derived,Mode,Dense>::solve(const MatrixBase<Other>& other) const
 {
-  return internal::triangular_solve_retval<Side,TriangularView,Other>(*this, other.derived());
+  return internal::triangular_solve_retval<Side,TriangularViewType,Other>(derived(), other.derived());
 }
 
 namespace internal {
@@ -232,7 +203,6 @@ template<int Side, typename TriangularType, typename Rhs> struct triangular_solv
 {
   typedef typename remove_all<typename Rhs::Nested>::type RhsNestedCleaned;
   typedef ReturnByValue<triangular_solve_retval> Base;
-  typedef typename Base::Index Index;
 
   triangular_solve_retval(const TriangularType& tri, const Rhs& rhs)
     : m_triangularMatrix(tri), m_rhs(rhs)
diff --git a/nuparu/include/Eigen/src/Core/SolverBase.h b/nuparu/include/Eigen/src/Core/SolverBase.h
new file mode 100644
index 00000000..8a4adc22
--- /dev/null
+++ b/nuparu/include/Eigen/src/Core/SolverBase.h
@@ -0,0 +1,130 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SOLVERBASE_H
+#define EIGEN_SOLVERBASE_H
+
+namespace Eigen {
+
+namespace internal {
+
+
+
+} // end namespace internal
+
+/** \class SolverBase
+  * \brief A base class for matrix decomposition and solvers
+  *
+  * \tparam Derived the actual type of the decomposition/solver.
+  *
+  * Any matrix decomposition inheriting this base class provides the following API:
+  *
+  * \code
+  * MatrixType A, b, x;
+  * DecompositionType dec(A);
+  * x = dec.solve(b);             // solve A   * x = b
+  * x = dec.transpose().solve(b); // solve A^T * x = b
+  * x = dec.adjoint().solve(b);   // solve A'  * x = b
+  * \endcode
+  *
+  * \warning Currently, any other usage of transpose() and adjoint() is not supported and will produce compilation errors.
+  *
+  * \sa class PartialPivLU, class FullPivLU
+  */
+template<typename Derived>
+class SolverBase : public EigenBase<Derived>
+{
+  public:
+
+    typedef EigenBase<Derived> Base;
+    typedef typename internal::traits<Derived>::Scalar Scalar;
+    typedef Scalar CoeffReturnType;
+
+    enum {
+      RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
+      ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
+      SizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::RowsAtCompileTime,
+                                                          internal::traits<Derived>::ColsAtCompileTime>::ret),
+      MaxRowsAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime,
+      MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime,
+      MaxSizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::MaxRowsAtCompileTime,
+                                                             internal::traits<Derived>::MaxColsAtCompileTime>::ret),
+      IsVectorAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime == 1
+                           || internal::traits<Derived>::MaxColsAtCompileTime == 1
+    };
+
+    /** Default constructor */
+    SolverBase()
+    {}
+
+    ~SolverBase()
+    {}
+
+    using Base::derived;
+
+    /** \returns an expression of the solution x of \f$ A x = b \f$ using the current decomposition of A.
+      */
+    template<typename Rhs>
+    inline const Solve<Derived, Rhs>
+    solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(derived().rows()==b.rows() && "solve(): invalid number of rows of the right hand side matrix b");
+      return Solve<Derived, Rhs>(derived(), b.derived());
+    }
+
+    /** \internal the return type of transpose() */
+    typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
+    /** \returns an expression of the transpose of the factored matrix.
+      *
+      * A typical usage is to solve for the transposed problem A^T x = b:
+      * \code x = dec.transpose().solve(b); \endcode
+      *
+      * \sa adjoint(), solve()
+      */
+    inline ConstTransposeReturnType transpose() const
+    {
+      return ConstTransposeReturnType(derived());
+    }
+
+    /** \internal the return type of adjoint() */
+    typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
+                        CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, ConstTransposeReturnType>,
+                        ConstTransposeReturnType
+                     >::type AdjointReturnType;
+    /** \returns an expression of the adjoint of the factored matrix
+      *
+      * A typical usage is to solve for the adjoint problem A' x = b:
+      * \code x = dec.adjoint().solve(b); \endcode
+      *
+      * For real scalar types, this function is equivalent to transpose().
+      *
+      * \sa transpose(), solve()
+      */
+    inline AdjointReturnType adjoint() const
+    {
+      return AdjointReturnType(derived().transpose());
+    }
+
+  protected:
+};
+
+namespace internal {
+
+template<typename Derived>
+struct generic_xpr_base<Derived, MatrixXpr, SolverStorage>
+{
+  typedef SolverBase<Derived> type;
+
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SOLVERBASE_H
diff --git a/nuparu/include/Eigen/src/Core/SpecialFunctions.h b/nuparu/include/Eigen/src/Core/SpecialFunctions.h
new file mode 100644
index 00000000..d43cf23a
--- /dev/null
+++ b/nuparu/include/Eigen/src/Core/SpecialFunctions.h
@@ -0,0 +1,160 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Eugene Brevdo <ebrevdo@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIAL_FUNCTIONS_H
+#define EIGEN_SPECIAL_FUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+/****************************************************************************
+ * Implementation of lgamma                                                 *
+ ****************************************************************************/
+
+template<typename Scalar>
+struct lgamma_impl
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(const Scalar&)
+  {
+    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return Scalar(0);
+  }
+};
+
+template<typename Scalar>
+struct lgamma_retval
+{
+  typedef Scalar type;
+};
+
+#ifdef EIGEN_HAS_C99_MATH
+template<>
+struct lgamma_impl<float>
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE float run(const float& x) { return ::lgammaf(x); } // float in, float out: same as lgamma_retval<float>::type and the erf/erfc float specializations
+};
+
+template<>
+struct lgamma_impl<double>
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE double run(const double& x) { return ::lgamma(x); }
+};
+#endif
+
+/****************************************************************************
+ * Implementation of erf                                                    *
+ ****************************************************************************/
+
+template<typename Scalar>
+struct erf_impl
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(const Scalar&)
+  {
+    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return Scalar(0);
+  }
+};
+
+template<typename Scalar>
+struct erf_retval
+{
+  typedef Scalar type;
+};
+
+#ifdef EIGEN_HAS_C99_MATH
+template<>
+struct erf_impl<float>
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE float run(const float& x) { return ::erff(x); }
+};
+
+template<>
+struct erf_impl<double>
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE double run(const double& x) { return ::erf(x); }
+};
+#endif  // EIGEN_HAS_C99_MATH
+
+/***************************************************************************
+* Implementation of erfc                                                   *
+****************************************************************************/
+
+template<typename Scalar>
+struct erfc_impl
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(const Scalar&)
+  {
+    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return Scalar(0);
+  }
+};
+
+template<typename Scalar>
+struct erfc_retval
+{
+  typedef Scalar type;
+};
+
+#ifdef EIGEN_HAS_C99_MATH
+template<>
+struct erfc_impl<float>
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE float run(const float x) { return ::erfcf(x); }
+};
+
+template<>
+struct erfc_impl<double>
+{
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE double run(const double x) { return ::erfc(x); }
+};
+#endif  // EIGEN_HAS_C99_MATH
+
+}  // end namespace internal
+
+
+namespace numext {
+
+template<typename Scalar>
+EIGEN_DEVICE_FUNC
+inline EIGEN_MATHFUNC_RETVAL(lgamma, Scalar) lgamma(const Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(lgamma, Scalar)::run(x);
+}
+
+template<typename Scalar>
+EIGEN_DEVICE_FUNC
+inline EIGEN_MATHFUNC_RETVAL(erf, Scalar) erf(const Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(erf, Scalar)::run(x);
+}
+
+template<typename Scalar>
+EIGEN_DEVICE_FUNC
+inline EIGEN_MATHFUNC_RETVAL(erfc, Scalar) erfc(const Scalar& x)
+{
+  return EIGEN_MATHFUNC_IMPL(erfc, Scalar)::run(x);
+}
+
+}  // end namespace numext
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_SPECIAL_FUNCTIONS_H
diff --git a/nuparu/include/Eigen/src/Core/StableNorm.h b/nuparu/include/Eigen/src/Core/StableNorm.h
index c83e955e..7fe39808 100644
--- a/nuparu/include/Eigen/src/Core/StableNorm.h
+++ b/nuparu/include/Eigen/src/Core/StableNorm.h
@@ -17,16 +17,37 @@ namespace internal {
 template<typename ExpressionType, typename Scalar>
 inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& scale, Scalar& invScale)
 {
-  Scalar max = bl.cwiseAbs().maxCoeff();
-  if (max>scale)
+  Scalar maxCoeff = bl.cwiseAbs().maxCoeff();
+  
+  if(maxCoeff>scale)
   {
-    ssq = ssq * numext::abs2(scale/max);
-    scale = max;
-    invScale = Scalar(1)/scale;
+    ssq = ssq * numext::abs2(scale/maxCoeff);
+    Scalar tmp = Scalar(1)/maxCoeff;
+    if(tmp > NumTraits<Scalar>::highest())
+    {
+      invScale = NumTraits<Scalar>::highest();
+      scale = Scalar(1)/invScale;
+    }
+    else if(maxCoeff>NumTraits<Scalar>::highest()) // we got a INF
+    {
+      invScale = Scalar(1);
+      scale = maxCoeff;
+    }
+    else
+    {
+      scale = maxCoeff;
+      invScale = tmp;
+    }
+  }
+  else if(maxCoeff!=maxCoeff) // we got a NaN
+  {
+    scale = maxCoeff;
   }
-  // TODO if the max is much much smaller than the current scale,
+  
+  // TODO if the maxCoeff is much much smaller than the current scale,
   // then we can neglect this sub vector
-  ssq += (bl*invScale).squaredNorm();
+  if(scale>Scalar(0)) // if scale==0, then bl is 0 
+    ssq += (bl*invScale).squaredNorm();
 }
 
 template<typename Derived>
@@ -34,15 +55,12 @@ inline typename NumTraits<typename traits<Derived>::Scalar>::Real
 blueNorm_impl(const EigenBase<Derived>& _vec)
 {
   typedef typename Derived::RealScalar RealScalar;  
-  typedef typename Derived::Index Index;
   using std::pow;
-  using std::min;
-  using std::max;
   using std::sqrt;
   using std::abs;
   const Derived& vec(_vec.derived());
   static bool initialized = false;
-  static RealScalar b1, b2, s1m, s2m, overfl, rbig, relerr;
+  static RealScalar b1, b2, s1m, s2m, rbig, relerr;
   if(!initialized)
   {
     int ibeta, it, iemin, iemax, iexp;
@@ -71,7 +89,6 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
     iexp  = - ((iemax+it)/2);
     s2m   = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp)));    // scaling factor for upper range
 
-    overfl  = rbig*s2m;                                             // overflow boundary for abig
     eps     = RealScalar(pow(double(ibeta), 1-it));
     relerr  = sqrt(eps);                                            // tolerance for neglecting asml
     initialized = true;
@@ -88,13 +105,13 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
     else if(ax < b1) asml += numext::abs2(ax*s1m);
     else             amed += numext::abs2(ax);
   }
+  if(amed!=amed)
+    return amed;  // we got a NaN
   if(abig > RealScalar(0))
   {
     abig = sqrt(abig);
-    if(abig > overfl)
-    {
-      return rbig;
-    }
+    if(abig > rbig) // overflow, or *this contains INF values
+      return abig;  // return INF
     if(amed > RealScalar(0))
     {
       abig = abig/s2m;
@@ -115,8 +132,8 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
   }
   else
     return sqrt(amed);
-  asml = (min)(abig, amed);
-  abig = (max)(abig, amed);
+  asml = numext::mini(abig, amed);
+  abig = numext::maxi(abig, amed);
   if(asml <= abig*relerr)
     return abig;
   else
@@ -139,21 +156,33 @@ template<typename Derived>
 inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
 MatrixBase<Derived>::stableNorm() const
 {
-  using std::min;
   using std::sqrt;
+  using std::abs;
   const Index blockSize = 4096;
   RealScalar scale(0);
   RealScalar invScale(1);
   RealScalar ssq(0); // sum of square
+  
+  typedef typename internal::nested_eval<Derived,2>::type DerivedCopy;
+  typedef typename internal::remove_all<DerivedCopy>::type DerivedCopyClean;
+  DerivedCopy copy(derived());
+  
   enum {
-    Alignment = (int(Flags)&DirectAccessBit) || (int(Flags)&AlignedBit) ? 1 : 0
+    CanAlign = (int(Flags)&DirectAccessBit) || (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME
   };
+  typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<DerivedCopyClean>::Alignment>,
+                                                   typename DerivedCopyClean
+                                                   ::ConstSegmentReturnType>::type SegmentWrapper;
   Index n = size();
-  Index bi = internal::first_aligned(derived());
+  
+  if(n==1)
+    return abs(this->coeff(0));
+  
+  Index bi = internal::first_default_aligned(copy);
   if (bi>0)
-    internal::stable_norm_kernel(this->head(bi), ssq, scale, invScale);
+    internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale);
   for (; bi<n; bi+=blockSize)
-    internal::stable_norm_kernel(this->segment(bi,(min)(blockSize, n - bi)).template forceAlignedAccessIf<Alignment>(), ssq, scale, invScale);
+    internal::stable_norm_kernel(SegmentWrapper(copy.segment(bi,numext::mini(blockSize, n - bi))), ssq, scale, invScale);
   return scale * sqrt(ssq);
 }
 
diff --git a/nuparu/include/Eigen/src/Core/Stride.h b/nuparu/include/Eigen/src/Core/Stride.h
index 1e3f5fe9..9a2f4f1e 100644
--- a/nuparu/include/Eigen/src/Core/Stride.h
+++ b/nuparu/include/Eigen/src/Core/Stride.h
@@ -44,13 +44,14 @@ template<int _OuterStrideAtCompileTime, int _InnerStrideAtCompileTime>
 class Stride
 {
   public:
-    typedef DenseIndex Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
     enum {
       InnerStrideAtCompileTime = _InnerStrideAtCompileTime,
       OuterStrideAtCompileTime = _OuterStrideAtCompileTime
     };
 
     /** Default constructor, for use when strides are fixed at compile time */
+    EIGEN_DEVICE_FUNC
     Stride()
       : m_outer(OuterStrideAtCompileTime), m_inner(InnerStrideAtCompileTime)
     {
@@ -58,6 +59,7 @@ class Stride
     }
 
     /** Constructor allowing to pass the strides at runtime */
+    EIGEN_DEVICE_FUNC
     Stride(Index outerStride, Index innerStride)
       : m_outer(outerStride), m_inner(innerStride)
     {
@@ -65,13 +67,16 @@ class Stride
     }
 
     /** Copy constructor */
+    EIGEN_DEVICE_FUNC
     Stride(const Stride& other)
       : m_outer(other.outer()), m_inner(other.inner())
     {}
 
     /** \returns the outer stride */
+    EIGEN_DEVICE_FUNC
     inline Index outer() const { return m_outer.value(); }
     /** \returns the inner stride */
+    EIGEN_DEVICE_FUNC
     inline Index inner() const { return m_inner.value(); }
 
   protected:
@@ -81,26 +86,24 @@ class Stride
 
 /** \brief Convenience specialization of Stride to specify only an inner stride
   * See class Map for some examples */
-template<int Value = Dynamic>
+template<int Value>
 class InnerStride : public Stride<0, Value>
 {
     typedef Stride<0, Value> Base;
   public:
-    typedef DenseIndex Index;
-    InnerStride() : Base() {}
-    InnerStride(Index v) : Base(0, v) {}
+    EIGEN_DEVICE_FUNC InnerStride() : Base() {}
+    EIGEN_DEVICE_FUNC InnerStride(Index v) : Base(0, v) {} // FIXME making this explicit could break valid code
 };
 
 /** \brief Convenience specialization of Stride to specify only an outer stride
   * See class Map for some examples */
-template<int Value = Dynamic>
+template<int Value>
 class OuterStride : public Stride<Value, 0>
 {
     typedef Stride<Value, 0> Base;
   public:
-    typedef DenseIndex Index;
-    OuterStride() : Base() {}
-    OuterStride(Index v) : Base(v,0) {}
+    EIGEN_DEVICE_FUNC OuterStride() : Base() {}
+    EIGEN_DEVICE_FUNC OuterStride(Index v) : Base(v,0) {} // FIXME making this explicit could break valid code
 };
 
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Core/Swap.h b/nuparu/include/Eigen/src/Core/Swap.h
index bf58bd59..d7020091 100644
--- a/nuparu/include/Eigen/src/Core/Swap.h
+++ b/nuparu/include/Eigen/src/Core/Swap.h
@@ -12,115 +12,56 @@
 
 namespace Eigen { 
 
-/** \class SwapWrapper
-  * \ingroup Core_Module
-  *
-  * \internal
-  *
-  * \brief Internal helper class for swapping two expressions
-  */
 namespace internal {
-template<typename ExpressionType>
-struct traits<SwapWrapper<ExpressionType> > : traits<ExpressionType> {};
-}
 
-template<typename ExpressionType> class SwapWrapper
-  : public internal::dense_xpr_base<SwapWrapper<ExpressionType> >::type
+// Overload default assignPacket behavior for swapping them
+template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT>
+class generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, swap_assign_op<typename DstEvaluatorTypeT::Scalar>, Specialized>
+ : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, swap_assign_op<typename DstEvaluatorTypeT::Scalar>, BuiltIn>
 {
-  public:
-
-    typedef typename internal::dense_xpr_base<SwapWrapper>::type Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(SwapWrapper)
-    typedef typename internal::packet_traits<Scalar>::type Packet;
-
-    inline SwapWrapper(ExpressionType& xpr) : m_expression(xpr) {}
-
-    inline Index rows() const { return m_expression.rows(); }
-    inline Index cols() const { return m_expression.cols(); }
-    inline Index outerStride() const { return m_expression.outerStride(); }
-    inline Index innerStride() const { return m_expression.innerStride(); }
-    
-    typedef typename internal::conditional<
-                       internal::is_lvalue<ExpressionType>::value,
-                       Scalar,
-                       const Scalar
-                     >::type ScalarWithConstIfNotLvalue;
-                     
-    inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
-    inline const Scalar* data() const { return m_expression.data(); }
-
-    inline Scalar& coeffRef(Index rowId, Index colId)
-    {
-      return m_expression.const_cast_derived().coeffRef(rowId, colId);
-    }
-
-    inline Scalar& coeffRef(Index index)
-    {
-      return m_expression.const_cast_derived().coeffRef(index);
-    }
-
-    inline Scalar& coeffRef(Index rowId, Index colId) const
-    {
-      return m_expression.coeffRef(rowId, colId);
-    }
-
-    inline Scalar& coeffRef(Index index) const
-    {
-      return m_expression.coeffRef(index);
-    }
-
-    template<typename OtherDerived>
-    void copyCoeff(Index rowId, Index colId, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(rowId >= 0 && rowId < rows()
-                         && colId >= 0 && colId < cols());
-      Scalar tmp = m_expression.coeff(rowId, colId);
-      m_expression.coeffRef(rowId, colId) = _other.coeff(rowId, colId);
-      _other.coeffRef(rowId, colId) = tmp;
-    }
-
-    template<typename OtherDerived>
-    void copyCoeff(Index index, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(index >= 0 && index < m_expression.size());
-      Scalar tmp = m_expression.coeff(index);
-      m_expression.coeffRef(index) = _other.coeff(index);
-      _other.coeffRef(index) = tmp;
-    }
-
-    template<typename OtherDerived, int StoreMode, int LoadMode>
-    void copyPacket(Index rowId, Index colId, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(rowId >= 0 && rowId < rows()
-                        && colId >= 0 && colId < cols());
-      Packet tmp = m_expression.template packet<StoreMode>(rowId, colId);
-      m_expression.template writePacket<StoreMode>(rowId, colId,
-        _other.template packet<LoadMode>(rowId, colId)
-      );
-      _other.template writePacket<LoadMode>(rowId, colId, tmp);
-    }
-
-    template<typename OtherDerived, int StoreMode, int LoadMode>
-    void copyPacket(Index index, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(index >= 0 && index < m_expression.size());
-      Packet tmp = m_expression.template packet<StoreMode>(index);
-      m_expression.template writePacket<StoreMode>(index,
-        _other.template packet<LoadMode>(index)
-      );
-      _other.template writePacket<LoadMode>(index, tmp);
-    }
-
-    ExpressionType& expression() const { return m_expression; }
-
-  protected:
-    ExpressionType& m_expression;
+protected:
+  typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, swap_assign_op<typename DstEvaluatorTypeT::Scalar>, BuiltIn> Base;
+  using Base::m_dst;
+  using Base::m_src;
+  using Base::m_functor;
+  
+public:
+  typedef typename Base::Scalar Scalar;
+  typedef typename Base::DstXprType DstXprType;
+  typedef swap_assign_op<Scalar> Functor;
+  
+  EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr)
+    : Base(dst, src, func, dstExpr)
+  {}
+  
+  template<int StoreMode, int LoadMode, typename PacketType>
+  void assignPacket(Index row, Index col) // swaps the packets of m_dst and m_src located at (row, col)
+  {
+    PacketType tmp = m_src.template packet<LoadMode,PacketType>(row,col); // keep the source packet before it is overwritten
+    const_cast<SrcEvaluatorTypeT&>(m_src).template writePacket<LoadMode>(row,col, m_dst.template packet<StoreMode,PacketType>(row,col)); // source <- destination packet; the const_cast is needed to write through m_src
+    m_dst.template writePacket<StoreMode>(row,col,tmp); // destination <- saved source packet
+  }
+  
+  template<int StoreMode, int LoadMode, typename PacketType>
+  void assignPacket(Index index) // linear-index overload: swaps the packets of m_dst and m_src located at index
+  {
+    PacketType tmp = m_src.template packet<LoadMode,PacketType>(index); // keep the source packet before it is overwritten
+    const_cast<SrcEvaluatorTypeT&>(m_src).template writePacket<LoadMode>(index, m_dst.template packet<StoreMode,PacketType>(index)); // source <- destination packet; the const_cast is needed to write through m_src
+    m_dst.template writePacket<StoreMode>(index,tmp); // destination <- saved source packet
+  }
+  
+  // TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I mean no CRTP (Gael)
+  template<int StoreMode, int LoadMode, typename PacketType>
+  void assignPacketByOuterInner(Index outer, Index inner) // swaps packets at the (row, col) corresponding to (outer, inner)
+  {
+    Index row = Base::rowIndexByOuterInner(outer, inner); // row/col translation is delegated to the base kernel
+    Index col = Base::colIndexByOuterInner(outer, inner);
+    assignPacket<StoreMode,LoadMode,PacketType>(row, col); // forwards to the swapping (row, col) overload of this class
+  }
 };
 
+} // namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_SWAP_H
diff --git a/nuparu/include/Eigen/src/Core/Transpose.h b/nuparu/include/Eigen/src/Core/Transpose.h
index f21b3aa6..5b66eb5e 100644
--- a/nuparu/include/Eigen/src/Core/Transpose.h
+++ b/nuparu/include/Eigen/src/Core/Transpose.h
@@ -2,7 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -29,23 +29,19 @@ namespace Eigen {
 
 namespace internal {
 template<typename MatrixType>
-struct traits<Transpose<MatrixType> > : traits<MatrixType>
+struct traits<Transpose<MatrixType> > : public traits<MatrixType>
 {
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
   typedef typename remove_reference<MatrixTypeNested>::type MatrixTypeNestedPlain;
-  typedef typename traits<MatrixType>::StorageKind StorageKind;
-  typedef typename traits<MatrixType>::XprKind XprKind;
   enum {
     RowsAtCompileTime = MatrixType::ColsAtCompileTime,
     ColsAtCompileTime = MatrixType::RowsAtCompileTime,
     MaxRowsAtCompileTime = MatrixType::MaxColsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     FlagsLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
-    Flags0 = MatrixTypeNestedPlain::Flags & ~(LvalueBit | NestByRefBit),
+    Flags0 = traits<MatrixTypeNestedPlain>::Flags & ~(LvalueBit | NestByRefBit),
     Flags1 = Flags0 | FlagsLvalueBit,
     Flags = Flags1 ^ RowMajorBit,
-    CoeffReadCost = MatrixTypeNestedPlain::CoeffReadCost,
     InnerStrideAtCompileTime = inner_stride_at_compile_time<MatrixType>::ret,
     OuterStrideAtCompileTime = outer_stride_at_compile_time<MatrixType>::ret
   };
@@ -61,19 +57,23 @@ template<typename MatrixType> class Transpose
 
     typedef typename TransposeImpl<MatrixType,typename internal::traits<MatrixType>::StorageKind>::Base Base;
     EIGEN_GENERIC_PUBLIC_INTERFACE(Transpose)
+    typedef typename internal::remove_all<MatrixType>::type NestedExpression;
 
-    inline Transpose(MatrixType& a_matrix) : m_matrix(a_matrix) {}
+    EIGEN_DEVICE_FUNC
+    explicit inline Transpose(MatrixType& matrix) : m_matrix(matrix) {}
 
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Transpose)
 
-    inline Index rows() const { return m_matrix.cols(); }
-    inline Index cols() const { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.rows(); }
 
     /** \returns the nested expression */
+    EIGEN_DEVICE_FUNC
     const typename internal::remove_all<typename MatrixType::Nested>::type&
     nestedExpression() const { return m_matrix; }
 
     /** \returns the nested expression */
+    EIGEN_DEVICE_FUNC
     typename internal::remove_all<typename MatrixType::Nested>::type&
     nestedExpression() { return m_matrix.const_cast_derived(); }
 
@@ -97,17 +97,27 @@ struct TransposeImpl_base<MatrixType, false>
 
 } // end namespace internal
 
+// Generic API dispatcher: primary TransposeImpl template, selected by StorageKind; it only exposes generic_xpr_base<Transpose<XprType> >::type as Base
+template<typename XprType, typename StorageKind>
+class TransposeImpl
+  : public internal::generic_xpr_base<Transpose<XprType> >::type
+{
+public:
+  typedef typename internal::generic_xpr_base<Transpose<XprType> >::type Base;
+};
+
 template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
   : public internal::TransposeImpl_base<MatrixType>::type
 {
   public:
 
     typedef typename internal::TransposeImpl_base<MatrixType>::type Base;
+    using Base::coeffRef;
     EIGEN_DENSE_PUBLIC_INTERFACE(Transpose<MatrixType>)
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TransposeImpl)
 
-    inline Index innerStride() const { return derived().nestedExpression().innerStride(); }
-    inline Index outerStride() const { return derived().nestedExpression().outerStride(); }
+    EIGEN_DEVICE_FUNC inline Index innerStride() const { return derived().nestedExpression().innerStride(); }
+    EIGEN_DEVICE_FUNC inline Index outerStride() const { return derived().nestedExpression().outerStride(); }
 
     typedef typename internal::conditional<
                        internal::is_lvalue<MatrixType>::value,
@@ -115,64 +125,21 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
                        const Scalar
                      >::type ScalarWithConstIfNotLvalue;
 
-    inline ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); }
-    inline const Scalar* data() const { return derived().nestedExpression().data(); }
-
-    inline ScalarWithConstIfNotLvalue& coeffRef(Index rowId, Index colId)
-    {
-      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
-      return derived().nestedExpression().const_cast_derived().coeffRef(colId, rowId);
-    }
-
-    inline ScalarWithConstIfNotLvalue& coeffRef(Index index)
-    {
-      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
-      return derived().nestedExpression().const_cast_derived().coeffRef(index);
-    }
+    EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); }
+    EIGEN_DEVICE_FUNC inline const Scalar* data() const { return derived().nestedExpression().data(); }
 
+    // FIXME: shall we keep the const version of coeffRef?
+    EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index rowId, Index colId) const
     {
       return derived().nestedExpression().coeffRef(colId, rowId);
     }
 
+    EIGEN_DEVICE_FUNC
     inline const Scalar& coeffRef(Index index) const
     {
       return derived().nestedExpression().coeffRef(index);
     }
-
-    inline CoeffReturnType coeff(Index rowId, Index colId) const
-    {
-      return derived().nestedExpression().coeff(colId, rowId);
-    }
-
-    inline CoeffReturnType coeff(Index index) const
-    {
-      return derived().nestedExpression().coeff(index);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index rowId, Index colId) const
-    {
-      return derived().nestedExpression().template packet<LoadMode>(colId, rowId);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index rowId, Index colId, const PacketScalar& x)
-    {
-      derived().nestedExpression().const_cast_derived().template writePacket<LoadMode>(colId, rowId, x);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index index) const
-    {
-      return derived().nestedExpression().template packet<LoadMode>(index);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& x)
-    {
-      derived().nestedExpression().const_cast_derived().template writePacket<LoadMode>(index, x);
-    }
 };
 
 /** \returns an expression of the transpose of *this.
@@ -198,7 +165,7 @@ template<typename Derived>
 inline Transpose<Derived>
 DenseBase<Derived>::transpose()
 {
-  return derived();
+  return TransposeReturnType(derived());
 }
 
 /** This is the const version of transpose().
@@ -236,8 +203,7 @@ template<typename Derived>
 inline const typename MatrixBase<Derived>::AdjointReturnType
 MatrixBase<Derived>::adjoint() const
 {
-  return this->transpose(); // in the complex case, the .conjugate() is be implicit here
-                            // due to implicit conversion to return type
+  return AdjointReturnType(this->transpose());
 }
 
 /***************************************************************************
@@ -247,18 +213,38 @@ MatrixBase<Derived>::adjoint() const
 namespace internal {
 
 template<typename MatrixType,
-  bool IsSquare = (MatrixType::RowsAtCompileTime == MatrixType::ColsAtCompileTime) && MatrixType::RowsAtCompileTime!=Dynamic>
+  bool IsSquare = (MatrixType::RowsAtCompileTime == MatrixType::ColsAtCompileTime) && MatrixType::RowsAtCompileTime!=Dynamic,
+  bool MatchPacketSize =
+        (int(MatrixType::RowsAtCompileTime) == int(internal::packet_traits<typename MatrixType::Scalar>::size))
+    &&  (internal::evaluator<MatrixType>::Flags&PacketAccessBit) >
 struct inplace_transpose_selector;
 
 template<typename MatrixType>
-struct inplace_transpose_selector<MatrixType,true> { // square matrix
+struct inplace_transpose_selector<MatrixType,true,false> { // square matrix
   static void run(MatrixType& m) {
     m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose());
   }
 };
 
+// TODO: vectorized path is currently limited to LargestPacketSize x LargestPacketSize cases only.
+template<typename MatrixType>
+struct inplace_transpose_selector<MatrixType,true,true> { // PacketSize x PacketSize: square, row count equals the packet size, packet access available
+  static void run(MatrixType& m) {
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename internal::packet_traits<typename MatrixType::Scalar>::type Packet;
+    const Index PacketSize = internal::packet_traits<Scalar>::size;
+    const Index Alignment = internal::evaluator<MatrixType>::Alignment;
+    PacketBlock<Packet> A;
+    for (Index i=0; i<PacketSize; ++i) // load one packet per outer index, starting at inner index 0
+      A.packet[i] = m.template packetByOuterInner<Alignment>(i,0);
+    internal::ptranspose(A); // A is modified in place: its packets are read back by the store loop below
+    for (Index i=0; i<PacketSize; ++i) // write packet i back at the start of outer index i
+      m.template writePacket<Alignment>(m.rowIndexByOuterInner(i,0), m.colIndexByOuterInner(i,0), A.packet[i]);
+  }
+};
+
+
+template<typename MatrixType,bool MatchPacketSize>
+struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non square matrix
   static void run(MatrixType& m) {
     if (m.rows()==m.cols())
       m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose());
@@ -284,7 +270,8 @@ struct inplace_transpose_selector<MatrixType,false> { // non square matrix
   * Notice however that this method is only useful if you want to replace a matrix by its own transpose.
   * If you just need the transpose of a matrix, use transpose().
   *
-  * \note if the matrix is not square, then \c *this must be a resizable matrix.
+  * \note if the matrix is not square, then \c *this must be a resizable matrix. 
+  * This excludes (non-square) fixed-size matrices, block-expressions and maps.
   *
   * \sa transpose(), adjoint(), adjointInPlace() */
 template<typename Derived>
@@ -315,6 +302,7 @@ inline void DenseBase<Derived>::transposeInPlace()
   * If you just need the adjoint of a matrix, use adjoint().
   *
   * \note if the matrix is not square, then \c *this must be a resizable matrix.
+  * This excludes (non-square) fixed-size matrices, block-expressions and maps.
   *
   * \sa transpose(), adjoint(), transposeInPlace() */
 template<typename Derived>
@@ -329,14 +317,6 @@ inline void MatrixBase<Derived>::adjointInPlace()
 
 namespace internal {
 
-template<typename BinOp,typename NestedXpr,typename Rhs>
-struct blas_traits<SelfCwiseBinaryOp<BinOp,NestedXpr,Rhs> >
- : blas_traits<NestedXpr>
-{
-  typedef SelfCwiseBinaryOp<BinOp,NestedXpr,Rhs> XprType;
-  static inline const XprType extract(const XprType& x) { return x; }
-};
-
 template<bool DestIsTransposed, typename OtherDerived>
 struct check_transpose_aliasing_compile_time_selector
 {
@@ -402,15 +382,15 @@ struct checkTransposeAliasing_impl<Derived, OtherDerived, false>
     }
 };
 
-} // end namespace internal
-
-template<typename Derived>
-template<typename OtherDerived>
-void DenseBase<Derived>::checkTransposeAliasing(const OtherDerived& other) const
+template<typename Dst, typename Src>
+void check_for_aliasing(const Dst &dst, const Src &src)
 {
-    internal::checkTransposeAliasing_impl<Derived, OtherDerived>::run(derived(), other);
+  internal::checkTransposeAliasing_impl<Dst, Src>::run(dst, src);
 }
-#endif
+
+} // end namespace internal
+
+#endif // EIGEN_NO_DEBUG
 
 } // end namespace Eigen
 
diff --git a/nuparu/include/Eigen/src/Core/Transpositions.h b/nuparu/include/Eigen/src/Core/Transpositions.h
index e4ba0756..3b1c1815 100644
--- a/nuparu/include/Eigen/src/Core/Transpositions.h
+++ b/nuparu/include/Eigen/src/Core/Transpositions.h
@@ -41,10 +41,6 @@ namespace Eigen {
   * \sa class PermutationMatrix
   */
 
-namespace internal {
-template<typename TranspositionType, typename MatrixType, int Side, bool Transposed=false> struct transposition_matrix_product_retval;
-}
-
 template<typename Derived>
 class TranspositionsBase
 {
@@ -53,7 +49,8 @@ class TranspositionsBase
   public:
 
     typedef typename Traits::IndicesType IndicesType;
-    typedef typename IndicesType::Scalar Index;
+    typedef typename IndicesType::Scalar StorageIndex;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
 
     Derived& derived() { return *static_cast<Derived*>(this); }
     const Derived& derived() const { return *static_cast<const Derived*>(this); }
@@ -65,7 +62,7 @@ class TranspositionsBase
       indices() = other.indices();
       return derived();
     }
-
+    
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     /** This is a special case of the templated operator=. Its purpose is to
       * prevent a default operator= from hiding the templated operator=.
@@ -78,20 +75,24 @@ class TranspositionsBase
     #endif
 
     /** \returns the number of transpositions */
-    inline Index size() const { return indices().size(); }
+    Index size() const { return indices().size(); }
+    /** \returns the number of rows of the equivalent permutation matrix */
+    Index rows() const { return indices().size(); }
+    /** \returns the number of columns of the equivalent permutation matrix */
+    Index cols() const { return indices().size(); }
 
     /** Direct access to the underlying index vector */
-    inline const Index& coeff(Index i) const { return indices().coeff(i); }
+    inline const StorageIndex& coeff(Index i) const { return indices().coeff(i); }
     /** Direct access to the underlying index vector */
-    inline Index& coeffRef(Index i) { return indices().coeffRef(i); }
+    inline StorageIndex& coeffRef(Index i) { return indices().coeffRef(i); }
     /** Direct access to the underlying index vector */
-    inline const Index& operator()(Index i) const { return indices()(i); }
+    inline const StorageIndex& operator()(Index i) const { return indices()(i); }
     /** Direct access to the underlying index vector */
-    inline Index& operator()(Index i) { return indices()(i); }
+    inline StorageIndex& operator()(Index i) { return indices()(i); }
     /** Direct access to the underlying index vector */
-    inline const Index& operator[](Index i) const { return indices()(i); }
+    inline const StorageIndex& operator[](Index i) const { return indices()(i); }
     /** Direct access to the underlying index vector */
-    inline Index& operator[](Index i) { return indices()(i); }
+    inline StorageIndex& operator[](Index i) { return indices()(i); }
 
     /** const version of indices(). */
     const IndicesType& indices() const { return derived().indices(); }
@@ -99,7 +100,7 @@ class TranspositionsBase
     IndicesType& indices() { return derived().indices(); }
 
     /** Resizes to given size. */
-    inline void resize(int newSize)
+    inline void resize(Index newSize)
     {
       indices().resize(newSize);
     }
@@ -107,7 +108,7 @@ class TranspositionsBase
     /** Sets \c *this to represents an identity transformation */
     void setIdentity()
     {
-      for(int i = 0; i < indices().size(); ++i)
+      for(StorageIndex i = 0; i < indices().size(); ++i)
         coeffRef(i) = i;
     }
 
@@ -144,23 +145,24 @@ class TranspositionsBase
 };
 
 namespace internal {
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
-struct traits<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex>
+struct traits<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex> >
+ : traits<PermutationMatrix<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex> >
 {
-  typedef IndexType Index;
-  typedef Matrix<Index, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
+  typedef Matrix<_StorageIndex, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
+  typedef TranspositionsStorage StorageKind;
 };
 }
 
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
-class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex>
+class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex> >
 {
     typedef internal::traits<Transpositions> Traits;
   public:
 
     typedef TranspositionsBase<Transpositions> Base;
     typedef typename Traits::IndicesType IndicesType;
-    typedef typename IndicesType::Scalar Index;
+    typedef typename IndicesType::Scalar StorageIndex;
 
     inline Transpositions() {}
 
@@ -177,7 +179,7 @@ class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTim
 
     /** Generic constructor from expression of the transposition indices. */
     template<typename Other>
-    explicit inline Transpositions(const MatrixBase<Other>& a_indices) : m_indices(a_indices)
+    explicit inline Transpositions(const MatrixBase<Other>& indices) : m_indices(indices)
     {}
 
     /** Copies the \a other transpositions into \c *this */
@@ -215,30 +217,32 @@ class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTim
 
 
 namespace internal {
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int _PacketAccess>
-struct traits<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType>,_PacketAccess> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex, int _PacketAccess>
+struct traits<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex>,_PacketAccess> >
+ : traits<PermutationMatrix<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex> >
 {
-  typedef IndexType Index;
-  typedef Map<const Matrix<Index,SizeAtCompileTime,1,0,MaxSizeAtCompileTime,1>, _PacketAccess> IndicesType;
+  typedef Map<const Matrix<_StorageIndex,SizeAtCompileTime,1,0,MaxSizeAtCompileTime,1>, _PacketAccess> IndicesType;
+  typedef _StorageIndex StorageIndex;
+  typedef TranspositionsStorage StorageKind;
 };
 }
 
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int PacketAccess>
-class Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType>,PacketAccess>
- : public TranspositionsBase<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType>,PacketAccess> >
+template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename _StorageIndex, int PacketAccess>
+class Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex>,PacketAccess>
+ : public TranspositionsBase<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,_StorageIndex>,PacketAccess> >
 {
     typedef internal::traits<Map> Traits;
   public:
 
     typedef TranspositionsBase<Map> Base;
     typedef typename Traits::IndicesType IndicesType;
-    typedef typename IndicesType::Scalar Index;
+    typedef typename IndicesType::Scalar StorageIndex;
 
-    inline Map(const Index* indicesPtr)
+    explicit inline Map(const StorageIndex* indicesPtr)
       : m_indices(indicesPtr)
     {}
 
-    inline Map(const Index* indicesPtr, Index size)
+    inline Map(const StorageIndex* indicesPtr, Index size)
       : m_indices(indicesPtr,size)
     {}
 
@@ -274,9 +278,9 @@ class Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType>,Packe
 namespace internal {
 template<typename _IndicesType>
 struct traits<TranspositionsWrapper<_IndicesType> >
+ : traits<PermutationWrapper<_IndicesType> >
 {
-  typedef typename _IndicesType::Scalar Index;
-  typedef _IndicesType IndicesType;
+  typedef TranspositionsStorage StorageKind;
 };
 }
 
@@ -289,10 +293,10 @@ class TranspositionsWrapper
 
     typedef TranspositionsBase<TranspositionsWrapper> Base;
     typedef typename Traits::IndicesType IndicesType;
-    typedef typename IndicesType::Scalar Index;
+    typedef typename IndicesType::Scalar StorageIndex;
 
-    inline TranspositionsWrapper(IndicesType& a_indices)
-      : m_indices(a_indices)
+    explicit inline TranspositionsWrapper(IndicesType& indices)
+      : m_indices(indices)
     {}
 
     /** Copies the \a other transpositions into \c *this */
@@ -324,80 +328,43 @@ class TranspositionsWrapper
     const typename IndicesType::Nested m_indices;
 };
 
+
+
 /** \returns the \a matrix with the \a transpositions applied to the columns.
   */
-template<typename Derived, typename TranspositionsDerived>
-inline const internal::transposition_matrix_product_retval<TranspositionsDerived, Derived, OnTheRight>
-operator*(const MatrixBase<Derived>& matrix,
-          const TranspositionsBase<TranspositionsDerived> &transpositions)
+template<typename MatrixDerived, typename TranspositionsDerived>
+EIGEN_DEVICE_FUNC
+const Product<MatrixDerived, TranspositionsDerived, AliasFreeProduct>
+operator*(const MatrixBase<MatrixDerived> &matrix,
+          const TranspositionsBase<TranspositionsDerived>& transpositions)
 {
-  return internal::transposition_matrix_product_retval
-           <TranspositionsDerived, Derived, OnTheRight>
-           (transpositions.derived(), matrix.derived());
+  return Product<MatrixDerived, TranspositionsDerived, AliasFreeProduct>
+            (matrix.derived(), transpositions.derived());
 }
 
 /** \returns the \a matrix with the \a transpositions applied to the rows.
   */
-template<typename Derived, typename TranspositionDerived>
-inline const internal::transposition_matrix_product_retval
-               <TranspositionDerived, Derived, OnTheLeft>
-operator*(const TranspositionsBase<TranspositionDerived> &transpositions,
-          const MatrixBase<Derived>& matrix)
+template<typename TranspositionsDerived, typename MatrixDerived>
+EIGEN_DEVICE_FUNC
+const Product<TranspositionsDerived, MatrixDerived, AliasFreeProduct>
+operator*(const TranspositionsBase<TranspositionsDerived> &transpositions,
+          const MatrixBase<MatrixDerived>& matrix)
 {
-  return internal::transposition_matrix_product_retval
-           <TranspositionDerived, Derived, OnTheLeft>
-           (transpositions.derived(), matrix.derived());
+  return Product<TranspositionsDerived, MatrixDerived, AliasFreeProduct>
+            (transpositions.derived(), matrix.derived());
 }
 
-namespace internal {
-
-template<typename TranspositionType, typename MatrixType, int Side, bool Transposed>
-struct traits<transposition_matrix_product_retval<TranspositionType, MatrixType, Side, Transposed> >
-{
-  typedef typename MatrixType::PlainObject ReturnType;
-};
-
-template<typename TranspositionType, typename MatrixType, int Side, bool Transposed>
-struct transposition_matrix_product_retval
- : public ReturnByValue<transposition_matrix_product_retval<TranspositionType, MatrixType, Side, Transposed> >
-{
-    typedef typename remove_all<typename MatrixType::Nested>::type MatrixTypeNestedCleaned;
-    typedef typename TranspositionType::Index Index;
-
-    transposition_matrix_product_retval(const TranspositionType& tr, const MatrixType& matrix)
-      : m_transpositions(tr), m_matrix(matrix)
-    {}
-
-    inline int rows() const { return m_matrix.rows(); }
-    inline int cols() const { return m_matrix.cols(); }
+// Template partial specialization for transposed/inverse transpositions
 
-    template<typename Dest> inline void evalTo(Dest& dst) const
-    {
-      const int size = m_transpositions.size();
-      Index j = 0;
-
-      if(!(is_same<MatrixTypeNestedCleaned,Dest>::value && extract_data(dst) == extract_data(m_matrix)))
-        dst = m_matrix;
-
-      for(int k=(Transposed?size-1:0) ; Transposed?k>=0:k<size ; Transposed?--k:++k)
-        if((j=m_transpositions.coeff(k))!=k)
-        {
-          if(Side==OnTheLeft)
-            dst.row(k).swap(dst.row(j));
-          else if(Side==OnTheRight)
-            dst.col(k).swap(dst.col(j));
-        }
-    }
+namespace internal {
 
-  protected:
-    const TranspositionType& m_transpositions;
-    typename MatrixType::Nested m_matrix;
-};
+template<typename Derived>
+struct traits<Transpose<TranspositionsBase<Derived> > > // traits of a transposed (inverse) transpositions sequence
+ : traits<Derived> // inherits every trait of Derived unchanged (the body is empty)
+{};
 
 } // end namespace internal
 
-/* Template partial specialization for transposed/inverse transpositions */
-
 template<typename TranspositionsDerived>
 class Transpose<TranspositionsBase<TranspositionsDerived> >
 {
@@ -405,27 +372,31 @@ class Transpose<TranspositionsBase<TranspositionsDerived> >
     typedef typename TranspositionType::IndicesType IndicesType;
   public:
 
-    Transpose(const TranspositionType& t) : m_transpositions(t) {}
+    explicit Transpose(const TranspositionType& t) : m_transpositions(t) {}
 
-    inline int size() const { return m_transpositions.size(); }
+    Index size() const { return m_transpositions.size(); }
+    Index rows() const { return m_transpositions.size(); }
+    Index cols() const { return m_transpositions.size(); }
 
     /** \returns the \a matrix with the inverse transpositions applied to the columns.
       */
-    template<typename Derived> friend
-    inline const internal::transposition_matrix_product_retval<TranspositionType, Derived, OnTheRight, true>
-    operator*(const MatrixBase<Derived>& matrix, const Transpose& trt)
+    template<typename OtherDerived> friend
+    const Product<OtherDerived, Transpose, AliasFreeProduct>
+    operator*(const MatrixBase<OtherDerived>& matrix, const Transpose& trt)
     {
-      return internal::transposition_matrix_product_retval<TranspositionType, Derived, OnTheRight, true>(trt.m_transpositions, matrix.derived());
+      return Product<OtherDerived, Transpose, AliasFreeProduct>(matrix.derived(), trt); // trt is already the Transpose operand; this specialization declares no base class supplying derived()
     }
 
     /** \returns the \a matrix with the inverse transpositions applied to the rows.
       */
-    template<typename Derived>
-    inline const internal::transposition_matrix_product_retval<TranspositionType, Derived, OnTheLeft, true>
-    operator*(const MatrixBase<Derived>& matrix) const
+    template<typename OtherDerived>
+    const Product<Transpose, OtherDerived, AliasFreeProduct>
+    operator*(const MatrixBase<OtherDerived>& matrix) const
     {
-      return internal::transposition_matrix_product_retval<TranspositionType, Derived, OnTheLeft, true>(m_transpositions, matrix.derived());
+      return Product<Transpose, OtherDerived, AliasFreeProduct>(*this, matrix.derived());
     }
+    
+    const TranspositionType& nestedExpression() const { return m_transpositions; }
 
   protected:
     const TranspositionType& m_transpositions;
diff --git a/nuparu/include/Eigen/src/Core/TriangularMatrix.h b/nuparu/include/Eigen/src/Core/TriangularMatrix.h
index fba07365..099a02ec 100644
--- a/nuparu/include/Eigen/src/Core/TriangularMatrix.h
+++ b/nuparu/include/Eigen/src/Core/TriangularMatrix.h
@@ -19,9 +19,7 @@ template<int Side, typename TriangularType, typename Rhs> struct triangular_solv
   
 }
 
-/** \internal
-  *
-  * \class TriangularBase
+/** \class TriangularBase
   * \ingroup Core_Module
   *
   * \brief Base class for triangular part in a matrix
@@ -32,41 +30,69 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
 
     enum {
       Mode = internal::traits<Derived>::Mode,
-      CoeffReadCost = internal::traits<Derived>::CoeffReadCost,
       RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
       ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
       MaxRowsAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime,
-      MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime
+      MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime,
+      
+      SizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::RowsAtCompileTime,
+                                                   internal::traits<Derived>::ColsAtCompileTime>::ret),
+      /**< This is equal to the number of coefficients, i.e. the number of
+          * rows times the number of columns, or to \a Dynamic if this is not
+          * known at compile-time. \sa RowsAtCompileTime, ColsAtCompileTime */
+      
+      MaxSizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::MaxRowsAtCompileTime,
+                                                   internal::traits<Derived>::MaxColsAtCompileTime>::ret)
+        
     };
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
-    typedef typename internal::traits<Derived>::DenseMatrixType DenseMatrixType;
+    typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
+    typedef typename internal::traits<Derived>::FullMatrixType DenseMatrixType;
     typedef DenseMatrixType DenseType;
+    typedef Derived const& Nested;
 
+    EIGEN_DEVICE_FUNC
     inline TriangularBase() { eigen_assert(!((Mode&UnitDiag) && (Mode&ZeroDiag))); }
 
+    EIGEN_DEVICE_FUNC
     inline Index rows() const { return derived().rows(); }
+    EIGEN_DEVICE_FUNC
     inline Index cols() const { return derived().cols(); }
+    EIGEN_DEVICE_FUNC
     inline Index outerStride() const { return derived().outerStride(); }
+    EIGEN_DEVICE_FUNC
     inline Index innerStride() const { return derived().innerStride(); }
+    
+    // dummy resize function
+    void resize(Index rows, Index cols)
+    {
+      EIGEN_UNUSED_VARIABLE(rows);
+      EIGEN_UNUSED_VARIABLE(cols);
+      eigen_assert(rows==this->rows() && cols==this->cols());
+    }
 
+    EIGEN_DEVICE_FUNC
     inline Scalar coeff(Index row, Index col) const  { return derived().coeff(row,col); }
+    EIGEN_DEVICE_FUNC
     inline Scalar& coeffRef(Index row, Index col) { return derived().coeffRef(row,col); }
 
     /** \see MatrixBase::copyCoeff(row,col)
       */
     template<typename Other>
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE void copyCoeff(Index row, Index col, Other& other)
     {
       derived().coeffRef(row, col) = other.coeff(row, col);
     }
 
+    EIGEN_DEVICE_FUNC
     inline Scalar operator()(Index row, Index col) const
     {
       check_coordinates(row, col);
       return coeff(row,col);
     }
+    EIGEN_DEVICE_FUNC
     inline Scalar& operator()(Index row, Index col)
     {
       check_coordinates(row, col);
@@ -74,15 +100,20 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
     }
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
+    EIGEN_DEVICE_FUNC
     inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
+    EIGEN_DEVICE_FUNC
     inline Derived& derived() { return *static_cast<Derived*>(this); }
     #endif // not EIGEN_PARSED_BY_DOXYGEN
 
     template<typename DenseDerived>
+    EIGEN_DEVICE_FUNC
     void evalTo(MatrixBase<DenseDerived> &other) const;
     template<typename DenseDerived>
+    EIGEN_DEVICE_FUNC
     void evalToLazy(MatrixBase<DenseDerived> &other) const;
 
+    EIGEN_DEVICE_FUNC
     DenseMatrixType toDenseMatrix() const
     {
       DenseMatrixType res(rows(), cols());
@@ -119,17 +150,17 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
 /** \class TriangularView
   * \ingroup Core_Module
   *
-  * \brief Base class for triangular part in a matrix
+  * \brief Expression of a triangular part in a matrix
   *
   * \param MatrixType the type of the object in which we are taking the triangular part
   * \param Mode the kind of triangular matrix expression to construct. Can be #Upper,
   *             #Lower, #UnitUpper, #UnitLower, #StrictlyUpper, or #StrictlyLower.
   *             This is in fact a bit field; it must have either #Upper or #Lower, 
-  *             and additionnaly it may have #UnitDiag or #ZeroDiag or neither.
+  *             and additionally it may have #UnitDiag or #ZeroDiag or neither.
   *
   * This class represents a triangular part of a matrix, not necessarily square. Strictly speaking, for rectangular
   * matrices one should speak of "trapezoid" parts. This class is the return type
-  * of MatrixBase::triangularView() and most of the time this is the only way it is used.
+  * of MatrixBase::triangularView() and SparseMatrixBase::triangularView(), and most of the time this is the only way it is used.
   *
   * \sa MatrixBase::triangularView()
   */
@@ -137,490 +168,401 @@ namespace internal {
 template<typename MatrixType, unsigned int _Mode>
 struct traits<TriangularView<MatrixType, _Mode> > : traits<MatrixType>
 {
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
   typedef typename remove_reference<MatrixTypeNested>::type MatrixTypeNestedNonRef;
   typedef typename remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
+  typedef typename MatrixType::PlainObject FullMatrixType;
   typedef MatrixType ExpressionType;
-  typedef typename MatrixType::PlainObject DenseMatrixType;
   enum {
     Mode = _Mode,
-    Flags = (MatrixTypeNestedCleaned::Flags & (HereditaryBits) & (~(PacketAccessBit | DirectAccessBit | LinearAccessBit))) | Mode,
-    CoeffReadCost = MatrixTypeNestedCleaned::CoeffReadCost
+    FlagsLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
+    Flags = (MatrixTypeNestedCleaned::Flags & (HereditaryBits | FlagsLvalueBit) & (~(PacketAccessBit | DirectAccessBit | LinearAccessBit)))
   };
 };
 }
 
-template<int Mode, bool LhsIsTriangular,
-         typename Lhs, bool LhsIsVector,
-         typename Rhs, bool RhsIsVector>
-struct TriangularProduct;
+template<typename _MatrixType, unsigned int _Mode, typename StorageKind> class TriangularViewImpl;
 
 template<typename _MatrixType, unsigned int _Mode> class TriangularView
-  : public TriangularBase<TriangularView<_MatrixType, _Mode> >
+  : public TriangularViewImpl<_MatrixType, _Mode, typename internal::traits<_MatrixType>::StorageKind >
 {
   public:
 
-    typedef TriangularBase<TriangularView> Base;
+    typedef TriangularViewImpl<_MatrixType, _Mode, typename internal::traits<_MatrixType>::StorageKind > Base;
     typedef typename internal::traits<TriangularView>::Scalar Scalar;
-
     typedef _MatrixType MatrixType;
-    typedef typename internal::traits<TriangularView>::DenseMatrixType DenseMatrixType;
-    typedef DenseMatrixType PlainObject;
 
   protected:
     typedef typename internal::traits<TriangularView>::MatrixTypeNested MatrixTypeNested;
     typedef typename internal::traits<TriangularView>::MatrixTypeNestedNonRef MatrixTypeNestedNonRef;
-    typedef typename internal::traits<TriangularView>::MatrixTypeNestedCleaned MatrixTypeNestedCleaned;
 
     typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;
     
   public:
-    using Base::evalToLazy;
-  
 
     typedef typename internal::traits<TriangularView>::StorageKind StorageKind;
-    typedef typename internal::traits<TriangularView>::Index Index;
+    typedef typename internal::traits<TriangularView>::MatrixTypeNestedCleaned NestedExpression;
 
     enum {
       Mode = _Mode,
+      Flags = internal::traits<TriangularView>::Flags,
       TransposeMode = (Mode & Upper ? Lower : 0)
                     | (Mode & Lower ? Upper : 0)
                     | (Mode & (UnitDiag))
-                    | (Mode & (ZeroDiag))
+                    | (Mode & (ZeroDiag)),
+      IsVectorAtCompileTime = false
     };
 
-    inline TriangularView(const MatrixType& matrix) : m_matrix(matrix)
+    // FIXME This, combined with const_cast_derived in transpose() leads to a const-correctness loophole
+    EIGEN_DEVICE_FUNC
+    explicit inline TriangularView(MatrixType& matrix) : m_matrix(matrix)
     {}
+    
+    using Base::operator=;
+    TriangularView& operator=(const TriangularView &other)
+    { return Base::operator=(other); }
 
+    /** \copydoc EigenBase::rows() */
+    EIGEN_DEVICE_FUNC
     inline Index rows() const { return m_matrix.rows(); }
+    /** \copydoc EigenBase::cols() */
+    EIGEN_DEVICE_FUNC
     inline Index cols() const { return m_matrix.cols(); }
-    inline Index outerStride() const { return m_matrix.outerStride(); }
-    inline Index innerStride() const { return m_matrix.innerStride(); }
+
+    /** \returns a const reference to the nested expression */
+    EIGEN_DEVICE_FUNC
+    const NestedExpression& nestedExpression() const { return m_matrix; }
+
+    /** \returns a reference to the nested expression */
+    EIGEN_DEVICE_FUNC
+    NestedExpression& nestedExpression() { return *const_cast<NestedExpression*>(&m_matrix); }
+    
+    typedef TriangularView<const MatrixConjugateReturnType,Mode> ConjugateReturnType;
+    /** \sa MatrixBase::conjugate() const */
+    EIGEN_DEVICE_FUNC
+    inline const ConjugateReturnType conjugate() const
+    { return ConjugateReturnType(m_matrix.conjugate()); }
+
+    typedef TriangularView<const typename MatrixType::AdjointReturnType,TransposeMode> AdjointReturnType;
+    /** \sa MatrixBase::adjoint() const */
+    EIGEN_DEVICE_FUNC
+    inline const AdjointReturnType adjoint() const
+    { return AdjointReturnType(m_matrix.adjoint()); }
+
+    typedef TriangularView<typename MatrixType::TransposeReturnType,TransposeMode> TransposeReturnType;
+     /** \sa MatrixBase::transpose() */
+    EIGEN_DEVICE_FUNC
+    inline TransposeReturnType transpose()
+    {
+      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
+      typename MatrixType::TransposeReturnType tmp(m_matrix.const_cast_derived());
+      return TransposeReturnType(tmp);
+    }
+    
+    typedef TriangularView<const typename MatrixType::ConstTransposeReturnType,TransposeMode> ConstTransposeReturnType;
+    /** \sa MatrixBase::transpose() const */
+    EIGEN_DEVICE_FUNC
+    inline const ConstTransposeReturnType transpose() const
+    {
+      return ConstTransposeReturnType(m_matrix.transpose());
+    }
+
+    template<typename Other>
+    EIGEN_DEVICE_FUNC
+    inline const Solve<TriangularView, Other> 
+    solve(const MatrixBase<Other>& other) const
+    { return Solve<TriangularView, Other>(*this, other.derived()); }
+    
+  // workaround MSVC ICE
+  #if EIGEN_COMP_MSVC
+    template<int Side, typename Other>
+    EIGEN_DEVICE_FUNC
+    inline const internal::triangular_solve_retval<Side,TriangularView, Other>
+    solve(const MatrixBase<Other>& other) const
+    { return Base::template solve<Side>(other); }
+  #else
+    using Base::solve;
+  #endif
+
+    /** \returns a selfadjoint view of the referenced triangular part which must be either \c #Upper or \c #Lower.
+      *
+      * This is a shortcut for \code this->nestedExpression().selfadjointView<(*this)::Mode>() \endcode
+      * \sa MatrixBase::selfadjointView() */
+    EIGEN_DEVICE_FUNC
+    SelfAdjointView<MatrixTypeNestedNonRef,Mode> selfadjointView()
+    {
+      EIGEN_STATIC_ASSERT((Mode&(UnitDiag|ZeroDiag))==0,PROGRAMMING_ERROR);
+      return SelfAdjointView<MatrixTypeNestedNonRef,Mode>(m_matrix);
+    }
+
+    /** This is the const version of selfadjointView() */
+    EIGEN_DEVICE_FUNC
+    const SelfAdjointView<MatrixTypeNestedNonRef,Mode> selfadjointView() const
+    {
+      EIGEN_STATIC_ASSERT((Mode&(UnitDiag|ZeroDiag))==0,PROGRAMMING_ERROR);
+      return SelfAdjointView<MatrixTypeNestedNonRef,Mode>(m_matrix);
+    }
+
+
+    /** \returns the determinant of the triangular matrix
+      * \sa MatrixBase::determinant() */
+    EIGEN_DEVICE_FUNC
+    Scalar determinant() const
+    {
+      if (Mode & UnitDiag)
+        return 1;
+      else if (Mode & ZeroDiag)
+        return 0;
+      else
+        return m_matrix.diagonal().prod();
+    }
+      
+  protected:
+
+    MatrixTypeNested m_matrix;
+};
+
+/** \ingroup Core_Module
+  *
+  * \brief Base class for a triangular part in a \b dense matrix
+  *
+  * This class is an abstract base class of class TriangularView, and objects of type TriangularViewImpl cannot be instantiated.
+  * It extends class TriangularView with additional methods which are available for dense expressions only.
+  *
+  * \sa class TriangularView, MatrixBase::triangularView()
+  */
+template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_MatrixType,_Mode,Dense>
+  : public TriangularBase<TriangularView<_MatrixType, _Mode> >
+{
+  public:
+
+    typedef TriangularView<_MatrixType, _Mode> TriangularViewType;
+    typedef TriangularBase<TriangularViewType> Base;
+    typedef typename internal::traits<TriangularViewType>::Scalar Scalar;
+
+    typedef _MatrixType MatrixType;
+    typedef typename MatrixType::PlainObject DenseMatrixType;
+    typedef DenseMatrixType PlainObject;
+
+  public:
+    using Base::evalToLazy;
+    using Base::derived;
+
+    typedef typename internal::traits<TriangularViewType>::StorageKind StorageKind;
+
+    enum {
+      Mode = _Mode,
+      Flags = internal::traits<TriangularViewType>::Flags
+    };
+
+    /** \returns the outer-stride of the underlying dense matrix
+      * \sa DenseCoeffsBase::outerStride() */
+    EIGEN_DEVICE_FUNC
+    inline Index outerStride() const { return derived().nestedExpression().outerStride(); }
+    /** \returns the inner-stride of the underlying dense matrix
+      * \sa DenseCoeffsBase::innerStride() */
+    EIGEN_DEVICE_FUNC
+    inline Index innerStride() const { return derived().nestedExpression().innerStride(); }
 
     /** \sa MatrixBase::operator+=() */
-    template<typename Other> TriangularView&  operator+=(const DenseBase<Other>& other) { return *this = m_matrix + other.derived(); }
+    template<typename Other>
+    EIGEN_DEVICE_FUNC
+    TriangularViewType&  operator+=(const DenseBase<Other>& other) {
+      internal::call_assignment_no_alias(derived(), other.derived(), internal::add_assign_op<Scalar>());
+      return derived();
+    }
     /** \sa MatrixBase::operator-=() */
-    template<typename Other> TriangularView&  operator-=(const DenseBase<Other>& other) { return *this = m_matrix - other.derived(); }
+    template<typename Other>
+    EIGEN_DEVICE_FUNC
+    TriangularViewType&  operator-=(const DenseBase<Other>& other) {
+      internal::call_assignment_no_alias(derived(), other.derived(), internal::sub_assign_op<Scalar>());
+      return derived();
+    }
+    
     /** \sa MatrixBase::operator*=() */
-    TriangularView&  operator*=(const typename internal::traits<MatrixType>::Scalar& other) { return *this = m_matrix * other; }
-    /** \sa MatrixBase::operator/=() */
-    TriangularView&  operator/=(const typename internal::traits<MatrixType>::Scalar& other) { return *this = m_matrix / other; }
+    EIGEN_DEVICE_FUNC
+    TriangularViewType&  operator*=(const typename internal::traits<MatrixType>::Scalar& other) { return *this = derived().nestedExpression() * other; }
+    /** \sa DenseBase::operator/=() */
+    EIGEN_DEVICE_FUNC
+    TriangularViewType&  operator/=(const typename internal::traits<MatrixType>::Scalar& other) { return *this = derived().nestedExpression() / other; }
 
     /** \sa MatrixBase::fill() */
+    EIGEN_DEVICE_FUNC
     void fill(const Scalar& value) { setConstant(value); }
     /** \sa MatrixBase::setConstant() */
-    TriangularView& setConstant(const Scalar& value)
-    { return *this = MatrixType::Constant(rows(), cols(), value); }
+    EIGEN_DEVICE_FUNC
+    TriangularViewType& setConstant(const Scalar& value)
+    { return *this = MatrixType::Constant(derived().rows(), derived().cols(), value); }
     /** \sa MatrixBase::setZero() */
-    TriangularView& setZero() { return setConstant(Scalar(0)); }
+    EIGEN_DEVICE_FUNC
+    TriangularViewType& setZero() { return setConstant(Scalar(0)); }
     /** \sa MatrixBase::setOnes() */
-    TriangularView& setOnes() { return setConstant(Scalar(1)); }
+    EIGEN_DEVICE_FUNC
+    TriangularViewType& setOnes() { return setConstant(Scalar(1)); }
 
     /** \sa MatrixBase::coeff()
       * \warning the coordinates must fit into the referenced triangular part
       */
+    EIGEN_DEVICE_FUNC
     inline Scalar coeff(Index row, Index col) const
     {
       Base::check_coordinates_internal(row, col);
-      return m_matrix.coeff(row, col);
+      return derived().nestedExpression().coeff(row, col);
     }
 
     /** \sa MatrixBase::coeffRef()
       * \warning the coordinates must fit into the referenced triangular part
       */
+    EIGEN_DEVICE_FUNC
     inline Scalar& coeffRef(Index row, Index col)
     {
+      EIGEN_STATIC_ASSERT_LVALUE(TriangularViewType);
       Base::check_coordinates_internal(row, col);
-      return m_matrix.const_cast_derived().coeffRef(row, col);
+      return derived().nestedExpression().const_cast_derived().coeffRef(row, col);
     }
 
-    const MatrixTypeNestedCleaned& nestedExpression() const { return m_matrix; }
-    MatrixTypeNestedCleaned& nestedExpression() { return *const_cast<MatrixTypeNestedCleaned*>(&m_matrix); }
-
     /** Assigns a triangular matrix to a triangular part of a dense matrix */
     template<typename OtherDerived>
-    TriangularView& operator=(const TriangularBase<OtherDerived>& other);
+    EIGEN_DEVICE_FUNC
+    TriangularViewType& operator=(const TriangularBase<OtherDerived>& other);
 
+    /** Shortcut for \code *this = other.triangularView<(*this)::Mode>() \endcode */
     template<typename OtherDerived>
-    TriangularView& operator=(const MatrixBase<OtherDerived>& other);
+    EIGEN_DEVICE_FUNC
+    TriangularViewType& operator=(const MatrixBase<OtherDerived>& other);
 
-    TriangularView& operator=(const TriangularView& other)
-    { return *this = other.nestedExpression(); }
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    EIGEN_DEVICE_FUNC
+    TriangularViewType& operator=(const TriangularViewImpl& other)
+    { return *this = other.derived().nestedExpression(); }
 
+    /** \deprecated */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     void lazyAssign(const TriangularBase<OtherDerived>& other);
 
+    /** \deprecated */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     void lazyAssign(const MatrixBase<OtherDerived>& other);
-
-    /** \sa MatrixBase::conjugate() */
-    inline TriangularView<MatrixConjugateReturnType,Mode> conjugate()
-    { return m_matrix.conjugate(); }
-    /** \sa MatrixBase::conjugate() const */
-    inline const TriangularView<MatrixConjugateReturnType,Mode> conjugate() const
-    { return m_matrix.conjugate(); }
-
-    /** \sa MatrixBase::adjoint() const */
-    inline const TriangularView<const typename MatrixType::AdjointReturnType,TransposeMode> adjoint() const
-    { return m_matrix.adjoint(); }
-
-    /** \sa MatrixBase::transpose() */
-    inline TriangularView<Transpose<MatrixType>,TransposeMode> transpose()
-    {
-      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
-      return m_matrix.const_cast_derived().transpose();
-    }
-    /** \sa MatrixBase::transpose() const */
-    inline const TriangularView<Transpose<MatrixType>,TransposeMode> transpose() const
-    {
-      return m_matrix.transpose();
-    }
+#endif
 
     /** Efficient triangular matrix times vector/matrix product */
     template<typename OtherDerived>
-    TriangularProduct<Mode,true,MatrixType,false,OtherDerived, OtherDerived::IsVectorAtCompileTime>
+    EIGEN_DEVICE_FUNC
+    const Product<TriangularViewType,OtherDerived>
     operator*(const MatrixBase<OtherDerived>& rhs) const
     {
-      return TriangularProduct
-              <Mode,true,MatrixType,false,OtherDerived,OtherDerived::IsVectorAtCompileTime>
-              (m_matrix, rhs.derived());
+      return Product<TriangularViewType,OtherDerived>(derived(), rhs.derived());
     }
 
     /** Efficient vector/matrix times triangular matrix product */
     template<typename OtherDerived> friend
-    TriangularProduct<Mode,false,OtherDerived,OtherDerived::IsVectorAtCompileTime,MatrixType,false>
-    operator*(const MatrixBase<OtherDerived>& lhs, const TriangularView& rhs)
-    {
-      return TriangularProduct
-              <Mode,false,OtherDerived,OtherDerived::IsVectorAtCompileTime,MatrixType,false>
-              (lhs.derived(),rhs.m_matrix);
-    }
-
-    #ifdef EIGEN2_SUPPORT
-    template<typename OtherDerived>
-    struct eigen2_product_return_type
-    {
-      typedef typename TriangularView<MatrixType,Mode>::DenseMatrixType DenseMatrixType;
-      typedef typename OtherDerived::PlainObject::DenseType OtherPlainObject;
-      typedef typename ProductReturnType<DenseMatrixType, OtherPlainObject>::Type ProdRetType;
-      typedef typename ProdRetType::PlainObject type;
-    };
-    template<typename OtherDerived>
-    const typename eigen2_product_return_type<OtherDerived>::type
-    operator*(const EigenBase<OtherDerived>& rhs) const
-    {
-      typename OtherDerived::PlainObject::DenseType rhsPlainObject;
-      rhs.evalTo(rhsPlainObject);
-      return this->toDenseMatrix() * rhsPlainObject;
-    }
-    template<typename OtherMatrixType>
-    bool isApprox(const TriangularView<OtherMatrixType, Mode>& other, typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) const
-    {
-      return this->toDenseMatrix().isApprox(other.toDenseMatrix(), precision);
-    }
-    template<typename OtherDerived>
-    bool isApprox(const MatrixBase<OtherDerived>& other, typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) const
-    {
-      return this->toDenseMatrix().isApprox(other, precision);
-    }
-    #endif // EIGEN2_SUPPORT
-
+    EIGEN_DEVICE_FUNC
+    const Product<OtherDerived,TriangularViewType>
+    operator*(const MatrixBase<OtherDerived>& lhs, const TriangularViewImpl& rhs)
+    {
+      return Product<OtherDerived,TriangularViewType>(lhs.derived(),rhs.derived());
+    }
+
+    /** \returns the product of the inverse of \c *this with \a other, \a *this being triangular.
+      *
+      * This function computes the inverse-matrix matrix product inverse(\c *this) * \a other if
+      * \a Side==OnTheLeft (the default), or the right-inverse-multiply  \a other * inverse(\c *this) if
+      * \a Side==OnTheRight.
+      *
+      * The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the
+      * diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this
+      * is a lower (resp. upper) triangular matrix.
+      *
+      * Example: \include Triangular_solve.cpp
+      * Output: \verbinclude Triangular_solve.out
+      *
+      * This function returns an expression of the inverse-multiply and can work in-place if it is assigned
+      * to the same matrix or vector \a other.
+      *
+      * For users coming from BLAS, this function (and more specifically solveInPlace()) offers
+      * all the operations supported by the \c *TRSV and \c *TRSM BLAS routines.
+      *
+      * \sa TriangularView::solveInPlace()
+      */
     template<int Side, typename Other>
-    inline const internal::triangular_solve_retval<Side,TriangularView, Other>
+    EIGEN_DEVICE_FUNC
+    inline const internal::triangular_solve_retval<Side,TriangularViewType, Other>
     solve(const MatrixBase<Other>& other) const;
 
+    /** "in-place" version of TriangularView::solve() where the result is written in \a other
+      *
+      * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
+      * This function will const_cast it, so constness isn't honored here.
+      *
+      * See TriangularView::solve() for the details.
+      */
     template<int Side, typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     void solveInPlace(const MatrixBase<OtherDerived>& other) const;
 
-    template<typename Other>
-    inline const internal::triangular_solve_retval<OnTheLeft,TriangularView, Other> 
-    solve(const MatrixBase<Other>& other) const
-    { return solve<OnTheLeft>(other); }
-
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     void solveInPlace(const MatrixBase<OtherDerived>& other) const
     { return solveInPlace<OnTheLeft>(other); }
 
-    const SelfAdjointView<MatrixTypeNestedNonRef,Mode> selfadjointView() const
-    {
-      EIGEN_STATIC_ASSERT((Mode&UnitDiag)==0,PROGRAMMING_ERROR);
-      return SelfAdjointView<MatrixTypeNestedNonRef,Mode>(m_matrix);
-    }
-    SelfAdjointView<MatrixTypeNestedNonRef,Mode> selfadjointView()
-    {
-      EIGEN_STATIC_ASSERT((Mode&UnitDiag)==0,PROGRAMMING_ERROR);
-      return SelfAdjointView<MatrixTypeNestedNonRef,Mode>(m_matrix);
-    }
-
+    /** Swaps the coefficients of the common triangular parts of two matrices */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+    void swap(TriangularBase<OtherDerived> &other)
+#else
     void swap(TriangularBase<OtherDerived> const & other)
+#endif
     {
-      TriangularView<SwapWrapper<MatrixType>,Mode>(const_cast<MatrixType&>(m_matrix)).lazyAssign(other.derived());
+      EIGEN_STATIC_ASSERT_LVALUE(OtherDerived);
+      call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op<Scalar>());
     }
 
+    /** \deprecated
+      * Shortcut for \code (*this).swap(other.triangularView<(*this)::Mode>()) \endcode */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     void swap(MatrixBase<OtherDerived> const & other)
     {
-      SwapWrapper<MatrixType> swaper(const_cast<MatrixType&>(m_matrix));
-      TriangularView<SwapWrapper<MatrixType>,Mode>(swaper).lazyAssign(other.derived());
+      EIGEN_STATIC_ASSERT_LVALUE(OtherDerived);
+      call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op<Scalar>());
     }
 
-    Scalar determinant() const
-    {
-      if (Mode & UnitDiag)
-        return 1;
-      else if (Mode & ZeroDiag)
-        return 0;
-      else
-        return m_matrix.diagonal().prod();
-    }
-    
-    // TODO simplify the following:
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    EIGEN_STRONG_INLINE TriangularView& operator=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
-    {
-      setZero();
-      return assignProduct(other,1);
-    }
-    
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    EIGEN_STRONG_INLINE TriangularView& operator+=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
-    {
-      return assignProduct(other,1);
+    template<typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _solve_impl(const RhsType &rhs, DstType &dst) const {
+      if(!(internal::is_same<RhsType,DstType>::value && internal::extract_data(dst) == internal::extract_data(rhs)))
+        dst = rhs;
+      this->solveInPlace(dst);
     }
-    
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    EIGEN_STRONG_INLINE TriangularView& operator-=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
-    {
-      return assignProduct(other,-1);
-    }
-    
-    
-    template<typename ProductDerived>
-    EIGEN_STRONG_INLINE TriangularView& operator=(const ScaledProduct<ProductDerived>& other)
-    {
-      setZero();
-      return assignProduct(other,other.alpha());
-    }
-    
-    template<typename ProductDerived>
-    EIGEN_STRONG_INLINE TriangularView& operator+=(const ScaledProduct<ProductDerived>& other)
-    {
-      return assignProduct(other,other.alpha());
-    }
-    
-    template<typename ProductDerived>
-    EIGEN_STRONG_INLINE TriangularView& operator-=(const ScaledProduct<ProductDerived>& other)
-    {
-      return assignProduct(other,-other.alpha());
-    }
-    
-  protected:
-    
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    EIGEN_STRONG_INLINE TriangularView& assignProduct(const ProductBase<ProductDerived, Lhs,Rhs>& prod, const Scalar& alpha);
 
-    MatrixTypeNested m_matrix;
+    template<typename ProductType>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TriangularViewType& _assignProduct(const ProductType& prod, const Scalar& alpha);
 };
 
 /***************************************************************************
 * Implementation of triangular evaluation/assignment
 ***************************************************************************/
 
-namespace internal {
-
-template<typename Derived1, typename Derived2, unsigned int Mode, int UnrollCount, bool ClearOpposite>
-struct triangular_assignment_selector
-{
-  enum {
-    col = (UnrollCount-1) / Derived1::RowsAtCompileTime,
-    row = (UnrollCount-1) % Derived1::RowsAtCompileTime
-  };
-  
-  typedef typename Derived1::Scalar Scalar;
-
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    triangular_assignment_selector<Derived1, Derived2, Mode, UnrollCount-1, ClearOpposite>::run(dst, src);
-
-    eigen_assert( Mode == Upper || Mode == Lower
-            || Mode == StrictlyUpper || Mode == StrictlyLower
-            || Mode == UnitUpper || Mode == UnitLower);
-    if((Mode == Upper && row <= col)
-    || (Mode == Lower && row >= col)
-    || (Mode == StrictlyUpper && row < col)
-    || (Mode == StrictlyLower && row > col)
-    || (Mode == UnitUpper && row < col)
-    || (Mode == UnitLower && row > col))
-      dst.copyCoeff(row, col, src);
-    else if(ClearOpposite)
-    {
-      if (Mode&UnitDiag && row==col)
-        dst.coeffRef(row, col) = Scalar(1);
-      else
-        dst.coeffRef(row, col) = Scalar(0);
-    }
-  }
-};
-
-// prevent buggy user code from causing an infinite recursion
-template<typename Derived1, typename Derived2, unsigned int Mode, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, Mode, 0, ClearOpposite>
-{
-  static inline void run(Derived1 &, const Derived2 &) {}
-};
-
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, Upper, Dynamic, ClearOpposite>
-{
-  typedef typename Derived1::Index Index;
-  typedef typename Derived1::Scalar Scalar;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    for(Index j = 0; j < dst.cols(); ++j)
-    {
-      Index maxi = (std::min)(j, dst.rows()-1);
-      for(Index i = 0; i <= maxi; ++i)
-        dst.copyCoeff(i, j, src);
-      if (ClearOpposite)
-        for(Index i = maxi+1; i < dst.rows(); ++i)
-          dst.coeffRef(i, j) = Scalar(0);
-    }
-  }
-};
-
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, Lower, Dynamic, ClearOpposite>
-{
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    for(Index j = 0; j < dst.cols(); ++j)
-    {
-      for(Index i = j; i < dst.rows(); ++i)
-        dst.copyCoeff(i, j, src);
-      Index maxi = (std::min)(j, dst.rows());
-      if (ClearOpposite)
-        for(Index i = 0; i < maxi; ++i)
-          dst.coeffRef(i, j) = static_cast<typename Derived1::Scalar>(0);
-    }
-  }
-};
-
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, StrictlyUpper, Dynamic, ClearOpposite>
-{
-  typedef typename Derived1::Index Index;
-  typedef typename Derived1::Scalar Scalar;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    for(Index j = 0; j < dst.cols(); ++j)
-    {
-      Index maxi = (std::min)(j, dst.rows());
-      for(Index i = 0; i < maxi; ++i)
-        dst.copyCoeff(i, j, src);
-      if (ClearOpposite)
-        for(Index i = maxi; i < dst.rows(); ++i)
-          dst.coeffRef(i, j) = Scalar(0);
-    }
-  }
-};
-
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, StrictlyLower, Dynamic, ClearOpposite>
-{
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    for(Index j = 0; j < dst.cols(); ++j)
-    {
-      for(Index i = j+1; i < dst.rows(); ++i)
-        dst.copyCoeff(i, j, src);
-      Index maxi = (std::min)(j, dst.rows()-1);
-      if (ClearOpposite)
-        for(Index i = 0; i <= maxi; ++i)
-          dst.coeffRef(i, j) = static_cast<typename Derived1::Scalar>(0);
-    }
-  }
-};
-
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, UnitUpper, Dynamic, ClearOpposite>
-{
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    for(Index j = 0; j < dst.cols(); ++j)
-    {
-      Index maxi = (std::min)(j, dst.rows());
-      for(Index i = 0; i < maxi; ++i)
-        dst.copyCoeff(i, j, src);
-      if (ClearOpposite)
-      {
-        for(Index i = maxi+1; i < dst.rows(); ++i)
-          dst.coeffRef(i, j) = 0;
-      }
-    }
-    dst.diagonal().setOnes();
-  }
-};
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, UnitLower, Dynamic, ClearOpposite>
-{
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    for(Index j = 0; j < dst.cols(); ++j)
-    {
-      Index maxi = (std::min)(j, dst.rows());
-      for(Index i = maxi+1; i < dst.rows(); ++i)
-        dst.copyCoeff(i, j, src);
-      if (ClearOpposite)
-      {
-        for(Index i = 0; i < maxi; ++i)
-          dst.coeffRef(i, j) = 0;
-      }
-    }
-    dst.diagonal().setOnes();
-  }
-};
-
-} // end namespace internal
-
 // FIXME should we keep that possibility
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
 inline TriangularView<MatrixType, Mode>&
-TriangularView<MatrixType, Mode>::operator=(const MatrixBase<OtherDerived>& other)
+TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDerived>& other)
 {
-  if(OtherDerived::Flags & EvalBeforeAssigningBit)
-  {
-    typename internal::plain_matrix_type<OtherDerived>::type other_evaluated(other.rows(), other.cols());
-    other_evaluated.template triangularView<Mode>().lazyAssign(other.derived());
-    lazyAssign(other_evaluated);
-  }
-  else
-    lazyAssign(other.derived());
-  return *this;
+  internal::call_assignment_no_alias(derived(), other.derived(), internal::assign_op<Scalar>());
+  return derived();
 }
 
 // FIXME should we keep that possibility
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-void TriangularView<MatrixType, Mode>::lazyAssign(const MatrixBase<OtherDerived>& other)
+void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<OtherDerived>& other)
 {
-  enum {
-    unroll = MatrixType::SizeAtCompileTime != Dynamic
-          && internal::traits<OtherDerived>::CoeffReadCost != Dynamic
-          && MatrixType::SizeAtCompileTime*internal::traits<OtherDerived>::CoeffReadCost/2 <= EIGEN_UNROLLING_LIMIT
-  };
-  eigen_assert(m_matrix.rows() == other.rows() && m_matrix.cols() == other.cols());
-
-  internal::triangular_assignment_selector
-    <MatrixType, OtherDerived, int(Mode),
-    unroll ? int(MatrixType::SizeAtCompileTime) : Dynamic,
-    false // do not change the opposite triangular part
-    >::run(m_matrix.const_cast_derived(), other.derived());
+  internal::call_assignment_no_alias(derived(), other.template triangularView<Mode>());
 }
 
 
@@ -628,37 +570,19 @@ void TriangularView<MatrixType, Mode>::lazyAssign(const MatrixBase<OtherDerived>
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
 inline TriangularView<MatrixType, Mode>&
-TriangularView<MatrixType, Mode>::operator=(const TriangularBase<OtherDerived>& other)
+TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const TriangularBase<OtherDerived>& other)
 {
   eigen_assert(Mode == int(OtherDerived::Mode));
-  if(internal::traits<OtherDerived>::Flags & EvalBeforeAssigningBit)
-  {
-    typename OtherDerived::DenseMatrixType other_evaluated(other.rows(), other.cols());
-    other_evaluated.template triangularView<Mode>().lazyAssign(other.derived().nestedExpression());
-    lazyAssign(other_evaluated);
-  }
-  else
-    lazyAssign(other.derived().nestedExpression());
-  return *this;
+  internal::call_assignment(derived(), other.derived());
+  return derived();
 }
 
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-void TriangularView<MatrixType, Mode>::lazyAssign(const TriangularBase<OtherDerived>& other)
+void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBase<OtherDerived>& other)
 {
-  enum {
-    unroll = MatrixType::SizeAtCompileTime != Dynamic
-                   && internal::traits<OtherDerived>::CoeffReadCost != Dynamic
-                   && MatrixType::SizeAtCompileTime * internal::traits<OtherDerived>::CoeffReadCost / 2
-                        <= EIGEN_UNROLLING_LIMIT
-  };
-  eigen_assert(m_matrix.rows() == other.rows() && m_matrix.cols() == other.cols());
-
-  internal::triangular_assignment_selector
-    <MatrixType, OtherDerived, int(Mode),
-    unroll ? int(MatrixType::SizeAtCompileTime) : Dynamic,
-    false // preserve the opposite triangular part
-    >::run(m_matrix.const_cast_derived(), other.derived().nestedExpression());
+  eigen_assert(Mode == int(OtherDerived::Mode));
+  internal::call_assignment_no_alias(derived(), other.derived());
 }
 
 /***************************************************************************
@@ -681,27 +605,6 @@ void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
     evalToLazy(other.derived());
 }
 
-/** Assigns a triangular or selfadjoint matrix to a dense matrix.
-  * If the matrix is triangular, the opposite part is set to zero. */
-template<typename Derived>
-template<typename DenseDerived>
-void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
-{
-  enum {
-    unroll = DenseDerived::SizeAtCompileTime != Dynamic
-                   && internal::traits<Derived>::CoeffReadCost != Dynamic
-                   && DenseDerived::SizeAtCompileTime * internal::traits<Derived>::CoeffReadCost / 2
-                        <= EIGEN_UNROLLING_LIMIT
-  };
-  other.derived().resize(this->rows(), this->cols());
-
-  internal::triangular_assignment_selector
-    <DenseDerived, typename internal::traits<Derived>::MatrixTypeNestedCleaned, Derived::Mode,
-    unroll ? int(DenseDerived::SizeAtCompileTime) : Dynamic,
-    true // clear the opposite triangular part
-    >::run(other.derived(), derived().nestedExpression());
-}
-
 /***************************************************************************
 * Implementation of TriangularView methods
 ***************************************************************************/
@@ -710,49 +613,14 @@ void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
 * Implementation of MatrixBase methods
 ***************************************************************************/
 
-#ifdef EIGEN2_SUPPORT
-
-// implementation of part<>(), including the SelfAdjoint case.
-
-namespace internal {
-template<typename MatrixType, unsigned int Mode>
-struct eigen2_part_return_type
-{
-  typedef TriangularView<MatrixType, Mode> type;
-};
-
-template<typename MatrixType>
-struct eigen2_part_return_type<MatrixType, SelfAdjoint>
-{
-  typedef SelfAdjointView<MatrixType, Upper> type;
-};
-}
-
-/** \deprecated use MatrixBase::triangularView() */
-template<typename Derived>
-template<unsigned int Mode>
-const typename internal::eigen2_part_return_type<Derived, Mode>::type MatrixBase<Derived>::part() const
-{
-  return derived();
-}
-
-/** \deprecated use MatrixBase::triangularView() */
-template<typename Derived>
-template<unsigned int Mode>
-typename internal::eigen2_part_return_type<Derived, Mode>::type MatrixBase<Derived>::part()
-{
-  return derived();
-}
-#endif
-
 /**
   * \returns an expression of a triangular view extracted from the current matrix
   *
   * The parameter \a Mode can have the following values: \c #Upper, \c #StrictlyUpper, \c #UnitUpper,
   * \c #Lower, \c #StrictlyLower, \c #UnitLower.
   *
-  * Example: \include MatrixBase_extract.cpp
-  * Output: \verbinclude MatrixBase_extract.out
+  * Example: \include MatrixBase_triangularView.cpp
+  * Output: \verbinclude MatrixBase_triangularView.out
   *
   * \sa class TriangularView
   */
@@ -761,7 +629,7 @@ template<unsigned int Mode>
 typename MatrixBase<Derived>::template TriangularViewReturnType<Mode>::Type
 MatrixBase<Derived>::triangularView()
 {
-  return derived();
+  return typename TriangularViewReturnType<Mode>::Type(derived());
 }
 
 /** This is the const version of MatrixBase::triangularView() */
@@ -770,7 +638,7 @@ template<unsigned int Mode>
 typename MatrixBase<Derived>::template ConstTriangularViewReturnType<Mode>::Type
 MatrixBase<Derived>::triangularView() const
 {
-  return derived();
+  return typename ConstTriangularViewReturnType<Mode>::Type(derived());
 }
 
 /** \returns true if *this is approximately equal to an upper triangular matrix,
@@ -825,6 +693,290 @@ bool MatrixBase<Derived>::isLowerTriangular(const RealScalar& prec) const
   return true;
 }
 
+
+/***************************************************************************
+****************************************************************************
+* Evaluators and Assignment of triangular expressions
+***************************************************************************
+***************************************************************************/
+
+namespace internal {
+
+  
+// TODO currently a triangular expression has the form TriangularView<.,.>
+//      in the future triangular-ness should be defined by the expression traits
+//      such that Transpose<TriangularView<.,.> > is valid. (currently TriangularBase::transpose() is overloaded to make it work)
+template<typename MatrixType, unsigned int Mode>
+struct evaluator_traits<TriangularView<MatrixType,Mode> >
+{
+  typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind;
+  typedef typename glue_shapes<typename evaluator_traits<MatrixType>::Shape, TriangularShape>::type Shape;
+  
+  // 1 if assignment A = B assumes aliasing when B is of type T and thus B needs to be evaluated into a
+  // temporary; 0 if not.
+  static const int AssumeAliasing = 0;
+};
+
+template<typename MatrixType, unsigned int Mode>
+struct unary_evaluator<TriangularView<MatrixType,Mode>, IndexBased>
+ : evaluator<typename internal::remove_all<MatrixType>::type>
+{
+  typedef TriangularView<MatrixType,Mode> XprType;
+  typedef evaluator<typename internal::remove_all<MatrixType>::type> Base;
+  unary_evaluator(const XprType &xpr) : Base(xpr.nestedExpression()) {}
+};
+
+// Additional assignment kinds:
+struct Triangular2Triangular    {};
+struct Triangular2Dense         {};
+struct Dense2Triangular         {};
+
+
+template<typename Kernel, unsigned int Mode, int UnrollCount, bool ClearOpposite> struct triangular_assignment_loop;
+
+ 
+/** \internal Specialization of the dense assignment kernel for triangular matrices.
+  * The main difference is that the triangular, diagonal, and opposite parts are processed through three different functions.
+  * \tparam UpLo must be either Lower or Upper
+  * \tparam Mode must be either 0, UnitDiag, ZeroDiag, or SelfAdjoint
+  */
+template<int UpLo, int Mode, int SetOpposite, typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor, int Version = Specialized>
+class triangular_dense_assignment_kernel : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, Version>
+{
+protected:
+  typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, Version> Base;
+  typedef typename Base::DstXprType DstXprType;
+  typedef typename Base::SrcXprType SrcXprType;
+  using Base::m_dst;
+  using Base::m_src;
+  using Base::m_functor;
+public:
+  
+  typedef typename Base::DstEvaluatorType DstEvaluatorType;
+  typedef typename Base::SrcEvaluatorType SrcEvaluatorType;
+  typedef typename Base::Scalar Scalar;
+  typedef typename Base::AssignmentTraits AssignmentTraits;
+  
+  
+  EIGEN_DEVICE_FUNC triangular_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
+    : Base(dst, src, func, dstExpr)
+  {}
+  
+#ifdef EIGEN_INTERNAL_DEBUGGING
+  EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col)
+  {
+    eigen_internal_assert(row!=col);
+    Base::assignCoeff(row,col);
+  }
+#else
+  using Base::assignCoeff;
+#endif
+  
+  EIGEN_DEVICE_FUNC void assignDiagonalCoeff(Index id)
+  {
+         if(Mode==UnitDiag && SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(id,id), Scalar(1));
+    else if(Mode==ZeroDiag && SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(id,id), Scalar(0));
+    else if(Mode==0)                       Base::assignCoeff(id,id);
+  }
+  
+  EIGEN_DEVICE_FUNC void assignOppositeCoeff(Index row, Index col)
+  { 
+    eigen_internal_assert(row!=col);
+    if(SetOpposite)
+      m_functor.assignCoeff(m_dst.coeffRef(row,col), Scalar(0));
+  }
+};
+
+template<int Mode, bool SetOpposite, typename DstXprType, typename SrcXprType, typename Functor>
+EIGEN_DEVICE_FUNC void call_triangular_assignment_loop(const DstXprType& dst, const SrcXprType& src, const Functor &func)
+{
+  eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+  
+  typedef evaluator<DstXprType> DstEvaluatorType;
+  typedef evaluator<SrcXprType> SrcEvaluatorType;
+
+  DstEvaluatorType dstEvaluator(dst);
+  SrcEvaluatorType srcEvaluator(src);
+    
+  typedef triangular_dense_assignment_kernel< Mode&(Lower|Upper),Mode&(UnitDiag|ZeroDiag|SelfAdjoint),SetOpposite,
+                                              DstEvaluatorType,SrcEvaluatorType,Functor> Kernel;
+  Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
+  
+  enum {
+      unroll = DstXprType::SizeAtCompileTime != Dynamic
+            && SrcEvaluatorType::CoeffReadCost < HugeCost
+            && DstXprType::SizeAtCompileTime * SrcEvaluatorType::CoeffReadCost / 2 <= EIGEN_UNROLLING_LIMIT
+    };
+  
+  triangular_assignment_loop<Kernel, Mode, unroll ? int(DstXprType::SizeAtCompileTime) : Dynamic, SetOpposite>::run(kernel);
+}
+
+template<int Mode, bool SetOpposite, typename DstXprType, typename SrcXprType>
+EIGEN_DEVICE_FUNC void call_triangular_assignment_loop(const DstXprType& dst, const SrcXprType& src)
+{
+  call_triangular_assignment_loop<Mode,SetOpposite>(dst, src, internal::assign_op<typename DstXprType::Scalar>());
+}
+
+template<> struct AssignmentKind<TriangularShape,TriangularShape> { typedef Triangular2Triangular Kind; };
+template<> struct AssignmentKind<DenseShape,TriangularShape>      { typedef Triangular2Dense      Kind; };
+template<> struct AssignmentKind<TriangularShape,DenseShape>      { typedef Dense2Triangular      Kind; };
+
+
+template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
+struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Triangular, Scalar>
+{
+  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
+  {
+    eigen_assert(int(DstXprType::Mode) == int(SrcXprType::Mode));
+    
+    call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);  
+  }
+};
+
+template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
+struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Dense, Scalar>
+{
+  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
+  {
+    call_triangular_assignment_loop<SrcXprType::Mode, (SrcXprType::Mode&SelfAdjoint)==0>(dst, src, func);  
+  }
+};
+
+template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
+struct Assignment<DstXprType, SrcXprType, Functor, Dense2Triangular, Scalar>
+{
+  EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
+  {
+    call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);  
+  }
+};
+
+
+template<typename Kernel, unsigned int Mode, int UnrollCount, bool SetOpposite>
+struct triangular_assignment_loop
+{
+  // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
+  typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
+  typedef typename DstEvaluatorType::XprType DstXprType;
+  
+  enum {
+    col = (UnrollCount-1) / DstXprType::RowsAtCompileTime,
+    row = (UnrollCount-1) % DstXprType::RowsAtCompileTime
+  };
+  
+  typedef typename Kernel::Scalar Scalar;
+
+  EIGEN_DEVICE_FUNC
+  static inline void run(Kernel &kernel)
+  {
+    triangular_assignment_loop<Kernel, Mode, UnrollCount-1, SetOpposite>::run(kernel);
+    
+    if(row==col)
+      kernel.assignDiagonalCoeff(row);
+    else if( ((Mode&Lower) && row>col) || ((Mode&Upper) && row<col) )
+      kernel.assignCoeff(row,col);
+    else if(SetOpposite)
+      kernel.assignOppositeCoeff(row,col);
+  }
+};
+
+// prevent buggy user code from causing an infinite recursion
+template<typename Kernel, unsigned int Mode, bool SetOpposite>
+struct triangular_assignment_loop<Kernel, Mode, 0, SetOpposite>
+{
+  EIGEN_DEVICE_FUNC
+  static inline void run(Kernel &) {}
+};
+
+
+
+// TODO: experiment with a recursive assignment procedure splitting the current
+//       triangular part into one rectangular and two triangular parts.
+
+
+template<typename Kernel, unsigned int Mode, bool SetOpposite>
+struct triangular_assignment_loop<Kernel, Mode, Dynamic, SetOpposite>
+{
+  typedef typename Kernel::Scalar Scalar;
+  EIGEN_DEVICE_FUNC
+  static inline void run(Kernel &kernel)
+  {
+    for(Index j = 0; j < kernel.cols(); ++j)
+    {
+      Index maxi = (std::min)(j, kernel.rows());
+      Index i = 0;
+      if (((Mode&Lower) && SetOpposite) || (Mode&Upper))
+      {
+        for(; i < maxi; ++i)
+          if(Mode&Upper) kernel.assignCoeff(i, j);
+          else           kernel.assignOppositeCoeff(i, j);
+      }
+      else
+        i = maxi;
+      
+      if(i<kernel.rows()) // then i==j
+        kernel.assignDiagonalCoeff(i++);
+      
+      if (((Mode&Upper) && SetOpposite) || (Mode&Lower))
+      {
+        for(; i < kernel.rows(); ++i)
+          if(Mode&Lower) kernel.assignCoeff(i, j);
+          else           kernel.assignOppositeCoeff(i, j);
+      }
+    }
+  }
+};
+
+} // end namespace internal
+
+/** Assigns a triangular or selfadjoint matrix to a dense matrix.
+  * If the matrix is triangular, the opposite part is set to zero. */
+template<typename Derived>
+template<typename DenseDerived>
+void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
+{
+  other.derived().resize(this->rows(), this->cols());
+  internal::call_triangular_assignment_loop<Derived::Mode,(Derived::Mode&SelfAdjoint)==0 /* SetOpposite */>(other.derived(), derived().nestedExpression());
+}
+
+namespace internal {
+  
+// Triangular = Product
+template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::assign_op<Scalar>, Dense2Triangular, Scalar>
+{
+  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  {
+    dst.setZero();
+    dst._assignProduct(src, 1);
+  }
+};
+
+// Triangular += Product
+template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::add_assign_op<Scalar>, Dense2Triangular, Scalar>
+{
+  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar> &)
+  {
+    dst._assignProduct(src, 1);
+  }
+};
+
+// Triangular -= Product
+template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs,Rhs,DefaultProduct>, internal::sub_assign_op<Scalar>, Dense2Triangular, Scalar>
+{
+  typedef Product<Lhs,Rhs,DefaultProduct> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar> &)
+  {
+    dst._assignProduct(src, -1);
+  }
+};
+
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_TRIANGULARMATRIX_H
diff --git a/nuparu/include/Eigen/src/Core/VectorBlock.h b/nuparu/include/Eigen/src/Core/VectorBlock.h
index 1a7330f3..216c568c 100644
--- a/nuparu/include/Eigen/src/Core/VectorBlock.h
+++ b/nuparu/include/Eigen/src/Core/VectorBlock.h
@@ -72,6 +72,7 @@ template<typename VectorType, int Size> class VectorBlock
 
     /** Dynamic-size constructor
       */
+    EIGEN_DEVICE_FUNC
     inline VectorBlock(VectorType& vector, Index start, Index size)
       : Base(vector,
              IsColVector ? start : 0, IsColVector ? 0 : start,
@@ -82,6 +83,7 @@ template<typename VectorType, int Size> class VectorBlock
 
     /** Fixed-size constructor
       */
+    EIGEN_DEVICE_FUNC
     inline VectorBlock(VectorType& vector, Index start)
       : Base(vector, IsColVector ? start : 0, IsColVector ? 0 : start)
     {
diff --git a/nuparu/include/Eigen/src/Core/VectorwiseOp.h b/nuparu/include/Eigen/src/Core/VectorwiseOp.h
index 51156487..483f7190 100644
--- a/nuparu/include/Eigen/src/Core/VectorwiseOp.h
+++ b/nuparu/include/Eigen/src/Core/VectorwiseOp.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_PARTIAL_REDUX_H
 #define EIGEN_PARTIAL_REDUX_H
 
-namespace Eigen { 
+namespace Eigen {
 
 /** \class PartialReduxExpr
   * \ingroup Core_Module
@@ -41,63 +41,43 @@ struct traits<PartialReduxExpr<MatrixType, MemberOp, Direction> >
   typedef typename traits<MatrixType>::StorageKind StorageKind;
   typedef typename traits<MatrixType>::XprKind XprKind;
   typedef typename MatrixType::Scalar InputScalar;
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
-  typedef typename remove_all<MatrixTypeNested>::type _MatrixTypeNested;
   enum {
     RowsAtCompileTime = Direction==Vertical   ? 1 : MatrixType::RowsAtCompileTime,
     ColsAtCompileTime = Direction==Horizontal ? 1 : MatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = Direction==Vertical   ? 1 : MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = Direction==Horizontal ? 1 : MatrixType::MaxColsAtCompileTime,
-    Flags0 = (unsigned int)_MatrixTypeNested::Flags & HereditaryBits,
-    Flags = (Flags0 & ~RowMajorBit) | (RowsAtCompileTime == 1 ? RowMajorBit : 0),
-    TraversalSize = Direction==Vertical ? RowsAtCompileTime : ColsAtCompileTime
-  };
-  #if EIGEN_GNUC_AT_LEAST(3,4)
-  typedef typename MemberOp::template Cost<InputScalar,int(TraversalSize)> CostOpType;
-  #else
-  typedef typename MemberOp::template Cost<InputScalar,TraversalSize> CostOpType;
-  #endif
-  enum {
-    CoeffReadCost = TraversalSize * traits<_MatrixTypeNested>::CoeffReadCost + int(CostOpType::value)
+    Flags = RowsAtCompileTime == 1 ? RowMajorBit : 0,
+    TraversalSize = Direction==Vertical ? MatrixType::RowsAtCompileTime :  MatrixType::ColsAtCompileTime
   };
 };
 }
 
 template< typename MatrixType, typename MemberOp, int Direction>
-class PartialReduxExpr : internal::no_assignment_operator,
-  public internal::dense_xpr_base< PartialReduxExpr<MatrixType, MemberOp, Direction> >::type
+class PartialReduxExpr : public internal::dense_xpr_base< PartialReduxExpr<MatrixType, MemberOp, Direction> >::type,
+                         internal::no_assignment_operator
 {
   public:
 
     typedef typename internal::dense_xpr_base<PartialReduxExpr>::type Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(PartialReduxExpr)
-    typedef typename internal::traits<PartialReduxExpr>::MatrixTypeNested MatrixTypeNested;
-    typedef typename internal::traits<PartialReduxExpr>::_MatrixTypeNested _MatrixTypeNested;
 
-    PartialReduxExpr(const MatrixType& mat, const MemberOp& func = MemberOp())
+    EIGEN_DEVICE_FUNC
+    explicit PartialReduxExpr(const MatrixType& mat, const MemberOp& func = MemberOp())
       : m_matrix(mat), m_functor(func) {}
 
+    EIGEN_DEVICE_FUNC
     Index rows() const { return (Direction==Vertical   ? 1 : m_matrix.rows()); }
+    EIGEN_DEVICE_FUNC
     Index cols() const { return (Direction==Horizontal ? 1 : m_matrix.cols()); }
 
-    EIGEN_STRONG_INLINE const Scalar coeff(Index i, Index j) const
-    {
-      if (Direction==Vertical)
-        return m_functor(m_matrix.col(j));
-      else
-        return m_functor(m_matrix.row(i));
-    }
+    EIGEN_DEVICE_FUNC
+    typename MatrixType::Nested nestedExpression() const { return m_matrix; }
 
-    const Scalar coeff(Index index) const
-    {
-      if (Direction==Vertical)
-        return m_functor(m_matrix.col(index));
-      else
-        return m_functor(m_matrix.row(index));
-    }
+    EIGEN_DEVICE_FUNC
+    const MemberOp& functor() const { return m_functor; }
 
   protected:
-    MatrixTypeNested m_matrix;
+    typename MatrixType::Nested m_matrix;
     const MemberOp m_functor;
 };
 
@@ -109,7 +89,8 @@ class PartialReduxExpr : internal::no_assignment_operator,
     template<typename Scalar, int Size> struct Cost                     \
     { enum { value = COST }; };                                         \
     template<typename XprType>                                          \
-    EIGEN_STRONG_INLINE ResultType operator()(const XprType& mat) const \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                               \
+    ResultType operator()(const XprType& mat) const                     \
     { return mat.MEMBER(); } \
   }
 
@@ -129,17 +110,27 @@ EIGEN_MEMBER_FUNCTOR(any, (Size-1)*NumTraits<Scalar>::AddCost);
 EIGEN_MEMBER_FUNCTOR(count, (Size-1)*NumTraits<Scalar>::AddCost);
 EIGEN_MEMBER_FUNCTOR(prod, (Size-1)*NumTraits<Scalar>::MulCost);
 
+template <int p, typename ResultType>
+struct member_lpnorm {
+  typedef ResultType result_type;
+  template<typename Scalar, int Size> struct Cost
+  { enum { value = (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost }; };
+  EIGEN_DEVICE_FUNC member_lpnorm() {}
+  template<typename XprType>
+  EIGEN_DEVICE_FUNC inline ResultType operator()(const XprType& mat) const
+  { return mat.template lpNorm<p>(); }
+};
 
 template <typename BinaryOp, typename Scalar>
 struct member_redux {
   typedef typename result_of<
-                     BinaryOp(Scalar)
+                     BinaryOp(Scalar,Scalar)
                    >::type  result_type;
   template<typename _Scalar, int Size> struct Cost
   { enum { value = (Size-1) * functor_traits<BinaryOp>::Cost }; };
-  member_redux(const BinaryOp func) : m_functor(func) {}
+  EIGEN_DEVICE_FUNC explicit member_redux(const BinaryOp func) : m_functor(func) {}
   template<typename Derived>
-  inline result_type operator()(const DenseBase<Derived>& mat) const
+  EIGEN_DEVICE_FUNC inline result_type operator()(const DenseBase<Derived>& mat) const
   { return mat.redux(m_functor); }
   const BinaryOp m_functor;
 };
@@ -168,16 +159,15 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
 
     typedef typename ExpressionType::Scalar Scalar;
     typedef typename ExpressionType::RealScalar RealScalar;
-    typedef typename ExpressionType::Index Index;
-    typedef typename internal::conditional<internal::must_nest_by_value<ExpressionType>::ret,
-        ExpressionType, ExpressionType&>::type ExpressionTypeNested;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
+    typedef typename internal::ref_selector<ExpressionType>::non_const_type ExpressionTypeNested;
     typedef typename internal::remove_all<ExpressionTypeNested>::type ExpressionTypeNestedCleaned;
 
     template<template<typename _Scalar> class Functor,
-                      typename Scalar=typename internal::traits<ExpressionType>::Scalar> struct ReturnType
+                      typename Scalar_=Scalar> struct ReturnType
     {
       typedef PartialReduxExpr<ExpressionType,
-                               Functor<Scalar>,
+                               Functor<Scalar_>,
                                Direction
                               > Type;
     };
@@ -185,23 +175,24 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
     template<typename BinaryOp> struct ReduxReturnType
     {
       typedef PartialReduxExpr<ExpressionType,
-                               internal::member_redux<BinaryOp,typename internal::traits<ExpressionType>::Scalar>,
+                               internal::member_redux<BinaryOp,Scalar>,
                                Direction
                               > Type;
     };
 
     enum {
-      IsVertical   = (Direction==Vertical) ? 1 : 0,
-      IsHorizontal = (Direction==Horizontal) ? 1 : 0
+      isVertical   = (Direction==Vertical) ? 1 : 0,
+      isHorizontal = (Direction==Horizontal) ? 1 : 0
     };
 
   protected:
 
     /** \internal
       * \returns the i-th subvector according to the \c Direction */
-    typedef typename internal::conditional<Direction==Vertical,
+    typedef typename internal::conditional<isVertical,
                                typename ExpressionType::ColXpr,
                                typename ExpressionType::RowXpr>::type SubVector;
+    EIGEN_DEVICE_FUNC
     SubVector subVector(Index i)
     {
       return SubVector(m_matrix.derived(),i);
@@ -209,58 +200,62 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
 
     /** \internal
       * \returns the number of subvectors in the direction \c Direction */
+    EIGEN_DEVICE_FUNC
     Index subVectors() const
-    { return Direction==Vertical?m_matrix.cols():m_matrix.rows(); }
+    { return isVertical?m_matrix.cols():m_matrix.rows(); }
 
     template<typename OtherDerived> struct ExtendedType {
       typedef Replicate<OtherDerived,
-                        Direction==Vertical   ? 1 : ExpressionType::RowsAtCompileTime,
-                        Direction==Horizontal ? 1 : ExpressionType::ColsAtCompileTime> Type;
+                        isVertical   ? 1 : ExpressionType::RowsAtCompileTime,
+                        isHorizontal ? 1 : ExpressionType::ColsAtCompileTime> Type;
     };
 
     /** \internal
       * Replicates a vector to match the size of \c *this */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     typename ExtendedType<OtherDerived>::Type
     extendedTo(const DenseBase<OtherDerived>& other) const
     {
-      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(Direction==Vertical, OtherDerived::MaxColsAtCompileTime==1),
+      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isVertical, OtherDerived::MaxColsAtCompileTime==1),
                           YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED)
-      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(Direction==Horizontal, OtherDerived::MaxRowsAtCompileTime==1),
+      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isHorizontal, OtherDerived::MaxRowsAtCompileTime==1),
                           YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED)
       return typename ExtendedType<OtherDerived>::Type
                       (other.derived(),
-                       Direction==Vertical   ? 1 : m_matrix.rows(),
-                       Direction==Horizontal ? 1 : m_matrix.cols());
+                       isVertical   ? 1 : m_matrix.rows(),
+                       isHorizontal ? 1 : m_matrix.cols());
     }
-    
+
     template<typename OtherDerived> struct OppositeExtendedType {
       typedef Replicate<OtherDerived,
-                        Direction==Horizontal ? 1 : ExpressionType::RowsAtCompileTime,
-                        Direction==Vertical   ? 1 : ExpressionType::ColsAtCompileTime> Type;
+                        isHorizontal ? 1 : ExpressionType::RowsAtCompileTime,
+                        isVertical   ? 1 : ExpressionType::ColsAtCompileTime> Type;
     };
 
     /** \internal
       * Replicates a vector in the opposite direction to match the size of \c *this */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     typename OppositeExtendedType<OtherDerived>::Type
     extendedToOpposite(const DenseBase<OtherDerived>& other) const
     {
-      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(Direction==Horizontal, OtherDerived::MaxColsAtCompileTime==1),
+      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isHorizontal, OtherDerived::MaxColsAtCompileTime==1),
                           YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED)
-      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(Direction==Vertical, OtherDerived::MaxRowsAtCompileTime==1),
+      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(isVertical, OtherDerived::MaxRowsAtCompileTime==1),
                           YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED)
       return typename OppositeExtendedType<OtherDerived>::Type
                       (other.derived(),
-                       Direction==Horizontal  ? 1 : m_matrix.rows(),
-                       Direction==Vertical    ? 1 : m_matrix.cols());
+                       isHorizontal  ? 1 : m_matrix.rows(),
+                       isVertical    ? 1 : m_matrix.cols());
     }
 
   public:
-
-    inline VectorwiseOp(ExpressionType& matrix) : m_matrix(matrix) {}
+    EIGEN_DEVICE_FUNC
+    explicit inline VectorwiseOp(ExpressionType& matrix) : m_matrix(matrix) {}
 
     /** \internal */
+    EIGEN_DEVICE_FUNC
     inline const ExpressionType& _expression() const { return m_matrix; }
 
     /** \returns a row or column vector expression of \c *this reduxed by \a func
@@ -271,80 +266,125 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       * \sa class VectorwiseOp, DenseBase::colwise(), DenseBase::rowwise()
       */
     template<typename BinaryOp>
+    EIGEN_DEVICE_FUNC
     const typename ReduxReturnType<BinaryOp>::Type
     redux(const BinaryOp& func = BinaryOp()) const
-    { return typename ReduxReturnType<BinaryOp>::Type(_expression(), func); }
+    { return typename ReduxReturnType<BinaryOp>::Type(_expression(), internal::member_redux<BinaryOp,Scalar>(func)); }
+
+    typedef typename ReturnType<internal::member_minCoeff>::Type MinCoeffReturnType;
+    typedef typename ReturnType<internal::member_maxCoeff>::Type MaxCoeffReturnType;
+    typedef typename ReturnType<internal::member_squaredNorm,RealScalar>::Type SquaredNormReturnType;
+    typedef typename ReturnType<internal::member_norm,RealScalar>::Type NormReturnType;
+    typedef typename ReturnType<internal::member_blueNorm,RealScalar>::Type BlueNormReturnType;
+    typedef typename ReturnType<internal::member_stableNorm,RealScalar>::Type StableNormReturnType;
+    typedef typename ReturnType<internal::member_hypotNorm,RealScalar>::Type HypotNormReturnType;
+    typedef typename ReturnType<internal::member_sum>::Type SumReturnType;
+    typedef typename ReturnType<internal::member_mean>::Type MeanReturnType;
+    typedef typename ReturnType<internal::member_all>::Type AllReturnType;
+    typedef typename ReturnType<internal::member_any>::Type AnyReturnType;
+    typedef PartialReduxExpr<ExpressionType, internal::member_count<Index>, Direction> CountReturnType;
+    typedef typename ReturnType<internal::member_prod>::Type ProdReturnType;
+    typedef Reverse<ExpressionType, Direction> ReverseReturnType;
+
+    template<int p> struct LpNormReturnType {
+      typedef PartialReduxExpr<ExpressionType, internal::member_lpnorm<p,RealScalar>,Direction> Type;
+    };
 
     /** \returns a row (or column) vector expression of the smallest coefficient
       * of each column (or row) of the referenced expression.
-      * 
+      *
       * \warning the result is undefined if \c *this contains NaN.
       *
       * Example: \include PartialRedux_minCoeff.cpp
       * Output: \verbinclude PartialRedux_minCoeff.out
       *
       * \sa DenseBase::minCoeff() */
-    const typename ReturnType<internal::member_minCoeff>::Type minCoeff() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const MinCoeffReturnType minCoeff() const
+    { return MinCoeffReturnType(_expression()); }
 
     /** \returns a row (or column) vector expression of the largest coefficient
       * of each column (or row) of the referenced expression.
-      * 
+      *
       * \warning the result is undefined if \c *this contains NaN.
       *
       * Example: \include PartialRedux_maxCoeff.cpp
       * Output: \verbinclude PartialRedux_maxCoeff.out
       *
       * \sa DenseBase::maxCoeff() */
-    const typename ReturnType<internal::member_maxCoeff>::Type maxCoeff() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const MaxCoeffReturnType maxCoeff() const
+    { return MaxCoeffReturnType(_expression()); }
 
     /** \returns a row (or column) vector expression of the squared norm
       * of each column (or row) of the referenced expression.
+      * This is a vector with real entries, even if the original matrix has complex entries.
       *
       * Example: \include PartialRedux_squaredNorm.cpp
       * Output: \verbinclude PartialRedux_squaredNorm.out
       *
       * \sa DenseBase::squaredNorm() */
-    const typename ReturnType<internal::member_squaredNorm,RealScalar>::Type squaredNorm() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const SquaredNormReturnType squaredNorm() const
+    { return SquaredNormReturnType(_expression()); }
+
+    /** \returns a row (or column) vector expression of the norm
+      * of each column (or row) of the referenced expression.
+      * This is a vector with real entries, even if the original matrix has complex entries.
+      *
+      * Example: \include PartialRedux_norm.cpp
+      * Output: \verbinclude PartialRedux_norm.out
+      *
+      * \sa DenseBase::norm() */
+    EIGEN_DEVICE_FUNC
+    const NormReturnType norm() const
+    { return NormReturnType(_expression()); }
 
     /** \returns a row (or column) vector expression of the norm
       * of each column (or row) of the referenced expression.
+      * This is a vector with real entries, even if the original matrix has complex entries.
       *
       * Example: \include PartialRedux_norm.cpp
       * Output: \verbinclude PartialRedux_norm.out
       *
       * \sa DenseBase::norm() */
-    const typename ReturnType<internal::member_norm,RealScalar>::Type norm() const
-    { return _expression(); }
+    template<int p> // p is forwarded to internal::member_lpnorm<p,RealScalar>, which calls lpNorm<p>() on each subvector
+    EIGEN_DEVICE_FUNC
+    const typename LpNormReturnType<p>::Type lpNorm() const // NOTE(review): the Doxygen block above still refers to norm() / PartialRedux_norm
+    { return typename LpNormReturnType<p>::Type(_expression()); }
 
 
     /** \returns a row (or column) vector expression of the norm
       * of each column (or row) of the referenced expression, using
-      * blue's algorithm.
+      * Blue's algorithm.
+      * This is a vector with real entries, even if the original matrix has complex entries.
       *
       * \sa DenseBase::blueNorm() */
-    const typename ReturnType<internal::member_blueNorm,RealScalar>::Type blueNorm() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const BlueNormReturnType blueNorm() const
+    { return BlueNormReturnType(_expression()); }
 
 
     /** \returns a row (or column) vector expression of the norm
       * of each column (or row) of the referenced expression, avoiding
       * underflow and overflow.
+      * This is a vector with real entries, even if the original matrix has complex entries.
       *
       * \sa DenseBase::stableNorm() */
-    const typename ReturnType<internal::member_stableNorm,RealScalar>::Type stableNorm() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const StableNormReturnType stableNorm() const
+    { return StableNormReturnType(_expression()); }
 
 
     /** \returns a row (or column) vector expression of the norm
       * of each column (or row) of the referenced expression, avoiding
       * underflow and overflow using a concatenation of hypot() calls.
+      * This is a vector with real entries, even if the original matrix has complex entries.
       *
       * \sa DenseBase::hypotNorm() */
-    const typename ReturnType<internal::member_hypotNorm,RealScalar>::Type hypotNorm() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const HypotNormReturnType hypotNorm() const
+    { return HypotNormReturnType(_expression()); }
 
     /** \returns a row (or column) vector expression of the sum
       * of each column (or row) of the referenced expression.
@@ -353,39 +393,48 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       * Output: \verbinclude PartialRedux_sum.out
       *
       * \sa DenseBase::sum() */
-    const typename ReturnType<internal::member_sum>::Type sum() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const SumReturnType sum() const
+    { return SumReturnType(_expression()); }
 
     /** \returns a row (or column) vector expression of the mean
     * of each column (or row) of the referenced expression.
     *
     * \sa DenseBase::mean() */
-    const typename ReturnType<internal::member_mean>::Type mean() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const MeanReturnType mean() const
+    { return MeanReturnType(_expression()); }
 
     /** \returns a row (or column) vector expression representing
       * whether \b all coefficients of each respective column (or row) are \c true.
+      * This expression can be assigned to a vector with entries of type \c bool.
       *
       * \sa DenseBase::all() */
-    const typename ReturnType<internal::member_all>::Type all() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const AllReturnType all() const
+    { return AllReturnType(_expression()); }
 
     /** \returns a row (or column) vector expression representing
       * whether \b at \b least one coefficient of each respective column (or row) is \c true.
+      * This expression can be assigned to a vector with entries of type \c bool.
       *
       * \sa DenseBase::any() */
-    const typename ReturnType<internal::member_any>::Type any() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const AnyReturnType any() const
+    { return AnyReturnType(_expression()); }
 
     /** \returns a row (or column) vector expression representing
       * the number of \c true coefficients of each respective column (or row).
+      * This expression can be assigned to a vector whose entries have the same type as is used to
+      * index entries of the original matrix; for dense matrices, this is \c std::ptrdiff_t .
       *
       * Example: \include PartialRedux_count.cpp
       * Output: \verbinclude PartialRedux_count.out
       *
       * \sa DenseBase::count() */
-    const PartialReduxExpr<ExpressionType, internal::member_count<Index>, Direction> count() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const CountReturnType count() const
+    { return CountReturnType(_expression()); }
 
     /** \returns a row (or column) vector expression of the product
       * of each column (or row) of the referenced expression.
@@ -394,8 +443,9 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       * Output: \verbinclude PartialRedux_prod.out
       *
       * \sa DenseBase::prod() */
-    const typename ReturnType<internal::member_prod>::Type prod() const
-    { return _expression(); }
+    EIGEN_DEVICE_FUNC
+    const ProdReturnType prod() const
+    { return ProdReturnType(_expression()); }
 
 
     /** \returns a matrix expression
@@ -405,10 +455,12 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       * Output: \verbinclude Vectorwise_reverse.out
       *
       * \sa DenseBase::reverse() */
-    const Reverse<ExpressionType, Direction> reverse() const
-    { return Reverse<ExpressionType, Direction>( _expression() ); }
+    EIGEN_DEVICE_FUNC
+    const ReverseReturnType reverse() const
+    { return ReverseReturnType( _expression() ); }
 
-    typedef Replicate<ExpressionType,Direction==Vertical?Dynamic:1,Direction==Horizontal?Dynamic:1> ReplicateReturnType;
+    typedef Replicate<ExpressionType,(isVertical?Dynamic:1),(isHorizontal?Dynamic:1)> ReplicateReturnType;
+    EIGEN_DEVICE_FUNC
     const ReplicateReturnType replicate(Index factor) const;
 
     /**
@@ -420,17 +472,20 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       * \sa VectorwiseOp::replicate(Index), DenseBase::replicate(), class Replicate
       */
     // NOTE implemented here because of sunstudio's compilation errors
-    template<int Factor> const Replicate<ExpressionType,(IsVertical?Factor:1),(IsHorizontal?Factor:1)>
+    // isVertical*Factor+isHorizontal instead of (isVertical?Factor:1) to handle CUDA bug with ternary operator
+    template<int Factor> const Replicate<ExpressionType,isVertical*Factor+isHorizontal,isHorizontal*Factor+isVertical>
+    EIGEN_DEVICE_FUNC
     replicate(Index factor = Factor) const
     {
-      return Replicate<ExpressionType,Direction==Vertical?Factor:1,Direction==Horizontal?Factor:1>
-          (_expression(),Direction==Vertical?factor:1,Direction==Horizontal?factor:1);
+      return Replicate<ExpressionType,(isVertical?Factor:1),(isHorizontal?Factor:1)> // ternary spelling; equals the declared arithmetic form when exactly one of isVertical/isHorizontal is 1
+          (_expression(),isVertical?factor:1,isHorizontal?factor:1);
     }
 
 /////////// Artithmetic operators ///////////
 
     /** Copies the vector \a other to each subvector of \c *this */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     ExpressionType& operator=(const DenseBase<OtherDerived>& other)
     {
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
@@ -441,6 +496,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
 
     /** Adds the vector \a other to each subvector of \c *this */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     ExpressionType& operator+=(const DenseBase<OtherDerived>& other)
     {
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
@@ -450,6 +506,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
 
     /** Substracts the vector \a other to each subvector of \c *this */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     ExpressionType& operator-=(const DenseBase<OtherDerived>& other)
     {
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
@@ -459,6 +516,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
 
     /** Multiples each subvector of \c *this by the vector \a other */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     ExpressionType& operator*=(const DenseBase<OtherDerived>& other)
     {
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
@@ -470,6 +528,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
 
     /** Divides each subvector of \c *this by the vector \a other */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     ExpressionType& operator/=(const DenseBase<OtherDerived>& other)
     {
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
@@ -480,7 +539,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
     }
 
     /** Returns the expression of the sum of the vector \a other to each subvector of \c *this */
-    template<typename OtherDerived> EIGEN_STRONG_INLINE
+    template<typename OtherDerived> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
     CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
                   const ExpressionTypeNestedCleaned,
                   const typename ExtendedType<OtherDerived>::Type>
@@ -493,6 +552,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
 
     /** Returns the expression of the difference between each subvector of \c *this and the vector \a other */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     CwiseBinaryOp<internal::scalar_difference_op<Scalar>,
                   const ExpressionTypeNestedCleaned,
                   const typename ExtendedType<OtherDerived>::Type>
@@ -505,10 +565,11 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
 
     /** Returns the expression where each subvector is the product of the vector \a other
       * by the corresponding subvector of \c *this */
-    template<typename OtherDerived> EIGEN_STRONG_INLINE
+    template<typename OtherDerived> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
     CwiseBinaryOp<internal::scalar_product_op<Scalar>,
                   const ExpressionTypeNestedCleaned,
                   const typename ExtendedType<OtherDerived>::Type>
+    EIGEN_DEVICE_FUNC
     operator*(const DenseBase<OtherDerived>& other) const
     {
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
@@ -520,6 +581,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
     /** Returns the expression where each subvector is the quotient of the corresponding
       * subvector of \c *this by the vector \a other */
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     CwiseBinaryOp<internal::scalar_quotient_op<Scalar>,
                   const ExpressionTypeNestedCleaned,
                   const typename ExtendedType<OtherDerived>::Type>
@@ -530,32 +592,35 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
       return m_matrix / extendedTo(other.derived());
     }
-    
+
     /** \returns an expression where each column of row of the referenced matrix are normalized.
       * The referenced matrix is \b not modified.
       * \sa MatrixBase::normalized(), normalize()
       */
+    EIGEN_DEVICE_FUNC
     CwiseBinaryOp<internal::scalar_quotient_op<Scalar>,
                   const ExpressionTypeNestedCleaned,
                   const typename OppositeExtendedType<typename ReturnType<internal::member_norm,RealScalar>::Type>::Type>
     normalized() const { return m_matrix.cwiseQuotient(extendedToOpposite(this->norm())); }
-    
-    
+
+
     /** Normalize in-place each row or columns of the referenced matrix.
       * \sa MatrixBase::normalize(), normalized()
       */
-    void normalize() {
+    EIGEN_DEVICE_FUNC void normalize() {
       m_matrix = this->normalized();
     }
 
+    EIGEN_DEVICE_FUNC inline void reverseInPlace();
+
 /////////// Geometry module ///////////
 
-    #if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
-    Homogeneous<ExpressionType,Direction> homogeneous() const;
-    #endif
+    typedef Homogeneous<ExpressionType,Direction> HomogeneousReturnType;
+    HomogeneousReturnType homogeneous() const;
 
     typedef typename ExpressionType::PlainObject CrossReturnType;
     template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     const CrossReturnType cross(const MatrixBase<OtherDerived>& other) const;
 
     enum {
@@ -586,19 +651,8 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
     ExpressionTypeNested m_matrix;
 };
 
-/** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations
-  *
-  * Example: \include MatrixBase_colwise.cpp
-  * Output: \verbinclude MatrixBase_colwise.out
-  *
-  * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
-  */
-template<typename Derived>
-inline const typename DenseBase<Derived>::ConstColwiseReturnType
-DenseBase<Derived>::colwise() const
-{
-  return derived();
-}
+//const colwise moved to DenseBase.h due to CUDA compiler bug
+
 
 /** \returns a writable VectorwiseOp wrapper of *this providing additional partial reduction operations
   *
@@ -608,22 +662,11 @@ template<typename Derived>
 inline typename DenseBase<Derived>::ColwiseReturnType
 DenseBase<Derived>::colwise()
 {
-  return derived();
+  return ColwiseReturnType(derived());
 }
 
-/** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations
-  *
-  * Example: \include MatrixBase_rowwise.cpp
-  * Output: \verbinclude MatrixBase_rowwise.out
-  *
-  * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
-  */
-template<typename Derived>
-inline const typename DenseBase<Derived>::ConstRowwiseReturnType
-DenseBase<Derived>::rowwise() const
-{
-  return derived();
-}
+//const rowwise moved to DenseBase.h due to CUDA compiler bug
+
 
 /** \returns a writable VectorwiseOp wrapper of *this providing additional partial reduction operations
   *
@@ -633,7 +676,7 @@ template<typename Derived>
 inline typename DenseBase<Derived>::RowwiseReturnType
 DenseBase<Derived>::rowwise()
 {
-  return derived();
+  return RowwiseReturnType(derived());
 }
 
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Core/Visitor.h b/nuparu/include/Eigen/src/Core/Visitor.h
index 64867b7a..7aac0b6e 100644
--- a/nuparu/include/Eigen/src/Core/Visitor.h
+++ b/nuparu/include/Eigen/src/Core/Visitor.h
@@ -22,6 +22,7 @@ struct visitor_impl
     row = (UnrollCount-1) % Derived::RowsAtCompileTime
   };
 
+  EIGEN_DEVICE_FUNC
   static inline void run(const Derived &mat, Visitor& visitor)
   {
     visitor_impl<Visitor, Derived, UnrollCount-1>::run(mat, visitor);
@@ -32,6 +33,7 @@ struct visitor_impl
 template<typename Visitor, typename Derived>
 struct visitor_impl<Visitor, Derived, 1>
 {
+  EIGEN_DEVICE_FUNC
   static inline void run(const Derived &mat, Visitor& visitor)
   {
     return visitor.init(mat.coeff(0, 0), 0, 0);
@@ -41,7 +43,7 @@ struct visitor_impl<Visitor, Derived, 1>
 template<typename Visitor, typename Derived>
 struct visitor_impl<Visitor, Derived, Dynamic>
 {
-  typedef typename Derived::Index Index;
+  EIGEN_DEVICE_FUNC
   static inline void run(const Derived& mat, Visitor& visitor)
   {
     visitor.init(mat.coeff(0,0), 0, 0);
@@ -53,6 +55,33 @@ struct visitor_impl<Visitor, Derived, Dynamic>
   }
 };
 
+// Evaluator adaptor: wraps an expression's internal::evaluator so that visitor_impl can read coefficients through it.
+template<typename XprType>
+class visitor_evaluator
+{
+public:
+  EIGEN_DEVICE_FUNC
+  explicit visitor_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
+  // Scalar and coefficient-return types are taken from the wrapped expression type.
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  // Compile-time constants: row count comes from the expression, read cost from its evaluator.
+  enum {
+    RowsAtCompileTime = XprType::RowsAtCompileTime,
+    CoeffReadCost = internal::evaluator<XprType>::CoeffReadCost
+  };
+  // Dimension queries are answered by the referenced expression itself.
+  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
+  EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
+  EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }
+  // Coefficient reads go through the evaluator rather than the expression.
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
+  { return m_evaluator.coeff(row, col); }
+  // m_xpr is stored by reference, so this adaptor must not outlive the expression it was built from.
+protected:
+  internal::evaluator<XprType> m_evaluator;
+  const XprType &m_xpr;
+};
 } // end namespace internal
 
 /** Applies the visitor \a visitor to the whole coefficients of the matrix or vector.
@@ -74,16 +103,17 @@ struct visitor_impl<Visitor, Derived, Dynamic>
   */
 template<typename Derived>
 template<typename Visitor>
+EIGEN_DEVICE_FUNC
 void DenseBase<Derived>::visit(Visitor& visitor) const
 {
-  enum { unroll = SizeAtCompileTime != Dynamic
-                   && CoeffReadCost != Dynamic
-                   && (SizeAtCompileTime == 1 || internal::functor_traits<Visitor>::Cost != Dynamic)
-                   && SizeAtCompileTime * CoeffReadCost + (SizeAtCompileTime-1) * internal::functor_traits<Visitor>::Cost
-                      <= EIGEN_UNROLLING_LIMIT };
-  return internal::visitor_impl<Visitor, Derived,
-      unroll ? int(SizeAtCompileTime) : Dynamic
-    >::run(derived(), visitor);
+  typedef typename internal::visitor_evaluator<Derived> ThisEvaluator;
+  ThisEvaluator thisEval(derived()); // coefficients are read through the evaluator adaptor, not the expression directly
+  // Unroll only for fixed-size expressions whose estimated read + visitor cost stays within EIGEN_UNROLLING_LIMIT.
+  enum {
+    unroll =  SizeAtCompileTime != Dynamic
+           && SizeAtCompileTime * ThisEvaluator::CoeffReadCost + (SizeAtCompileTime-1) * internal::functor_traits<Visitor>::Cost <= EIGEN_UNROLLING_LIMIT
+  };
+  return internal::visitor_impl<Visitor, ThisEvaluator, unroll ? int(SizeAtCompileTime) : Dynamic>::run(thisEval, visitor); // Dynamic selects the visitor_impl<..., Dynamic> specialization
 }
 
 namespace internal {
@@ -94,10 +124,10 @@ namespace internal {
 template <typename Derived>
 struct coeff_visitor
 {
-  typedef typename Derived::Index Index;
   typedef typename Derived::Scalar Scalar;
   Index row, col;
   Scalar res;
+  EIGEN_DEVICE_FUNC
   inline void init(const Scalar& value, Index i, Index j)
   {
     res = value;
@@ -114,8 +144,8 @@ struct coeff_visitor
 template <typename Derived>
 struct min_coeff_visitor : coeff_visitor<Derived>
 {
-  typedef typename Derived::Index Index;
   typedef typename Derived::Scalar Scalar;
+  EIGEN_DEVICE_FUNC
   void operator() (const Scalar& value, Index i, Index j)
   {
     if(value < this->res)
@@ -142,8 +172,8 @@ struct functor_traits<min_coeff_visitor<Scalar> > {
 template <typename Derived>
 struct max_coeff_visitor : coeff_visitor<Derived>
 {
-  typedef typename Derived::Index Index;
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Derived::Scalar Scalar; 
+  EIGEN_DEVICE_FUNC
   void operator() (const Scalar& value, Index i, Index j)
   {
     if(value > this->res)
@@ -171,6 +201,7 @@ struct functor_traits<max_coeff_visitor<Scalar> > {
   */
 template<typename Derived>
 template<typename IndexType>
+EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
 {
@@ -188,13 +219,14 @@ DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
   */
 template<typename Derived>
 template<typename IndexType>
+EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff(IndexType* index) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   internal::min_coeff_visitor<Derived> minVisitor;
   this->visit(minVisitor);
-  *index = (RowsAtCompileTime==1) ? minVisitor.col : minVisitor.row;
+  *index = IndexType((RowsAtCompileTime==1) ? minVisitor.col : minVisitor.row);
   return minVisitor.res;
 }
 
@@ -205,6 +237,7 @@ DenseBase<Derived>::minCoeff(IndexType* index) const
   */
 template<typename Derived>
 template<typename IndexType>
+EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const
 {
@@ -222,6 +255,7 @@ DenseBase<Derived>::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const
   */
 template<typename Derived>
 template<typename IndexType>
+EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff(IndexType* index) const
 {
diff --git a/nuparu/include/Eigen/src/Core/arch/AVX/CMakeLists.txt b/nuparu/include/Eigen/src/Core/arch/AVX/CMakeLists.txt
new file mode 100644
index 00000000..bdb71ab9
--- /dev/null
+++ b/nuparu/include/Eigen/src/Core/arch/AVX/CMakeLists.txt
@@ -0,0 +1,6 @@
+file(GLOB Eigen_Core_arch_AVX_SRCS "*.h")
+# Install every header collected above into the AVX include directory, as part of the Devel component.
+install(
+  FILES ${Eigen_Core_arch_AVX_SRCS}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/AVX
+  COMPONENT Devel)
diff --git a/nuparu/include/Eigen/src/Core/arch/AVX/Complex.h b/nuparu/include/Eigen/src/Core/arch/AVX/Complex.h
new file mode 100644
index 00000000..b16e0ddd
--- /dev/null
+++ b/nuparu/include/Eigen/src/Core/arch/AVX/Complex.h
@@ -0,0 +1,463 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_AVX_H
+#define EIGEN_COMPLEX_AVX_H
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- float ----------
+struct Packet4cf  // packet type wrapping one 256-bit float register
+{
+  EIGEN_STRONG_INLINE Packet4cf() {}  // v is left uninitialized
+  EIGEN_STRONG_INLINE explicit Packet4cf(const __m256& a) : v(a) {}  // wrap an existing register
+  __m256  v;  // raw register contents
+};
+
+template<> struct packet_traits<std::complex<float> >  : default_packet_traits
+{
+  typedef Packet4cf type;  // full-width packet for std::complex<float>
+  typedef Packet2cf half;  // half-width packet
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,  // std::complex<float> values per Packet4cf
+    HasHalfPacket = 1,
+    // Flags for individual packet operations.
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4, alignment=Aligned32}; typedef Packet2cf half; }; // reverse mapping: packet type -> scalar type
+
+template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); } // lane-wise float addition
+template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); } // lane-wise float subtraction
+template<> EIGEN_STRONG_INLINE Packet4cf pnegate(const Packet4cf& a)
+{ // Negation is delegated to the pnegate overload taking the raw __m256 register.
+  return Packet4cf(pnegate(a.v));
+}
+template<> EIGEN_STRONG_INLINE Packet4cf pconj(const Packet4cf& a)
+{ // Conjugate: XOR the sign bit of every odd 32-bit lane, i.e. of each imaginary part.
+  const __m256 imag_sign_bits = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000));
+  return Packet4cf(_mm256_xor_ps(a.v,imag_sign_bits));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
+{ // (ar + i*ai)*(br + i*bi) = (ar*br - ai*bi) + i*(ar*bi + ai*br)
+  const __m256 b_swapped  = _mm256_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1));       // (bi, br) per complex
+  const __m256 real_terms = _mm256_mul_ps(_mm256_moveldup_ps(a.v), b.v);       // (ar*br, ar*bi)
+  const __m256 imag_terms = _mm256_mul_ps(_mm256_movehdup_ps(a.v), b_swapped); // (ai*bi, ai*br)
+  return Packet4cf(_mm256_addsub_ps(real_terms, imag_terms));                  // subtract in even lanes, add in odd lanes
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf pand   <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); } // bitwise a & b
+template<> EIGEN_STRONG_INLINE Packet4cf por    <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); } // bitwise a | b
+template<> EIGEN_STRONG_INLINE Packet4cf pxor   <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); } // bitwise a ^ b
+template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(a.v,b.v)); } // _mm256_andnot_ps yields (~a.v) & b.v
+
+template<> EIGEN_STRONG_INLINE Packet4cf pload <Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload<Packet8f>(&numext::real_ref(*from))); } // aligned load through the Packet8f loader, starting at the first real part
+template<> EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu<Packet8f>(&numext::real_ref(*from))); } // same, using the unaligned Packet8f loader
+
+
+template<> EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from)
+{ // Read the 64-bit (real, imag) pair through a double pointer and broadcast it to all four slots.
+  return Packet4cf(_mm256_castpd_ps(_mm256_broadcast_sd((const double*)(const void*)&from)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from)
+{
+  // FIXME The following might be optimized using _mm256_movedup_pd
+  const Packet2cf lower = ploaddup<Packet2cf>(from);    // becomes the lower 128-bit half
+  const Packet2cf upper = ploaddup<Packet2cf>(from+1);  // becomes the upper 128-bit half
+  return  Packet4cf(_mm256_insertf128_ps(_mm256_castps128_ps256(lower.v), upper.v, 1));
+}
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); } // aligned store via the float overload, starting at the first real part
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); } // same, using the unaligned float store
+
+template<> EIGEN_DEVICE_FUNC inline Packet4cf pgather<std::complex<float>, Packet4cf>(const std::complex<float>* from, Index stride)
+{ // _mm256_set_ps lists lanes from highest to lowest, so from[0] lands in the two lowest lanes.
+  return Packet4cf(_mm256_set_ps(std::imag(from[3*stride]), std::real(from[3*stride]),
+                                 std::imag(from[2*stride]), std::real(from[2*stride]),
+                                 std::imag(from[1*stride]), std::real(from[1*stride]),
+                                 std::imag(from[0*stride]), std::real(from[0*stride])));
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from, Index stride)
+{
+  const __m128 lower = _mm256_extractf128_ps(from.v, 0);  // complex values 0 and 1
+  to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(lower, lower, 0)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(lower, lower, 1)));
+  to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(lower, lower, 2)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(lower, lower, 3)));
+
+  const __m128 upper = _mm256_extractf128_ps(from.v, 1);  // complex values 2 and 3
+  to[stride*2] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(upper, upper, 0)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(upper, upper, 1)));
+  to[stride*3] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(upper, upper, 2)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(upper, upper, 3)));
+
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet4cf>(const Packet4cf& a)
+{ // Keep only the lower 128 bits and reuse the Packet2cf implementation.
+  return pfirst(Packet2cf(_mm256_castps256_ps128(a.v)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) {
+  // Swap the two complex values inside each 128-bit half (each complex viewed as one double) ...
+  const __m128d lower = _mm_castps_pd(_mm256_extractf128_ps(a.v, 0));
+  const __m128d upper = _mm_castps_pd(_mm256_extractf128_ps(a.v, 1));
+  const __m128 lower_swapped = _mm_castpd_ps(_mm_shuffle_pd(lower,lower,0x1));
+  const __m128 upper_swapped = _mm_castpd_ps(_mm_shuffle_pd(upper,upper,0x1));
+  // ... then exchange the halves: the old lower half becomes the new upper half.
+  __m256 reversed = _mm256_setzero_ps();
+  reversed = _mm256_insertf128_ps(reversed, lower_swapped, 1);
+  reversed = _mm256_insertf128_ps(reversed, upper_swapped, 0);
+  return Packet4cf(reversed);
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet4cf>(const Packet4cf& a)
+{ // Add the two 128-bit halves as Packet2cf, then finish with the Packet2cf reduction.
+  return predux(padd(Packet2cf(_mm256_extractf128_ps(a.v,0)),
+                     Packet2cf(_mm256_extractf128_ps(a.v,1))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf preduxp<Packet4cf>(const Packet4cf* vecs)
+{ // Reads vecs[0..3]; slot k of the result is the sum of the four complex values of vecs[k].
+  Packet8f t0 = _mm256_shuffle_ps(vecs[0].v, vecs[0].v, _MM_SHUFFLE(3, 1, 2 ,0)); // per 128-bit half: (re, re, im, im)
+  Packet8f t1 = _mm256_shuffle_ps(vecs[1].v, vecs[1].v, _MM_SHUFFLE(3, 1, 2 ,0));
+  t0 = _mm256_hadd_ps(t0,t1); // per half: partial sum of vecs[0], then partial sum of vecs[1]
+  Packet8f t2 = _mm256_shuffle_ps(vecs[2].v, vecs[2].v, _MM_SHUFFLE(3, 1, 2 ,0));
+  Packet8f t3 = _mm256_shuffle_ps(vecs[3].v, vecs[3].v, _MM_SHUFFLE(3, 1, 2 ,0));
+  t2 = _mm256_hadd_ps(t2,t3); // per half: partial sum of vecs[2], then partial sum of vecs[3]
+  // Line up the lower halves and the upper halves of both partial results.
+  t1 = _mm256_permute2f128_ps(t0,t2, 0 + (2<<4)); // lower half of t0, lower half of t2
+  t3 = _mm256_permute2f128_ps(t0,t2, 1 + (3<<4)); // upper half of t0, upper half of t2
+
+  return Packet4cf(_mm256_add_ps(t1,t3));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(const Packet4cf& a)
+{ // Multiply the two 128-bit halves as Packet2cf, then finish with the Packet2cf product reduction.
+  return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)),
+                         Packet2cf(_mm256_extractf128_ps(a.v, 1))));
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet4cf>  // forwards to the Packet8f implementation
+{
+  static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second)
+  {
+    if (Offset==0) return;  // nothing to do for a zero offset
+    palign_impl<Offset*2,Packet8f>::run(first.v, second.v);  // offset doubled: one complex<float> spans two float lanes
+  }
+};
+
+template<> struct conj_helper<Packet4cf, Packet4cf, false,true>  // products with the second operand conjugated
+{
+  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
+  { return padd(pmul(x,y),c); }  // pmul is the member below: x*conj(y) + c
+
+  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
+  {
+    return internal::pmul(a, pconj(b));  // a * conj(b)
+  }
+};
+
+template<> struct conj_helper<Packet4cf, Packet4cf, true,false>  // products with the first operand conjugated
+{
+  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
+  { return padd(pmul(x,y),c); }  // pmul is the member below: conj(x)*y + c
+
+  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
+  {
+    return internal::pmul(pconj(a), b);  // conj(a) * b
+  }
+};
+
+template<> struct conj_helper<Packet4cf, Packet4cf, true,true>  // conjugate of the plain product
+{
+  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
+  { return padd(pmul(x,y),c); }  // pmul is the member below: conj(x*y) + c
+
+  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
+  {
+    return pconj(internal::pmul(a, b));  // conj(a*b), a single conjugation after the product
+  }
+};
+
+template<> struct conj_helper<Packet8f, Packet4cf, false,false>  // float packet times complex packet, no conjugation
+{
+  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const
+  { return padd(c, pmul(x,y)); }  // c + member pmul(x,y)
+
+  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const
+  { return Packet4cf(Eigen::internal::pmul(x, y.v)); }  // lane-wise float product with y's raw register
+};
+
+template<> struct conj_helper<Packet4cf, Packet8f, false,false>  // complex packet times float packet, no conjugation
+{
+  EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const
+  { return padd(c, pmul(x,y)); }  // c + member pmul(x,y)
+
+  EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const
+  { return Packet4cf(Eigen::internal::pmul(x.v, y)); }  // lane-wise float product with x's raw register
+};
+
+template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
+{ // a / b = a * conj(b) / |b|^2
+  const Packet4cf numerator = pmul(a, pconj(b));
+  const __m256 b_squared  = _mm256_mul_ps(b.v, b.v);                      // (br^2, bi^2) per complex
+  const __m256 b_sq_flip  = _mm256_shuffle_ps(b_squared,b_squared,0xB1);  // (bi^2, br^2) per complex
+  const __m256 norm2      = _mm256_add_ps(b_squared, b_sq_flip);          // br^2 + bi^2 in both lanes
+  return Packet4cf(_mm256_div_ps(numerator.v, norm2));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(const Packet4cf& x)
+{ // Swap each pair of adjacent float lanes, i.e. exchange real and imaginary parts.
+  return Packet4cf(_mm256_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1)));
+}
+
+//---------- double ----------
+struct Packet2cd  // packet type wrapping one 256-bit double register
+{
+  EIGEN_STRONG_INLINE Packet2cd() {}  // v is left uninitialized
+  EIGEN_STRONG_INLINE explicit Packet2cd(const __m256d& a) : v(a) {}  // wrap an existing register
+  __m256d  v;  // raw register contents
+};
+
+template<> struct packet_traits<std::complex<double> >  : default_packet_traits
+{
+  typedef Packet2cd type;  // full-width packet for std::complex<double>
+  typedef Packet1cd half;  // half-width packet
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 0,  // note: differs from the std::complex<float> traits, which set 1
+    size = 2,  // std::complex<double> values per Packet2cd
+    HasHalfPacket = 1,
+    // Flags for individual packet operations.
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2, alignment=Aligned32}; typedef Packet1cd half; }; // reverse mapping: packet type -> scalar type
+
+template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); } // lane-wise double addition
+template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); } // lane-wise double subtraction
+template<> EIGEN_STRONG_INLINE Packet2cd pnegate(const Packet2cd& a) { return Packet2cd(pnegate(a.v)); } // delegated to the pnegate overload for the raw __m256d register
+template<> EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a)
+{ // _mm256_set_epi32 lists words from highest to lowest: only the top word of each imaginary double carries the sign bit.
+  const __m256d imag_sign_bits = _mm256_castsi256_pd(_mm256_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0));
+  return Packet2cd(_mm256_xor_pd(a.v,imag_sign_bits));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
+{ // (ar + i*ai)*(br + i*bi) = (ar*br - ai*bi) + i*(ar*bi + ai*br)
+  const __m256d a_real    = _mm256_shuffle_pd(a.v,a.v,0x0);    // (ar, ar) per complex
+  const __m256d a_imag    = _mm256_shuffle_pd(a.v,a.v,0xF);    // (ai, ai) per complex
+  const __m256d b_flipped = _mm256_shuffle_pd(b.v,b.v,0x5);    // (bi, br) per complex
+  const __m256d real_part = _mm256_mul_pd(a_real, b.v);        // (ar*br, ar*bi)
+  const __m256d imag_part = _mm256_mul_pd(a_imag, b_flipped);  // (ai*bi, ai*br)
+  return Packet2cd(_mm256_addsub_pd(real_part, imag_part));    // subtract in even lanes, add in odd lanes
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd pand   <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); } // bitwise a & b
+template<> EIGEN_STRONG_INLINE Packet2cd por    <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); } // bitwise a | b
+template<> EIGEN_STRONG_INLINE Packet2cd pxor   <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); } // bitwise a ^ b
+template<> EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(a.v,b.v)); } // _mm256_andnot_pd yields (~a.v) & b.v
+
+template<> EIGEN_STRONG_INLINE Packet2cd pload <Packet2cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload<Packet4d>((const double*)from)); } // aligned load: the complex array is read as plain doubles
+template<> EIGEN_STRONG_INLINE Packet2cd ploadu<Packet2cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cd(ploadu<Packet4d>((const double*)from)); } // same, using the unaligned Packet4d loader
+
+template<> EIGEN_STRONG_INLINE Packet2cd pset1<Packet2cd>(const std::complex<double>& from)
+{
+  // in case casting to a __m128d* is really not safe, then we can still fall back to this version: (much slower though)
+//   return Packet2cd(_mm256_loadu2_m128d((const double*)&from,(const double*)&from));
+    return Packet2cd(_mm256_broadcast_pd((const __m128d*)(const void*)&from)); // copy the 128-bit (real, imag) pair into both halves
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd ploaddup<Packet2cd>(const std::complex<double>* from) { return pset1<Packet2cd>(*from); } // identical to broadcasting *from
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet2cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } // aligned store via the double overload
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet2cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } // same, using the unaligned double store
+
+template<> EIGEN_DEVICE_FUNC inline Packet2cd pgather<std::complex<double>, Packet2cd>(const std::complex<double>* from, Index stride)
+{ // _mm256_set_pd lists lanes from highest to lowest, so from[0] lands in the lower 128 bits.
+  const std::complex<double>& z0 = from[0*stride]; const std::complex<double>& z1 = from[1*stride];
+  return Packet2cd(_mm256_set_pd(std::imag(z1), std::real(z1), std::imag(z0), std::real(z0)));
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet2cd>(std::complex<double>* to, const Packet2cd& from, Index stride)
+{
+  const __m128d first = _mm256_extractf128_pd(from.v, 0);   // lower half: complex value 0
+  to[stride*0] = std::complex<double>(_mm_cvtsd_f64(first), _mm_cvtsd_f64(_mm_shuffle_pd(first, first, 1)));
+  const __m128d second = _mm256_extractf128_pd(from.v, 1);  // upper half: complex value 1
+  to[stride*1] = std::complex<double>(_mm_cvtsd_f64(second), _mm_cvtsd_f64(_mm_shuffle_pd(second, second, 1)));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet2cd>(const Packet2cd& a)
+{
+  EIGEN_ALIGN16 double parts[2];  // _mm_store_pd requires a 16-byte aligned destination
+  const __m128d lower = _mm256_extractf128_pd(a.v, 0);
+  _mm_store_pd(parts, lower);
+  return std::complex<double>(parts[0],parts[1]);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) {
+  // Exchanging the two 128-bit halves reverses the order of the two complex values.
+  return Packet2cd(_mm256_permute2f128_pd(a.v, a.v, 1));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet2cd>(const Packet2cd& a)
+{ // Add the two 128-bit halves as Packet1cd, then finish with the Packet1cd reduction.
+  return predux(padd(Packet1cd(_mm256_extractf128_pd(a.v,0)),
+                     Packet1cd(_mm256_extractf128_pd(a.v,1))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd preduxp<Packet2cd>(const Packet2cd* vecs)
+{
+  // Pair the first complex of vecs[0] and vecs[1], then the second of each, and add the two.
+  const Packet4d firsts  = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 0 + (2<<4));
+  const Packet4d seconds = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 1 + (3<<4));
+  return Packet2cd(_mm256_add_pd(firsts,seconds));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(const Packet2cd& a)
+{ // Multiply the two 128-bit halves as Packet1cd; the single remaining value is extracted with predux.
+  return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v,0)),
+                     Packet1cd(_mm256_extractf128_pd(a.v,1))));
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet2cd>  // forwards to the Packet4d implementation
+{
+  static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second)
+  {
+    if (Offset==0) return;  // nothing to do for a zero offset
+    palign_impl<Offset*2,Packet4d>::run(first.v, second.v);  // offset doubled: one complex<double> spans two double lanes
+  }
+};
+
+template<> struct conj_helper<Packet2cd, Packet2cd, false,true>  // products with the second operand conjugated
+{
+  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
+  { return padd(pmul(x,y),c); }  // pmul is the member below: x*conj(y) + c
+
+  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
+  {
+    return internal::pmul(a, pconj(b));  // a * conj(b)
+  }
+};
+
+template<> struct conj_helper<Packet2cd, Packet2cd, true,false>  // products with the first operand conjugated
+{
+  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
+  { return padd(pmul(x,y),c); }  // pmul is the member below: conj(x)*y + c
+
+  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
+  {
+    return internal::pmul(pconj(a), b);  // conj(a) * b
+  }
+};
+
+template<> struct conj_helper<Packet2cd, Packet2cd, true,true>  // conjugate of the plain product
+{
+  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
+  { return padd(pmul(x,y),c); }  // pmul is the member below: conj(x*y) + c
+
+  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
+  {
+    return pconj(internal::pmul(a, b));  // conj(a*b), a single conjugation after the product
+  }
+};
+
+template<> struct conj_helper<Packet4d, Packet2cd, false,false>  // double packet times complex packet, no conjugation
+{
+  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const
+  { return padd(c, pmul(x,y)); }  // c + member pmul(x,y)
+
+  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const
+  { return Packet2cd(Eigen::internal::pmul(x, y.v)); }  // lane-wise double product with y's raw register
+};
+
+template<> struct conj_helper<Packet2cd, Packet4d, false,false>  // complex packet times double packet, no conjugation
+{
+  EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const
+  { return padd(c, pmul(x,y)); }  // c + member pmul(x,y)
+
+  EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const
+  { return Packet2cd(Eigen::internal::pmul(x.v, y)); }  // lane-wise double product with x's raw register
+};
+
+template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
+{ // a / b = a * conj(b) / |b|^2
+  const Packet2cd numerator = pmul(a, pconj(b));
+  const __m256d b_squared = _mm256_mul_pd(b.v, b.v);               // (br^2, bi^2) per complex
+  const __m256d norm2     = _mm256_hadd_pd(b_squared, b_squared);  // br^2 + bi^2 in both lanes of each complex
+  return Packet2cd(_mm256_div_pd(numerator.v, norm2));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& x)
+{ // Swap the two doubles inside each 128-bit half, i.e. exchange real and imaginary parts.
+  return Packet2cd(_mm256_shuffle_pd(x.v, x.v, 0x5));
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4cf,4>& kernel) { // in-place 4x4 transpose of complex<float> values
+  __m256d P0 = _mm256_castps_pd(kernel.packet[0].v); // viewed as doubles: one 64-bit lane = one complex<float>
+  __m256d P1 = _mm256_castps_pd(kernel.packet[1].v);
+  __m256d P2 = _mm256_castps_pd(kernel.packet[2].v);
+  __m256d P3 = _mm256_castps_pd(kernel.packet[3].v);
+
+  __m256d T0 = _mm256_shuffle_pd(P0, P1, 15); // (P0[1], P1[1], P0[3], P1[3])
+  __m256d T1 = _mm256_shuffle_pd(P0, P1, 0);  // (P0[0], P1[0], P0[2], P1[2])
+  __m256d T2 = _mm256_shuffle_pd(P2, P3, 15); // (P2[1], P3[1], P2[3], P3[3])
+  __m256d T3 = _mm256_shuffle_pd(P2, P3, 0);  // (P2[0], P3[0], P2[2], P3[2])
+
+  kernel.packet[1].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 32)); // lower halves: column 1 of the input
+  kernel.packet[3].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 49)); // upper halves: column 3
+  kernel.packet[0].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 32)); // lower halves: column 0
+  kernel.packet[2].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 49)); // upper halves: column 2
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2cd,2>& kernel) {
+  const __m256d row0 = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 0+(2<<4));  // lower halves of both packets
+  kernel.packet[1].v = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 1+(3<<4));  // upper halves of both packets
+  kernel.packet[0].v = row0;  // written last: packet[0] is still an input of the line above
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX_AVX_H
diff --git a/nuparu/include/Eigen/src/Core/arch/AVX/MathFunctions.h b/nuparu/include/Eigen/src/Core/arch/AVX/MathFunctions.h
new file mode 100644
index 00000000..c4bd6bd5
--- /dev/null
+++ b/nuparu/include/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -0,0 +1,441 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATH_FUNCTIONS_AVX_H
+#define EIGEN_MATH_FUNCTIONS_AVX_H
+
+// For some reason, this function didn't make it into the avxintrin.h
+// used by the compiler, so we'll just wrap it. This wrapper combines two __m128i halves.
+#define _mm256_setr_m128(lo, hi) \
+  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)
+
+/* The sin, cos, exp, and log functions of this file are loosely derived from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
+namespace Eigen {
+
+namespace internal {
+
+// Sine function
+// Computes sin(x) by wrapping x to the interval [-Pi/4,3*Pi/4] and
+// evaluating interpolants in [-Pi/4,Pi/4] or [Pi/4,3*Pi/4]. The interpolants
+// are (anti-)symmetric and thus have only odd/even coefficients
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
+psin<Packet8f>(const Packet8f& _x) {
+  Packet8f x = _x;
+
+  // Some useful values.
+  _EIGEN_DECLARE_CONST_Packet8i(one, 1);
+  _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f);
+  _EIGEN_DECLARE_CONST_Packet8f(two, 2.0f);
+  _EIGEN_DECLARE_CONST_Packet8f(one_over_four, 0.25f);
+  _EIGEN_DECLARE_CONST_Packet8f(one_over_pi, 3.183098861837907e-01f);
+  _EIGEN_DECLARE_CONST_Packet8f(neg_pi_first, -3.140625000000000e+00f);   // the three neg_pi_* constants sum to -Pi
+  _EIGEN_DECLARE_CONST_Packet8f(neg_pi_second, -9.670257568359375e-04f);
+  _EIGEN_DECLARE_CONST_Packet8f(neg_pi_third, -6.278329571784980e-07f);
+  _EIGEN_DECLARE_CONST_Packet8f(four_over_pi, 1.273239544735163e+00f);
+
+  // Map x from [-Pi/4,3*Pi/4] to z in [-1,3] and subtract the shifted period.
+  Packet8f z = pmul(x, p8f_one_over_pi);
+  Packet8f shift = _mm256_floor_ps(padd(z, p8f_one_over_four));  // shift = floor(x/Pi + 1/4)
+  x = pmadd(shift, p8f_neg_pi_first, x);   // x -= shift*Pi, applied in three parts
+  x = pmadd(shift, p8f_neg_pi_second, x);
+  x = pmadd(shift, p8f_neg_pi_third, x);
+  z = pmul(x, p8f_four_over_pi);           // z = 4*x/Pi for the reduced x
+
+  // Make a mask for the entries that need flipping, i.e. wherever the shift
+  // is odd.
+  Packet8i shift_ints = _mm256_cvtps_epi32(shift);
+  Packet8i shift_isodd =
+      _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(shift_ints), _mm256_castsi256_ps(p8i_one)));
+#ifdef EIGEN_VECTORIZE_AVX2
+  Packet8i sign_flip_mask = _mm256_slli_epi32(shift_isodd, 31);  // move the parity bit into the sign-bit position
+#else
+  __m128i lo =
+      _mm_slli_epi32(_mm256_extractf128_si256(shift_isodd, 0), 31);  // without AVX2: shift each 128-bit half separately
+  __m128i hi =
+      _mm_slli_epi32(_mm256_extractf128_si256(shift_isodd, 1), 31);
+  Packet8i sign_flip_mask = _mm256_setr_m128(lo, hi);
+#endif
+
+  // Create a mask for which interpolant to use, i.e. if z > 1, then the mask
+  // is set to ones for that entry.
+  Packet8f ival_mask = _mm256_cmp_ps(z, p8f_one, _CMP_GT_OQ);
+
+  // Evaluate the polynomial for the interval [1,3] in z.
+  _EIGEN_DECLARE_CONST_Packet8f(coeff_right_0, 9.999999724233232e-01f);
+  _EIGEN_DECLARE_CONST_Packet8f(coeff_right_2, -3.084242535619928e-01f);
+  _EIGEN_DECLARE_CONST_Packet8f(coeff_right_4, 1.584991525700324e-02f);
+  _EIGEN_DECLARE_CONST_Packet8f(coeff_right_6, -3.188805084631342e-04f);
+  Packet8f z_minus_two = psub(z, p8f_two);  // recentre the interval [1,3] on 0
+  Packet8f z_minus_two2 = pmul(z_minus_two, z_minus_two);
+  Packet8f right = pmadd(p8f_coeff_right_6, z_minus_two2, p8f_coeff_right_4);  // Horner scheme in (z-2)^2: even powers only
+  right = pmadd(right, z_minus_two2, p8f_coeff_right_2);
+  right = pmadd(right, z_minus_two2, p8f_coeff_right_0);
+
+  // Evaluate the polynomial for the interval [-1,1] in z.
+  _EIGEN_DECLARE_CONST_Packet8f(coeff_left_1, 7.853981525427295e-01f);
+  _EIGEN_DECLARE_CONST_Packet8f(coeff_left_3, -8.074536727092352e-02f);
+  _EIGEN_DECLARE_CONST_Packet8f(coeff_left_5, 2.489871967827018e-03f);
+  _EIGEN_DECLARE_CONST_Packet8f(coeff_left_7, -3.587725841214251e-05f);
+  Packet8f z2 = pmul(z, z);
+  Packet8f left = pmadd(p8f_coeff_left_7, z2, p8f_coeff_left_5);  // Horner scheme in z^2 ...
+  left = pmadd(left, z2, p8f_coeff_left_3);
+  left = pmadd(left, z2, p8f_coeff_left_1);
+  left = pmul(left, z);                                           // ... times z: odd powers only
+
+  // Assemble the results, i.e. select the left and right polynomials.
+  left = _mm256_andnot_ps(ival_mask, left);   // keep left where the mask is clear
+  right = _mm256_and_ps(ival_mask, right);    // keep right where the mask is set
+  Packet8f res = _mm256_or_ps(left, right);
+
+  // Flip the sign on the odd intervals and return the result.
+  res = _mm256_xor_ps(res, _mm256_castsi256_ps(sign_flip_mask));
+  return res;
+}
+
+// Natural logarithm
+// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
+// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
+// be easily approximated by a polynomial centered on m=1 for stability.
+// TODO(gonnet): Further reduce the interval allowing for lower-degree
+//               polynomial interpolants -> ... -> profit!
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
+plog<Packet8f>(const Packet8f& _x) {
+  Packet8f x = _x;
+  _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f);
+  _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f);
+  _EIGEN_DECLARE_CONST_Packet8f(126f, 126.0f);
+
+  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inv_mant_mask, ~0x7f800000);  // every bit except the 8 exponent bits
+
+  // The smallest non denormalized float number.
+  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(min_norm_pos, 0x00800000);
+  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(minus_inf, 0xff800000);
+
+  // Polynomial coefficients.
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_SQRTHF, 0.707106781186547524f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p0, 7.0376836292E-2f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p1, -1.1514610310E-1f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p2, 1.1676998740E-1f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p3, -1.2420140846E-1f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p4, +1.4249322787E-1f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p5, -1.6668057665E-1f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p6, +2.0000714765E-1f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p7, -2.4999993993E-1f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p8, +3.3333331174E-1f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q1, -2.12194440e-4f);  // q1 + q2 together approximate log(2)
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q2, 0.693359375f);
+
+  Packet8f invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_NGE_UQ); // not greater equal is true if x is NaN
+  Packet8f iszero_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ);
+
+  // Truncate input values to the minimum positive normal.
+  x = pmax(x, p8f_min_norm_pos);
+
+// Extract the shifted exponents (No bitwise shifting in regular AVX, so
+// convert to SSE and do it there).
+#ifdef EIGEN_VECTORIZE_AVX2
+  Packet8f emm0 = _mm256_cvtepi32_ps(_mm256_srli_epi32(_mm256_castps_si256(x), 23));
+#else
+  __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(x), 0), 23);
+  __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(x), 1), 23);
+  Packet8f emm0 = _mm256_cvtepi32_ps(_mm256_setr_m128(lo, hi));
+#endif
+  Packet8f e = _mm256_sub_ps(emm0, p8f_126f);  // subtracts 126, matching the [0.5,1) mantissa range set up below
+
+  // Set the exponents to -1, i.e. x are in the range [0.5,1).
+  x = _mm256_and_ps(x, p8f_inv_mant_mask);  // clear the exponent bits
+  x = _mm256_or_ps(x, p8f_half);            // OR in the bit pattern of 0.5
+
+  // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
+  // and shift by -1. The values are then centered around 0, which improves
+  // the stability of the polynomial evaluation.
+  //   if( x < SQRTHF ) {
+  //     e -= 1;
+  //     x = x + x - 1.0;
+  //   } else { x = x - 1.0; }
+  Packet8f mask = _mm256_cmp_ps(x, p8f_cephes_SQRTHF, _CMP_LT_OQ);
+  Packet8f tmp = _mm256_and_ps(x, mask);    // x where x < SQRTHF, 0 elsewhere
+  x = psub(x, p8f_1);
+  e = psub(e, _mm256_and_ps(p8f_1, mask));  // e -= 1 only where the mask is set
+  x = padd(x, tmp);                         // branch-free form of the pseudo-code above
+
+  Packet8f x2 = pmul(x, x);
+  Packet8f x3 = pmul(x2, x);
+
+  // Evaluate the polynomial approximant of degree 8 in three parts, probably
+  // to improve instruction-level parallelism.
+  Packet8f y, y1, y2;
+  y = pmadd(p8f_cephes_log_p0, x, p8f_cephes_log_p1);
+  y1 = pmadd(p8f_cephes_log_p3, x, p8f_cephes_log_p4);
+  y2 = pmadd(p8f_cephes_log_p6, x, p8f_cephes_log_p7);
+  y = pmadd(y, x, p8f_cephes_log_p2);
+  y1 = pmadd(y1, x, p8f_cephes_log_p5);
+  y2 = pmadd(y2, x, p8f_cephes_log_p8);
+  y = pmadd(y, x3, y1);  // stitch the three quadratics together with powers of x^3
+  y = pmadd(y, x3, y2);
+  y = pmul(y, x3);
+
+  // Add the logarithm of the exponent back to the result of the interpolation.
+  y1 = pmul(e, p8f_cephes_log_q1);
+  tmp = pmul(x2, p8f_half);
+  y = padd(y, y1);
+  x = psub(x, tmp);      // x - x^2/2
+  y2 = pmul(e, p8f_cephes_log_q2);
+  x = padd(x, y);
+  x = padd(x, y2);
+
+  // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
+  return _mm256_or_ps(
+      _mm256_andnot_ps(iszero_mask, _mm256_or_ps(x, invalid_mask)),  // OR with an all-ones mask yields a NaN bit pattern
+      _mm256_and_ps(iszero_mask, p8f_minus_inf));
+}
+
+// Exponential function. Works by writing "x = m*log(2) + r" where
+// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
+// "exp(x) = 2^m*exp(r)" where r is in the range [-log(2)/2,log(2)/2).
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
+pexp<Packet8f>(const Packet8f& _x) {
+  _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f);
+  _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f);
+  _EIGEN_DECLARE_CONST_Packet8f(127, 127.0f);
+
+  _EIGEN_DECLARE_CONST_Packet8f(exp_hi, 88.3762626647950f);   // clamp bounds for the input
+  _EIGEN_DECLARE_CONST_Packet8f(exp_lo, -88.3762626647949f);
+
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_LOG2EF, 1.44269504088896341f);
+
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p0, 1.9875691500E-4f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p1, 1.3981999507E-3f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p2, 8.3334519073E-3f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p3, 4.1665795894E-2f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p4, 1.6666665459E-1f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p5, 5.0000001201E-1f);
+
+  // Clamp x.
+  Packet8f x = pmax(pmin(_x, p8f_exp_hi), p8f_exp_lo);
+
+  // Express exp(x) as exp(m*ln(2) + r), start by extracting
+  // m = floor(x/ln(2) + 0.5).
+  Packet8f m = _mm256_floor_ps(pmadd(x, p8f_cephes_LOG2EF, p8f_half));
+
+// Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is
+// subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating
+// truncation errors. Note that we don't use the "pmadd" function here to
+// ensure that a precision-preserving FMA instruction is used.
+#ifdef EIGEN_VECTORIZE_FMA
+  _EIGEN_DECLARE_CONST_Packet8f(nln2, -0.6931471805599453f);
+  Packet8f r = _mm256_fmadd_ps(m, p8f_nln2, x);  // r = m*(-ln 2) + x in one fused step
+#else
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C1, 0.693359375f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C2, -2.12194440e-4f);
+  Packet8f r = psub(x, pmul(m, p8f_cephes_exp_C1));
+  r = psub(r, pmul(m, p8f_cephes_exp_C2));
+#endif
+
+  Packet8f r2 = pmul(r, r);
+
+  // TODO(gonnet): Split into odd/even polynomials and try to exploit
+  //               instruction-level parallelism.
+  Packet8f y = p8f_cephes_exp_p0;
+  y = pmadd(y, r, p8f_cephes_exp_p1);  // Horner evaluation of the degree-5 polynomial in r
+  y = pmadd(y, r, p8f_cephes_exp_p2);
+  y = pmadd(y, r, p8f_cephes_exp_p3);
+  y = pmadd(y, r, p8f_cephes_exp_p4);
+  y = pmadd(y, r, p8f_cephes_exp_p5);
+  y = pmadd(y, r2, r);                 // y = poly(r)*r^2 + r
+  y = padd(y, p8f_1);                  // ... + 1 gives the approximation of exp(r)
+
+  // Build emm0 = 2^m.
+  Packet8i emm0 = _mm256_cvttps_epi32(padd(m, p8f_127));  // add 127 to m before shifting it into bit 23 and up
+#ifdef EIGEN_VECTORIZE_AVX2
+  emm0 = _mm256_slli_epi32(emm0, 23);
+#else
+  __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(emm0, 0), 23);  // without AVX2: shift each 128-bit half separately
+  __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(emm0, 1), 23);
+  emm0 = _mm256_setr_m128(lo, hi);
+#endif
+
+  // Return 2^m * exp(r).
+  return pmax(pmul(y, _mm256_castsi256_ps(emm0)), _x);  // NOTE(review): the pmax against the unclamped _x presumably passes +inf/NaN inputs through - confirm against pmax<Packet8f>
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
+pexp<Packet4d>(const Packet4d& _x) {
+  Packet4d x = _x;
+  // exp(_x) per lane: range-reduce to g, evaluate a rational fit, scale by 2^n.
+  _EIGEN_DECLARE_CONST_Packet4d(1, 1.0);
+  _EIGEN_DECLARE_CONST_Packet4d(2, 2.0);
+  _EIGEN_DECLARE_CONST_Packet4d(half, 0.5);
+
+  _EIGEN_DECLARE_CONST_Packet4d(exp_hi, 709.437);
+  _EIGEN_DECLARE_CONST_Packet4d(exp_lo, -709.436139303);
+
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_LOG2EF, 1.4426950408889634073599);
+
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p0, 1.26177193074810590878e-4);
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p1, 3.02994407707441961300e-2);
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p2, 9.99999999999999999910e-1);
+
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q0, 3.00198505138664455042e-6);
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q1, 2.52448340349684104192e-3);
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q2, 2.27265548208155028766e-1);
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q3, 2.00000000000000000009e0);
+
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C1, 0.693145751953125);
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C2, 1.42860682030941723212e-6);
+  _EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
+
+  Packet4d tmp, fx;
+
+  // Clamp x to the interval [exp_lo, exp_hi].
+  x = pmax(pmin(x, p4d_exp_hi), p4d_exp_lo);
+  // Express exp(x) as exp(g + n*log(2)).
+  fx = pmadd(p4d_cephes_LOG2EF, x, p4d_half);
+
+  // n = floor(x * log2(e) + 0.5), i.e. the "n" described above.
+  fx = _mm256_floor_pd(fx);
+
+  // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
+  // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
+  // digits right.
+  tmp = pmul(fx, p4d_cephes_exp_C1);
+  Packet4d z = pmul(fx, p4d_cephes_exp_C2);
+  x = psub(x, tmp);
+  x = psub(x, z);
+
+  Packet4d x2 = pmul(x, x);
+
+  // Evaluate the numerator polynomial of the rational interpolant.
+  Packet4d px = p4d_cephes_exp_p0;
+  px = pmadd(px, x2, p4d_cephes_exp_p1);
+  px = pmadd(px, x2, p4d_cephes_exp_p2);
+  px = pmul(px, x);
+
+  // Evaluate the denominator polynomial of the rational interpolant.
+  Packet4d qx = p4d_cephes_exp_q0;
+  qx = pmadd(qx, x2, p4d_cephes_exp_q1);
+  qx = pmadd(qx, x2, p4d_cephes_exp_q2);
+  qx = pmadd(qx, x2, p4d_cephes_exp_q3);
+
+  // Combine both parts (form copied from the SSE2 routines):
+  //   exp(g) ~= 1 + 2 * px / (qx - px), with px = g * P(g^2) and qx = Q(g^2).
+  // TODO(gonnet): perhaps find a better rational interpolant?
+  x = _mm256_div_pd(px, psub(qx, px));
+  x = pmadd(p4d_2, x, p4d_1);
+
+  // Build e=2^n by constructing the exponents in a 128-bit vector and
+  // shifting them to where they belong in double-precision values.
+  __m128i emm0 = _mm256_cvtpd_epi32(fx);
+  emm0 = _mm_add_epi32(emm0, p4i_1023);
+  emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
+  __m128i lo = _mm_slli_epi64(emm0, 52);
+  __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52);
+  __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0);
+  e = _mm256_insertf128_si256(e, hi, 1);
+
+  // Construct the result 2^n * exp(g) = e * x. The max is used to catch
+  // non-finite values in the input.
+  return pmax(pmul(x, _mm256_castsi256_pd(e)), _x);
+}
+
+// Functions for sqrt.
+// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
+// of Newton's method, at a cost of 1-2 bits of precision as opposed to the
+// exact solution. The main advantage of this approach is not just speed, but
+// also the fact that it can be inlined and pipelined with other computations,
+// further reducing its effective latency.
+#if EIGEN_FAST_MATH
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
+psqrt<Packet8f>(const Packet8f& _x) {
+  _EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f);
+  _EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f);
+  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000);
+
+  Packet8f neg_half = pmul(_x, p8f_minus_half);
+
+  // Select only the inverse sqrt of positive normal inputs (denormals are
+  // flushed to zero and cause infs as well); all other lanes are zeroed.
+  Packet8f non_zero_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_GE_OQ);
+  Packet8f x = _mm256_and_ps(non_zero_mask, _mm256_rsqrt_ps(_x));
+
+  // Do a single step of Newton's iteration: x <- x * (1.5 - 0.5 * _x * x^2).
+  x = pmul(x, pmadd(neg_half, pmul(x, x), p8f_one_point_five));
+
+  // Multiply the original _x by its reciprocal square root to extract the
+  // square root.
+  return pmul(_x, x);
+}
+#else
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f psqrt<Packet8f>(const Packet8f& x) {
+  return _mm256_sqrt_ps(x);  // non-fast-math path: plain hardware sqrt, no rsqrt approximation
+}
+#endif
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4d psqrt<Packet4d>(const Packet4d& x) {
+  return _mm256_sqrt_pd(x);  // doubles always use the hardware sqrt, regardless of EIGEN_FAST_MATH
+}
+#if EIGEN_FAST_MATH
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f prsqrt<Packet8f>(const Packet8f& _x) {
+  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000);
+  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(nan, 0x7fc00000);
+  _EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f);
+  _EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f);
+  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000);
+
+  Packet8f neg_half = pmul(_x, p8f_minus_half);
+
+  // Select only the inverse sqrt of positive normal inputs (denormals are
+  // flushed to zero and cause infs as well); lanes below flt_min are zeroed.
+  Packet8f le_zero_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ);
+  Packet8f x = _mm256_andnot_ps(le_zero_mask, _mm256_rsqrt_ps(_x));
+
+  // Fill in NaNs for the negative entries and Infs for the remaining entries below flt_min.
+  Packet8f neg_mask = _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_LT_OQ);
+  Packet8f zero_mask = _mm256_andnot_ps(neg_mask, le_zero_mask);
+  Packet8f infs_and_nans = _mm256_or_ps(_mm256_and_ps(neg_mask, p8f_nan),
+                                        _mm256_and_ps(zero_mask, p8f_inf));
+
+  // Do a single step of Newton's iteration: x <- x * (1.5 - 0.5 * _x * x^2).
+  x = pmul(x, pmadd(neg_half, pmul(x, x), p8f_one_point_five));
+
+  // Insert NaNs and Infs in all the right places.
+  return _mm256_or_ps(x, infs_and_nans);
+}
+
+#else
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet8f prsqrt<Packet8f>(const Packet8f& x) {
+  const Packet8f root = _mm256_sqrt_ps(x);  // hardware square root of every lane
+  return _mm256_div_ps(_mm256_set1_ps(1.0f), root);  // 1 / sqrt(x), computed with a true division
+}
+#endif
+
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4d prsqrt<Packet4d>(const Packet4d& x) {
+  const Packet4d root = _mm256_sqrt_pd(x);  // hardware square root of every lane
+  return _mm256_div_pd(_mm256_set1_pd(1.0), root);  // 1 / sqrt(x), computed with a true division
+}
+
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_AVX_H
diff --git a/nuparu/include/Eigen/src/Core/arch/AVX/PacketMath.h b/nuparu/include/Eigen/src/Core/arch/AVX/PacketMath.h
new file mode 100644
index 00000000..717ae67c
--- /dev/null
+++ b/nuparu/include/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -0,0 +1,607 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_AVX_H
+#define EIGEN_PACKET_MATH_AVX_H
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
+#endif
+
+#ifdef __FMA__
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+#endif
+
+typedef __m256  Packet8f;
+typedef __m256i Packet8i;
+typedef __m256d Packet4d;
+
+template<> struct is_arithmetic<__m256>  { enum { value = true }; };
+template<> struct is_arithmetic<__m256i> { enum { value = true }; };
+template<> struct is_arithmetic<__m256d> { enum { value = true }; };
+
+#define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \
+  const Packet8f p8f_##NAME = pset1<Packet8f>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet4d(NAME,X) \
+  const Packet4d p4d_##NAME = pset1<Packet4d>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(NAME,X) \
+  const Packet8f p8f_##NAME = _mm256_castsi256_ps(pset1<Packet8i>(X))
+
+#define _EIGEN_DECLARE_CONST_Packet8i(NAME,X) \
+  const Packet8i p8i_##NAME = pset1<Packet8i>(X)
+
+
+template<> struct packet_traits<float>  : default_packet_traits
+{
+  typedef Packet8f type;  // 256-bit packet holding 8 floats
+  typedef Packet4f half;  // packet type used for half-width operations
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=8,
+    HasHalfPacket = 1,
+
+    HasDiv  = 1,
+    HasSin  = EIGEN_FAST_MATH,  // advertised only when EIGEN_FAST_MATH is non-zero
+    HasCos  = 0,
+    HasLog  = 1,
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasBlend = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1
+  };
+};
+template<> struct packet_traits<double> : default_packet_traits
+{
+  typedef Packet4d type;  // 256-bit packet holding 4 doubles
+  typedef Packet2d half;  // packet type used for half-width operations
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=4,
+    HasHalfPacket = 1,
+
+    HasDiv  = 1,
+    HasExp  = 1,  // unlike the float traits, no HasSin/HasCos/HasLog entries are set here
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasBlend = 1,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1
+  };
+};
+
+/* Proper support for integers is only provided by AVX2. In the meantime, we'll
+   use SSE instructions and packets to deal with integers.
+template<> struct packet_traits<int>    : default_packet_traits
+{
+  typedef Packet8i type;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=8
+  };
+};
+*/
+
+template<> struct unpacket_traits<Packet8f> { typedef float  type; typedef Packet4f half; enum {size=8, alignment=Aligned32}; };
+template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum {size=4, alignment=Aligned32}; };
+template<> struct unpacket_traits<Packet8i> { typedef int    type; typedef Packet4i half; enum {size=8, alignment=Aligned32}; };
+
+template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float&  from) { return _mm256_set1_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int&    from) { return _mm256_set1_epi32(from); }
+// pload1: broadcast the single scalar stored at *from to every lane.
+template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float*  from) { return _mm256_broadcast_ss(from); }
+template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); }
+// plset: lane i of the result holds a + i.
+template<> EIGEN_STRONG_INLINE Packet8f plset<Packet8f>(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); }
+template<> EIGEN_STRONG_INLINE Packet4d plset<Packet4d>(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); }
+// Lane-wise addition and subtraction.
+template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a)
+{ // Negate each lane by flipping its sign bit; unlike the former (0 - a) this maps +0.0f to -0.0f, like scalar unary minus.
+  return _mm256_xor_ps(a, _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)));
+}
+template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a)
+{ // Negate each lane by flipping bit 63 (high 32-bit word of each double); unlike (0 - a) this maps +0.0 to -0.0.
+  return _mm256_xor_pd(a, _mm256_castsi256_pd(_mm256_setr_epi32(0x0,0x80000000,0x0,0x80000000,0x0,0x80000000,0x0,0x80000000)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); }
+
+// Lane-wise division. The Packet8i overload only asserts and returns a zero packet.
+template<> EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, const Packet8i& /*b*/)
+{ eigen_assert(false && "packet integer division are not supported by AVX");
+  return pset1<Packet8i>(0);
+}
+
+#ifdef __FMA__
+template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
+#if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG
+  // Left alone, clang generates a vfmadd213ps instruction plus some vmovaps on registers,
+  // and gcc generates a vfmadd132ps instruction,
+  // so force a vfmadd231ps via inline asm, since the most common use case is to accumulate
+  // the result of the product (res starts as c and receives a*b + res).
+  Packet8f res = c;
+  __asm__("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
+  return res;
+#else
+  return _mm256_fmadd_ps(a,b,c);  // fused a*b + c
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
+#if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG
+  // Same trick as the Packet8f overload: force vfmadd231pd so that res (= c) accumulates a*b.
+  Packet4d res = c;
+  __asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
+  return res;
+#else
+  return _mm256_fmadd_pd(a,b,c);  // fused a*b + c
+#endif
+}
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); }
+// pround: rounds with whatever rounding mode is currently active (_MM_FROUND_CUR_DIRECTION).
+template<> EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
+template<> EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pceil<Packet8f>(const Packet8f& a) { return _mm256_ceil_ps(a); }
+template<> EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) { return _mm256_ceil_pd(a); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) { return _mm256_floor_ps(a); }
+template<> EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) { return _mm256_floor_pd(a); }
+// Bitwise operations on the raw lane bits.
+template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); }
+// NOTE(review): _mm256_andnot_* computes (~a) & b, i.e. it complements the FIRST operand — confirm this matches the generic pandnot contract.
+template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); }
+// ploadu: same loads through the unaligned (loadu) intrinsics.
+template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); }
+template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); }
+
+// Loads 4 floats from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3, a3}
+template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
+{
+  // TODO try to find a way to avoid the need of a temporary register
+//   Packet8f tmp  = _mm256_castps128_ps256(_mm_loadu_ps(from));
+//   tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
+//   return _mm256_unpacklo_ps(tmp,tmp);
+  
+  // _mm256_insertf128_ps is very slow on Haswell, thus broadcast the 4 floats to both 128-bit halves:
+  Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
+  // mimic an "in-place" permutation of the lower 128 bits using a blend (mask 15 = lanes 0..3)
+  tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15);
+  // then we can perform a consistent permutation on the global register to get everything in shape:
+  return  _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2));
+}
+// Loads 2 doubles from memory and returns the packet {a0, a0, a1, a1}
+template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from)
+{
+  Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from);  // {a0, a1, a0, a1}
+  return  _mm256_permute_pd(tmp, 3<<2);  // low half picks a0 twice, high half picks a1 twice
+}
+
+// Loads 2 floats from memory and returns the packet {a0, a0, a0, a0, a1, a1, a1, a1}
+template<> EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from)
+{
+  Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from));  // low 128 bits: a0 four times
+  return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from+1), 1);  // high 128 bits: a1 four times
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); }
+template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
+// NOTE(review): pstore<int> above goes through the unaligned _mm256_storeu_si256 although it is the aligned entry point — confirm whether _mm256_store_si256 was intended.
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet8f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
+
+// Strided gather: lane i is read from from[i*stride]. Done with scalar loads; _mm256_i32gather_ps could be used when AVX2 is available,
+// although for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4);
+template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, Index stride)
+{
+  return _mm256_setr_ps(from[0*stride], from[1*stride], from[2*stride], from[3*stride],
+                        from[4*stride], from[5*stride], from[6*stride], from[7*stride]);
+}
+template<> EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, Index stride)
+{ // Strided gather: lane i is read from from[i*stride].
+  return _mm256_setr_pd(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride)
+{
+  const __m128 lower = _mm256_extractf128_ps(from, 0);  // lanes 0..3
+  to[stride*0] = _mm_cvtss_f32(lower);
+  to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(lower, lower, 1));
+  to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(lower, lower, 2));
+  to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(lower, lower, 3));
+
+  const __m128 upper = _mm256_extractf128_ps(from, 1);  // lanes 4..7
+  to[stride*4] = _mm_cvtss_f32(upper);
+  to[stride*5] = _mm_cvtss_f32(_mm_shuffle_ps(upper, upper, 1));
+  to[stride*6] = _mm_cvtss_f32(_mm_shuffle_ps(upper, upper, 2));
+  to[stride*7] = _mm_cvtss_f32(_mm_shuffle_ps(upper, upper, 3));
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, Index stride)
+{
+  const __m128d lower = _mm256_extractf128_pd(from, 0);  // lanes 0 and 1
+  to[stride*0] = _mm_cvtsd_f64(lower);
+  to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(lower, lower, 1));
+  const __m128d upper = _mm256_extractf128_pd(from, 1);  // lanes 2 and 3
+  to[stride*2] = _mm_cvtsd_f64(upper);
+  to[stride*3] = _mm_cvtsd_f64(_mm_shuffle_pd(upper, upper, 1));
+}
+
+template<> EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a)
+{
+  // Replicate a into all 8 lanes, then write the packet out through pstore.
+  pstore(to, pset1<Packet8f>(a));
+}
+template<> EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a)
+{
+  // Replicate a into all 4 lanes, then write the packet out through pstore.
+  pstore(to, pset1<Packet4d>(a));
+}
+template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
+{
+  // Replicate a into all 8 lanes, then write the packet out through pstore.
+  pstore(to, pset1<Packet8i>(a));
+}
+
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+// pfirst: return lane 0 of the packet as a scalar.
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet8f>(const Packet8f& a) {
+  return _mm_cvtss_f32(_mm256_castps256_ps128(a));
+}
+template<> EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a) {
+  return _mm_cvtsd_f64(_mm256_castpd256_pd128(a));
+}
+template<> EIGEN_STRONG_INLINE int    pfirst<Packet8i>(const Packet8i& a) {
+  return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
+}
+
+
+template<> EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a)
+{
+  const __m256 within_halves = _mm256_shuffle_ps(a, a, _MM_SHUFFLE(0,1,2,3));  // reverse the 4 floats of each 128-bit half
+  return _mm256_permute2f128_ps(within_halves, within_halves, 1);  // then exchange the two halves
+}
+template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
+{
+  // Reverses the order of the four doubles: {a0, a1, a2, a3} -> {a3, a2, a1, a0}.
+  // Step 1: swap the two doubles inside each 128-bit half -> {a1, a0, a3, a2}.
+  __m256d tmp = _mm256_shuffle_pd(a,a,5);
+  // Step 2: exchange the two 128-bit halves.
+  return _mm256_permute2f128_pd(tmp, tmp, 1);
+}
+
+// pabs: absolute value obtained by masking off the sign bit of every lane
+template<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a)
+{
+  const Packet8f keep_magnitude = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
+  return _mm256_and_ps(a,keep_magnitude);
+}
+template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a)
+{ // Each double spans two 32-bit words: keep the low word intact and clear only the top (sign) bit of the high word.
+  const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
+  return _mm256_and_pd(a,mask);
+}
+
+// preduxp should be ok
+// FIXME: why is this ok? Why isn't the simple implementation working as expected?
+template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs)
+{
+    __m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]);
+    __m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]);
+    __m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]);
+    __m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]);
+
+    __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1);
+    __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2);
+    __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3);
+    __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4);
+
+    __m256 perm1 =  _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
+    __m256 perm2 =  _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
+    __m256 perm3 =  _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
+    __m256 perm4 =  _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
+
+    __m256 sum1 = _mm256_add_ps(perm1, hsum5);
+    __m256 sum2 = _mm256_add_ps(perm2, hsum6);
+    __m256 sum3 = _mm256_add_ps(perm3, hsum7);
+    __m256 sum4 = _mm256_add_ps(perm4, hsum8);
+
+    __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);  // lanes 2,3,6,7 taken from sum2
+    __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);  // lanes 2,3,6,7 taken from sum4
+
+    __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0);  // upper four lanes taken from blend2
+    return final;
+}
+template<> EIGEN_STRONG_INLINE Packet4d preduxp<Packet4d>(const Packet4d* vecs)
+{
+ Packet4d tmp0, tmp1;
+  // Pairwise sums of vecs[0] and vecs[1], then fold the two 128-bit halves together.
+  tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]);
+  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
+  // Same for vecs[2] and vecs[3].
+  tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]);
+  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
+  // Lanes 0,1 come from tmp0 and lanes 2,3 from tmp1, so lane i holds the sum of vecs[i].
+  return _mm256_blend_pd(tmp0, tmp1, 0xC);
+}
+
+template<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a)
+{ // Horizontal sum of all 8 lanes via three rounds of pairwise adds.
+  const Packet8f pairs = _mm256_hadd_ps(a, _mm256_permute2f128_ps(a, a, 1));
+  const Packet8f quads = _mm256_hadd_ps(pairs, pairs);
+  return pfirst(_mm256_hadd_ps(quads, quads));
+}
+template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
+{ // Horizontal sum of all 4 lanes via two rounds of pairwise adds.
+  const Packet4d pairs = _mm256_hadd_pd(a, _mm256_permute2f128_pd(a, a, 1));
+  return pfirst(_mm256_hadd_pd(pairs, pairs));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f predux4<Packet8f>(const Packet8f& a)
+{ // Adds the upper 128-bit half of a onto its lower half, lane by lane.
+  return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a)
+{
+  // Product of all 8 lanes: fold the 128-bit halves, then lane pairs, then neighbours.
+  const Packet8f halves = _mm256_mul_ps(a, _mm256_permute2f128_ps(a,a,1));
+  const Packet8f pairs = _mm256_mul_ps(halves, _mm256_shuffle_ps(halves,halves,_MM_SHUFFLE(1,0,3,2)));
+  return pfirst(_mm256_mul_ps(pairs, _mm256_shuffle_ps(pairs,pairs,1)));
+}
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet4d>(const Packet4d& a)
+{
+  // Product of all 4 lanes: fold the 128-bit halves, then the two neighbours.
+  const Packet4d halves = _mm256_mul_pd(a, _mm256_permute2f128_pd(a,a,1));
+  return pfirst(_mm256_mul_pd(halves, _mm256_shuffle_pd(halves,halves,1)));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a)
+{ // Minimum over all 8 lanes: fold the 128-bit halves, then lane pairs, then neighbours.
+  const Packet8f halves = _mm256_min_ps(a, _mm256_permute2f128_ps(a,a,1));
+  const Packet8f pairs = _mm256_min_ps(halves, _mm256_shuffle_ps(halves,halves,_MM_SHUFFLE(1,0,3,2)));
+  return pfirst(_mm256_min_ps(pairs, _mm256_shuffle_ps(pairs,pairs,1)));
+}
+template<> EIGEN_STRONG_INLINE double predux_min<Packet4d>(const Packet4d& a)
+{ // Minimum over all 4 lanes: fold the 128-bit halves, then the two neighbours.
+  const Packet4d halves = _mm256_min_pd(a, _mm256_permute2f128_pd(a,a,1));
+  return pfirst(_mm256_min_pd(halves, _mm256_shuffle_pd(halves, halves, 1)));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a)
+{ // Maximum over all 8 lanes: fold the 128-bit halves, then lane pairs, then neighbours.
+  const Packet8f halves = _mm256_max_ps(a, _mm256_permute2f128_ps(a,a,1));
+  const Packet8f pairs = _mm256_max_ps(halves, _mm256_shuffle_ps(halves,halves,_MM_SHUFFLE(1,0,3,2)));
+  return pfirst(_mm256_max_ps(pairs, _mm256_shuffle_ps(pairs,pairs,1)));
+}
+
+template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
+{ // Maximum over all 4 lanes: fold the 128-bit halves, then the two neighbours.
+  const Packet4d halves = _mm256_max_pd(a, _mm256_permute2f128_pd(a,a,1));
+  return pfirst(_mm256_max_pd(halves, _mm256_shuffle_pd(halves, halves, 1)));
+}
+
+
+template<int Offset>
+struct palign_impl<Offset,Packet8f>  // NOTE(review): appears to yield {first[Offset..7], second[0..Offset-1]}; traced by hand for Offset==1 only
+{
+  static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second)
+  {
+    if (Offset==1)
+    { // blend in second[0], rotate each 128-bit half by one lane, then patch lanes 3 and 7 from the half-swapped copy
+      first = _mm256_blend_ps(first, second, 1);
+      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
+      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
+      first = _mm256_blend_ps(tmp1, tmp2, 0x88);
+    }
+    else if (Offset==2)
+    {
+      first = _mm256_blend_ps(first, second, 3);
+      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
+      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
+      first = _mm256_blend_ps(tmp1, tmp2, 0xcc);
+    }
+    else if (Offset==3)
+    {
+      first = _mm256_blend_ps(first, second, 7);
+      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
+      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
+      first = _mm256_blend_ps(tmp1, tmp2, 0xee);
+    }
+    else if (Offset==4)
+    {
+      first = _mm256_blend_ps(first, second, 15);
+      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0));
+      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
+      first = _mm256_permute_ps(tmp2, _MM_SHUFFLE(3,2,1,0));
+    }
+    else if (Offset==5)
+    {
+      first = _mm256_blend_ps(first, second, 31);
+      first = _mm256_permute2f128_ps(first, first, 1);
+      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
+      first = _mm256_permute2f128_ps(tmp, tmp, 1);
+      first = _mm256_blend_ps(tmp, first, 0x88);
+    }
+    else if (Offset==6)
+    {
+      first = _mm256_blend_ps(first, second, 63);
+      first = _mm256_permute2f128_ps(first, first, 1);
+      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
+      first = _mm256_permute2f128_ps(tmp, tmp, 1);
+      first = _mm256_blend_ps(tmp, first, 0xcc);
+    }
+    else if (Offset==7)
+    {
+      first = _mm256_blend_ps(first, second, 127);
+      first = _mm256_permute2f128_ps(first, first, 1);
+      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
+      first = _mm256_permute2f128_ps(tmp, tmp, 1);
+      first = _mm256_blend_ps(tmp, first, 0xee);
+    }
+  }
+};
+
+template<int Offset>
+struct palign_impl<Offset,Packet4d>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second)
+  {
+    if (Offset==1)
+    { // result: {first[1], first[2], first[3], second[0]}
+      first = _mm256_blend_pd(first, second, 1);
+      __m256d tmp = _mm256_permute_pd(first, 5);
+      first = _mm256_permute2f128_pd(tmp, tmp, 1);
+      first = _mm256_blend_pd(tmp, first, 0xA);
+    }
+    else if (Offset==2)
+    { // result: {first[2], first[3], second[0], second[1]}
+      first = _mm256_blend_pd(first, second, 3);
+      first = _mm256_permute2f128_pd(first, first, 1);
+    }
+    else if (Offset==3)
+    { // same sequence as Offset==1, with second[0..2] blended in first and the final blend mask inverted (5 instead of 0xA)
+      first = _mm256_blend_pd(first, second, 7);
+      __m256d tmp = _mm256_permute_pd(first, 5);
+      first = _mm256_permute2f128_pd(tmp, tmp, 1);
+      first = _mm256_blend_pd(tmp, first, 5);
+    }
+  }
+};
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8f,8>& kernel) {  // in-place transpose of an 8x8 float block
+  __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);  // stage 1: interleave neighbouring rows
+  __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
+  __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
+  __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
+  __m256 T4 = _mm256_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
+  __m256 T5 = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
+  __m256 T6 = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
+  __m256 T7 = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
+  __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));  // stage 2: gather 4 rows per 128-bit half
+  __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
+  __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
+  __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));
+  __m256 S4 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(1,0,1,0));
+  __m256 S5 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(3,2,3,2));
+  __m256 S6 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(1,0,1,0));
+  __m256 S7 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(3,2,3,2));
+  kernel.packet[0] = _mm256_permute2f128_ps(S0, S4, 0x20);  // stage 3: 0x20 joins the low halves,
+  kernel.packet[1] = _mm256_permute2f128_ps(S1, S5, 0x20);
+  kernel.packet[2] = _mm256_permute2f128_ps(S2, S6, 0x20);
+  kernel.packet[3] = _mm256_permute2f128_ps(S3, S7, 0x20);
+  kernel.packet[4] = _mm256_permute2f128_ps(S0, S4, 0x31);  // 0x31 joins the high halves
+  kernel.packet[5] = _mm256_permute2f128_ps(S1, S5, 0x31);
+  kernel.packet[6] = _mm256_permute2f128_ps(S2, S6, 0x31);
+  kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet8f,4>& kernel) {  // transposes 4 rows of 8 floats; output packet k holds input columns 2k and 2k+1
+  __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
+  __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
+  __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
+  __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
+
+  __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));  // columns 0 (low half) and 4 (high half)
+  __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));  // columns 1 and 5
+  __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));  // columns 2 and 6
+  __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));  // columns 3 and 7
+
+  kernel.packet[0] = _mm256_permute2f128_ps(S0, S1, 0x20);
+  kernel.packet[1] = _mm256_permute2f128_ps(S2, S3, 0x20);
+  kernel.packet[2] = _mm256_permute2f128_ps(S0, S1, 0x31);
+  kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4d,4>& kernel) {  // in-place transpose of a 4x4 double block
+  __m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15);  // odd elements of rows 0 and 1
+  __m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);   // even elements of rows 0 and 1
+  __m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15);  // odd elements of rows 2 and 3
+  __m256d T3 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);   // even elements of rows 2 and 3
+
+  kernel.packet[1] = _mm256_permute2f128_pd(T0, T2, 32);  // 32 == 0x20: join the low halves
+  kernel.packet[3] = _mm256_permute2f128_pd(T0, T2, 49);  // 49 == 0x31: join the high halves
+  kernel.packet[0] = _mm256_permute2f128_pd(T1, T3, 32);
+  kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) {
+  // Lane i takes thenPacket when ifPacket.select[i] is non-zero and elsePacket otherwise.
+  const __m256 flags = _mm256_setr_ps(ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3], ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7]);
+  const __m256 take_else = _mm256_cmp_ps(flags, _mm256_setzero_ps(), _CMP_EQ_UQ);
+  return _mm256_blendv_ps(thenPacket, elsePacket, take_else);
+}
+template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) {
+  // Lane i takes thenPacket when ifPacket.select[i] is non-zero and elsePacket otherwise.
+  const __m256d flags = _mm256_setr_pd(ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]);
+  const __m256d take_else = _mm256_cmp_pd(flags, _mm256_setzero_pd(), _CMP_EQ_UQ);
+  return _mm256_blendv_pd(thenPacket, elsePacket, take_else);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PACKET_MATH_AVX_H
diff --git a/nuparu/include/Eigen/src/Core/arch/AVX/TypeCasting.h b/nuparu/include/Eigen/src/Core/arch/AVX/TypeCasting.h
new file mode 100644
index 00000000..83bfdc60
--- /dev/null
+++ b/nuparu/include/Eigen/src/Core/arch/AVX/TypeCasting.h
@@ -0,0 +1,51 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_AVX_H
+#define EIGEN_TYPE_CASTING_AVX_H
+
+namespace Eigen {
+
+namespace internal {
+
+// For now we use SSE to handle integers, so we can't use AVX instructions to cast
+// from int to float
+template <>
+struct type_casting_traits<float, int> {
+  enum {
+    VectorizedCast = 0,  // 0: no packet-level cast is advertised for this pair (see note above)
+    SrcCoeffRatio = 1,   // presumably source coefficients consumed per cast step -- confirm
+    TgtCoeffRatio = 1    // presumably target coefficients produced per cast step -- confirm
+  };
+};
+
+template <>
+struct type_casting_traits<int, float> {
+  enum {
+    VectorizedCast = 0,  // 0: no packet-level cast is advertised for this pair
+    SrcCoeffRatio = 1,   // presumably source coefficients consumed per cast step -- confirm
+    TgtCoeffRatio = 1    // presumably target coefficients produced per cast step -- confirm
+  };
+};
+
+
+
+template<> EIGEN_STRONG_INLINE Packet8i pcast<Packet8f, Packet8i>(const Packet8f& a) {
+  return _mm256_cvttps_epi32(a);  // truncate toward zero, like a scalar float-to-int cast (cvtps would round per MXCSR)
+}
+
+template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8i, Packet8f>(const Packet8i& a) {
+  return _mm256_cvtepi32_ps(a);  // converts each of the eight 32-bit integers to single precision
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_AVX_H
diff --git a/nuparu/include/Eigen/src/Core/arch/AltiVec/Complex.h b/nuparu/include/Eigen/src/Core/arch/AltiVec/Complex.h
index 68d9a2bf..58c29617 100644
--- a/nuparu/include/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/nuparu/include/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -7,20 +7,21 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_COMPLEX_ALTIVEC_H
-#define EIGEN_COMPLEX_ALTIVEC_H
+#ifndef EIGEN_COMPLEX32_ALTIVEC_H
+#define EIGEN_COMPLEX32_ALTIVEC_H
 
 namespace Eigen {
 
 namespace internal {
 
 static Packet4ui  p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_ZERO_);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
-static Packet16uc p16uc_COMPLEX_RE   = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
-static Packet16uc p16uc_COMPLEX_IM   = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
-static Packet16uc p16uc_COMPLEX_REV  = vec_sld(p16uc_REVERSE, p16uc_REVERSE, 8);//{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
-static Packet16uc p16uc_COMPLEX_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8);//{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
-static Packet16uc p16uc_PSET_HI = (Packet16uc) vec_mergeh((Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 1));//{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
-static Packet16uc p16uc_PSET_LO = (Packet16uc) vec_mergeh((Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 2), (Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 3));//{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
+#ifdef _BIG_ENDIAN
+static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_ZERO_, 8);//{ 0x0000000000000000, 0x8000000000000000 };
+#else
+static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x0000000000000000, 0x8000000000000000 };
+#endif
 
 //---------- float ----------
 struct Packet2cf
@@ -33,6 +34,7 @@ struct Packet2cf
 template<> struct packet_traits<std::complex<float> >  : default_packet_traits
 {
   typedef Packet2cf type;
+  typedef Packet2cf half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
@@ -51,7 +53,7 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
   };
 };
 
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
 
 template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
 {
@@ -61,10 +63,26 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<flo
     res.v = pload<Packet4f>((const float *)&from);
   else
     res.v = ploadu<Packet4f>((const float *)&from);
-  res.v = vec_perm(res.v, res.v, p16uc_PSET_HI);
+  res.v = vec_perm(res.v, res.v, p16uc_PSET64_HI);
   return res;
 }
 
+template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
+{
+  // Stage the two strided elements in an aligned buffer, then load them as one vector.
+  std::complex<float> EIGEN_ALIGN16 staged[2] = { from[0*stride], from[1*stride] };
+  const Packet4f packed = vec_ld(0, (const float*)staged);
+  return Packet2cf(packed);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
+{
+  // Spill the vector to an aligned buffer, then write each element to its strided slot.
+  std::complex<float> EIGEN_ALIGN16 staged[2];
+  vec_st(from.v, 0, (float*)staged);
+  for (int i = 0; i < 2; ++i) to[i*stride] = staged[i];
+}
+
+
 template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_add(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_sub(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); }
@@ -75,16 +93,16 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
   Packet4f v1, v2;
 
   // Permute and multiply the real parts of a and b
-  v1 = vec_perm(a.v, a.v, p16uc_COMPLEX_RE);
+  v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
   // Get the imaginary parts of a
-  v2 = vec_perm(a.v, a.v, p16uc_COMPLEX_IM);
+  v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
   // multiply a_re * b 
   v1 = vec_madd(v1, b.v, p4f_ZERO);
   // multiply a_im * b and get the conjugate result
   v2 = vec_madd(v2, b.v, p4f_ZERO);
   v2 = (Packet4f) vec_xor((Packet4ui)v2, p4ui_CONJ_XOR);
   // permute back to a proper order
-  v2 = vec_perm(v2, v2, p16uc_COMPLEX_REV);
+  v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV);
   
   return Packet2cf(vec_add(v1, v2));
 }
@@ -118,7 +136,7 @@ template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Pack
 template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
 {
   Packet4f rev_a;
-  rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX_REV2);
+  rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX32_REV2);
   return Packet2cf(rev_a);
 }
 
@@ -133,9 +151,13 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packe
 template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
 {
   Packet4f b1, b2;
-  
+#ifdef _BIG_ENDIAN  
   b1 = (Packet4f) vec_sld(vecs[0].v, vecs[1].v, 8);
   b2 = (Packet4f) vec_sld(vecs[1].v, vecs[0].v, 8);
+#else
+  b1 = (Packet4f) vec_sld(vecs[1].v, vecs[0].v, 8);
+  b2 = (Packet4f) vec_sld(vecs[0].v, vecs[1].v, 8);
+#endif
   b2 = (Packet4f) vec_sld(b2, b2, 8);
   b2 = padd(b1, b2);
 
@@ -159,7 +181,11 @@ struct palign_impl<Offset,Packet2cf>
   {
     if (Offset==1)
     {
+#ifdef _BIG_ENDIAN
       first.v = vec_sld(first.v, second.v, 8);
+#else
+      first.v = vec_sld(second.v, first.v, 8);
+#endif
     }
   }
 };
@@ -202,16 +228,203 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con
   // TODO optimize it for AltiVec
   Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
   Packet4f s = vec_madd(b.v, b.v, p4f_ZERO);
-  return Packet2cf(pdiv(res.v, vec_add(s,vec_perm(s, s, p16uc_COMPLEX_REV))));
+  return Packet2cf(pdiv(res.v, vec_add(s,vec_perm(s, s, p16uc_COMPLEX32_REV))));
 }
 
 template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x)
 {
-  return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX_REV));
+  return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX32_REV));
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
+{
+  // Snapshot both rows so each output can be written directly.
+  const Packet4f row0 = kernel.packet[0].v, row1 = kernel.packet[1].v;
+  kernel.packet[0].v = vec_perm(row0, row1, p16uc_TRANSPOSE64_HI);
+  kernel.packet[1].v = vec_perm(row0, row1, p16uc_TRANSPOSE64_LO);
+}
+
+//---------- double ----------
+#ifdef __VSX__
+struct Packet1cd
+{
+  EIGEN_STRONG_INLINE Packet1cd() {}
+  EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
+  Packet2d v;
+};
+
+template<> struct packet_traits<std::complex<double> >  : default_packet_traits
+{
+  typedef Packet1cd type;
+  typedef Packet1cd half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 0,
+    size = 1,
+    HasHalfPacket = 0,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
+
+template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
+{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
+{
+  // A Packet1cd carries exactly one coefficient, so only from[0] may be read;
+  // it is copied into an aligned slot because `from` need not be 16-byte aligned.
+  std::complex<double> EIGEN_ALIGN16 af = from[0*stride];
+  return pload<Packet1cd>(&af);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride)
+{
+  // A Packet1cd holds one coefficient: spill to an aligned slot, then write to[0] only.
+  std::complex<double> EIGEN_ALIGN16 af;
+  pstore<std::complex<double> >(&af, from);
+  to[0*stride] = af;
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_add(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_sub(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  Packet2d a_re, a_im, v1, v2;
+  // Complex product computed as a_re*b + i*(a_im*b).
+  // Broadcast the first 64-bit element of a (its real part) to both lanes
+  a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI);
+  // Broadcast the second 64-bit element of a (its imaginary part) to both lanes
+  a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO);
+  // v1 = a_re * b, lane-wise
+  v1 = vec_madd(a_re, b.v, p2d_ZERO);
+  // v2 = a_im * b lane-wise; then swap its two lanes and XOR with p2ul_CONJ_XOR1 (sign flip of one lane) to form i*(a_im*b)
+  v2 = vec_madd(a_im, b.v, p2d_ZERO);
+  v2 = (Packet2d) vec_sld((Packet4ui)v2, (Packet4ui)v2, 8);
+  v2 = (Packet2d) vec_xor((Packet2d)v2, (Packet2d) p2ul_CONJ_XOR1);
+  // Sum of the two partial products
+  return Packet1cd(vec_add(v1, v2));
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>*     from)
+{
+  return pset1<Packet1cd>(*from);
+}
+
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { vec_dstt((long *)addr, DST_CTRL(2,2,32), DST_CHAN); }
+
+template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
+{
+  // Spill the packet to an aligned slot and return the single coefficient it holds.
+  std::complex<double> EIGEN_ALIGN16 first;
+  pstore<std::complex<double> >(&first, a);
+  return first;
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
+{
+  return pfirst(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)
+{
+  return vecs[0];
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
+{
+  return pfirst(a);
 }
 
+template<int Offset>
+struct palign_impl<Offset,Packet1cd>
+{
+  static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
+  {
+    // FIXME is it sure we never have to align a Packet1cd?
+    // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
+  }
+};
+
+template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+  {
+    return internal::pmul(a, pconj(b));
+  }
+};
+
+template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+  {
+    return internal::pmul(pconj(a), b);
+  }
+};
+
+template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+  {
+    return pconj(internal::pmul(a, b));
+  }
+};
+
+template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  // TODO optimize it for AltiVec
+  Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);  // a * conj(b)
+  Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_);  // lane-wise b.v*b.v + p2d_ZERO_
+  return Packet1cd(pdiv(res.v, vec_add(s,vec_perm(s, s, p16uc_REVERSE64))));  // divide both lanes by s + (s with its 64-bit halves swapped)
+}
+
+EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
+{
+  return Packet1cd(preverse(Packet2d(x.v)));
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
+{
+  // Snapshot both rows so each output can be written directly.
+  const Packet2d row0 = kernel.packet[0].v, row1 = kernel.packet[1].v;
+  kernel.packet[0].v = vec_perm(row0, row1, p16uc_TRANSPOSE64_HI);
+  kernel.packet[1].v = vec_perm(row0, row1, p16uc_TRANSPOSE64_LO);
+}
+#endif // __VSX__
 } // end namespace internal
 
 } // end namespace Eigen
 
-#endif // EIGEN_COMPLEX_ALTIVEC_H
+#endif // EIGEN_COMPLEX32_ALTIVEC_H
diff --git a/nuparu/include/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/nuparu/include/Eigen/src/Core/arch/AltiVec/MathFunctions.h
new file mode 100644
index 00000000..9e37e93f
--- /dev/null
+++ b/nuparu/include/Eigen/src/Core/arch/AltiVec/MathFunctions.h
@@ -0,0 +1,290 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007 Julien Pommier
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* The sin, cos, exp, and log functions of this file come from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
+#ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H
+#define EIGEN_MATH_FUNCTIONS_ALTIVEC_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f plog<Packet4f>(const Packet4f& _x)
+{
+  Packet4f x = _x;
+  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
+  _EIGEN_DECLARE_CONST_Packet4i(23, 23);
+  /* mask with the bits of 0x7f800000 cleared (every bit except bits 23..30) */
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
+
+  /* the smallest non denormalized float number */
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos,  0x00800000);
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf,     0xff800000); // -1.f/0.f
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan,     0xffffffff);
+  
+  /* natural logarithm computed for 4 simultaneous floats;
+    returns NaN for x < 0 or NaN input, and -INF for x == 0 (see the final selects)
+  */
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
+
+
+  Packet4i emm0;
+
+  /* isvalid_mask is 0 if x < 0 or x is NaN. */
+  Packet4ui isvalid_mask = reinterpret_cast<Packet4ui>(vec_cmpge(x, p4f_ZERO));
+  Packet4ui iszero_mask = reinterpret_cast<Packet4ui>(vec_cmpeq(x, p4f_ZERO));
+
+  x = pmax(x, p4f_min_norm_pos);  /* cut off denormalized stuff */
+  emm0 = vec_sr(reinterpret_cast<Packet4i>(x),
+                reinterpret_cast<Packet4ui>(p4i_23));
+  /* emm0 = bit pattern of x shifted right by 23 bits */
+  /* keep only the fractional part */
+  x = pand(x, p4f_inv_mant_mask);
+  x = por(x, p4f_half);
+
+  emm0 = psub(emm0, p4i_0x7f);
+  Packet4f e = padd(vec_ctf(emm0, 0), p4f_1);
+
+  /* part2:
+     if( x < SQRTHF ) {
+       e -= 1;
+       x = x + x - 1.0;
+     } else { x = x - 1.0; }
+  */
+  Packet4f mask = reinterpret_cast<Packet4f>(vec_cmplt(x, p4f_cephes_SQRTHF));
+  Packet4f tmp = pand(x, mask);
+  x = psub(x, p4f_1);
+  e = psub(e, pand(p4f_1, mask));
+  x = padd(x, tmp);
+
+  Packet4f x2 = pmul(x,x);
+  Packet4f x3 = pmul(x2,x);
+  /* evaluate the degree-8 polynomial in x as three interleaved Horner chains */
+  Packet4f y, y1, y2;
+  y  = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
+  y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
+  y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
+  y  = pmadd(y , x, p4f_cephes_log_p2);
+  y1 = pmadd(y1, x, p4f_cephes_log_p5);
+  y2 = pmadd(y2, x, p4f_cephes_log_p8);
+  y = pmadd(y, x3, y1);
+  y = pmadd(y, x3, y2);
+  y = pmul(y, x3);
+  /* add the exponent contribution: e*q1 + e*q2 (q1 + q2 is approximately ln 2) */
+  y1 = pmul(e, p4f_cephes_log_q1);
+  tmp = pmul(x2, p4f_half);
+  y = padd(y, y1);
+  x = psub(x, tmp);
+  y2 = pmul(e, p4f_cephes_log_q2);
+  x = padd(x, y);
+  x = padd(x, y2);
+  // negative arg will be NAN, 0 will be -INF
+  x = vec_sel(x, p4f_minus_inf, iszero_mask);
+  x = vec_sel(p4f_minus_nan, x, isvalid_mask);
+  return x;
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f pexp<Packet4f>(const Packet4f& _x)
+{
+  Packet4f x = _x;
+  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
+  _EIGEN_DECLARE_CONST_Packet4i(23, 23);
+
+  // clamping bounds applied to the argument below
+  _EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f);
+  _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
+
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
+
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
+
+  Packet4f tmp, fx;
+  Packet4i emm0;
+
+  // clamp x
+  x = vec_max(vec_min(x, p4f_exp_hi), p4f_exp_lo);
+
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
+  // n = floor(x * LOG2EF + 0.5)
+  fx = vec_floor(fx);
+  // g = x - n*C1 - n*C2 (C1 + C2 is approximately ln 2, subtracted in two steps)
+  tmp = pmul(fx, p4f_cephes_exp_C1);
+  Packet4f z = pmul(fx, p4f_cephes_exp_C2);
+  x = psub(x, tmp);
+  x = psub(x, z);
+
+  z = pmul(x,x);
+  // y = 1 + g + g^2 * P(g), with P evaluated by Horner's rule
+  Packet4f y = p4f_cephes_exp_p0;
+  y = pmadd(y, x, p4f_cephes_exp_p1);
+  y = pmadd(y, x, p4f_cephes_exp_p2);
+  y = pmadd(y, x, p4f_cephes_exp_p3);
+  y = pmadd(y, x, p4f_cephes_exp_p4);
+  y = pmadd(y, x, p4f_cephes_exp_p5);
+  y = pmadd(y, z, x);
+  y = padd(y, p4f_1);
+
+  // build 2^n: (n + 0x7f) shifted left by 23 bits, reinterpreted as float below
+  emm0 = vec_cts(fx, 0);
+  emm0 = vec_add(emm0, p4i_0x7f);
+  emm0 = vec_sl(emm0, reinterpret_cast<Packet4ui>(p4i_23));
+
+  // Altivec's max & min operators just drop silent NaNs. Check NaNs in 
+  // inputs and return them unmodified.
+  Packet4ui isnumber_mask = reinterpret_cast<Packet4ui>(vec_cmpeq(_x, _x));
+  return vec_sel(_x, pmax(pmul(y, reinterpret_cast<Packet4f>(emm0)), _x),
+                 isnumber_mask);
+}
+
+#ifdef __VSX__
+// VSX support varies between different compilers and even different
+// versions of the same compiler.  For gcc version >= 4.9.3, we can use
+// vec_cts to efficiently convert Packet2d to Packet2l.  Otherwise, use
+// a slow fallback that converts each lane with a scalar cast.
+static inline Packet2l ConvertToPacket2l(const Packet2d& x) {
+#if EIGEN_GNUC_AT_LEAST(5, 0) || \
+    (EIGEN_GNUC_AT(4, 9) && __GNUC_PATCHLEVEL__ >= 3)
+  return vec_cts(x, 0);    // TODO: check clang version.
+#else
+  double tmp[2];
+  memcpy(tmp, &x, sizeof(tmp));  // copy the two lanes out of the vector
+  Packet2l l = { static_cast<long long>(tmp[0]),
+                 static_cast<long long>(tmp[1]) };
+  return l;
+#endif
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d pexp<Packet2d>(const Packet2d& _x)
+{
+  Packet2d x = _x;
+
+  _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
+  _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
+  _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
+  // clamping bounds applied to the argument below
+  _EIGEN_DECLARE_CONST_Packet2d(exp_hi,  709.437);
+  _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
+
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
+  // numerator coefficients of the rational approximation
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
+  // denominator coefficients of the rational approximation
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
+  // C1 + C2 is approximately ln 2; it is subtracted in two steps below
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
+
+  Packet2d tmp, fx;
+  Packet2l emm0;
+
+  // clamp x
+  x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half);
+  // n = floor(x * LOG2EF + 0.5)
+  fx = vec_floor(fx);
+  // g = x - n*C1 - n*C2
+  tmp = pmul(fx, p2d_cephes_exp_C1);
+  Packet2d z = pmul(fx, p2d_cephes_exp_C2);
+  x = psub(x, tmp);
+  x = psub(x, z);
+
+  Packet2d x2 = pmul(x,x);
+  // px = g * P(g^2)
+  Packet2d px = p2d_cephes_exp_p0;
+  px = pmadd(px, x2, p2d_cephes_exp_p1);
+  px = pmadd(px, x2, p2d_cephes_exp_p2);
+  px = pmul (px, x);
+  // qx = Q(g^2)
+  Packet2d qx = p2d_cephes_exp_q0;
+  qx = pmadd(qx, x2, p2d_cephes_exp_q1);
+  qx = pmadd(qx, x2, p2d_cephes_exp_q2);
+  qx = pmadd(qx, x2, p2d_cephes_exp_q3);
+  // result for g: 1 + 2*px/(qx - px)
+  x = pdiv(px,psub(qx,px));
+  x = pmadd(p2d_2,x,p2d_1);
+
+  // build 2^n
+  emm0 = ConvertToPacket2l(fx);
+
+#ifdef __POWER8_VECTOR__ 
+  static const Packet2l p2l_1023 = { 1023, 1023 };
+  static const Packet2ul p2ul_52 = { 52, 52 };
+  // (n + 1023) shifted left by 52 bits, done directly on 64-bit lanes
+  emm0 = vec_add(emm0, p2l_1023);
+  emm0 = vec_sl(emm0, p2ul_52);
+#else
+  // Code is a bit complex for POWER7.  There is actually a
+  // vec_xxsldi intrinsic but it is not supported by some gcc versions.
+  // So we shift (52-32) bits and do a word swap with zeros.
+  _EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
+  _EIGEN_DECLARE_CONST_Packet4i(20, 20);    // 52 - 32
+
+  Packet4i emm04i = reinterpret_cast<Packet4i>(emm0);
+  emm04i = vec_add(emm04i, p4i_1023);
+  emm04i = vec_sl(emm04i, reinterpret_cast<Packet4ui>(p4i_20));
+  static const Packet16uc perm = {
+    0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03, 
+    0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b };
+#ifdef  _BIG_ENDIAN
+  emm0 = reinterpret_cast<Packet2l>(vec_perm(p4i_ZERO, emm04i, perm));
+#else
+  emm0 = reinterpret_cast<Packet2l>(vec_perm(emm04i, p4i_ZERO, perm));
+#endif
+
+#endif
+
+  // Altivec's max & min operators just drop silent NaNs. Check NaNs in 
+  // inputs and return them unmodified.
+  Packet2ul isnumber_mask = reinterpret_cast<Packet2ul>(vec_cmpeq(_x, _x));
+  return vec_sel(_x, pmax(pmul(x, reinterpret_cast<Packet2d>(emm0)), _x),
+                 isnumber_mask);
+}
+#endif
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_ALTIVEC_H
diff --git a/nuparu/include/Eigen/src/Core/arch/AltiVec/PacketMath.h b/nuparu/include/Eigen/src/Core/arch/AltiVec/PacketMath.h
index e4089962..0dbbc2e4 100644
--- a/nuparu/include/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/nuparu/include/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008 Konstantinos Margaritis <markos@codex.gr>
+// Copyright (C) 2008-2014 Konstantinos Margaritis <markos@freevec.org>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -18,13 +18,17 @@ namespace internal {
 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
 #endif
 
-#ifndef EIGEN_HAS_FUSE_CJMADD
-#define EIGEN_HAS_FUSE_CJMADD 1
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
 #endif
 
 // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS  32
 #endif
 
 typedef __vector float          Packet4f;
@@ -46,49 +50,100 @@ typedef __vector unsigned char  Packet16uc;
 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
   Packet4f p4f_##NAME = pset1<Packet4f>(X)
 
-#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
-  Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X))
-
 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
   Packet4i p4i_##NAME = pset1<Packet4i>(X)
 
+#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
+  Packet2d p2d_##NAME = pset1<Packet2d>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \
+  Packet2l p2l_##NAME = pset1<Packet2l>(X)
+
+#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
+  const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
+
 #define DST_CHAN 1
 #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
 
+
+// These constants are endian-agnostic
+static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
+#ifndef __VSX__
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1}
+static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
+#endif
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16}
+static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
+static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
+
+static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
+static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
+
+static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
+static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
+
+// Mask alignment
+#ifdef __PPC64__
+#define _EIGEN_MASK_ALIGNMENT	0xfffffffffffffff0
+#else
+#define _EIGEN_MASK_ALIGNMENT	0xfffffff0
+#endif
+
+#define _EIGEN_ALIGNED_PTR(x)	((ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)
+
+// Handle endianness properly while loading constants
 // Define global static constants:
-static Packet4f p4f_COUNTDOWN = { 3.0, 2.0, 1.0, 0.0 };
-static Packet4i p4i_COUNTDOWN = { 3, 2, 1, 0 };
-static Packet16uc p16uc_REVERSE = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
-static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
-static Packet16uc p16uc_DUPLICATE = {0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7};
-
-static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0);
-static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0);
-static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1);
-static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16);
-static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1);
-static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0);
-static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1);
+#ifdef _BIG_ENDIAN
+static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0); 
+static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+static Packet16uc p16uc_PSET32_WODD   = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
+static Packet16uc p16uc_PSET32_WEVEN  = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
+static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8);      //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
+#else
+static Packet16uc p16uc_FORWARD = p16uc_REVERSE32; 
+static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
+static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
+static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO, 8);      //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
+#endif // _BIG_ENDIAN
+
+static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);     //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
+static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);     //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
+static Packet16uc p16uc_TRANSPOSE64_HI = vec_add(p16uc_PSET64_HI, p16uc_HALF64_0_16);                                         //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
+static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0_16);                                         //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
+
+static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8);                                         //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
+
+#ifdef _BIG_ENDIAN
+static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8);                                            //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+#else
+static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_LO, 8);                                            //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+#endif // _BIG_ENDIAN
 
 template<> struct packet_traits<float>  : default_packet_traits
 {
   typedef Packet4f type;
+  typedef Packet4f half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size=4,
+    HasHalfPacket=0,
 
     // FIXME check the Has*
+    HasDiv  = 1,
     HasSin  = 0,
     HasCos  = 0,
-    HasLog  = 0,
-    HasExp  = 0,
+    HasLog  = 1,
+    HasExp  = 1,
     HasSqrt = 0
   };
 };
 template<> struct packet_traits<int>    : default_packet_traits
 {
   typedef Packet4i type;
+  typedef Packet4i half;
   enum {
     // FIXME check the Has*
     Vectorizable = 1,
@@ -97,9 +152,22 @@ template<> struct packet_traits<int>    : default_packet_traits
   };
 };
 
-template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; };
-template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; };
-/*
+
+template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
+
+inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v)
+{
+  // Reinterpret the vector as 16 raw bytes via a union; each byte is printed
+  // as an integer followed by ", " (so the output ends with a trailing separator).
+  union { Packet16uc vec; unsigned char bytes[16]; } pun;
+  pun.vec = v;
+  for (int lane = 0; lane != 16; ++lane) {
+    s << static_cast<int>(pun.bytes[lane]) << ", ";
+  }
+  return s;
+}
+
 inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
 {
   union {
@@ -132,7 +200,7 @@ inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
   s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
   return s;
 }
-
+/*
 inline std::ostream & operator <<(std::ostream & s, const Packetbi & v)
 {
   union {
@@ -142,13 +210,21 @@ inline std::ostream & operator <<(std::ostream & s, const Packetbi & v)
   vt.v = v;
   s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
   return s;
-}
-*/
+}*/
+
+
+// Need to define them first or we get specialization after instantiation errors
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
+
+template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
+template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
+
 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) {
   // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
   float EIGEN_ALIGN16 af[4];
   af[0] = from;
-  Packet4f vc = vec_ld(0, af);
+  Packet4f vc = pload<Packet4f>(af);
   vc = vec_splat(vc, 0);
   return vc;
 }
@@ -156,13 +232,70 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) {
 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   {
   int EIGEN_ALIGN16 ai[4];
   ai[0] = from;
-  Packet4i vc = vec_ld(0, ai);
+  Packet4i vc = pload<Packet4i>(ai);
   vc = vec_splat(vc, 0);
   return vc;
 }
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4f>(const float *a,
+                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+  const Packet4f quad = pload<Packet4f>(a);  // one aligned load of a[0..3]
+  a0 = vec_splat(quad, 0);                   // then replicate each element across a whole packet
+  a1 = vec_splat(quad, 1);
+  a2 = vec_splat(quad, 2);
+  a3 = vec_splat(quad, 3);
+}
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4i>(const int *a,
+                      Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
+{
+  const Packet4i quad = pload<Packet4i>(a);  // one aligned load of a[0..3]
+  a0 = vec_splat(quad, 0);                   // then replicate each element across a whole packet
+  a1 = vec_splat(quad, 1);
+  a2 = vec_splat(quad, 2);
+  a3 = vec_splat(quad, 3);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return vec_add(pset1<Packet4f>(a), p4f_COUNTDOWN); }
-template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a)     { return vec_add(pset1<Packet4i>(a), p4i_COUNTDOWN); }
+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+{
+  // Copy the four strided scalars into an aligned scratch buffer,
+  // then bring them in with a single aligned packet load.
+  float EIGEN_ALIGN16 lanes[4];
+  for (int k = 0; k < 4; ++k)
+    lanes[k] = from[k*stride];
+  return pload<Packet4f>(lanes);
+}
+template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
+{
+  // Copy the four strided scalars into an aligned scratch buffer,
+  // then bring them in with a single aligned packet load.
+  int EIGEN_ALIGN16 lanes[4];
+  for (int k = 0; k < 4; ++k)
+    lanes[k] = from[k*stride];
+  return pload<Packet4i>(lanes);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+{
+  // Spill the packet to an aligned scratch buffer, then write each lane
+  // out to its strided destination.
+  float EIGEN_ALIGN16 lanes[4];
+  pstore<float>(lanes, from);
+  for (int k = 0; k < 4; ++k)
+    to[k*stride] = lanes[k];
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
+{
+  // Spill the packet to an aligned scratch buffer, then write each lane
+  // out to its strided destination.
+  int EIGEN_ALIGN16 lanes[4];
+  pstore<int>(lanes, from);
+  for (int k = 0; k < 4; ++k)
+    to[k*stride] = lanes[k];
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return vec_add(pset1<Packet4f>(a), p4f_COUNTDOWN); }
+template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)   { return vec_add(pset1<Packet4i>(a), p4i_COUNTDOWN); }
 
 template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_add(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_add(a,b); }
@@ -215,7 +348,8 @@ template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const
 */
 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
-  Packet4f t, y_0, y_1, res;
+#ifndef __VSX__  // VSX actually provides a div instruction
+  Packet4f t, y_0, y_1;
 
   // Altivec does not offer a divide instruction, we have to do a reciprocal approximation
   y_0 = vec_re(b);
@@ -224,8 +358,10 @@ template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const
   t   = vec_nmsub(y_0, b, p4f_ONE);
   y_1 = vec_madd(y_0, t, y_0);
 
-  res = vec_madd(a, y_1, p4f_ZERO);
-  return res;
+  return vec_madd(a, y_1, p4f_ZERO);
+#else
+  return vec_div(a, b);
+#endif
 }
 
 template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
@@ -243,7 +379,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const
 template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
 
-// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
 template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
 
@@ -256,13 +391,10 @@ template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const
 template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); }
 
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
-
+#ifdef _BIG_ENDIAN
 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
 {
   EIGEN_DEBUG_ALIGNED_LOAD
-  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
   Packet16uc MSQ, LSQ;
   Packet16uc mask;
   MSQ = vec_ld(0, (unsigned char *)from);          // most significant quadword
@@ -282,25 +414,36 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
   mask = vec_lvsl(0, from);                        // create the permute mask
   return (Packet4i) vec_perm(MSQ, LSQ, mask);    // align the data
 }
+#else
+// We also need to redefine little endian loading of Packet4i/Packet4f using VSX
+template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)  // unaligned load of 4 ints (little-endian path)
+{
+  EIGEN_DEBUG_UNALIGNED_LOAD  // this is the unaligned entry point, so count it as an unaligned load
+  return (Packet4i) vec_vsx_ld((long)from & 15, (const int*) _EIGEN_ALIGNED_PTR(from));  // low 4 address bits as byte offset + base from _EIGEN_ALIGNED_PTR (presumably `from` rounded down to 16 bytes)
+}
+template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)  // unaligned load of 4 floats (little-endian path)
+{
+  EIGEN_DEBUG_UNALIGNED_LOAD  // this is the unaligned entry point, so count it as an unaligned load
+  return (Packet4f) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from));  // low 4 address bits as byte offset + base from _EIGEN_ALIGNED_PTR (presumably `from` rounded down to 16 bytes)
+}
+#endif
 
 template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
 {
   Packet4f p;
-  if((ptrdiff_t(&from) % 16) == 0)  p = pload<Packet4f>(from);
-  else                              p = ploadu<Packet4f>(from);
-  return vec_perm(p, p, p16uc_DUPLICATE);
+  if((ptrdiff_t(from) % 16) == 0)  p = pload<Packet4f>(from);
+  else                             p = ploadu<Packet4f>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
 }
 template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
 {
   Packet4i p;
-  if((ptrdiff_t(&from) % 16) == 0)  p = pload<Packet4i>(from);
-  else                              p = ploadu<Packet4i>(from);
-  return vec_perm(p, p, p16uc_DUPLICATE);
+  if((ptrdiff_t(from) % 16) == 0)  p = pload<Packet4i>(from);
+  else                             p = ploadu<Packet4i>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
 }
 
-template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
-template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
-
+#ifdef _BIG_ENDIAN
 template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from)
 {
   EIGEN_DEBUG_UNALIGNED_STORE
@@ -337,15 +480,30 @@ template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*      to, const Packet4i& f
   vec_st( LSQ, 15, (unsigned char *)to );                   // Store the LSQ part first
   vec_st( MSQ, 0, (unsigned char *)to );                    // Store the MSQ part
 }
+#else
+// We also need to redefine little endian storing of Packet4i/Packet4f using VSX
+template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet4i& from)  // unaligned store of 4 ints (little-endian path)
+{
+  EIGEN_DEBUG_UNALIGNED_STORE  // matches the big-endian pstoreu: this is an unaligned store
+  vec_vsx_st(from, (long)to & 15, (int*) _EIGEN_ALIGNED_PTR(to));  // low 4 address bits as byte offset + base from _EIGEN_ALIGNED_PTR (presumably `to` rounded down to 16 bytes)
+}
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet4f& from)  // unaligned store of 4 floats (little-endian path)
+{
+  EIGEN_DEBUG_UNALIGNED_STORE  // matches the big-endian pstoreu: this is an unaligned store
+  vec_vsx_st(from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));  // low 4 address bits as byte offset + base from _EIGEN_ALIGNED_PTR (presumably `to` rounded down to 16 bytes)
+}
+#endif
 
+#ifndef __VSX__
 template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
+#endif
 
 template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
 template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int   EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
 
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return (Packet4f)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); }
-template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return (Packet4i)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); }
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return (Packet4f)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE32); }
+template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return (Packet4i)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE32); }
 
 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); }
 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
@@ -392,7 +550,11 @@ template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
 {
   Packet4i sum;
   sum = vec_sums(a, p4i_ZERO);
+#ifdef _BIG_ENDIAN
   sum = vec_sld(sum, p4i_ZERO, 12);
+#else
+  sum = vec_sld(p4i_ZERO, sum, 4);
+#endif
   return pfirst(sum);
 }
 
@@ -479,8 +641,25 @@ struct palign_impl<Offset,Packet4f>
 {
   static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
   {
-    if (Offset!=0)
-      first = vec_sld(first, second, Offset*4);
+#ifdef _BIG_ENDIAN
+    switch (Offset % 4) {
+    case 1:
+      first = vec_sld(first, second, 4); break;
+    case 2:
+      first = vec_sld(first, second, 8); break;
+    case 3:
+      first = vec_sld(first, second, 12); break;
+    }
+#else
+    switch (Offset % 4) {
+    case 1:
+      first = vec_sld(second, first, 12); break;
+    case 2:
+      first = vec_sld(second, first, 8); break;
+    case 3:
+      first = vec_sld(second, first, 4); break;
+    }
+#endif
   }
 };
 
@@ -489,11 +668,270 @@ struct palign_impl<Offset,Packet4i>
 {
   static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
   {
-    if (Offset!=0)
-      first = vec_sld(first, second, Offset*4);
+#ifdef _BIG_ENDIAN
+    switch (Offset % 4) {
+    case 1:
+      first = vec_sld(first, second, 4); break;
+    case 2:
+      first = vec_sld(first, second, 8); break;
+    case 3:
+      first = vec_sld(first, second, 12); break;
+    }
+#else
+    switch (Offset % 4) {
+    case 1:
+      first = vec_sld(second, first, 12); break;
+    case 2:
+      first = vec_sld(second, first, 8); break;
+    case 3:
+      first = vec_sld(second, first, 4); break;
+    }
+#endif
   }
 };
 
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4f,4>& kernel) {
+  // Two rounds of merges: first interleave rows 0/2 and rows 1/3, then interleave those results.
+  const Packet4f r02_hi = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  const Packet4f r02_lo = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  const Packet4f r13_hi = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  const Packet4f r13_lo = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(r02_hi, r13_hi);
+  kernel.packet[1] = vec_mergel(r02_hi, r13_hi);
+  kernel.packet[2] = vec_mergeh(r02_lo, r13_lo);
+  kernel.packet[3] = vec_mergel(r02_lo, r13_lo);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4i,4>& kernel) {
+  // Two rounds of merges: first interleave rows 0/2 and rows 1/3, then interleave those results.
+  const Packet4i r02_hi = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  const Packet4i r02_lo = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  const Packet4i r13_hi = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  const Packet4i r13_lo = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(r02_hi, r13_hi);
+  kernel.packet[1] = vec_mergel(r02_hi, r13_hi);
+  kernel.packet[2] = vec_mergeh(r02_lo, r13_lo);
+  kernel.packet[3] = vec_mergel(r02_lo, r13_lo);
+}
+
+
+//---------- double ----------
+#ifdef __VSX__
+typedef __vector double              Packet2d;
+typedef __vector unsigned long long  Packet2ul;
+typedef __vector long long           Packet2l;
+
+static Packet2l p2l_ZERO = (Packet2l) p4i_ZERO;
+static Packet2d p2d_ONE = { 1.0, 1.0 }; 
+static Packet2d p2d_ZERO = (Packet2d) p4f_ZERO;
+static Packet2d p2d_ZERO_ = { -0.0, -0.0 };
+
+#ifdef _BIG_ENDIAN
+static Packet2d p2d_COUNTDOWN = (Packet2d) vec_sld((Packet16uc) p2d_ZERO, (Packet16uc) p2d_ONE, 8);
+#else
+static Packet2d p2d_COUNTDOWN = (Packet2d) vec_sld((Packet16uc) p2d_ONE, (Packet16uc) p2d_ZERO, 8);
+#endif
+
+static EIGEN_STRONG_INLINE Packet2d vec_splat_dbl(Packet2d& a, int index)
+{
+  // Replicate one double of `a` into both lanes via a byte permute;
+  // an index other than 0 or 1 returns `a` unchanged.
+  if (index == 0)
+    return (Packet2d) vec_perm(a, a, p16uc_PSET64_HI);
+  if (index == 1)
+    return (Packet2d) vec_perm(a, a, p16uc_PSET64_LO);
+  return a;
+}
+
+template<> struct packet_traits<double> : default_packet_traits
+{
+  typedef Packet2d type;
+  typedef Packet2d half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=2,
+    HasHalfPacket = 0,
+
+    HasDiv  = 1,
+    HasExp  = 1,
+    HasSqrt = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
+
+
+inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
+{
+  union {
+    Packet2d   v;
+    double n[2];
+  } vt;
+  vt.v = v;
+  s << vt.n[0] << ", " << vt.n[1];
+  return s;
+}
+
+// Need to define them first or we get specialization after instantiation errors
+template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return (Packet2d) vec_ld(0, (const float *) from); } //FIXME
+
+template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st((Packet4f)from, 0, (float *)to); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) {
+  double EIGEN_ALIGN16 af[2];
+  af[0] = from;
+  Packet2d vc = pload<Packet2d>(af);
+  vc = vec_splat_dbl(vc, 0);
+  return vc;
+}
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet2d>(const double *a,
+                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
+{
+  Packet2d first_pair = pload<Packet2d>(a);     // a[0], a[1]
+  a0 = vec_splat_dbl(first_pair, 0);
+  a1 = vec_splat_dbl(first_pair, 1);
+  Packet2d second_pair = pload<Packet2d>(a+2);  // a[2], a[3]
+  a2 = vec_splat_dbl(second_pair, 0);
+  a3 = vec_splat_dbl(second_pair, 1);
+}
+template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
+{
+  double EIGEN_ALIGN16 lanes[2];  // aligned staging buffer for the packet load below
+  lanes[0] = from[0];             // element 0*stride
+  lanes[1] = from[stride];        // element 1*stride
+  return pload<Packet2d>(lanes);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
+{
+  double EIGEN_ALIGN16 lanes[2];  // aligned staging buffer the packet is spilled into
+  pstore<double>(lanes, from);
+  to[0]      = lanes[0];          // element 0*stride
+  to[stride] = lanes[1];          // element 1*stride
+}
+template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return vec_add(pset1<Packet2d>(a), p2d_COUNTDOWN); }
+
+template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_add(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_sub(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return psub<Packet2d>(p2d_ZERO, a); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_madd(a,b,p2d_ZERO); }
+template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_div(a,b); }
+
+// for some weird reasons, it has to be overloaded for packet of integers
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
+
+template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)  // unaligned load of 2 doubles
+{
+  EIGEN_DEBUG_UNALIGNED_LOAD  // this is the unaligned entry point, so count it as an unaligned load
+  return (Packet2d) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from));  // loaded through a float view, then cast back to a double packet
+}
+template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
+{
+  Packet2d p;
+  if((ptrdiff_t(from) % 16) == 0)  p = pload<Packet2d>(from);
+  else                             p = ploadu<Packet2d>(from);
+  return vec_perm(p, p, p16uc_PSET64_HI);
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d& from)  // unaligned store of 2 doubles
+{
+  EIGEN_DEBUG_UNALIGNED_STORE  // this is the unaligned entry point, so count it as an unaligned store
+  vec_vsx_st((Packet4f)from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));  // stored through a float view of the same 16 bytes
+}
+
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { vec_dstt((const float *) addr, DST_CTRL(2,2,32), DST_CHAN); }
+
+template<> EIGEN_STRONG_INLINE double  pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; }
+
+template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return (Packet2d)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE64); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); }
+
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
+{
+  // Shifting a:a by 8 bytes swaps the two doubles, so each lane of the sum holds a[0]+a[1].
+  const Packet2d swapped = (Packet2d) vec_sld((Packet4ui) a, (Packet4ui)a, 8);
+  const Packet2d total   = vec_add(a, swapped);
+  return pfirst(total);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
+{
+  // partial[k] is vecs[k] added to its lane-swapped copy, i.e. the sum of vecs[k] in both lanes.
+  Packet2d partial[2], result;
+  partial[0] = vec_add(vecs[0], (Packet2d) vec_sld((Packet4ui) vecs[0], (Packet4ui) vecs[0], 8));
+  partial[1] = vec_add(vecs[1], (Packet2d) vec_sld((Packet4ui) vecs[1], (Packet4ui) vecs[1], 8));
+#ifdef _BIG_ENDIAN
+  result = (Packet2d) vec_sld((Packet4ui) partial[0], (Packet4ui) partial[1], 8);
+#else
+  result = (Packet2d) vec_sld((Packet4ui) partial[1], (Packet4ui) partial[0], 8);
+#endif
+  // result now carries one lane from each partial sum; operand order depends on endianness.
+  return result;
+}
+// Other reduction functions:
+// mul
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
+{
+  return pfirst(pmul(a, (Packet2d)vec_sld((Packet4ui) a, (Packet4ui) a, 8)));
+}
+
+// min
+template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
+{
+  return pfirst(vec_min(a, (Packet2d) vec_sld((Packet4ui) a, (Packet4ui) a, 8)));
+}
+
+// max
+template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
+{
+  return pfirst(vec_max(a, (Packet2d) vec_sld((Packet4ui) a, (Packet4ui) a, 8)));
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet2d>
+{
+  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
+  {
+    if (Offset == 1)
+#ifdef _BIG_ENDIAN
+      first = (Packet2d) vec_sld((Packet4ui) first, (Packet4ui) second, 8);
+#else
+      first = (Packet2d) vec_sld((Packet4ui) second, (Packet4ui) first, 8);
+#endif
+  }
+};
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2d,2>& kernel) {
+  // Both permutes read the untouched inputs; the results are written back only afterwards.
+  const Packet2d row_hi = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
+  const Packet2d row_lo = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
+  kernel.packet[0] = row_hi;
+  kernel.packet[1] = row_lo;
+}
+
+#endif // __VSX__
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Core/arch/CMakeLists.txt b/nuparu/include/Eigen/src/Core/arch/CMakeLists.txt
index 8456dec1..42b0b486 100644
--- a/nuparu/include/Eigen/src/Core/arch/CMakeLists.txt
+++ b/nuparu/include/Eigen/src/Core/arch/CMakeLists.txt
@@ -1,4 +1,9 @@
-ADD_SUBDIRECTORY(SSE)
 ADD_SUBDIRECTORY(AltiVec)
-ADD_SUBDIRECTORY(NEON)
+ADD_SUBDIRECTORY(AVX)
+ADD_SUBDIRECTORY(CUDA)
 ADD_SUBDIRECTORY(Default)
+ADD_SUBDIRECTORY(NEON)
+ADD_SUBDIRECTORY(SSE)
+
+
+
diff --git a/nuparu/include/Eigen/src/Core/arch/CUDA/CMakeLists.txt b/nuparu/include/Eigen/src/Core/arch/CUDA/CMakeLists.txt
new file mode 100644
index 00000000..7ba28da7
--- /dev/null
+++ b/nuparu/include/Eigen/src/Core/arch/CUDA/CMakeLists.txt
@@ -0,0 +1,6 @@
+FILE(GLOB Eigen_Core_arch_CUDA_SRCS "*.h")
+
+INSTALL(FILES
+  ${Eigen_Core_arch_CUDA_SRCS}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/CUDA COMPONENT Devel
+)
diff --git a/nuparu/include/Eigen/src/Core/arch/CUDA/MathFunctions.h b/nuparu/include/Eigen/src/Core/arch/CUDA/MathFunctions.h
new file mode 100644
index 00000000..ecd5c444
--- /dev/null
+++ b/nuparu/include/Eigen/src/Core/arch/CUDA/MathFunctions.h
@@ -0,0 +1,112 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H
+#define EIGEN_MATH_FUNCTIONS_CUDA_H
+
+namespace Eigen {
+
+namespace internal {
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 plog<float4>(const float4& a)
+{
+  return make_float4(logf(a.x), logf(a.y), logf(a.z), logf(a.w));
+}
+
+template<>  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 plog<double2>(const double2& a)
+{
+  return make_double2(log(a.x), log(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pexp<float4>(const float4& a)
+{
+  return make_float4(expf(a.x), expf(a.y), expf(a.z), expf(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pexp<double2>(const double2& a)
+{
+  return make_double2(exp(a.x), exp(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 psqrt<float4>(const float4& a)
+{
+  return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 psqrt<double2>(const double2& a)
+{
+  return make_double2(sqrt(a.x), sqrt(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 prsqrt<float4>(const float4& a)
+{
+  return make_float4(rsqrtf(a.x), rsqrtf(a.y), rsqrtf(a.z), rsqrtf(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 prsqrt<double2>(const double2& a)
+{
+  return make_double2(rsqrt(a.x), rsqrt(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 plgamma<float4>(const float4& a)
+{
+  return make_float4(lgammaf(a.x), lgammaf(a.y), lgammaf(a.z), lgammaf(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 plgamma<double2>(const double2& a)
+{
+  return make_double2(lgamma(a.x), lgamma(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 perf<float4>(const float4& a)  // component-wise error function on a float4
+{
+  return make_float4(erff(a.x), erff(a.y), erff(a.z), erff(a.w));  // explicit single-precision erff, like logf/expf/sqrtf in the other float4 functions
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 perf<double2>(const double2& a)
+{
+  return make_double2(erf(a.x), erf(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 perfc<float4>(const float4& a)  // component-wise complementary error function on a float4
+{
+  return make_float4(erfcf(a.x), erfcf(a.y), erfcf(a.z), erfcf(a.w));  // explicit single-precision erfcf, like logf/expf/sqrtf in the other float4 functions
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 perfc<double2>(const double2& a)
+{
+  return make_double2(erfc(a.x), erfc(a.y));
+}
+
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATH_FUNCTIONS_CUDA_H
diff --git a/nuparu/include/Eigen/src/Core/arch/CUDA/PacketMath.h b/nuparu/include/Eigen/src/Core/arch/CUDA/PacketMath.h
new file mode 100644
index 00000000..cb1b547e
--- /dev/null
+++ b/nuparu/include/Eigen/src/Core/arch/CUDA/PacketMath.h
@@ -0,0 +1,309 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_CUDA_H
+#define EIGEN_PACKET_MATH_CUDA_H
+
+namespace Eigen {
+
+namespace internal {
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+template<> struct is_arithmetic<float4>  { enum { value = true }; };
+template<> struct is_arithmetic<double2> { enum { value = true }; };
+
+
+template<> struct packet_traits<float> : default_packet_traits
+{
+  typedef float4 type;
+  typedef float4 half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=4,
+    HasHalfPacket = 0,
+
+    HasDiv  = 1,
+    HasSin  = 0,
+    HasCos  = 0,
+    HasLog  = 1,
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasLGamma = 1,
+    HasErf = 1,
+    HasErfc = 1,
+
+    HasBlend = 0,
+  };
+};
+
+template<> struct packet_traits<double> : default_packet_traits
+{
+  typedef double2 type;
+  typedef double2 half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=2,
+    HasHalfPacket = 0,
+
+    HasDiv  = 1,
+    HasLog  = 1,
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasLGamma = 1,
+    HasErf = 1,
+    HasErfc = 1,
+
+    HasBlend = 0,
+  };
+};
+
+
+template<> struct unpacket_traits<float4>  { typedef float  type; enum {size=4, alignment=Aligned16}; typedef float4 half; };
+template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16}; typedef double2 half; };
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float&  from) {
+  return make_float4(from, from, from, from);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) {
+  return make_double2(from, from);
+}
+
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
+  return make_float4(a, a+1, a+2, a+3);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(const double& a) {
+  return make_double2(a, a+1);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x+b.x, a.y+b.y);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x-b.x, a.y-b.y);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
+  return make_float4(-a.x, -a.y, -a.z, -a.w);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
+  return make_double2(-a.x, -a.y);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x*b.x, a.y*b.y);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x/b.x, a.y/b.y);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
+  return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
+  return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
+  return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
+  return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
+  return *reinterpret_cast<const float4*>(from);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
+  return *reinterpret_cast<const double2*>(from);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
+  return make_float4(from[0], from[1], from[2], from[3]);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
+  return make_double2(from[0], from[1]);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float*   from) { // EIGEN_DEVICE_FUNC: callable from device code like the other loads
+  return make_float4(from[0], from[0], from[1], from[1]); // reads two scalars and duplicates each: {f0, f0, f1, f1}
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double*  from) { // EIGEN_DEVICE_FUNC: callable from device code like the other loads
+  return make_double2(from[0], from[0]); // reads one scalar and duplicates it into both lanes
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float*   to, const float4& from) {
+  *reinterpret_cast<float4*>(to) = from;
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
+  *reinterpret_cast<double2*>(to) = from;
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const float4& from) {
+  to[0] = from.x;
+  to[1] = from.y;
+  to[2] = from.z;
+  to[3] = from.w;
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
+  to[0] = from.x;
+  to[1] = from.y;
+}
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
+  return __ldg((const float4*)from);
+}
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
+  return __ldg((const double2*)from);
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
+  return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
+}
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
+  return make_double2(__ldg(from+0), __ldg(from+1));
+}
+#endif
+
+template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
+  return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
+  return make_double2(from[0*stride], from[1*stride]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
+  to[stride*0] = from.x;
+  to[stride*1] = from.y;
+  to[stride*2] = from.z;
+  to[stride*3] = from.w;
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
+  to[stride*0] = from.x;
+  to[stride*1] = from.y;
+}
+
+template<> EIGEN_DEVICE_FUNC inline float  pfirst<float4>(const float4& a) {
+  return a.x;
+}
+template<> EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
+  return a.x;
+}
+
+template<> EIGEN_DEVICE_FUNC inline float  predux<float4>(const float4& a) {
+  return a.x + a.y + a.z + a.w;
+}
+template<> EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) {
+  return a.x + a.y;
+}
+
+template<> EIGEN_DEVICE_FUNC inline float  predux_max<float4>(const float4& a) {
+  return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
+}
+template<> EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) {
+  return fmax(a.x, a.y);
+}
+
+template<> EIGEN_DEVICE_FUNC inline float  predux_min<float4>(const float4& a) {
+  return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
+}
+template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) {
+  return fmin(a.x, a.y);
+}
+
+template<> EIGEN_DEVICE_FUNC inline float  predux_mul<float4>(const float4& a) {
+  return a.x * a.y * a.z * a.w;
+}
+template<> EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) {
+  return a.x * a.y;
+}
+
+template<> EIGEN_DEVICE_FUNC inline float4  pabs<float4>(const float4& a) {
+  return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
+}
+template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
+  return make_double2(fabs(a.x), fabs(a.y));
+}
+
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<float4,4>& kernel) { // in-place 4x4 transpose: swaps each element pair across the diagonal
+  float tmp = kernel.packet[0].y; // lanes are float, so the scratch value is float too (no round trip through double)
+  kernel.packet[0].y = kernel.packet[1].x;
+  kernel.packet[1].x = tmp;
+
+  tmp = kernel.packet[0].z;
+  kernel.packet[0].z = kernel.packet[2].x;
+  kernel.packet[2].x = tmp;
+
+  tmp = kernel.packet[0].w;
+  kernel.packet[0].w = kernel.packet[3].x;
+  kernel.packet[3].x = tmp;
+
+  tmp = kernel.packet[1].z;
+  kernel.packet[1].z = kernel.packet[2].y;
+  kernel.packet[2].y = tmp;
+
+  tmp = kernel.packet[1].w;
+  kernel.packet[1].w = kernel.packet[3].y;
+  kernel.packet[3].y = tmp;
+
+  tmp = kernel.packet[2].w;
+  kernel.packet[2].w = kernel.packet[3].z;
+  kernel.packet[3].z = tmp;
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<double2,2>& kernel) { // in-place 2x2 transpose of two double2 packets
+  double tmp = kernel.packet[0].y; // only the off-diagonal pair (packet[0].y, packet[1].x) has to be swapped
+  kernel.packet[0].y = kernel.packet[1].x;
+  kernel.packet[1].x = tmp;
+}
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+
+#endif // EIGEN_PACKET_MATH_CUDA_H
diff --git a/nuparu/include/Eigen/src/Core/arch/NEON/Complex.h b/nuparu/include/Eigen/src/Core/arch/NEON/Complex.h
index f183d31d..d2d46793 100644
--- a/nuparu/include/Eigen/src/Core/arch/NEON/Complex.h
+++ b/nuparu/include/Eigen/src/Core/arch/NEON/Complex.h
@@ -28,10 +28,12 @@ struct Packet2cf
 template<> struct packet_traits<std::complex<float> >  : default_packet_traits
 {
   typedef Packet2cf type;
+  typedef Packet2cf half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
+    HasHalfPacket = 0,
 
     HasAdd    = 1,
     HasSub    = 1,
@@ -46,7 +48,7 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
   };
 };
 
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
 
 template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
 {
@@ -71,7 +73,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
 
   // Get the real values of a | a1_re | a1_re | a2_re | a2_re |
   v1 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 0), vdup_lane_f32(vget_high_f32(a.v), 0));
-  // Get the real values of a | a1_im | a1_im | a2_im | a2_im |
+  // Get the imag values of a | a1_im | a1_im | a2_im | a2_im |
   v2 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 1), vdup_lane_f32(vget_high_f32(a.v), 1));
   // Multiply the real a with b
   v1 = vmulq_f32(v1, b.v);
@@ -87,7 +89,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
 
 template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
-  return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
+  return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
 }
 template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
@@ -110,7 +112,23 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<
 template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
 
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { __pld((float *)addr); }
+template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
+{ // Strided load: builds one Packet2cf from from[0] and from[stride].
+  Packet4f res = pset1<Packet4f>(0.f); // start from zeros; all four lanes are overwritten below
+  res = vsetq_lane_f32(std::real(from[0*stride]), res, 0); // lane 0: real part of the first element
+  res = vsetq_lane_f32(std::imag(from[0*stride]), res, 1); // lane 1: imaginary part of the first element
+  res = vsetq_lane_f32(std::real(from[1*stride]), res, 2); // lane 2: real part of the second element
+  res = vsetq_lane_f32(std::imag(from[1*stride]), res, 3); // lane 3: imaginary part of the second element
+  return Packet2cf(res);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
+{ // Strided store: writes the two complex values of the packet to to[0] and to[stride].
+  to[stride*0] = std::complex<float>(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1)); // lanes 0/1 -> (real, imag) of element 0
+  to[stride*1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3)); // lanes 2/3 -> (real, imag) of element 1
+}
+
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { EIGEN_ARM_PREFETCH((float *)addr); }
 
 template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
 {
@@ -235,7 +253,7 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
 
 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
-  // TODO optimize it for AltiVec
+  // TODO optimize it for NEON
   Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
   Packet4f s, rev_s;
 
@@ -246,6 +264,201 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con
   return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s)));
 }
 
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2cf,2>& kernel) { // in-place 2x2 transpose of complex<float> values held in two packets
+  Packet4f tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v)); // high halves of both packets; saved before packet[0] is overwritten
+  kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v)); // low halves of both packets become the new packet[0]
+  kernel.packet[1].v = tmp;
+}
+
+//---------- double ----------
+#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
+
+static uint64x2_t p2ul_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x0, 0x8000000000000000);
+
+struct Packet1cd
+{
+  EIGEN_STRONG_INLINE Packet1cd() {}
+  EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
+  Packet2d v;
+};
+
+template<> struct packet_traits<std::complex<double> >  : default_packet_traits
+{
+  typedef Packet1cd type;
+  typedef Packet1cd half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 0,
+    size = 1,
+    HasHalfPacket = 0,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
+
+template<> EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
+{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(padd<Packet2d>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(psub<Packet2d>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate<Packet2d>(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR))); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{ // Complex product a*b of one complex<double> stored as (re, im) in a Packet2d.
+  Packet2d v1, v2;
+
+  // Broadcast the real part of a into both lanes: (a.re, a.re)
+  v1 = vdupq_lane_f64(vget_low_f64(a.v), 0);
+  // Broadcast the imaginary part of a into both lanes: (a.im, a.im)
+  v2 = vdupq_lane_f64(vget_high_f64(a.v), 0);
+  // v1 = (a.re*b.re, a.re*b.im)
+  v1 = vmulq_f64(v1, b.v);
+  // v2 = (a.im*b.re, a.im*b.im)
+  v2 = vmulq_f64(v2, b.v);
+  // Flip the sign bit of lane 1 (p2ul_CONJ_XOR only sets that bit): v2 = (a.im*b.re, -a.im*b.im)
+  v2 = vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(v2), p2ul_CONJ_XOR));
+  // Swap the two lanes of v2: v2 = (-a.im*b.im, a.im*b.re)
+  v2 = preverse<Packet2d>(v2);
+  // Sum: (a.re*b.re - a.im*b.im, a.re*b.im + a.im*b.re)
+  return Packet1cd(vaddq_f64(v1, v2));
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
+}
+template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
+}
+template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
+}
+template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { EIGEN_ARM_PREFETCH((double *)addr); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
+{
+  Packet2d res = pset1<Packet2d>(0.0);
+  res = vsetq_lane_f64(std::real(from[0*stride]), res, 0);
+  res = vsetq_lane_f64(std::imag(from[0*stride]), res, 1);
+  return Packet1cd(res);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride)
+{
+  to[stride*0] = std::complex<double>(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1));
+}
+
+
+template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
+{
+  std::complex<double> EIGEN_ALIGN16 res;
+  pstore<std::complex<double> >(&res, a);
+
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs) { return vecs[0]; }
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
+
+template<int Offset>
+struct palign_impl<Offset,Packet1cd>
+{
+  static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
+  {
+    // FIXME is it sure we never have to align a Packet1cd?
+    // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
+  }
+};
+
+template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+  {
+    return internal::pmul(a, pconj(b));
+  }
+};
+
+template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+  {
+    return internal::pmul(pconj(a), b);
+  }
+};
+
+template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+  {
+    return pconj(internal::pmul(a, b));
+  }
+};
+
+template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{ // Complex division computed as a * conj(b) / (b.re^2 + b.im^2).
+  // TODO optimize it for NEON
+  Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b); // a * conj(b), see the <false,true> conj_helper
+  Packet2d s = pmul<Packet2d>(b.v, b.v); // (b.re^2, b.im^2)
+  Packet2d rev_s = preverse<Packet2d>(s); // lanes swapped: (b.im^2, b.re^2)
+
+  return Packet1cd(pdiv(res.v, padd<Packet2d>(s,rev_s))); // s + rev_s holds b.re^2 + b.im^2 in both lanes
+}
+
+EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
+{
+  return Packet1cd(preverse(Packet2d(x.v)));
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
+{
+  Packet2d tmp = vcombine_f64(vget_high_f64(kernel.packet[0].v), vget_high_f64(kernel.packet[1].v));
+  kernel.packet[0].v = vcombine_f64(vget_low_f64(kernel.packet[0].v), vget_low_f64(kernel.packet[1].v));
+  kernel.packet[1].v = tmp;
+}
+#endif // EIGEN_ARCH_ARM64
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Core/arch/NEON/MathFunctions.h b/nuparu/include/Eigen/src/Core/arch/NEON/MathFunctions.h
new file mode 100644
index 00000000..6bb05bb9
--- /dev/null
+++ b/nuparu/include/Eigen/src/Core/arch/NEON/MathFunctions.h
@@ -0,0 +1,91 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* The sin, cos, exp, and log functions of this file come from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
+#ifndef EIGEN_MATH_FUNCTIONS_NEON_H
+#define EIGEN_MATH_FUNCTIONS_NEON_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f pexp<Packet4f>(const Packet4f& _x) // lane-wise single-precision exp on four floats
+{
+  Packet4f x = _x;
+  Packet4f tmp, fx;
+
+  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); // 127, added to n before it is shifted into bit 23 and up
+  _EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f); // upper clamp applied to the input
+  _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); // lower clamp applied to the input
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); // C1 + C2 = 0.69314718..., i.e. log(2) split in two parts
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); // p0..p5: polynomial coefficients used below
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
+
+  x = vminq_f32(x, p4f_exp_hi); // clamp x into [exp_lo, exp_hi]
+  x = vmaxq_f32(x, p4f_exp_lo);
+
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = vmlaq_f32(p4f_half, x, p4f_cephes_LOG2EF); // fx = 0.5 + x * LOG2EF
+
+  /* perform a floorf: convert to int and back, then correct lanes where that result exceeds fx */
+  tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx));
+
+  /* if greater, subtract 1 */
+  Packet4ui mask = vcgtq_f32(tmp, fx);
+  mask = vandq_u32(mask, vreinterpretq_u32_f32(p4f_1)); // bits of 1.0f where tmp > fx, zero elsewhere
+
+  fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); // fx now holds n as a float
+
+  tmp = vmulq_f32(fx, p4f_cephes_exp_C1);
+  Packet4f z = vmulq_f32(fx, p4f_cephes_exp_C2);
+  x = vsubq_f32(x, tmp); // x = x - n*C1 - n*C2, subtracted in two steps
+  x = vsubq_f32(x, z);
+
+  Packet4f y = vmulq_f32(p4f_cephes_exp_p0, x); // Horner evaluation of the polynomial p0..p5 in x
+  z = vmulq_f32(x, x);
+  y = vaddq_f32(y, p4f_cephes_exp_p1);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, p4f_cephes_exp_p2);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, p4f_cephes_exp_p3);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, p4f_cephes_exp_p4);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, p4f_cephes_exp_p5);
+
+  y = vmulq_f32(y, z); // y = poly(x) * x^2 + x + 1
+  y = vaddq_f32(y, x);
+  y = vaddq_f32(y, p4f_1);
+
+  /* build 2^n: (n + 0x7f) shifted left by 23 bits, then reinterpreted as float */
+  int32x4_t mm;
+  mm = vcvtq_s32_f32(fx);
+  mm = vaddq_s32(mm, p4i_0x7f);
+  mm = vshlq_n_s32(mm, 23);
+  Packet4f pow2n = vreinterpretq_f32_s32(mm);
+
+  y = vmulq_f32(y, pow2n); // scale the polynomial result by 2^n
+  return y;
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATH_FUNCTIONS_NEON_H
diff --git a/nuparu/include/Eigen/src/Core/arch/NEON/PacketMath.h b/nuparu/include/Eigen/src/Core/arch/NEON/PacketMath.h
index 163bac21..fc4c0d03 100644
--- a/nuparu/include/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/nuparu/include/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -20,14 +20,24 @@ namespace internal {
 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
 #endif
 
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
+#endif
+
 // FIXME NEON has 16 quad registers, but since the current register allocator
 // is so bad, it is much better to reduce it to 8
 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 8
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 
 #endif
 
+typedef float32x2_t Packet2f;
 typedef float32x4_t Packet4f;
 typedef int32x4_t   Packet4i;
+typedef int32x2_t   Packet2i;
 typedef uint32x4_t  Packet4ui;
 
 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
@@ -39,7 +49,7 @@ typedef uint32x4_t  Packet4ui;
 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
   const Packet4i p4i_##NAME = pset1<Packet4i>(X)
 
-#if defined(__llvm__) && !defined(__clang__)
+#if EIGEN_COMP_LLVM && !EIGEN_COMP_CLANG
   //Special treatment for Apple's llvm-gcc, its NEON packet types are unions
   #define EIGEN_INIT_NEON_PACKET2(X, Y)       {{X, Y}}
   #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {{X, Y, Z, W}}
@@ -48,59 +58,74 @@ typedef uint32x4_t  Packet4ui;
   #define EIGEN_INIT_NEON_PACKET2(X, Y)       {X, Y}
   #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {X, Y, Z, W}
 #endif
-    
-#ifndef __pld
-#define __pld(x) asm volatile ( "   pld [%[addr]]\n" :: [addr] "r" (x) : "cc" );
+
+
+// arm64 does not have the pld instruction. If available, let's trust the __builtin_prefetch built-in function,
+// which is available on LLVM and GCC (at least).
+#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
+  #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
+#elif defined __pld
+  #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
+#elif !EIGEN_ARCH_ARM64
+  #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ( "   pld [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
+#else
+  // by default no explicit prefetching
+  #define EIGEN_ARM_PREFETCH(ADDR)
 #endif
 
 template<> struct packet_traits<float>  : default_packet_traits
 {
   typedef Packet4f type;
+  typedef Packet4f half; // Packet2f intrinsics not implemented yet
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
+    HasHalfPacket=0, // Packet2f intrinsics not implemented yet
    
     HasDiv  = 1,
     // FIXME check the Has*
     HasSin  = 0,
     HasCos  = 0,
     HasLog  = 0,
-    HasExp  = 0,
+    HasExp  = 1,
     HasSqrt = 0
   };
 };
 template<> struct packet_traits<int>    : default_packet_traits
 {
   typedef Packet4i type;
+  typedef Packet4i half; // Packet2i intrinsics not implemented yet
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size=4
+    size=4,
+    HasHalfPacket=0 // Packet2i intrinsics not implemented yet
     // FIXME check the Has*
   };
 };
 
-#if EIGEN_GNUC_AT_MOST(4,4) && !defined(__llvm__)
+#if EIGEN_GNUC_AT_MOST(4,4) && !EIGEN_COMP_LLVM
 // workaround gcc 4.2, 4.3 and 4.4 compilatin issue
 EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); }
 EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); }
+EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32 (const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); }
 EIGEN_STRONG_INLINE void        vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); }
 EIGEN_STRONG_INLINE void        vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
 #endif
 
-template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; };
-template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; };
+template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
 
 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return vdupq_n_f32(from); }
 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   { return vdupq_n_s32(from); }
 
-template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a)
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
 {
   Packet4f countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
   return vaddq_f32(pset1<Packet4f>(a), countdown);
 }
-template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a)
+template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)
 {
   Packet4i countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
   return vaddq_s32(pset1<Packet4i>(a), countdown);
@@ -123,6 +148,9 @@ template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const
 
 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
+#if EIGEN_ARCH_ARM64
+  return vdivq_f32(a,b);
+#else
   Packet4f inv, restep, div;
 
   // NEON does not offer a divide instruction, we have to do a reciprocal approximation
@@ -141,14 +169,27 @@ template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const
   div = vmulq_f32(a, inv);
 
   return div;
+#endif
 }
+
 template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
 { eigen_assert(false && "packet integer division are not supported by NEON");
   return pset1<Packet4i>(0);
 }
 
-// for some weird raisons, it has to be overloaded for packet of integers
+#ifdef __ARM_FEATURE_FMA
+// See bug 936.
+// FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4.
+// FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding.
+// MLA is not fused i.e. does 2 roundings.
+// In addition to giving better accuracy, FMA also gives better performance here on a Krait (Nexus 4):
+// MLA: 10 GFlop/s ; FMA: 12 GFlops/s.
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); }
+#else
 template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vmlaq_f32(c,a,b); }
+#endif
+
+// No FMA instruction for int, so use MLA unconditionally.
 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); }
 
 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
@@ -209,8 +250,42 @@ template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& f
 template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*      to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }
 
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { __pld(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr) { __pld(addr); }
+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+{
+  Packet4f res = pset1<Packet4f>(0.f);
+  res = vsetq_lane_f32(from[0*stride], res, 0);
+  res = vsetq_lane_f32(from[1*stride], res, 1);
+  res = vsetq_lane_f32(from[2*stride], res, 2);
+  res = vsetq_lane_f32(from[3*stride], res, 3);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
+{
+  Packet4i res = pset1<Packet4i>(0);
+  res = vsetq_lane_s32(from[0*stride], res, 0);
+  res = vsetq_lane_s32(from[1*stride], res, 1);
+  res = vsetq_lane_s32(from[2*stride], res, 2);
+  res = vsetq_lane_s32(from[3*stride], res, 3);
+  return res;
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+{
+  to[stride*0] = vgetq_lane_f32(from, 0);
+  to[stride*1] = vgetq_lane_f32(from, 1);
+  to[stride*2] = vgetq_lane_f32(from, 2);
+  to[stride*3] = vgetq_lane_f32(from, 3);
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
+{
+  to[stride*0] = vgetq_lane_s32(from, 0);
+  to[stride*1] = vgetq_lane_s32(from, 1);
+  to[stride*2] = vgetq_lane_s32(from, 2);
+  to[stride*3] = vgetq_lane_s32(from, 3);
+}
+
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr) { EIGEN_ARM_PREFETCH(addr); }
 
 // FIXME only store the 2 first elements ?
 template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
@@ -234,6 +309,23 @@ template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
   a_hi = vget_high_s32(a_r64);
   return vcombine_s32(a_hi, a_lo);
 }
+
+template<size_t offset>
+struct protate_impl<offset, Packet4f>
+{
+  static Packet4f run(const Packet4f& a) {
+    return vextq_f32(a, a, offset);
+  }
+};
+
+template<size_t offset>
+struct protate_impl<offset, Packet4i>
+{
+  static Packet4i run(const Packet4i& a) {
+    return vextq_s32(a, a, offset);
+  }
+};
+
 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }
 
@@ -375,6 +467,7 @@ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
   a_lo = vget_low_s32(a);
   a_hi = vget_high_s32(a);
   max = vpmax_s32(a_lo, a_hi);
+  max = vpmax_s32(max, max);
 
   return vget_lane_s32(max, 0);
 }
@@ -400,9 +493,237 @@ PALIGN_NEON(0,Packet4i,vextq_s32)
 PALIGN_NEON(1,Packet4i,vextq_s32)
 PALIGN_NEON(2,Packet4i,vextq_s32)
 PALIGN_NEON(3,Packet4i,vextq_s32)
-    
+
+#undef PALIGN_NEON
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4f,4>& kernel) {
+  float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]);
+  float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]);
+
+  kernel.packet[0] = vcombine_f32(vget_low_f32(tmp1.val[0]), vget_low_f32(tmp2.val[0]));
+  kernel.packet[1] = vcombine_f32(vget_high_f32(tmp1.val[0]), vget_high_f32(tmp2.val[0]));
+  kernel.packet[2] = vcombine_f32(vget_low_f32(tmp1.val[1]), vget_low_f32(tmp2.val[1]));
+  kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1]));
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4i,4>& kernel) {
+  int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]);
+  int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]);
+  kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0]));
+  kernel.packet[1] = vcombine_s32(vget_high_s32(tmp1.val[0]), vget_high_s32(tmp2.val[0]));
+  kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1]));
+  kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1]));
+}
+
+//---------- double ----------
+
+// Clang 3.5 in the iOS toolchain has an ICE triggered by NEON intrinsics for double.
+// Confirmed at least with __apple_build_version__ = 6000054.
+#ifdef __apple_build_version__
+// Let's hope that by the time __apple_build_version__ hits the 601* range, the bug will be fixed.
+// https://gist.github.com/yamaya/2924292 suggests that the 3 first digits are only updated with
+// major toolchain updates.
+#define EIGEN_APPLE_DOUBLE_NEON_BUG (__apple_build_version__ < 6010000)
+#else
+#define EIGEN_APPLE_DOUBLE_NEON_BUG 0
+#endif
+
+#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
+
+#if (EIGEN_COMP_GNUC_STRICT && defined(__ANDROID__)) || defined(__apple_build_version__)
+// Bug 907: workaround missing declarations of the following two functions in the ADK
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_u64_f64 (float64x2_t __a)
+{
+  return (uint64x2_t) __a;
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_f64_u64 (uint64x2_t __a)
+{
+  return (float64x2_t) __a;
+}
+#endif
+
+typedef float64x2_t Packet2d;
+typedef float64x1_t Packet1d;
+
+template<> struct packet_traits<double>  : default_packet_traits
+{
+  typedef Packet2d type;
+  typedef Packet2d half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+    HasHalfPacket=0,
+   
+    HasDiv  = 1,
+    // FIXME check the Has*
+    HasSin  = 0,
+    HasCos  = 0,
+    HasLog  = 0,
+    HasExp  = 0,
+    HasSqrt = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet2d> { typedef double  type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
+
+template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) { return vdupq_n_f64(from); }
+
+template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a)
+{
+  Packet2d countdown = EIGEN_INIT_NEON_PACKET2(0, 1);
+  return vaddq_f64(pset1<Packet2d>(a), countdown);
+}
+template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmulq_f64(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }
+
+#ifdef __ARM_FEATURE_FMA
+// See bug 936. See above comment about FMA for float.
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); }
+#else
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vmlaq_f64(c,a,b); }
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); }
+
+// Logical operations are not supported on double vectors, so we reinterpret them as 64-bit unsigned integer vectors using NEON intrinsics
+template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b)
+{
+  return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b)
+{
+  return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b)
+{
+  return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b)
+{
+  return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); }
+
+template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); }
+
+template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
+{
+  return vld1q_dup_f64(from);
+}
+template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from); }
+
+template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to, from); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
+{
+  Packet2d res = pset1<Packet2d>(0.0);
+  res = vsetq_lane_f64(from[0*stride], res, 0);
+  res = vsetq_lane_f64(from[1*stride], res, 1);
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
+{
+  to[stride*0] = vgetq_lane_f64(from, 0);
+  to[stride*1] = vgetq_lane_f64(from, 1);
+}
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ARM_PREFETCH(addr); }
+
+// FIXME only store the 2 first elements ?
+template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(a, 0); }
+
+template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }
+
+template<size_t offset>
+struct protate_impl<offset, Packet2d>
+{
+  static Packet2d run(const Packet2d& a) {
+    return vextq_f64(a, a, offset);
+  }
+};
+
+template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }
+
+#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
+// workaround ICE, see bug 907
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) + vget_high_f64(a))[0]; }
+#else
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); }
+#endif
+
+template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
+{
+  float64x2_t trn1, trn2;
+
+  // NEON zip performs interleaving of the supplied vectors.
+  // We perform two interleaves in a row to acquire the transposed vector
+  trn1 = vzip1q_f64(vecs[0], vecs[1]);
+  trn2 = vzip2q_f64(vecs[0], vecs[1]);
+
+  // Do the addition of the resulting vectors
+  return vaddq_f64(trn1, trn2);
+}
+// Other reduction functions:
+// mul
+#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; }
+#else
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); }
+#endif
+
+// min
+template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); }
+
+// max
+template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpmaxq_f64(a, a), 0); }
+
+// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
+// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
+#define PALIGN_NEON(Offset,Type,Command) \
+template<>\
+struct palign_impl<Offset,Type>\
+{\
+    EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
+    {\
+        if (Offset!=0)\
+            first = Command(first, second, Offset);\
+    }\
+};\
+
+PALIGN_NEON(0,Packet2d,vextq_f64)
+PALIGN_NEON(1,Packet2d,vextq_f64)
 #undef PALIGN_NEON
 
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2d,2>& kernel) {
+  float64x2_t trn1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
+  float64x2_t trn2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);
+
+  kernel.packet[0] = trn1;
+  kernel.packet[1] = trn2;
+}
+#endif // EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Core/arch/SSE/Complex.h b/nuparu/include/Eigen/src/Core/arch/SSE/Complex.h
index 91bba5e3..4f45ddfb 100644
--- a/nuparu/include/Eigen/src/Core/arch/SSE/Complex.h
+++ b/nuparu/include/Eigen/src/Core/arch/SSE/Complex.h
@@ -22,13 +22,18 @@ struct Packet2cf
   __m128  v;
 };
 
+// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
+// to leverage AVX instructions.
+#ifndef EIGEN_VECTORIZE_AVX
 template<> struct packet_traits<std::complex<float> >  : default_packet_traits
 {
   typedef Packet2cf type;
+  typedef Packet2cf half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
+    HasHalfPacket = 0,
 
     HasAdd    = 1,
     HasSub    = 1,
@@ -39,11 +44,13 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
     HasAbs2   = 0,
     HasMin    = 0,
     HasMax    = 0,
-    HasSetLinear = 0
+    HasSetLinear = 0,
+    HasBlend = 1
   };
 };
+#endif
 
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
 
 template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
@@ -60,7 +67,6 @@ template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
 
 template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
-  // TODO optimize it for SSE3 and 4
   #ifdef EIGEN_VECTORIZE_SSE3
   return Packet2cf(_mm_addsub_ps(_mm_mul_ps(_mm_moveldup_ps(a.v), b.v),
                                  _mm_mul_ps(_mm_movehdup_ps(a.v),
@@ -104,8 +110,23 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<flo
 
 template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
 
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), Packet4f(from.v)); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), Packet4f(from.v)); }
+
+
+template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
+{
+  return Packet2cf(_mm_set_ps(std::imag(from[1*stride]), std::real(from[1*stride]),
+                              std::imag(from[0*stride]), std::real(from[0*stride])));
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
+{
+  to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 0)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1)));
+  to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 2)),
+                                     _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
+}
 
 template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
 
@@ -124,7 +145,7 @@ template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Pack
   #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(_mm_castps_pd(a.v)))); }
+template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(Packet2d(_mm_castps_pd(a.v))))); }
 
 template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
 {
@@ -214,7 +235,7 @@ template<> struct conj_helper<Packet4f, Packet2cf, false,false>
   { return padd(c, pmul(x,y)); }
 
   EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
-  { return Packet2cf(Eigen::internal::pmul(x, y.v)); }
+  { return Packet2cf(Eigen::internal::pmul<Packet4f>(x, y.v)); }
 };
 
 template<> struct conj_helper<Packet2cf, Packet4f, false,false>
@@ -223,7 +244,7 @@ template<> struct conj_helper<Packet2cf, Packet4f, false,false>
   { return padd(c, pmul(x,y)); }
 
   EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
-  { return Packet2cf(Eigen::internal::pmul(x.v, y)); }
+  { return Packet2cf(Eigen::internal::pmul<Packet4f>(x.v, y)); }
 };
 
 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
@@ -248,13 +269,18 @@ struct Packet1cd
   __m128d  v;
 };
 
+// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
+// to leverage AVX instructions.
+#ifndef EIGEN_VECTORIZE_AVX
 template<> struct packet_traits<std::complex<double> >  : default_packet_traits
 {
   typedef Packet1cd type;
+  typedef Packet1cd half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 0,
     size = 1,
+    HasHalfPacket = 0,
 
     HasAdd    = 1,
     HasSub    = 1,
@@ -268,12 +294,13 @@ template<> struct packet_traits<std::complex<double> >  : default_packet_traits
     HasSetLinear = 0
   };
 };
+#endif
 
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; };
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
 
 template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
 {
   const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
@@ -282,9 +309,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
 
 template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
-  // TODO optimize it for SSE3 and 4
   #ifdef EIGEN_VECTORIZE_SSE3
-  return Packet1cd(_mm_addsub_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
+  return Packet1cd(_mm_addsub_pd(_mm_mul_pd(_mm_movedup_pd(a.v), b.v),
                                  _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
                                             vec2d_swizzle1(b.v, 1, 0))));
   #else
@@ -311,8 +337,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<dou
 template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
 
 // FIXME force unaligned store, this is a temporary fix
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); }
 
 template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
 
@@ -410,7 +436,7 @@ template<> struct conj_helper<Packet2d, Packet1cd, false,false>
   { return padd(c, pmul(x,y)); }
 
   EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
-  { return Packet1cd(Eigen::internal::pmul(x, y.v)); }
+  { return Packet1cd(Eigen::internal::pmul<Packet2d>(x, y.v)); }
 };
 
 template<> struct conj_helper<Packet1cd, Packet2d, false,false>
@@ -419,7 +445,7 @@ template<> struct conj_helper<Packet1cd, Packet2d, false,false>
   { return padd(c, pmul(x,y)); }
 
   EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
-  { return Packet1cd(Eigen::internal::pmul(x.v, y)); }
+  { return Packet1cd(Eigen::internal::pmul<Packet2d>(x.v, y)); }
 };
 
 template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
@@ -432,7 +458,22 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, con
 
 EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
 {
-  return Packet1cd(preverse(x.v));
+  return Packet1cd(preverse(Packet2d(x.v)));
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2cf,2>& kernel) {
+  __m128d w1 = _mm_castps_pd(kernel.packet[0].v);
+  __m128d w2 = _mm_castps_pd(kernel.packet[1].v);
+
+  __m128 tmp = _mm_castpd_ps(_mm_unpackhi_pd(w1, w2));
+  kernel.packet[0].v = _mm_castpd_ps(_mm_unpacklo_pd(w1, w2));
+  kernel.packet[1].v = tmp;
+}
+
+template<>  EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
+  __m128d result = pblend<Packet2d>(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v));
+  return Packet2cf(_mm_castpd_ps(result));
 }
 
 } // end namespace internal
diff --git a/nuparu/include/Eigen/src/Core/arch/SSE/MathFunctions.h b/nuparu/include/Eigen/src/Core/arch/SSE/MathFunctions.h
index 3376a984..3b8b7303 100644
--- a/nuparu/include/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/nuparu/include/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -52,7 +52,7 @@ Packet4f plog<Packet4f>(const Packet4f& _x)
 
   Packet4i emm0;
 
-  Packet4f invalid_mask = _mm_cmplt_ps(x, _mm_setzero_ps());
+  Packet4f invalid_mask = _mm_cmpnge_ps(x, _mm_setzero_ps()); // not greater equal is true if x is NaN
   Packet4f iszero_mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
 
   x = pmax(x, p4f_min_norm_pos);  /* cut off denormalized stuff */
@@ -63,7 +63,7 @@ Packet4f plog<Packet4f>(const Packet4f& _x)
   x = _mm_or_ps(x, p4f_half);
 
   emm0 = _mm_sub_epi32(emm0, p4i_0x7f);
-  Packet4f e = padd(_mm_cvtepi32_ps(emm0), p4f_1);
+  Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1);
 
   /* part2:
      if( x < SQRTHF ) {
@@ -72,9 +72,9 @@ Packet4f plog<Packet4f>(const Packet4f& _x)
      } else { x = x - 1.0; }
   */
   Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF);
-  Packet4f tmp = _mm_and_ps(x, mask);
+  Packet4f tmp = pand(x, mask);
   x = psub(x, p4f_1);
-  e = psub(e, _mm_and_ps(p4f_1, mask));
+  e = psub(e, pand(p4f_1, mask));
   x = padd(x, tmp);
 
   Packet4f x2 = pmul(x,x);
@@ -126,7 +126,7 @@ Packet4f pexp<Packet4f>(const Packet4f& _x)
   _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
   _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
 
-  Packet4f tmp = _mm_setzero_ps(), fx;
+  Packet4f tmp, fx;
   Packet4i emm0;
 
   // clamp x
@@ -166,7 +166,7 @@ Packet4f pexp<Packet4f>(const Packet4f& _x)
   emm0 = _mm_cvttps_epi32(fx);
   emm0 = _mm_add_epi32(emm0, p4i_0x7f);
   emm0 = _mm_slli_epi32(emm0, 23);
-  return pmul(y, _mm_castsi128_ps(emm0));
+  return pmax(pmul(y, Packet4f(_mm_castsi128_ps(emm0))), _x);
 }
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet2d pexp<Packet2d>(const Packet2d& _x)
@@ -195,7 +195,7 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
   _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
   static const __m128i p4i_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0);
 
-  Packet2d tmp = _mm_setzero_pd(), fx;
+  Packet2d tmp, fx;
   Packet4i emm0;
 
   // clamp x
@@ -239,7 +239,7 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
   emm0 = _mm_add_epi32(emm0, p4i_1023_0);
   emm0 = _mm_slli_epi32(emm0, 20);
   emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
-  return pmul(x, _mm_castsi128_pd(emm0));
+  return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x);
 }
 
 /* evaluation of 4 sines at onces, using SSE2 intrinsics.
@@ -279,7 +279,7 @@ Packet4f psin<Packet4f>(const Packet4f& _x)
   _EIGEN_DECLARE_CONST_Packet4f(coscof_p2,  4.166664568298827E-002f);
   _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
 
-  Packet4f xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
+  Packet4f xmm1, xmm2, xmm3, sign_bit, y;
 
   Packet4i emm0, emm2;
   sign_bit = x;
@@ -378,7 +378,7 @@ Packet4f pcos<Packet4f>(const Packet4f& _x)
   _EIGEN_DECLARE_CONST_Packet4f(coscof_p2,  4.166664568298827E-002f);
   _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
 
-  Packet4f xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
+  Packet4f xmm1, xmm2, xmm3, y;
   Packet4i emm0, emm2;
 
   x = pabs(x);
@@ -442,8 +442,11 @@ Packet4f pcos<Packet4f>(const Packet4f& _x)
   return _mm_xor_ps(y, sign_bit);
 }
 
+#if EIGEN_FAST_MATH
+
 // This is based on Quake3's fast inverse square root.
 // For detail see here: http://www.beyond3d.com/content/articles/8/
+// It lacks 1 bit (or 2 bits in some rare cases) of precision, and does not handle negative, +inf, or denormalized numbers correctly.
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f psqrt<Packet4f>(const Packet4f& _x)
 {
@@ -457,6 +460,62 @@ Packet4f psqrt<Packet4f>(const Packet4f& _x)
   return pmul(_x,x);
 }
 
+#else
+
+template<>EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED 
+Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); }
+
+#endif
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
+
+#if EIGEN_FAST_MATH
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f prsqrt<Packet4f>(const Packet4f& _x) {
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000);
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000);
+  _EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f);
+  _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f);
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000);
+
+  Packet4f neg_half = pmul(_x, p4f_minus_half);
+
+  // select only the inverse sqrt of positive normal inputs (denormals are
+  // flushed to zero and cause infs as well).
+  Packet4f le_zero_mask = _mm_cmple_ps(_x, p4f_flt_min);
+  Packet4f x = _mm_andnot_ps(le_zero_mask, _mm_rsqrt_ps(_x));
+
+  // Fill in NaNs and Infs for the negative/zero entries.
+  Packet4f neg_mask = _mm_cmplt_ps(_x, _mm_setzero_ps());
+  Packet4f zero_mask = _mm_andnot_ps(neg_mask, le_zero_mask);
+  Packet4f infs_and_nans = _mm_or_ps(_mm_and_ps(neg_mask, p4f_nan),
+                                        _mm_and_ps(zero_mask, p4f_inf));
+
+  // Do a single step of Newton's iteration.
+  x = pmul(x, pmadd(neg_half, pmul(x, x), p4f_one_point_five));
+
+  // Insert NaNs and Infs in all the right places.
+  return _mm_or_ps(x, infs_and_nans);
+}
+
+#else
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f prsqrt<Packet4f>(const Packet4f& x) {
+  // Unfortunately we can't use the much faster _mm_rsqrt_ps since it only provides an approximation.
+  return _mm_div_ps(pset1<Packet4f>(1.0f), _mm_sqrt_ps(x));
+}
+
+#endif
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d prsqrt<Packet2d>(const Packet2d& x) {
+  // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation.
+  return _mm_div_pd(pset1<Packet2d>(1.0), _mm_sqrt_pd(x));
+}
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Core/arch/SSE/PacketMath.h b/nuparu/include/Eigen/src/Core/arch/SSE/PacketMath.h
index e256f4ba..eb517b87 100644
--- a/nuparu/include/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/nuparu/include/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -22,9 +22,40 @@ namespace internal {
 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
 #endif
 
+#ifdef __FMA__
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1
+#endif
+#endif
+
+#if (defined EIGEN_VECTORIZE_AVX) && EIGEN_COMP_GNUC_STRICT && (__GXX_ABI_VERSION < 1004)
+// With GCC's default ABI version, __m128 and __m256 are the same type and therefore we cannot
+// have overloads for both types without a linking error.
+// One solution is to increase ABI version using -fabi-version=4 (or greater).
+// Otherwise, we workaround this inconvenience by wrapping 128bit types into the following helper
+// structure:
+template<typename T>
+struct eigen_packet_wrapper
+{
+  EIGEN_ALWAYS_INLINE operator T&() { return m_val; }
+  EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; }
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {}
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {}
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) {
+    m_val = v;
+    return *this;
+  }
+  
+  T m_val;
+};
+typedef eigen_packet_wrapper<__m128>  Packet4f;
+typedef eigen_packet_wrapper<__m128i> Packet4i;
+typedef eigen_packet_wrapper<__m128d> Packet2d;
+#else
 typedef __m128  Packet4f;
 typedef __m128i Packet4i;
 typedef __m128d Packet2d;
+#endif
 
 template<> struct is_arithmetic<__m128>  { enum { value = true }; };
 template<> struct is_arithmetic<__m128i> { enum { value = true }; };
@@ -58,50 +89,79 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; };
   const Packet4i p4i_##NAME = pset1<Packet4i>(X)
 
 
+// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
+// to leverage AVX instructions.
+#ifndef EIGEN_VECTORIZE_AVX
 template<> struct packet_traits<float>  : default_packet_traits
 {
   typedef Packet4f type;
+  typedef Packet4f half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size=4,
+    HasHalfPacket = 0,
 
     HasDiv  = 1,
     HasSin  = EIGEN_FAST_MATH,
     HasCos  = EIGEN_FAST_MATH,
     HasLog  = 1,
     HasExp  = 1,
-    HasSqrt = 1
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasBlend = 1
+
+#ifdef EIGEN_VECTORIZE_SSE4_1
+    ,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1
+#endif
   };
 };
 template<> struct packet_traits<double> : default_packet_traits
 {
   typedef Packet2d type;
+  typedef Packet2d half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size=2,
+    HasHalfPacket = 0,
 
     HasDiv  = 1,
-    HasExp  = 1
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasBlend = 1
+
+#ifdef EIGEN_VECTORIZE_SSE4_1
+    ,
+    HasRound = 1,
+    HasFloor = 1,
+    HasCeil = 1
+#endif
   };
 };
+#endif
 template<> struct packet_traits<int>    : default_packet_traits
 {
   typedef Packet4i type;
+  typedef Packet4i half;
   enum {
-    // FIXME check the Has*
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size=4
+    size=4,
+
+    HasBlend = 1
   };
 };
 
-template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; };
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; };
-template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; };
+template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
+template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
 
-#if defined(_MSC_VER) && (_MSC_VER==1500)
+#if EIGEN_COMP_MSVC==1500
 // Workaround MSVC 9 internal compiler error.
 // TODO: It has been detected with win64 builds (amd64), so let's check whether it also happens in 32bits+SSE mode
 // TODO: let's check whether there does not exist a better fix, like adding a pset0() function. (it crashed on pset1(0)).
@@ -109,14 +169,25 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { re
 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set_pd(from,from); }
 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from) { return _mm_set_epi32(from,from,from,from); }
 #else
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return _mm_set1_ps(from); }
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return _mm_set_ps1(from); }
 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from) { return _mm_set1_epi32(from); }
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
-template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
-template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
+// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
+// However, using intrinsics for pset1 makes gcc generate crappy code in some cases (see bug 203)
+// Using inline assembly is also not an option because then gcc fails to reorder the instructions properly.
+// Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply.
+// Also note that with AVX, we want it to generate a vbroadcastss.
+#if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__)
+template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) {  // load one float and replicate it into a full packet
+  return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0);  // _mm_load_ss fills lane 0 only; the 0,0,0,0 swizzle presumably copies lane 0 to every lane
+}
+#endif
+  
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }  // lanes 0..3 hold a, a+1, a+2, a+3
+template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }  // lanes 0..1 hold a, a+1
+template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }  // lanes 0..3 hold a, a+1, a+2, a+3
 
 template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
@@ -138,7 +209,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
 }
 template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
 {
-  return psub(_mm_setr_epi32(0,0,0,0), a);
+  return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a);
 }
 
 template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
@@ -165,13 +236,13 @@ template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const
 
 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
-{ eigen_assert(false && "packet integer division are not supported by SSE");
-  return pset1<Packet4i>(0);
-}
 
 // for some weird raisons, it has to be overloaded for packet of integers
 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
+#ifdef __FMA__
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }  // a*b + c per lane via one fused multiply-add (only compiled when __FMA__ is defined)
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }  // double-precision counterpart of the overload above
+#endif
 
 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); }
@@ -199,6 +270,17 @@ template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const
 #endif
 }
 
+#ifdef EIGEN_VECTORIZE_SSE4_1
+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, 0); }  // rounding-control immediate 0 is _MM_FROUND_TO_NEAREST_INT (nearest, ties to even)
+template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, 0); }  // same rounding mode as the float overload
+
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return _mm_ceil_ps(a); }  // per-lane round toward +infinity (SSE4.1 intrinsic)
+template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return _mm_ceil_pd(a); }  // double-precision counterpart
+
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return _mm_floor_ps(a); }  // per-lane round toward -infinity (SSE4.1 intrinsic)
+template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return _mm_floor_pd(a); }  // double-precision counterpart
+#endif
+
 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
@@ -217,16 +299,14 @@ template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, con
 
 template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
 template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); }
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
 
-#if defined(_MSC_VER)
+#if EIGEN_COMP_MSVC
   template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float*  from) {
     EIGEN_DEBUG_UNALIGNED_LOAD
-    #if (_MSC_VER==1600)
+    #if (EIGEN_COMP_MSVC==1600)
     // NOTE Some version of MSVC10 generates bad code when using _mm_loadu_ps
     // (i.e., it does not generate an unaligned load!!
-    // TODO On most architectures this version should also be faster than a single _mm_loadu_ps
-    // so we could also enable it for MSVC08 but first we have to make this later does not generate crap when doing so...
     __m128 res = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from));
     res = _mm_loadh_pi(res, (const __m64*)(from+2));
     return res;
@@ -235,19 +315,14 @@ template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { E
     #endif
   }
   template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); }
-  template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int*    from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); }
+  template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int*    from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from)); }
 #else
-// Fast unaligned loads. Note that here we cannot directly use intrinsics: this would
-// require pointer casting to incompatible pointer types and leads to invalid code
-// because of the strict aliasing rule. The "dummy" stuff are required to enforce
-// a correct instruction dependency.
-// TODO: do the same for MSVC (ICC is compatible)
 // NOTE: with the code below, MSVC's compiler crashes!
 
-#if defined(__GNUC__) && defined(__i386__)
+#if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386 || (EIGEN_ARCH_x86_64 && EIGEN_GNUC_AT_LEAST(4, 8)))
   // bug 195: gcc/i386 emits weird x87 fldl/fstpl instructions for _mm_load_sd
   #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
-#elif defined(__clang__)
+#elif EIGEN_COMP_CLANG
   // bug 201: Segfaults in __mm_loadh_pd with clang 2.8
   #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
 #else
@@ -282,7 +357,7 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
 {
   EIGEN_DEBUG_UNALIGNED_LOAD
 #if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
-  return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from));
+  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
 #else
   __m128d res;
   res =  _mm_load_sd((const double*)(from)) ;
@@ -301,46 +376,77 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*  from)
 template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
 {
   Packet4i tmp;
-  tmp = _mm_loadl_epi64(reinterpret_cast<const Packet4i*>(from));
+  tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
   return vec4i_swizzle1(tmp, 0, 0, 1, 1);
 }
 
 template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
 template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<Packet4i*>(to), from); }
+template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
 
-template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE
-  _mm_storel_pd((to), from);
-  _mm_storeh_pd((to+1), from);
+template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); }  // unaligned store of both doubles with a single intrinsic
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from); }  // unaligned store of all four floats
+template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }  // unaligned store of all four ints; the intrinsic takes a __m128i*
+
+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)  // builds a packet from 4 floats spaced `stride` elements apart
+{
+ return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);  // _mm_set_ps lists the highest lane first, so from[0] lands in lane 0
+}
+template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)  // builds a packet from 2 doubles spaced `stride` elements apart
+{
+ return _mm_set_pd(from[1*stride], from[0*stride]);  // _mm_set_pd lists the high lane first, so from[0] lands in lane 0
+}
+template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)  // builds a packet from 4 ints spaced `stride` elements apart
+{
+ return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);  // highest lane is listed first, so from[0] lands in lane 0
+ }
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)  // writes lane k of `from` to to[stride*k]
+{
+  to[stride*0] = _mm_cvtss_f32(from);  // _mm_cvtss_f32 reads lane 0
+  to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1));  // shuffle immediate k puts lane k into lane 0 before extraction
+  to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2));
+  to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3));
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)  // writes lane k of `from` to to[stride*k]
+{
+  to[stride*0] = _mm_cvtsd_f64(from);  // _mm_cvtsd_f64 reads lane 0
+  to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1));  // immediate 1 moves the high lane into lane 0 before extraction
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)  // writes lane k of `from` to to[stride*k]
+{
+  to[stride*0] = _mm_cvtsi128_si32(from);  // _mm_cvtsi128_si32 reads lane 0
+  to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));  // shuffle immediate k puts lane k into lane 0 before extraction
+  to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
+  to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
 }
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castps_pd(from)); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*      to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castsi128_pd(from)); }
 
 // some compilers might be tempted to perform multiple moves instead of using a vector path.
 template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
 {
   Packet4f pa = _mm_set_ss(a);
-  pstore(to, vec4f_swizzle1(pa,0,0,0,0));
+  pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0)));
 }
 // some compilers might be tempted to perform multiple moves instead of using a vector path.
 template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a)
 {
   Packet2d pa = _mm_set_sd(a);
-  pstore(to, vec2d_swizzle1(pa,0,0));
+  pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
 }
 
+#ifndef EIGEN_VECTORIZE_AVX
 template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
 template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+#endif
 
-#if defined(_MSC_VER) && defined(_WIN64) && !defined(__INTEL_COMPILER)
+#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
 // The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
 // Direct of the struct members fixed bug #62.
 template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; }
 template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; }
 template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
-#elif defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#elif EIGEN_COMP_MSVC_STRICT
 // The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
 template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; }
 template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; }
@@ -358,6 +464,29 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
 template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
 { return _mm_shuffle_epi32(a,0x1B); }
 
+template<size_t offset>
+struct protate_impl<offset, Packet4f>  // lane rotation of a 4-float packet by a compile-time offset
+{
+  static Packet4f run(const Packet4f& a) {  // NOTE(review): the first index is `offset` itself, not offset % 4, so offset is presumably expected to be < 4
+    return vec4f_swizzle1(a, offset, (offset + 1) % 4, (offset + 2) % 4, (offset + 3) % 4);  // assuming the i-th swizzle index names the source lane for output lane i
+  }
+};
+
+template<size_t offset>
+struct protate_impl<offset, Packet4i>  // lane rotation of a 4-int packet by a compile-time offset
+{
+  static Packet4i run(const Packet4i& a) {  // NOTE(review): the first index is `offset` itself, not offset % 4, so offset is presumably expected to be < 4
+    return vec4i_swizzle1(a, offset, (offset + 1) % 4, (offset + 2) % 4, (offset + 3) % 4);  // assuming the i-th swizzle index names the source lane for output lane i
+  }
+};
+
+template<size_t offset>
+struct protate_impl<offset, Packet2d>  // lane rotation of a 2-double packet by a compile-time offset
+{
+  static Packet2d run(const Packet2d& a) {  // NOTE(review): the first index is `offset` itself, not offset % 2, so offset is presumably expected to be < 2
+    return vec2d_swizzle1(a, offset, (offset + 1) % 2);  // assuming the i-th swizzle index names the source lane for output lane i
+  }
+};
 
 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
 {
@@ -379,6 +508,38 @@ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
   #endif
 }
 
+// with AVX, the default implementations based on pload1 are faster
+#ifndef __AVX__
+template<> EIGEN_STRONG_INLINE void  // fills a0..a3 with broadcasts of a[0]..a[3] respectively
+pbroadcast4<Packet4f>(const float *a,
+                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+  a3 = pload<Packet4f>(a);  // single aligned load of all four floats; a3 doubles as scratch storage
+  a0 = vec4f_swizzle1(a3, 0,0,0,0);
+  a1 = vec4f_swizzle1(a3, 1,1,1,1);
+  a2 = vec4f_swizzle1(a3, 2,2,2,2);
+  a3 = vec4f_swizzle1(a3, 3,3,3,3);  // written last because a3 still held the loaded data until here
+}
+template<> EIGEN_STRONG_INLINE void  // fills a0..a3 with broadcasts of a[0]..a[3] respectively
+pbroadcast4<Packet2d>(const double *a,
+                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
+{
+#ifdef EIGEN_VECTORIZE_SSE3
+  a0 = _mm_loaddup_pd(a+0);  // SSE3 path: each _mm_loaddup_pd loads one double and duplicates it into both lanes
+  a1 = _mm_loaddup_pd(a+1);
+  a2 = _mm_loaddup_pd(a+2);
+  a3 = _mm_loaddup_pd(a+3);
+#else
+  a1 = pload<Packet2d>(a);  // SSE2 path: aligned load of a[0..1]; a1 is scratch until overwritten two lines below
+  a0 = vec2d_swizzle1(a1, 0,0);
+  a1 = vec2d_swizzle1(a1, 1,1);
+  a3 = pload<Packet2d>(a+2);  // aligned load of a[2..3]; a3 is scratch until overwritten two lines below
+  a2 = vec2d_swizzle1(a3, 0,0);
+  a3 = vec2d_swizzle1(a3, 1,1);
+#endif
+}
+#endif
+
 EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
 {
   vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
@@ -388,7 +549,6 @@ EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
 }
 
 #ifdef EIGEN_VECTORIZE_SSE3
-// TODO implement SSE2 versions as well as integer versions
 template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
 {
   return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
@@ -397,36 +557,24 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
 {
   return _mm_hadd_pd(vecs[0], vecs[1]);
 }
-// SSSE3 version:
-// EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs)
-// {
-//   return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3]));
-// }
 
 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
 {
   Packet4f tmp0 = _mm_hadd_ps(a,a);
-  return pfirst(_mm_hadd_ps(tmp0, tmp0));
+  return pfirst<Packet4f>(_mm_hadd_ps(tmp0, tmp0));
 }
 
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst(_mm_hadd_pd(a, a)); }
-
-// SSSE3 version:
-// EIGEN_STRONG_INLINE float predux(const Packet4i& a)
-// {
-//   Packet4i tmp0 = _mm_hadd_epi32(a,a);
-//   return pfirst(_mm_hadd_epi32(tmp0, tmp0));
-// }
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst<Packet2d>(_mm_hadd_pd(a, a)); }
 #else
 // SSE2 versions
 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
 {
   Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
-  return pfirst(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+  return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
 }
 template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
 {
-  return pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
+  return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
 }
 
 template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
@@ -449,10 +597,22 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
 }
 #endif  // SSE3
 
+
+#ifdef EIGEN_VECTORIZE_SSSE3
+template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)  // SSSE3 version; reads vecs[0..3]
+{
+  return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3]));  // two levels of horizontal adds: lane i of the result is the sum of the four lanes of vecs[i]
+}
+template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)  // SSSE3 version: sum of the four lanes of a
+{
+  Packet4i tmp0 = _mm_hadd_epi32(a,a);  // pairwise sums: a0+a1 and a2+a3
+  return pfirst<Packet4i>(_mm_hadd_epi32(tmp0,tmp0));  // second horizontal add leaves the full sum in lane 0, which pfirst extracts
+}
+#else
 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
 {
   Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
-  return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1));
+  return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1));
 }
 
 template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
@@ -468,18 +628,18 @@ template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
   tmp0 = _mm_unpackhi_epi64(tmp0, tmp1);
   return _mm_add_epi32(tmp0, tmp2);
 }
-
+#endif
 // Other reduction functions:
 
 // mul
 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
 {
   Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
-  return pfirst(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+  return pfirst<Packet4f>(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
 }
 template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
 {
-  return pfirst(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
+  return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
 }
 template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
 {
@@ -495,45 +655,55 @@ template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
 template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
 {
   Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
-  return pfirst(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+  return pfirst<Packet4f>(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
 }
 template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
 {
-  return pfirst(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
+  return pfirst<Packet2d>(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
 }
 template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
 {
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
+  return pfirst<Packet4i>(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
+#else
   // after some experiments, it is seems this is the fastest way to implement it
   // for GCC (eg., it does not like using std::min after the pstore !!)
   EIGEN_ALIGN16 int aux[4];
   pstore(aux, a);
-  register int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
-  register int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
+  int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
+  int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
   return aux0<aux2 ? aux0 : aux2;
+#endif // EIGEN_VECTORIZE_SSE4_1
 }
 
 // max
 template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
 {
   Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
-  return pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+  return pfirst<Packet4f>(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
 }
 template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
 {
-  return pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
+  return pfirst<Packet2d>(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
 }
 template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
 {
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
+  return pfirst<Packet4i>(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
+#else
   // after some experiments, it is seems this is the fastest way to implement it
   // for GCC (eg., it does not like using std::min after the pstore !!)
   EIGEN_ALIGN16 int aux[4];
   pstore(aux, a);
-  register int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
-  register int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
+  int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
+  int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
   return aux0>aux2 ? aux0 : aux2;
+#endif // EIGEN_VECTORIZE_SSE4_1
 }
 
-#if (defined __GNUC__)
+#if EIGEN_COMP_GNUC
 // template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f&  a, const Packet4f&  b, const Packet4f&  c)
 // {
 //   Packet4f res = b;
@@ -641,6 +811,62 @@ struct palign_impl<Offset,Packet2d>
 };
 #endif
 
+EIGEN_DEVICE_FUNC inline void  // in-place transpose of a 4x4 float block held as four packets
+ptranspose(PacketBlock<Packet4f,4>& kernel) {
+  _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);  // standard xmmintrin.h macro; rewrites its four arguments in place
+}
+
+EIGEN_DEVICE_FUNC inline void  // in-place transpose of a 2x2 double block held as two packets
+ptranspose(PacketBlock<Packet2d,2>& kernel) {
+  __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);  // high lanes of both rows; saved first because packet[0] is overwritten next
+  kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);  // low lanes of both rows
+  kernel.packet[1] = tmp;
+}
+
+EIGEN_DEVICE_FUNC inline void  // in-place transpose of a 4x4 int block held as four packets
+ptranspose(PacketBlock<Packet4i,4>& kernel) {
+  __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);  // interleaves lanes 0 and 1 of rows 0 and 1
+  __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);  // interleaves lanes 0 and 1 of rows 2 and 3
+  __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);  // interleaves lanes 2 and 3 of rows 0 and 1
+  __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);  // interleaves lanes 2 and 3 of rows 2 and 3
+
+  kernel.packet[0] = _mm_unpacklo_epi64(T0, T1);  // lane 0 of every original row
+  kernel.packet[1] = _mm_unpackhi_epi64(T0, T1);  // lane 1 of every original row
+  kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);  // lane 2 of every original row
+  kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);  // lane 3 of every original row
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {  // per lane: thenPacket where select[i] != 0, elsePacket where select[i] == 0
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);  // select[0] goes to lane 0
+  __m128i false_mask = _mm_cmpeq_epi32(select, zero);  // all-ones in lanes whose selector is zero
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_blendv_epi8(thenPacket, elsePacket, false_mask);  // takes elsePacket bytes where the mask is set
+#else
+  return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket));  // (~mask & then) | (mask & else)
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {  // per lane: thenPacket where select[i] != 0, elsePacket where select[i] == 0
+  const __m128 zero = _mm_setzero_ps();
+  const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);  // selectors are passed as float arguments here; select[0] goes to lane 0
+  __m128 false_mask = _mm_cmpeq_ps(select, zero);  // all-ones in lanes whose selector compares equal to 0.0f
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_blendv_ps(thenPacket, elsePacket, false_mask);  // takes elsePacket lanes where the mask sign bit is set
+#else
+  return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket));  // (~mask & then) | (mask & else)
+#endif
+}
+template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {  // per lane: thenPacket where select[i] != 0, elsePacket where select[i] == 0
+  const __m128d zero = _mm_setzero_pd();
+  const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]);  // selectors are passed as double arguments here; select[0] goes to lane 0
+  __m128d false_mask = _mm_cmpeq_pd(select, zero);  // all-ones in lanes whose selector compares equal to 0.0
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_blendv_pd(thenPacket, elsePacket, false_mask);  // takes elsePacket lanes where the mask sign bit is set
+#else
+  return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket));  // (~mask & then) | (mask & else)
+#endif
+}
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Core/arch/SSE/TypeCasting.h b/nuparu/include/Eigen/src/Core/arch/SSE/TypeCasting.h
new file mode 100644
index 00000000..c8489323
--- /dev/null
+++ b/nuparu/include/Eigen/src/Core/arch/SSE/TypeCasting.h
@@ -0,0 +1,77 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_SSE_H
+#define EIGEN_TYPE_CASTING_SSE_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <>
+struct type_casting_traits<float, int> {  // traits for the float -> int packet cast
+  enum {
+    VectorizedCast = 1,  // presumably flags that a packet-level pcast exists for this pair
+    SrcCoeffRatio = 1,  // both ratios are 1, consistent with the pcast taking one Packet4f and returning one Packet4i
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {  // converts 4 floats to 4 ints
+  return _mm_cvttps_epi32(a);  // the "tt" variant truncates toward zero instead of using the current rounding mode
+}
+
+
+template <>
+struct type_casting_traits<int, float> {  // traits for the int -> float packet cast
+  enum {
+    VectorizedCast = 1,  // presumably flags that a packet-level pcast exists for this pair
+    SrcCoeffRatio = 1,  // both ratios are 1, consistent with the pcast taking one Packet4i and returning one Packet4f
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {  // converts 4 ints to 4 floats
+  return _mm_cvtepi32_ps(a);  // lane-wise signed 32-bit integer to single-precision conversion
+}
+
+
+template <>
+struct type_casting_traits<double, float> {  // traits for the double -> float packet cast
+  enum {
+    VectorizedCast = 1,  // presumably flags that a packet-level pcast exists for this pair
+    SrcCoeffRatio = 2,  // consistent with the pcast taking two Packet2d inputs to fill one Packet4f
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {  // narrows two double packets into one float packet: a0, a1, b0, b1
+  return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));  // each _mm_cvtpd_ps puts its 2 results in lanes 0-1; the immediate picks lanes 0,1 of the first operand then lanes 0,1 of the second
+}
+
+template <>
+struct type_casting_traits<float, double> {  // traits for the float -> double packet cast
+  enum {
+    VectorizedCast = 1,  // presumably flags that a packet-level pcast exists for this pair
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 2  // NOTE(review): presumably one Packet4f maps to two Packet2d outputs; the pcast here yields only the first - confirm against the caller
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {  // widens the two low floats of a into two doubles
+  // Simply discard the second half of the input
+  return _mm_cvtps_pd(a);  // _mm_cvtps_pd converts lanes 0 and 1 only; lanes 2 and 3 are ignored
+}
+
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_SSE_H
diff --git a/nuparu/include/Eigen/src/Core/functors/AssignmentFunctors.h b/nuparu/include/Eigen/src/Core/functors/AssignmentFunctors.h
new file mode 100644
index 00000000..d55ae609
--- /dev/null
+++ b/nuparu/include/Eigen/src/Core/functors/AssignmentFunctors.h
@@ -0,0 +1,166 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ASSIGNMENT_FUNCTORS_H
+#define EIGEN_ASSIGNMENT_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+  
+/** \internal
+  * \brief Template functor for scalar/packet assignment
+  *
+  */
+template<typename Scalar> struct assign_op {
+  // Stateless functor: it has no data members.
+  EIGEN_EMPTY_STRUCT_CTOR(assign_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { a = b; }
+  // Packet version: stores packet b at address a; Alignment is forwarded to pstoret.
+  template<int Alignment, typename Packet>
+  EIGEN_STRONG_INLINE void assignPacket(Scalar* a, const Packet& b) const
+  { internal::pstoret<Scalar,Packet,Alignment>(a,b); }
+};
+template<typename Scalar>
+struct functor_traits<assign_op<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::ReadCost,  // estimated as one coefficient read
+    PacketAccess = packet_traits<Scalar>::Vectorizable
+  };
+};
+
+/** \internal
+  * \brief Template functor for scalar/packet assignment with addition
+  *
+  */
+template<typename Scalar> struct add_assign_op {
+  // Stateless functor: it has no data members.
+  EIGEN_EMPTY_STRUCT_CTOR(add_assign_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { a += b; }
+  // Packet version: loads the packet currently stored at a, adds b and writes the sum back to a.
+  template<int Alignment, typename Packet>
+  EIGEN_STRONG_INLINE void assignPacket(Scalar* a, const Packet& b) const
+  { internal::pstoret<Scalar,Packet,Alignment>(a,internal::padd(internal::ploadt<Packet,Alignment>(a),b)); }
+};
+template<typename Scalar>
+struct functor_traits<add_assign_op<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::ReadCost + NumTraits<Scalar>::AddCost,  // one read plus one addition
+    PacketAccess = packet_traits<Scalar>::HasAdd
+  };
+};
+
+/** \internal
+  * \brief Template functor for scalar/packet assignment with subtraction
+  *
+  */
+template<typename Scalar> struct sub_assign_op {
+  // Stateless functor: it has no data members.
+  EIGEN_EMPTY_STRUCT_CTOR(sub_assign_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { a -= b; }
+  // Packet version: loads the packet currently stored at a, subtracts b and writes the result back to a.
+  template<int Alignment, typename Packet>
+  EIGEN_STRONG_INLINE void assignPacket(Scalar* a, const Packet& b) const
+  { internal::pstoret<Scalar,Packet,Alignment>(a,internal::psub(internal::ploadt<Packet,Alignment>(a),b)); }
+};
+template<typename Scalar>
+struct functor_traits<sub_assign_op<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::ReadCost + NumTraits<Scalar>::AddCost,  // the subtraction is costed as an addition
+    PacketAccess = packet_traits<Scalar>::HasSub
+  };
+};
+
+/** \internal
+  * \brief Template functor for scalar/packet assignment with multiplication
+  *
+  */
+template<typename DstScalar, typename SrcScalar=DstScalar>
+struct mul_assign_op {
+  // Unlike the other *_assign_op functors of this file, source and destination scalar types may differ.
+  EIGEN_EMPTY_STRUCT_CTOR(mul_assign_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(DstScalar& a, const SrcScalar& b) const { a *= b; }
+  // Packet version: loads the packet currently stored at a, multiplies it by b and writes the product back to a.
+  template<int Alignment, typename Packet>
+  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const
+  { internal::pstoret<DstScalar,Packet,Alignment>(a,internal::pmul(internal::ploadt<Packet,Alignment>(a),b)); }
+};
+template<typename DstScalar, typename SrcScalar>
+struct functor_traits<mul_assign_op<DstScalar,SrcScalar> > {
+  enum {
+    Cost = NumTraits<DstScalar>::ReadCost + NumTraits<DstScalar>::MulCost,
+    PacketAccess = is_same<DstScalar,SrcScalar>::value && packet_traits<DstScalar>::HasMul  // packets only when both scalar types match
+  };
+};
+template<typename DstScalar,typename SrcScalar> struct functor_is_product_like<mul_assign_op<DstScalar,SrcScalar> > { enum { ret = 1 }; };
+
+/** \internal
+  * \brief Template functor for scalar/packet assignment with division
+  *
+  */
+template<typename Scalar> struct div_assign_op {
+  // Stateless functor: it has no data members.
+  EIGEN_EMPTY_STRUCT_CTOR(div_assign_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { a /= b; }
+  // Packet version: loads the packet currently stored at a, divides it by b and writes the quotient back to a.
+  template<int Alignment, typename Packet>
+  EIGEN_STRONG_INLINE void assignPacket(Scalar* a, const Packet& b) const
+  { internal::pstoret<Scalar,Packet,Alignment>(a,internal::pdiv(internal::ploadt<Packet,Alignment>(a),b)); }
+};
+template<typename Scalar>
+struct functor_traits<div_assign_op<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::ReadCost + NumTraits<Scalar>::MulCost,  // the division is costed as a multiplication
+    PacketAccess = packet_traits<Scalar>::HasDiv
+  };
+};
+
+
+/** \internal
+  * \brief Template functor for scalar/packet assignment with swapping
+  *
+  * It works as follows. For a non-vectorized evaluation loop, we have:
+  *   for(i) func(A.coeffRef(i), B.coeff(i));
+  * where B is a SwapWrapper expression. The trick is to make SwapWrapper::coeff behave like a non-const coeffRef.
+  * Actually, SwapWrapper might not even be needed: even if B is a plain expression, it has to be writable, so
+  * B.coeff already returns a const reference to the underlying scalar value.
+  * 
+  * The case of a vectorized loop is more tricky:
+  *   for(i,j) func.assignPacket<A_Align>(&A.coeffRef(i,j), B.packet<B_Align>(i,j));
+  * Here, B must be a SwapWrapper whose packet function actually returns a proxy object holding a Scalar*,
+  * the actual alignment and Packet type.
+  *
+  */
+template<typename Scalar> struct swap_assign_op {
+
+  EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const
+  {
+    Scalar& rhs = const_cast<Scalar&>(b); // b is written to as well, hence the cast (done once for both branches)
+#ifdef __CUDACC__
+    // FIXME is there some kind of cuda::swap?
+    Scalar t=b; rhs=a; a=t;
+#else
+    using std::swap; swap(a,rhs);
+#endif
+  }
+};
+template<typename Scalar>
+struct functor_traits<swap_assign_op<Scalar> > {
+  enum {
+    Cost = 3 * NumTraits<Scalar>::ReadCost,
+    PacketAccess = packet_traits<Scalar>::Vectorizable
+  };
+};
+
+} // namespace internal
+
+} // namespace Eigen
+
+#endif // EIGEN_ASSIGNMENT_FUNCTORS_H
diff --git a/nuparu/include/Eigen/src/Core/functors/BinaryFunctors.h b/nuparu/include/Eigen/src/Core/functors/BinaryFunctors.h
new file mode 100644
index 00000000..4962d625
--- /dev/null
+++ b/nuparu/include/Eigen/src/Core/functors/BinaryFunctors.h
@@ -0,0 +1,523 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BINARY_FUNCTORS_H
+#define EIGEN_BINARY_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- associative binary functors ----------
+
+/** \internal
+  * \brief Template functor to compute the sum of two scalars
+  *
+  * \sa class CwiseBinaryOp, MatrixBase::operator+, class VectorwiseOp, DenseBase::sum()
+  */
+template<typename Scalar> struct scalar_sum_op {
+//   typedef Scalar result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a + b; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::padd(a,b); }  // packet counterpart of operator()
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
+  { return internal::predux(a); }  // reduces the single packet a to one scalar
+};
+template<typename Scalar>
+struct functor_traits<scalar_sum_op<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasAdd
+  };
+};
+
+/** \internal
+  * \brief Template specialization to deprecate the summation of boolean expressions.
+  * This is required to solve Bug 426.
+  * \sa DenseBase::count(), DenseBase::any(), ArrayBase::cast(), MatrixBase::cast()
+  */
+template<> struct scalar_sum_op<bool> : scalar_sum_op<int> {  // all operations are inherited from the int functor
+  EIGEN_DEPRECATED  // applies to the default constructor declared on the next line
+  scalar_sum_op() {}
+};
+
+
+/** \internal
+  * \brief Template functor to compute the product of two scalars
+  *
+  * \sa class CwiseBinaryOp, Cwise::operator*(), class VectorwiseOp, MatrixBase::redux()
+  */
+template<typename LhsScalar,typename RhsScalar> struct scalar_product_op {
+  enum {
+    // TODO vectorize mixed product
+    Vectorizable = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasMul && packet_traits<RhsScalar>::HasMul  // same scalar type on both sides is required
+  };
+  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;  // result type of a mixed-type product
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::pmul(a,b); }  // packet counterpart of operator()
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
+  { return internal::predux_mul(a); }  // reduces the single packet a to one scalar
+};
+template<typename LhsScalar,typename RhsScalar>
+struct functor_traits<scalar_product_op<LhsScalar,RhsScalar> > {
+  enum {
+    Cost = (NumTraits<LhsScalar>::MulCost + NumTraits<RhsScalar>::MulCost)/2, // rough estimate!
+    PacketAccess = scalar_product_op<LhsScalar,RhsScalar>::Vectorizable
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the conjugate product of two scalars
+  *
+  * This is a short cut for conj(x) * y which is needed for optimization purpose; in Eigen2 support mode, this becomes x * conj(y)
+  */
+template<typename LhsScalar,typename RhsScalar> struct scalar_conj_product_op {
+  // Conj is passed to conj_helper as the "conjugate the left operand" flag; it is set only for complex LhsScalar.
+  enum {
+    Conj = NumTraits<LhsScalar>::IsComplex
+  };
+  
+  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
+  
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_conj_product_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const
+  { return conj_helper<LhsScalar,RhsScalar,Conj,false>().pmul(a,b); }
+  // Packet version: same conj_helper, instantiated on the packet type for both operands.
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return conj_helper<Packet,Packet,Conj,false>().pmul(a,b); }
+};
+template<typename LhsScalar,typename RhsScalar>
+struct functor_traits<scalar_conj_product_op<LhsScalar,RhsScalar> > {
+  enum {
+    Cost = NumTraits<LhsScalar>::MulCost,
+    PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMul  // packets only when both scalar types match
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the min of two scalars
+  *
+  * \sa class CwiseBinaryOp, MatrixBase::cwiseMin, class VectorwiseOp, MatrixBase::minCoeff()
+  */
+template<typename Scalar> struct scalar_min_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_min_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return numext::mini(a, b); }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::pmin(a,b); }  // packet counterpart of operator()
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
+  { return internal::predux_min(a); }  // reduces the single packet a to one scalar
+};
+template<typename Scalar>
+struct functor_traits<scalar_min_op<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::AddCost,  // the comparison is costed as an addition
+    PacketAccess = packet_traits<Scalar>::HasMin
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the max of two scalars
+  *
+  * \sa class CwiseBinaryOp, MatrixBase::cwiseMax, class VectorwiseOp, MatrixBase::maxCoeff()
+  */
+template<typename Scalar> struct scalar_max_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_max_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return numext::maxi(a, b); }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::pmax(a,b); }  // packet counterpart of operator()
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
+  { return internal::predux_max(a); }  // reduces the single packet a to one scalar
+};
+template<typename Scalar>
+struct functor_traits<scalar_max_op<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::AddCost,  // the comparison is costed as an addition
+    PacketAccess = packet_traits<Scalar>::HasMax
+  };
+};
+
+/** \internal
+  * \brief Template functors for comparison of two scalars
+  * \todo Implement packet-comparisons
+  */
+template<typename Scalar, ComparisonName cmp> struct scalar_cmp_op;  // primary template is only declared; one specialization per ComparisonName follows
+
+template<typename Scalar, ComparisonName cmp>
+struct functor_traits<scalar_cmp_op<Scalar, cmp> > {
+  enum {
+    Cost = NumTraits<Scalar>::AddCost,  // a comparison is costed as an addition
+    PacketAccess = false                // no packetOp is implemented (see the \todo above)
+  };
+};
+
+template<ComparisonName Cmp, typename Scalar>
+struct result_of<scalar_cmp_op<Scalar, Cmp>(Scalar,Scalar)> {
+  typedef bool type;  // every comparison functor returns bool, whatever the scalar type
+};
+
+
+template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_EQ> {  // a == b
+  typedef bool result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a==b;}
+};
+template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_LT> {  // a < b
+  typedef bool result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a<b;}
+};
+template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_LE> {  // a <= b
+  typedef bool result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a<=b;}
+};
+template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_GT> {  // a > b
+  typedef bool result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a>b;}
+};
+template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_GE> {  // a >= b
+  typedef bool result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a>=b;}
+};
+template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_UNORD> {  // "unordered": true only when neither a<=b nor b<=a holds (e.g. a NaN operand)
+  typedef bool result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return !(a<=b || b<=a);}
+};
+template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_NEQ> {  // a != b
+  typedef bool result_type;
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a!=b;}
+};
+
+
+/** \internal
+  * \brief Template functor to compute the hypot of two scalars
+  *
+  * \sa MatrixBase::stableNorm(), class Redux
+  */
+template<typename Scalar> struct scalar_hypot_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_hypot_op)
+//   typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& _x, const Scalar& _y) const
+  {
+    // Returns sqrt(_x^2 + _y^2), evaluated as p * sqrt(1 + (q/p)^2) where p is the
+    // larger of the two operands and q the other one.
+    using std::sqrt;
+    Scalar p, qp;
+    if(_x>_y) { p = _x; qp = _y; }
+    else      { p = _y; qp = _x; }
+    // Both operands zero: the division below would be 0/0 and the result NaN,
+    // whereas hypot(0,0) is 0. Every other input follows the original formula.
+    if(p==Scalar(0) && qp==Scalar(0))
+      return Scalar(0);
+    qp = qp / p;
+    return p * sqrt(Scalar(1) + qp*qp);
+  }
+};
+template<typename Scalar>
+struct functor_traits<scalar_hypot_op<Scalar> > {
+  // Fixed cost estimate; no packet version of this functor is provided.
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess=0 };
+};
+
+/** \internal
+  * \brief Template functor to compute the pow of two scalars
+  */
+template<typename Scalar, typename OtherScalar> struct scalar_binary_pow_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_binary_pow_op)
+  EIGEN_DEVICE_FUNC
+  inline Scalar operator() (const Scalar& a, const OtherScalar& b) const { return numext::pow(a, b); }  // a is the base, b the exponent
+};
+template<typename Scalar, typename OtherScalar>
+struct functor_traits<scalar_binary_pow_op<Scalar,OtherScalar> > {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };  // fixed cost estimate, scalar path only
+};
+
+
+
+//---------- non associative binary functors ----------
+
+/** \internal
+  * \brief Template functor to compute the difference of two scalars
+  *
+  * \sa class CwiseBinaryOp, MatrixBase::operator-
+  */
+template<typename Scalar> struct scalar_difference_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a - b; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::psub(a,b); }  // packet counterpart of operator()
+};
+template<typename Scalar>
+struct functor_traits<scalar_difference_op<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::AddCost,  // the subtraction is costed as an addition
+    PacketAccess = packet_traits<Scalar>::HasSub
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the quotient of two scalars
+  *
+  * \sa class CwiseBinaryOp, Cwise::operator/()
+  */
+template<typename LhsScalar,typename RhsScalar> struct scalar_quotient_op {
+  enum {
+    // TODO vectorize mixed quotient
+    Vectorizable = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasDiv && packet_traits<RhsScalar>::HasDiv  // same scalar type on both sides is required
+  };
+  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;  // the product traits are reused for the quotient's result type
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a / b; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
+  { return internal::pdiv(a,b); }  // packet counterpart of operator()
+};
+template<typename LhsScalar,typename RhsScalar>
+struct functor_traits<scalar_quotient_op<LhsScalar,RhsScalar> > {
+  enum {
+    Cost = (NumTraits<LhsScalar>::MulCost + NumTraits<RhsScalar>::MulCost), // rough estimate!
+    PacketAccess = scalar_quotient_op<LhsScalar,RhsScalar>::Vectorizable
+  };
+};
+
+
+
+/** \internal
+  * \brief Template functor to compute the and of two booleans
+  *
+  * \sa class CwiseBinaryOp, ArrayBase::operator&&
+  */
+struct scalar_boolean_and_op {  // plain struct: the operands are always bool
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_and_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a && b; }
+};
+template<> struct functor_traits<scalar_boolean_and_op> {
+  enum {
+    Cost = NumTraits<bool>::AddCost,
+    PacketAccess = false  // scalar path only
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the or of two booleans
+  *
+  * \sa class CwiseBinaryOp, ArrayBase::operator||
+  */
+struct scalar_boolean_or_op {  // plain struct: the operands are always bool
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_or_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a || b; }
+};
+template<> struct functor_traits<scalar_boolean_or_op> {
+  enum {
+    Cost = NumTraits<bool>::AddCost,
+    PacketAccess = false  // scalar path only
+  };
+};
+
+
+
+//---------- binary functors bound to a constant, thus appearing as a unary functor ----------
+
+/** \internal
+  * \brief Template functor to multiply a scalar by a fixed other one
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::operator*, MatrixBase::operator/
+  */
+/* NOTE why is doing the pset1() in packetOp *an optimization*?
+ * Indeed, it seems better to declare m_other as a Packet and do the pset1() once
+ * in the constructor. However, in practice:
+ *  - GCC does not like m_other as a Packet and generates a load every time it needs it
+ *  - on the other hand GCC is able to move the pset1() outside the loop :)
+ *  - simpler code ;)
+ * (ICC and gcc 4.4 seem to perform well in both cases, the issue is visible with y = a*x + b*y)
+ */
+template<typename Scalar>
+struct scalar_multiple_op {
+  // FIXME default copy constructors seems bugged with std::complex<>
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE scalar_multiple_op(const scalar_multiple_op& other) : m_other(other.m_other) { }
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE scalar_multiple_op(const Scalar& other) : m_other(other) { }  // other is the fixed factor
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a * m_other; }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return internal::pmul(a, pset1<Packet>(m_other)); }  // the factor is broadcast with pset1 on every call, see the NOTE above
+  typename add_const_on_value_type<typename NumTraits<Scalar>::Nested>::type m_other;  // the fixed factor, stored with the type chosen by NumTraits<Scalar>::Nested
+};
+template<typename Scalar>
+struct functor_traits<scalar_multiple_op<Scalar> >
+{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
+
+template<typename Scalar1, typename Scalar2>
+struct scalar_multiple2_op {  // mixed-type variant of scalar_multiple_op: multiplies a Scalar1 by a fixed Scalar2
+  typedef typename scalar_product_traits<Scalar1,Scalar2>::ReturnType result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_multiple2_op(const scalar_multiple2_op& other) : m_other(other.m_other) { }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_multiple2_op(const Scalar2& other) : m_other(other) { }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar1& a) const { return a * m_other; }
+  typename add_const_on_value_type<typename NumTraits<Scalar2>::Nested>::type m_other;  // the fixed factor
+};
+template<typename Scalar1,typename Scalar2>
+struct functor_traits<scalar_multiple2_op<Scalar1,Scalar2> >
+{ enum { Cost = NumTraits<Scalar1>::MulCost, PacketAccess = false }; };  // no packetOp: scalar path only
+
+/** \internal
+  * \brief Template functor to divide a scalar by a fixed other one
+  *
+  * This functor is used to implement the quotient of a matrix by
+  * a scalar where the scalar type is not necessarily a floating point type.
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::operator/
+  */
+template<typename Scalar>
+struct scalar_quotient1_op {
+  // FIXME default copy constructors seems bugged with std::complex<>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_quotient1_op(const scalar_quotient1_op& other) : m_other(other.m_other) { }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_quotient1_op(const Scalar& other) : m_other(other) {}  // other is the fixed divisor
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a / m_other; }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return internal::pdiv(a, pset1<Packet>(m_other)); }  // the divisor is broadcast with pset1 on every call
+  typename add_const_on_value_type<typename NumTraits<Scalar>::Nested>::type m_other;  // the fixed divisor
+};
+template<typename Scalar>
+struct functor_traits<scalar_quotient1_op<Scalar> >
+{ enum { Cost = 2 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasDiv }; };  // a division is costed as two multiplications
+
+template<typename Scalar1, typename Scalar2>
+struct scalar_quotient2_op {  // mixed-type variant of scalar_quotient1_op: divides a Scalar1 by a fixed Scalar2
+  typedef typename scalar_product_traits<Scalar1,Scalar2>::ReturnType result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_quotient2_op(const scalar_quotient2_op& other) : m_other(other.m_other) { }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_quotient2_op(const Scalar2& other) : m_other(other) { }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar1& a) const { return a / m_other; }
+  typename add_const_on_value_type<typename NumTraits<Scalar2>::Nested>::type m_other;  // the fixed divisor
+};
+template<typename Scalar1,typename Scalar2>
+struct functor_traits<scalar_quotient2_op<Scalar1,Scalar2> >
+{ enum { Cost = 2 * NumTraits<Scalar1>::MulCost, PacketAccess = false }; };  // no packetOp: scalar path only
+
+// In Eigen, any binary op (Product, CwiseBinaryOp) requires the Lhs and Rhs to have the same scalar type, except for multiplication
+// where the mixing of different types is handled by scalar_product_traits.
+// In particular, real * complex<real> is allowed.
+// FIXME move this to functor_traits adding a functor_default
+template<typename Functor> struct functor_is_product_like { enum { ret = 0 }; };  // default: not product-like
+template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_product_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
+template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_conj_product_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
+template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_quotient_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };  // the quotient is flagged product-like too
+
+
+/** \internal
+  * \brief Template functor to add a scalar to a fixed other one
+  * \sa class CwiseUnaryOp, Array::operator+
+  */
+/* If you wonder why doing the pset1() in packetOp() is an optimization, check scalar_multiple_op */
+template<typename Scalar>
+struct scalar_add_op {
+  // FIXME default copy constructors seems bugged with std::complex<>
+  EIGEN_DEVICE_FUNC inline scalar_add_op(const scalar_add_op& other) : m_other(other.m_other) { }
+  EIGEN_DEVICE_FUNC inline scalar_add_op(const Scalar& other) : m_other(other) { }  // other is the fixed addend
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a + m_other; }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
+  { return internal::padd(a, pset1<Packet>(m_other)); }  // the addend is broadcast with pset1 on every call
+  const Scalar m_other;  // the fixed addend
+};
+template<typename Scalar>
+struct functor_traits<scalar_add_op<Scalar> >
+{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasAdd }; };
+
+/** \internal
+  * \brief Template functor to subtract a fixed scalar from another one (computes a - m_other)
+  * \sa class CwiseUnaryOp, Array::operator-, struct scalar_add_op, struct scalar_rsub_op
+  */
+template<typename Scalar>
+struct scalar_sub_op {
+  EIGEN_DEVICE_FUNC inline scalar_sub_op(const scalar_sub_op& other) : m_other(other.m_other) { }
+  EIGEN_DEVICE_FUNC inline scalar_sub_op(const Scalar& other) : m_other(other) { }  // other is the fixed subtrahend
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a - m_other; }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
+  { return internal::psub(a, pset1<Packet>(m_other)); }  // the subtrahend is broadcast with pset1 on every call
+  const Scalar m_other;  // the fixed subtrahend
+};
+template<typename Scalar>
+struct functor_traits<scalar_sub_op<Scalar> >  // packetOp relies on psub, so packet access is gated on HasSub (as in scalar_difference_op)
+{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasSub }; };
+
+/** \internal
+  * \brief Template functor to subtract a scalar from a fixed other one (computes m_other - a)
+  * \sa class CwiseUnaryOp, Array::operator-, struct scalar_add_op, struct scalar_sub_op
+  */
+template<typename Scalar>
+struct scalar_rsub_op {
+  EIGEN_DEVICE_FUNC inline scalar_rsub_op(const scalar_rsub_op& other) : m_other(other.m_other) { }
+  EIGEN_DEVICE_FUNC inline scalar_rsub_op(const Scalar& other) : m_other(other) { }  // other is the fixed minuend
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return m_other - a; }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
+  { return internal::psub(pset1<Packet>(m_other), a); }  // the minuend is broadcast with pset1 on every call
+  const Scalar m_other;  // the fixed minuend
+};
+template<typename Scalar>
+struct functor_traits<scalar_rsub_op<Scalar> >  // packetOp relies on psub, so packet access is gated on HasSub (as in scalar_difference_op)
+{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasSub }; };
+
+/** \internal
+  * \brief Template functor to raise a scalar to a power
+  * \sa class CwiseUnaryOp, Cwise::pow
+  */
+template<typename Scalar>
+struct scalar_pow_op {
+  // FIXME default copy constructors seems bugged with std::complex<>
+  EIGEN_DEVICE_FUNC inline scalar_pow_op(const scalar_pow_op& other) : m_exponent(other.m_exponent) { }
+  EIGEN_DEVICE_FUNC inline scalar_pow_op(const Scalar& exponent) : m_exponent(exponent) {}
+  EIGEN_DEVICE_FUNC
+  inline Scalar operator() (const Scalar& a) const { return numext::pow(a, m_exponent); }  // a is the base
+  const Scalar m_exponent;  // the fixed exponent, same type as the base
+};
+template<typename Scalar>
+struct functor_traits<scalar_pow_op<Scalar> >
+{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false }; };  // fixed cost estimate, scalar path only
+
+/** \internal
+  * \brief Template functor to compute the quotient between a scalar and array entries.
+  * \sa class CwiseUnaryOp, Cwise::inverse()
+  */
+template<typename Scalar>
+struct scalar_inverse_mult_op {  // NOTE(review): no functor_traits specialization accompanies this functor here -- confirm the defaults are intended
+  EIGEN_DEVICE_FUNC scalar_inverse_mult_op(const Scalar& other) : m_other(other) {}  // other is the fixed numerator
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return m_other / a; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
+  { return internal::pdiv(pset1<Packet>(m_other),a); }  // the numerator is broadcast with pset1 on every call
+  Scalar m_other;  // the fixed numerator (not const, unlike in the neighbouring functors)
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_BINARY_FUNCTORS_H
diff --git a/nuparu/include/Eigen/src/Core/functors/CMakeLists.txt b/nuparu/include/Eigen/src/Core/functors/CMakeLists.txt
new file mode 100644
index 00000000..f4b99a9c
--- /dev/null
+++ b/nuparu/include/Eigen/src/Core/functors/CMakeLists.txt
@@ -0,0 +1,6 @@
+# Install every header of this directory as part of the Devel component.
+file(GLOB Eigen_Core_Functor_SRCS "*.h")
+install(
+  FILES ${Eigen_Core_Functor_SRCS}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/functors
+  COMPONENT Devel)
diff --git a/nuparu/include/Eigen/src/Core/functors/NullaryFunctors.h b/nuparu/include/Eigen/src/Core/functors/NullaryFunctors.h
new file mode 100644
index 00000000..cd9fbf26
--- /dev/null
+++ b/nuparu/include/Eigen/src/Core/functors/NullaryFunctors.h
@@ -0,0 +1,150 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_NULLARY_FUNCTORS_H
+#define EIGEN_NULLARY_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal
+  * \brief Nullary functor returning the same stored value for every index.
+  *
+  * The index arguments are accepted but ignored. The packet path broadcasts
+  * the stored value with pset1.
+  */
+template<typename Scalar>
+struct scalar_constant_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_constant_op(const scalar_constant_op& src)
+    : m_other(src.m_other)
+  {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_constant_op(const Scalar& value)
+    : m_other(value)
+  {}
+
+  template<typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index, Index = 0) const
+  {
+    return m_other;
+  }
+
+  template<typename Index, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetOp(Index, Index = 0) const
+  {
+    return internal::pset1<PacketType>(m_other);
+  }
+
+  const Scalar m_other;
+};
+
+// Unit cost; packet access follows packet_traits<Scalar>::Vectorizable.
+template<typename Scalar>
+struct functor_traits<scalar_constant_op<Scalar> > {
+  enum {
+    Cost = 1,
+    PacketAccess = packet_traits<Scalar>::Vectorizable,
+    IsRepeatable = true
+  };
+};
+
+/** \internal
+  * \brief Nullary functor producing identity coefficients: Scalar(1) where row==col, Scalar(0) elsewhere.
+  */
+template<typename Scalar>
+struct scalar_identity_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_identity_op)
+
+  template<typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index row, Index col) const
+  {
+    if (row==col)
+      return Scalar(1);
+    return Scalar(0);
+  }
+};
+
+// One addition's worth of cost; scalar path only.
+template<typename Scalar>
+struct functor_traits<scalar_identity_op<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::AddCost,
+    PacketAccess = false,
+    IsRepeatable = true
+  };
+};
+
+template <typename Scalar, typename Packet, bool RandomAccess> struct linspaced_op_impl;
+
+// Sequential (linear) access implementation, RandomAccess == false.
+// A running packet m_base is kept between calls:
+// 1) at construction
+//   base = [low, ..., low] + ([step, ..., step] * [-size, ..., 0])
+// 2) on every call (size is 1 for coeff access, PacketSize for packet access)
+//   base += [size*step, ..., size*step]
+//
+// TODO: Perhaps it's better to initialize lazily (so not in the constructor but in packetOp)
+//       in order to avoid the padd() in operator() ?
+template <typename Scalar, typename Packet>
+struct linspaced_op_impl<Scalar,Packet,false>
+{
+  linspaced_op_impl(const Scalar& low, const Scalar& step)
+    : m_low(low),
+      m_step(step),
+      m_packetStep(pset1<Packet>(unpacket_traits<Packet>::size*step)),
+      m_base(padd(pset1<Packet>(low), pmul(pset1<Packet>(step),plset<Packet>(-unpacket_traits<Packet>::size))))
+  {}
+
+  // Scalar access: advances the running base by one step, then returns
+  // low + i*step computed directly from the index.
+  template<typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index idx) const
+  {
+    m_base = padd(m_base, pset1<Packet>(m_step));
+    return m_low+Scalar(idx)*m_step;
+  }
+
+  // Packet access: the index is ignored; the running base is advanced by one
+  // packet step and the updated base is returned.
+  template<typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(Index) const
+  {
+    m_base = padd(m_base,m_packetStep);
+    return m_base;
+  }
+
+  const Scalar m_low;
+  const Scalar m_step;
+  const Packet m_packetStep;
+  mutable Packet m_base;  // mutable: updated from the const call operators above
+};
+
+// Random-access implementation, RandomAccess == true. No state is carried
+// between calls; every packet is rebuilt from the requested index:
+//   [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) )
+template <typename Scalar, typename Packet>
+struct linspaced_op_impl<Scalar,Packet,true>
+{
+  linspaced_op_impl(const Scalar& low, const Scalar& step)
+    : m_low(low),
+      m_step(step),
+      m_lowPacket(pset1<Packet>(m_low)),
+      m_stepPacket(pset1<Packet>(m_step)),
+      m_interPacket(plset<Packet>(0))
+  {}
+
+  // Coefficient at position idx: low + idx*step.
+  // NOTE(review): idx is multiplied as-is here, whereas the sequential
+  // specialization converts it with Scalar(i) first -- confirm this is intended.
+  template<typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index idx) const
+  {
+    return m_low+idx*m_step;
+  }
+
+  // Packet starting at position idx, built from the precomputed packets.
+  template<typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(Index idx) const
+  {
+    return internal::padd(m_lowPacket, pmul(m_stepPacket, padd(pset1<Packet>(Scalar(idx)),m_interPacket)));
+  }
+
+  const Scalar m_low;
+  const Scalar m_step;
+  const Packet m_lowPacket;
+  const Packet m_stepPacket;
+  const Packet m_interPacket;
+};
+
+// ----- Linspace functor: thin front-end forwarding to linspaced_op_impl ----------------
+
+// Forward declaration. RandomAccess defaults to true: per the original authors this
+// does not really give a speed gain when using packet access, but it allows the
+// functor to be used in nested expressions.
+template <typename Scalar, typename PacketType, bool RandomAccess = true> struct linspaced_op;
+template <typename Scalar, typename PacketType, bool RandomAccess> struct functor_traits< linspaced_op<Scalar,PacketType,RandomAccess> >
+{ enum { Cost = 1, PacketAccess = packet_traits<Scalar>::HasSetLinear, IsRepeatable = true }; };
+template <typename Scalar, typename PacketType, bool RandomAccess> struct linspaced_op
+{
+  linspaced_op(const Scalar& low, const Scalar& high, Index num_steps) : impl((num_steps==1 ? high : low), (num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1))) {} // num_steps==1: start at high with a value-initialized step, so (high-low)/Scalar(0) is never evaluated
+
+  template<typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return impl(i); }
+
+  // Two-index overload, needed when assigning e.g. a RowVectorXd to a MatrixXd: there
+  // row==0 and col drives the iteration. One index must be 0, so col + row is the linear index.
+  template<typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (Index row, Index col) const 
+  {
+    eigen_assert(col==0 || row==0);
+    return impl(col + row);
+  }
+
+  template<typename Index, typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(Index i) const { return impl.packetOp(i); }
+
+  // Two-index packet overload, needed for the same RowVectorXd-to-MatrixXd case: there
+  // row==0 and col drives the iteration. One index must be 0, so col + row is the linear index.
+  template<typename Index, typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(Index row, Index col) const
+  {
+    eigen_assert(col==0 || row==0);
+    return impl.packetOp(col + row);
+  }
+
+  // This proxy object handles the actual required temporaries, the different
+  // implementations (random vs. sequential access, chosen by RandomAccess) as well as
+  // the correct piping to size 2/4 packet operations.
+  const linspaced_op_impl<Scalar,PacketType,RandomAccess> impl;
+};
+
+// Small meta-function telling whether a functor allows linear (single-index) access.
+// The answer is always 'yes' (ret = 1), except for scalar_identity_op, which is
+// specialized below to answer 'no' (ret = 0).
+template<typename Functor>
+struct functor_has_linear_access {
+  enum { ret = 1 };
+};
+template<typename Scalar>
+struct functor_has_linear_access<scalar_identity_op<Scalar> > {
+  enum { ret = 0 };
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_NULLARY_FUNCTORS_H
diff --git a/nuparu/include/Eigen/src/Core/functors/StlFunctors.h b/nuparu/include/Eigen/src/Core/functors/StlFunctors.h
new file mode 100644
index 00000000..0b4e5a29
--- /dev/null
+++ b/nuparu/include/Eigen/src/Core/functors/StlFunctors.h
@@ -0,0 +1,132 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_STL_FUNCTORS_H
+#define EIGEN_STL_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+// Default functor traits for the STL arithmetic functors. None of them offers a
+// packet path; the cost is taken from NumTraits<T>.
+
+// std::multiplies: cost of one multiplication.
+template<typename T>
+struct functor_traits<std::multiplies<T> > {
+  enum { Cost = NumTraits<T>::MulCost, PacketAccess = false };
+};
+
+// std::divides: also accounted as NumTraits<T>::MulCost.
+template<typename T>
+struct functor_traits<std::divides<T> > {
+  enum { Cost = NumTraits<T>::MulCost, PacketAccess = false };
+};
+
+// std::plus: cost of one addition.
+template<typename T>
+struct functor_traits<std::plus<T> > {
+  enum { Cost = NumTraits<T>::AddCost, PacketAccess = false };
+};
+
+// std::minus: cost of one addition.
+template<typename T>
+struct functor_traits<std::minus<T> > {
+  enum { Cost = NumTraits<T>::AddCost, PacketAccess = false };
+};
+
+// std::negate: cost of one addition.
+template<typename T>
+struct functor_traits<std::negate<T> > {
+  enum { Cost = NumTraits<T>::AddCost, PacketAccess = false };
+};
+
+// STL logical functors: unit cost, scalar path only.
+template<typename T>
+struct functor_traits<std::logical_or<T> > {
+  enum { Cost = 1, PacketAccess = false };
+};
+
+template<typename T>
+struct functor_traits<std::logical_and<T> > {
+  enum { Cost = 1, PacketAccess = false };
+};
+
+template<typename T>
+struct functor_traits<std::logical_not<T> > {
+  enum { Cost = 1, PacketAccess = false };
+};
+
+// STL comparison functors: unit cost, scalar path only.
+template<typename T>
+struct functor_traits<std::greater<T> > {
+  enum { Cost = 1, PacketAccess = false };
+};
+
+template<typename T>
+struct functor_traits<std::less<T> > {
+  enum { Cost = 1, PacketAccess = false };
+};
+
+template<typename T>
+struct functor_traits<std::greater_equal<T> > {
+  enum { Cost = 1, PacketAccess = false };
+};
+
+template<typename T>
+struct functor_traits<std::less_equal<T> > {
+  enum { Cost = 1, PacketAccess = false };
+};
+
+template<typename T>
+struct functor_traits<std::equal_to<T> > {
+  enum { Cost = 1, PacketAccess = false };
+};
+
+template<typename T>
+struct functor_traits<std::not_equal_to<T> > {
+  enum { Cost = 1, PacketAccess = false };
+};
+
+#if(__cplusplus < 201103L)
+// std::binder* are deprecated since c++11 and will be removed in c++17, so these
+// specializations are only compiled for pre-c++11 builds. A binder inherits the
+// cost of the functor it wraps.
+template<typename T>
+struct functor_traits<std::binder2nd<T> > {
+  enum { Cost = functor_traits<T>::Cost, PacketAccess = false };
+};
+
+template<typename T>
+struct functor_traits<std::binder1st<T> > {
+  enum { Cost = functor_traits<T>::Cost, PacketAccess = false };
+};
+#endif
+
+template<typename T>
+struct functor_traits<std::unary_negate<T> >
+{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
+
+template<typename T>
+struct functor_traits<std::binary_negate<T> >
+{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
+
+#ifdef EIGEN_STDEXT_SUPPORT
+
+// Traits for the non-standard STL extension functors, compiled only when
+// EIGEN_STDEXT_SUPPORT is defined. Projections and selections are free (Cost = 0);
+// compositions add up the costs of the functors they combine.
+
+template<typename T0,typename T1>
+struct functor_traits<std::project1st<T0,T1> > {
+  enum { Cost = 0, PacketAccess = false };
+};
+
+template<typename T0,typename T1>
+struct functor_traits<std::project2nd<T0,T1> > {
+  enum { Cost = 0, PacketAccess = false };
+};
+
+template<typename T0,typename T1>
+struct functor_traits<std::select2nd<std::pair<T0,T1> > > {
+  enum { Cost = 0, PacketAccess = false };
+};
+
+template<typename T0,typename T1>
+struct functor_traits<std::select1st<std::pair<T0,T1> > > {
+  enum { Cost = 0, PacketAccess = false };
+};
+
+template<typename T0,typename T1>
+struct functor_traits<std::unary_compose<T0,T1> > {
+  enum {
+    Cost = functor_traits<T0>::Cost + functor_traits<T1>::Cost,
+    PacketAccess = false
+  };
+};
+
+template<typename T0,typename T1,typename T2>
+struct functor_traits<std::binary_compose<T0,T1,T2> > {
+  enum {
+    Cost = functor_traits<T0>::Cost + functor_traits<T1>::Cost + functor_traits<T2>::Cost,
+    PacketAccess = false
+  };
+};
+
+#endif // EIGEN_STDEXT_SUPPORT
+
+// Allows new functors and functor_traits specializations to be added from outside Eigen.
+// This macro is really needed because functor_traits must be specialized after it is declared but before it is used.
+#ifdef EIGEN_FUNCTORS_PLUGIN
+#include EIGEN_FUNCTORS_PLUGIN
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_STL_FUNCTORS_H
diff --git a/nuparu/include/Eigen/src/Core/functors/UnaryFunctors.h b/nuparu/include/Eigen/src/Core/functors/UnaryFunctors.h
new file mode 100644
index 00000000..6891cfdd
--- /dev/null
+++ b/nuparu/include/Eigen/src/Core/functors/UnaryFunctors.h
@@ -0,0 +1,778 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_UNARY_FUNCTORS_H
+#define EIGEN_UNARY_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal
+  * \brief Unary functor returning the opposite (-a) of a scalar
+  *
+  * The packet path forwards to internal::pnegate.
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::operator-
+  */
+template<typename Scalar>
+struct scalar_opposite_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_opposite_op)
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& x) const
+  {
+    return -x;
+  }
+
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& p) const
+  {
+    return internal::pnegate(p);
+  }
+};
+
+template<typename Scalar>
+struct functor_traits<scalar_opposite_op<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasNegate
+  };
+};
+
+/** \internal
+  * \brief Unary functor returning the absolute value of a scalar
+  *
+  * The result type is NumTraits<Scalar>::Real. abs is looked up with
+  * std::abs made visible; the packet path forwards to internal::pabs.
+  *
+  * \sa class CwiseUnaryOp, Cwise::abs
+  */
+template<typename Scalar>
+struct scalar_abs_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_abs_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& x) const
+  {
+    using std::abs;
+    return abs(x);
+  }
+
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& p) const
+  {
+    return internal::pabs(p);
+  }
+};
+
+template<typename Scalar>
+struct functor_traits<scalar_abs_op<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasAbs
+  };
+};
+
+/** \internal
+  * \brief Unary functor computing the score of a scalar, used to choose a pivot
+  *
+  * It inherits everything from scalar_abs_op and only adds the Score_is_abs
+  * tag typedef. Its functor_traits are likewise those of scalar_abs_op.
+  *
+  * \sa class CwiseUnaryOp
+  */
+template<typename Scalar>
+struct scalar_score_coeff_op : scalar_abs_op<Scalar> {
+  typedef void Score_is_abs;
+};
+
+template<typename Scalar>
+struct functor_traits<scalar_score_coeff_op<Scalar> >
+  : functor_traits<scalar_abs_op<Scalar> >
+{};
+
+/* Avoid recomputing abs when we know the score and they are the same. Not a true Eigen functor.
+ *
+ * Primary template: the score argument is ignored and abs(a) is computed from
+ * the scalar. The partial specialization that follows is selected when
+ * scalar_score_coeff_op<Scalar>::Score_is_abs names void, which matches the
+ * defaulted second template argument; it returns the score it was given, unchanged.
+ */
+template<typename Scalar, typename=void> struct abs_knowing_score
+{
+  EIGEN_EMPTY_STRUCT_CTOR(abs_knowing_score)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  template<typename Score>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a, const Score&) const { using std::abs; return abs(a); }
+};
+template<typename Scalar> struct abs_knowing_score<Scalar, typename scalar_score_coeff_op<Scalar>::Score_is_abs>
+{
+  EIGEN_EMPTY_STRUCT_CTOR(abs_knowing_score)
+  typedef typename NumTraits<Scalar>::Real result_type;
+  template<typename Scal>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scal&, const result_type& a) const { return a; }
+};
+
+/** \internal
+  * \brief Unary functor returning the squared absolute value of a scalar
+  *
+  * The scalar path calls numext::abs2 and yields NumTraits<Scalar>::Real;
+  * the packet path multiplies the packet by itself with pmul.
+  *
+  * \sa class CwiseUnaryOp, Cwise::abs2
+  */
+template<typename Scalar>
+struct scalar_abs2_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_abs2_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& x) const
+  {
+    return numext::abs2(x);
+  }
+
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& p) const
+  {
+    return internal::pmul(p,p);
+  }
+};
+
+template<typename Scalar>
+struct functor_traits<scalar_abs2_op<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasAbs2
+  };
+};
+
+/** \internal
+  * \brief Unary functor returning the conjugate of a complex value
+  *
+  * The scalar path calls conj with numext::conj made visible; the packet path
+  * forwards to internal::pconj. The cost is zero for non-complex scalars.
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::conjugate()
+  */
+template<typename Scalar>
+struct scalar_conjugate_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_conjugate_op)
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& x) const
+  {
+    using numext::conj;
+    return conj(x);
+  }
+
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& p) const
+  {
+    return internal::pconj(p);
+  }
+};
+
+template<typename Scalar>
+struct functor_traits<scalar_conjugate_op<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::IsComplex ? NumTraits<Scalar>::AddCost : 0,
+    PacketAccess = packet_traits<Scalar>::HasConj
+  };
+};
+
+/** \internal
+  * \brief Unary functor returning the phase angle of a complex
+  *
+  * The scalar path calls arg with numext::arg made visible and yields
+  * NumTraits<Scalar>::Real; the packet path forwards to internal::parg.
+  *
+  * \sa class CwiseUnaryOp, Cwise::arg
+  */
+template<typename Scalar>
+struct scalar_arg_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_arg_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& x) const
+  {
+    using numext::arg;
+    return arg(x);
+  }
+
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& p) const
+  {
+    return internal::parg(p);
+  }
+};
+
+// Complex scalars are charged five multiplications, real ones a single addition.
+template<typename Scalar>
+struct functor_traits<scalar_arg_op<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::IsComplex ? 5 * NumTraits<Scalar>::MulCost : NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasArg
+  };
+};
+/** \internal
+  * \brief Unary functor converting a scalar to another type
+  *
+  * The conversion is delegated to cast<Scalar, NewType>. The cost is zero
+  * when both types are the same, one NewType addition otherwise. There is no
+  * packet path.
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::cast()
+  */
+template<typename Scalar, typename NewType>
+struct scalar_cast_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+  typedef NewType result_type;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const NewType operator() (const Scalar& x) const
+  {
+    return cast<Scalar, NewType>(x);
+  }
+};
+
+template<typename Scalar, typename NewType>
+struct functor_traits<scalar_cast_op<Scalar,NewType> > {
+  enum {
+    Cost = is_same<Scalar, NewType>::value ? 0 : NumTraits<NewType>::AddCost,
+    PacketAccess = false
+  };
+};
+
+/** \internal
+  * \brief Unary functor returning the real part of a complex, by value
+  *
+  * Delegates to numext::real. Zero cost, no packet path.
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::real()
+  */
+template<typename Scalar>
+struct scalar_real_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_real_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& x) const
+  {
+    return numext::real(x);
+  }
+};
+
+template<typename Scalar>
+struct functor_traits<scalar_real_op<Scalar> > {
+  enum { Cost = 0, PacketAccess = false };
+};
+
+/** \internal
+  * \brief Unary functor returning the imaginary part of a complex, by value
+  *
+  * Delegates to numext::imag. Zero cost, no packet path.
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::imag()
+  */
+template<typename Scalar>
+struct scalar_imag_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_imag_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& x) const
+  {
+    return numext::imag(x);
+  }
+};
+
+template<typename Scalar>
+struct functor_traits<scalar_imag_op<Scalar> > {
+  enum { Cost = 0, PacketAccess = false };
+};
+
+/** \internal
+  * \brief Unary functor returning the real part of a complex as a reference
+  *
+  * The argument is taken by const reference, but constness is cast away
+  * before calling numext::real_ref, so the returned reference is writable.
+  * Zero cost, no packet path.
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::real()
+  */
+template<typename Scalar>
+struct scalar_real_ref_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_real_ref_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type& operator() (const Scalar& x) const
+  {
+    return numext::real_ref(*const_cast<Scalar*>(&x));
+  }
+};
+
+template<typename Scalar>
+struct functor_traits<scalar_real_ref_op<Scalar> > {
+  enum { Cost = 0, PacketAccess = false };
+};
+
+/** \internal
+  * \brief Unary functor returning the imaginary part of a complex as a reference
+  *
+  * The argument is taken by const reference, but constness is cast away
+  * before calling numext::imag_ref, so the returned reference is writable.
+  * Zero cost, no packet path.
+  *
+  * \sa class CwiseUnaryOp, MatrixBase::imag()
+  */
+template<typename Scalar>
+struct scalar_imag_ref_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_imag_ref_op)
+  typedef typename NumTraits<Scalar>::Real result_type;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type& operator() (const Scalar& x) const
+  {
+    return numext::imag_ref(*const_cast<Scalar*>(&x));
+  }
+};
+
+template<typename Scalar>
+struct functor_traits<scalar_imag_ref_op<Scalar> > {
+  enum { Cost = 0, PacketAccess = false };
+};
+
+/** \internal
+  *
+  * \brief Unary functor returning the exponential of a scalar
+  *
+  * The scalar path calls exp with std::exp made visible; the packet path
+  * forwards to internal::pexp.
+  *
+  * \sa class CwiseUnaryOp, Cwise::exp()
+  */
+template<typename Scalar>
+struct scalar_exp_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_exp_op)
+
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& x) const
+  {
+    using std::exp;
+    return exp(x);
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& p) const
+  {
+    return internal::pexp(p);
+  }
+};
+
+template<typename Scalar>
+struct functor_traits<scalar_exp_op<Scalar> > {
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasExp
+  };
+};
+
+/** \internal
+  *
+  * \brief Unary functor returning the logarithm of a scalar
+  *
+  * The scalar path calls log with std::log made visible; the packet path
+  * forwards to internal::plog.
+  *
+  * \sa class CwiseUnaryOp, Cwise::log()
+  */
+template<typename Scalar>
+struct scalar_log_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_log_op)
+
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& x) const
+  {
+    using std::log;
+    return log(x);
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& p) const
+  {
+    return internal::plog(p);
+  }
+};
+
+template<typename Scalar>
+struct functor_traits<scalar_log_op<Scalar> > {
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasLog
+  };
+};
+
+/** \internal
+  *
+  * \brief Unary functor returning the base-10 logarithm of a scalar
+  *
+  * The scalar path calls log10 with std::log10 made visible; the packet path
+  * forwards to internal::plog10.
+  *
+  * \sa class CwiseUnaryOp, Cwise::log10()
+  */
+template<typename Scalar>
+struct scalar_log10_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_log10_op)
+
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& x) const
+  {
+    using std::log10;
+    return log10(x);
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& p) const
+  {
+    return internal::plog10(p);
+  }
+};
+
+template<typename Scalar>
+struct functor_traits<scalar_log10_op<Scalar> > {
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasLog10
+  };
+};
+
+/** \internal
+  * \brief Unary functor returning the square root of a scalar
+  *
+  * The scalar path calls sqrt with std::sqrt made visible; the packet path
+  * forwards to internal::psqrt.
+  *
+  * \sa class CwiseUnaryOp, Cwise::sqrt()
+  */
+template<typename Scalar>
+struct scalar_sqrt_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sqrt_op)
+
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& x) const
+  {
+    using std::sqrt;
+    return sqrt(x);
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& p) const
+  {
+    return internal::psqrt(p);
+  }
+};
+
+template<typename Scalar>
+struct functor_traits<scalar_sqrt_op<Scalar> > {
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasSqrt
+  };
+};
+
+/** \internal
+  * \brief Unary functor returning the reciprocal square root of a scalar
+  *
+  * The scalar path evaluates Scalar(1)/sqrt(a) with std::sqrt made visible;
+  * the packet path forwards to internal::prsqrt.
+  *
+  * \sa class CwiseUnaryOp, Cwise::rsqrt()
+  */
+template<typename Scalar>
+struct scalar_rsqrt_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_rsqrt_op)
+
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& x) const
+  {
+    using std::sqrt;
+    return Scalar(1)/sqrt(x);
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& p) const
+  {
+    return internal::prsqrt(p);
+  }
+};
+
+template<typename Scalar>
+struct functor_traits<scalar_rsqrt_op<Scalar> > {
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasRsqrt
+  };
+};
+
+/** \internal
+  * \brief Unary functor returning the cosine of a scalar
+  *
+  * The scalar path calls cos with std::cos made visible; the packet path
+  * forwards to internal::pcos.
+  *
+  * \sa class CwiseUnaryOp, ArrayBase::cos()
+  */
+template<typename Scalar>
+struct scalar_cos_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cos_op)
+
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& x) const
+  {
+    using std::cos;
+    return cos(x);
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& p) const
+  {
+    return internal::pcos(p);
+  }
+};
+
+template<typename Scalar>
+struct functor_traits<scalar_cos_op<Scalar> > {
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasCos
+  };
+};
+
+/** \internal
+  * \brief Unary functor returning the sine of a scalar
+  *
+  * The scalar path calls sin with std::sin made visible; the packet path
+  * forwards to internal::psin.
+  *
+  * \sa class CwiseUnaryOp, ArrayBase::sin()
+  */
+template<typename Scalar>
+struct scalar_sin_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sin_op)
+
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& x) const
+  {
+    using std::sin;
+    return sin(x);
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& p) const
+  {
+    return internal::psin(p);
+  }
+};
+
+template<typename Scalar>
+struct functor_traits<scalar_sin_op<Scalar> > {
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasSin
+  };
+};
+
+
+/** \internal
+  * \brief Unary functor returning the tangent of a scalar
+  *
+  * The scalar path calls tan with std::tan made visible; the packet path
+  * forwards to internal::ptan.
+  *
+  * \sa class CwiseUnaryOp, ArrayBase::tan()
+  */
+template<typename Scalar>
+struct scalar_tan_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_tan_op)
+
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& x) const
+  {
+    using std::tan;
+    return tan(x);
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& p) const
+  {
+    return internal::ptan(p);
+  }
+};
+
+template<typename Scalar>
+struct functor_traits<scalar_tan_op<Scalar> > {
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasTan
+  };
+};
+
+/** \internal
+  * \brief Unary functor returning the arc cosine of a scalar
+  *
+  * The scalar path calls acos with std::acos made visible; the packet path
+  * forwards to internal::pacos.
+  *
+  * \sa class CwiseUnaryOp, ArrayBase::acos()
+  */
+template<typename Scalar>
+struct scalar_acos_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_acos_op)
+
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& x) const
+  {
+    using std::acos;
+    return acos(x);
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& p) const
+  {
+    return internal::pacos(p);
+  }
+};
+
+template<typename Scalar>
+struct functor_traits<scalar_acos_op<Scalar> > {
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasACos
+  };
+};
+
+/** \internal
+  * \brief Unary functor returning the arc sine of a scalar
+  *
+  * The scalar path calls asin with std::asin made visible; the packet path
+  * forwards to internal::pasin.
+  *
+  * \sa class CwiseUnaryOp, ArrayBase::asin()
+  */
+template<typename Scalar>
+struct scalar_asin_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_asin_op)
+
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& x) const
+  {
+    using std::asin;
+    return asin(x);
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& p) const
+  {
+    return internal::pasin(p);
+  }
+};
+
+template<typename Scalar>
+struct functor_traits<scalar_asin_op<Scalar> > {
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasASin
+  };
+};
+
+
+/** \internal
+ * \brief Template functor to compute the natural log of the absolute
+ * value of Gamma of a scalar
+ *
+ * The scalar path calls lgamma with numext::lgamma made visible; the packet
+ * path forwards to internal::plgamma. packetOp is not a template here: it
+ * only accepts packet_traits<Scalar>::type.
+ *
+ * \sa class CwiseUnaryOp, Cwise::lgamma()
+ */
+template<typename Scalar> struct scalar_lgamma_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_lgamma_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
+    using numext::lgamma; return lgamma(a);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  // Marked EIGEN_DEVICE_FUNC like operator() above and like the packetOp of the
+  // neighbouring functors (e.g. scalar_asin_op), so that the packet path is not
+  // left host-only when the header is compiled for a device.
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plgamma(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_lgamma_op<Scalar> >
+{
+  enum {
+    // Guesstimate
+    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasLGamma
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the Gauss error function of a
+ * scalar
+ *
+ * The scalar path calls erf with numext::erf made visible; the packet path
+ * forwards to internal::perf. packetOp is not a template here: it only
+ * accepts packet_traits<Scalar>::type.
+ *
+ * \sa class CwiseUnaryOp, Cwise::erf()
+ */
+template<typename Scalar> struct scalar_erf_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_erf_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
+    using numext::erf; return erf(a);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  // Marked EIGEN_DEVICE_FUNC like operator() above and like the packetOp of the
+  // neighbouring functors (e.g. scalar_asin_op), so that the packet path is not
+  // left host-only when the header is compiled for a device.
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perf(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_erf_op<Scalar> >
+{
+  enum {
+    // Guesstimate
+    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasErf
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the Complementary Error Function
+ * of a scalar
+ *
+ * The scalar path calls erfc with numext::erfc made visible; the packet path
+ * forwards to internal::perfc. packetOp is not a template here: it only
+ * accepts packet_traits<Scalar>::type.
+ *
+ * \sa class CwiseUnaryOp, Cwise::erfc()
+ */
+template<typename Scalar> struct scalar_erfc_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_erfc_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
+    using numext::erfc; return erfc(a);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  // Marked EIGEN_DEVICE_FUNC like operator() above and like the packetOp of the
+  // neighbouring functors (e.g. scalar_asin_op), so that the packet path is not
+  // left host-only when the header is compiled for a device.
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perfc(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_erfc_op<Scalar> >
+{
+  enum {
+    // Guesstimate
+    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasErfc
+  };
+};
+
+
+/** \internal
+  * \brief Unary functor returning the arc tangent of a scalar
+  *
+  * The scalar path calls atan with std::atan made visible; the packet path
+  * forwards to internal::patan.
+  *
+  * \sa class CwiseUnaryOp, ArrayBase::atan()
+  */
+template<typename Scalar>
+struct scalar_atan_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_atan_op)
+
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& x) const
+  {
+    using std::atan;
+    return atan(x);
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& p) const
+  {
+    return internal::patan(p);
+  }
+};
+
+template<typename Scalar>
+struct functor_traits<scalar_atan_op<Scalar> > {
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasATan
+  };
+};
+
+
+/** \internal
+  * \brief Unary functor returning the hyperbolic tangent of a scalar
+  *
+  * The scalar path calls tanh with std::tanh made visible; the packet path
+  * forwards to internal::ptanh.
+  *
+  * \sa class CwiseUnaryOp, ArrayBase::tanh()
+  */
+template<typename Scalar>
+struct scalar_tanh_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_tanh_op)
+
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& x) const
+  {
+    using std::tanh;
+    return tanh(x);
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& p) const
+  {
+    return internal::ptanh(p);
+  }
+};
+
+template<typename Scalar>
+struct functor_traits<scalar_tanh_op<Scalar> > {
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasTanh
+  };
+};
+
+/** \internal
+  * \brief Unary functor returning the hyperbolic sine of a scalar
+  *
+  * The scalar path calls sinh with std::sinh made visible; the packet path
+  * forwards to internal::psinh.
+  *
+  * \sa class CwiseUnaryOp, ArrayBase::sinh()
+  */
+template<typename Scalar>
+struct scalar_sinh_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sinh_op)
+
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& x) const
+  {
+    using std::sinh;
+    return sinh(x);
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& p) const
+  {
+    return internal::psinh(p);
+  }
+};
+
+template<typename Scalar>
+struct functor_traits<scalar_sinh_op<Scalar> > {
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasSinh
+  };
+};
+
+/** \internal
+  * \brief Unary functor returning the hyperbolic cosine of a scalar
+  *
+  * The scalar path calls cosh with std::cosh made visible; the packet path
+  * forwards to internal::pcosh.
+  *
+  * \sa class CwiseUnaryOp, ArrayBase::cosh()
+  */
+template<typename Scalar>
+struct scalar_cosh_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cosh_op)
+
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& x) const
+  {
+    using std::cosh;
+    return cosh(x);
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& p) const
+  {
+    return internal::pcosh(p);
+  }
+};
+
+template<typename Scalar>
+struct functor_traits<scalar_cosh_op<Scalar> > {
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasCosh
+  };
+};
+
+/** \internal
+  * \brief Unary functor returning the inverse Scalar(1)/a of a scalar
+  *
+  * The packet path broadcasts Scalar(1) with pset1 and divides with pdiv.
+  *
+  * \sa class CwiseUnaryOp, Cwise::inverse()
+  */
+template<typename Scalar>
+struct scalar_inverse_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_inverse_op)
+
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& x) const
+  {
+    return Scalar(1)/x;
+  }
+
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& p) const
+  {
+    return internal::pdiv(pset1<Packet>(Scalar(1)),p);
+  }
+};
+
+template<typename Scalar>
+struct functor_traits<scalar_inverse_op<Scalar> > {
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasDiv
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the square of a scalar
+  * \sa class CwiseUnaryOp, Cwise::square()
+  */
+template<typename Scalar>
+struct scalar_square_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_square_op)
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a*a; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
+  { return internal::pmul(a,a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_square_op<Scalar> >
+{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
+
+/** \internal
+  * \brief Template functor to compute the cube of a scalar
+  * \sa class CwiseUnaryOp, Cwise::cube()
+  */
+template<typename Scalar>
+struct scalar_cube_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cube_op)
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a*a*a; }
+  template<typename Packet>
+  EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
+  { return internal::pmul(a,pmul(a,a)); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_cube_op<Scalar> >
+{ enum { Cost = 2*NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
+
+/** \internal
+  * \brief Template functor to compute the rounded value of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::round()
+  */
+template<typename Scalar> struct scalar_round_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_round_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::round(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pround(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_round_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasRound
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the floor of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::floor()
+  */
+template<typename Scalar> struct scalar_floor_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_floor_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::floor(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pfloor(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_floor_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasFloor
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the ceil of a scalar (scalar path: numext::ceil, packet path: internal::pceil)
+  * \sa class CwiseUnaryOp, ArrayBase::ceil()
+  */
+template<typename Scalar> struct scalar_ceil_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_ceil_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::ceil(a); }
+  template <typename Packet> // deduced from the argument, as in scalar_round_op/scalar_floor_op, instead of being pinned to packet_traits<Scalar>::type
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pceil(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_ceil_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasCeil
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute whether a scalar is NaN
+  * \sa class CwiseUnaryOp, ArrayBase::isnan()
+  */
+template<typename Scalar> struct scalar_isnan_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_isnan_op)
+  typedef bool result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isnan)(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_isnan_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = false
+  };
+};
+
+/** \internal
+  * \brief Template functor to check whether a scalar is +/-inf
+  * \sa class CwiseUnaryOp, ArrayBase::isinf()
+  */
+template<typename Scalar> struct scalar_isinf_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_isinf_op)
+  typedef bool result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isinf)(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_isinf_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = false
+  };
+};
+
+/** \internal
+  * \brief Template functor to check whether a scalar has a finite value
+  * \sa class CwiseUnaryOp, ArrayBase::isfinite()
+  */
+template<typename Scalar> struct scalar_isfinite_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_isfinite_op)
+  typedef bool result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isfinite)(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_isfinite_op<Scalar> >
+{
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = false
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the logical not of a boolean
+  *
+  * \sa class CwiseUnaryOp, ArrayBase::operator!
+  */
+template<typename Scalar> struct scalar_boolean_not_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_not_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a) const { return !a; }
+};
+template<typename Scalar>
+struct functor_traits<scalar_boolean_not_op<Scalar> > {
+  enum {
+    Cost = NumTraits<bool>::AddCost,
+    PacketAccess = false
+  };
+};
+
+/** \internal
+  * \brief Template functor to compute the signum of a scalar
+  * \sa class CwiseUnaryOp, Cwise::sign()
+  */
+template<typename Scalar,bool iscpx=(NumTraits<Scalar>::IsComplex!=0) > struct scalar_sign_op;
+template<typename Scalar> 
+struct scalar_sign_op<Scalar,false> { // specialization used when NumTraits<Scalar>::IsComplex is 0 (see the default template argument of the primary declaration)
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const 
+  {
+      return Scalar( (a>Scalar(0)) - (a<Scalar(0)) ); // difference of the two comparison results; for built-in arithmetic Scalar this is +1 if a>0, -1 if a<0, otherwise 0
+  }
+  //TODO
+  //template <typename Packet>
+  //EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psign(a); }
+};
+template<typename Scalar>
+struct scalar_sign_op<Scalar,true> { // complex specialization: returns a scaled by 1/abs(a), or Scalar(0) when abs(a) is zero
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const
+  {
+    using std::abs;
+    typedef typename NumTraits<Scalar>::Real real_type;
+    real_type aa = abs(a);
+    if (aa==real_type(0)) // compare in real_type rather than against an int literal
+      return Scalar(0);
+    aa = real_type(1)/aa; // stay in real_type: a 1. literal would route the division through double
+    return Scalar(real(a)*aa, imag(a)*aa );
+  }
+  //TODO
+  //template <typename Packet>
+  //EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::psign(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_sign_op<Scalar> >
+{ enum {
+    Cost = 
+        NumTraits<Scalar>::IsComplex
+        ? ( 8*NumTraits<Scalar>::MulCost  ) // roughly
+        : ( 3*NumTraits<Scalar>::AddCost),
+    PacketAccess = packet_traits<Scalar>::HasSign
+  };
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_FUNCTORS_H
diff --git a/nuparu/include/Eigen/src/Core/products/CoeffBasedProduct.h b/nuparu/include/Eigen/src/Core/products/CoeffBasedProduct.h
deleted file mode 100644
index c06a0df1..00000000
--- a/nuparu/include/Eigen/src/Core/products/CoeffBasedProduct.h
+++ /dev/null
@@ -1,441 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_COEFFBASED_PRODUCT_H
-#define EIGEN_COEFFBASED_PRODUCT_H
-
-namespace Eigen { 
-
-namespace internal {
-
-/*********************************************************************************
-*  Coefficient based product implementation.
-*  It is designed for the following use cases:
-*  - small fixed sizes
-*  - lazy products
-*********************************************************************************/
-
-/* Since the all the dimensions of the product are small, here we can rely
- * on the generic Assign mechanism to evaluate the product per coeff (or packet).
- *
- * Note that here the inner-loops should always be unrolled.
- */
-
-template<int Traversal, int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
-struct product_coeff_impl;
-
-template<int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct product_packet_impl;
-
-template<typename LhsNested, typename RhsNested, int NestingFlags>
-struct traits<CoeffBasedProduct<LhsNested,RhsNested,NestingFlags> >
-{
-  typedef MatrixXpr XprKind;
-  typedef typename remove_all<LhsNested>::type _LhsNested;
-  typedef typename remove_all<RhsNested>::type _RhsNested;
-  typedef typename scalar_product_traits<typename _LhsNested::Scalar, typename _RhsNested::Scalar>::ReturnType Scalar;
-  typedef typename promote_storage_type<typename traits<_LhsNested>::StorageKind,
-                                           typename traits<_RhsNested>::StorageKind>::ret StorageKind;
-  typedef typename promote_index_type<typename traits<_LhsNested>::Index,
-                                         typename traits<_RhsNested>::Index>::type Index;
-
-  enum {
-      LhsCoeffReadCost = _LhsNested::CoeffReadCost,
-      RhsCoeffReadCost = _RhsNested::CoeffReadCost,
-      LhsFlags = _LhsNested::Flags,
-      RhsFlags = _RhsNested::Flags,
-
-      RowsAtCompileTime = _LhsNested::RowsAtCompileTime,
-      ColsAtCompileTime = _RhsNested::ColsAtCompileTime,
-      InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime),
-
-      MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime,
-      MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime,
-
-      LhsRowMajor = LhsFlags & RowMajorBit,
-      RhsRowMajor = RhsFlags & RowMajorBit,
-
-      SameType = is_same<typename _LhsNested::Scalar,typename _RhsNested::Scalar>::value,
-
-      CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit)
-                      && (ColsAtCompileTime == Dynamic
-                          || ( (ColsAtCompileTime % packet_traits<Scalar>::size) == 0
-                              && (RhsFlags&AlignedBit)
-                             )
-                         ),
-
-      CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit)
-                      && (RowsAtCompileTime == Dynamic
-                          || ( (RowsAtCompileTime % packet_traits<Scalar>::size) == 0
-                              && (LhsFlags&AlignedBit)
-                             )
-                         ),
-
-      EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
-                     : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
-                     : (RhsRowMajor && !CanVectorizeLhs),
-
-      Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit)
-            | (EvalToRowMajor ? RowMajorBit : 0)
-            | NestingFlags
-            | (LhsFlags & RhsFlags & AlignedBit)
-            // TODO enable vectorization for mixed types
-            | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0),
-
-      CoeffReadCost = InnerSize == Dynamic ? Dynamic
-                    : InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost)
-                      + (InnerSize - 1) * NumTraits<Scalar>::AddCost,
-
-      /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
-      * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
-      * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
-      * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI.
-      */
-      CanVectorizeInner =    SameType
-                          && LhsRowMajor
-                          && (!RhsRowMajor)
-                          && (LhsFlags & RhsFlags & ActualPacketAccessBit)
-                          && (LhsFlags & RhsFlags & AlignedBit)
-                          && (InnerSize % packet_traits<Scalar>::size == 0)
-    };
-};
-
-} // end namespace internal
-
-template<typename LhsNested, typename RhsNested, int NestingFlags>
-class CoeffBasedProduct
-  : internal::no_assignment_operator,
-    public MatrixBase<CoeffBasedProduct<LhsNested, RhsNested, NestingFlags> >
-{
-  public:
-
-    typedef MatrixBase<CoeffBasedProduct> Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(CoeffBasedProduct)
-    typedef typename Base::PlainObject PlainObject;
-
-  private:
-
-    typedef typename internal::traits<CoeffBasedProduct>::_LhsNested _LhsNested;
-    typedef typename internal::traits<CoeffBasedProduct>::_RhsNested _RhsNested;
-
-    enum {
-      PacketSize = internal::packet_traits<Scalar>::size,
-      InnerSize  = internal::traits<CoeffBasedProduct>::InnerSize,
-      Unroll = CoeffReadCost != Dynamic && CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
-      CanVectorizeInner = internal::traits<CoeffBasedProduct>::CanVectorizeInner
-    };
-
-    typedef internal::product_coeff_impl<CanVectorizeInner ? InnerVectorizedTraversal : DefaultTraversal,
-                                   Unroll ? InnerSize-1 : Dynamic,
-                                   _LhsNested, _RhsNested, Scalar> ScalarCoeffImpl;
-
-    typedef CoeffBasedProduct<LhsNested,RhsNested,NestByRefBit> LazyCoeffBasedProductType;
-
-  public:
-
-    inline CoeffBasedProduct(const CoeffBasedProduct& other)
-      : Base(), m_lhs(other.m_lhs), m_rhs(other.m_rhs)
-    {}
-
-    template<typename Lhs, typename Rhs>
-    inline CoeffBasedProduct(const Lhs& lhs, const Rhs& rhs)
-      : m_lhs(lhs), m_rhs(rhs)
-    {
-      // we don't allow taking products of matrices of different real types, as that wouldn't be vectorizable.
-      // We still allow to mix T and complex<T>.
-      EIGEN_STATIC_ASSERT((internal::scalar_product_traits<typename Lhs::RealScalar, typename Rhs::RealScalar>::Defined),
-        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-      eigen_assert(lhs.cols() == rhs.rows()
-        && "invalid matrix product"
-        && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
-    }
-
-    EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
-
-    EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
-    {
-      Scalar res;
-      ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
-      return res;
-    }
-
-    /* Allow index-based non-packet access. It is impossible though to allow index-based packed access,
-     * which is why we don't set the LinearAccessBit.
-     */
-    EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
-    {
-      Scalar res;
-      const Index row = RowsAtCompileTime == 1 ? 0 : index;
-      const Index col = RowsAtCompileTime == 1 ? index : 0;
-      ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
-      return res;
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE const PacketScalar packet(Index row, Index col) const
-    {
-      PacketScalar res;
-      internal::product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor,
-                              Unroll ? InnerSize-1 : Dynamic,
-                              _LhsNested, _RhsNested, PacketScalar, LoadMode>
-        ::run(row, col, m_lhs, m_rhs, res);
-      return res;
-    }
-
-    // Implicit conversion to the nested type (trigger the evaluation of the product)
-    EIGEN_STRONG_INLINE operator const PlainObject& () const
-    {
-      m_result.lazyAssign(*this);
-      return m_result;
-    }
-
-    const _LhsNested& lhs() const { return m_lhs; }
-    const _RhsNested& rhs() const { return m_rhs; }
-
-    const Diagonal<const LazyCoeffBasedProductType,0> diagonal() const
-    { return reinterpret_cast<const LazyCoeffBasedProductType&>(*this); }
-
-    template<int DiagonalIndex>
-    const Diagonal<const LazyCoeffBasedProductType,DiagonalIndex> diagonal() const
-    { return reinterpret_cast<const LazyCoeffBasedProductType&>(*this); }
-
-    const Diagonal<const LazyCoeffBasedProductType,Dynamic> diagonal(Index index) const
-    { return reinterpret_cast<const LazyCoeffBasedProductType&>(*this).diagonal(index); }
-
-  protected:
-    typename internal::add_const_on_value_type<LhsNested>::type m_lhs;
-    typename internal::add_const_on_value_type<RhsNested>::type m_rhs;
-
-    mutable PlainObject m_result;
-};
-
-namespace internal {
-
-// here we need to overload the nested rule for products
-// such that the nested type is a const reference to a plain matrix
-template<typename Lhs, typename Rhs, int N, typename PlainObject>
-struct nested<CoeffBasedProduct<Lhs,Rhs,EvalBeforeNestingBit|EvalBeforeAssigningBit>, N, PlainObject>
-{
-  typedef PlainObject const& type;
-};
-
-/***************************************************************************
-* Normal product .coeff() implementation (with meta-unrolling)
-***************************************************************************/
-
-/**************************************
-*** Scalar path  - no vectorization ***
-**************************************/
-
-template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
-struct product_coeff_impl<DefaultTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
-  {
-    product_coeff_impl<DefaultTraversal, UnrollingIndex-1, Lhs, Rhs, RetScalar>::run(row, col, lhs, rhs, res);
-    res += lhs.coeff(row, UnrollingIndex) * rhs.coeff(UnrollingIndex, col);
-  }
-};
-
-template<typename Lhs, typename Rhs, typename RetScalar>
-struct product_coeff_impl<DefaultTraversal, 0, Lhs, Rhs, RetScalar>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
-  {
-    res = lhs.coeff(row, 0) * rhs.coeff(0, col);
-  }
-};
-
-template<typename Lhs, typename Rhs, typename RetScalar>
-struct product_coeff_impl<DefaultTraversal, Dynamic, Lhs, Rhs, RetScalar>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar& res)
-  {
-    eigen_assert(lhs.cols()>0 && "you are using a non initialized matrix");
-    res = lhs.coeff(row, 0) * rhs.coeff(0, col);
-      for(Index i = 1; i < lhs.cols(); ++i)
-        res += lhs.coeff(row, i) * rhs.coeff(i, col);
-  }
-};
-
-/*******************************************
-*** Scalar path with inner vectorization ***
-*******************************************/
-
-template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet>
-struct product_coeff_vectorized_unroller
-{
-  typedef typename Lhs::Index Index;
-  enum { PacketSize = packet_traits<typename Lhs::Scalar>::size };
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
-  {
-    product_coeff_vectorized_unroller<UnrollingIndex-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres);
-    pres = padd(pres, pmul( lhs.template packet<Aligned>(row, UnrollingIndex) , rhs.template packet<Aligned>(UnrollingIndex, col) ));
-  }
-};
-
-template<typename Lhs, typename Rhs, typename Packet>
-struct product_coeff_vectorized_unroller<0, Lhs, Rhs, Packet>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
-  {
-    pres = pmul(lhs.template packet<Aligned>(row, 0) , rhs.template packet<Aligned>(0, col));
-  }
-};
-
-template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
-struct product_coeff_impl<InnerVectorizedTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
-{
-  typedef typename Lhs::PacketScalar Packet;
-  typedef typename Lhs::Index Index;
-  enum { PacketSize = packet_traits<typename Lhs::Scalar>::size };
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
-  {
-    Packet pres;
-    product_coeff_vectorized_unroller<UnrollingIndex+1-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres);
-    product_coeff_impl<DefaultTraversal,UnrollingIndex,Lhs,Rhs,RetScalar>::run(row, col, lhs, rhs, res);
-    res = predux(pres);
-  }
-};
-
-template<typename Lhs, typename Rhs, int LhsRows = Lhs::RowsAtCompileTime, int RhsCols = Rhs::ColsAtCompileTime>
-struct product_coeff_vectorized_dyn_selector
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
-  {
-    res = lhs.row(row).transpose().cwiseProduct(rhs.col(col)).sum();
-  }
-};
-
-// NOTE the 3 following specializations are because taking .col(0) on a vector is a bit slower
-// NOTE maybe they are now useless since we have a specialization for Block<Matrix>
-template<typename Lhs, typename Rhs, int RhsCols>
-struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,RhsCols>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
-  {
-    res = lhs.transpose().cwiseProduct(rhs.col(col)).sum();
-  }
-};
-
-template<typename Lhs, typename Rhs, int LhsRows>
-struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,LhsRows,1>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
-  {
-    res = lhs.row(row).transpose().cwiseProduct(rhs).sum();
-  }
-};
-
-template<typename Lhs, typename Rhs>
-struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,1>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
-  {
-    res = lhs.transpose().cwiseProduct(rhs).sum();
-  }
-};
-
-template<typename Lhs, typename Rhs, typename RetScalar>
-struct product_coeff_impl<InnerVectorizedTraversal, Dynamic, Lhs, Rhs, RetScalar>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
-  {
-    product_coeff_vectorized_dyn_selector<Lhs,Rhs>::run(row, col, lhs, rhs, res);
-  }
-};
-
-/*******************
-*** Packet path  ***
-*******************/
-
-template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
-  {
-    product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res);
-    res =  pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex)), rhs.template packet<LoadMode>(UnrollingIndex, col), res);
-  }
-};
-
-template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
-  {
-    product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res);
-    res =  pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex), pset1<Packet>(rhs.coeff(UnrollingIndex, col)), res);
-  }
-};
-
-template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
-  {
-    res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
-  }
-};
-
-template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
-  {
-    res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
-  }
-};
-
-template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res)
-  {
-    eigen_assert(lhs.cols()>0 && "you are using a non initialized matrix");
-    res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
-      for(Index i = 1; i < lhs.cols(); ++i)
-        res =  pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res);
-  }
-};
-
-template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res)
-  {
-    eigen_assert(lhs.cols()>0 && "you are using a non initialized matrix");
-    res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
-      for(Index i = 1; i < lhs.cols(); ++i)
-        res =  pmadd(lhs.template packet<LoadMode>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
-  }
-};
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_COEFFBASED_PRODUCT_H
diff --git a/nuparu/include/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/nuparu/include/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 780fa74d..229e96ce 100644
--- a/nuparu/include/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/nuparu/include/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -10,6 +10,7 @@
 #ifndef EIGEN_GENERAL_BLOCK_PANEL_H
 #define EIGEN_GENERAL_BLOCK_PANEL_H
 
+
 namespace Eigen { 
   
 namespace internal {
@@ -24,29 +25,51 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff
   return a<=0 ? b : a;
 }
 
+#if EIGEN_ARCH_i386_OR_x86_64
+const std::ptrdiff_t defaultL1CacheSize = 32*1024;
+const std::ptrdiff_t defaultL2CacheSize = 256*1024;
+const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024;
+#else
+const std::ptrdiff_t defaultL1CacheSize = 16*1024;
+const std::ptrdiff_t defaultL2CacheSize = 512*1024;
+const std::ptrdiff_t defaultL3CacheSize = 512*1024;
+#endif
+
 /** \internal */
-inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0)
-{
-  static std::ptrdiff_t m_l1CacheSize = 0;
-  static std::ptrdiff_t m_l2CacheSize = 0;
-  if(m_l2CacheSize==0)
-  {
-    m_l1CacheSize = manage_caching_sizes_helper(queryL1CacheSize(),8 * 1024);
-    m_l2CacheSize = manage_caching_sizes_helper(queryTopLevelCacheSize(),1*1024*1024);
+struct CacheSizes { 
+  CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
+    int l1CacheSize, l2CacheSize, l3CacheSize;
+    queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
+    m_l1 = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize);
+    m_l2 = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize);
+    m_l3 = manage_caching_sizes_helper(l3CacheSize, defaultL3CacheSize);
   }
-  
+
+  std::ptrdiff_t m_l1;
+  std::ptrdiff_t m_l2;
+  std::ptrdiff_t m_l3;
+};
+
+
+/** \internal */
+inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
+{
+  static CacheSizes m_cacheSizes;
+
   if(action==SetAction)
   {
     // set the cpu cache size and cache all block sizes from a global cache size in byte
     eigen_internal_assert(l1!=0 && l2!=0);
-    m_l1CacheSize = *l1;
-    m_l2CacheSize = *l2;
+    m_cacheSizes.m_l1 = *l1;
+    m_cacheSizes.m_l2 = *l2;
+    m_cacheSizes.m_l3 = *l3;
   }
   else if(action==GetAction)
   {
     eigen_internal_assert(l1!=0 && l2!=0);
-    *l1 = m_l1CacheSize;
-    *l2 = m_l2CacheSize;
+    *l1 = m_cacheSizes.m_l1;
+    *l2 = m_cacheSizes.m_l2;
+    *l3 = m_cacheSizes.m_l3;
   }
   else
   {
@@ -54,6 +77,209 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdi
   }
 }
 
+/* Helper for computeProductBlockingSizes.
+ *
+ * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
+ * this function computes the blocking size parameters along the respective dimensions
+ * for matrix products and related algorithms. The blocking sizes depend on various
+ * parameters:
+ * - the L1 and L2 cache sizes,
+ * - the register level blocking sizes defined by gebp_traits,
+ * - the number of scalars that fit into a packet (when vectorization is enabled).
+ *
+ * \sa setCpuCacheSizes */
+
+template<typename LhsScalar, typename RhsScalar, int KcFactor>
+void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
+{
+  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+
+  // Explanations:
+  // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and
+  // kc x nc blocks B' on the rhs. B' has to fit into L2/L3 cache. Moreover, A' is processed
+  // per mr x kc horizontal small panels where mr is the blocking size along the m dimension
+  // at the register level. This small horizontal panel has to stay within L1 cache.
+  std::ptrdiff_t l1, l2, l3;
+  manage_caching_sizes(GetAction, &l1, &l2, &l3);
+
+  if (num_threads > 1) {
+    typedef typename Traits::ResScalar ResScalar;
+    enum {
+      kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
+      ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
+      k_mask = -8,
+
+      mr = Traits::mr,
+      mr_mask = -mr,
+
+      nr = Traits::nr,
+      nr_mask = -nr
+    };
+    // Increasing k gives us more time to prefetch the content of the "C"
+    // registers. However once the latency is hidden there is no point in
+    // increasing the value of k, so we'll cap it at 320 (value determined
+    // experimentally).
+    const Index k_cache = (std::min<Index>)((l1-ksub)/kdiv, 320);
+    if (k_cache < k) {
+      k = k_cache & k_mask;
+      eigen_internal_assert(k > 0);
+    }
+
+    const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
+    const Index n_per_thread = numext::div_ceil(n, num_threads);
+    if (n_cache <= n_per_thread) {
+      // Don't exceed the capacity of the l2 cache.
+      eigen_internal_assert(n_cache >= static_cast<Index>(nr));
+      n = n_cache & nr_mask;
+      eigen_internal_assert(n > 0);
+    } else {
+      n = (std::min<Index>)(n, (n_per_thread + nr - 1) & nr_mask);
+    }
+
+    if (l3 > l2) {
+      // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
+      const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
+      const Index m_per_thread = numext::div_ceil(m, num_threads);
+      if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
+        m = m_cache & mr_mask;
+        eigen_internal_assert(m > 0);
+      } else {
+        m = (std::min<Index>)(m, (m_per_thread + mr - 1) & mr_mask);
+      }
+    }
+  }
+  else {
+    // In unit tests we do not want to use extra large matrices,
+    // so we reduce the cache size to check the blocking strategy is not flawed
+#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
+    l1 = 9*1024;
+    l2 = 32*1024;
+    l3 = 512*1024;
+#endif
+    
+    // Early return for small problems because the computations below are time consuming for small problems.
+    // Perhaps it would make more sense to consider k*n*m??
+    // Note that for very tiny problem, this function should be bypassed anyway
+    // because we use the coefficient-based implementation for them.
+    if((std::max)(k,(std::max)(m,n))<48)
+      return;
+    
+    typedef typename Traits::ResScalar ResScalar;
+    enum {
+      k_peeling = 8,
+      k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
+      k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
+    };
+    
+    // ---- 1st level of blocking on L1, yields kc ----
+    
+    // Blocking on the third dimension (i.e., k) is chosen so that a horizontal panel
+    // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fit within L1 cache.
+    // We also include a register-level block of the result (mr x nr).
+    // (In an ideal world only the lhs panel would stay in L1)
+    // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
+    const Index max_kc = ((l1-k_sub)/k_div) & (~(k_peeling-1));
+    const Index old_k = k;
+    if(k>max_kc)
+    {
+      // We are really blocking on the third dimension:
+      // -> reduce blocking size to make sure the last block is as large as possible
+      //    while keeping the same number of sweeps over the result.
+      k = (k%max_kc)==0 ? max_kc
+                        : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
+                        
+      eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
+    }
+    
+    // ---- 2nd level of blocking on max(L2,L3), yields nc ----
+    
+    // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
+    //      actual_l2 = max(l2, l3/nb_core_sharing_l3)
+    // The number below is quite conservative (it is better to underestimate the cache size than to overestimate it).
+    // For instance, it corresponds to 6MB of L3 shared among 4 cores.
+    #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
+    const Index actual_l2 = l3;
+    #else
+    const Index actual_l2 = 1572864; // == 1.5 MB
+    #endif
+    
+    // Here, nc is chosen such that a block of kc x nc of the rhs fits within half of L2.
+    // The second half is implicitly reserved to access the result and lhs coefficients.
+    // When k<max_kc, then nc can grow arbitrarily. In practice, it seems to be fruitful
+    // to limit this growth: we bound the growth of nc by a factor of 1.5.
+    // However, if the entire lhs block fits within L1, then we are not going to block on the rows at all,
+    // and it becomes fruitful to keep the packed rhs blocks in L1 if there is enough remaining space.
+    Index max_nc;
+    const Index lhs_bytes = m * k * sizeof(LhsScalar);
+    const Index remaining_l1 = l1- k_sub - lhs_bytes;
+    if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
+    {
+      // L1 blocking
+      max_nc = remaining_l1 / (k*sizeof(RhsScalar));
+    }
+    else
+    {
+      // L2 blocking
+      max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
+    }
+    // WARNING Below, we assume that Traits::nr is a power of two.
+    Index nc = std::min<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
+    if(n>nc)
+    {
+      // We are really blocking over the columns:
+      // -> reduce blocking size to make sure the last block is as large as possible
+      //    while keeping the same number of sweeps over the packed lhs.
+      //    Here we allow one more sweep if this gives us a perfect match, thus the commented "-1"
+      n = (n%nc)==0 ? nc
+                    : (nc - Traits::nr * ((nc/*-1*/-(n%nc))/(Traits::nr*(n/nc+1))));
+    }
+    else if(old_k==k)
+    {
+      // So far, no blocking at all, i.e., kc==k, and nc==n.
+      // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2
+      // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic here should be obsolete.
+      Index problem_size = k*n*sizeof(LhsScalar);
+      Index actual_lm = actual_l2;
+      Index max_mc = m;
+      if(problem_size<=1024)
+      {
+        // problem is small enough to keep in L1
+        // Let's choose m such that lhs's block fits in 1/3 of L1
+        actual_lm = l1;
+      }
+      else if(l3!=0 && problem_size<=32768)
+      {
+        // we have both L2 and L3, and problem is small enough to be kept in L2
+        // Let's choose m such that lhs's block fits in 1/3 of L2
+        actual_lm = l2;
+        max_mc = 576;
+      }
+      Index mc = (std::min<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
+      if (mc > Traits::mr) mc -= mc % Traits::mr;
+      else if (mc==0) return;
+      m = (m%mc)==0 ? mc
+                    : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1))));
+    }
+  }
+}
+
+inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
+{
+#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
+  if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
+    k = std::min<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
+    m = std::min<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
+    n = std::min<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
+    return true;
+  }
+#else
+  EIGEN_UNUSED_VARIABLE(k)
+  EIGEN_UNUSED_VARIABLE(m)
+  EIGEN_UNUSED_VARIABLE(n)
+#endif
+  return false;
+}
+
 /** \brief Computes the blocking parameters for a m x k times k x n matrix product
   *
   * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
@@ -62,48 +288,40 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdi
   *
   * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
   * this function computes the blocking size parameters along the respective dimensions
-  * for matrix products and related algorithms. The blocking sizes depends on various
-  * parameters:
-  * - the L1 and L2 cache sizes,
-  * - the register level blocking sizes defined by gebp_traits,
-  * - the number of scalars that fit into a packet (when vectorization is enabled).
+  * for matrix products and related algorithms.
+  *
+  * The blocking size parameters may be evaluated:
+  *   - either by a heuristic based on cache sizes;
+  *   - or using fixed prescribed values (for testing purposes).
   *
   * \sa setCpuCacheSizes */
-template<typename LhsScalar, typename RhsScalar, int KcFactor, typename SizeType>
-void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
+
+template<typename LhsScalar, typename RhsScalar, int KcFactor>
+void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
 {
-  EIGEN_UNUSED_VARIABLE(n);
-  // Explanations:
-  // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and
-  // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed
-  // per kc x nr vertical small panels where nr is the blocking size along the n dimension
-  // at the register level. For vectorization purpose, these small vertical panels are unpacked,
-  // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to
-  // stay in L1 cache.
-  std::ptrdiff_t l1, l2;
+  if (!useSpecificBlockingSizes(k, m, n)) {
+    evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor>(k, m, n, num_threads);
+  }
 
   typedef gebp_traits<LhsScalar,RhsScalar> Traits;
   enum {
-    kdiv = KcFactor * 2 * Traits::nr
-         * Traits::RhsProgress * sizeof(RhsScalar),
-    mr = gebp_traits<LhsScalar,RhsScalar>::mr,
-    mr_mask = (0xffffffff/mr)*mr
+    kr = 8,
+    mr = Traits::mr,
+    nr = Traits::nr
   };
-
-  manage_caching_sizes(GetAction, &l1, &l2);
-  k = std::min<SizeType>(k, l1/kdiv);
-  SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0;
-  if(_m<m) m = _m & mr_mask;
+  if (k > kr) k -= k % kr;
+  if (m > mr) m -= m % mr;
+  if (n > nr) n -= n % nr;
 }
 
-template<typename LhsScalar, typename RhsScalar, typename SizeType>
-inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
+template<typename LhsScalar, typename RhsScalar>
+inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
 {
-  computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n);
+  computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n, num_threads);
 }
 
-#ifdef EIGEN_HAS_FUSE_CJMADD
-  #define MADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
+  #define CJMADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
 #else
 
   // FIXME (a bit overkill maybe ?)
@@ -128,8 +346,8 @@ inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
     gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
   }
 
-  #define MADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);
-//   #define MADD(CJ,A,B,C,T)  T = B; T = CJ.pmul(A,T); C = padd(C,T);
+  #define CJMADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);
+//   #define CJMADD(CJ,A,B,C,T)  T = B; T = CJ.pmul(A,T); C = padd(C,T);
 #endif
 
 /* Vectorization logic
@@ -160,16 +378,22 @@ class gebp_traits
     
     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
 
-    // register block size along the N direction (must be either 2 or 4)
-    nr = NumberOfRegisters/4,
+    // register block size along the N direction must be 1 or 4
+    nr = 4,
 
     // register block size along the M direction (currently, this one cannot be modified)
-    mr = 2 * LhsPacketSize,
+    default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
+#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
+    // we assume 16 registers
+    // See bug 992, if the scalar type is not vectorizable but EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
+    // then using 3*LhsPacketSize triggers non-implemented paths in syrk.
+    mr = Vectorizable ? 3*LhsPacketSize : default_mr,
+#else
+    mr = default_mr,
+#endif
     
-    WorkSpaceFactor = nr * RhsPacketSize,
-
     LhsProgress = LhsPacketSize,
-    RhsProgress = RhsPacketSize
+    RhsProgress = 1
   };
 
   typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
@@ -186,32 +410,65 @@ class gebp_traits
   {
     p = pset1<ResPacket>(ResScalar(0));
   }
-
-  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
+  
+  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
   {
-    for(DenseIndex k=0; k<n; k++)
-      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
+    pbroadcast4(b, b0, b1, b2, b3);
+  }
+  
+//   EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
+//   {
+//     pbroadcast2(b, b0, b1);
+//   }
+  
+  template<typename RhsPacketType>
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
+  {
+    dest = pset1<RhsPacketType>(*b);
+  }
+  
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
+  {
+    dest = ploadquad<RhsPacket>(b);
   }
 
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+  template<typename LhsPacketType>
+  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const
   {
-    dest = pload<RhsPacket>(b);
+    dest = pload<LhsPacketType>(a);
   }
 
-  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
+  template<typename LhsPacketType>
+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
   {
-    dest = pload<LhsPacket>(a);
+    dest = ploadu<LhsPacketType>(a);
   }
 
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const
+  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
   {
+    // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
+    // let gcc allocate the register in which to store the result of the pmul
+    // (in the case where there is no FMA) gcc fails to figure out how to avoid
+    // spilling registers.
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+    EIGEN_UNUSED_VARIABLE(tmp);
+    c = pmadd(a,b,c);
+#else
     tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
+#endif
   }
 
   EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
   {
     r = pmadd(c,alpha,r);
   }
+  
+  template<typename ResPacketHalf>
+  EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const
+  {
+    r = pmadd(c,alpha,r);
+  }
 
 protected:
 //   conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
@@ -235,12 +492,16 @@ class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
     ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
     
     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
-    nr = NumberOfRegisters/4,
-    mr = 2 * LhsPacketSize,
-    WorkSpaceFactor = nr*RhsPacketSize,
+    nr = 4,
+#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
+    // we assume 16 registers
+    mr = 3*LhsPacketSize,
+#else
+    mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
+#endif
 
     LhsProgress = LhsPacketSize,
-    RhsProgress = RhsPacketSize
+    RhsProgress = 1
   };
 
   typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
@@ -258,15 +519,14 @@ class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
     p = pset1<ResPacket>(ResScalar(0));
   }
 
-  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
   {
-    for(DenseIndex k=0; k<n; k++)
-      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
+    dest = pset1<RhsPacket>(*b);
   }
-
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+  
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
   {
-    dest = pload<RhsPacket>(b);
+    dest = pset1<RhsPacket>(*b);
   }
 
   EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
@@ -274,6 +534,21 @@ class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
     dest = pload<LhsPacket>(a);
   }
 
+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
+  {
+    dest = ploadu<LhsPacket>(a);
+  }
+
+  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+  {
+    pbroadcast4(b, b0, b1, b2, b3);
+  }
+  
+//   EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
+//   {
+//     pbroadcast2(b, b0, b1);
+//   }
+
   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
   {
     madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
@@ -281,7 +556,12 @@ class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
 
   EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
   {
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+    EIGEN_UNUSED_VARIABLE(tmp);
+    c.v = pmadd(a.v,b,c.v);
+#else
     tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
+#endif
   }
 
   EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
@@ -298,6 +578,38 @@ class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
   conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
 };
 
+template<typename Packet>
+struct DoublePacket
+{
+  Packet first;
+  Packet second;
+};
+
+template<typename Packet>
+DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
+{
+  DoublePacket<Packet> res;
+  res.first  = padd(a.first, b.first);
+  res.second = padd(a.second,b.second);
+  return res;
+}
+
+template<typename Packet>
+const DoublePacket<Packet>& predux4(const DoublePacket<Packet> &a)
+{
+  return a;
+}
+
+template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typedef DoublePacket<Packet> half; };
+// template<typename Packet>
+// DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
+// {
+//   DoublePacket<Packet> res;
+//   res.first  = padd(a.first, b.first);
+//   res.second = padd(a.second,b.second);
+//   return res;
+// }
+
 template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
 class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
 {
@@ -314,60 +626,80 @@ class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs,
                 && packet_traits<Scalar>::Vectorizable,
     RealPacketSize  = Vectorizable ? packet_traits<RealScalar>::size : 1,
     ResPacketSize   = Vectorizable ? packet_traits<ResScalar>::size : 1,
-    
-    nr = 2,
-    mr = 2 * ResPacketSize,
-    WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr,
+    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
+    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+
+    // FIXME: should depend on NumberOfRegisters
+    nr = 4,
+    mr = ResPacketSize,
 
     LhsProgress = ResPacketSize,
-    RhsProgress = Vectorizable ? 2*ResPacketSize : 1
+    RhsProgress = 1
   };
   
   typedef typename packet_traits<RealScalar>::type RealPacket;
   typedef typename packet_traits<Scalar>::type     ScalarPacket;
-  struct DoublePacket
-  {
-    RealPacket first;
-    RealPacket second;
-  };
+  typedef DoublePacket<RealPacket> DoublePacketType;
 
   typedef typename conditional<Vectorizable,RealPacket,  Scalar>::type LhsPacket;
-  typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type RhsPacket;
+  typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
   typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
-  typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type AccPacket;
+  typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;
   
   EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
 
-  EIGEN_STRONG_INLINE void initAcc(DoublePacket& p)
+  EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p)
   {
     p.first   = pset1<RealPacket>(RealScalar(0));
     p.second  = pset1<RealPacket>(RealScalar(0));
   }
 
-  /* Unpack the rhs coeff such that each complex coefficient is spread into
-   * two packects containing respectively the real and imaginary coefficient
-   * duplicated as many time as needed: (x+iy) => [x, ..., x] [y, ..., y]
-   */
-  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar* rhs, Scalar* b)
+  // Scalar path
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const
   {
-    for(DenseIndex k=0; k<n; k++)
-    {
-      if(Vectorizable)
-      {
-        pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+0],             real(rhs[k]));
-        pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], imag(rhs[k]));
-      }
-      else
-        b[k] = rhs[k];
-    }
+    dest = pset1<ResPacket>(*b);
   }
 
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = *b; }
-
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const
+  // Vectorized path
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const
+  {
+    dest.first  = pset1<RealPacket>(real(*b));
+    dest.second = pset1<RealPacket>(imag(*b));
+  }
+  
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
+  {
+    loadRhs(b,dest);
+  }
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
+  {
+    eigen_internal_assert(unpacket_traits<ScalarPacket>::size<=4);
+    loadRhs(b,dest);
+  }
+  
+  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+  {
+    // FIXME not sure that's the best way to implement it!
+    loadRhs(b+0, b0);
+    loadRhs(b+1, b1);
+    loadRhs(b+2, b2);
+    loadRhs(b+3, b3);
+  }
+  
+  // Vectorized path
+  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1)
+  {
+    // FIXME not sure that's the best way to implement it!
+    loadRhs(b+0, b0);
+    loadRhs(b+1, b1);
+  }
+  
+  // Scalar path
+  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1)
   {
-    dest.first  = pload<RealPacket>((const RealScalar*)b);
-    dest.second = pload<RealPacket>((const RealScalar*)(b+ResPacketSize));
+    // FIXME not sure that's the best way to implement it!
+    loadRhs(b+0, b0);
+    loadRhs(b+1, b1);
   }
 
   // nothing special here
@@ -376,7 +708,12 @@ class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs,
     dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
   }
 
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, RhsPacket& /*tmp*/) const
+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
+  {
+    dest = ploadu<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
+  }
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const
   {
     c.first   = padd(pmul(a,b.first), c.first);
     c.second  = padd(pmul(a,b.second),c.second);
@@ -389,7 +726,7 @@ class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs,
   
   EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
   
-  EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacket& alpha, ResPacket& r) const
+  EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const
   {
     // assemble c
     ResPacket tmp;
@@ -440,12 +777,12 @@ class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
     ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
     
     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
+    // FIXME: should depend on NumberOfRegisters
     nr = 4,
-    mr = 2*ResPacketSize,
-    WorkSpaceFactor = nr*RhsPacketSize,
+    mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize,
 
     LhsProgress = ResPacketSize,
-    RhsProgress = ResPacketSize
+    RhsProgress = 1
   };
 
   typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
@@ -463,21 +800,38 @@ class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
     p = pset1<ResPacket>(ResScalar(0));
   }
 
-  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
   {
-    for(DenseIndex k=0; k<n; k++)
-      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
+    dest = pset1<RhsPacket>(*b);
   }
-
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+  
+  void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
   {
-    dest = pload<RhsPacket>(b);
+    pbroadcast4(b, b0, b1, b2, b3);
   }
+  
+//   EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
+//   {
+//     // FIXME not sure that's the best way to implement it!
+//     b0 = pload1<RhsPacket>(b+0);
+//     b1 = pload1<RhsPacket>(b+1);
+//   }
 
   EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
   {
     dest = ploaddup<LhsPacket>(a);
   }
+  
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
+  {
+    eigen_internal_assert(unpacket_traits<RhsPacket>::size<=4);
+    loadRhs(b,dest);
+  }
+
+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
+  {
+    dest = ploaddup<LhsPacket>(a);
+  }
 
   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
   {
@@ -486,7 +840,13 @@ class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
 
   EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
   {
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+    EIGEN_UNUSED_VARIABLE(tmp);
+    c.v = pmadd(a,b.v,c.v);
+#else
     tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
+#endif
+    
   }
 
   EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
@@ -503,6 +863,80 @@ class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
   conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
 };
 
+// helper for the rotating kernel below
+template <typename GebpKernel, bool UseRotatingKernel = GebpKernel::UseRotatingKernel>
+struct PossiblyRotatingKernelHelper
+{
+  // default implementation, not rotating
+
+  typedef typename GebpKernel::Traits Traits;
+  typedef typename Traits::RhsScalar RhsScalar;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::AccPacket AccPacket;
+
+  const Traits& traits;
+  PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {}
+
+
+  template <size_t K, size_t Index>
+  void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const
+  {
+    traits.loadRhs(from + (Index+4*K)*Traits::RhsProgress, to);
+  }
+
+  void unrotateResult(AccPacket&,
+                      AccPacket&,
+                      AccPacket&,
+                      AccPacket&)
+  {
+  }
+};
+
+// rotating implementation
+template <typename GebpKernel>
+struct PossiblyRotatingKernelHelper<GebpKernel, true>
+{
+  typedef typename GebpKernel::Traits Traits;
+  typedef typename Traits::RhsScalar RhsScalar;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::AccPacket AccPacket;
+
+  const Traits& traits;
+  PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {}
+
+  template <size_t K, size_t Index>
+  void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const
+  {
+    if (Index == 0) {
+      to = pload<RhsPacket>(from + 4*K*Traits::RhsProgress);
+    } else {
+      EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers");
+      to = protate<1>(to);
+    }
+  }
+
+  void unrotateResult(AccPacket& res0,
+                      AccPacket& res1,
+                      AccPacket& res2,
+                      AccPacket& res3)
+  {
+    PacketBlock<AccPacket> resblock;
+    resblock.packet[0] = res0;
+    resblock.packet[1] = res1;
+    resblock.packet[2] = res2;
+    resblock.packet[3] = res3;
+    ptranspose(resblock);
+    resblock.packet[3] = protate<1>(resblock.packet[3]);
+    resblock.packet[2] = protate<2>(resblock.packet[2]);
+    resblock.packet[1] = protate<3>(resblock.packet[1]);
+    ptranspose(resblock);
+    res0 = resblock.packet[0];
+    res1 = resblock.packet[1];
+    res2 = resblock.packet[2];
+    res3 = resblock.packet[3];
+  }
+};
+
 /* optimized GEneral packed Block * packed Panel product kernel
  *
  * Mixing type logic: C += A * B
@@ -510,7 +944,7 @@ class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
  *  |real |cplx | no vectorization yet, would require to pack A with duplication
  *  |cplx |real | easy vectorization
  */
-template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
 struct gebp_kernel
 {
   typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
@@ -520,6 +954,15 @@ struct gebp_kernel
   typedef typename Traits::ResPacket ResPacket;
   typedef typename Traits::AccPacket AccPacket;
 
+  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
+  typedef typename SwappedTraits::ResScalar SResScalar;
+  typedef typename SwappedTraits::LhsPacket SLhsPacket;
+  typedef typename SwappedTraits::RhsPacket SRhsPacket;
+  typedef typename SwappedTraits::ResPacket SResPacket;
+  typedef typename SwappedTraits::AccPacket SAccPacket;
+
+  typedef typename DataMapper::LinearMapper LinearMapper;
+
   enum {
     Vectorizable  = Traits::Vectorizable,
     LhsProgress   = Traits::LhsProgress,
@@ -527,572 +970,800 @@ struct gebp_kernel
     ResPacketSize = Traits::ResPacketSize
   };
 
+
+  static const bool UseRotatingKernel =
+    EIGEN_ARCH_ARM &&
+    internal::is_same<LhsScalar, float>::value &&
+    internal::is_same<RhsScalar, float>::value &&
+    internal::is_same<ResScalar, float>::value &&
+    Traits::LhsPacketSize == 4 &&
+    Traits::RhsPacketSize == 4 &&
+    Traits::ResPacketSize == 4;
+
   EIGEN_DONT_INLINE
-  void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
-                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB=0);
+  void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
+                  Index rows, Index depth, Index cols, ResScalar alpha,
+                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
 };
 
-template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
 EIGEN_DONT_INLINE
-void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
-  ::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
-               Index strideA, Index strideB, Index offsetA, Index offsetB, RhsScalar* unpackedB)
+void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
+  ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
+               Index rows, Index depth, Index cols, ResScalar alpha,
+               Index strideA, Index strideB, Index offsetA, Index offsetB)
   {
     Traits traits;
+    SwappedTraits straits;
     
     if(strideA==-1) strideA = depth;
     if(strideB==-1) strideB = depth;
     conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
-//     conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
-    Index packet_cols = (cols/nr) * nr;
-    const Index peeled_mc = (rows/mr)*mr;
-    // FIXME:
-    const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0);
-    const Index peeled_kc = (depth/4)*4;
-
-    if(unpackedB==0)
-      unpackedB = const_cast<RhsScalar*>(blockB - strideB * nr * RhsProgress);
-
-    // loops on each micro vertical panel of rhs (depth x nr)
-    for(Index j2=0; j2<packet_cols; j2+=nr)
-    {
-      traits.unpackRhs(depth*nr,&blockB[j2*strideB+offsetB*nr],unpackedB); 
-
-      // loops on each largest micro horizontal panel of lhs (mr x depth)
-      // => we select a mr x nr micro block of res which is entirely
-      //    stored into mr/packet_size x nr registers.
-      for(Index i=0; i<peeled_mc; i+=mr)
+    Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
+    const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
+    const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
+    const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0;
+    enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
+    const Index peeled_kc  = depth & ~(pk-1);
+    const Index prefetch_res_offset = 32/sizeof(ResScalar);    
+//     const Index depth2     = depth & ~1;
+
+    //---------- Process 3 * LhsProgress rows at once ----------
+    // This corresponds to 3*LhsProgress x nr register blocks.
+    // Usually, this makes sense only with FMA
+    if(mr>=3*Traits::LhsProgress)
+    {      
+      PossiblyRotatingKernelHelper<gebp_kernel> possiblyRotatingKernelHelper(traits);
+      
+      // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x depth)
+      // and on each largest micro vertical panel of the rhs (depth * nr).
+      // The blocking size, i.e., 'depth', has been computed so that the micro horizontal panel of the lhs fits in L1.
+      // However, if depth is too small, we can extend the number of rows of these horizontal panels.
+      // This actual number of rows is computed as follows:
+      const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
+      // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
+      // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
+      // or because we are testing specific blocking sizes.
+      const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));
+      for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
       {
-        const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
-        prefetch(&blA[0]);
-
-        // gets res block as register
-        AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
-                  traits.initAcc(C0);
-                  traits.initAcc(C1);
-        if(nr==4) traits.initAcc(C2);
-        if(nr==4) traits.initAcc(C3);
-                  traits.initAcc(C4);
-                  traits.initAcc(C5);
-        if(nr==4) traits.initAcc(C6);
-        if(nr==4) traits.initAcc(C7);
-
-        ResScalar* r0 = &res[(j2+0)*resStride + i];
-        ResScalar* r1 = r0 + resStride;
-        ResScalar* r2 = r1 + resStride;
-        ResScalar* r3 = r2 + resStride;
-
-        prefetch(r0+16);
-        prefetch(r1+16);
-        prefetch(r2+16);
-        prefetch(r3+16);
-
-        // performs "inner" product
-        // TODO let's check wether the folowing peeled loop could not be
-        //      optimized via optimal prefetching from one loop to the other
-        const RhsScalar* blB = unpackedB;
-        for(Index k=0; k<peeled_kc; k+=4)
+        const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
+        for(Index j2=0; j2<packet_cols4; j2+=nr)
         {
-          if(nr==2)
-          {
-            LhsPacket A0, A1;
-            RhsPacket B_0;
-            RhsPacket T0;
-            
-EIGEN_ASM_COMMENT("mybegin2");
-            traits.loadLhs(&blA[0*LhsProgress], A0);
-            traits.loadLhs(&blA[1*LhsProgress], A1);
-            traits.loadRhs(&blB[0*RhsProgress], B_0);
-            traits.madd(A0,B_0,C0,T0);
-            traits.madd(A1,B_0,C4,B_0);
-            traits.loadRhs(&blB[1*RhsProgress], B_0);
-            traits.madd(A0,B_0,C1,T0);
-            traits.madd(A1,B_0,C5,B_0);
-
-            traits.loadLhs(&blA[2*LhsProgress], A0);
-            traits.loadLhs(&blA[3*LhsProgress], A1);
-            traits.loadRhs(&blB[2*RhsProgress], B_0);
-            traits.madd(A0,B_0,C0,T0);
-            traits.madd(A1,B_0,C4,B_0);
-            traits.loadRhs(&blB[3*RhsProgress], B_0);
-            traits.madd(A0,B_0,C1,T0);
-            traits.madd(A1,B_0,C5,B_0);
-
-            traits.loadLhs(&blA[4*LhsProgress], A0);
-            traits.loadLhs(&blA[5*LhsProgress], A1);
-            traits.loadRhs(&blB[4*RhsProgress], B_0);
-            traits.madd(A0,B_0,C0,T0);
-            traits.madd(A1,B_0,C4,B_0);
-            traits.loadRhs(&blB[5*RhsProgress], B_0);
-            traits.madd(A0,B_0,C1,T0);
-            traits.madd(A1,B_0,C5,B_0);
-
-            traits.loadLhs(&blA[6*LhsProgress], A0);
-            traits.loadLhs(&blA[7*LhsProgress], A1);
-            traits.loadRhs(&blB[6*RhsProgress], B_0);
-            traits.madd(A0,B_0,C0,T0);
-            traits.madd(A1,B_0,C4,B_0);
-            traits.loadRhs(&blB[7*RhsProgress], B_0);
-            traits.madd(A0,B_0,C1,T0);
-            traits.madd(A1,B_0,C5,B_0);
-EIGEN_ASM_COMMENT("myend");
-          }
-          else
+          for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
           {
-EIGEN_ASM_COMMENT("mybegin4");
-            LhsPacket A0, A1;
-            RhsPacket B_0, B1, B2, B3;
-            RhsPacket T0;
-            
-            traits.loadLhs(&blA[0*LhsProgress], A0);
-            traits.loadLhs(&blA[1*LhsProgress], A1);
-            traits.loadRhs(&blB[0*RhsProgress], B_0);
-            traits.loadRhs(&blB[1*RhsProgress], B1);
-
-            traits.madd(A0,B_0,C0,T0);
-            traits.loadRhs(&blB[2*RhsProgress], B2);
-            traits.madd(A1,B_0,C4,B_0);
-            traits.loadRhs(&blB[3*RhsProgress], B3);
-            traits.loadRhs(&blB[4*RhsProgress], B_0);
-            traits.madd(A0,B1,C1,T0);
-            traits.madd(A1,B1,C5,B1);
-            traits.loadRhs(&blB[5*RhsProgress], B1);
-            traits.madd(A0,B2,C2,T0);
-            traits.madd(A1,B2,C6,B2);
-            traits.loadRhs(&blB[6*RhsProgress], B2);
-            traits.madd(A0,B3,C3,T0);
-            traits.loadLhs(&blA[2*LhsProgress], A0);
-            traits.madd(A1,B3,C7,B3);
-            traits.loadLhs(&blA[3*LhsProgress], A1);
-            traits.loadRhs(&blB[7*RhsProgress], B3);
-            traits.madd(A0,B_0,C0,T0);
-            traits.madd(A1,B_0,C4,B_0);
-            traits.loadRhs(&blB[8*RhsProgress], B_0);
-            traits.madd(A0,B1,C1,T0);
-            traits.madd(A1,B1,C5,B1);
-            traits.loadRhs(&blB[9*RhsProgress], B1);
-            traits.madd(A0,B2,C2,T0);
-            traits.madd(A1,B2,C6,B2);
-            traits.loadRhs(&blB[10*RhsProgress], B2);
-            traits.madd(A0,B3,C3,T0);
-            traits.loadLhs(&blA[4*LhsProgress], A0);
-            traits.madd(A1,B3,C7,B3);
-            traits.loadLhs(&blA[5*LhsProgress], A1);
-            traits.loadRhs(&blB[11*RhsProgress], B3);
-
-            traits.madd(A0,B_0,C0,T0);
-            traits.madd(A1,B_0,C4,B_0);
-            traits.loadRhs(&blB[12*RhsProgress], B_0);
-            traits.madd(A0,B1,C1,T0);
-            traits.madd(A1,B1,C5,B1);
-            traits.loadRhs(&blB[13*RhsProgress], B1);
-            traits.madd(A0,B2,C2,T0);
-            traits.madd(A1,B2,C6,B2);
-            traits.loadRhs(&blB[14*RhsProgress], B2);
-            traits.madd(A0,B3,C3,T0);
-            traits.loadLhs(&blA[6*LhsProgress], A0);
-            traits.madd(A1,B3,C7,B3);
-            traits.loadLhs(&blA[7*LhsProgress], A1);
-            traits.loadRhs(&blB[15*RhsProgress], B3);
-            traits.madd(A0,B_0,C0,T0);
-            traits.madd(A1,B_0,C4,B_0);
-            traits.madd(A0,B1,C1,T0);
-            traits.madd(A1,B1,C5,B1);
-            traits.madd(A0,B2,C2,T0);
-            traits.madd(A1,B2,C6,B2);
-            traits.madd(A0,B3,C3,T0);
-            traits.madd(A1,B3,C7,B3);
-          }
+          
+          // We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely
+          // stored into 3 x nr registers.
+          
+          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
+          prefetch(&blA[0]);
+
+          // gets res block as register
+          AccPacket C0, C1, C2,  C3,
+                    C4, C5, C6,  C7,
+                    C8, C9, C10, C11;
+          traits.initAcc(C0);  traits.initAcc(C1);  traits.initAcc(C2);  traits.initAcc(C3);
+          traits.initAcc(C4);  traits.initAcc(C5);  traits.initAcc(C6);  traits.initAcc(C7);
+          traits.initAcc(C8);  traits.initAcc(C9);  traits.initAcc(C10); traits.initAcc(C11);
+
+          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+          r0.prefetch(0);
+          r1.prefetch(0);
+          r2.prefetch(0);
+          r3.prefetch(0);
+
+          // performs "inner" products
+          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+          prefetch(&blB[0]);
+          LhsPacket A0, A1;
 
-          blB += 4*nr*RhsProgress;
-          blA += 4*mr;
-        }
-        // process remaining peeled loop
-        for(Index k=peeled_kc; k<depth; k++)
-        {
-          if(nr==2)
+          for(Index k=0; k<peeled_kc; k+=pk)
           {
-            LhsPacket A0, A1;
-            RhsPacket B_0;
-            RhsPacket T0;
-
-            traits.loadLhs(&blA[0*LhsProgress], A0);
-            traits.loadLhs(&blA[1*LhsProgress], A1);
-            traits.loadRhs(&blB[0*RhsProgress], B_0);
-            traits.madd(A0,B_0,C0,T0);
-            traits.madd(A1,B_0,C4,B_0);
-            traits.loadRhs(&blB[1*RhsProgress], B_0);
-            traits.madd(A0,B_0,C1,T0);
-            traits.madd(A1,B_0,C5,B_0);
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
+            RhsPacket B_0, T0;
+            LhsPacket A2;
+
+#define EIGEN_GEBP_ONESTEP(K) \
+            do { \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
+              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+              internal::prefetch(blA+(3*K+16)*LhsProgress); \
+              if (EIGEN_ARCH_ARM) internal::prefetch(blB+(4*K+16)*RhsProgress); /* Bug 953 */ \
+              traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0);  \
+              traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1);  \
+              traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2);  \
+              possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 0>(B_0, blB); \
+              traits.madd(A0, B_0, C0, T0); \
+              traits.madd(A1, B_0, C4, T0); \
+              traits.madd(A2, B_0, C8, B_0); \
+              possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 1>(B_0, blB); \
+              traits.madd(A0, B_0, C1, T0); \
+              traits.madd(A1, B_0, C5, T0); \
+              traits.madd(A2, B_0, C9, B_0); \
+              possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 2>(B_0, blB); \
+              traits.madd(A0, B_0, C2,  T0); \
+              traits.madd(A1, B_0, C6,  T0); \
+              traits.madd(A2, B_0, C10, B_0); \
+              possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 3>(B_0, blB); \
+              traits.madd(A0, B_0, C3 , T0); \
+              traits.madd(A1, B_0, C7,  T0); \
+              traits.madd(A2, B_0, C11, B_0); \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
+            } while(false)
+
+            internal::prefetch(blB);
+            EIGEN_GEBP_ONESTEP(0);
+            EIGEN_GEBP_ONESTEP(1);
+            EIGEN_GEBP_ONESTEP(2);
+            EIGEN_GEBP_ONESTEP(3);
+            EIGEN_GEBP_ONESTEP(4);
+            EIGEN_GEBP_ONESTEP(5);
+            EIGEN_GEBP_ONESTEP(6);
+            EIGEN_GEBP_ONESTEP(7);
+
+            blB += pk*4*RhsProgress;
+            blA += pk*3*Traits::LhsProgress;
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
           }
-          else
+          // process remaining peeled loop
+          for(Index k=peeled_kc; k<depth; k++)
           {
-            LhsPacket A0, A1;
-            RhsPacket B_0, B1, B2, B3;
-            RhsPacket T0;
-
-            traits.loadLhs(&blA[0*LhsProgress], A0);
-            traits.loadLhs(&blA[1*LhsProgress], A1);
-            traits.loadRhs(&blB[0*RhsProgress], B_0);
-            traits.loadRhs(&blB[1*RhsProgress], B1);
-
-            traits.madd(A0,B_0,C0,T0);
-            traits.loadRhs(&blB[2*RhsProgress], B2);
-            traits.madd(A1,B_0,C4,B_0);
-            traits.loadRhs(&blB[3*RhsProgress], B3);
-            traits.madd(A0,B1,C1,T0);
-            traits.madd(A1,B1,C5,B1);
-            traits.madd(A0,B2,C2,T0);
-            traits.madd(A1,B2,C6,B2);
-            traits.madd(A0,B3,C3,T0);
-            traits.madd(A1,B3,C7,B3);
+            RhsPacket B_0, T0;
+            LhsPacket A2;
+            EIGEN_GEBP_ONESTEP(0);
+            blB += 4*RhsProgress;
+            blA += 3*Traits::LhsProgress;
           }
 
-          blB += nr*RhsProgress;
-          blA += mr;
-        }
+#undef EIGEN_GEBP_ONESTEP
 
-        if(nr==4)
-        {
-          ResPacket R0, R1, R2, R3, R4, R5, R6;
+          possiblyRotatingKernelHelper.unrotateResult(C0, C1, C2, C3);
+          possiblyRotatingKernelHelper.unrotateResult(C4, C5, C6, C7);
+          possiblyRotatingKernelHelper.unrotateResult(C8, C9, C10, C11);
+
+          ResPacket R0, R1, R2;
           ResPacket alphav = pset1<ResPacket>(alpha);
 
-          R0 = ploadu<ResPacket>(r0);
-          R1 = ploadu<ResPacket>(r1);
-          R2 = ploadu<ResPacket>(r2);
-          R3 = ploadu<ResPacket>(r3);
-          R4 = ploadu<ResPacket>(r0 + ResPacketSize);
-          R5 = ploadu<ResPacket>(r1 + ResPacketSize);
-          R6 = ploadu<ResPacket>(r2 + ResPacketSize);
+          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r0.loadPacket(2 * Traits::ResPacketSize);
           traits.acc(C0, alphav, R0);
-          pstoreu(r0, R0);
-          R0 = ploadu<ResPacket>(r3 + ResPacketSize);
-
-          traits.acc(C1, alphav, R1);
-          traits.acc(C2, alphav, R2);
-          traits.acc(C3, alphav, R3);
-          traits.acc(C4, alphav, R4);
-          traits.acc(C5, alphav, R5);
-          traits.acc(C6, alphav, R6);
-          traits.acc(C7, alphav, R0);
-          
-          pstoreu(r1, R1);
-          pstoreu(r2, R2);
-          pstoreu(r3, R3);
-          pstoreu(r0 + ResPacketSize, R4);
-          pstoreu(r1 + ResPacketSize, R5);
-          pstoreu(r2 + ResPacketSize, R6);
-          pstoreu(r3 + ResPacketSize, R0);
+          traits.acc(C4, alphav, R1);
+          traits.acc(C8, alphav, R2);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+          r0.storePacket(1 * Traits::ResPacketSize, R1);
+          r0.storePacket(2 * Traits::ResPacketSize, R2);
+
+          R0 = r1.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r1.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r1.loadPacket(2 * Traits::ResPacketSize);
+          traits.acc(C1, alphav, R0);
+          traits.acc(C5, alphav, R1);
+          traits.acc(C9, alphav, R2);
+          r1.storePacket(0 * Traits::ResPacketSize, R0);
+          r1.storePacket(1 * Traits::ResPacketSize, R1);
+          r1.storePacket(2 * Traits::ResPacketSize, R2);
+
+          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r2.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r2.loadPacket(2 * Traits::ResPacketSize);
+          traits.acc(C2, alphav, R0);
+          traits.acc(C6, alphav, R1);
+          traits.acc(C10, alphav, R2);
+          r2.storePacket(0 * Traits::ResPacketSize, R0);
+          r2.storePacket(1 * Traits::ResPacketSize, R1);
+          r2.storePacket(2 * Traits::ResPacketSize, R2);
+
+          R0 = r3.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r3.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r3.loadPacket(2 * Traits::ResPacketSize);
+          traits.acc(C3, alphav, R0);
+          traits.acc(C7, alphav, R1);
+          traits.acc(C11, alphav, R2);
+          r3.storePacket(0 * Traits::ResPacketSize, R0);
+          r3.storePacket(1 * Traits::ResPacketSize, R1);
+          r3.storePacket(2 * Traits::ResPacketSize, R2);          
+          }
         }
-        else
+
+        // Deal with remaining columns of the rhs
+        for(Index j2=packet_cols4; j2<cols; j2++)
         {
-          ResPacket R0, R1, R4;
+          for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
+          {
+          // One column at a time
+          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
+          prefetch(&blA[0]);
+
+          // gets res block as register
+          AccPacket C0, C4, C8;
+          traits.initAcc(C0);
+          traits.initAcc(C4);
+          traits.initAcc(C8);
+
+          LinearMapper r0 = res.getLinearMapper(i, j2);
+          r0.prefetch(0);
+
+          // performs "inner" products
+          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+          LhsPacket A0, A1, A2;
+          
+          for(Index k=0; k<peeled_kc; k+=pk)
+          {
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
+            RhsPacket B_0;
+#define EIGEN_GEBGP_ONESTEP(K) \
+            do { \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
+              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+              traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0);  \
+              traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1);  \
+              traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2);  \
+              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);   \
+              traits.madd(A0, B_0, C0, B_0); \
+              traits.madd(A1, B_0, C4, B_0); \
+              traits.madd(A2, B_0, C8, B_0); \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
+            } while(false)
+        
+            EIGEN_GEBGP_ONESTEP(0);
+            EIGEN_GEBGP_ONESTEP(1);
+            EIGEN_GEBGP_ONESTEP(2);
+            EIGEN_GEBGP_ONESTEP(3);
+            EIGEN_GEBGP_ONESTEP(4);
+            EIGEN_GEBGP_ONESTEP(5);
+            EIGEN_GEBGP_ONESTEP(6);
+            EIGEN_GEBGP_ONESTEP(7);
+
+            blB += pk*RhsProgress;
+            blA += pk*3*Traits::LhsProgress;
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
+          }
+
+          // process remaining peeled loop
+          for(Index k=peeled_kc; k<depth; k++)
+          {
+            RhsPacket B_0;
+            EIGEN_GEBGP_ONESTEP(0);
+            blB += RhsProgress;
+            blA += 3*Traits::LhsProgress;
+          }
+#undef EIGEN_GEBGP_ONESTEP
+          ResPacket R0, R1, R2;
           ResPacket alphav = pset1<ResPacket>(alpha);
 
-          R0 = ploadu<ResPacket>(r0);
-          R1 = ploadu<ResPacket>(r1);
-          R4 = ploadu<ResPacket>(r0 + ResPacketSize);
+          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r0.loadPacket(2 * Traits::ResPacketSize);
           traits.acc(C0, alphav, R0);
-          pstoreu(r0, R0);
-          R0 = ploadu<ResPacket>(r1 + ResPacketSize);
-          traits.acc(C1, alphav, R1);
-          traits.acc(C4, alphav, R4);
-          traits.acc(C5, alphav, R0);
-          pstoreu(r1, R1);
-          pstoreu(r0 + ResPacketSize, R4);
-          pstoreu(r1 + ResPacketSize, R0);
+          traits.acc(C4, alphav, R1);
+          traits.acc(C8, alphav, R2);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+          r0.storePacket(1 * Traits::ResPacketSize, R1);
+          r0.storePacket(2 * Traits::ResPacketSize, R2);          
+          }
         }
-        
       }
-      
-      if(rows-peeled_mc>=LhsProgress)
+    }
+
+    //---------- Process 2 * LhsProgress rows at once ----------
+    if(mr>=2*Traits::LhsProgress)
+    {
+      const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
+      // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
+      // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
+      // or because we are testing specific blocking sizes.
+      Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ));
+
+      for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
       {
-        Index i = peeled_mc;
-        const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
-        prefetch(&blA[0]);
-
-        // gets res block as register
-        AccPacket C0, C1, C2, C3;
-                  traits.initAcc(C0);
-                  traits.initAcc(C1);
-        if(nr==4) traits.initAcc(C2);
-        if(nr==4) traits.initAcc(C3);
-
-        // performs "inner" product
-        const RhsScalar* blB = unpackedB;
-        for(Index k=0; k<peeled_kc; k+=4)
+        Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
+        for(Index j2=0; j2<packet_cols4; j2+=nr)
         {
-          if(nr==2)
+          for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
           {
-            LhsPacket A0;
-            RhsPacket B_0, B1;
+          
+          // We selected a 2*Traits::LhsProgress x nr micro block of res which is entirely
+          // stored into 2 x nr registers.
+          
+          const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
+          prefetch(&blA[0]);
+
+          // gets res block as register
+          AccPacket C0, C1, C2, C3,
+                    C4, C5, C6, C7;
+          traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
+          traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
+
+          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+          r0.prefetch(prefetch_res_offset);
+          r1.prefetch(prefetch_res_offset);
+          r2.prefetch(prefetch_res_offset);
+          r3.prefetch(prefetch_res_offset);
+
+          // performs "inner" products
+          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+          prefetch(&blB[0]);
+          LhsPacket A0, A1;
 
-            traits.loadLhs(&blA[0*LhsProgress], A0);
-            traits.loadRhs(&blB[0*RhsProgress], B_0);
-            traits.loadRhs(&blB[1*RhsProgress], B1);
-            traits.madd(A0,B_0,C0,B_0);
-            traits.loadRhs(&blB[2*RhsProgress], B_0);
-            traits.madd(A0,B1,C1,B1);
-            traits.loadLhs(&blA[1*LhsProgress], A0);
-            traits.loadRhs(&blB[3*RhsProgress], B1);
-            traits.madd(A0,B_0,C0,B_0);
-            traits.loadRhs(&blB[4*RhsProgress], B_0);
-            traits.madd(A0,B1,C1,B1);
-            traits.loadLhs(&blA[2*LhsProgress], A0);
-            traits.loadRhs(&blB[5*RhsProgress], B1);
-            traits.madd(A0,B_0,C0,B_0);
-            traits.loadRhs(&blB[6*RhsProgress], B_0);
-            traits.madd(A0,B1,C1,B1);
-            traits.loadLhs(&blA[3*LhsProgress], A0);
-            traits.loadRhs(&blB[7*RhsProgress], B1);
-            traits.madd(A0,B_0,C0,B_0);
-            traits.madd(A0,B1,C1,B1);
-          }
-          else
+          for(Index k=0; k<peeled_kc; k+=pk)
           {
-            LhsPacket A0;
-            RhsPacket B_0, B1, B2, B3;
-
-            traits.loadLhs(&blA[0*LhsProgress], A0);
-            traits.loadRhs(&blB[0*RhsProgress], B_0);
-            traits.loadRhs(&blB[1*RhsProgress], B1);
-
-            traits.madd(A0,B_0,C0,B_0);
-            traits.loadRhs(&blB[2*RhsProgress], B2);
-            traits.loadRhs(&blB[3*RhsProgress], B3);
-            traits.loadRhs(&blB[4*RhsProgress], B_0);
-            traits.madd(A0,B1,C1,B1);
-            traits.loadRhs(&blB[5*RhsProgress], B1);
-            traits.madd(A0,B2,C2,B2);
-            traits.loadRhs(&blB[6*RhsProgress], B2);
-            traits.madd(A0,B3,C3,B3);
-            traits.loadLhs(&blA[1*LhsProgress], A0);
-            traits.loadRhs(&blB[7*RhsProgress], B3);
-            traits.madd(A0,B_0,C0,B_0);
-            traits.loadRhs(&blB[8*RhsProgress], B_0);
-            traits.madd(A0,B1,C1,B1);
-            traits.loadRhs(&blB[9*RhsProgress], B1);
-            traits.madd(A0,B2,C2,B2);
-            traits.loadRhs(&blB[10*RhsProgress], B2);
-            traits.madd(A0,B3,C3,B3);
-            traits.loadLhs(&blA[2*LhsProgress], A0);
-            traits.loadRhs(&blB[11*RhsProgress], B3);
-
-            traits.madd(A0,B_0,C0,B_0);
-            traits.loadRhs(&blB[12*RhsProgress], B_0);
-            traits.madd(A0,B1,C1,B1);
-            traits.loadRhs(&blB[13*RhsProgress], B1);
-            traits.madd(A0,B2,C2,B2);
-            traits.loadRhs(&blB[14*RhsProgress], B2);
-            traits.madd(A0,B3,C3,B3);
-
-            traits.loadLhs(&blA[3*LhsProgress], A0);
-            traits.loadRhs(&blB[15*RhsProgress], B3);
-            traits.madd(A0,B_0,C0,B_0);
-            traits.madd(A0,B1,C1,B1);
-            traits.madd(A0,B2,C2,B2);
-            traits.madd(A0,B3,C3,B3);
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
+            RhsPacket B_0, B1, B2, B3, T0;
+
+   #define EIGEN_GEBGP_ONESTEP(K) \
+            do {                                                                \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4");        \
+              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+              traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0);                    \
+              traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1);                    \
+              traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3);  \
+              traits.madd(A0, B_0, C0, T0);                                     \
+              traits.madd(A1, B_0, C4, B_0);                                    \
+              traits.madd(A0, B1,  C1, T0);                                     \
+              traits.madd(A1, B1,  C5, B1);                                     \
+              traits.madd(A0, B2,  C2, T0);                                     \
+              traits.madd(A1, B2,  C6, B2);                                     \
+              traits.madd(A0, B3,  C3, T0);                                     \
+              traits.madd(A1, B3,  C7, B3);                                     \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4");          \
+            } while(false)
+            
+            internal::prefetch(blB+(48+0));
+            EIGEN_GEBGP_ONESTEP(0);
+            EIGEN_GEBGP_ONESTEP(1);
+            EIGEN_GEBGP_ONESTEP(2);
+            EIGEN_GEBGP_ONESTEP(3);
+            internal::prefetch(blB+(48+16));
+            EIGEN_GEBGP_ONESTEP(4);
+            EIGEN_GEBGP_ONESTEP(5);
+            EIGEN_GEBGP_ONESTEP(6);
+            EIGEN_GEBGP_ONESTEP(7);
+
+            blB += pk*4*RhsProgress;
+            blA += pk*(2*Traits::LhsProgress);
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
           }
-
-          blB += nr*4*RhsProgress;
-          blA += 4*LhsProgress;
-        }
-        // process remaining peeled loop
-        for(Index k=peeled_kc; k<depth; k++)
-        {
-          if(nr==2)
+          // process remaining peeled loop
+          for(Index k=peeled_kc; k<depth; k++)
           {
-            LhsPacket A0;
-            RhsPacket B_0, B1;
-
-            traits.loadLhs(&blA[0*LhsProgress], A0);
-            traits.loadRhs(&blB[0*RhsProgress], B_0);
-            traits.loadRhs(&blB[1*RhsProgress], B1);
-            traits.madd(A0,B_0,C0,B_0);
-            traits.madd(A0,B1,C1,B1);
+            RhsPacket B_0, B1, B2, B3, T0;
+            EIGEN_GEBGP_ONESTEP(0);
+            blB += 4*RhsProgress;
+            blA += 2*Traits::LhsProgress;
           }
-          else
-          {
-            LhsPacket A0;
-            RhsPacket B_0, B1, B2, B3;
+#undef EIGEN_GEBGP_ONESTEP
 
-            traits.loadLhs(&blA[0*LhsProgress], A0);
-            traits.loadRhs(&blB[0*RhsProgress], B_0);
-            traits.loadRhs(&blB[1*RhsProgress], B1);
-            traits.loadRhs(&blB[2*RhsProgress], B2);
-            traits.loadRhs(&blB[3*RhsProgress], B3);
+          ResPacket R0, R1, R2, R3;
+          ResPacket alphav = pset1<ResPacket>(alpha);
 
-            traits.madd(A0,B_0,C0,B_0);
-            traits.madd(A0,B1,C1,B1);
-            traits.madd(A0,B2,C2,B2);
-            traits.madd(A0,B3,C3,B3);
+          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r1.loadPacket(0 * Traits::ResPacketSize);
+          R3 = r1.loadPacket(1 * Traits::ResPacketSize);
+          traits.acc(C0, alphav, R0);
+          traits.acc(C4, alphav, R1);
+          traits.acc(C1, alphav, R2);
+          traits.acc(C5, alphav, R3);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+          r0.storePacket(1 * Traits::ResPacketSize, R1);
+          r1.storePacket(0 * Traits::ResPacketSize, R2);
+          r1.storePacket(1 * Traits::ResPacketSize, R3);
+
+          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r2.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r3.loadPacket(0 * Traits::ResPacketSize);
+          R3 = r3.loadPacket(1 * Traits::ResPacketSize);
+          traits.acc(C2,  alphav, R0);
+          traits.acc(C6,  alphav, R1);
+          traits.acc(C3,  alphav, R2);
+          traits.acc(C7,  alphav, R3);
+          r2.storePacket(0 * Traits::ResPacketSize, R0);
+          r2.storePacket(1 * Traits::ResPacketSize, R1);
+          r3.storePacket(0 * Traits::ResPacketSize, R2);
+          r3.storePacket(1 * Traits::ResPacketSize, R3);
           }
-
-          blB += nr*RhsProgress;
-          blA += LhsProgress;
         }
+      
+        // Deal with remaining columns of the rhs
+        for(Index j2=packet_cols4; j2<cols; j2++)
+        {
+          for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
+          {
+          // One column at a time
+          const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
+          prefetch(&blA[0]);
 
-        ResPacket R0, R1, R2, R3;
-        ResPacket alphav = pset1<ResPacket>(alpha);
-
-        ResScalar* r0 = &res[(j2+0)*resStride + i];
-        ResScalar* r1 = r0 + resStride;
-        ResScalar* r2 = r1 + resStride;
-        ResScalar* r3 = r2 + resStride;
+          // gets res block as register
+          AccPacket C0, C4;
+          traits.initAcc(C0);
+          traits.initAcc(C4);
 
-                  R0 = ploadu<ResPacket>(r0);
-                  R1 = ploadu<ResPacket>(r1);
-        if(nr==4) R2 = ploadu<ResPacket>(r2);
-        if(nr==4) R3 = ploadu<ResPacket>(r3);
+          LinearMapper r0 = res.getLinearMapper(i, j2);
+          r0.prefetch(prefetch_res_offset);
 
-                  traits.acc(C0, alphav, R0);
-                  traits.acc(C1, alphav, R1);
-        if(nr==4) traits.acc(C2, alphav, R2);
-        if(nr==4) traits.acc(C3, alphav, R3);
+          // performs "inner" products
+          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+          LhsPacket A0, A1;
 
-                  pstoreu(r0, R0);
-                  pstoreu(r1, R1);
-        if(nr==4) pstoreu(r2, R2);
-        if(nr==4) pstoreu(r3, R3);
-      }
-      for(Index i=peeled_mc2; i<rows; i++)
-      {
-        const LhsScalar* blA = &blockA[i*strideA+offsetA];
-        prefetch(&blA[0]);
-
-        // gets a 1 x nr res block as registers
-        ResScalar C0(0), C1(0), C2(0), C3(0);
-        // TODO directly use blockB ???
-        const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
-        for(Index k=0; k<depth; k++)
-        {
-          if(nr==2)
+          for(Index k=0; k<peeled_kc; k+=pk)
           {
-            LhsScalar A0;
-            RhsScalar B_0, B1;
-
-            A0 = blA[k];
-            B_0 = blB[0];
-            B1 = blB[1];
-            MADD(cj,A0,B_0,C0,B_0);
-            MADD(cj,A0,B1,C1,B1);
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
+            RhsPacket B_0, B1;
+        
+#define EIGEN_GEBGP_ONESTEP(K) \
+            do {                                                                  \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1");          \
+              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+              traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0);                      \
+              traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1);                      \
+              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);                       \
+              traits.madd(A0, B_0, C0, B1);                                       \
+              traits.madd(A1, B_0, C4, B_0);                                      \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1");            \
+            } while(false)
+        
+            EIGEN_GEBGP_ONESTEP(0);
+            EIGEN_GEBGP_ONESTEP(1);
+            EIGEN_GEBGP_ONESTEP(2);
+            EIGEN_GEBGP_ONESTEP(3);
+            EIGEN_GEBGP_ONESTEP(4);
+            EIGEN_GEBGP_ONESTEP(5);
+            EIGEN_GEBGP_ONESTEP(6);
+            EIGEN_GEBGP_ONESTEP(7);
+
+            blB += pk*RhsProgress;
+            blA += pk*2*Traits::LhsProgress;
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
           }
-          else
+
+          // process remaining peeled loop
+          for(Index k=peeled_kc; k<depth; k++)
           {
-            LhsScalar A0;
-            RhsScalar B_0, B1, B2, B3;
-
-            A0 = blA[k];
-            B_0 = blB[0];
-            B1 = blB[1];
-            B2 = blB[2];
-            B3 = blB[3];
-
-            MADD(cj,A0,B_0,C0,B_0);
-            MADD(cj,A0,B1,C1,B1);
-            MADD(cj,A0,B2,C2,B2);
-            MADD(cj,A0,B3,C3,B3);
+            RhsPacket B_0, B1;
+            EIGEN_GEBGP_ONESTEP(0);
+            blB += RhsProgress;
+            blA += 2*Traits::LhsProgress;
           }
+#undef EIGEN_GEBGP_ONESTEP
+          ResPacket R0, R1;
+          ResPacket alphav = pset1<ResPacket>(alpha);
 
-          blB += nr;
+          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+          traits.acc(C0, alphav, R0);
+          traits.acc(C4, alphav, R1);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+          r0.storePacket(1 * Traits::ResPacketSize, R1);
+          }
         }
-                  res[(j2+0)*resStride + i] += alpha*C0;
-                  res[(j2+1)*resStride + i] += alpha*C1;
-        if(nr==4) res[(j2+2)*resStride + i] += alpha*C2;
-        if(nr==4) res[(j2+3)*resStride + i] += alpha*C3;
       }
     }
-    // process remaining rhs/res columns one at a time
-    // => do the same but with nr==1
-    for(Index j2=packet_cols; j2<cols; j2++)
+    //---------- Process 1 * LhsProgress rows at once ----------
+    if(mr>=1*Traits::LhsProgress)
     {
-      // unpack B
-      traits.unpackRhs(depth, &blockB[j2*strideB+offsetB], unpackedB);
-
-      for(Index i=0; i<peeled_mc; i+=mr)
+      // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth)
+      for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress)
       {
-        const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
-        prefetch(&blA[0]);
+        // loops on each largest micro vertical panel of rhs (depth * nr)
+        for(Index j2=0; j2<packet_cols4; j2+=nr)
+        {
+          // We select a 1*Traits::LhsProgress x nr micro block of res which is entirely
+          // stored into 1 x nr registers.
+          
+          const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
+          prefetch(&blA[0]);
+
+          // gets res block as register
+          AccPacket C0, C1, C2, C3;
+          traits.initAcc(C0);
+          traits.initAcc(C1);
+          traits.initAcc(C2);
+          traits.initAcc(C3);
+
+          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+          r0.prefetch(prefetch_res_offset);
+          r1.prefetch(prefetch_res_offset);
+          r2.prefetch(prefetch_res_offset);
+          r3.prefetch(prefetch_res_offset);
+
+          // performs "inner" products
+          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+          prefetch(&blB[0]);
+          LhsPacket A0;
 
-        // TODO move the res loads to the stores
+          for(Index k=0; k<peeled_kc; k+=pk)
+          {
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4");
+            RhsPacket B_0, B1, B2, B3;
+               
+#define EIGEN_GEBGP_ONESTEP(K) \
+            do {                                                                \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4");        \
+              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+              traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0);                    \
+              traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3);  \
+              traits.madd(A0, B_0, C0, B_0);                                    \
+              traits.madd(A0, B1,  C1, B1);                                     \
+              traits.madd(A0, B2,  C2, B2);                                     \
+              traits.madd(A0, B3,  C3, B3);                                     \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4");          \
+            } while(false)
+            
+            internal::prefetch(blB+(48+0));
+            EIGEN_GEBGP_ONESTEP(0);
+            EIGEN_GEBGP_ONESTEP(1);
+            EIGEN_GEBGP_ONESTEP(2);
+            EIGEN_GEBGP_ONESTEP(3);
+            internal::prefetch(blB+(48+16));
+            EIGEN_GEBGP_ONESTEP(4);
+            EIGEN_GEBGP_ONESTEP(5);
+            EIGEN_GEBGP_ONESTEP(6);
+            EIGEN_GEBGP_ONESTEP(7);
+
+            blB += pk*4*RhsProgress;
+            blA += pk*1*LhsProgress;
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4");
+          }
+          // process remaining peeled loop
+          for(Index k=peeled_kc; k<depth; k++)
+          {
+            RhsPacket B_0, B1, B2, B3;
+            EIGEN_GEBGP_ONESTEP(0);
+            blB += 4*RhsProgress;
+            blA += 1*LhsProgress;
+          }
+#undef EIGEN_GEBGP_ONESTEP
 
-        // get res block as registers
-        AccPacket C0, C4;
-        traits.initAcc(C0);
-        traits.initAcc(C4);
+          ResPacket R0, R1;
+          ResPacket alphav = pset1<ResPacket>(alpha);
 
-        const RhsScalar* blB = unpackedB;
-        for(Index k=0; k<depth; k++)
-        {
-          LhsPacket A0, A1;
-          RhsPacket B_0;
-          RhsPacket T0;
+          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r1.loadPacket(0 * Traits::ResPacketSize);
+          traits.acc(C0, alphav, R0);
+          traits.acc(C1,  alphav, R1);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+          r1.storePacket(0 * Traits::ResPacketSize, R1);
+
+          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r3.loadPacket(0 * Traits::ResPacketSize);
+          traits.acc(C2,  alphav, R0);
+          traits.acc(C3,  alphav, R1);
+          r2.storePacket(0 * Traits::ResPacketSize, R0);
+          r3.storePacket(0 * Traits::ResPacketSize, R1);
+        }
 
-          traits.loadLhs(&blA[0*LhsProgress], A0);
-          traits.loadLhs(&blA[1*LhsProgress], A1);
-          traits.loadRhs(&blB[0*RhsProgress], B_0);
-          traits.madd(A0,B_0,C0,T0);
-          traits.madd(A1,B_0,C4,B_0);
+        // Deal with remaining columns of the rhs
+        for(Index j2=packet_cols4; j2<cols; j2++)
+        {
+          // One column at a time
+          const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
+          prefetch(&blA[0]);
 
-          blB += RhsProgress;
-          blA += 2*LhsProgress;
-        }
-        ResPacket R0, R4;
-        ResPacket alphav = pset1<ResPacket>(alpha);
+          // gets res block as register
+          AccPacket C0;
+          traits.initAcc(C0);
 
-        ResScalar* r0 = &res[(j2+0)*resStride + i];
+          LinearMapper r0 = res.getLinearMapper(i, j2);
 
-        R0 = ploadu<ResPacket>(r0);
-        R4 = ploadu<ResPacket>(r0+ResPacketSize);
+          // performs "inner" products
+          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+          LhsPacket A0;
 
-        traits.acc(C0, alphav, R0);
-        traits.acc(C4, alphav, R4);
+          for(Index k=0; k<peeled_kc; k+=pk)
+          {
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1");
+            RhsPacket B_0;
+        
+#define EIGEN_GEBGP_ONESTEP(K) \
+            do {                                                                \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1");        \
+              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+              traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0);                    \
+              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);                     \
+              traits.madd(A0, B_0, C0, B_0);                                    \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1");          \
+            } while(false);
+
+            EIGEN_GEBGP_ONESTEP(0);
+            EIGEN_GEBGP_ONESTEP(1);
+            EIGEN_GEBGP_ONESTEP(2);
+            EIGEN_GEBGP_ONESTEP(3);
+            EIGEN_GEBGP_ONESTEP(4);
+            EIGEN_GEBGP_ONESTEP(5);
+            EIGEN_GEBGP_ONESTEP(6);
+            EIGEN_GEBGP_ONESTEP(7);
+
+            blB += pk*RhsProgress;
+            blA += pk*1*Traits::LhsProgress;
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1");
+          }
 
-        pstoreu(r0,               R0);
-        pstoreu(r0+ResPacketSize, R4);
+          // process remaining peeled loop
+          for(Index k=peeled_kc; k<depth; k++)
+          {
+            RhsPacket B_0;
+            EIGEN_GEBGP_ONESTEP(0);
+            blB += RhsProgress;
+            blA += 1*Traits::LhsProgress;
+          }
+#undef EIGEN_GEBGP_ONESTEP
+          ResPacket R0;
+          ResPacket alphav = pset1<ResPacket>(alpha);
+          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+          traits.acc(C0, alphav, R0);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+        }
       }
-      if(rows-peeled_mc>=LhsProgress)
+    }
+    //---------- Process remaining rows, 1 at once ----------
+    if(peeled_mc1<rows)
+    {
+      // loop on each panel of the rhs
+      for(Index j2=0; j2<packet_cols4; j2+=nr)
       {
-        Index i = peeled_mc;
-        const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
-        prefetch(&blA[0]);
-
-        AccPacket C0;
-        traits.initAcc(C0);
-
-        const RhsScalar* blB = unpackedB;
-        for(Index k=0; k<depth; k++)
+        // loop on each row of the lhs (1*LhsProgress x depth)
+        for(Index i=peeled_mc1; i<rows; i+=1)
         {
-          LhsPacket A0;
-          RhsPacket B_0;
-          traits.loadLhs(blA, A0);
-          traits.loadRhs(blB, B_0);
-          traits.madd(A0, B_0, C0, B_0);
-          blB += RhsProgress;
-          blA += LhsProgress;
-        }
+          const LhsScalar* blA = &blockA[i*strideA+offsetA];
+          prefetch(&blA[0]);
+          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
 
-        ResPacket alphav = pset1<ResPacket>(alpha);
-        ResPacket R0 = ploadu<ResPacket>(&res[(j2+0)*resStride + i]);
-        traits.acc(C0, alphav, R0);
-        pstoreu(&res[(j2+0)*resStride + i], R0);
+          if( (SwappedTraits::LhsProgress % 4)==0 )
+          {
+            // NOTE The following piece of code won't work for 512-bit registers
+            SAccPacket C0, C1, C2, C3;
+            straits.initAcc(C0);
+            straits.initAcc(C1);
+            straits.initAcc(C2);
+            straits.initAcc(C3);
+
+            const Index spk   = (std::max)(1,SwappedTraits::LhsProgress/4);
+            const Index endk  = (depth/spk)*spk;
+            const Index endk4 = (depth/(spk*4))*(spk*4);
+
+            Index k=0;
+            for(; k<endk4; k+=4*spk)
+            {
+              SLhsPacket A0,A1;
+              SRhsPacket B_0,B_1;
+
+              straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
+              straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
+
+              straits.loadRhsQuad(blA+0*spk, B_0);
+              straits.loadRhsQuad(blA+1*spk, B_1);
+              straits.madd(A0,B_0,C0,B_0);
+              straits.madd(A1,B_1,C1,B_1);
+
+              straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
+              straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
+              straits.loadRhsQuad(blA+2*spk, B_0);
+              straits.loadRhsQuad(blA+3*spk, B_1);
+              straits.madd(A0,B_0,C2,B_0);
+              straits.madd(A1,B_1,C3,B_1);
+
+              blB += 4*SwappedTraits::LhsProgress;
+              blA += 4*spk;
+            }
+            C0 = padd(padd(C0,C1),padd(C2,C3));
+            for(; k<endk; k+=spk)
+            {
+              SLhsPacket A0;
+              SRhsPacket B_0;
+
+              straits.loadLhsUnaligned(blB, A0);
+              straits.loadRhsQuad(blA, B_0);
+              straits.madd(A0,B_0,C0,B_0);
+
+              blB += SwappedTraits::LhsProgress;
+              blA += spk;
+            }
+            if(SwappedTraits::LhsProgress==8)
+            {
+              // Special case where we have to first reduce the accumulation register C0
+              typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
+              typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
+              typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
+              typedef typename conditional<SwappedTraits::LhsProgress==8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
+
+              SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
+              SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
+
+              if(depth-endk>0)
+              {
+                // We have to handle the last row of the rhs which corresponds to a half-packet
+                SLhsPacketHalf a0;
+                SRhsPacketHalf b0;
+                straits.loadLhsUnaligned(blB, a0);
+                straits.loadRhs(blA, b0);
+                SAccPacketHalf c0 = predux4(C0);
+                straits.madd(a0,b0,c0,b0);
+                straits.acc(c0, alphav, R);
+              }
+              else
+              {
+                straits.acc(predux4(C0), alphav, R);
+              }
+              res.scatterPacket(i, j2, R);
+            }
+            else
+            {
+              SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
+              SResPacket alphav = pset1<SResPacket>(alpha);
+              straits.acc(C0, alphav, R);
+              res.scatterPacket(i, j2, R);
+            }
+          }
+          else // scalar path
+          {
+            // get a 1 x 4 res block as registers
+            ResScalar C0(0), C1(0), C2(0), C3(0);
+
+            for(Index k=0; k<depth; k++)
+            {
+              LhsScalar A0;
+              RhsScalar B_0, B_1;
+
+              A0 = blA[k];
+
+              B_0 = blB[0];
+              B_1 = blB[1];
+              CJMADD(cj,A0,B_0,C0,  B_0);
+              CJMADD(cj,A0,B_1,C1,  B_1);
+              
+              B_0 = blB[2];
+              B_1 = blB[3];
+              CJMADD(cj,A0,B_0,C2,  B_0);
+              CJMADD(cj,A0,B_1,C3,  B_1);
+              
+              blB += 4;
+            }
+            res(i, j2 + 0) += alpha * C0;
+            res(i, j2 + 1) += alpha * C1;
+            res(i, j2 + 2) += alpha * C2;
+            res(i, j2 + 3) += alpha * C3;
+          }
+        }
       }
-      for(Index i=peeled_mc2; i<rows; i++)
+      // remaining columns
+      for(Index j2=packet_cols4; j2<cols; j2++)
       {
-        const LhsScalar* blA = &blockA[i*strideA+offsetA];
-        prefetch(&blA[0]);
-
-        // gets a 1 x 1 res block as registers
-        ResScalar C0(0);
-        // FIXME directly use blockB ??
-        const RhsScalar* blB = &blockB[j2*strideB+offsetB];
-        for(Index k=0; k<depth; k++)
+        // loop on each row of the lhs (1*LhsProgress x depth)
+        for(Index i=peeled_mc1; i<rows; i+=1)
         {
-          LhsScalar A0 = blA[k];
-          RhsScalar B_0 = blB[k];
-          MADD(cj, A0, B_0, C0, B_0);
+          const LhsScalar* blA = &blockA[i*strideA+offsetA];
+          prefetch(&blA[0]);
+          // gets a 1 x 1 res block as registers
+          ResScalar C0(0);
+          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+          for(Index k=0; k<depth; k++)
+          {
+            LhsScalar A0 = blA[k];
+            RhsScalar B_0 = blB[k];
+            CJMADD(cj, A0, B_0, C0, B_0);
+          }
+          res(i, j2) += alpha * C0;
         }
-        res[(j2+0)*resStride + i] += alpha*C0;
       }
     }
   }
@@ -1114,79 +1785,193 @@ EIGEN_ASM_COMMENT("mybegin4");
 //
 //  32 33 34 35 ...
 //  36 36 38 39 ...
-template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode> // lhs packing functor, ColMajor specialization
 {
-  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0);
+  typedef typename DataMapper::LinearMapper LinearMapper;
+  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); // writes the packed lhs into blockA; stride/offset must stay 0 unless PanelMode (asserted in the definition)
 };
 
-template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, StorageOrder, Conjugate, PanelMode>
-  ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset)
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
+  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) // copies lhs(i,k) for i<rows, k<depth into blockA, in panels of 3, 2, then 1 packet of rows, then Pack2 rows, then single rows
 {
   typedef typename packet_traits<Scalar>::type Packet;
   enum { PacketSize = packet_traits<Scalar>::size };
 
   EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
+  EIGEN_UNUSED_VARIABLE(stride); // presumably silences unused-variable warnings: stride is only read by the assert and the PanelMode branches
+  EIGEN_UNUSED_VARIABLE(offset); // same for offset
   eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
-  eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) );
+  eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) ); // Pack1 is either a multiple of PacketSize (at most 4 packets) or at most 4
   conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
-  const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride);
   Index count = 0;
-  Index peeled_mc = (rows/Pack1)*Pack1;
-  for(Index i=0; i<peeled_mc; i+=Pack1)
+
+  const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; // end row of the 3-packet loop (0 disables it)
+  const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; // end row of the 2-packet loop, counted from peeled_mc3
+  const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; // end row of the 1-packet loop: rows rounded down to a multiple of PacketSize
+  const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1
+                         : Pack2>1             ? (rows/Pack2)*Pack2 : 0; // end row of the Pack2-wide scalar loop
+
+  Index i=0; // shared row cursor: every stage below resumes where the previous one stopped
+
+  // Pack panels of 3 packets (3*PacketSize rows) at a time
+  if(Pack1>=3*PacketSize)
   {
-    if(PanelMode) count += Pack1 * offset;
+    for(; i<peeled_mc3; i+=3*PacketSize)
+    {
+      if(PanelMode) count += (3*PacketSize) * offset; // PanelMode: leave room for `offset` depth steps before this panel's data
 
-    if(StorageOrder==ColMajor)
+      for(Index k=0; k<depth; k++)
+      {
+        Packet A, B, C;
+        A = lhs.loadPacket(i+0*PacketSize, k);
+        B = lhs.loadPacket(i+1*PacketSize, k);
+        C = lhs.loadPacket(i+2*PacketSize, k);
+        pstore(blockA+count, cj.pconj(A)); count+=PacketSize; // store through cj (conj_if<IsComplex && Conjugate>) and advance the write cursor
+        pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
+        pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
+      }
+      if(PanelMode) count += (3*PacketSize) * (stride-offset-depth); // PanelMode: pad so the panel occupies 3*PacketSize*stride entries in total
+    }
+  }
+  // Pack panels of 2 packets (2*PacketSize rows) at a time
+  if(Pack1>=2*PacketSize)
+  {
+    for(; i<peeled_mc2; i+=2*PacketSize)
     {
+      if(PanelMode) count += (2*PacketSize) * offset;
+
       for(Index k=0; k<depth; k++)
       {
-        Packet A, B, C, D;
-        if(Pack1>=1*PacketSize) A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
-        if(Pack1>=2*PacketSize) B = ploadu<Packet>(&lhs(i+1*PacketSize, k));
-        if(Pack1>=3*PacketSize) C = ploadu<Packet>(&lhs(i+2*PacketSize, k));
-        if(Pack1>=4*PacketSize) D = ploadu<Packet>(&lhs(i+3*PacketSize, k));
-        if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; }
-        if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; }
-        if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; }
-        if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; }
+        Packet A, B;
+        A = lhs.loadPacket(i+0*PacketSize, k);
+        B = lhs.loadPacket(i+1*PacketSize, k);
+        pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
+        pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
       }
+      if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);
     }
-    else
+  }
+  // Pack panels of 1 packet (PacketSize rows) at a time
+  if(Pack1>=1*PacketSize)
+  {
+    for(; i<peeled_mc1; i+=1*PacketSize)
     {
+      if(PanelMode) count += (1*PacketSize) * offset;
+
       for(Index k=0; k<depth; k++)
       {
-        // TODO add a vectorized transpose here
+        Packet A;
+        A = lhs.loadPacket(i+0*PacketSize, k);
+        pstore(blockA+count, cj.pconj(A));
+        count+=PacketSize;
+      }
+      if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
+    }
+  }
+  // Pack scalars, Pack2 rows at a time (only when Pack2 is smaller than a packet)
+  if(Pack2<PacketSize && Pack2>1)
+  {
+    for(; i<peeled_mc0; i+=Pack2)
+    {
+      if(PanelMode) count += Pack2 * offset;
+
+      for(Index k=0; k<depth; k++)
+        for(Index w=0; w<Pack2; w++)
+          blockA[count++] = cj(lhs(i+w, k)); // scalar copy: Pack2 consecutive rows for each depth step
+
+      if(PanelMode) count += Pack2 * (stride-offset-depth);
+    }
+  }
+  for(; i<rows; i++) // leftover rows, one row at a time
+  {
+    if(PanelMode) count += offset;
+    for(Index k=0; k<depth; k++)
+      blockA[count++] = cj(lhs(i, k));
+    if(PanelMode) count += (stride-offset-depth);
+  }
+}
+
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode> // lhs packing functor, RowMajor specialization
+{
+  typedef typename DataMapper::LinearMapper LinearMapper;
+  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); // writes the packed lhs into blockA; stride/offset must stay 0 unless PanelMode (asserted in the definition)
+};
+
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
+  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
+{
+  typedef typename packet_traits<Scalar>::type Packet;
+  enum { PacketSize = packet_traits<Scalar>::size };
+
+  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
+  EIGEN_UNUSED_VARIABLE(stride);
+  EIGEN_UNUSED_VARIABLE(offset);
+  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+  Index count = 0;
+
+//   const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
+//   const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
+//   const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
+
+  int pack = Pack1;
+  Index i = 0;
+  while(pack>0)
+  {
+    Index remaining_rows = rows-i;
+    Index peeled_mc = i+(remaining_rows/pack)*pack;
+    for(; i<peeled_mc; i+=pack)
+    {
+      if(PanelMode) count += pack * offset;
+
+      const Index peeled_k = (depth/PacketSize)*PacketSize;
+      Index k=0;
+      if(pack>=PacketSize)
+      {
+        for(; k<peeled_k; k+=PacketSize)
+        {
+          for (Index m = 0; m < pack; m += PacketSize)
+          {
+            PacketBlock<Packet> kernel;
+            for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);
+            ptranspose(kernel);
+            for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
+          }
+          count += PacketSize*pack;
+        }
+      }
+      for(; k<depth; k++)
+      {
         Index w=0;
-        for(; w<Pack1-3; w+=4)
+        for(; w<pack-3; w+=4)
         {
           Scalar a(cj(lhs(i+w+0, k))),
-                  b(cj(lhs(i+w+1, k))),
-                  c(cj(lhs(i+w+2, k))),
-                  d(cj(lhs(i+w+3, k)));
+                 b(cj(lhs(i+w+1, k))),
+                 c(cj(lhs(i+w+2, k))),
+                 d(cj(lhs(i+w+3, k)));
           blockA[count++] = a;
           blockA[count++] = b;
           blockA[count++] = c;
           blockA[count++] = d;
         }
-        if(Pack1%4)
-          for(;w<Pack1;++w)
+        if(pack%4)
+          for(;w<pack;++w)
             blockA[count++] = cj(lhs(i+w, k));
       }
+
+      if(PanelMode) count += pack * (stride-offset-depth);
     }
-    if(PanelMode) count += Pack1 * (stride-offset-depth);
-  }
-  if(rows-peeled_mc>=Pack2)
-  {
-    if(PanelMode) count += Pack2*offset;
-    for(Index k=0; k<depth; k++)
-      for(Index w=0; w<Pack2; w++)
-        blockA[count++] = cj(lhs(peeled_mc+w, k));
-    if(PanelMode) count += Pack2 * (stride-offset-depth);
-    peeled_mc += Pack2;
+
+    pack -= PacketSize;
+    if(pack<Pack2 && (pack+PacketSize)!=Pack2)
+      pack = Pack2;
   }
-  for(Index i=peeled_mc; i<rows; i++)
+
+  for(; i<rows; i++)
   {
     if(PanelMode) count += offset;
     for(Index k=0; k<depth; k++)
@@ -1202,51 +1987,123 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, StorageOrder,
 //  4  5  6  7   16 17 18 19   25 28
 //  8  9 10 11   20 21 22 23   26 29
 //  .  .  .  .    .  .  .  .    .  .
-template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
 {
   typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename DataMapper::LinearMapper LinearMapper;
   enum { PacketSize = packet_traits<Scalar>::size };
-  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0);
+  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
 };
 
-template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
-  ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset)
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
+  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
 {
   EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
+  EIGEN_UNUSED_VARIABLE(stride);
+  EIGEN_UNUSED_VARIABLE(offset);
   eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
   conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
-  Index packet_cols = (cols/nr) * nr;
+  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
+  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
   Index count = 0;
-  for(Index j2=0; j2<packet_cols; j2+=nr)
+  const Index peeled_k = (depth/PacketSize)*PacketSize;
+//   if(nr>=8)
+//   {
+//     for(Index j2=0; j2<packet_cols8; j2+=8)
+//     {
+//       // skip what we have before
+//       if(PanelMode) count += 8 * offset;
+//       const Scalar* b0 = &rhs[(j2+0)*rhsStride];
+//       const Scalar* b1 = &rhs[(j2+1)*rhsStride];
+//       const Scalar* b2 = &rhs[(j2+2)*rhsStride];
+//       const Scalar* b3 = &rhs[(j2+3)*rhsStride];
+//       const Scalar* b4 = &rhs[(j2+4)*rhsStride];
+//       const Scalar* b5 = &rhs[(j2+5)*rhsStride];
+//       const Scalar* b6 = &rhs[(j2+6)*rhsStride];
+//       const Scalar* b7 = &rhs[(j2+7)*rhsStride];
+//       Index k=0;
+//       if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4
+//       {
+//         for(; k<peeled_k; k+=PacketSize) {
+//           PacketBlock<Packet> kernel;
+//           for (int p = 0; p < PacketSize; ++p) {
+//             kernel.packet[p] = ploadu<Packet>(&rhs[(j2+p)*rhsStride+k]);
+//           }
+//           ptranspose(kernel);
+//           for (int p = 0; p < PacketSize; ++p) {
+//             pstoreu(blockB+count, cj.pconj(kernel.packet[p]));
+//             count+=PacketSize;
+//           }
+//         }
+//       }
+//       for(; k<depth; k++)
+//       {
+//         blockB[count+0] = cj(b0[k]);
+//         blockB[count+1] = cj(b1[k]);
+//         blockB[count+2] = cj(b2[k]);
+//         blockB[count+3] = cj(b3[k]);
+//         blockB[count+4] = cj(b4[k]);
+//         blockB[count+5] = cj(b5[k]);
+//         blockB[count+6] = cj(b6[k]);
+//         blockB[count+7] = cj(b7[k]);
+//         count += 8;
+//       }
+//       // skip what we have after
+//       if(PanelMode) count += 8 * (stride-offset-depth);
+//     }
+//   }
+
+  if(nr>=4)
   {
-    // skip what we have before
-    if(PanelMode) count += nr * offset;
-    const Scalar* b0 = &rhs[(j2+0)*rhsStride];
-    const Scalar* b1 = &rhs[(j2+1)*rhsStride];
-    const Scalar* b2 = &rhs[(j2+2)*rhsStride];
-    const Scalar* b3 = &rhs[(j2+3)*rhsStride];
-    for(Index k=0; k<depth; k++)
+    for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
     {
-                blockB[count+0] = cj(b0[k]);
-                blockB[count+1] = cj(b1[k]);
-      if(nr==4) blockB[count+2] = cj(b2[k]);
-      if(nr==4) blockB[count+3] = cj(b3[k]);
-      count += nr;
+      // skip what we have before
+      if(PanelMode) count += 4 * offset;
+      const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
+      const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
+      const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
+      const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
+
+      Index k=0;
+      if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ??
+      {
+        for(; k<peeled_k; k+=PacketSize) {
+          PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
+          kernel.packet[0] = dm0.loadPacket(k);
+          kernel.packet[1%PacketSize] = dm1.loadPacket(k);
+          kernel.packet[2%PacketSize] = dm2.loadPacket(k);
+          kernel.packet[3%PacketSize] = dm3.loadPacket(k);
+          ptranspose(kernel);
+          pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
+          pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
+          pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
+          pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
+          count+=4*PacketSize;
+        }
+      }
+      for(; k<depth; k++)
+      {
+        blockB[count+0] = cj(dm0(k));
+        blockB[count+1] = cj(dm1(k));
+        blockB[count+2] = cj(dm2(k));
+        blockB[count+3] = cj(dm3(k));
+        count += 4;
+      }
+      // skip what we have after
+      if(PanelMode) count += 4 * (stride-offset-depth);
     }
-    // skip what we have after
-    if(PanelMode) count += nr * (stride-offset-depth);
   }
 
   // copy the remaining columns one at a time (nr==1)
-  for(Index j2=packet_cols; j2<cols; ++j2)
+  for(Index j2=packet_cols4; j2<cols; ++j2)
   {
     if(PanelMode) count += offset;
-    const Scalar* b0 = &rhs[(j2+0)*rhsStride];
+    const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
     for(Index k=0; k<depth; k++)
     {
-      blockB[count] = cj(b0[k]);
+      blockB[count] = cj(dm0(k));
       count += 1;
     }
     if(PanelMode) count += (stride-offset-depth);
@@ -1254,46 +2111,93 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
 }
 
 // this version is optimized for row major matrices
-template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
 {
+  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename DataMapper::LinearMapper LinearMapper;
   enum { PacketSize = packet_traits<Scalar>::size };
-  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0);
+  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
 };
 
-template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
-  ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset)
+template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
+  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
 {
   EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
+  EIGEN_UNUSED_VARIABLE(stride);
+  EIGEN_UNUSED_VARIABLE(offset);
   eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
   conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
-  Index packet_cols = (cols/nr) * nr;
+  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
+  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
   Index count = 0;
-  for(Index j2=0; j2<packet_cols; j2+=nr)
+
+//   if(nr>=8)
+//   {
+//     for(Index j2=0; j2<packet_cols8; j2+=8)
+//     {
+//       // skip what we have before
+//       if(PanelMode) count += 8 * offset;
+//       for(Index k=0; k<depth; k++)
+//       {
+//         if (PacketSize==8) {
+//           Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
+//           pstoreu(blockB+count, cj.pconj(A));
+//         } else if (PacketSize==4) {
+//           Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
+//           Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
+//           pstoreu(blockB+count, cj.pconj(A));
+//           pstoreu(blockB+count+PacketSize, cj.pconj(B));
+//         } else {
+//           const Scalar* b0 = &rhs[k*rhsStride + j2];
+//           blockB[count+0] = cj(b0[0]);
+//           blockB[count+1] = cj(b0[1]);
+//           blockB[count+2] = cj(b0[2]);
+//           blockB[count+3] = cj(b0[3]);
+//           blockB[count+4] = cj(b0[4]);
+//           blockB[count+5] = cj(b0[5]);
+//           blockB[count+6] = cj(b0[6]);
+//           blockB[count+7] = cj(b0[7]);
+//         }
+//         count += 8;
+//       }
+//       // skip what we have after
+//       if(PanelMode) count += 8 * (stride-offset-depth);
+//     }
+//   }
+  if(nr>=4)
   {
-    // skip what we have before
-    if(PanelMode) count += nr * offset;
-    for(Index k=0; k<depth; k++)
+    for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
     {
-      const Scalar* b0 = &rhs[k*rhsStride + j2];
-                blockB[count+0] = cj(b0[0]);
-                blockB[count+1] = cj(b0[1]);
-      if(nr==4) blockB[count+2] = cj(b0[2]);
-      if(nr==4) blockB[count+3] = cj(b0[3]);
-      count += nr;
+      // skip what we have before
+      if(PanelMode) count += 4 * offset;
+      for(Index k=0; k<depth; k++)
+      {
+        if (PacketSize==4) {
+          Packet A = rhs.loadPacket(k, j2);
+          pstoreu(blockB+count, cj.pconj(A));
+          count += PacketSize;
+        } else {
+          const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
+          blockB[count+0] = cj(dm0(0));
+          blockB[count+1] = cj(dm0(1));
+          blockB[count+2] = cj(dm0(2));
+          blockB[count+3] = cj(dm0(3));
+          count += 4;
+        }
+      }
+      // skip what we have after
+      if(PanelMode) count += 4 * (stride-offset-depth);
     }
-    // skip what we have after
-    if(PanelMode) count += nr * (stride-offset-depth);
   }
   // copy the remaining columns one at a time (nr==1)
-  for(Index j2=packet_cols; j2<cols; ++j2)
+  for(Index j2=packet_cols4; j2<cols; ++j2)
   {
     if(PanelMode) count += offset;
-    const Scalar* b0 = &rhs[j2];
     for(Index k=0; k<depth; k++)
     {
-      blockB[count] = cj(b0[k*rhsStride]);
+      blockB[count] = cj(rhs(k, j2));
       count += 1;
     }
     if(PanelMode) count += stride-offset-depth;
@@ -1306,8 +2210,8 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan
   * \sa setCpuCacheSize */
 inline std::ptrdiff_t l1CacheSize()
 {
-  std::ptrdiff_t l1, l2;
-  internal::manage_caching_sizes(GetAction, &l1, &l2);
+  std::ptrdiff_t l1, l2, l3;
+  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
   return l1;
 }
 
@@ -1315,8 +2219,8 @@ inline std::ptrdiff_t l1CacheSize()
   * \sa setCpuCacheSize */
 inline std::ptrdiff_t l2CacheSize()
 {
-  std::ptrdiff_t l1, l2;
-  internal::manage_caching_sizes(GetAction, &l1, &l2);
+  std::ptrdiff_t l1, l2, l3;
+  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
   return l2;
 }
 
@@ -1325,9 +2229,9 @@ inline std::ptrdiff_t l2CacheSize()
   * for the algorithms working per blocks.
   *
   * \sa computeProductBlockingSizes */
-inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
+inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
 {
-  internal::manage_caching_sizes(SetAction, &l1, &l2);
+  internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
 }
 
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Core/products/GeneralMatrixMatrix.h b/nuparu/include/Eigen/src/Core/products/GeneralMatrixMatrix.h
index 3f5ffcf5..d830dfb9 100644
--- a/nuparu/include/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/nuparu/include/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -23,6 +23,8 @@ template<
   typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
 struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor>
 {
+  typedef gebp_traits<RhsScalar,LhsScalar> Traits;
+  
   typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
   static EIGEN_STRONG_INLINE void run(
     Index rows, Index cols, Index depth,
@@ -51,27 +53,31 @@ template<
 struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor>
 {
 
+typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+  
 typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 static void run(Index rows, Index cols, Index depth,
   const LhsScalar* _lhs, Index lhsStride,
   const RhsScalar* _rhs, Index rhsStride,
-  ResScalar* res, Index resStride,
+  ResScalar* _res, Index resStride,
   ResScalar alpha,
   level3_blocking<LhsScalar,RhsScalar>& blocking,
   GemmParallelInfo<Index>* info = 0)
 {
-  const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
-  const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
-
-  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+  typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
+  typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
+  typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+  LhsMapper lhs(_lhs,lhsStride);
+  RhsMapper rhs(_rhs,rhsStride);
+  ResMapper res(_res, resStride);
 
   Index kc = blocking.kc();                   // cache block size along the K direction
   Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
-  //Index nc = blocking.nc(); // cache block size along the N direction
+  Index nc = (std::min)(cols,blocking.nc());  // cache block size along the N direction
 
-  gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
-  gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
-  gebp_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
+  gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+  gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
+  gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
 
 #ifdef EIGEN_HAS_OPENMP
   if(info)
@@ -80,68 +86,71 @@ static void run(Index rows, Index cols, Index depth,
     Index tid = omp_get_thread_num();
     Index threads = omp_get_num_threads();
     
-    std::size_t sizeA = kc*mc;
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
-    ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, 0);
-    ei_declare_aligned_stack_constructed_variable(RhsScalar, w, sizeW, 0);
+    LhsScalar* blockA = blocking.blockA();
+    eigen_internal_assert(blockA!=0);
     
-    RhsScalar* blockB = blocking.blockB();
-    eigen_internal_assert(blockB!=0);
-
+    std::size_t sizeB = kc*nc;
+    ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, 0);
+      
     // For each horizontal panel of the rhs, and corresponding vertical panel of the lhs...
     for(Index k=0; k<depth; k+=kc)
     {
       const Index actual_kc = (std::min)(k+kc,depth)-k; // => rows of B', and cols of the A'
 
       // In order to reduce the chance that a thread has to wait for the other,
-      // let's start by packing A'.
-      pack_lhs(blockA, &lhs(0,k), lhsStride, actual_kc, mc);
+      // let's start by packing B'.
+      pack_rhs(blockB, rhs.getSubMapper(k,0), actual_kc, nc);
 
-      // Pack B_k to B' in a parallel fashion:
-      // each thread packs the sub block B_k,j to B'_j where j is the thread id.
+      // Pack A_k to A' in a parallel fashion:
+      // each thread packs the sub block A_k,i to A'_i where i is the thread id.
 
-      // However, before copying to B'_j, we have to make sure that no other thread is still using it,
+      // However, before copying to A'_i, we have to make sure that no other thread is still using it,
       // i.e., we test that info[tid].users equals 0.
       // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
       while(info[tid].users!=0) {}
       info[tid].users += threads;
 
-      pack_rhs(blockB+info[tid].rhs_start*actual_kc, &rhs(k,info[tid].rhs_start), rhsStride, actual_kc, info[tid].rhs_length);
+      pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length);
 
-      // Notify the other threads that the part B'_j is ready to go.
+      // Notify the other threads that the part A'_i is ready to go.
       info[tid].sync = k;
-
-      // Computes C_i += A' * B' per B'_j
+      
+      // Computes C_i += A' * B' per A'_i
       for(Index shift=0; shift<threads; ++shift)
       {
-        Index j = (tid+shift)%threads;
+        Index i = (tid+shift)%threads;
 
-        // At this point we have to make sure that B'_j has been updated by the thread j,
+        // At this point we have to make sure that A'_i has been updated by the thread i,
         // we use testAndSetOrdered to mimic a volatile access.
         // However, no need to wait for the B' part which has been updated by the current thread!
-        if(shift>0)
-          while(info[j].sync!=k) {}
+        if (shift>0) {
+          while(info[i].sync!=k) {
+          }
+        }
 
-        gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*actual_kc, mc, actual_kc, info[j].rhs_length, alpha, -1,-1,0,0, w);
+        gebp(res.getSubMapper(info[i].lhs_start, 0), blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha);
       }
 
-      // Then keep going as usual with the remaining A'
-      for(Index i=mc; i<rows; i+=mc)
+      // Then keep going as usual with the remaining B'
+      for(Index j=nc; j<cols; j+=nc)
       {
-        const Index actual_mc = (std::min)(i+mc,rows)-i;
+        const Index actual_nc = (std::min)(j+nc,cols)-j;
 
-        // pack A_i,k to A'
-        pack_lhs(blockA, &lhs(i,k), lhsStride, actual_kc, actual_mc);
+        // pack B_k,j to B'
+        pack_rhs(blockB, rhs.getSubMapper(k,j), actual_kc, actual_nc);
 
-        // C_i += A' * B'
-        gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1,-1,0,0, w);
+        // C_j += A' * B'
+        gebp(res.getSubMapper(0, j), blockA, blockB, rows, actual_kc, actual_nc, alpha);
       }
 
-      // Release all the sub blocks B'_j of B' for the current thread,
+      // Release all the sub blocks A'_i of A' for the current thread,
       // i.e., we simply decrement the number of users by 1
-      for(Index j=0; j<threads; ++j)
+      #pragma omp critical
+      {
+      for(Index i=0; i<threads; ++i)
         #pragma omp atomic
-        --(info[j].users);
+        info[i].users -= 1;
+      }
     }
   }
   else
@@ -151,38 +160,42 @@ static void run(Index rows, Index cols, Index depth,
 
     // this is the sequential version!
     std::size_t sizeA = kc*mc;
-    std::size_t sizeB = kc*cols;
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
+    std::size_t sizeB = kc*nc;
 
     ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
     ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
-    ei_declare_aligned_stack_constructed_variable(RhsScalar, blockW, sizeW, blocking.blockW());
+    
+    const bool pack_rhs_once = mc!=rows && kc==depth && nc==cols;
 
     // For each horizontal panel of the rhs, and corresponding panel of the lhs...
-    // (==GEMM_VAR1)
-    for(Index k2=0; k2<depth; k2+=kc)
+    for(Index i2=0; i2<rows; i2+=mc)
     {
-      const Index actual_kc = (std::min)(k2+kc,depth)-k2;
+      const Index actual_mc = (std::min)(i2+mc,rows)-i2;
 
-      // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
-      // => Pack rhs's panel into a sequential chunk of memory (L2 caching)
-      // Note that this panel will be read as many times as the number of blocks in the lhs's
-      // vertical panel which is, in practice, a very low number.
-      pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols);
-
-      // For each mc x kc block of the lhs's vertical panel...
-      // (==GEPP_VAR1)
-      for(Index i2=0; i2<rows; i2+=mc)
+      for(Index k2=0; k2<depth; k2+=kc)
       {
-        const Index actual_mc = (std::min)(i2+mc,rows)-i2;
-
-        // We pack the lhs's block into a sequential chunk of memory (L1 caching)
-        // Note that this block will be read a very high number of times, which is equal to the number of
-        // micro vertical panel of the large rhs's panel (e.g., cols/4 times).
-        pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);
-
-        // Everything is packed, we can now call the block * panel kernel:
-        gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW);
+        const Index actual_kc = (std::min)(k2+kc,depth)-k2;
+        
+        // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
+        // => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
+        // Note that this panel will be read as many times as the number of blocks in the rhs's
+        // horizontal panel which is, in practice, a very low number.
+        pack_lhs(blockA, lhs.getSubMapper(i2,k2), actual_kc, actual_mc);
+        
+        // For each kc x nc block of the rhs's horizontal panel...
+        for(Index j2=0; j2<cols; j2+=nc)
+        {
+          const Index actual_nc = (std::min)(j2+nc,cols)-j2;
+          
+          // We pack the rhs's block into a sequential chunk of memory (L2 caching)
+          // Note that this block will be read a very high number of times, which is equal to the number of
+          // micro horizontal panels of the large rhs's panel (e.g., rows/12 times).
+          if((!pack_rhs_once) || i2==0)
+            pack_rhs(blockB, rhs.getSubMapper(k2,j2), actual_kc, actual_nc);
+          
+          // Everything is packed, we can now call the panel * block kernel:
+          gebp(res.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, alpha);
+        }
       }
     }
   }
@@ -191,26 +204,21 @@ static void run(Index rows, Index cols, Index depth,
 };
 
 /*********************************************************************************
-*  Specialization of GeneralProduct<> for "large" GEMM, i.e.,
+*  Specialization of generic_product_impl for "large" GEMM, i.e.,
 *  implementation of the high level wrapper to general_matrix_matrix_product
 **********************************************************************************/
 
-template<typename Lhs, typename Rhs>
-struct traits<GeneralProduct<Lhs,Rhs,GemmProduct> >
- : traits<ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs> >
-{};
-
 template<typename Scalar, typename Index, typename Gemm, typename Lhs, typename Rhs, typename Dest, typename BlockingType>
 struct gemm_functor
 {
-  gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha,
-                  BlockingType& blocking)
+  gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha, BlockingType& blocking)
     : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking)
   {}
 
-  void initParallelSession() const
+  void initParallelSession(Index num_threads) const
   {
-    m_blocking.allocateB();
+    m_blocking.initParallel(m_lhs.rows(), m_rhs.cols(), m_lhs.cols(), num_threads);
+    m_blocking.allocateA();
   }
 
   void operator() (Index row, Index rows, Index col=0, Index cols=-1, GemmParallelInfo<Index>* info=0) const
@@ -219,11 +227,13 @@ struct gemm_functor
       cols = m_rhs.cols();
 
     Gemm::run(rows, cols, m_lhs.cols(),
-              /*(const Scalar*)*/&m_lhs.coeffRef(row,0), m_lhs.outerStride(),
-              /*(const Scalar*)*/&m_rhs.coeffRef(0,col), m_rhs.outerStride(),
+              &m_lhs.coeffRef(row,0), m_lhs.outerStride(),
+              &m_rhs.coeffRef(0,col), m_rhs.outerStride(),
               (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(),
               m_actualAlpha, m_blocking, info);
   }
+  
+  typedef typename Gemm::Traits Traits;
 
   protected:
     const Lhs& m_lhs;
@@ -245,29 +255,27 @@ class level3_blocking
   protected:
     LhsScalar* m_blockA;
     RhsScalar* m_blockB;
-    RhsScalar* m_blockW;
 
-    DenseIndex m_mc;
-    DenseIndex m_nc;
-    DenseIndex m_kc;
+    Index m_mc;
+    Index m_nc;
+    Index m_kc;
 
   public:
 
     level3_blocking()
-      : m_blockA(0), m_blockB(0), m_blockW(0), m_mc(0), m_nc(0), m_kc(0)
+      : m_blockA(0), m_blockB(0), m_mc(0), m_nc(0), m_kc(0)
     {}
 
-    inline DenseIndex mc() const { return m_mc; }
-    inline DenseIndex nc() const { return m_nc; }
-    inline DenseIndex kc() const { return m_kc; }
+    inline Index mc() const { return m_mc; }
+    inline Index nc() const { return m_nc; }
+    inline Index kc() const { return m_kc; }
 
     inline LhsScalar* blockA() { return m_blockA; }
     inline RhsScalar* blockB() { return m_blockB; }
-    inline RhsScalar* blockW() { return m_blockW; }
 };
 
 template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
-class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, true>
+class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, true /* == FiniteAtCompileTime */>
   : public level3_blocking<
       typename conditional<StorageOrder==RowMajor,_RhsScalar,_LhsScalar>::type,
       typename conditional<StorageOrder==RowMajor,_LhsScalar,_RhsScalar>::type>
@@ -282,29 +290,38 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
     typedef gebp_traits<LhsScalar,RhsScalar> Traits;
     enum {
       SizeA = ActualRows * MaxDepth,
-      SizeB = ActualCols * MaxDepth,
-      SizeW = MaxDepth * Traits::WorkSpaceFactor
+      SizeB = ActualCols * MaxDepth
     };
 
-    EIGEN_ALIGN16 LhsScalar m_staticA[SizeA];
-    EIGEN_ALIGN16 RhsScalar m_staticB[SizeB];
-    EIGEN_ALIGN16 RhsScalar m_staticW[SizeW];
+#if EIGEN_MAX_STATIC_ALIGN_BYTES >= EIGEN_DEFAULT_ALIGN_BYTES
+    EIGEN_ALIGN_MAX LhsScalar m_staticA[SizeA];
+    EIGEN_ALIGN_MAX RhsScalar m_staticB[SizeB];
+#else
+    EIGEN_ALIGN_MAX char m_staticA[SizeA * sizeof(LhsScalar) + EIGEN_DEFAULT_ALIGN_BYTES-1];
+    EIGEN_ALIGN_MAX char m_staticB[SizeB * sizeof(RhsScalar) + EIGEN_DEFAULT_ALIGN_BYTES-1];
+#endif
 
   public:
 
-    gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/)
+    gemm_blocking_space(Index /*rows*/, Index /*cols*/, Index /*depth*/, Index /*num_threads*/, bool /*full_rows = false*/)
     {
       this->m_mc = ActualRows;
       this->m_nc = ActualCols;
       this->m_kc = MaxDepth;
+#if EIGEN_MAX_STATIC_ALIGN_BYTES >= EIGEN_DEFAULT_ALIGN_BYTES
       this->m_blockA = m_staticA;
       this->m_blockB = m_staticB;
-      this->m_blockW = m_staticW;
+#else
+      this->m_blockA = reinterpret_cast<LhsScalar*>((std::size_t(m_staticA) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1));
+      this->m_blockB = reinterpret_cast<RhsScalar*>((std::size_t(m_staticB) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1));
+#endif
     }
+    
+    void initParallel(Index, Index, Index, Index)
+    {}
 
     inline void allocateA() {}
     inline void allocateB() {}
-    inline void allocateW() {}
     inline void allocateAll() {}
 };
 
@@ -321,22 +338,43 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
     typedef typename conditional<Transpose,_LhsScalar,_RhsScalar>::type RhsScalar;
     typedef gebp_traits<LhsScalar,RhsScalar> Traits;
 
-    DenseIndex m_sizeA;
-    DenseIndex m_sizeB;
-    DenseIndex m_sizeW;
+    Index m_sizeA;
+    Index m_sizeB;
 
   public:
 
-    gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth)
+    gemm_blocking_space(Index rows, Index cols, Index depth, Index num_threads, bool l3_blocking)
     {
       this->m_mc = Transpose ? cols : rows;
       this->m_nc = Transpose ? rows : cols;
       this->m_kc = depth;
 
-      computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc);
+      if(l3_blocking)
+      {
+        computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc, num_threads);
+      }
+      else  // no l3 blocking
+      {
+        Index m = this->m_mc;
+        Index n = this->m_nc;
+        computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, n, num_threads);
+      }
+
+      m_sizeA = this->m_mc * this->m_kc;
+      m_sizeB = this->m_kc * this->m_nc;
+    }
+    
+    void initParallel(Index rows, Index cols, Index depth, Index num_threads)
+    {
+      this->m_mc = Transpose ? cols : rows;
+      this->m_nc = Transpose ? rows : cols;
+      this->m_kc = depth;
+      
+      eigen_internal_assert(this->m_blockA==0 && this->m_blockB==0);      
+      Index m = this->m_mc;
+      computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc, num_threads);
       m_sizeA = this->m_mc * this->m_kc;
       m_sizeB = this->m_kc * this->m_nc;
-      m_sizeW = this->m_kc*Traits::WorkSpaceFactor;
     }
 
     void allocateA()
@@ -351,77 +389,108 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
         this->m_blockB = aligned_new<RhsScalar>(m_sizeB);
     }
 
-    void allocateW()
-    {
-      if(this->m_blockW==0)
-        this->m_blockW = aligned_new<RhsScalar>(m_sizeW);
-    }
-
     void allocateAll()
     {
       allocateA();
       allocateB();
-      allocateW();
     }
 
     ~gemm_blocking_space()
     {
       aligned_delete(this->m_blockA, m_sizeA);
       aligned_delete(this->m_blockB, m_sizeB);
-      aligned_delete(this->m_blockW, m_sizeW);
     }
 };
 
 } // end namespace internal
 
+namespace internal {
+  
 template<typename Lhs, typename Rhs>
-class GeneralProduct<Lhs, Rhs, GemmProduct>
-  : public ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs>
+struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
+  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct> >
 {
-    enum {
-      MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime)
-    };
-  public:
-    EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)
-    
-    typedef typename  Lhs::Scalar LhsScalar;
-    typedef typename  Rhs::Scalar RhsScalar;
-    typedef           Scalar      ResScalar;
-
-    GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  typedef typename Lhs::Scalar LhsScalar;
+  typedef typename Rhs::Scalar RhsScalar;
+  
+  typedef internal::blas_traits<Lhs> LhsBlasTraits;
+  typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+  typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned;
+  
+  typedef internal::blas_traits<Rhs> RhsBlasTraits;
+  typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+  typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
+  
+  enum {
+    MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime)
+  };
+  
+  typedef generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode> lazyproduct;
+  
+  template<typename Dst>
+  static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
+      lazyproduct::evalTo(dst, lhs, rhs);
+    else
     {
-      typedef internal::scalar_product_op<LhsScalar,RhsScalar> BinOp;
-      EIGEN_CHECK_BINARY_COMPATIBILIY(BinOp,LhsScalar,RhsScalar);
+      dst.setZero();
+      scaleAndAddTo(dst, lhs, rhs, Scalar(1));
     }
+  }
 
-    template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
-    {
-      eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
-
-      typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
-      typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
-
-      Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
-                                 * RhsBlasTraits::extractScalarFactor(m_rhs);
-
-      typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar,
-              Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType;
-
-      typedef internal::gemm_functor<
-        Scalar, Index,
-        internal::general_matrix_matrix_product<
-          Index,
-          LhsScalar, (_ActualLhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate),
-          RhsScalar, (_ActualRhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate),
-          (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
-        _ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor;
-
-      BlockingType blocking(dst.rows(), dst.cols(), lhs.cols());
+  template<typename Dst>
+  static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
+      lazyproduct::addTo(dst, lhs, rhs);
+    else
+      scaleAndAddTo(dst,lhs, rhs, Scalar(1));
+  }
 
-      internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit);
-    }
+  template<typename Dst>
+  static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  {
+    if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
+      lazyproduct::subTo(dst, lhs, rhs);
+    else
+      scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
+  }
+  
+  template<typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha)
+  {
+    eigen_assert(dst.rows()==a_lhs.rows() && dst.cols()==a_rhs.cols());
+    if(a_lhs.cols()==0 || a_lhs.rows()==0 || a_rhs.cols()==0)
+      return;
+
+    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
+    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
+
+    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
+                               * RhsBlasTraits::extractScalarFactor(a_rhs);
+
+    typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar,
+            Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType;
+
+    typedef internal::gemm_functor<
+      Scalar, Index,
+      internal::general_matrix_matrix_product<
+        Index,
+        LhsScalar, (ActualLhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate),
+        RhsScalar, (ActualRhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate),
+        (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
+      ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> GemmFunctor;
+
+    BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
+    internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>
+                              (GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), Dest::Flags&RowMajorBit);
+  }
 };
 
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_GENERAL_MATRIX_MATRIX_H
diff --git a/nuparu/include/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/nuparu/include/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
index 5c376390..a36eb2fe 100644
--- a/nuparu/include/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
+++ b/nuparu/include/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
@@ -20,7 +20,7 @@ namespace internal {
 /**********************************************************************
 * This file implements a general A * B product while
 * evaluating only one triangular part of the product.
-* This is more general version of self adjoint product (C += A A^T)
+* This is a more general version of self adjoint product (C += A A^T)
 * as the level 3 SYRK Blas routine.
 **********************************************************************/
 
@@ -58,30 +58,31 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
 {
   typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
   static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride,
-                                      const RhsScalar* _rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha)
+                                      const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride, const ResScalar& alpha)
   {
-    const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
-    const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
-
     typedef gebp_traits<LhsScalar,RhsScalar> Traits;
 
+    typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
+    typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
+    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+    LhsMapper lhs(_lhs,lhsStride);
+    RhsMapper rhs(_rhs,rhsStride);
+    ResMapper res(_res, resStride);
+
     Index kc = depth; // cache block size along the K direction
     Index mc = size;  // cache block size along the M direction
     Index nc = size;  // cache block size along the N direction
-    computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc);
+    computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc, 1);
     // !!! mc must be a multiple of nr:
     if(mc > Traits::nr)
       mc = (mc/Traits::nr)*Traits::nr;
 
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
-    std::size_t sizeB = sizeW + kc*size;
     ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0);
-    ei_declare_aligned_stack_constructed_variable(RhsScalar, allocatedBlockB, sizeB, 0);
-    RhsScalar* blockB = allocatedBlockB + sizeW;
-    
-    gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
-    gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
-    gebp_kernel <LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
+    ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, kc*size, 0);
+
+    gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+    gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
+    gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
     tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, UpLo> sybb;
 
     for(Index k2=0; k2<depth; k2+=kc)
@@ -89,29 +90,30 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
       const Index actual_kc = (std::min)(k2+kc,depth)-k2;
 
       // note that the actual rhs is the transpose/adjoint of mat
-      pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, size);
+      pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, size);
 
       for(Index i2=0; i2<size; i2+=mc)
       {
         const Index actual_mc = (std::min)(i2+mc,size)-i2;
 
-        pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
+        pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
 
         // the selected actual_mc * size panel of res is split into three different part:
         //  1 - before the diagonal => processed with gebp or skipped
         //  2 - the actual_mc x actual_mc symmetric block => processed with a special kernel
         //  3 - after the diagonal => processed with gebp or skipped
         if (UpLo==Lower)
-          gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha,
-               -1, -1, 0, 0, allocatedBlockB);
+          gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc,
+               (std::min)(size,i2), alpha, -1, -1, 0, 0);
+
 
-        sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha, allocatedBlockB);
+        sybb(_res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
 
         if (UpLo==Upper)
         {
           Index j2 = i2+actual_mc;
-          gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha,
-               -1, -1, 0, 0, allocatedBlockB);
+          gebp(res.getSubMapper(i2, j2), blockA, blockB+actual_kc*j2, actual_mc,
+               actual_kc, (std::max)(Index(0), size-j2), alpha, -1, -1, 0, 0);
         }
       }
     }
@@ -132,13 +134,16 @@ struct tribb_kernel
 {
   typedef gebp_traits<LhsScalar,RhsScalar,ConjLhs,ConjRhs> Traits;
   typedef typename Traits::ResScalar ResScalar;
-  
+
   enum {
     BlockSize  = EIGEN_PLAIN_ENUM_MAX(mr,nr)
   };
-  void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha, RhsScalar* workspace)
+  void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
   {
-    gebp_kernel<LhsScalar, RhsScalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
+    typedef blas_data_mapper<ResScalar, Index, ColMajor> ResMapper;
+    ResMapper res(_res, resStride);
+    gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
+
     Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer;
 
     // let's process the block per panel of actual_mc x BlockSize,
@@ -149,20 +154,20 @@ struct tribb_kernel
       const RhsScalar* actual_b = blockB+j*depth;
 
       if(UpLo==Upper)
-        gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha,
-                    -1, -1, 0, 0, workspace);
+        gebp_kernel(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha,
+                    -1, -1, 0, 0);
 
       // selfadjoint micro block
       {
         Index i = j;
         buffer.setZero();
         // 1 - apply the kernel on the temporary buffer
-        gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
-                    -1, -1, 0, 0, workspace);
+        gebp_kernel(ResMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
+                    -1, -1, 0, 0);
         // 2 - triangular accumulation
         for(Index j1=0; j1<actualBlockSize; ++j1)
         {
-          ResScalar* r = res + (j+j1)*resStride + i;
+          ResScalar* r = &res(i, j + j1);
           for(Index i1=UpLo==Lower ? j1 : 0;
               UpLo==Lower ? i1<actualBlockSize : i1<=j1; ++i1)
             r[i1] += buffer(i1,j1);
@@ -172,8 +177,8 @@ struct tribb_kernel
       if(UpLo==Lower)
       {
         Index i = j+actualBlockSize;
-        gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha,
-                    -1, -1, 0, 0, workspace);
+        gebp_kernel(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i, 
+                    depth, actualBlockSize, alpha, -1, -1, 0, 0);
       }
     }
   }
@@ -193,7 +198,6 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,true>
   static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha)
   {
     typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
     
     typedef typename internal::remove_all<typename ProductType::LhsNested>::type Lhs;
     typedef internal::blas_traits<Lhs> LhsBlasTraits;
@@ -238,8 +242,6 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
 {
   static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha)
   {
-    typedef typename MatrixType::Index Index;
-    
     typedef typename internal::remove_all<typename ProductType::LhsNested>::type Lhs;
     typedef internal::blas_traits<Lhs> LhsBlasTraits;
     typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhs;
@@ -265,12 +267,14 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
 };
 
 template<typename MatrixType, unsigned int UpLo>
-template<typename ProductDerived, typename _Lhs, typename _Rhs>
-TriangularView<MatrixType,UpLo>& TriangularView<MatrixType,UpLo>::assignProduct(const ProductBase<ProductDerived, _Lhs,_Rhs>& prod, const Scalar& alpha)
+template<typename ProductType>
+TriangularView<MatrixType,UpLo>& TriangularViewImpl<MatrixType,UpLo,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha)
 {
-  general_product_to_triangular_selector<MatrixType, ProductDerived, UpLo, (_Lhs::ColsAtCompileTime==1) || (_Rhs::RowsAtCompileTime==1)>::run(m_matrix.const_cast_derived(), prod.derived(), alpha);
+  eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols());
+  
+  general_product_to_triangular_selector<MatrixType, ProductType, UpLo, internal::traits<ProductType>::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha);
   
-  return *this;
+  return derived();
 }
 
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h b/nuparu/include/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h
index 060af328..b6ae729b 100644
--- a/nuparu/include/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h
+++ b/nuparu/include/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h
@@ -53,6 +53,8 @@ template< \
   int RhsStorageOrder, bool ConjugateRhs> \
 struct general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor> \
 { \
+typedef gebp_traits<EIGTYPE,EIGTYPE> Traits; \
+\
 static void run(Index rows, Index cols, Index depth, \
   const EIGTYPE* _lhs, Index lhsStride, \
   const EIGTYPE* _rhs, Index rhsStride, \
diff --git a/nuparu/include/Eigen/src/Core/products/GeneralMatrixVector.h b/nuparu/include/Eigen/src/Core/products/GeneralMatrixVector.h
index c1cb7849..8b7dca45 100644
--- a/nuparu/include/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/nuparu/include/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
 #define EIGEN_GENERAL_MATRIX_VECTOR_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
@@ -26,11 +26,39 @@ namespace internal {
  *  |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
  *  |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp
  *  |cplx |real |real | optimal case, vectorization possible via real-cplx mul
+ *
+ * Accesses to the matrix coefficients follow the following logic:
+ *
+ * - if all columns have the same alignment then
+ *   - if the columns have the same alignment as the result vector, then easy! (-> AllAligned case)
+ *   - otherwise perform unaligned loads only (-> NoneAligned case)
+ * - otherwise
+ *   - if even columns have the same alignment then
+ *     // odd columns are guaranteed to have the same alignment too
+ *     - if even or odd columns have the same alignment as the result, then
+ *       // for a register size of 2 scalars, this is guaranteed to be the case (e.g., SSE with double)
+ *       - perform half aligned and half unaligned loads (-> EvenAligned case)
+ *     - otherwise perform unaligned loads only (-> NoneAligned case)
+ *   - otherwise, if the register size is 4 scalars (e.g., SSE with float) then
+ *     - one out of every 4 consecutive columns is guaranteed to be aligned with the result vector,
+ *       perform simple aligned loads for this column and aligned loads plus re-alignment for the other. (-> FirstAligned case)
+ *       // this re-alignment is done by the palign function implemented for SSE in Eigen/src/Core/arch/SSE/PacketMath.h
+ *   - otherwise,
+ *     // if we get here, this means the register size is greater than 4 (e.g., AVX with floats),
+ *     // we currently fall back to the NoneAligned case
+ *
+ * The same reasoning applies to the transposed case.
+ *
+ * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet...
+ * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment
+ * strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on an 8 byte boundary are not too slow
+ * compared to unaligned loads on a 4 byte boundary.
+ *
  */
-template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
-struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
+template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
+struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
 {
-typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 
 enum {
   Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
@@ -50,38 +78,35 @@ typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
 
 EIGEN_DONT_INLINE static void run(
   Index rows, Index cols,
-  const LhsScalar* lhs, Index lhsStride,
-  const RhsScalar* rhs, Index rhsIncr,
-  ResScalar* res, Index
-  #ifdef EIGEN_INTERNAL_DEBUGGING
-    resIncr
-  #endif
-  , RhsScalar alpha);
+  const LhsMapper& lhs,
+  const RhsMapper& rhs,
+        ResScalar* res, Index resIncr,
+  RhsScalar alpha);
 };
 
-template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
+template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
+EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
   Index rows, Index cols,
-  const LhsScalar* lhs, Index lhsStride,
-  const RhsScalar* rhs, Index rhsIncr,
-  ResScalar* res, Index
-  #ifdef EIGEN_INTERNAL_DEBUGGING
-    resIncr
-  #endif
-  , RhsScalar alpha)
+  const LhsMapper& lhs,
+  const RhsMapper& rhs,
+        ResScalar* res, Index resIncr,
+  RhsScalar alpha)
 {
+  EIGEN_UNUSED_VARIABLE(resIncr);
   eigen_internal_assert(resIncr==1);
   #ifdef _EIGEN_ACCUMULATE_PACKETS
   #error _EIGEN_ACCUMULATE_PACKETS has already been defined
   #endif
-  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
+  #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \
     pstore(&res[j], \
       padd(pload<ResPacket>(&res[j]), \
         padd( \
-          padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]),    ptmp0), \
-                  pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]),   ptmp1)), \
-          padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]),    ptmp2), \
-                  pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]),   ptmp3)) )))
+      padd(pcj.pmul(lhs0.template load<LhsPacket, Alignment0>(j),    ptmp0), \
+      pcj.pmul(lhs1.template load<LhsPacket, Alignment13>(j),   ptmp1)),   \
+      padd(pcj.pmul(lhs2.template load<LhsPacket, Alignment2>(j),    ptmp2), \
+      pcj.pmul(lhs3.template load<LhsPacket, Alignment13>(j),   ptmp3)) )))
+
+  typedef typename LhsMapper::VectorMapper LhsScalars;
 
   conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
   conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
@@ -95,10 +120,12 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
   const Index ResPacketAlignedMask = ResPacketSize-1;
 //  const Index PeelAlignedMask = ResPacketSize*peels-1;
   const Index size = rows;
-  
+
+  const Index lhsStride = lhs.stride();
+
   // How many coeffs of the result do we have to skip to be aligned.
   // Here we assume data are at least aligned on the base scalar type.
-  Index alignedStart = internal::first_aligned(res,size);
+  Index alignedStart = internal::first_default_aligned(res,size);
   Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
   const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
 
@@ -108,19 +135,26 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
                        : FirstAligned;
 
   // we cannot assume the first element is aligned because of sub-matrices
-  const Index lhsAlignmentOffset = internal::first_aligned(lhs,size);
+  const Index lhsAlignmentOffset = lhs.firstAligned(size);
 
   // find how many columns do we have to skip to be aligned with the result (if possible)
   Index skipColumns = 0;
   // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
-  if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) )
+  if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (size_t(res)%sizeof(ResScalar)) )
   {
     alignedSize = 0;
     alignedStart = 0;
+    alignmentPattern = NoneAligned;
+  }
+  else if(LhsPacketSize > 4)
+  {
+    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
+    // Currently, it seems to be better to perform unaligned loads anyway
+    alignmentPattern = NoneAligned;
   }
   else if (LhsPacketSize>1)
   {
-    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
+  //    eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
 
     while (skipColumns<LhsPacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
@@ -137,10 +171,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
       // note that the skiped columns are processed later.
     }
 
-    eigen_internal_assert(  (alignmentPattern==NoneAligned)
+    /*    eigen_internal_assert(  (alignmentPattern==NoneAligned)
                       || (skipColumns + columnsAtOnce >= cols)
                       || LhsPacketSize > size
-                      || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);
+                      || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/
   }
   else if(Vectorizable)
   {
@@ -149,20 +183,20 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
     alignmentPattern = AllAligned;
   }
 
-  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
-  Index offset3 = (FirstAligned && alignmentStep==1?1:3);
+  const Index offset1 = (FirstAligned && alignmentStep==1?3:1);
+  const Index offset3 = (FirstAligned && alignmentStep==1?1:3);
 
   Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
   for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
   {
-    RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
-              ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
-              ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
-              ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);
+    RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(i, 0)),
+              ptmp1 = pset1<RhsPacket>(alpha*rhs(i+offset1, 0)),
+              ptmp2 = pset1<RhsPacket>(alpha*rhs(i+2, 0)),
+              ptmp3 = pset1<RhsPacket>(alpha*rhs(i+offset3, 0));
 
     // this helps a lot generating better binary code
-    const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
-                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
+    const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0),   lhs1 = lhs.getVectorMapper(0, i+offset1),
+                     lhs2 = lhs.getVectorMapper(0, i+2),   lhs3 = lhs.getVectorMapper(0, i+offset3);
 
     if (Vectorizable)
     {
@@ -170,10 +204,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
       // process initial unaligned coeffs
       for (Index j=0; j<alignedStart; ++j)
       {
-        res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
-        res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
-        res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
-        res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
+        res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
+        res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
+        res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
+        res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
       }
 
       if (alignedSize>alignedStart)
@@ -182,11 +216,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
         {
           case AllAligned:
             for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
+              _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
             break;
           case EvenAligned:
             for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
+              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
             break;
           case FirstAligned:
           {
@@ -196,28 +230,28 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
               LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
               ResPacket T0, T1;
 
-              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
-              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
-              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
+              A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
+              A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
+              A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
 
               for (; j<peeledSize; j+=peels*ResPacketSize)
               {
-                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
-                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
-                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);
+                A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize);  palign<1>(A01,A11);
+                A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize);  palign<2>(A02,A12);
+                A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize);  palign<3>(A03,A13);
 
-                A00 = pload<LhsPacket>(&lhs0[j]);
-                A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
+                A00 = lhs0.template load<LhsPacket, Aligned>(j);
+                A10 = lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize);
                 T0  = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
                 T1  = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
 
                 T0  = pcj.pmadd(A01, ptmp1, T0);
-                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
+                A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize);  palign<1>(A11,A01);
                 T0  = pcj.pmadd(A02, ptmp2, T0);
-                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
+                A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize);  palign<2>(A12,A02);
                 T0  = pcj.pmadd(A03, ptmp3, T0);
                 pstore(&res[j],T0);
-                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);
+                A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize);  palign<3>(A13,A03);
                 T1  = pcj.pmadd(A11, ptmp1, T1);
                 T1  = pcj.pmadd(A12, ptmp2, T1);
                 T1  = pcj.pmadd(A13, ptmp3, T1);
@@ -225,12 +259,12 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
               }
             }
             for (; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
+              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
             break;
           }
           default:
             for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
+              _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
             break;
         }
       }
@@ -239,10 +273,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
     /* process remaining coeffs (or all if there is no explicit vectorization) */
     for (Index j=alignedSize; j<size; ++j)
     {
-      res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
-      res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
-      res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
-      res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
+      res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
+      res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
+      res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
+      res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
     }
   }
 
@@ -253,27 +287,27 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
   {
     for (Index k=start; k<end; ++k)
     {
-      RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
-      const LhsScalar* lhs0 = lhs + k*lhsStride;
+      RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(k, 0));
+      const LhsScalars lhs0 = lhs.getVectorMapper(0, k);
 
       if (Vectorizable)
       {
         /* explicit vectorization */
         // process first unaligned result's coeffs
         for (Index j=0; j<alignedStart; ++j)
-          res[j] += cj.pmul(lhs0[j], pfirst(ptmp0));
+          res[j] += cj.pmul(lhs0(j), pfirst(ptmp0));
         // process aligned result's coeffs
-        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
+        if (lhs0.template aligned<LhsPacket>(alignedStart))
           for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
-            pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
+            pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(i), ptmp0, pload<ResPacket>(&res[i])));
         else
           for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
-            pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
+            pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(i), ptmp0, pload<ResPacket>(&res[i])));
       }
 
       // process remaining scalars (or all if no explicit vectorization)
       for (Index i=alignedSize; i<size; ++i)
-        res[i] += cj.pmul(lhs0[i], pfirst(ptmp0));
+        res[i] += cj.pmul(lhs0(i), pfirst(ptmp0));
     }
     if (skipColumns)
     {
@@ -297,8 +331,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
  *  - alpha is always a complex (or converted to a complex)
  *  - no vectorization
  */
-template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
-struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
+template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
+struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
 {
 typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 
@@ -317,73 +351,84 @@ typedef typename packet_traits<ResScalar>::type  _ResPacket;
 typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
 typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
 typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
-  
+
 EIGEN_DONT_INLINE static void run(
   Index rows, Index cols,
-  const LhsScalar* lhs, Index lhsStride,
-  const RhsScalar* rhs, Index rhsIncr,
-  ResScalar* res, Index resIncr,
+  const LhsMapper& lhs,
+  const RhsMapper& rhs,
+        ResScalar* res, Index resIncr,
   ResScalar alpha);
 };
 
-template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
+template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
+EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
   Index rows, Index cols,
-  const LhsScalar* lhs, Index lhsStride,
-  const RhsScalar* rhs, Index rhsIncr,
+  const LhsMapper& lhs,
+  const RhsMapper& rhs,
   ResScalar* res, Index resIncr,
   ResScalar alpha)
 {
-  EIGEN_UNUSED_VARIABLE(rhsIncr);
-  eigen_internal_assert(rhsIncr==1);
+  eigen_internal_assert(rhs.stride()==1);
+
   #ifdef _EIGEN_ACCUMULATE_PACKETS
   #error _EIGEN_ACCUMULATE_PACKETS has already been defined
   #endif
 
-  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
-    RhsPacket b = pload<RhsPacket>(&rhs[j]); \
-    ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \
-    ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \
-    ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \
-    ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }
+  #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\
+    RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0);  \
+    ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Alignment0>(j), b, ptmp0); \
+    ptmp1 = pcj.pmadd(lhs1.template load<LhsPacket, Alignment13>(j), b, ptmp1); \
+    ptmp2 = pcj.pmadd(lhs2.template load<LhsPacket, Alignment2>(j), b, ptmp2); \
+    ptmp3 = pcj.pmadd(lhs3.template load<LhsPacket, Alignment13>(j), b, ptmp3); }
 
   conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
   conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
 
+  typedef typename LhsMapper::VectorMapper LhsScalars;
+
   enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
   const Index rowsAtOnce = 4;
   const Index peels = 2;
   const Index RhsPacketAlignedMask = RhsPacketSize-1;
   const Index LhsPacketAlignedMask = LhsPacketSize-1;
-//   const Index PeelAlignedMask = RhsPacketSize*peels-1;
   const Index depth = cols;
+  const Index lhsStride = lhs.stride();
 
   // How many coeffs of the result do we have to skip to be aligned.
   // Here we assume data are at least aligned on the base scalar type
   // if that's not the case then vectorization is discarded, see below.
-  Index alignedStart = internal::first_aligned(rhs, depth);
+  Index alignedStart = rhs.firstAligned(depth);
   Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
   const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
 
   const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
   Index alignmentPattern = alignmentStep==0 ? AllAligned
-                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
-                         : FirstAligned;
+                           : alignmentStep==(LhsPacketSize/2) ? EvenAligned
+                           : FirstAligned;
 
   // we cannot assume the first element is aligned because of sub-matrices
-  const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth);
+  const Index lhsAlignmentOffset = lhs.firstAligned(depth);
+  const Index rhsAlignmentOffset = rhs.firstAligned(rows);
 
   // find how many rows do we have to skip to be aligned with rhs (if possible)
   Index skipRows = 0;
   // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
-  if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) )
+  if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) ||
+      (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth) ||
+      (rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows) )
   {
     alignedSize = 0;
     alignedStart = 0;
+    alignmentPattern = NoneAligned;
+  }
+  else if(LhsPacketSize > 4)
+  {
+    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
+    alignmentPattern = NoneAligned;
   }
   else if (LhsPacketSize>1)
   {
-    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0  || depth<LhsPacketSize);
+  //    eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0  || depth<LhsPacketSize);
 
     while (skipRows<LhsPacketSize &&
            alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
@@ -399,11 +444,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
       skipRows = (std::min)(skipRows,Index(rows));
       // note that the skiped columns are processed later.
     }
-    eigen_internal_assert(  alignmentPattern==NoneAligned
+    /*    eigen_internal_assert(  alignmentPattern==NoneAligned
                       || LhsPacketSize==1
                       || (skipRows + rowsAtOnce >= rows)
                       || LhsPacketSize > depth
-                      || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);
+                      || (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/
   }
   else if(Vectorizable)
   {
@@ -412,18 +457,19 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
     alignmentPattern = AllAligned;
   }
 
-  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
-  Index offset3 = (FirstAligned && alignmentStep==1?1:3);
+  const Index offset1 = (FirstAligned && alignmentStep==1?3:1);
+  const Index offset3 = (FirstAligned && alignmentStep==1?1:3);
 
   Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
   for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
   {
-    EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
+    // FIXME: what is the purpose of this EIGEN_ALIGN_MAX ??
+    EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0);
     ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
 
     // this helps the compiler generating good binary code
-    const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
-                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
+    const LhsScalars lhs0 = lhs.getVectorMapper(i+0, 0),    lhs1 = lhs.getVectorMapper(i+offset1, 0),
+                     lhs2 = lhs.getVectorMapper(i+2, 0),    lhs3 = lhs.getVectorMapper(i+offset3, 0);
 
     if (Vectorizable)
     {
@@ -435,9 +481,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
       // FIXME this loop get vectorized by the compiler !
       for (Index j=0; j<alignedStart; ++j)
       {
-        RhsScalar b = rhs[j];
-        tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
-        tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
+        RhsScalar b = rhs(j, 0);
+        tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
+        tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
       }
 
       if (alignedSize>alignedStart)
@@ -446,11 +492,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
         {
           case AllAligned:
             for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
+              _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
             break;
           case EvenAligned:
             for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
+              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
             break;
           case FirstAligned:
           {
@@ -464,39 +510,39 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
                * than basic unaligned loads.
                */
               LhsPacket A01, A02, A03, A11, A12, A13;
-              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
-              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
-              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
+              A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
+              A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
+              A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
 
               for (; j<peeledSize; j+=peels*RhsPacketSize)
               {
-                RhsPacket b = pload<RhsPacket>(&rhs[j]);
-                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
-                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
-                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);
+                RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0);
+                A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize);  palign<1>(A01,A11);
+                A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize);  palign<2>(A02,A12);
+                A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize);  palign<3>(A03,A13);
 
-                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
+                ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), b, ptmp0);
                 ptmp1 = pcj.pmadd(A01, b, ptmp1);
-                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
+                A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize);  palign<1>(A11,A01);
                 ptmp2 = pcj.pmadd(A02, b, ptmp2);
-                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
+                A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize);  palign<2>(A12,A02);
                 ptmp3 = pcj.pmadd(A03, b, ptmp3);
-                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);
+                A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize);  palign<3>(A13,A03);
 
-                b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
-                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
+                b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load<RhsPacket, Aligned>(0);
+                ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize), b, ptmp0);
                 ptmp1 = pcj.pmadd(A11, b, ptmp1);
                 ptmp2 = pcj.pmadd(A12, b, ptmp2);
                 ptmp3 = pcj.pmadd(A13, b, ptmp3);
               }
             }
             for (; j<alignedSize; j+=RhsPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
+              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
             break;
           }
           default:
             for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
+              _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
             break;
         }
         tmp0 += predux(ptmp0);
@@ -510,9 +556,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
     // FIXME this loop get vectorized by the compiler !
     for (Index j=alignedSize; j<depth; ++j)
     {
-      RhsScalar b = rhs[j];
-      tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
-      tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
+      RhsScalar b = rhs(j, 0);
+      tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
+      tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
     }
     res[i*resIncr]            += alpha*tmp0;
     res[(i+offset1)*resIncr]  += alpha*tmp1;
@@ -527,30 +573,30 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
   {
     for (Index i=start; i<end; ++i)
     {
-      EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
+      EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0);
       ResPacket ptmp0 = pset1<ResPacket>(tmp0);
-      const LhsScalar* lhs0 = lhs + i*lhsStride;
+      const LhsScalars lhs0 = lhs.getVectorMapper(i, 0);
       // process first unaligned result's coeffs
       // FIXME this loop get vectorized by the compiler !
       for (Index j=0; j<alignedStart; ++j)
-        tmp0 += cj.pmul(lhs0[j], rhs[j]);
+        tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
 
       if (alignedSize>alignedStart)
       {
         // process aligned rhs coeffs
-        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
+        if (lhs0.template aligned<LhsPacket>(alignedStart))
           for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
-            ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
+            ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
         else
           for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
-            ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
+            ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
         tmp0 += predux(ptmp0);
       }
 
       // process remaining scalars
       // FIXME this loop get vectorized by the compiler !
       for (Index j=alignedSize; j<depth; ++j)
-        tmp0 += cj.pmul(lhs0[j], rhs[j]);
+        tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
       res[i*resIncr] += alpha*tmp0;
     }
     if (skipRows)
diff --git a/nuparu/include/Eigen/src/Core/products/GeneralMatrixVector_MKL.h b/nuparu/include/Eigen/src/Core/products/GeneralMatrixVector_MKL.h
index 1cb9fe6b..12c3d13b 100644
--- a/nuparu/include/Eigen/src/Core/products/GeneralMatrixVector_MKL.h
+++ b/nuparu/include/Eigen/src/Core/products/GeneralMatrixVector_MKL.h
@@ -46,38 +46,37 @@ namespace internal {
 
 // gemv specialization
 
-template<typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs>
-struct general_matrix_vector_product_gemv :
-  general_matrix_vector_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,ConjugateRhs,BuiltIn> {};
+template<typename Index, typename LhsScalar, int StorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs>
+struct general_matrix_vector_product_gemv;
 
 #define EIGEN_MKL_GEMV_SPECIALIZE(Scalar) \
 template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \
-struct general_matrix_vector_product<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs,Specialized> { \
+struct general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ColMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,ConjugateRhs,Specialized> { \
 static void run( \
   Index rows, Index cols, \
-  const Scalar* lhs, Index lhsStride, \
-  const Scalar* rhs, Index rhsIncr, \
+  const const_blas_data_mapper<Scalar,Index,ColMajor> &lhs, \
+  const const_blas_data_mapper<Scalar,Index,RowMajor> &rhs, \
   Scalar* res, Index resIncr, Scalar alpha) \
 { \
   if (ConjugateLhs) { \
-    general_matrix_vector_product<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs,BuiltIn>::run( \
-      rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
+    general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ColMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,ConjugateRhs,BuiltIn>::run( \
+      rows, cols, lhs, rhs, res, resIncr, alpha); \
   } else { \
     general_matrix_vector_product_gemv<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \
-      rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
+      rows, cols, lhs.data(), lhs.stride(), rhs.data(), rhs.stride(), res, resIncr, alpha); \
   } \
 } \
 }; \
 template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \
-struct general_matrix_vector_product<Index,Scalar,RowMajor,ConjugateLhs,Scalar,ConjugateRhs,Specialized> { \
+struct general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,RowMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ConjugateRhs,Specialized> { \
 static void run( \
   Index rows, Index cols, \
-  const Scalar* lhs, Index lhsStride, \
-  const Scalar* rhs, Index rhsIncr, \
+  const const_blas_data_mapper<Scalar,Index,RowMajor> &lhs, \
+  const const_blas_data_mapper<Scalar,Index,ColMajor> &rhs, \
   Scalar* res, Index resIncr, Scalar alpha) \
 { \
     general_matrix_vector_product_gemv<Index,Scalar,RowMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \
-      rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
+      rows, cols, lhs.data(), lhs.stride(), rhs.data(), rhs.stride(), res, resIncr, alpha); \
 } \
 }; \
 
diff --git a/nuparu/include/Eigen/src/Core/products/Parallelizer.h b/nuparu/include/Eigen/src/Core/products/Parallelizer.h
index 5c3e9b7a..e0bfcc35 100644
--- a/nuparu/include/Eigen/src/Core/products/Parallelizer.h
+++ b/nuparu/include/Eigen/src/Core/products/Parallelizer.h
@@ -49,8 +49,8 @@ inline void initParallel()
 {
   int nbt;
   internal::manage_multi_threading(GetAction, &nbt);
-  std::ptrdiff_t l1, l2;
-  internal::manage_caching_sizes(GetAction, &l1, &l2);
+  std::ptrdiff_t l1, l2, l3;
+  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
 }
 
 /** \returns the max number of threads reserved for Eigen
@@ -73,13 +73,13 @@ namespace internal {
 
 template<typename Index> struct GemmParallelInfo
 {
-  GemmParallelInfo() : sync(-1), users(0), rhs_start(0), rhs_length(0) {}
+  GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {}
 
   int volatile sync;
   int volatile users;
 
-  Index rhs_start;
-  Index rhs_length;
+  Index lhs_start;
+  Index lhs_length;
 };
 
 template<bool Condition, typename Functor, typename Index>
@@ -102,53 +102,49 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos
   // - we are not already in a parallel code
   // - the sizes are large enough
 
-  // 1- are we already in a parallel session?
-  // FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp?
-  if((!Condition) || (omp_get_num_threads()>1))
-    return func(0,rows, 0,cols);
-
-  Index size = transpose ? cols : rows;
-
-  // 2- compute the maximal number of threads from the size of the product:
+  // compute the maximal number of threads from the size of the product:
   // FIXME this has to be fine tuned
-  Index max_threads = std::max<Index>(1,size / 32);
-
-  // 3 - compute the number of threads we are going to use
-  Index threads = std::min<Index>(nbThreads(), max_threads);
+  Index size = transpose ? rows : cols;
+  Index pb_max_threads = std::max<Index>(1,size / 32);
+  // compute the number of threads we are going to use
+  Index threads = std::min<Index>(nbThreads(), pb_max_threads);
 
-  if(threads==1)
+  // if multi-threading is explicitly disabled, not useful, or if we are already in a parallel session,
+  // then abort multi-threading
+  // FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp?
+  if((!Condition) || (threads==1) || (omp_get_num_threads()>1))
     return func(0,rows, 0,cols);
 
   Eigen::initParallel();
-  func.initParallelSession();
+  func.initParallelSession(threads);
 
   if(transpose)
     std::swap(rows,cols);
-
-  Index blockCols = (cols / threads) & ~Index(0x3);
-  Index blockRows = (rows / threads) & ~Index(0x7);
   
-  GemmParallelInfo<Index>* info = new GemmParallelInfo<Index>[threads];
-
-  #pragma omp parallel for schedule(static,1) num_threads(threads)
-  for(Index i=0; i<threads; ++i)
+  ei_declare_aligned_stack_constructed_variable(GemmParallelInfo<Index>,info,threads,0);
+  
+  #pragma omp parallel num_threads(threads)
   {
+    Index i = omp_get_thread_num();
+    // Note that the actual number of threads might be lower than the number of requested ones.
+    Index actual_threads = omp_get_num_threads();
+    
+    Index blockCols = (cols / actual_threads) & ~Index(0x3);
+    Index blockRows = (rows / actual_threads);
+    blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr;
+  
     Index r0 = i*blockRows;
-    Index actualBlockRows = (i+1==threads) ? rows-r0 : blockRows;
+    Index actualBlockRows = (i+1==actual_threads) ? rows-r0 : blockRows;
 
     Index c0 = i*blockCols;
-    Index actualBlockCols = (i+1==threads) ? cols-c0 : blockCols;
+    Index actualBlockCols = (i+1==actual_threads) ? cols-c0 : blockCols;
 
-    info[i].rhs_start = c0;
-    info[i].rhs_length = actualBlockCols;
+    info[i].lhs_start = r0;
+    info[i].lhs_length = actualBlockRows;
 
-    if(transpose)
-      func(0, cols, r0, actualBlockRows, info);
-    else
-      func(r0, actualBlockRows, 0,cols, info);
+    if(transpose) func(c0, actualBlockCols, 0, rows, info);
+    else          func(0, rows, c0, actualBlockCols, info);
   }
-
-  delete[] info;
 #endif
 }
 
diff --git a/nuparu/include/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/nuparu/include/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
index 99cf9e0a..f84f5498 100644
--- a/nuparu/include/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
+++ b/nuparu/include/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
@@ -15,7 +15,7 @@ namespace Eigen {
 namespace internal {
 
 // pack a selfadjoint block diagonal for use with the gebp_kernel
-template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder>
+template<typename Scalar, typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
 struct symm_pack_lhs
 {
   template<int BlockRows> inline
@@ -45,25 +45,32 @@ struct symm_pack_lhs
   }
   void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
   {
+    enum { PacketSize = packet_traits<Scalar>::size };
     const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride);
     Index count = 0;
-    Index peeled_mc = (rows/Pack1)*Pack1;
-    for(Index i=0; i<peeled_mc; i+=Pack1)
-    {
-      pack<Pack1>(blockA, lhs, cols, i, count);
-    }
-
-    if(rows-peeled_mc>=Pack2)
-    {
-      pack<Pack2>(blockA, lhs, cols, peeled_mc, count);
-      peeled_mc += Pack2;
-    }
+    //Index peeled_mc3 = (rows/Pack1)*Pack1;
+    
+    const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
+    const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
+    const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
+    
+    if(Pack1>=3*PacketSize)
+      for(Index i=0; i<peeled_mc3; i+=3*PacketSize)
+        pack<3*PacketSize>(blockA, lhs, cols, i, count);
+    
+    if(Pack1>=2*PacketSize)
+      for(Index i=peeled_mc3; i<peeled_mc2; i+=2*PacketSize)
+        pack<2*PacketSize>(blockA, lhs, cols, i, count);
+    
+    if(Pack1>=1*PacketSize)
+      for(Index i=peeled_mc2; i<peeled_mc1; i+=1*PacketSize)
+        pack<1*PacketSize>(blockA, lhs, cols, i, count);
 
     // do the same with mr==1
-    for(Index i=peeled_mc; i<rows; i++)
+    for(Index i=peeled_mc1; i<rows; i++)
     {
       for(Index k=0; k<i; k++)
-        blockA[count++] = lhs(i, k);              // normal
+        blockA[count++] = lhs(i, k);                   // normal
 
       blockA[count++] = numext::real(lhs(i, i));       // real (diagonal)
 
@@ -82,7 +89,8 @@ struct symm_pack_rhs
     Index end_k = k2 + rows;
     Index count = 0;
     const_blas_data_mapper<Scalar,Index,StorageOrder> rhs(_rhs,rhsStride);
-    Index packet_cols = (cols/nr)*nr;
+    Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
+    Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
 
     // first part: normal case
     for(Index j2=0; j2<k2; j2+=nr)
@@ -91,79 +99,151 @@ struct symm_pack_rhs
       {
         blockB[count+0] = rhs(k,j2+0);
         blockB[count+1] = rhs(k,j2+1);
-        if (nr==4)
+        if (nr>=4)
         {
           blockB[count+2] = rhs(k,j2+2);
           blockB[count+3] = rhs(k,j2+3);
         }
+        if (nr>=8)
+        {
+          blockB[count+4] = rhs(k,j2+4);
+          blockB[count+5] = rhs(k,j2+5);
+          blockB[count+6] = rhs(k,j2+6);
+          blockB[count+7] = rhs(k,j2+7);
+        }
         count += nr;
       }
     }
 
     // second part: diagonal block
-    for(Index j2=k2; j2<(std::min)(k2+rows,packet_cols); j2+=nr)
+    Index end8 = nr>=8 ? (std::min)(k2+rows,packet_cols8) : k2;
+    if(nr>=8)
     {
-      // again we can split vertically in three different parts (transpose, symmetric, normal)
-      // transpose
-      for(Index k=k2; k<j2; k++)
+      for(Index j2=k2; j2<end8; j2+=8)
       {
-        blockB[count+0] = numext::conj(rhs(j2+0,k));
-        blockB[count+1] = numext::conj(rhs(j2+1,k));
-        if (nr==4)
+        // again we can split vertically in three different parts (transpose, symmetric, normal)
+        // transpose
+        for(Index k=k2; k<j2; k++)
         {
+          blockB[count+0] = numext::conj(rhs(j2+0,k));
+          blockB[count+1] = numext::conj(rhs(j2+1,k));
           blockB[count+2] = numext::conj(rhs(j2+2,k));
           blockB[count+3] = numext::conj(rhs(j2+3,k));
+          blockB[count+4] = numext::conj(rhs(j2+4,k));
+          blockB[count+5] = numext::conj(rhs(j2+5,k));
+          blockB[count+6] = numext::conj(rhs(j2+6,k));
+          blockB[count+7] = numext::conj(rhs(j2+7,k));
+          count += 8;
         }
-        count += nr;
-      }
-      // symmetric
-      Index h = 0;
-      for(Index k=j2; k<j2+nr; k++)
-      {
-        // normal
-        for (Index w=0 ; w<h; ++w)
-          blockB[count+w] = rhs(k,j2+w);
+        // symmetric
+        Index h = 0;
+        for(Index k=j2; k<j2+8; k++)
+        {
+          // normal
+          for (Index w=0 ; w<h; ++w)
+            blockB[count+w] = rhs(k,j2+w);
 
-        blockB[count+h] = numext::real(rhs(k,k));
+          blockB[count+h] = numext::real(rhs(k,k));
 
-        // transpose
-        for (Index w=h+1 ; w<nr; ++w)
-          blockB[count+w] = numext::conj(rhs(j2+w,k));
-        count += nr;
-        ++h;
+          // transpose
+          for (Index w=h+1 ; w<8; ++w)
+            blockB[count+w] = numext::conj(rhs(j2+w,k));
+          count += 8;
+          ++h;
+        }
+        // normal
+        for(Index k=j2+8; k<end_k; k++)
+        {
+          blockB[count+0] = rhs(k,j2+0);
+          blockB[count+1] = rhs(k,j2+1);
+          blockB[count+2] = rhs(k,j2+2);
+          blockB[count+3] = rhs(k,j2+3);
+          blockB[count+4] = rhs(k,j2+4);
+          blockB[count+5] = rhs(k,j2+5);
+          blockB[count+6] = rhs(k,j2+6);
+          blockB[count+7] = rhs(k,j2+7);
+          count += 8;
+        }
       }
-      // normal
-      for(Index k=j2+nr; k<end_k; k++)
+    }
+    if(nr>=4)
+    {
+      for(Index j2=end8; j2<(std::min)(k2+rows,packet_cols4); j2+=4)
       {
-        blockB[count+0] = rhs(k,j2+0);
-        blockB[count+1] = rhs(k,j2+1);
-        if (nr==4)
+        // again we can split vertically in three different parts (transpose, symmetric, normal)
+        // transpose
+        for(Index k=k2; k<j2; k++)
+        {
+          blockB[count+0] = numext::conj(rhs(j2+0,k));
+          blockB[count+1] = numext::conj(rhs(j2+1,k));
+          blockB[count+2] = numext::conj(rhs(j2+2,k));
+          blockB[count+3] = numext::conj(rhs(j2+3,k));
+          count += 4;
+        }
+        // symmetric
+        Index h = 0;
+        for(Index k=j2; k<j2+4; k++)
+        {
+          // normal
+          for (Index w=0 ; w<h; ++w)
+            blockB[count+w] = rhs(k,j2+w);
+
+          blockB[count+h] = numext::real(rhs(k,k));
+
+          // transpose
+          for (Index w=h+1 ; w<4; ++w)
+            blockB[count+w] = numext::conj(rhs(j2+w,k));
+          count += 4;
+          ++h;
+        }
+        // normal
+        for(Index k=j2+4; k<end_k; k++)
         {
+          blockB[count+0] = rhs(k,j2+0);
+          blockB[count+1] = rhs(k,j2+1);
           blockB[count+2] = rhs(k,j2+2);
           blockB[count+3] = rhs(k,j2+3);
+          count += 4;
         }
-        count += nr;
       }
     }
 
     // third part: transposed
-    for(Index j2=k2+rows; j2<packet_cols; j2+=nr)
+    if(nr>=8)
     {
-      for(Index k=k2; k<end_k; k++)
+      for(Index j2=k2+rows; j2<packet_cols8; j2+=8)
       {
-        blockB[count+0] = numext::conj(rhs(j2+0,k));
-        blockB[count+1] = numext::conj(rhs(j2+1,k));
-        if (nr==4)
+        for(Index k=k2; k<end_k; k++)
         {
+          blockB[count+0] = numext::conj(rhs(j2+0,k));
+          blockB[count+1] = numext::conj(rhs(j2+1,k));
           blockB[count+2] = numext::conj(rhs(j2+2,k));
           blockB[count+3] = numext::conj(rhs(j2+3,k));
+          blockB[count+4] = numext::conj(rhs(j2+4,k));
+          blockB[count+5] = numext::conj(rhs(j2+5,k));
+          blockB[count+6] = numext::conj(rhs(j2+6,k));
+          blockB[count+7] = numext::conj(rhs(j2+7,k));
+          count += 8;
+        }
+      }
+    }
+    if(nr>=4)
+    {
+      for(Index j2=(std::max)(packet_cols8,k2+rows); j2<packet_cols4; j2+=4)
+      {
+        for(Index k=k2; k<end_k; k++)
+        {
+          blockB[count+0] = numext::conj(rhs(j2+0,k));
+          blockB[count+1] = numext::conj(rhs(j2+1,k));
+          blockB[count+2] = numext::conj(rhs(j2+2,k));
+          blockB[count+3] = numext::conj(rhs(j2+3,k));
+          count += 4;
         }
-        count += nr;
       }
     }
 
     // copy the remaining columns one at a time (=> the same with nr==1)
-    for(Index j2=packet_cols; j2<cols; ++j2)
+    for(Index j2=packet_cols4; j2<cols; ++j2)
     {
       // transpose
       Index half = (std::min)(end_k,j2);
@@ -244,33 +324,38 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
     Index rows, Index cols,
     const Scalar* _lhs, Index lhsStride,
     const Scalar* _rhs, Index rhsStride,
-    Scalar* res,        Index resStride,
+    Scalar* _res,        Index resStride,
     const Scalar& alpha)
   {
     Index size = rows;
 
-    const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
-    const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
-
     typedef gebp_traits<Scalar,Scalar> Traits;
 
+    typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+    typedef const_blas_data_mapper<Scalar, Index, (LhsStorageOrder == RowMajor) ? ColMajor : RowMajor> LhsTransposeMapper;
+    typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
+    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+    LhsMapper lhs(_lhs,lhsStride);
+    LhsTransposeMapper lhs_transpose(_lhs,lhsStride);
+    RhsMapper rhs(_rhs,rhsStride);
+    ResMapper res(_res, resStride);
+
     Index kc = size;  // cache block size along the K direction
     Index mc = rows;  // cache block size along the M direction
     Index nc = cols;  // cache block size along the N direction
-    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
+    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, 1);
     // kc must smaller than mc
     kc = (std::min)(kc,mc);
 
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
-    std::size_t sizeB = sizeW + kc*cols;
+    std::size_t sizeB = kc*cols;
     ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
     ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
-    Scalar* blockB = allocatedBlockB + sizeW;
+    Scalar* blockB = allocatedBlockB;
 
-    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+    gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
     symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
-    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
-    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
+    gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
+    gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
 
     for(Index k2=0; k2<size; k2+=kc)
     {
@@ -279,7 +364,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
       // we have selected one row panel of rhs and one column panel of lhs
       // pack rhs's panel into a sequential chunk of memory
       // and expand each coeff to a constant packet for further reuse
-      pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols);
+      pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, cols);
 
       // the select lhs's panel has to be split in three different parts:
       //  1 - the transposed panel above the diagonal block => transposed packed copy
@@ -289,9 +374,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
       {
         const Index actual_mc = (std::min)(i2+mc,k2)-i2;
         // transposed packed copy
-        pack_lhs_transposed(blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc);
+        pack_lhs_transposed(blockA, lhs_transpose.getSubMapper(i2, k2), actual_kc, actual_mc);
 
-        gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
+        gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
       }
       // the block diagonal
       {
@@ -299,16 +384,16 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
         // symmetric packed copy
         pack_lhs(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc);
 
-        gebp_kernel(res+k2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
+        gebp_kernel(res.getSubMapper(k2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
       }
 
       for(Index i2=k2+kc; i2<size; i2+=mc)
       {
         const Index actual_mc = (std::min)(i2+mc,size)-i2;
-        gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
-          (blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
+        gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
+          (blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
 
-        gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
+        gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
       }
     }
   }
@@ -335,27 +420,29 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
     Index rows, Index cols,
     const Scalar* _lhs, Index lhsStride,
     const Scalar* _rhs, Index rhsStride,
-    Scalar* res,        Index resStride,
+    Scalar* _res,        Index resStride,
     const Scalar& alpha)
   {
     Index size = cols;
 
-    const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
-
     typedef gebp_traits<Scalar,Scalar> Traits;
 
-    Index kc = size; // cache block size along the K direction
+    typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+    LhsMapper lhs(_lhs,lhsStride);
+    ResMapper res(_res,resStride);
+
+    Index kc = size;  // cache block size along the K direction
     Index mc = rows;  // cache block size along the M direction
     Index nc = cols;  // cache block size along the N direction
-    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
-    std::size_t sizeB = sizeW + kc*cols;
+    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, 1);
+    std::size_t sizeB = kc*cols;
     ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
     ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
-    Scalar* blockB = allocatedBlockB + sizeW;
+    Scalar* blockB = allocatedBlockB;
 
-    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
-    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+    gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
     symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
 
     for(Index k2=0; k2<size; k2+=kc)
@@ -368,9 +455,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
       for(Index i2=0; i2<rows; i2+=mc)
       {
         const Index actual_mc = (std::min)(i2+mc,rows)-i2;
-        pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
+        pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
 
-        gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
+        gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
       }
     }
   }
@@ -382,55 +469,53 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
 ***************************************************************************/
 
 namespace internal {
+  
 template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
-struct traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false> >
-  : traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs> >
-{};
-}
-
-template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
-struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>
-  : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs >
+struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,RhsMode,false>
 {
-  EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)
-
-  SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
-
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  
+  typedef internal::blas_traits<Lhs> LhsBlasTraits;
+  typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+  typedef internal::blas_traits<Rhs> RhsBlasTraits;
+  typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+  
   enum {
     LhsIsUpper = (LhsMode&(Upper|Lower))==Upper,
     LhsIsSelfAdjoint = (LhsMode&SelfAdjoint)==SelfAdjoint,
     RhsIsUpper = (RhsMode&(Upper|Lower))==Upper,
     RhsIsSelfAdjoint = (RhsMode&SelfAdjoint)==SelfAdjoint
   };
-
-  template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
+  
+  template<typename Dest>
+  static void run(Dest &dst, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
   {
-    eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
+    eigen_assert(dst.rows()==a_lhs.rows() && dst.cols()==a_rhs.cols());
 
-    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
-    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
+    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
+    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
 
-    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
-                               * RhsBlasTraits::extractScalarFactor(m_rhs);
+    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
+                               * RhsBlasTraits::extractScalarFactor(a_rhs);
 
     internal::product_selfadjoint_matrix<Scalar, Index,
-      EIGEN_LOGICAL_XOR(LhsIsUpper,
-                        internal::traits<Lhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint,
+      EIGEN_LOGICAL_XOR(LhsIsUpper,internal::traits<Lhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint,
       NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)),
-      EIGEN_LOGICAL_XOR(RhsIsUpper,
-                        internal::traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint,
+      EIGEN_LOGICAL_XOR(RhsIsUpper,internal::traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint,
       NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)),
       internal::traits<Dest>::Flags&RowMajorBit  ? RowMajor : ColMajor>
       ::run(
         lhs.rows(), rhs.cols(),                 // sizes
-        &lhs.coeffRef(0,0),    lhs.outerStride(),  // lhs info
-        &rhs.coeffRef(0,0),    rhs.outerStride(),  // rhs info
+        &lhs.coeffRef(0,0), lhs.outerStride(),  // lhs info
+        &rhs.coeffRef(0,0), rhs.outerStride(),  // rhs info
         &dst.coeffRef(0,0), dst.outerStride(),  // result info
         actualAlpha                             // alpha
       );
   }
 };
 
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_H
diff --git a/nuparu/include/Eigen/src/Core/products/SelfadjointMatrixVector.h b/nuparu/include/Eigen/src/Core/products/SelfadjointMatrixVector.h
index c40e80f5..d8d30267 100644
--- a/nuparu/include/Eigen/src/Core/products/SelfadjointMatrixVector.h
+++ b/nuparu/include/Eigen/src/Core/products/SelfadjointMatrixVector.h
@@ -30,7 +30,7 @@ struct selfadjoint_matrix_vector_product
 static EIGEN_DONT_INLINE void run(
   Index size,
   const Scalar*  lhs, Index lhsStride,
-  const Scalar* _rhs, Index rhsIncr,
+  const Scalar*  rhs,
   Scalar* res,
   Scalar alpha);
 };
@@ -39,11 +39,12 @@ template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool Conju
 EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run(
   Index size,
   const Scalar*  lhs, Index lhsStride,
-  const Scalar* _rhs, Index rhsIncr,
+  const Scalar*  rhs,
   Scalar* res,
   Scalar alpha)
 {
   typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
   const Index PacketSize = sizeof(Packet)/sizeof(Scalar);
 
   enum {
@@ -54,23 +55,13 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
 
   conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs,  IsRowMajor), ConjugateRhs> cj0;
   conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1;
-  conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex, ConjugateRhs> cjd;
+  conj_helper<RealScalar,Scalar,false, ConjugateRhs> cjd;
 
   conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs,  IsRowMajor), ConjugateRhs> pcj0;
   conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> pcj1;
 
   Scalar cjAlpha = ConjugateRhs ? numext::conj(alpha) : alpha;
 
-  // FIXME this copy is now handled outside product_selfadjoint_vector, so it could probably be removed.
-  // if the rhs is not sequentially stored in memory we copy it to a temporary buffer,
-  // this is because we need to extract packets
-  ei_declare_aligned_stack_constructed_variable(Scalar,rhs,size,rhsIncr==1 ? const_cast<Scalar*>(_rhs) : 0);  
-  if (rhsIncr!=1)
-  {
-    const Scalar* it = _rhs;
-    for (Index i=0; i<size; ++i, it+=rhsIncr)
-      rhs[i] = *it;
-  }
 
   Index bound = (std::max)(Index(0),size-8) & 0xfffffffe;
   if (FirstTriangular)
@@ -79,8 +70,8 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
   for (Index j=FirstTriangular ? bound : 0;
        j<(FirstTriangular ? size : bound);j+=2)
   {
-    register const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
-    register const Scalar* EIGEN_RESTRICT A1 = lhs + (j+1)*lhsStride;
+    const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
+    const Scalar* EIGEN_RESTRICT A1 = lhs + (j+1)*lhsStride;
 
     Scalar t0 = cjAlpha * rhs[j];
     Packet ptmp0 = pset1<Packet>(t0);
@@ -94,10 +85,9 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
 
     size_t starti = FirstTriangular ? 0 : j+2;
     size_t endi   = FirstTriangular ? j : size;
-    size_t alignedStart = (starti) + internal::first_aligned(&res[starti], endi-starti);
+    size_t alignedStart = (starti) + internal::first_default_aligned(&res[starti], endi-starti);
     size_t alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize);
 
-    // TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed
     res[j]   += cjd.pmul(numext::real(A0[j]), t0);
     res[j+1] += cjd.pmul(numext::real(A1[j+1]), t1);
     if(FirstTriangular)
@@ -113,9 +103,9 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
 
     for (size_t i=starti; i<alignedStart; ++i)
     {
-      res[i] += t0 * A0[i] + t1 * A1[i];
-      t2 += numext::conj(A0[i]) * rhs[i];
-      t3 += numext::conj(A1[i]) * rhs[i];
+      res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1);
+      t2 += cj1.pmul(A0[i], rhs[i]);
+      t3 += cj1.pmul(A1[i], rhs[i]);
     }
     // Yes this an optimization for gcc 4.3 and 4.4 (=> huge speed up)
     // gcc 4.2 does this optimization automatically.
@@ -147,11 +137,10 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
   }
   for (Index j=FirstTriangular ? 0 : bound;j<(FirstTriangular ? bound : size);j++)
   {
-    register const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
+    const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
 
     Scalar t1 = cjAlpha * rhs[j];
     Scalar t2(0);
-    // TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed
     res[j] += cjd.pmul(numext::real(A0[j]), t1);
     for (Index i=FirstTriangular ? 0 : j+1; i<(FirstTriangular ? j : size); i++)
     {
@@ -169,45 +158,44 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
 ***************************************************************************/
 
 namespace internal {
-template<typename Lhs, int LhsMode, typename Rhs>
-struct traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true> >
-  : traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs> >
-{};
-}
 
 template<typename Lhs, int LhsMode, typename Rhs>
-struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
-  : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs >
+struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,0,true>
 {
-  EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)
-
-  enum {
-    LhsUpLo = LhsMode&(Upper|Lower)
-  };
-
-  SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
-
-  template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  
+  typedef internal::blas_traits<Lhs> LhsBlasTraits;
+  typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+  typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned;
+  
+  typedef internal::blas_traits<Rhs> RhsBlasTraits;
+  typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+  typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
+
+  enum { LhsUpLo = LhsMode&(Upper|Lower) };
+
+  template<typename Dest>
+  static void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
   {
     typedef typename Dest::Scalar ResScalar;
-    typedef typename Base::RhsScalar RhsScalar;
+    typedef typename Rhs::Scalar RhsScalar;
     typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
     
-    eigen_assert(dest.rows()==m_lhs.rows() && dest.cols()==m_rhs.cols());
+    eigen_assert(dest.rows()==a_lhs.rows() && dest.cols()==a_rhs.cols());
 
-    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
-    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
+    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
+    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
 
-    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
-                               * RhsBlasTraits::extractScalarFactor(m_rhs);
+    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
+                               * RhsBlasTraits::extractScalarFactor(a_rhs);
 
     enum {
       EvalToDest = (Dest::InnerStrideAtCompileTime==1),
-      UseRhs = (_ActualRhsType::InnerStrideAtCompileTime==1)
+      UseRhs = (ActualRhsTypeCleaned::InnerStrideAtCompileTime==1)
     };
     
     internal::gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,!EvalToDest> static_dest;
-    internal::gemv_static_vector_if<RhsScalar,_ActualRhsType::SizeAtCompileTime,_ActualRhsType::MaxSizeAtCompileTime,!UseRhs> static_rhs;
+    internal::gemv_static_vector_if<RhsScalar,ActualRhsTypeCleaned::SizeAtCompileTime,ActualRhsTypeCleaned::MaxSizeAtCompileTime,!UseRhs> static_rhs;
 
     ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
                                                   EvalToDest ? dest.data() : static_dest.data());
@@ -218,7 +206,7 @@ struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
     if(!EvalToDest)
     {
       #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      int size = dest.size();
+      Index size = dest.size();
       EIGEN_DENSE_STORAGE_CTOR_PLUGIN
       #endif
       MappedDest(actualDestPtr, dest.size()) = dest;
@@ -227,18 +215,19 @@ struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
     if(!UseRhs)
     {
       #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      int size = rhs.size();
+      Index size = rhs.size();
       EIGEN_DENSE_STORAGE_CTOR_PLUGIN
       #endif
-      Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, rhs.size()) = rhs;
+      Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, rhs.size()) = rhs;
     }
       
       
-    internal::selfadjoint_matrix_vector_product<Scalar, Index, (internal::traits<_ActualLhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)>::run
+    internal::selfadjoint_matrix_vector_product<Scalar, Index, (internal::traits<ActualLhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor,
+                                                int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)>::run
       (
         lhs.rows(),                             // size
         &lhs.coeffRef(0,0),  lhs.outerStride(), // lhs info
-        actualRhsPtr, 1,                        // rhs info
+        actualRhsPtr,                           // rhs info
         actualDestPtr,                          // result info
         actualAlpha                             // scale factor
       );
@@ -248,34 +237,24 @@ struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
   }
 };
 
-namespace internal {
-template<typename Lhs, typename Rhs, int RhsMode>
-struct traits<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false> >
-  : traits<ProductBase<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>, Lhs, Rhs> >
-{};
-}
-
 template<typename Lhs, typename Rhs, int RhsMode>
-struct SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>
-  : public ProductBase<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>, Lhs, Rhs >
+struct selfadjoint_product_impl<Lhs,0,true,Rhs,RhsMode,false>
 {
-  EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  enum { RhsUpLo = RhsMode&(Upper|Lower)  };
 
-  enum {
-    RhsUpLo = RhsMode&(Upper|Lower)
-  };
-
-  SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
-
-  template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
+  template<typename Dest>
+  static void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
   {
     // let's simply transpose the product
     Transpose<Dest> destT(dest);
-    SelfadjointProductMatrix<Transpose<const Rhs>, int(RhsUpLo)==Upper ? Lower : Upper, false,
-                             Transpose<const Lhs>, 0, true>(m_rhs.transpose(), m_lhs.transpose()).scaleAndAddTo(destT, alpha);
+    selfadjoint_product_impl<Transpose<const Rhs>, int(RhsUpLo)==Upper ? Lower : Upper, false,
+                             Transpose<const Lhs>, 0, true>::run(destT, a_rhs.transpose(), a_lhs.transpose(), alpha);
   }
 };
 
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_H
diff --git a/nuparu/include/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h b/nuparu/include/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h
index 86684b66..a08f385b 100644
--- a/nuparu/include/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h
+++ b/nuparu/include/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h
@@ -52,16 +52,16 @@ template<typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool Con
 struct selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Specialized> { \
 static void run( \
   Index size, const Scalar*  lhs, Index lhsStride, \
-  const Scalar* _rhs, Index rhsIncr, Scalar* res, Scalar alpha) { \
+  const Scalar* _rhs, Scalar* res, Scalar alpha) { \
     enum {\
       IsColMajor = StorageOrder==ColMajor \
     }; \
     if (IsColMajor == ConjugateLhs) {\
       selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,BuiltIn>::run( \
-        size, lhs, lhsStride, _rhs, rhsIncr, res, alpha);  \
+        size, lhs, lhsStride, _rhs, res, alpha);  \
     } else {\
       selfadjoint_matrix_vector_product_symv<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs>::run( \
-        size, lhs, lhsStride, _rhs, rhsIncr, res, alpha);  \
+        size, lhs, lhsStride, _rhs, res, alpha);  \
     }\
   } \
 }; \
@@ -79,13 +79,13 @@ typedef Matrix<EIGTYPE,Dynamic,1,ColMajor> SYMVVector;\
 \
 static void run( \
 Index size, const EIGTYPE*  lhs, Index lhsStride, \
-const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* res, EIGTYPE alpha) \
+const EIGTYPE* _rhs, EIGTYPE* res, EIGTYPE alpha) \
 { \
   enum {\
     IsRowMajor = StorageOrder==RowMajor ? 1 : 0, \
     IsLower = UpLo == Lower ? 1 : 0 \
   }; \
-  MKL_INT n=size, lda=lhsStride, incx=rhsIncr, incy=1; \
+  MKL_INT n=size, lda=lhsStride, incx=1, incy=1; \
   MKLTYPE alpha_, beta_; \
   const EIGTYPE *x_ptr, myone(1); \
   char uplo=(IsRowMajor) ? (IsLower ? 'U' : 'L') : (IsLower ? 'L' : 'U'); \
@@ -93,10 +93,9 @@ const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* res, EIGTYPE alpha) \
   assign_scalar_eig2mkl(beta_, myone); \
   SYMVVector x_tmp; \
   if (ConjugateRhs) { \
-    Map<const SYMVVector, 0, InnerStride<> > map_x(_rhs,size,1,InnerStride<>(incx)); \
+    Map<const SYMVVector, 0 > map_x(_rhs,size,1); \
     x_tmp=map_x.conjugate(); \
     x_ptr=x_tmp.data(); \
-    incx=1; \
   } else x_ptr=_rhs; \
   MKLFUNC(&uplo, &n, &alpha_, (const MKLTYPE*)lhs, &lda, (const MKLTYPE*)x_ptr, &incx, &beta_, (MKLTYPE*)res, &incy); \
 }\
diff --git a/nuparu/include/Eigen/src/Core/products/SelfadjointProduct.h b/nuparu/include/Eigen/src/Core/products/SelfadjointProduct.h
index 6ca4ae6c..2af00058 100644
--- a/nuparu/include/Eigen/src/Core/products/SelfadjointProduct.h
+++ b/nuparu/include/Eigen/src/Core/products/SelfadjointProduct.h
@@ -53,7 +53,6 @@ struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,true>
   static void run(MatrixType& mat, const OtherType& other, const typename MatrixType::Scalar& alpha)
   {
     typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
     typedef internal::blas_traits<OtherType> OtherBlasTraits;
     typedef typename OtherBlasTraits::DirectLinearAccessType ActualOtherType;
     typedef typename internal::remove_all<ActualOtherType>::type _ActualOtherType;
@@ -86,7 +85,6 @@ struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,false>
   static void run(MatrixType& mat, const OtherType& other, const typename MatrixType::Scalar& alpha)
   {
     typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
     typedef internal::blas_traits<OtherType> OtherBlasTraits;
     typedef typename OtherBlasTraits::DirectLinearAccessType ActualOtherType;
     typedef typename internal::remove_all<ActualOtherType>::type _ActualOtherType;
diff --git a/nuparu/include/Eigen/src/Core/products/SelfadjointRank2Update.h b/nuparu/include/Eigen/src/Core/products/SelfadjointRank2Update.h
index 8594a97c..2ae36411 100644
--- a/nuparu/include/Eigen/src/Core/products/SelfadjointRank2Update.h
+++ b/nuparu/include/Eigen/src/Core/products/SelfadjointRank2Update.h
@@ -79,11 +79,11 @@ ::rankUpdate(const MatrixBase<DerivedU>& u, const MatrixBase<DerivedV>& v, const
   if (IsRowMajor)
     actualAlpha = numext::conj(actualAlpha);
 
-  internal::selfadjoint_rank2_update_selector<Scalar, Index,
-    typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ UBlasTraits::NeedToConjugate,_ActualUType>::type>::type,
-    typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ VBlasTraits::NeedToConjugate,_ActualVType>::type>::type,
+  typedef typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ UBlasTraits::NeedToConjugate,_ActualUType>::type>::type UType;
+  typedef typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ VBlasTraits::NeedToConjugate,_ActualVType>::type>::type VType;
+  internal::selfadjoint_rank2_update_selector<Scalar, Index, UType, VType,
     (IsRowMajor ? int(UpLo==Upper ? Lower : Upper) : UpLo)>
-    ::run(_expression().const_cast_derived().data(),_expression().outerStride(),actualU,actualV,actualAlpha);
+    ::run(_expression().const_cast_derived().data(),_expression().outerStride(),UType(actualU),VType(actualV),actualAlpha);
 
   return *this;
 }
diff --git a/nuparu/include/Eigen/src/Core/products/TriangularMatrixMatrix.h b/nuparu/include/Eigen/src/Core/products/TriangularMatrixMatrix.h
index 8110507b..39ab87df 100644
--- a/nuparu/include/Eigen/src/Core/products/TriangularMatrixMatrix.h
+++ b/nuparu/include/Eigen/src/Core/products/TriangularMatrixMatrix.h
@@ -108,7 +108,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
     Index _rows, Index _cols, Index _depth,
     const Scalar* _lhs, Index lhsStride,
     const Scalar* _rhs, Index rhsStride,
-    Scalar* res,        Index resStride,
+    Scalar* _res,        Index resStride,
     const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
   {
     // strip zeros
@@ -117,19 +117,21 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
     Index depth     = IsLower ? diagSize : _depth;
     Index cols      = _cols;
     
-    const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
-    const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
+    typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+    typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
+    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+    LhsMapper lhs(_lhs,lhsStride);
+    RhsMapper rhs(_rhs,rhsStride);
+    ResMapper res(_res, resStride);
 
     Index kc = blocking.kc();                   // cache block size along the K direction
     Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
 
     std::size_t sizeA = kc*mc;
     std::size_t sizeB = kc*cols;
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
 
     ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
     ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
 
     Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer;
     triangularBuffer.setZero();
@@ -138,9 +140,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
     else
       triangularBuffer.diagonal().setOnes();
 
-    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
-    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
-    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
+    gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+    gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
 
     for(Index k2=IsLower ? depth : 0;
         IsLower ? k2>0 : k2<depth;
@@ -156,7 +158,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
         k2 = k2+actual_kc-kc;
       }
 
-      pack_rhs(blockB, &rhs(actual_k2,0), rhsStride, actual_kc, cols);
+      pack_rhs(blockB, rhs.getSubMapper(actual_k2,0), actual_kc, cols);
 
       // the selected lhs's panel has to be split in three different parts:
       //  1 - the part which is zero => skip it
@@ -184,20 +186,22 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
             for (Index i=IsLower ? k+1 : 0; IsLower ? i<actualPanelWidth : i<k; ++i)
               triangularBuffer.coeffRef(i,k) = lhs(startBlock+i,startBlock+k);
           }
-          pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth);
+          pack_lhs(blockA, LhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()), actualPanelWidth, actualPanelWidth);
 
-          gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha,
-                      actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
+          gebp_kernel(res.getSubMapper(startBlock, 0), blockA, blockB,
+                      actualPanelWidth, actualPanelWidth, cols, alpha,
+                      actualPanelWidth, actual_kc, 0, blockBOffset);
 
           // GEBP with remaining micro panel
           if (lengthTarget>0)
           {
             Index startTarget  = IsLower ? actual_k2+k1+actualPanelWidth : actual_k2;
 
-            pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget);
+            pack_lhs(blockA, lhs.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget);
 
-            gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha,
-                        actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
+            gebp_kernel(res.getSubMapper(startTarget, 0), blockA, blockB,
+                        lengthTarget, actualPanelWidth, cols, alpha,
+                        actualPanelWidth, actual_kc, 0, blockBOffset);
           }
         }
       }
@@ -208,10 +212,11 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
         for(Index i2=start; i2<end; i2+=mc)
         {
           const Index actual_mc = (std::min)(i2+mc,end)-i2;
-          gemm_pack_lhs<Scalar, Index, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
-            (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);
+          gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
+            (blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
 
-          gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW);
+          gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc,
+                      actual_kc, cols, alpha, -1, -1, 0, 0);
         }
       }
     }
@@ -249,28 +254,31 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
     Index _rows, Index _cols, Index _depth,
     const Scalar* _lhs, Index lhsStride,
     const Scalar* _rhs, Index rhsStride,
-    Scalar* res,        Index resStride,
+    Scalar* _res,        Index resStride,
     const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
   {
+    const Index PacketBytes = packet_traits<Scalar>::size*sizeof(Scalar);
     // strip zeros
     Index diagSize  = (std::min)(_cols,_depth);
     Index rows      = _rows;
     Index depth     = IsLower ? _depth : diagSize;
     Index cols      = IsLower ? diagSize : _cols;
     
-    const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
-    const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
+    typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+    typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
+    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+    LhsMapper lhs(_lhs,lhsStride);
+    RhsMapper rhs(_rhs,rhsStride);
+    ResMapper res(_res, resStride);
 
     Index kc = blocking.kc();                   // cache block size along the K direction
     Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
 
     std::size_t sizeA = kc*mc;
-    std::size_t sizeB = kc*cols;
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
+    std::size_t sizeB = kc*cols+EIGEN_MAX_ALIGN_BYTES/sizeof(Scalar);
 
     ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
     ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
 
     Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer;
     triangularBuffer.setZero();
@@ -279,10 +287,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
     else
       triangularBuffer.diagonal().setOnes();
 
-    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
-    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
-    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
-    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
+    gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+    gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
+    gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
 
     for(Index k2=IsLower ? 0 : depth;
         IsLower ? k2<depth  : k2>0;
@@ -304,8 +312,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
       Index ts = (IsLower && actual_k2>=cols) ? 0 : actual_kc;
 
       Scalar* geb = blockB+ts*ts;
+      geb = geb + internal::first_aligned<PacketBytes>(geb,PacketBytes/sizeof(Scalar));
 
-      pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, actual_kc, rs);
+      pack_rhs(geb, rhs.getSubMapper(actual_k2,IsLower ? 0 : k2), actual_kc, rs);
 
       // pack the triangular part of the rhs padding the unrolled blocks with zeros
       if(ts>0)
@@ -318,7 +327,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
           Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;
           // general part
           pack_rhs_panel(blockB+j2*actual_kc,
-                         &rhs(actual_k2+panelOffset, actual_j2), rhsStride,
+                         rhs.getSubMapper(actual_k2+panelOffset, actual_j2),
                          panelLength, actualPanelWidth,
                          actual_kc, panelOffset);
 
@@ -332,7 +341,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
           }
 
           pack_rhs_panel(blockB+j2*actual_kc,
-                         triangularBuffer.data(), triangularBuffer.outerStride(),
+                         RhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()),
                          actualPanelWidth, actualPanelWidth,
                          actual_kc, j2);
         }
@@ -341,7 +350,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
       for (Index i2=0; i2<rows; i2+=mc)
       {
         const Index actual_mc = (std::min)(mc,rows-i2);
-        pack_lhs(blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);
+        pack_lhs(blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
 
         // triangular kernel
         if(ts>0)
@@ -352,19 +361,18 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
             Index panelLength = IsLower ? actual_kc-j2 : j2+actualPanelWidth;
             Index blockOffset = IsLower ? j2 : 0;
 
-            gebp_kernel(res+i2+(actual_k2+j2)*resStride, resStride,
+            gebp_kernel(res.getSubMapper(i2, actual_k2 + j2),
                         blockA, blockB+j2*actual_kc,
                         actual_mc, panelLength, actualPanelWidth,
                         alpha,
                         actual_kc, actual_kc,  // strides
-                        blockOffset, blockOffset,// offsets
-                        blockW); // workspace
+                        blockOffset, blockOffset);// offsets
           }
         }
-        gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride,
+        gebp_kernel(res.getSubMapper(i2, IsLower ? 0 : k2),
                     blockA, geb, actual_mc, actual_kc, rs,
                     alpha,
-                    -1, -1, 0, 0, blockW);
+                    -1, -1, 0, 0);
       }
     }
   }
@@ -373,28 +381,28 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
 * Wrapper to product_triangular_matrix_matrix
 ***************************************************************************/
 
-template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
-struct traits<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false> >
-  : traits<ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>, Lhs, Rhs> >
-{};
-
 } // end namespace internal
 
+namespace internal {
 template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
-struct TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
-  : public ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>, Lhs, Rhs >
+struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
 {
-  EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct)
-
-  TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
-
-  template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
+  template<typename Dest> static void run(Dest& dst, const Lhs &a_lhs, const Rhs &a_rhs, const typename Dest::Scalar& alpha)
   {
-    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
-    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
+    typedef typename Dest::Scalar     Scalar;
+    
+    typedef internal::blas_traits<Lhs> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+    typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned;
+    typedef internal::blas_traits<Rhs> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+    typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
+    
+    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
+    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
 
-    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
-                               * RhsBlasTraits::extractScalarFactor(m_rhs);
+    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs)
+                               * RhsBlasTraits::extractScalarFactor(a_rhs);
 
     typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
               Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,4> BlockingType;
@@ -405,23 +413,25 @@ struct TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
     Index stripedDepth = LhsIsTriangular ? ((!IsLower) ? lhs.cols() : (std::min)(lhs.cols(),lhs.rows()))
                                          : ((IsLower)  ? rhs.rows() : (std::min)(rhs.rows(),rhs.cols()));
 
-    BlockingType blocking(stripedRows, stripedCols, stripedDepth);
+    BlockingType blocking(stripedRows, stripedCols, stripedDepth, 1, false);
 
     internal::product_triangular_matrix_matrix<Scalar, Index,
       Mode, LhsIsTriangular,
-      (internal::traits<_ActualLhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
-      (internal::traits<_ActualRhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
+      (internal::traits<ActualLhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
+      (internal::traits<ActualRhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
       (internal::traits<Dest          >::Flags&RowMajorBit) ? RowMajor : ColMajor>
       ::run(
         stripedRows, stripedCols, stripedDepth,   // sizes
-        &lhs.coeffRef(0,0),    lhs.outerStride(), // lhs info
-        &rhs.coeffRef(0,0),    rhs.outerStride(), // rhs info
+        &lhs.coeffRef(0,0), lhs.outerStride(),    // lhs info
+        &rhs.coeffRef(0,0), rhs.outerStride(),    // rhs info
         &dst.coeffRef(0,0), dst.outerStride(),    // result info
         actualAlpha, blocking
       );
   }
 };
 
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_H
diff --git a/nuparu/include/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h b/nuparu/include/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h
index ba41a1c9..d9e7cf85 100644
--- a/nuparu/include/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h
+++ b/nuparu/include/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h
@@ -109,7 +109,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
 /* Non-square case - doesn't fit to MKL ?TRMM. Fall to default triangular product or call MKL ?GEMM*/ \
    if (rows != depth) { \
 \
-     int nthr = mkl_domain_get_max_threads(MKL_BLAS); \
+     int nthr = mkl_domain_get_max_threads(EIGEN_MKL_DOMAIN_BLAS); \
 \
      if (((nthr==1) && (((std::max)(rows,depth)-diagSize)/(double)diagSize < 0.5))) { \
      /* Most likely no benefit to call TRMM or GEMM from MKL*/ \
@@ -122,7 +122,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
        Map<const MatrixLhs, 0, OuterStride<> > lhsMap(_lhs,rows,depth,OuterStride<>(lhsStride)); \
        MatrixLhs aa_tmp=lhsMap.template triangularView<Mode>(); \
        MKL_INT aStride = aa_tmp.outerStride(); \
-       gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth); \
+       gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \
        general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \
        rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, resStride, alpha, gemm_blocking, 0); \
 \
@@ -223,7 +223,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
 /* Non-square case - doesn't fit to MKL ?TRMM. Fall to default triangular product or call MKL ?GEMM*/ \
    if (cols != depth) { \
 \
-     int nthr = mkl_domain_get_max_threads(MKL_BLAS); \
+     int nthr = mkl_domain_get_max_threads(EIGEN_MKL_DOMAIN_BLAS); \
 \
      if ((nthr==1) && (((std::max)(cols,depth)-diagSize)/(double)diagSize < 0.5)) { \
      /* Most likely no benefit to call TRMM or GEMM from MKL*/ \
@@ -236,7 +236,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
        Map<const MatrixRhs, 0, OuterStride<> > rhsMap(_rhs,depth,cols, OuterStride<>(rhsStride)); \
        MatrixRhs aa_tmp=rhsMap.template triangularView<Mode>(); \
        MKL_INT aStride = aa_tmp.outerStride(); \
-       gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth); \
+       gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \
        general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \
        rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, resStride, alpha, gemm_blocking, 0); \
 \
diff --git a/nuparu/include/Eigen/src/Core/products/TriangularMatrixVector.h b/nuparu/include/Eigen/src/Core/products/TriangularMatrixVector.h
index 6117d5a8..7c014b72 100644
--- a/nuparu/include/Eigen/src/Core/products/TriangularMatrixVector.h
+++ b/nuparu/include/Eigen/src/Core/products/TriangularMatrixVector.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_TRIANGULARMATRIXVECTOR_H
 #define EIGEN_TRIANGULARMATRIXVECTOR_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
@@ -43,7 +43,7 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
     typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap;
     const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride));
     typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs);
-    
+
     typedef Map<const Matrix<RhsScalar,Dynamic,1>, 0, InnerStride<> > RhsMap;
     const RhsMap rhs(_rhs,cols,InnerStride<>(rhsIncr));
     typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs);
@@ -51,6 +51,9 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
     typedef Map<Matrix<ResScalar,Dynamic,1> > ResMap;
     ResMap res(_res,rows);
 
+    typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
+    typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
+
     for (Index pi=0; pi<size; pi+=PanelWidth)
     {
       Index actualPanelWidth = (std::min)(PanelWidth, size-pi);
@@ -68,19 +71,19 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
       if (r>0)
       {
         Index s = IsLower ? pi+actualPanelWidth : 0;
-        general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjLhs,RhsScalar,ConjRhs,BuiltIn>::run(
+        general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs,BuiltIn>::run(
             r, actualPanelWidth,
-            &lhs.coeffRef(s,pi), lhsStride,
-            &rhs.coeffRef(pi), rhsIncr,
+            LhsMapper(&lhs.coeffRef(s,pi), lhsStride),
+            RhsMapper(&rhs.coeffRef(pi), rhsIncr),
             &res.coeffRef(s), resIncr, alpha);
       }
     }
     if((!IsLower) && cols>size)
     {
-      general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjLhs,RhsScalar,ConjRhs>::run(
+      general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs>::run(
           rows, cols-size,
-          &lhs.coeffRef(0,size), lhsStride,
-          &rhs.coeffRef(size), rhsIncr,
+          LhsMapper(&lhs.coeffRef(0,size), lhsStride),
+          RhsMapper(&rhs.coeffRef(size), rhsIncr),
           _res, resIncr, alpha);
     }
   }
@@ -118,7 +121,10 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
 
     typedef Map<Matrix<ResScalar,Dynamic,1>, 0, InnerStride<> > ResMap;
     ResMap res(_res,rows,InnerStride<>(resIncr));
-    
+
+    typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
+    typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
+
     for (Index pi=0; pi<diagSize; pi+=PanelWidth)
     {
       Index actualPanelWidth = (std::min)(PanelWidth, diagSize-pi);
@@ -136,19 +142,19 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
       if (r>0)
       {
         Index s = IsLower ? 0 : pi + actualPanelWidth;
-        general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjLhs,RhsScalar,ConjRhs,BuiltIn>::run(
+        general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs,BuiltIn>::run(
             actualPanelWidth, r,
-            &lhs.coeffRef(pi,s), lhsStride,
-            &rhs.coeffRef(s), rhsIncr,
+            LhsMapper(&lhs.coeffRef(pi,s), lhsStride),
+            RhsMapper(&rhs.coeffRef(s), rhsIncr),
             &res.coeffRef(pi), resIncr, alpha);
       }
     }
     if(IsLower && rows>diagSize)
     {
-      general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjLhs,RhsScalar,ConjRhs>::run(
+      general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs>::run(
             rows-diagSize, cols,
-            &lhs.coeffRef(diagSize,0), lhsStride,
-            &rhs.coeffRef(0), rhsIncr,
+            LhsMapper(&lhs.coeffRef(diagSize,0), lhsStride),
+            RhsMapper(&rhs.coeffRef(0), rhsIncr),
             &res.coeffRef(diagSize), resIncr, alpha);
     }
   }
@@ -157,83 +163,66 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
 * Wrapper to product_triangular_vector
 ***************************************************************************/
 
-template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
-struct traits<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,true> >
- : traits<ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,true>, Lhs, Rhs> >
-{};
-
-template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
-struct traits<TriangularProduct<Mode,LhsIsTriangular,Lhs,true,Rhs,false> >
- : traits<ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,true,Rhs,false>, Lhs, Rhs> >
-{};
-
-
-template<int StorageOrder>
+template<int Mode,int StorageOrder>
 struct trmv_selector;
 
 } // end namespace internal
 
+namespace internal {
+
 template<int Mode, typename Lhs, typename Rhs>
-struct TriangularProduct<Mode,true,Lhs,false,Rhs,true>
-  : public ProductBase<TriangularProduct<Mode,true,Lhs,false,Rhs,true>, Lhs, Rhs >
+struct triangular_product_impl<Mode,true,Lhs,false,Rhs,true>
 {
-  EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct)
-
-  TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
-
-  template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
+  template<typename Dest> static void run(Dest& dst, const Lhs &lhs, const Rhs &rhs, const typename Dest::Scalar& alpha)
   {
-    eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
+    eigen_assert(dst.rows()==lhs.rows() && dst.cols()==rhs.cols());
   
-    internal::trmv_selector<(int(internal::traits<Lhs>::Flags)&RowMajorBit) ? RowMajor : ColMajor>::run(*this, dst, alpha);
+    internal::trmv_selector<Mode,(int(internal::traits<Lhs>::Flags)&RowMajorBit) ? RowMajor : ColMajor>::run(lhs, rhs, dst, alpha);
   }
 };
 
 template<int Mode, typename Lhs, typename Rhs>
-struct TriangularProduct<Mode,false,Lhs,true,Rhs,false>
-  : public ProductBase<TriangularProduct<Mode,false,Lhs,true,Rhs,false>, Lhs, Rhs >
+struct triangular_product_impl<Mode,false,Lhs,true,Rhs,false>
 {
-  EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct)
-
-  TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
-
-  template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
+  template<typename Dest> static void run(Dest& dst, const Lhs &lhs, const Rhs &rhs, const typename Dest::Scalar& alpha)
   {
-    eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
+    eigen_assert(dst.rows()==lhs.rows() && dst.cols()==rhs.cols());
 
-    typedef TriangularProduct<(Mode & (UnitDiag|ZeroDiag)) | ((Mode & Lower) ? Upper : Lower),true,Transpose<const Rhs>,false,Transpose<const Lhs>,true> TriangularProductTranspose;
     Transpose<Dest> dstT(dst);
-    internal::trmv_selector<(int(internal::traits<Rhs>::Flags)&RowMajorBit) ? ColMajor : RowMajor>::run(
-      TriangularProductTranspose(m_rhs.transpose(),m_lhs.transpose()), dstT, alpha);
+    internal::trmv_selector<(Mode & (UnitDiag|ZeroDiag)) | ((Mode & Lower) ? Upper : Lower),
+                            (int(internal::traits<Rhs>::Flags)&RowMajorBit) ? ColMajor : RowMajor>
+            ::run(rhs.transpose(),lhs.transpose(), dstT, alpha);
   }
 };
 
+} // end namespace internal
+
 namespace internal {
 
 // TODO: find a way to factorize this piece of code with gemv_selector since the logic is exactly the same.
   
-template<> struct trmv_selector<ColMajor>
+template<int Mode> struct trmv_selector<Mode,ColMajor>
 {
-  template<int Mode, typename Lhs, typename Rhs, typename Dest>
-  static void run(const TriangularProduct<Mode,true,Lhs,false,Rhs,true>& prod, Dest& dest, const typename TriangularProduct<Mode,true,Lhs,false,Rhs,true>::Scalar& alpha)
+  template<typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
   {
-    typedef TriangularProduct<Mode,true,Lhs,false,Rhs,true> ProductType;
-    typedef typename ProductType::Index Index;
-    typedef typename ProductType::LhsScalar   LhsScalar;
-    typedef typename ProductType::RhsScalar   RhsScalar;
-    typedef typename ProductType::Scalar      ResScalar;
-    typedef typename ProductType::RealScalar  RealScalar;
-    typedef typename ProductType::ActualLhsType ActualLhsType;
-    typedef typename ProductType::ActualRhsType ActualRhsType;
-    typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
-    typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
+    typedef typename Lhs::Scalar      LhsScalar;
+    typedef typename Rhs::Scalar      RhsScalar;
+    typedef typename Dest::Scalar     ResScalar;
+    typedef typename Dest::RealScalar RealScalar;
+    
+    typedef internal::blas_traits<Lhs> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+    typedef internal::blas_traits<Rhs> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+    
     typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
 
-    typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
-    typename internal::add_const_on_value_type<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
+    typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
+    typename internal::add_const_on_value_type<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);
 
-    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
-                                  * RhsBlasTraits::extractScalarFactor(prod.rhs());
+    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs)
+                                  * RhsBlasTraits::extractScalarFactor(rhs);
 
     enum {
       // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
@@ -247,7 +236,7 @@ template<> struct trmv_selector<ColMajor>
 
     bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
     bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
-    
+
     RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);
 
     ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
@@ -267,7 +256,7 @@ template<> struct trmv_selector<ColMajor>
       else
         MappedDest(actualDestPtr, dest.size()) = dest;
     }
-    
+
     internal::triangular_matrix_vector_product
       <Index,Mode,
        LhsScalar, LhsBlasTraits::NeedToConjugate,
@@ -288,33 +277,32 @@ template<> struct trmv_selector<ColMajor>
   }
 };
 
-template<> struct trmv_selector<RowMajor>
+template<int Mode> struct trmv_selector<Mode,RowMajor>
 {
-  template<int Mode, typename Lhs, typename Rhs, typename Dest>
-  static void run(const TriangularProduct<Mode,true,Lhs,false,Rhs,true>& prod, Dest& dest, const typename TriangularProduct<Mode,true,Lhs,false,Rhs,true>::Scalar& alpha)
+  template<typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
   {
-    typedef TriangularProduct<Mode,true,Lhs,false,Rhs,true> ProductType;
-    typedef typename ProductType::LhsScalar LhsScalar;
-    typedef typename ProductType::RhsScalar RhsScalar;
-    typedef typename ProductType::Scalar    ResScalar;
-    typedef typename ProductType::Index Index;
-    typedef typename ProductType::ActualLhsType ActualLhsType;
-    typedef typename ProductType::ActualRhsType ActualRhsType;
-    typedef typename ProductType::_ActualRhsType _ActualRhsType;
-    typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
-    typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
-
-    typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
-    typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
-
-    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
-                                  * RhsBlasTraits::extractScalarFactor(prod.rhs());
+    typedef typename Lhs::Scalar      LhsScalar;
+    typedef typename Rhs::Scalar      RhsScalar;
+    typedef typename Dest::Scalar     ResScalar;
+    
+    typedef internal::blas_traits<Lhs> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+    typedef internal::blas_traits<Rhs> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+    typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
+
+    typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
+    typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);
+
+    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs)
+                                  * RhsBlasTraits::extractScalarFactor(rhs);
 
     enum {
-      DirectlyUseRhs = _ActualRhsType::InnerStrideAtCompileTime==1
+      DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1
     };
 
-    gemv_static_vector_if<RhsScalar,_ActualRhsType::SizeAtCompileTime,_ActualRhsType::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
+    gemv_static_vector_if<RhsScalar,ActualRhsTypeCleaned::SizeAtCompileTime,ActualRhsTypeCleaned::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
 
     ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,actualRhs.size(),
         DirectlyUseRhs ? const_cast<RhsScalar*>(actualRhs.data()) : static_rhs.data());
@@ -322,12 +310,12 @@ template<> struct trmv_selector<RowMajor>
     if(!DirectlyUseRhs)
     {
       #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      int size = actualRhs.size();
+      Index size = actualRhs.size();
       EIGEN_DENSE_STORAGE_CTOR_PLUGIN
       #endif
-      Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
+      Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
     }
-    
+
     internal::triangular_matrix_vector_product
       <Index,Mode,
        LhsScalar, LhsBlasTraits::NeedToConjugate,
diff --git a/nuparu/include/Eigen/src/Core/products/TriangularMatrixVector_MKL.h b/nuparu/include/Eigen/src/Core/products/TriangularMatrixVector_MKL.h
index 09f110da..3672b124 100644
--- a/nuparu/include/Eigen/src/Core/products/TriangularMatrixVector_MKL.h
+++ b/nuparu/include/Eigen/src/Core/products/TriangularMatrixVector_MKL.h
@@ -129,7 +129,6 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
    MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \
 /* Non-square case - doesn't fit to MKL ?TRMV. Fall to default triangular product*/ \
    if (size<(std::max)(rows,cols)) { \
-     typedef Matrix<EIGTYPE, Dynamic, Dynamic> MatrixLhs; \
      if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
      x = x_tmp.data(); \
      if (size<rows) { \
@@ -214,7 +213,6 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
    MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \
 /* Non-square case - doesn't fit to MKL ?TRMV. Fall to default triangular product*/ \
    if (size<(std::max)(rows,cols)) { \
-     typedef Matrix<EIGTYPE, Dynamic, Dynamic> MatrixLhs; \
      if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
      x = x_tmp.data(); \
      if (size<rows) { \
diff --git a/nuparu/include/Eigen/src/Core/products/TriangularSolverMatrix.h b/nuparu/include/Eigen/src/Core/products/TriangularSolverMatrix.h
index f103eae7..20859371 100644
--- a/nuparu/include/Eigen/src/Core/products/TriangularSolverMatrix.h
+++ b/nuparu/include/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -52,10 +52,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
     level3_blocking<Scalar,Scalar>& blocking)
   {
     Index cols = otherSize;
-    const_blas_data_mapper<Scalar, Index, TriStorageOrder> tri(_tri,triStride);
-    blas_data_mapper<Scalar, Index, ColMajor> other(_other,otherStride);
+
+    typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> TriMapper;
+    typedef blas_data_mapper<Scalar, Index, ColMajor> OtherMapper;
+    TriMapper tri(_tri, triStride);
+    OtherMapper other(_other, otherStride);
 
     typedef gebp_traits<Scalar,Scalar> Traits;
+
     enum {
       SmallPanelWidth   = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
       IsLower = (Mode&Lower) == Lower
@@ -66,21 +70,19 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
 
     std::size_t sizeA = kc*mc;
     std::size_t sizeB = kc*cols;
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
 
     ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
     ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
 
     conj_if<Conjugate> conj;
-    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
-    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs;
-    gemm_pack_rhs<Scalar, Index, Traits::nr, ColMajor, false, true> pack_rhs;
+    gebp_kernel<Scalar, Scalar, Index, OtherMapper, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
+    gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs;
+    gemm_pack_rhs<Scalar, Index, OtherMapper, Traits::nr, ColMajor, false, true> pack_rhs;
 
     // the goal here is to subdivise the Rhs panels such that we keep some cache
     // coherence when accessing the rhs elements
-    std::ptrdiff_t l1, l2;
-    manage_caching_sizes(GetAction, &l1, &l2);
+    std::ptrdiff_t l1, l2, l3;
+    manage_caching_sizes(GetAction, &l1, &l2, &l3);
     Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * otherStride) : 0;
     subcols = std::max<Index>((subcols/Traits::nr)*Traits::nr, Traits::nr);
 
@@ -115,8 +117,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
           {
             // TODO write a small kernel handling this (can be shared with trsv)
             Index i  = IsLower ? k2+k1+k : k2-k1-k-1;
-            Index s  = IsLower ? k2+k1 : i+1;
             Index rs = actualPanelWidth - k - 1; // remaining size
+            Index s  = TriStorageOrder==RowMajor ? (IsLower ? k2+k1 : i+1)
+                                                 :  IsLower ? i+1 : i-rs;
 
             Scalar a = (Mode & UnitDiag) ? Scalar(1) : Scalar(1)/conj(tri(i,i));
             for (Index j=j2; j<j2+actual_cols; ++j)
@@ -133,7 +136,6 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
               }
               else
               {
-                Index s = IsLower ? i+1 : i-rs;
                 Scalar b = (other(i,j) *= a);
                 Scalar* r = &other(s,j);
                 const Scalar* l = &tri(s,i);
@@ -148,17 +150,17 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
           Index blockBOffset = IsLower ? k1 : lengthTarget;
 
           // update the respective rows of B from other
-          pack_rhs(blockB+actual_kc*j2, &other(startBlock,j2), otherStride, actualPanelWidth, actual_cols, actual_kc, blockBOffset);
+          pack_rhs(blockB+actual_kc*j2, other.getSubMapper(startBlock,j2), actualPanelWidth, actual_cols, actual_kc, blockBOffset);
 
           // GEBP
           if (lengthTarget>0)
           {
             Index startTarget  = IsLower ? k2+k1+actualPanelWidth : k2-actual_kc;
 
-            pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget);
+            pack_lhs(blockA, tri.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget);
 
-            gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
-                        actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
+            gebp_kernel(other.getSubMapper(startTarget,j2), blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
+                        actualPanelWidth, actual_kc, 0, blockBOffset);
           }
         }
       }
@@ -172,9 +174,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
           const Index actual_mc = (std::min)(mc,end-i2);
           if (actual_mc>0)
           {
-            pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc);
+            pack_lhs(blockA, tri.getSubMapper(i2, IsLower ? k2 : k2-kc), actual_kc, actual_mc);
 
-            gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0, blockW);
+            gebp_kernel(other.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0);
           }
         }
       }
@@ -200,8 +202,11 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
     level3_blocking<Scalar,Scalar>& blocking)
   {
     Index rows = otherSize;
-    const_blas_data_mapper<Scalar, Index, TriStorageOrder> rhs(_tri,triStride);
-    blas_data_mapper<Scalar, Index, ColMajor> lhs(_other,otherStride);
+
+    typedef blas_data_mapper<Scalar, Index, ColMajor> LhsMapper;
+    typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> RhsMapper;
+    LhsMapper lhs(_other, otherStride);
+    RhsMapper rhs(_tri, triStride);
 
     typedef gebp_traits<Scalar,Scalar> Traits;
     enum {
@@ -215,17 +220,15 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
 
     std::size_t sizeA = kc*mc;
     std::size_t sizeB = kc*size;
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
 
     ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
     ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
 
     conj_if<Conjugate> conj;
-    gebp_kernel<Scalar,Scalar, Index, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
-    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
-    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
-    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
+    gebp_kernel<Scalar, Scalar, Index, LhsMapper, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
+    gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
+    gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder,false,true> pack_rhs_panel;
+    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
 
     for(Index k2=IsLower ? size : 0;
         IsLower ? k2>0 : k2<size;
@@ -238,7 +241,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
       Index rs = IsLower ? actual_k2 : size - actual_k2 - actual_kc;
       Scalar* geb = blockB+actual_kc*actual_kc;
 
-      if (rs>0) pack_rhs(geb, &rhs(actual_k2,startPanel), triStride, actual_kc, rs);
+      if (rs>0) pack_rhs(geb, rhs.getSubMapper(actual_k2,startPanel), actual_kc, rs);
 
       // triangular packing (we only pack the panels off the diagonal,
       // neglecting the blocks overlapping the diagonal
@@ -252,7 +255,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
 
           if (panelLength>0)
           pack_rhs_panel(blockB+j2*actual_kc,
-                         &rhs(actual_k2+panelOffset, actual_j2), triStride,
+                         rhs.getSubMapper(actual_k2+panelOffset, actual_j2),
                          panelLength, actualPanelWidth,
                          actual_kc, panelOffset);
         }
@@ -280,13 +283,12 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
             // GEBP
             if(panelLength>0)
             {
-              gebp_kernel(&lhs(i2,absolute_j2), otherStride,
+              gebp_kernel(lhs.getSubMapper(i2,absolute_j2),
                           blockA, blockB+j2*actual_kc,
                           actual_mc, panelLength, actualPanelWidth,
                           Scalar(-1),
                           actual_kc, actual_kc, // strides
-                          panelOffset, panelOffset, // offsets
-                          blockW);  // workspace
+                          panelOffset, panelOffset); // offsets
             }
 
             // unblocked triangular solve
@@ -302,22 +304,25 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
                 for (Index i=0; i<actual_mc; ++i)
                   r[i] -= a[i] * b;
               }
-              Scalar b = (Mode & UnitDiag) ? Scalar(1) : Scalar(1)/conj(rhs(j,j));
-              for (Index i=0; i<actual_mc; ++i)
-                r[i] *= b;
+              if((Mode & UnitDiag)==0)
+              {
+                Scalar b = conj(rhs(j,j));
+                for (Index i=0; i<actual_mc; ++i)
+                  r[i] /= b;
+              }
             }
 
             // pack the just computed part of lhs to A
-            pack_lhs_panel(blockA, _other+absolute_j2*otherStride+i2, otherStride,
+            pack_lhs_panel(blockA, LhsMapper(_other+absolute_j2*otherStride+i2, otherStride),
                            actualPanelWidth, actual_mc,
                            actual_kc, j2);
           }
         }
 
         if (rs>0)
-          gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb,
+          gebp_kernel(lhs.getSubMapper(i2, startPanel), blockA, geb,
                       actual_mc, actual_kc, rs, Scalar(-1),
-                      -1, -1, 0, 0, blockW);
+                      -1, -1, 0, 0);
       }
     }
   }
diff --git a/nuparu/include/Eigen/src/Core/products/TriangularSolverVector.h b/nuparu/include/Eigen/src/Core/products/TriangularSolverVector.h
index ce4d1008..b994759b 100644
--- a/nuparu/include/Eigen/src/Core/products/TriangularSolverVector.h
+++ b/nuparu/include/Eigen/src/Core/products/TriangularSolverVector.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_TRIANGULAR_SOLVER_VECTOR_H
 #define EIGEN_TRIANGULAR_SOLVER_VECTOR_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
@@ -25,7 +25,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheRight, Mode, Co
       >::run(size, _lhs, lhsStride, rhs);
   }
 };
-    
+
 // forward and backward substitution, row-major, rhs is a vector
 template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate>
 struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Conjugate, RowMajor>
@@ -37,6 +37,10 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
   {
     typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,RowMajor>, 0, OuterStride<> > LhsMap;
     const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride));
+
+    typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
+    typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper;
+
     typename internal::conditional<
                           Conjugate,
                           const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>,
@@ -58,10 +62,10 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
         Index startRow = IsLower ? pi : pi-actualPanelWidth;
         Index startCol = IsLower ? 0 : pi;
 
-        general_matrix_vector_product<Index,LhsScalar,RowMajor,Conjugate,RhsScalar,false>::run(
+        general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,Conjugate,RhsScalar,RhsMapper,false>::run(
           actualPanelWidth, r,
-          &lhs.coeffRef(startRow,startCol), lhsStride,
-          rhs + startCol, 1,
+          LhsMapper(&lhs.coeffRef(startRow,startCol), lhsStride),
+          RhsMapper(rhs + startCol, 1),
           rhs + startRow, 1,
           RhsScalar(-1));
       }
@@ -72,7 +76,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
         Index s = IsLower ? pi   : i+1;
         if (k>0)
           rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map<const Matrix<RhsScalar,Dynamic,1> >(rhs+s,k))).sum();
-        
+
         if(!(Mode & UnitDiag))
           rhs[i] /= cjLhs(i,i);
       }
@@ -91,6 +95,8 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
   {
     typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap;
     const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride));
+    typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
+    typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper;
     typename internal::conditional<Conjugate,
                                    const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>,
                                    const LhsMap&
@@ -122,10 +128,10 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
         // let's directly call the low level product function because:
         // 1 - it is faster to compile
         // 2 - it is slighlty faster at runtime
-        general_matrix_vector_product<Index,LhsScalar,ColMajor,Conjugate,RhsScalar,false>::run(
+        general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,Conjugate,RhsScalar,RhsMapper,false>::run(
             r, actualPanelWidth,
-            &lhs.coeffRef(endBlock,startBlock), lhsStride,
-            rhs+startBlock, 1,
+            LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride),
+            RhsMapper(rhs+startBlock, 1),
             rhs+endBlock, 1, RhsScalar(-1));
       }
     }
diff --git a/nuparu/include/Eigen/src/Core/util/BlasUtil.h b/nuparu/include/Eigen/src/Core/util/BlasUtil.h
index a28f16fa..d00fa970 100644
--- a/nuparu/include/Eigen/src/Core/util/BlasUtil.h
+++ b/nuparu/include/Eigen/src/Core/util/BlasUtil.h
@@ -18,13 +18,13 @@ namespace Eigen {
 namespace internal {
 
 // forward declarations
-template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false>
+template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false>
 struct gebp_kernel;
 
-template<typename Scalar, typename Index, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
+template<typename Scalar, typename Index, typename DataMapper, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
 struct gemm_pack_rhs;
 
-template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
 struct gemm_pack_lhs;
 
 template<
@@ -34,7 +34,9 @@ template<
   int ResStorageOrder>
 struct general_matrix_matrix_product;
 
-template<typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version=Specialized>
+template<typename Index,
+         typename LhsScalar, typename LhsMapper, int LhsStorageOrder, bool ConjugateLhs,
+         typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version=Specialized>
 struct general_matrix_vector_product;
 
 
@@ -56,8 +58,8 @@ template<> struct conj_if<false> {
 
 template<typename Scalar> struct conj_helper<Scalar,Scalar,false,false>
 {
-  EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const { return internal::pmadd(x,y,c); }
-  EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const { return internal::pmul(x,y); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const { return internal::pmadd(x,y,c); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const { return internal::pmul(x,y); }
 };
 
 template<typename RealScalar> struct conj_helper<std::complex<RealScalar>, std::complex<RealScalar>, false,true>
@@ -109,39 +111,142 @@ template<typename RealScalar,bool Conj> struct conj_helper<RealScalar, std::comp
 };
 
 template<typename From,typename To> struct get_factor {
-  static EIGEN_STRONG_INLINE To run(const From& x) { return x; }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE To run(const From& x) { return x; }
 };
 
 template<typename Scalar> struct get_factor<Scalar,typename NumTraits<Scalar>::Real> {
+  EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE typename NumTraits<Scalar>::Real run(const Scalar& x) { return numext::real(x); }
 };
 
+
+template<typename Scalar, typename Index>
+class BlasVectorMapper {  // Thin indexed accessor over a raw Scalar pointer; stores only the pointer.
+  public:
+  EIGEN_ALWAYS_INLINE BlasVectorMapper(Scalar *data) : m_data(data) {}
+  // Returns a copy of the element at offset i from the wrapped pointer.
+  EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
+    return m_data[i];
+  }
+  template <typename Packet, int AlignmentType>
+  EIGEN_ALWAYS_INLINE Packet load(Index i) const {  // Packet load starting at offset i, via ploadt.
+    return ploadt<Packet, AlignmentType>(m_data + i);
+  }
+  // True when the address of element i is a multiple of sizeof(Packet).
+  template <typename Packet>
+  bool aligned(Index i) const {
+    return (size_t(m_data+i)%sizeof(Packet))==0;
+  }
+  // Wrapped base pointer; this class neither allocates nor frees it.
+  protected:
+  Scalar* m_data;
+};
+
+template<typename Scalar, typename Index, int AlignmentType>
+class BlasLinearMapper {  // 1-D accessor over a raw Scalar pointer; AlignmentType is forwarded to ploadt/pstoret.
+  public:
+  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename packet_traits<Scalar>::half HalfPacket;
+  // Wraps `data` without copying; only the pointer is stored.
+  EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {}
+  // Prefetches the element at offset i. Takes Index (not int) so the offset is not narrowed before indexing.
+  EIGEN_ALWAYS_INLINE void prefetch(Index i) const {
+    internal::prefetch(&operator()(i));
+  }
+  // Returns a reference to the element at offset i.
+  EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const {
+    return m_data[i];
+  }
+  // Loads a Packet starting at offset i.
+  EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
+    return ploadt<Packet, AlignmentType>(m_data + i);
+  }
+  // Loads a HalfPacket starting at offset i.
+  EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
+    return ploadt<HalfPacket, AlignmentType>(m_data + i);
+  }
+  // Stores packet p starting at offset i.
+  EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet &p) const {
+    pstoret<Scalar, Packet, AlignmentType>(m_data + i, p);
+  }
+  // Wrapped base pointer; this class neither allocates nor frees it.
+  protected:
+  Scalar *m_data;
+};
+
 // Lightweight helper class to access matrix coefficients.
-// Yes, this is somehow redundant with Map<>, but this version is much much lighter,
-// and so I hope better compilation performance (time and code quality).
-template<typename Scalar, typename Index, int StorageOrder>
-class blas_data_mapper
-{
+template<typename Scalar, typename Index, int StorageOrder, int AlignmentType = Unaligned>
+class blas_data_mapper {  // 2-D strided accessor over a raw Scalar pointer; AlignmentType is forwarded to ploadt.
   public:
-    blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
-    EIGEN_STRONG_INLINE Scalar& operator()(Index i, Index j)
-    { return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; }
+  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename packet_traits<Scalar>::half HalfPacket;
+  // Mapper types returned by getLinearMapper() and getVectorMapper().
+  typedef BlasLinearMapper<Scalar, Index, AlignmentType> LinearMapper;
+  typedef BlasVectorMapper<Scalar, Index> VectorMapper;
+  // Wraps `data` without copying; `stride` multiplies the row index (RowMajor) or the column index (otherwise).
+  EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
+  // Returns a mapper with the same stride whose (0, 0) is this mapper's element (i, j).
+  EIGEN_ALWAYS_INLINE blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType>
+  getSubMapper(Index i, Index j) const {
+    return blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType>(&operator()(i, j), m_stride);
+  }
+  // Returns a LinearMapper whose base pointer is the address of element (i, j).
+  EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
+    return LinearMapper(&operator()(i, j));
+  }
+  // Returns a VectorMapper whose base pointer is the address of element (i, j).
+  EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
+    return VectorMapper(&operator()(i, j));
+  }
+
+  // Element (i, j): offset j + i*m_stride when StorageOrder==RowMajor, i + j*m_stride otherwise.
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const {
+    return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride];
+  }
+  // Loads a Packet starting at the address of element (i, j).
+  EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
+    return ploadt<Packet, AlignmentType>(&operator()(i, j));
+  }
+  // Loads a HalfPacket starting at the address of element (i, j).
+  EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
+    return ploadt<HalfPacket, AlignmentType>(&operator()(i, j));
+  }
+  // Scatters packet p starting at element (i, j); m_stride is handed to pscatter as its stride argument.
+  template<typename SubPacket>
+  EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const {
+    pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
+  }
+  // Gathers a SubPacket starting at element (i, j); m_stride is handed to pgather as its stride argument.
+  template<typename SubPacket>
+  EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const {
+    return pgather<Scalar, SubPacket>(&operator()(i, j), m_stride);
+  }
+  // Raw accessors for the stored stride and base pointer.
+  const Index stride() const { return m_stride; }
+  const Scalar* data() const { return m_data; }
+  // Returns -1 when the address in m_data is not a multiple of sizeof(Scalar); otherwise defers to first_default_aligned().
+  Index firstAligned(Index size) const {
+    if (size_t(m_data)%sizeof(Scalar)) {
+      return -1;
+    }
+    return internal::first_default_aligned(m_data, size);
+  }
+  // Storage: base pointer and stride; the stride is const, so it is fixed at construction.
   protected:
-    Scalar* EIGEN_RESTRICT m_data;
-    Index m_stride;
+  Scalar* EIGEN_RESTRICT m_data;
+  const Index m_stride;
 };
 
 // lightweight helper class to access matrix coefficients (const version)
 template<typename Scalar, typename Index, int StorageOrder>
-class const_blas_data_mapper
-{
+class const_blas_data_mapper : public blas_data_mapper<const Scalar, Index, StorageOrder> {
   public:
-    const_blas_data_mapper(const Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
-    EIGEN_STRONG_INLINE const Scalar& operator()(Index i, Index j) const
-    { return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; }
-  protected:
-    const Scalar* EIGEN_RESTRICT m_data;
-    Index m_stride;
+  EIGEN_ALWAYS_INLINE const_blas_data_mapper(const Scalar *data, Index stride) : blas_data_mapper<const Scalar, Index, StorageOrder>(data, stride) {}
+  // Like the base-class getSubMapper(), but the result is typed as const_blas_data_mapper rather than the base type.
+  EIGEN_ALWAYS_INLINE const_blas_data_mapper<Scalar, Index, StorageOrder> getSubMapper(Index i, Index j) const {
+    return const_blas_data_mapper<Scalar, Index, StorageOrder>(&(this->operator()(i, j)), this->m_stride);
+  }
 };
 
 
@@ -230,7 +335,7 @@ struct blas_traits<Transpose<NestedXpr> >
   enum {
     IsTransposed = Base::IsTransposed ? 0 : 1
   };
-  static inline ExtractType extract(const XprType& x) { return Base::extract(x.nestedExpression()); }
+  static inline ExtractType extract(const XprType& x) { return ExtractType(Base::extract(x.nestedExpression())); }
   static inline Scalar extractScalarFactor(const XprType& x) { return Base::extractScalarFactor(x.nestedExpression()); }
 };
 
diff --git a/nuparu/include/Eigen/src/Core/util/Constants.h b/nuparu/include/Eigen/src/Core/util/Constants.h
index 14b9624e..a364f48d 100644
--- a/nuparu/include/Eigen/src/Core/util/Constants.h
+++ b/nuparu/include/Eigen/src/Core/util/Constants.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2007-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -30,6 +30,14 @@ const int DynamicIndex = 0xffffff;
   */
 const int Infinity = -1;
 
+/** This value means that the cost to evaluate an expression coefficient is either very expensive or
+  * cannot be known at compile time.
+  *
+  * This value has to be positive to (1) simplify cost computation, and (2) allow distinguishing between very expensive and very very expensive expressions.
+  * It thus must also be large enough to make sure unrolling won't happen and that sub expressions will be evaluated, but not too large to avoid overflow.
+  */
+const int HugeCost = 10000;
+
 /** \defgroup flags Flags
   * \ingroup Core_Module
   *
@@ -53,14 +61,13 @@ const int Infinity = -1;
 const unsigned int RowMajorBit = 0x1;
 
 /** \ingroup flags
-  *
   * means the expression should be evaluated by the calling expression */
 const unsigned int EvalBeforeNestingBit = 0x2;
 
 /** \ingroup flags
-  *
+  * \deprecated
   * means the expression should be evaluated before any assignment */
-const unsigned int EvalBeforeAssigningBit = 0x4;
+const unsigned int EvalBeforeAssigningBit = 0x4; // FIXME deprecated
 
 /** \ingroup flags
   *
@@ -141,13 +148,43 @@ const unsigned int LvalueBit = 0x20;
   */
 const unsigned int DirectAccessBit = 0x40;
 
-/** \ingroup flags
+/** \deprecated \ingroup flags
   *
-  * means the first coefficient packet is guaranteed to be aligned */
+  * means the first coefficient packet is guaranteed to be aligned.
+  * An expression cannot have the AlignedBit without the PacketAccessBit flag.
+  * In other words, this means we are allowed to perform an aligned packet access to the first element regardless
+  * of the expression kind:
+  * \code
+  * expression.packet<Aligned>(0);
+  * \endcode
+  */
 const unsigned int AlignedBit = 0x80;
 
 const unsigned int NestByRefBit = 0x100;
 
+/** \ingroup flags
+  *
+  * for an expression, this means that the storage order
+  * can be either row-major or column-major.
+  * The precise choice will be decided at evaluation time or when
+  * combined with other expressions.
+  * \sa \ref RowMajorBit, \ref TopicStorageOrders */
+const unsigned int NoPreferredStorageOrderBit = 0x200;
+
+/** \ingroup flags
+  *
+  * Means that the underlying coefficients can be accessed through pointers to the sparse (un)compressed storage format,
+  * that is, the expression provides:
+  * \code
+    inline const Scalar* valuePtr() const;
+    inline const Index* innerIndexPtr() const;
+    inline const Index* outerIndexPtr() const;
+    inline const Index* innerNonZeroPtr() const;
+    \endcode
+  */
+const unsigned int CompressedAccessBit = 0x400;
+
+
 // list of flags that are inherited by default
 const unsigned int HereditaryBits = RowMajorBit
                                   | EvalBeforeNestingBit
@@ -160,8 +197,8 @@ const unsigned int HereditaryBits = RowMajorBit
   */
 
 /** \ingroup enums
-  * Enum containing possible values for the \p Mode parameter of 
-  * MatrixBase::selfadjointView() and MatrixBase::triangularView(). */
+  * Enum containing possible values for the \c Mode or \c UpLo parameter of
+  * MatrixBase::selfadjointView() and MatrixBase::triangularView(), and selfadjoint solvers. */
 enum {
   /** View matrix as a lower triangular matrix. */
   Lower=0x1,                      
@@ -186,12 +223,31 @@ enum {
 };
 
 /** \ingroup enums
-  * Enum for indicating whether an object is aligned or not. */
+  * Enum for indicating whether a buffer is aligned or not. */
 enum { 
-  /** Object is not correctly aligned for vectorization. */
-  Unaligned=0, 
-  /** Object is aligned for vectorization. */
-  Aligned=1 
+  Unaligned=0,        /**< Data pointer has no specific alignment. */
+  Aligned8=8,         /**< Data pointer is aligned on a 8 bytes boundary. */
+  Aligned16=16,       /**< Data pointer is aligned on a 16 bytes boundary. */
+  Aligned32=32,       /**< Data pointer is aligned on a 32 bytes boundary. */
+  Aligned64=64,       /**< Data pointer is aligned on a 64 bytes boundary. */
+  Aligned128=128,     /**< Data pointer is aligned on a 128 bytes boundary. */
+  AlignedMask=255,
+  Aligned=16,         /**< \deprecated Synonym for Aligned16. */
+#if EIGEN_MAX_ALIGN_BYTES==128
+  AlignedMax = Aligned128
+#elif EIGEN_MAX_ALIGN_BYTES==64
+  AlignedMax = Aligned64
+#elif EIGEN_MAX_ALIGN_BYTES==32
+  AlignedMax = Aligned32
+#elif EIGEN_MAX_ALIGN_BYTES==16
+  AlignedMax = Aligned16
+#elif EIGEN_MAX_ALIGN_BYTES==8
+  AlignedMax = Aligned8
+#elif EIGEN_MAX_ALIGN_BYTES==0
+  AlignedMax = Unaligned
+#else
+#error Invalid value for EIGEN_MAX_ALIGN_BYTES
+#endif
 };
 
 /** \ingroup enums
@@ -406,10 +462,16 @@ namespace Architecture
     Generic = 0x0,
     SSE = 0x1,
     AltiVec = 0x2,
+    VSX = 0x3,
+    NEON = 0x4,
 #if defined EIGEN_VECTORIZE_SSE
     Target = SSE
 #elif defined EIGEN_VECTORIZE_ALTIVEC
     Target = AltiVec
+#elif defined EIGEN_VECTORIZE_VSX
+    Target = VSX
+#elif defined EIGEN_VECTORIZE_NEON
+    Target = NEON
 #else
     Target = Generic
 #endif
@@ -417,8 +479,8 @@ namespace Architecture
 }
 
 /** \internal \ingroup enums
-  * Enum used as template parameter in GeneralProduct. */
-enum { CoeffBasedProductMode, LazyCoeffBasedProductMode, OuterProduct, InnerProduct, GemvProduct, GemmProduct };
+  * Enum used as template parameter in Product and product evaluators. */
+enum { DefaultProduct=0, LazyProduct, AliasFreeProduct, CoeffBasedProductMode, LazyCoeffBasedProductMode, OuterProduct, InnerProduct, GemvProduct, GemmProduct };
 
 /** \internal \ingroup enums
   * Enum used in experimental parallel implementation. */
@@ -427,12 +489,58 @@ enum Action {GetAction, SetAction};
 /** The type used to identify a dense storage. */
 struct Dense {};
 
+/** The type used to identify a general sparse storage. */
+struct Sparse {};
+
+/** The type used to identify a general solver (factored) storage. */
+struct SolverStorage {};
+
+/** The type used to identify a permutation storage. */
+struct PermutationStorage {};
+
+/** The type used to identify a transpositions storage. */
+struct TranspositionsStorage {};
+
 /** The type used to identify a matrix expression */
 struct MatrixXpr {};
 
 /** The type used to identify an array expression */
 struct ArrayXpr {};
 
+// An evaluator must define its shape. By default, it can be one of the following:
+struct DenseShape             { static std::string debugName() { return "DenseShape"; } };
+struct SolverShape            { static std::string debugName() { return "SolverShape"; } };
+struct HomogeneousShape       { static std::string debugName() { return "HomogeneousShape"; } };
+struct DiagonalShape          { static std::string debugName() { return "DiagonalShape"; } };
+struct BandShape              { static std::string debugName() { return "BandShape"; } };
+struct TriangularShape        { static std::string debugName() { return "TriangularShape"; } };
+struct SelfAdjointShape       { static std::string debugName() { return "SelfAdjointShape"; } };
+struct PermutationShape       { static std::string debugName() { return "PermutationShape"; } };
+struct TranspositionsShape    { static std::string debugName() { return "TranspositionsShape"; } };
+struct SparseShape            { static std::string debugName() { return "SparseShape"; } };
+
+namespace internal {
+
+  // random access iterators based on coeff*() accessors.
+struct IndexBased {};
+
+// evaluator based on iterators to access coefficients. 
+struct IteratorBased {};
+
+/** \internal
+ * Constants for comparison functors
+ */
+enum ComparisonName {
+  cmp_EQ = 0,
+  cmp_LT = 1,
+  cmp_LE = 2,
+  cmp_UNORD = 3,
+  cmp_NEQ = 4,
+  cmp_GT = 5,
+  cmp_GE = 6
+};
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_CONSTANTS_H
diff --git a/nuparu/include/Eigen/src/Core/util/DisableStupidWarnings.h b/nuparu/include/Eigen/src/Core/util/DisableStupidWarnings.h
index 6a0bf062..74723293 100644
--- a/nuparu/include/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/nuparu/include/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -10,6 +10,7 @@
   // 4244 - 'argument' : conversion from 'type1' to 'type2', possible loss of data
   // 4273 - QtAlignedMalloc, inconsistent DLL linkage
   // 4324 - structure was padded due to declspec(align())
+  // 4503 - decorated name length exceeded, name was truncated
   // 4512 - assignment operator could not be generated
   // 4522 - 'class' : multiple assignment operators specified
   // 4700 - uninitialized local variable 'xyz' used
@@ -17,17 +18,19 @@
   #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
     #pragma warning( push )
   #endif
-  #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4512 4522 4700 4717 )
+  #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4717 )
 #elif defined __INTEL_COMPILER
   // 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
   //        ICC 12 generates this warning even without any inline keyword, when defining class methods 'inline' i.e. inside of class body
   //        typedef that may be a reference type.
   // 279  - controlling expression is constant
   //        ICC 12 generates this warning on assert(constant_expression_depending_on_template_params) and frankly this is a legitimate use case.
+  // 1684 - conversion from pointer to same-sized integral type (potential portability problem)
+  // 2259 - non-pointer conversion from "Eigen::Index={ptrdiff_t={long}}" to "int" may lose significant bits
   #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
     #pragma warning push
   #endif
-  #pragma warning disable 2196 279
+  #pragma warning disable 2196 279 1684 2259
 #elif defined __clang__
   // -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant
   //     this is really a stupid warning as it warns on compile-time expressions involving enums
diff --git a/nuparu/include/Eigen/src/Core/util/ForwardDeclarations.h b/nuparu/include/Eigen/src/Core/util/ForwardDeclarations.h
index d6a81458..483af876 100644
--- a/nuparu/include/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/nuparu/include/Eigen/src/Core/util/ForwardDeclarations.h
@@ -36,6 +36,10 @@ template<typename Derived> struct accessors_level
   };
 };
 
+template<typename T> struct evaluator_traits;
+
+template< typename T> struct evaluator;
+
 } // end namespace internal
 
 template<typename T> struct NumTraits;
@@ -51,18 +55,18 @@ class DenseCoeffsBase;
 
 template<typename _Scalar, int _Rows, int _Cols,
          int _Options = AutoAlign |
-#if defined(__GNUC__) && __GNUC__==3 && __GNUC_MINOR__==4
+#if EIGEN_GNUC_AT(3,4)
     // workaround a bug in at least gcc 3.4.6
     // the innermost ?: ternary operator is misparsed. We write it slightly
     // differently and this makes gcc 3.4.6 happy, but it's ugly.
     // The error would only show up with EIGEN_DEFAULT_TO_ROW_MAJOR is defined
     // (when EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION is RowMajor)
-                          ( (_Rows==1 && _Cols!=1) ? RowMajor
+                          ( (_Rows==1 && _Cols!=1) ? Eigen::RowMajor
                           : !(_Cols==1 && _Rows!=1) ?  EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION
-                          : ColMajor ),
+                          : Eigen::ColMajor ),
 #else
-                          ( (_Rows==1 && _Cols!=1) ? RowMajor
-                          : (_Cols==1 && _Rows!=1) ? ColMajor
+                          ( (_Rows==1 && _Cols!=1) ? Eigen::RowMajor
+                          : (_Cols==1 && _Rows!=1) ? Eigen::ColMajor
                           : EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION ),
 #endif
          int _MaxRows = _Rows,
@@ -87,11 +91,15 @@ template<typename NullaryOp, typename MatrixType>         class CwiseNullaryOp;
 template<typename UnaryOp,   typename MatrixType>         class CwiseUnaryOp;
 template<typename ViewOp,    typename MatrixType>         class CwiseUnaryView;
 template<typename BinaryOp,  typename Lhs, typename Rhs>  class CwiseBinaryOp;
-template<typename BinOp,     typename Lhs, typename Rhs>  class SelfCwiseBinaryOp;
-template<typename Derived,   typename Lhs, typename Rhs>  class ProductBase;
-template<typename Lhs, typename Rhs, int Mode>            class GeneralProduct;
-template<typename Lhs, typename Rhs, int NestingFlags>    class CoeffBasedProduct;
+template<typename Decomposition, typename Rhstype>        class Solve;
+template<typename XprType>                                class Inverse;
+
+namespace internal {
+  template<typename Lhs, typename Rhs> struct product_tag;
+}
 
+template<typename Lhs, typename Rhs, int Option = DefaultProduct> class Product;
+         
 template<typename Derived> class DiagonalBase;
 template<typename _DiagonalVectorType> class DiagonalWrapper;
 template<typename _Scalar, int SizeAtCompileTime, int MaxSizeAtCompileTime=SizeAtCompileTime> class DiagonalMatrix;
@@ -108,7 +116,12 @@ template<typename Derived,
          int Level = internal::accessors_level<Derived>::has_write_access ? WriteAccessors : ReadOnlyAccessors
 > class MapBase;
 template<int InnerStrideAtCompileTime, int OuterStrideAtCompileTime> class Stride;
+template<int Value = Dynamic> class InnerStride;
+template<int Value = Dynamic> class OuterStride;
 template<typename MatrixType, int MapOptions=Unaligned, typename StrideType = Stride<0,0> > class Map;
+template<typename Derived> class RefBase;
+template<typename PlainObjectType, int Options = 0,
+         typename StrideType = typename internal::conditional<PlainObjectType::IsVectorAtCompileTime,InnerStride<1>,OuterStride<> >::type > class Ref;
 
 template<typename Derived> class TriangularBase;
 template<typename MatrixType, unsigned int Mode> class TriangularView;
@@ -119,10 +132,10 @@ template<typename MatrixType> struct CommaInitializer;
 template<typename Derived> class ReturnByValue;
 template<typename ExpressionType> class ArrayWrapper;
 template<typename ExpressionType> class MatrixWrapper;
+template<typename Derived> class SolverBase;
+template<typename XprType> class InnerIterator;
 
 namespace internal {
-template<typename DecompositionType, typename Rhs> struct solve_retval_base;
-template<typename DecompositionType, typename Rhs> struct solve_retval;
 template<typename DecompositionType> struct kernel_retval_base;
 template<typename DecompositionType> struct kernel_retval;
 template<typename DecompositionType> struct image_retval_base;
@@ -135,6 +148,21 @@ template<typename _Scalar, int Rows=Dynamic, int Cols=Dynamic, int Supers=Dynami
 
 namespace internal {
 template<typename Lhs, typename Rhs> struct product_type;
+
+template<bool> struct EnableIf;
+
+/** \internal
+  * \class product_evaluator
+  * Products need their own evaluator with more template arguments allowing for
+  * easier partial template specializations.
+  */
+template< typename T,
+          int ProductTag = internal::product_type<typename T::Lhs,typename T::Rhs>::ret,
+          typename LhsShape = typename evaluator_traits<typename T::Lhs>::Shape,
+          typename RhsShape = typename evaluator_traits<typename T::Rhs>::Shape,
+          typename LhsScalar = typename traits<typename T::Lhs>::Scalar,
+          typename RhsScalar = typename traits<typename T::Rhs>::Scalar
+        > struct product_evaluator;
 }
 
 template<typename Lhs, typename Rhs,
@@ -160,6 +188,7 @@ template<typename Scalar> struct scalar_imag_op;
 template<typename Scalar> struct scalar_abs_op;
 template<typename Scalar> struct scalar_abs2_op;
 template<typename Scalar> struct scalar_sqrt_op;
+template<typename Scalar> struct scalar_rsqrt_op;
 template<typename Scalar> struct scalar_exp_op;
 template<typename Scalar> struct scalar_log_op;
 template<typename Scalar> struct scalar_cos_op;
@@ -180,10 +209,12 @@ template<typename Scalar> struct scalar_random_op;
 template<typename Scalar> struct scalar_add_op;
 template<typename Scalar> struct scalar_constant_op;
 template<typename Scalar> struct scalar_identity_op;
+template<typename Scalar,bool iscpx> struct scalar_sign_op;
 
 template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_product_op;
 template<typename LhsScalar,typename RhsScalar> struct scalar_multiple2_op;
 template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_quotient_op;
+template<typename LhsScalar,typename RhsScalar> struct scalar_quotient2_op;
 
 } // end namespace internal
 
@@ -192,18 +223,18 @@ struct IOFormat;
 // Array module
 template<typename _Scalar, int _Rows, int _Cols,
          int _Options = AutoAlign |
-#if defined(__GNUC__) && __GNUC__==3 && __GNUC_MINOR__==4
+#if EIGEN_GNUC_AT(3,4)
     // workaround a bug in at least gcc 3.4.6
     // the innermost ?: ternary operator is misparsed. We write it slightly
     // differently and this makes gcc 3.4.6 happy, but it's ugly.
     // The error would only show up with EIGEN_DEFAULT_TO_ROW_MAJOR is defined
     // (when EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION is RowMajor)
-                          ( (_Rows==1 && _Cols!=1) ? RowMajor
+                          ( (_Rows==1 && _Cols!=1) ? Eigen::RowMajor
                           : !(_Cols==1 && _Rows!=1) ?  EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION
-                          : ColMajor ),
+                          : Eigen::ColMajor ),
 #else
-                          ( (_Rows==1 && _Cols!=1) ? RowMajor
-                          : (_Cols==1 && _Rows!=1) ? ColMajor
+                          ( (_Rows==1 && _Cols!=1) ? Eigen::RowMajor
+                          : (_Cols==1 && _Rows!=1) ? Eigen::ColMajor
                           : EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION ),
 #endif
          int _MaxRows = _Rows, int _MaxCols = _Cols> class Array;
@@ -222,6 +253,7 @@ template<typename MatrixType> class HouseholderQR;
 template<typename MatrixType> class ColPivHouseholderQR;
 template<typename MatrixType> class FullPivHouseholderQR;
 template<typename MatrixType, int QRPreconditioner = ColPivHouseholderQRPreconditioner> class JacobiSVD;
+template<typename MatrixType> class BDCSVD;
 template<typename MatrixType, int UpLo = Lower> class LLT;
 template<typename MatrixType, int UpLo = Lower> class LDLT;
 template<typename VectorsType, typename CoeffsType, int Side=OnTheLeft> class HouseholderSequence;
@@ -234,36 +266,16 @@ template<typename Derived> class QuaternionBase;
 template<typename Scalar> class Rotation2D;
 template<typename Scalar> class AngleAxis;
 template<typename Scalar,int Dim> class Translation;
-
-#ifdef EIGEN2_SUPPORT
-template<typename Derived, int _Dim> class eigen2_RotationBase;
-template<typename Lhs, typename Rhs> class eigen2_Cross;
-template<typename Scalar> class eigen2_Quaternion;
-template<typename Scalar> class eigen2_Rotation2D;
-template<typename Scalar> class eigen2_AngleAxis;
-template<typename Scalar,int Dim> class eigen2_Transform;
-template <typename _Scalar, int _AmbientDim> class eigen2_ParametrizedLine;
-template <typename _Scalar, int _AmbientDim> class eigen2_Hyperplane;
-template<typename Scalar,int Dim> class eigen2_Translation;
-template<typename Scalar,int Dim> class eigen2_Scaling;
-#endif
-
-#if EIGEN2_SUPPORT_STAGE < STAGE20_RESOLVE_API_CONFLICTS
-template<typename Scalar> class Quaternion;
-template<typename Scalar,int Dim> class Transform;
-template <typename _Scalar, int _AmbientDim> class ParametrizedLine;
-template <typename _Scalar, int _AmbientDim> class Hyperplane;
-template<typename Scalar,int Dim> class Scaling;
-#endif
-
-#if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
+template<typename Scalar,int Dim> class AlignedBox;
 template<typename Scalar, int Options = AutoAlign> class Quaternion;
 template<typename Scalar,int Dim,int Mode,int _Options=AutoAlign> class Transform;
 template <typename _Scalar, int _AmbientDim, int Options=AutoAlign> class ParametrizedLine;
 template <typename _Scalar, int _AmbientDim, int Options=AutoAlign> class Hyperplane;
 template<typename Scalar> class UniformScaling;
 template<typename MatrixType,int Direction> class Homogeneous;
-#endif
+
+// Sparse module:
+template<typename Derived> class SparseMatrixBase;
 
 // MatrixFunctions module
 template<typename Derived> struct MatrixExponentialReturnValue;
@@ -271,7 +283,7 @@ template<typename Derived> class MatrixFunctionReturnValue;
 template<typename Derived> class MatrixSquareRootReturnValue;
 template<typename Derived> class MatrixLogarithmReturnValue;
 template<typename Derived> class MatrixPowerReturnValue;
-template<typename Derived, typename Lhs, typename Rhs> class MatrixPowerProduct;
+template<typename Derived> class MatrixComplexPowerReturnValue;
 
 namespace internal {
 template <typename Scalar>
@@ -282,18 +294,6 @@ struct stem_function
 };
 }
 
-
-#ifdef EIGEN2_SUPPORT
-template<typename ExpressionType> class Cwise;
-template<typename MatrixType> class Minor;
-template<typename MatrixType> class LU;
-template<typename MatrixType> class QR;
-template<typename MatrixType> class SVD;
-namespace internal {
-template<typename MatrixType, unsigned int Mode> struct eigen2_part_return_type;
-}
-#endif
-
 } // end namespace Eigen
 
 #endif // EIGEN_FORWARDDECLARATIONS_H
diff --git a/nuparu/include/Eigen/src/Core/util/MKL_support.h b/nuparu/include/Eigen/src/Core/util/MKL_support.h
index 1e6e355d..1ef3b61d 100644
--- a/nuparu/include/Eigen/src/Core/util/MKL_support.h
+++ b/nuparu/include/Eigen/src/Core/util/MKL_support.h
@@ -54,11 +54,60 @@
 #endif
 
 #if defined EIGEN_USE_MKL
+#   include <mkl.h> 
+/*Check IMKL version for compatibility: < 10.3 is not usable with Eigen*/
+#   ifndef INTEL_MKL_VERSION
+#       undef EIGEN_USE_MKL /* INTEL_MKL_VERSION is not even defined on older versions */
+#   elif INTEL_MKL_VERSION < 100305    /* the intel-mkl-103-release-notes say this was when the lapacke.h interface was added*/
+#       undef EIGEN_USE_MKL
+#   endif
+#   ifndef EIGEN_USE_MKL
+    /*If the MKL version is too old, undef everything*/
+#       undef   EIGEN_USE_MKL_ALL
+#       undef   EIGEN_USE_BLAS
+#       undef   EIGEN_USE_LAPACKE
+#       undef   EIGEN_USE_MKL_VML
+#       undef   EIGEN_USE_LAPACKE_STRICT
+#       undef   EIGEN_USE_LAPACKE
+#   endif
+#endif
 
-#include <mkl.h>
+#if defined EIGEN_USE_MKL
 #include <mkl_lapacke.h>
 #define EIGEN_MKL_VML_THRESHOLD 128
 
+/* MKL_DOMAIN_BLAS, etc are defined only in 10.3 update 7 */
+/* MKL_BLAS, etc are not defined in 11.2 */
+#ifdef MKL_DOMAIN_ALL
+#define EIGEN_MKL_DOMAIN_ALL MKL_DOMAIN_ALL
+#else
+#define EIGEN_MKL_DOMAIN_ALL MKL_ALL
+#endif
+
+#ifdef MKL_DOMAIN_BLAS
+#define EIGEN_MKL_DOMAIN_BLAS MKL_DOMAIN_BLAS
+#else
+#define EIGEN_MKL_DOMAIN_BLAS MKL_BLAS
+#endif
+
+#ifdef MKL_DOMAIN_FFT
+#define EIGEN_MKL_DOMAIN_FFT MKL_DOMAIN_FFT
+#else
+#define EIGEN_MKL_DOMAIN_FFT MKL_FFT
+#endif
+
+#ifdef MKL_DOMAIN_VML
+#define EIGEN_MKL_DOMAIN_VML MKL_DOMAIN_VML
+#else
+#define EIGEN_MKL_DOMAIN_VML MKL_VML
+#endif
+
+#ifdef MKL_DOMAIN_PARDISO
+#define EIGEN_MKL_DOMAIN_PARDISO MKL_DOMAIN_PARDISO
+#else
+#define EIGEN_MKL_DOMAIN_PARDISO MKL_PARDISO
+#endif
+
 namespace Eigen {
 
 typedef std::complex<double> dcomplex;
diff --git a/nuparu/include/Eigen/src/Core/util/Macros.h b/nuparu/include/Eigen/src/Core/util/Macros.h
index a50f1f8c..9b4f8faa 100644
--- a/nuparu/include/Eigen/src/Core/util/Macros.h
+++ b/nuparu/include/Eigen/src/Core/util/Macros.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -13,93 +13,399 @@
 
 #define EIGEN_WORLD_VERSION 3
 #define EIGEN_MAJOR_VERSION 2
-#define EIGEN_MINOR_VERSION 0
+#define EIGEN_MINOR_VERSION 92
 
 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \
                                       (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
                                                                  EIGEN_MINOR_VERSION>=z))))
+
+// Compiler identification, EIGEN_COMP_*
+
+/// \internal EIGEN_COMP_GNUC set to 1 for all compilers compatible with GCC
 #ifdef __GNUC__
-  #define EIGEN_GNUC_AT_LEAST(x,y) ((__GNUC__==x && __GNUC_MINOR__>=y) || __GNUC__>x)
+  #define EIGEN_COMP_GNUC 1
 #else
-  #define EIGEN_GNUC_AT_LEAST(x,y) 0
+  #define EIGEN_COMP_GNUC 0
 #endif
- 
-#ifdef __GNUC__
-  #define EIGEN_GNUC_AT_MOST(x,y) ((__GNUC__==x && __GNUC_MINOR__<=y) || __GNUC__<x)
+
+/// \internal EIGEN_COMP_CLANG set to 1 if the compiler is clang (alias for __clang__)
+#if defined(__clang__)
+  #define EIGEN_COMP_CLANG 1
 #else
-  #define EIGEN_GNUC_AT_MOST(x,y) 0
+  #define EIGEN_COMP_CLANG 0
 #endif
 
-#if EIGEN_GNUC_AT_MOST(4,3) && !defined(__clang__)
-  // see bug 89
-  #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 0
+
+/// \internal EIGEN_COMP_LLVM set to 1 if the compiler backend is llvm
+#if defined(__llvm__)
+  #define EIGEN_COMP_LLVM 1
 #else
-  #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 1
+  #define EIGEN_COMP_LLVM 0
+#endif
+
+/// \internal EIGEN_COMP_ICC set to __INTEL_COMPILER if the compiler is Intel compiler, 0 otherwise
+#if defined(__INTEL_COMPILER)
+  #define EIGEN_COMP_ICC __INTEL_COMPILER
+#else
+  #define EIGEN_COMP_ICC 0
+#endif
+
+/// \internal EIGEN_COMP_MINGW set to 1 if the compiler is mingw
+#if defined(__MINGW32__)
+  #define EIGEN_COMP_MINGW 1
+#else
+  #define EIGEN_COMP_MINGW 0
+#endif
+
+/// \internal EIGEN_COMP_SUNCC set to 1 if the compiler is Solaris Studio
+#if defined(__SUNPRO_CC)
+  #define EIGEN_COMP_SUNCC 1
+#else
+  #define EIGEN_COMP_SUNCC 0
+#endif
+
+/// \internal EIGEN_COMP_MSVC set to _MSC_VER if the compiler is Microsoft Visual C++, 0 otherwise.
+#if defined(_MSC_VER)
+  #define EIGEN_COMP_MSVC _MSC_VER
+#else
+  #define EIGEN_COMP_MSVC 0
+#endif
+
+/// \internal EIGEN_COMP_MSVC_STRICT set to _MSC_VER if the compiler is really Microsoft Visual C++ and not, e.g., ICC; 0 otherwise.
+#if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC)
+  #define EIGEN_COMP_MSVC_STRICT _MSC_VER
+#else
+  #define EIGEN_COMP_MSVC_STRICT 0
+#endif
+
+/// \internal EIGEN_COMP_IBM set to 1 if the compiler is IBM XL C++
+#if defined(__IBMCPP__) || defined(__xlc__)
+  #define EIGEN_COMP_IBM 1
+#else
+  #define EIGEN_COMP_IBM 0
 #endif
 
-#if defined(__GNUC__) && (__GNUC__ <= 3)
+/// \internal EIGEN_COMP_PGI set to 1 if the compiler is Portland Group Compiler
+#if defined(__PGI)
+  #define EIGEN_COMP_PGI 1
+#else
+  #define EIGEN_COMP_PGI 0
+#endif
+
+/// \internal EIGEN_COMP_ARM set to 1 if the compiler is ARM Compiler
+#if defined(__CC_ARM) || defined(__ARMCC_VERSION)
+  #define EIGEN_COMP_ARM 1
+#else
+  #define EIGEN_COMP_ARM 0
+#endif
+
+
+/// \internal EIGEN_COMP_GNUC_STRICT set to 1 if the compiler is really GCC and not a compatible compiler (e.g., ICC, clang, mingw, etc.)
+#if EIGEN_COMP_GNUC && !(EIGEN_COMP_CLANG || EIGEN_COMP_ICC || EIGEN_COMP_MINGW || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM )
+  #define EIGEN_COMP_GNUC_STRICT 1
+#else
+  #define EIGEN_COMP_GNUC_STRICT 0
+#endif
+
+
+#if EIGEN_COMP_GNUC
+  #define EIGEN_GNUC_AT_LEAST(x,y) ((__GNUC__==x && __GNUC_MINOR__>=y) || __GNUC__>x)
+  #define EIGEN_GNUC_AT_MOST(x,y)  ((__GNUC__==x && __GNUC_MINOR__<=y) || __GNUC__<x)
+  #define EIGEN_GNUC_AT(x,y)       ( __GNUC__==x && __GNUC_MINOR__==y )
+#else
+  #define EIGEN_GNUC_AT_LEAST(x,y) 0
+  #define EIGEN_GNUC_AT_MOST(x,y)  0
+  #define EIGEN_GNUC_AT(x,y)       0
+#endif
+
+// FIXME: could probably be removed as we do not support gcc 3.x anymore
+#if EIGEN_COMP_GNUC && (__GNUC__ <= 3)
 #define EIGEN_GCC3_OR_OLDER 1
 #else
 #define EIGEN_GCC3_OR_OLDER 0
 #endif
 
-// 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable
-// 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always
-// enable alignment, but it can be a cause of problems on some platforms, so we just disable it in
-// certain common platform (compiler+architecture combinations) to avoid these problems.
-// Only static alignment is really problematic (relies on nonstandard compiler extensions that don't
-// work everywhere, for example don't work on GCC/ARM), try to keep heap alignment even
-// when we have to disable static alignment.
-#if defined(__GNUC__) && !(defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || defined(__ppc__) || defined(__ia64__))
-#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
+
+// Architecture identification, EIGEN_ARCH_*
+
+#if defined(__x86_64__) || defined(_M_X64) || defined(__amd64)
+  #define EIGEN_ARCH_x86_64 1
 #else
-#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
+  #define EIGEN_ARCH_x86_64 0
 #endif
 
-// static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX
-#if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \
- && !EIGEN_GCC3_OR_OLDER \
- && !defined(__SUNPRO_CC) \
- && !defined(__QNXNTO__)
-  #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1
+#if defined(__i386__) || defined(_M_IX86) || defined(_X86_) || defined(__i386)
+  #define EIGEN_ARCH_i386 1
 #else
-  #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
+  #define EIGEN_ARCH_i386 0
 #endif
 
-#ifdef EIGEN_DONT_ALIGN
-  #ifndef EIGEN_DONT_ALIGN_STATICALLY
-    #define EIGEN_DONT_ALIGN_STATICALLY
-  #endif
-  #define EIGEN_ALIGN 0
+#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_i386
+  #define EIGEN_ARCH_i386_OR_x86_64 1
 #else
-  #define EIGEN_ALIGN 1
+  #define EIGEN_ARCH_i386_OR_x86_64 0
 #endif
 
-// EIGEN_ALIGN_STATICALLY is the true test whether we want to align arrays on the stack or not. It takes into account both the user choice to explicitly disable
-// alignment (EIGEN_DONT_ALIGN_STATICALLY) and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT). Henceforth, only EIGEN_ALIGN_STATICALLY should be used.
-#if EIGEN_ARCH_WANTS_STACK_ALIGNMENT && !defined(EIGEN_DONT_ALIGN_STATICALLY)
-  #define EIGEN_ALIGN_STATICALLY 1
+/// \internal EIGEN_ARCH_ARM set to 1 if the architecture is ARM
+#if defined(__arm__)
+  #define EIGEN_ARCH_ARM 1
 #else
-  #define EIGEN_ALIGN_STATICALLY 0
-  #ifndef EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
-    #define EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
-  #endif
+  #define EIGEN_ARCH_ARM 0
+#endif
+
+/// \internal EIGEN_ARCH_ARM64 set to 1 if the architecture is ARM64
+#if defined(__aarch64__)
+  #define EIGEN_ARCH_ARM64 1
+#else
+  #define EIGEN_ARCH_ARM64 0
+#endif
+
+#if EIGEN_ARCH_ARM || EIGEN_ARCH_ARM64
+  #define EIGEN_ARCH_ARM_OR_ARM64 1
+#else
+  #define EIGEN_ARCH_ARM_OR_ARM64 0
+#endif
+
+/// \internal EIGEN_ARCH_MIPS set to 1 if the architecture is MIPS
+#if defined(__mips__) || defined(__mips)
+  #define EIGEN_ARCH_MIPS 1
+#else
+  #define EIGEN_ARCH_MIPS 0
+#endif
+
+/// \internal EIGEN_ARCH_SPARC set to 1 if the architecture is SPARC
+#if defined(__sparc__) || defined(__sparc)
+  #define EIGEN_ARCH_SPARC 1
+#else
+  #define EIGEN_ARCH_SPARC 0
+#endif
+
+/// \internal EIGEN_ARCH_IA64 set to 1 if the architecture is Intel Itanium
+#if defined(__ia64__)
+  #define EIGEN_ARCH_IA64 1
+#else
+  #define EIGEN_ARCH_IA64 0
+#endif
+
+/// \internal EIGEN_ARCH_PPC set to 1 if the architecture is PowerPC
+#if defined(__powerpc__) || defined(__ppc__) || defined(_M_PPC)
+  #define EIGEN_ARCH_PPC 1
+#else
+  #define EIGEN_ARCH_PPC 0
+#endif
+
+
+
+// Operating system identification, EIGEN_OS_*
+
+/// \internal EIGEN_OS_UNIX set to 1 if the OS is a unix variant
+#if defined(__unix__) || defined(__unix)
+  #define EIGEN_OS_UNIX 1
+#else
+  #define EIGEN_OS_UNIX 0
+#endif
+
+/// \internal EIGEN_OS_LINUX set to 1 if the OS is based on Linux kernel
+#if defined(__linux__)
+  #define EIGEN_OS_LINUX 1
+#else
+  #define EIGEN_OS_LINUX 0
+#endif
+
+/// \internal EIGEN_OS_ANDROID set to 1 if the OS is Android
+// note: ANDROID is defined when using ndk_build, __ANDROID__ is defined when using a standalone toolchain.
+#if defined(__ANDROID__) || defined(ANDROID)
+  #define EIGEN_OS_ANDROID 1
+#else
+  #define EIGEN_OS_ANDROID 0
+#endif
+
+/// \internal EIGEN_OS_GNULINUX set to 1 if the OS is GNU/Linux and not another Linux-based OS (e.g., not Android)
+#if defined(__gnu_linux__) && !(EIGEN_OS_ANDROID)
+  #define EIGEN_OS_GNULINUX 1
+#else
+  #define EIGEN_OS_GNULINUX 0
+#endif
+
+/// \internal EIGEN_OS_BSD set to 1 if the OS is a BSD variant
+#if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__bsdi__) || defined(__DragonFly__)
+  #define EIGEN_OS_BSD 1
+#else
+  #define EIGEN_OS_BSD 0
+#endif
+
+/// \internal EIGEN_OS_MAC set to 1 if the OS is MacOS
+#if defined(__APPLE__)
+  #define EIGEN_OS_MAC 1
+#else
+  #define EIGEN_OS_MAC 0
+#endif
+
+/// \internal EIGEN_OS_QNX set to 1 if the OS is QNX
+#if defined(__QNX__)
+  #define EIGEN_OS_QNX 1
+#else
+  #define EIGEN_OS_QNX 0
+#endif
+
+/// \internal EIGEN_OS_WIN set to 1 if the OS is Windows based
+#if defined(_WIN32)
+  #define EIGEN_OS_WIN 1
+#else
+  #define EIGEN_OS_WIN 0
+#endif
+
+/// \internal EIGEN_OS_WIN64 set to 1 if the OS is Windows 64bits
+#if defined(_WIN64)
+  #define EIGEN_OS_WIN64 1
+#else
+  #define EIGEN_OS_WIN64 0
+#endif
+
+/// \internal EIGEN_OS_WINCE set to 1 if the OS is Windows CE
+#if defined(_WIN32_WCE)
+  #define EIGEN_OS_WINCE 1
+#else
+  #define EIGEN_OS_WINCE 0
+#endif
+
+/// \internal EIGEN_OS_CYGWIN set to 1 if the OS is Windows/Cygwin
+#if defined(__CYGWIN__)
+  #define EIGEN_OS_CYGWIN 1
+#else
+  #define EIGEN_OS_CYGWIN 0
+#endif
+
+/// \internal EIGEN_OS_WIN_STRICT set to 1 if the OS is really Windows and not some variants
+#if EIGEN_OS_WIN && !( EIGEN_OS_WINCE || EIGEN_OS_CYGWIN )
+  #define EIGEN_OS_WIN_STRICT 1
+#else
+  #define EIGEN_OS_WIN_STRICT 0
+#endif
+
+/// \internal EIGEN_OS_SUN set to 1 if the OS is SUN
+#if (defined(sun) || defined(__sun)) && !(defined(__SVR4) || defined(__svr4__))
+  #define EIGEN_OS_SUN 1
+#else
+  #define EIGEN_OS_SUN 0
 #endif
 
+/// \internal EIGEN_OS_SOLARIS set to 1 if the OS is Solaris
+#if (defined(sun) || defined(__sun)) && (defined(__SVR4) || defined(__svr4__))
+  #define EIGEN_OS_SOLARIS 1
+#else
+  #define EIGEN_OS_SOLARIS 0
+#endif
+
+
+
+#if EIGEN_GNUC_AT_MOST(4,3) && !EIGEN_COMP_CLANG
+  // see bug 89
+  #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 0
+#else
+  #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 1
+#endif
+
+// This macro can be used to prevent macro expansion, e.g.:
+//   std::max EIGEN_NOT_A_MACRO(a,b)
+#define EIGEN_NOT_A_MACRO
+
 #ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
-#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION RowMajor
+#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::RowMajor
 #else
-#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION ColMajor
+#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::ColMajor
 #endif
 
 #ifndef EIGEN_DEFAULT_DENSE_INDEX_TYPE
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE std::ptrdiff_t
 #endif
 
+// Cross compiler wrapper around LLVM's __has_builtin
+#ifdef __has_builtin
+#  define EIGEN_HAS_BUILTIN(x) __has_builtin(x)
+#else
+#  define EIGEN_HAS_BUILTIN(x) 0
+#endif
+
+// A Clang feature extension to determine compiler features.
+// We use it to determine 'cxx_rvalue_references'
+#ifndef __has_feature
+# define __has_feature(x) 0
+#endif
+
+// Do we support r-value references?
+#if (__has_feature(cxx_rvalue_references) || \
+    (defined(__cplusplus) && __cplusplus >= 201103L) || \
+     defined(__GXX_EXPERIMENTAL_CXX0X__) || \
+    (EIGEN_COMP_MSVC >= 1600))
+  #define EIGEN_HAVE_RVALUE_REFERENCES
+#endif
+
+// Does the compiler support C99?
+#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901))       \
+  || (defined(__GNUC__) && defined(_GLIBCXX_USE_C99)) \
+  || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER))
+#define EIGEN_HAS_C99_MATH 1
+#endif
+
+// Does the compiler support result_of?
+#if (__has_feature(cxx_lambdas) || (defined(__cplusplus) && __cplusplus >= 201103L))
+#define EIGEN_HAS_STD_RESULT_OF 1
+#endif
+
+// Does the compiler support variadic templates?
+#if __cplusplus > 199711L
+#define EIGEN_HAS_VARIADIC_TEMPLATES 1
+#endif
+
+// Does the compiler support const expressions?
+#ifdef __CUDACC__
+// Const expressions are supported provided that c++11 is enabled and we're using nvcc 7.5 or above
+#if defined(__CUDACC_VER__) &&  __CUDACC_VER__ >= 70500 && __cplusplus > 199711L
+  #define EIGEN_HAS_CONSTEXPR 1
+#endif
+#elif (defined(__cplusplus) && __cplusplus >= 201402L) || \
+    EIGEN_GNUC_AT_LEAST(4,8)
+#define EIGEN_HAS_CONSTEXPR 1
+#endif
+
+// Does the compiler support C++11 math?
+// Let's be conservative and enable the default C++11 implementation only if we are sure it exists
+#ifndef EIGEN_HAS_CXX11_MATH
+  #if (__cplusplus > 201103L) || (__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC)  \
+      && (EIGEN_ARCH_i386_OR_x86_64) && (EIGEN_OS_GNULINUX || EIGEN_OS_WIN_STRICT || EIGEN_OS_MAC)
+    #define EIGEN_HAS_CXX11_MATH 1
+  #else
+    #define EIGEN_HAS_CXX11_MATH 0
+  #endif
+#endif
+
+// Does the compiler support proper C++11 containers?
+#ifndef EIGEN_HAS_CXX11_CONTAINERS
+  #if    (__cplusplus > 201103L) \
+      || ((__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_ICC>=1400)) \
+      || EIGEN_COMP_MSVC >= 1900
+    #define EIGEN_HAS_CXX11_CONTAINERS 1
+  #else
+    #define EIGEN_HAS_CXX11_CONTAINERS 0
+  #endif
+#endif
+
+// Does the compiler support C++11 noexcept?
+#ifndef EIGEN_HAS_CXX11_NOEXCEPT
+  #if    (__cplusplus > 201103L) \
+      || ((__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_ICC>=1400)) \
+      || EIGEN_COMP_MSVC >= 1900
+    #define EIGEN_HAS_CXX11_NOEXCEPT 1
+  #else
+    #define EIGEN_HAS_CXX11_NOEXCEPT 0
+  #endif
+#endif
+
 /** Allows to disable some optimizations which might affect the accuracy of the result.
   * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them.
   * They currently include:
-  *   - single precision Cwise::sin() and Cwise::cos() when SSE vectorization is enabled.
+  *   - single precision ArrayBase::sin() and ArrayBase::cos() for SSE and AVX vectorization.
   */
 #ifndef EIGEN_FAST_MATH
 #define EIGEN_FAST_MATH 1
@@ -118,7 +424,7 @@
 // EIGEN_STRONG_INLINE is a stronger version of the inline, using __forceinline on MSVC,
 // but it still doesn't use GCC's always_inline. This is useful in (common) situations where MSVC needs forceinline
 // but GCC is still doing fine with just inline.
-#if (defined _MSC_VER) || (defined __INTEL_COMPILER)
+#if EIGEN_COMP_MSVC || EIGEN_COMP_ICC
 #define EIGEN_STRONG_INLINE __forceinline
 #else
 #define EIGEN_STRONG_INLINE inline
@@ -137,15 +443,15 @@
 #define EIGEN_ALWAYS_INLINE EIGEN_STRONG_INLINE
 #endif
 
-#if (defined __GNUC__)
+#if EIGEN_COMP_GNUC
 #define EIGEN_DONT_INLINE __attribute__((noinline))
-#elif (defined _MSC_VER)
+#elif EIGEN_COMP_MSVC
 #define EIGEN_DONT_INLINE __declspec(noinline)
 #else
 #define EIGEN_DONT_INLINE
 #endif
 
-#if (defined __GNUC__)
+#if EIGEN_COMP_GNUC
 #define EIGEN_PERMISSIVE_EXPR __extension__
 #else
 #define EIGEN_PERMISSIVE_EXPR
@@ -214,15 +520,15 @@
 #endif
 
 #ifdef EIGEN_NO_DEBUG
-#define EIGEN_ONLY_USED_FOR_DEBUG(x) (void)x
+#define EIGEN_ONLY_USED_FOR_DEBUG(x) EIGEN_UNUSED_VARIABLE(x)
 #else
 #define EIGEN_ONLY_USED_FOR_DEBUG(x)
 #endif
 
 #ifndef EIGEN_NO_DEPRECATED_WARNING
-  #if (defined __GNUC__)
+  #if EIGEN_COMP_GNUC
     #define EIGEN_DEPRECATED __attribute__((deprecated))
-  #elif (defined _MSC_VER)
+  #elif EIGEN_COMP_MSVC
     #define EIGEN_DEPRECATED __declspec(deprecated)
   #else
     #define EIGEN_DEPRECATED
@@ -231,23 +537,42 @@
   #define EIGEN_DEPRECATED
 #endif
 
-#if (defined __GNUC__)
+#if EIGEN_COMP_GNUC
 #define EIGEN_UNUSED __attribute__((unused))
 #else
 #define EIGEN_UNUSED
 #endif
 
 // Suppresses 'unused variable' warnings.
-#define EIGEN_UNUSED_VARIABLE(var) (void)var;
+namespace Eigen {
+  namespace internal {
+    template<typename T> EIGEN_DEVICE_FUNC void ignore_unused_variable(const T&) {}
+  }
+}
+#define EIGEN_UNUSED_VARIABLE(var) Eigen::internal::ignore_unused_variable(var);
 
 #if !defined(EIGEN_ASM_COMMENT)
-  #if (defined __GNUC__) && ( defined(__i386__) || defined(__x86_64__) )
-    #define EIGEN_ASM_COMMENT(X)  asm("#" X)
+  #if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64)
+    #define EIGEN_ASM_COMMENT(X)  __asm__("#" X)
   #else
     #define EIGEN_ASM_COMMENT(X)
   #endif
 #endif
 
+
+//------------------------------------------------------------------------------------------
+// Static and dynamic alignment control
+// 
+// The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES
+// as the maximal boundary in bytes on which dynamically and statically allocated data may be aligned, respectively.
+// The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not,
+// a default value is automatically computed based on architecture, compiler, and OS.
+// 
+// This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,_MAX}
+// to be used to declare statically aligned buffers.
+//------------------------------------------------------------------------------------------
+
+
 /* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements.
  * However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled,
  * so that vectorization doesn't affect binary compatibility.
@@ -255,27 +580,141 @@
  * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
  * vectorized and non-vectorized code.
  */
-#if (defined __GNUC__) || (defined __PGI) || (defined __IBMCPP__) || (defined __ARMCC_VERSION)
+#if (defined __CUDACC__)
+  #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
+#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM
   #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
-#elif (defined _MSC_VER)
+#elif EIGEN_COMP_MSVC
   #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n))
-#elif (defined __SUNPRO_CC)
+#elif EIGEN_COMP_SUNCC
   // FIXME not sure about this one:
   #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
 #else
   #error Please tell me what is the equivalent of __attribute__((aligned(n))) for your compiler
 #endif
 
+// If the user explicitly disables vectorization, then we also disable alignment
+#if defined(EIGEN_DONT_VECTORIZE)
+  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0
+#elif defined(__AVX__)
+  // 32 bytes static alignment is preferred only if really required
+  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32
+#else
+  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
+#endif
+
+
+// EIGEN_MIN_ALIGN_BYTES defines the minimal value for which the notion of explicit alignment makes sense
+#define EIGEN_MIN_ALIGN_BYTES 16
+
+// Defined the boundary (in bytes) on which the data needs to be aligned. Note
+// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be
+// aligned at all regardless of the value of this #define.
+
+#if (defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN))  && defined(EIGEN_MAX_STATIC_ALIGN_BYTES) && EIGEN_MAX_STATIC_ALIGN_BYTES>0
+#error EIGEN_MAX_STATIC_ALIGN_BYTES and EIGEN_DONT_ALIGN[_STATICALLY] are both defined with EIGEN_MAX_STATIC_ALIGN_BYTES!=0. Use EIGEN_MAX_STATIC_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN_STATICALLY.
+#endif
+
+// EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprecated
+// They imply EIGEN_MAX_STATIC_ALIGN_BYTES=0
+#if defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)
+  #ifdef EIGEN_MAX_STATIC_ALIGN_BYTES
+    #undef EIGEN_MAX_STATIC_ALIGN_BYTES
+  #endif
+  #define EIGEN_MAX_STATIC_ALIGN_BYTES 0
+#endif
+
+#ifndef EIGEN_MAX_STATIC_ALIGN_BYTES
+
+  // Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES
+  
+  // 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable
+  // 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always
+  // enable alignment, but it can be a cause of problems on some platforms, so we just disable it in
+  // certain common platform (compiler+architecture combinations) to avoid these problems.
+  // Only static alignment is really problematic (relies on nonstandard compiler extensions),
+  // try to keep heap alignment even when we have to disable static alignment.
+  #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64)
+  #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
+  #elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6)
+  // Old versions of GCC on ARM, at least 4.4, were once seen to have buggy static alignment support.
+  // Not sure which version fixed it, hopefully it doesn't affect 4.7, which is still somewhat in use.
+  // 4.8 and newer seem definitely unaffected.
+  #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
+  #else
+  #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
+  #endif
+
+  // static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX
+  #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \
+  && !EIGEN_GCC3_OR_OLDER \
+  && !EIGEN_COMP_SUNCC \
+  && !EIGEN_OS_QNX
+    #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1
+  #else
+    #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
+  #endif
+  
+  #if EIGEN_ARCH_WANTS_STACK_ALIGNMENT
+    #define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
+  #else
+    #define EIGEN_MAX_STATIC_ALIGN_BYTES 0
+  #endif
+  
+#endif
+
+// If EIGEN_MAX_ALIGN_BYTES is defined, then it is considered as an upper bound for EIGEN_MAX_STATIC_ALIGN_BYTES
+#if defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES<EIGEN_MAX_STATIC_ALIGN_BYTES
+#undef EIGEN_MAX_STATIC_ALIGN_BYTES
+#define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
+#endif
+
+#if EIGEN_MAX_STATIC_ALIGN_BYTES==0 && !defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
+  #define EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
+#endif
+
+// At this stage, EIGEN_MAX_STATIC_ALIGN_BYTES>0 is the true test whether we want to align arrays on the stack or not.
+// It takes into account both the user choice to explicitly enable/disable alignment (by setting EIGEN_MAX_STATIC_ALIGN_BYTES)
+// and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT).
+// Henceforth, only EIGEN_MAX_STATIC_ALIGN_BYTES should be used.
+
+
+// Shortcuts to EIGEN_ALIGN_TO_BOUNDARY
+#define EIGEN_ALIGN8  EIGEN_ALIGN_TO_BOUNDARY(8)
 #define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16)
+#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32)
+#define EIGEN_ALIGN64 EIGEN_ALIGN_TO_BOUNDARY(64)
+#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
+#define EIGEN_ALIGN_MAX EIGEN_ALIGN_TO_BOUNDARY(EIGEN_MAX_STATIC_ALIGN_BYTES)
+#else
+#define EIGEN_ALIGN_MAX
+#endif
+
+
+// Dynamic alignment control
+
+#if defined(EIGEN_DONT_ALIGN) && defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES>0
+#error EIGEN_MAX_ALIGN_BYTES and EIGEN_DONT_ALIGN are both defined with EIGEN_MAX_ALIGN_BYTES!=0. Use EIGEN_MAX_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN.
+#endif
 
-#if EIGEN_ALIGN_STATICALLY
-#define EIGEN_USER_ALIGN_TO_BOUNDARY(n) EIGEN_ALIGN_TO_BOUNDARY(n)
-#define EIGEN_USER_ALIGN16 EIGEN_ALIGN16
+#ifdef EIGEN_DONT_ALIGN
+  #ifdef EIGEN_MAX_ALIGN_BYTES
+    #undef EIGEN_MAX_ALIGN_BYTES
+  #endif
+  #define EIGEN_MAX_ALIGN_BYTES 0
+#elif !defined(EIGEN_MAX_ALIGN_BYTES)
+  #define EIGEN_MAX_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
+#endif
+
+#if EIGEN_IDEAL_MAX_ALIGN_BYTES > EIGEN_MAX_ALIGN_BYTES
+#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
 #else
-#define EIGEN_USER_ALIGN_TO_BOUNDARY(n)
-#define EIGEN_USER_ALIGN16
+#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
 #endif
 
+//----------------------------------------------------------------------
+
+
 #ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD
   #define EIGEN_RESTRICT
 #endif
@@ -284,7 +723,8 @@
 #endif
 
 #ifndef EIGEN_STACK_ALLOCATION_LIMIT
-#define EIGEN_STACK_ALLOCATION_LIMIT 20000
+// 131072 == 128 KB
+#define EIGEN_STACK_ALLOCATION_LIMIT 131072
 #endif
 
 #ifndef EIGEN_DEFAULT_IO_FORMAT
@@ -300,27 +740,31 @@
 // just an empty macro !
 #define EIGEN_EMPTY
 
-#if defined(_MSC_VER) && (!defined(__INTEL_COMPILER))
-#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
-  using Base::operator =;
-#elif defined(__clang__) // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
-#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
-  using Base::operator =; \
-  EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { Base::operator=(other); return *this; } \
-  template <typename OtherDerived> \
-  EIGEN_STRONG_INLINE Derived& operator=(const DenseBase<OtherDerived>& other) { Base::operator=(other.derived()); return *this; }
-#else
-#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
-  using Base::operator =; \
-  EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) \
-  { \
-    Base::operator=(other); \
-    return *this; \
-  }
+#if EIGEN_COMP_MSVC_STRICT && EIGEN_COMP_MSVC < 1900 // for older MSVC versions using the base operator is sufficient (cf Bug 1000)
+  #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
+    using Base::operator =;
+#elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
+  #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
+    using Base::operator =; \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { Base::operator=(other); return *this; } \
+    template <typename OtherDerived> \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const DenseBase<OtherDerived>& other) { Base::operator=(other.derived()); return *this; }
+#else
+  #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
+    using Base::operator =; \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) \
+    { \
+      Base::operator=(other); \
+      return *this; \
+    }
 #endif
 
-#define EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Derived) \
-  EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived)
+
+/** \internal
+ * \brief Macro to manually inherit assignment operators.
+ * This is necessary, because the implicitly defined assignment operator gets deleted when a custom operator= is defined.
+ */
+#define EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Derived) EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived)
 
 /**
 * Just a side note. Commenting within defines works only by documenting
@@ -334,32 +778,12 @@
   typedef typename Eigen::internal::traits<Derived>::Scalar Scalar; /*!< \brief Numeric type, e.g. float, double, int or std::complex<float>. */ \
   typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; /*!< \brief The underlying numeric type for composed scalar types. \details In cases where Scalar is e.g. std::complex<T>, T were corresponding to RealScalar. */ \
   typedef typename Base::CoeffReturnType CoeffReturnType; /*!< \brief The return type for coefficient access. \details Depending on whether the object allows direct coefficient access (e.g. for a MatrixXd), this type is either 'const Scalar&' or simply 'Scalar' for objects that do not allow direct coefficient access. */ \
-  typedef typename Eigen::internal::nested<Derived>::type Nested; \
+  typedef typename Eigen::internal::ref_selector<Derived>::type Nested; \
   typedef typename Eigen::internal::traits<Derived>::StorageKind StorageKind; \
-  typedef typename Eigen::internal::traits<Derived>::Index Index; \
+  typedef typename Eigen::internal::traits<Derived>::StorageIndex StorageIndex; \
   enum { RowsAtCompileTime = Eigen::internal::traits<Derived>::RowsAtCompileTime, \
         ColsAtCompileTime = Eigen::internal::traits<Derived>::ColsAtCompileTime, \
         Flags = Eigen::internal::traits<Derived>::Flags, \
-        CoeffReadCost = Eigen::internal::traits<Derived>::CoeffReadCost, \
-        SizeAtCompileTime = Base::SizeAtCompileTime, \
-        MaxSizeAtCompileTime = Base::MaxSizeAtCompileTime, \
-        IsVectorAtCompileTime = Base::IsVectorAtCompileTime };
-
-
-#define EIGEN_DENSE_PUBLIC_INTERFACE(Derived) \
-  typedef typename Eigen::internal::traits<Derived>::Scalar Scalar; /*!< \brief Numeric type, e.g. float, double, int or std::complex<float>. */ \
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; /*!< \brief The underlying numeric type for composed scalar types. \details In cases where Scalar is e.g. std::complex<T>, T were corresponding to RealScalar. */ \
-  typedef typename Base::PacketScalar PacketScalar; \
-  typedef typename Base::CoeffReturnType CoeffReturnType; /*!< \brief The return type for coefficient access. \details Depending on whether the object allows direct coefficient access (e.g. for a MatrixXd), this type is either 'const Scalar&' or simply 'Scalar' for objects that do not allow direct coefficient access. */ \
-  typedef typename Eigen::internal::nested<Derived>::type Nested; \
-  typedef typename Eigen::internal::traits<Derived>::StorageKind StorageKind; \
-  typedef typename Eigen::internal::traits<Derived>::Index Index; \
-  enum { RowsAtCompileTime = Eigen::internal::traits<Derived>::RowsAtCompileTime, \
-        ColsAtCompileTime = Eigen::internal::traits<Derived>::ColsAtCompileTime, \
-        MaxRowsAtCompileTime = Eigen::internal::traits<Derived>::MaxRowsAtCompileTime, \
-        MaxColsAtCompileTime = Eigen::internal::traits<Derived>::MaxColsAtCompileTime, \
-        Flags = Eigen::internal::traits<Derived>::Flags, \
-        CoeffReadCost = Eigen::internal::traits<Derived>::CoeffReadCost, \
         SizeAtCompileTime = Base::SizeAtCompileTime, \
         MaxSizeAtCompileTime = Base::MaxSizeAtCompileTime, \
         IsVectorAtCompileTime = Base::IsVectorAtCompileTime }; \
@@ -367,6 +791,12 @@
   using Base::const_cast_derived;
 
 
+// FIXME Maybe the EIGEN_DENSE_PUBLIC_INTERFACE could be removed as importing PacketScalar is rarely needed
+#define EIGEN_DENSE_PUBLIC_INTERFACE(Derived) \
+  EIGEN_GENERIC_PUBLIC_INTERFACE(Derived) \
+  typedef typename Base::PacketScalar PacketScalar;
+
+
 #define EIGEN_PLAIN_ENUM_MIN(a,b) (((int)a <= (int)b) ? (int)a : (int)b)
 #define EIGEN_PLAIN_ENUM_MAX(a,b) (((int)a >= (int)b) ? (int)a : (int)b)
 
@@ -398,7 +828,7 @@
 
 #define EIGEN_MAKE_CWISE_BINARY_OP(METHOD,FUNCTOR) \
   template<typename OtherDerived> \
-  EIGEN_STRONG_INLINE const CwiseBinaryOp<FUNCTOR<Scalar>, const Derived, const OtherDerived> \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryOp<FUNCTOR<Scalar>, const Derived, const OtherDerived> \
   (METHOD)(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const \
   { \
     return CwiseBinaryOp<FUNCTOR<Scalar>, const Derived, const OtherDerived>(derived(), other.derived()); \
@@ -415,4 +845,29 @@
       const RHS \
     >
 
+#ifdef EIGEN_EXCEPTIONS // exceptions enabled: the macros map directly onto the C++ keywords
+#  define EIGEN_THROW_X(X) throw X
+#  define EIGEN_THROW throw
+#  define EIGEN_TRY try
+#  define EIGEN_CATCH(X) catch (X)
+#else // exceptions disabled: "throwing" terminates execution instead
+#  ifdef __CUDA_ARCH__ // CUDA device pass: execute a trap, then return a value-initialized result
+#    define EIGEN_THROW_X(X) asm("trap;"); return {}
+#    define EIGEN_THROW asm("trap;"); return {}
+#  else
+#    define EIGEN_THROW_X(X) std::abort()
+#    define EIGEN_THROW std::abort()
+#  endif
+#  define EIGEN_TRY if (true) // the guarded block always runs ...
+#  define EIGEN_CATCH(X) else // ... so the handler is a never-taken else branch
+#endif
+
+#if EIGEN_HAS_CXX11_NOEXCEPT
+#   define EIGEN_NO_THROW noexcept(true)
+#   define EIGEN_EXCEPTION_SPEC(X) noexcept(false)
+#else
+#   define EIGEN_NO_THROW throw()
+#   define EIGEN_EXCEPTION_SPEC(X) throw(X)
+#endif
+
 #endif // EIGEN_MACROS_H
diff --git a/nuparu/include/Eigen/src/Core/util/Memory.h b/nuparu/include/Eigen/src/Core/util/Memory.h
index 451535a0..1fc535a3 100644
--- a/nuparu/include/Eigen/src/Core/util/Memory.h
+++ b/nuparu/include/Eigen/src/Core/util/Memory.h
@@ -1,11 +1,12 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2008-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
 // Copyright (C) 2009 Kenneth Riddile <kfriddile@yahoo.com>
 // Copyright (C) 2010 Hauke Heibel <hauke.heibel@gmail.com>
 // Copyright (C) 2010 Thomas Capricelli <orzel@freehackers.org>
+// Copyright (C) 2013 Pavel Holoborodko <pavel@holoborodko.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -31,7 +32,7 @@
 // page 114, "[The] LP64 model [...] is used by all 64-bit UNIX ports" so it's indeed
 // quite safe, at least within the context of glibc, to equate 64-bit with LP64.
 #if defined(__GLIBC__) && ((__GLIBC__>=2 && __GLIBC_MINOR__ >= 8) || __GLIBC__>2) \
- && defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ )
+ && defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ ) && (EIGEN_DEFAULT_ALIGN_BYTES == 16)
   #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 1
 #else
   #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 0
@@ -41,15 +42,15 @@
 //   See http://svn.freebsd.org/viewvc/base/stable/6/lib/libc/stdlib/malloc.c?view=markup
 // FreeBSD 7 seems to have 16-byte aligned malloc except on ARM and MIPS architectures
 //   See http://svn.freebsd.org/viewvc/base/stable/7/lib/libc/stdlib/malloc.c?view=markup
-#if defined(__FreeBSD__) && !defined(__arm__) && !defined(__mips__)
+#if defined(__FreeBSD__) && !(EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) && (EIGEN_DEFAULT_ALIGN_BYTES == 16)
   #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 1
 #else
   #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 0
 #endif
 
-#if defined(__APPLE__) \
- || defined(_WIN64) \
- || EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED \
+#if (EIGEN_OS_MAC && (EIGEN_DEFAULT_ALIGN_BYTES == 16))     \
+ || (EIGEN_OS_WIN64 && (EIGEN_DEFAULT_ALIGN_BYTES == 16))   \
+ || EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED              \
  || EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED
   #define EIGEN_MALLOC_ALREADY_ALIGNED 1
 #else
@@ -58,21 +59,23 @@
 
 #endif
 
-// See bug 554 (http://eigen.tuxfamily.org/bz/show_bug.cgi?id=554)
-// It seems to be unsafe to check _POSIX_ADVISORY_INFO without including unistd.h first.
-// Currently, let's include it only on unix systems:
-#if defined(__unix__) || defined(__unix)
-  #include <unistd.h>
-  #if ((defined __QNXNTO__) || (defined _GNU_SOURCE) || ((defined _XOPEN_SOURCE) && (_XOPEN_SOURCE >= 600))) && (defined _POSIX_ADVISORY_INFO) && (_POSIX_ADVISORY_INFO > 0)
-    #define EIGEN_HAS_POSIX_MEMALIGN 1
+#ifndef EIGEN_HAS_POSIX_MEMALIGN
+  // See bug 554 (http://eigen.tuxfamily.org/bz/show_bug.cgi?id=554)
+  // It seems to be unsafe to check _POSIX_ADVISORY_INFO without including unistd.h first.
+  // Currently, let's include it only on unix systems:
+  #if EIGEN_OS_UNIX && !(EIGEN_OS_SUN || EIGEN_OS_SOLARIS)
+    #include <unistd.h>
+    #if (EIGEN_OS_QNX || (defined _GNU_SOURCE) || EIGEN_COMP_PGI || ((defined _XOPEN_SOURCE) && (_XOPEN_SOURCE >= 600))) && (defined _POSIX_ADVISORY_INFO) && (_POSIX_ADVISORY_INFO > 0)
+      #define EIGEN_HAS_POSIX_MEMALIGN 1
+    #endif
   #endif
-#endif
 
-#ifndef EIGEN_HAS_POSIX_MEMALIGN
-  #define EIGEN_HAS_POSIX_MEMALIGN 0
+  #ifndef EIGEN_HAS_POSIX_MEMALIGN
+    #define EIGEN_HAS_POSIX_MEMALIGN 0
+  #endif
 #endif
 
-#ifdef EIGEN_VECTORIZE_SSE
+#if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_AVX
   #define EIGEN_HAS_MM_MALLOC 1
 #else
   #define EIGEN_HAS_MM_MALLOC 0
@@ -82,12 +85,13 @@ namespace Eigen {
 
 namespace internal {
 
+EIGEN_DEVICE_FUNC 
 inline void throw_std_bad_alloc()
 {
   #ifdef EIGEN_EXCEPTIONS
     throw std::bad_alloc();
   #else
-    std::size_t huge = -1;
+    std::size_t huge = static_cast<std::size_t>(-1);
     new int[huge];
   #endif
 }
@@ -103,9 +107,9 @@ inline void throw_std_bad_alloc()
   */
 inline void* handmade_aligned_malloc(std::size_t size)
 {
-  void *original = std::malloc(size+16);
+  void *original = std::malloc(size+EIGEN_DEFAULT_ALIGN_BYTES);
   if (original == 0) return 0;
-  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(15))) + 16);
+  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1))) + EIGEN_DEFAULT_ALIGN_BYTES);
   *(reinterpret_cast<void**>(aligned) - 1) = original;
   return aligned;
 }
@@ -126,9 +130,9 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t =
   if (ptr == 0) return handmade_aligned_malloc(size);
   void *original = *(reinterpret_cast<void**>(ptr) - 1);
   std::ptrdiff_t previous_offset = static_cast<char *>(ptr)-static_cast<char *>(original);
-  original = std::realloc(original,size+16);
+  original = std::realloc(original,size+EIGEN_DEFAULT_ALIGN_BYTES);
   if (original == 0) return 0;
-  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(15))) + 16);
+  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1))) + EIGEN_DEFAULT_ALIGN_BYTES);
   void *previous_aligned = static_cast<char *>(original)+previous_offset;
   if(aligned!=previous_aligned)
     std::memmove(aligned, previous_aligned, size);
@@ -141,8 +145,8 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t =
 *** Implementation of generic aligned realloc (when no realloc can be used)***
 *****************************************************************************/
 
-void* aligned_malloc(std::size_t size);
-void  aligned_free(void *ptr);
+EIGEN_DEVICE_FUNC void* aligned_malloc(std::size_t size);
+EIGEN_DEVICE_FUNC void  aligned_free(void *ptr);
 
 /** \internal
   * \brief Reallocates aligned memory.
@@ -183,47 +187,47 @@ inline void* generic_aligned_realloc(void* ptr, size_t size, size_t old_size)
 *****************************************************************************/
 
 #ifdef EIGEN_NO_MALLOC
-inline void check_that_malloc_is_allowed()
+EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
 {
   eigen_assert(false && "heap allocation is forbidden (EIGEN_NO_MALLOC is defined)");
 }
 #elif defined EIGEN_RUNTIME_NO_MALLOC
-inline bool is_malloc_allowed_impl(bool update, bool new_value = false)
+EIGEN_DEVICE_FUNC inline bool is_malloc_allowed_impl(bool update, bool new_value = false)
 {
   static bool value = true;
   if (update == 1)
     value = new_value;
   return value;
 }
-inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); }
-inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); }
-inline void check_that_malloc_is_allowed()
+EIGEN_DEVICE_FUNC inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); }
+EIGEN_DEVICE_FUNC inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); }
+EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
 {
   eigen_assert(is_malloc_allowed() && "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and g_is_malloc_allowed is false)");
 }
 #else 
-inline void check_that_malloc_is_allowed()
+EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
 {}
 #endif
 
-/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 bytes alignment.
+/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 or 32 bytes alignment depending on the requirements.
   * On allocation error, the returned pointer is null, and std::bad_alloc is thrown.
   */
-inline void* aligned_malloc(size_t size)
+EIGEN_DEVICE_FUNC inline void* aligned_malloc(size_t size)
 {
   check_that_malloc_is_allowed();
 
   void *result;
-  #if !EIGEN_ALIGN
+  #if EIGEN_DEFAULT_ALIGN_BYTES==0
     result = std::malloc(size);
   #elif EIGEN_MALLOC_ALREADY_ALIGNED
     result = std::malloc(size);
   #elif EIGEN_HAS_POSIX_MEMALIGN
-    if(posix_memalign(&result, 16, size)) result = 0;
+    if(posix_memalign(&result, EIGEN_DEFAULT_ALIGN_BYTES, size)) result = 0;
   #elif EIGEN_HAS_MM_MALLOC
-    result = _mm_malloc(size, 16);
-  #elif defined(_MSC_VER) && (!defined(_WIN32_WCE))
-    result = _aligned_malloc(size, 16);
+    result = _mm_malloc(size, EIGEN_DEFAULT_ALIGN_BYTES);
+  #elif EIGEN_OS_WIN_STRICT
+    result = _aligned_malloc(size, EIGEN_DEFAULT_ALIGN_BYTES);
   #else
     result = handmade_aligned_malloc(size);
   #endif
@@ -235,9 +239,9 @@ inline void* aligned_malloc(size_t size)
 }
 
 /** \internal Frees memory allocated with aligned_malloc. */
-inline void aligned_free(void *ptr)
+EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr)
 {
-  #if !EIGEN_ALIGN
+  #if EIGEN_DEFAULT_ALIGN_BYTES==0
     std::free(ptr);
   #elif EIGEN_MALLOC_ALREADY_ALIGNED
     std::free(ptr);
@@ -245,7 +249,7 @@ inline void aligned_free(void *ptr)
     std::free(ptr);
   #elif EIGEN_HAS_MM_MALLOC
     _mm_free(ptr);
-  #elif defined(_MSC_VER) && (!defined(_WIN32_WCE))
+  #elif EIGEN_OS_WIN_STRICT
     _aligned_free(ptr);
   #else
     handmade_aligned_free(ptr);
@@ -262,7 +266,7 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size)
   EIGEN_UNUSED_VARIABLE(old_size);
 
   void *result;
-#if !EIGEN_ALIGN
+#if EIGEN_DEFAULT_ALIGN_BYTES==0
   result = std::realloc(ptr,new_size);
 #elif EIGEN_MALLOC_ALREADY_ALIGNED
   result = std::realloc(ptr,new_size);
@@ -272,13 +276,13 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size)
   // The defined(_mm_free) is just here to verify that this MSVC version
   // implements _mm_malloc/_mm_free based on the corresponding _aligned_
   // functions. This may not always be the case and we just try to be safe.
-  #if defined(_MSC_VER) && defined(_mm_free)
-    result = _aligned_realloc(ptr,new_size,16);
+  #if EIGEN_OS_WIN_STRICT && defined(_mm_free)
+    result = _aligned_realloc(ptr,new_size,EIGEN_DEFAULT_ALIGN_BYTES);
   #else
     result = generic_aligned_realloc(ptr,new_size,old_size);
   #endif
-#elif defined(_MSC_VER)
-  result = _aligned_realloc(ptr,new_size,16);
+#elif EIGEN_OS_WIN_STRICT
+  result = _aligned_realloc(ptr,new_size,EIGEN_DEFAULT_ALIGN_BYTES);
 #else
   result = handmade_aligned_realloc(ptr,new_size,old_size);
 #endif
@@ -296,12 +300,12 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size)
 /** \internal Allocates \a size bytes. If Align is true, then the returned ptr is 16-byte-aligned.
   * On allocation error, the returned pointer is null, and a std::bad_alloc is thrown.
   */
-template<bool Align> inline void* conditional_aligned_malloc(size_t size)
+template<bool Align> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(size_t size)
 {
   return aligned_malloc(size);
 }
 
-template<> inline void* conditional_aligned_malloc<false>(size_t size)
+template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(size_t size)
 {
   check_that_malloc_is_allowed();
 
@@ -312,12 +316,12 @@ template<> inline void* conditional_aligned_malloc<false>(size_t size)
 }
 
 /** \internal Frees memory allocated with conditional_aligned_malloc */
-template<bool Align> inline void conditional_aligned_free(void *ptr)
+template<bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void *ptr)
 {
   aligned_free(ptr);
 }
 
-template<> inline void conditional_aligned_free<false>(void *ptr)
+template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void *ptr)
 {
   std::free(ptr);
 }
@@ -336,31 +340,40 @@ template<> inline void* conditional_aligned_realloc<false>(void* ptr, size_t new
 *** Construction/destruction of array elements                             ***
 *****************************************************************************/
 
-/** \internal Constructs the elements of an array.
-  * The \a size parameter tells on how many objects to call the constructor of T.
-  */
-template<typename T> inline T* construct_elements_of_array(T *ptr, size_t size)
-{
-  for (size_t i=0; i < size; ++i) ::new (ptr + i) T;
-  return ptr;
-}
-
 /** \internal Destructs the elements of an array.
   * The \a size parameters tells on how many objects to call the destructor of T.
   */
-template<typename T> inline void destruct_elements_of_array(T *ptr, size_t size)
+template<typename T> EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T *ptr, size_t size)
 {
   // always destruct an array starting from the end.
   if(ptr)
     while(size) ptr[--size].~T();
 }
 
+/** \internal Constructs the elements of an array.
+  * The \a size parameter tells on how many objects to call the constructor of T.
+  */
+template<typename T> EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *ptr, size_t size)
+{
+  size_t i;
+  EIGEN_TRY
+  {
+      for (i = 0; i < size; ++i) ::new (ptr + i) T;
+      return ptr;
+  }
+  EIGEN_CATCH(...)
+  {
+    destruct_elements_of_array(ptr, i);
+    EIGEN_THROW;
+  }
+}
+
 /*****************************************************************************
 *** Implementation of aligned new/delete-like functions                    ***
 *****************************************************************************/
 
 template<typename T>
-EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size)
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size)
 {
   if(size > size_t(-1) / sizeof(T))
     throw_std_bad_alloc();
@@ -370,24 +383,40 @@ EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size)
   * On allocation error, the returned pointer is undefined, but a std::bad_alloc is thrown.
   * The default constructor of T is called.
   */
-template<typename T> inline T* aligned_new(size_t size)
+template<typename T> EIGEN_DEVICE_FUNC inline T* aligned_new(size_t size)
 {
   check_size_for_overflow<T>(size);
   T *result = reinterpret_cast<T*>(aligned_malloc(sizeof(T)*size));
-  return construct_elements_of_array(result, size);
+  EIGEN_TRY
+  {
+    return construct_elements_of_array(result, size);
+  }
+  EIGEN_CATCH(...)
+  {
+    aligned_free(result);
+    EIGEN_THROW;
+  }
 }
 
-template<typename T, bool Align> inline T* conditional_aligned_new(size_t size)
+template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(size_t size)
 {
   check_size_for_overflow<T>(size);
   T *result = reinterpret_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T)*size));
-  return construct_elements_of_array(result, size);
+  EIGEN_TRY
+  {
+    return construct_elements_of_array(result, size);
+  }
+  EIGEN_CATCH(...)
+  {
+    conditional_aligned_free<Align>(result);
+    EIGEN_THROW;
+  }
 }
 
 /** \internal Deletes objects constructed with aligned_new
   * The \a size parameters tells on how many objects to call the destructor of T.
   */
-template<typename T> inline void aligned_delete(T *ptr, size_t size)
+template<typename T> EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, size_t size)
 {
   destruct_elements_of_array<T>(ptr, size);
   aligned_free(ptr);
@@ -396,13 +425,13 @@ template<typename T> inline void aligned_delete(T *ptr, size_t size)
 /** \internal Deletes objects constructed with conditional_aligned_new
   * The \a size parameters tells on how many objects to call the destructor of T.
   */
-template<typename T, bool Align> inline void conditional_aligned_delete(T *ptr, size_t size)
+template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete(T *ptr, size_t size)
 {
   destruct_elements_of_array<T>(ptr, size);
   conditional_aligned_free<Align>(ptr);
 }
 
-template<typename T, bool Align> inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size)
+template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size)
 {
   check_size_for_overflow<T>(new_size);
   check_size_for_overflow<T>(old_size);
@@ -410,17 +439,39 @@ template<typename T, bool Align> inline T* conditional_aligned_realloc_new(T* pt
     destruct_elements_of_array(pts+new_size, old_size-new_size);
   T *result = reinterpret_cast<T*>(conditional_aligned_realloc<Align>(reinterpret_cast<void*>(pts), sizeof(T)*new_size, sizeof(T)*old_size));
   if(new_size > old_size)
-    construct_elements_of_array(result+old_size, new_size-old_size);
+  {
+    EIGEN_TRY
+    {
+      construct_elements_of_array(result+old_size, new_size-old_size);
+    }
+    EIGEN_CATCH(...)
+    {
+      conditional_aligned_free<Align>(result);
+      EIGEN_THROW;
+    }
+  }
   return result;
 }
 
 
-template<typename T, bool Align> inline T* conditional_aligned_new_auto(size_t size)
+template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new_auto(size_t size)
 {
+  if(size==0)
+    return 0; // short-cut. Also fixes Bug 884
   check_size_for_overflow<T>(size);
   T *result = reinterpret_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T)*size));
   if(NumTraits<T>::RequireInitialization)
-    construct_elements_of_array(result, size);
+  {
+    EIGEN_TRY
+    {
+      construct_elements_of_array(result, size);
+    }
+    EIGEN_CATCH(...)
+    {
+      conditional_aligned_free<Align>(result);
+      EIGEN_THROW;
+    }
+  }
   return result;
 }
 
@@ -432,11 +483,21 @@ template<typename T, bool Align> inline T* conditional_aligned_realloc_new_auto(
     destruct_elements_of_array(pts+new_size, old_size-new_size);
   T *result = reinterpret_cast<T*>(conditional_aligned_realloc<Align>(reinterpret_cast<void*>(pts), sizeof(T)*new_size, sizeof(T)*old_size));
   if(NumTraits<T>::RequireInitialization && (new_size > old_size))
-    construct_elements_of_array(result+old_size, new_size-old_size);
+  {
+    EIGEN_TRY
+    {
+      construct_elements_of_array(result+old_size, new_size-old_size);
+    }
+    EIGEN_CATCH(...)
+    {
+      conditional_aligned_free<Align>(result);
+      EIGEN_THROW;
+    }
+  }
   return result;
 }
 
-template<typename T, bool Align> inline void conditional_aligned_delete_auto(T *ptr, size_t size)
+template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete_auto(T *ptr, size_t size)
 {
   if(NumTraits<T>::RequireInitialization)
     destruct_elements_of_array<T>(ptr, size);
@@ -445,52 +506,61 @@ template<typename T, bool Align> inline void conditional_aligned_delete_auto(T *
 
 /****************************************************************************/
 
-/** \internal Returns the index of the first element of the array that is well aligned for vectorization.
+/** \internal Returns the index of the first element of the array that is well aligned with respect to the requested \a Alignment.
   *
+  * \tparam Alignment requested alignment in Bytes.
   * \param array the address of the start of the array
   * \param size the size of the array
   *
-  * \note If no element of the array is well aligned, the size of the array is returned. Typically,
-  * for example with SSE, "well aligned" means 16-byte-aligned. If vectorization is disabled or if the
+  * \note If no element of the array is well aligned or the requested alignment is not a multiple of a scalar,
+  * the size of the array is returned. For example with SSE, the requested alignment is typically 16-bytes. If
   * packet size for the given scalar type is 1, then everything is considered well-aligned.
   *
-  * \note If the scalar type is vectorizable, we rely on the following assumptions: sizeof(Scalar) is a
-  * power of 2, the packet size in bytes is also a power of 2, and is a multiple of sizeof(Scalar). On the
-  * other hand, we do not assume that the array address is a multiple of sizeof(Scalar), as that fails for
+  * \note Otherwise, if the Alignment is larger than the scalar size, we rely on the assumption that sizeof(Scalar) is a
+  * power of 2. On the other hand, we do not assume that the array address is a multiple of sizeof(Scalar), as that fails for
   * example with Scalar=double on certain 32-bit platforms, see bug #79.
   *
   * There is also the variant first_aligned(const MatrixBase&) defined in DenseCoeffsBase.h.
+  * \sa first_default_aligned()
   */
-template<typename Scalar, typename Index>
-static inline Index first_aligned(const Scalar* array, Index size)
+template<int Alignment, typename Scalar, typename Index>
+inline Index first_aligned(const Scalar* array, Index size)
 {
-  enum { PacketSize = packet_traits<Scalar>::size,
-         PacketAlignedMask = PacketSize-1
-  };
+  static const Index ScalarSize = sizeof(Scalar);
+  static const Index AlignmentSize = Alignment / ScalarSize;
+  static const Index AlignmentMask = AlignmentSize-1;
 
-  if(PacketSize==1)
+  if(AlignmentSize<=1)
   {
-    // Either there is no vectorization, or a packet consists of exactly 1 scalar so that all elements
-    // of the array have the same alignment.
+    // Either the requested alignment is smaller than a scalar, or it exactly matches 1 scalar
+    // so that all elements of the array have the same alignment.
     return 0;
   }
-  else if(size_t(array) & (sizeof(Scalar)-1))
+  else if( (std::size_t(array) & (sizeof(Scalar)-1)) || (Alignment%ScalarSize)!=0)
   {
-    // There is vectorization for this scalar type, but the array is not aligned to the size of a single scalar.
+    // The array is not aligned to the size of a single scalar, or the requested alignment is not a multiple of the scalar size.
     // Consequently, no element of the array is well aligned.
     return size;
   }
   else
   {
-    return std::min<Index>( (PacketSize - (Index((size_t(array)/sizeof(Scalar))) & PacketAlignedMask))
-                           & PacketAlignedMask, size);
+    return std::min<Index>( (AlignmentSize - (Index((std::size_t(array)/sizeof(Scalar))) & AlignmentMask)) & AlignmentMask, size);
   }
 }
 
+/** \internal Returns the index of the first element of the array that is well aligned with respect to the largest packet requirement.
+   * \sa first_aligned(Scalar*,Index) and first_default_aligned(DenseBase<Derived>) */
+template<typename Scalar, typename Index>
+inline Index first_default_aligned(const Scalar* array, Index size)
+{
+  typedef typename packet_traits<Scalar>::type DefaultPacketType;
+  return first_aligned<unpacket_traits<DefaultPacketType>::alignment>(array, size);
+}
+
 /** \internal Returns the smallest integer multiple of \a base and greater or equal to \a size
   */ 
 template<typename Index> 
-inline static Index first_multiple(Index size, Index base)
+inline Index first_multiple(Index size, Index base)
 {
   return ((size+base-1)/base)*base;
 }
@@ -499,21 +569,49 @@ inline static Index first_multiple(Index size, Index base)
 // use memcpy on trivial types, i.e., on types that does not require an initialization ctor.
 template<typename T, bool UseMemcpy> struct smart_copy_helper;
 
-template<typename T> void smart_copy(const T* start, const T* end, T* target)
+template<typename T> EIGEN_DEVICE_FUNC void smart_copy(const T* start, const T* end, T* target)
 {
   smart_copy_helper<T,!NumTraits<T>::RequireInitialization>::run(start, end, target);
 }
 
 template<typename T> struct smart_copy_helper<T,true> {
-  static inline void run(const T* start, const T* end, T* target)
+  EIGEN_DEVICE_FUNC static inline void run(const T* start, const T* end, T* target)
   { memcpy(target, start, std::ptrdiff_t(end)-std::ptrdiff_t(start)); }
 };
 
 template<typename T> struct smart_copy_helper<T,false> {
-  static inline void run(const T* start, const T* end, T* target)
+  EIGEN_DEVICE_FUNC static inline void run(const T* start, const T* end, T* target)
   { std::copy(start, end, target); }
 };
 
+// intelligent memmove. falls back to std::memmove for POD types, uses std::copy otherwise. 
+template<typename T, bool UseMemmove> struct smart_memmove_helper;
+
+template<typename T> void smart_memmove(const T* start, const T* end, T* target)
+{
+  smart_memmove_helper<T,!NumTraits<T>::RequireInitialization>::run(start, end, target);
+}
+
+template<typename T> struct smart_memmove_helper<T,true> {
+  static inline void run(const T* start, const T* end, T* target)
+  { std::memmove(target, start, std::ptrdiff_t(end)-std::ptrdiff_t(start)); }
+};
+
+template<typename T> struct smart_memmove_helper<T,false> {
+  static inline void run(const T* start, const T* end, T* target)
+  { 
+    if (uintptr_t(target) < uintptr_t(start))
+    {
+      std::copy(start, end, target);
+    }
+    else                                 
+    {
+      std::ptrdiff_t count = (std::ptrdiff_t(end)-std::ptrdiff_t(start)) / sizeof(T);
+      std::copy_backward(start, end, target + count); 
+    }
+  }
+};
+
 
 /*****************************************************************************
 *** Implementation of runtime stack allocation (falling back to malloc)    ***
@@ -522,16 +620,16 @@ template<typename T> struct smart_copy_helper<T,false> {
 // you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA
 // to the appropriate stack allocation function
 #ifndef EIGEN_ALLOCA
-  #if (defined __linux__)
+  #if EIGEN_OS_LINUX || EIGEN_OS_MAC || (defined alloca)
     #define EIGEN_ALLOCA alloca
-  #elif defined(_MSC_VER)
+  #elif EIGEN_COMP_MSVC
     #define EIGEN_ALLOCA _alloca
   #endif
 #endif
 
 // This helper class construct the allocated memory, and takes care of destructing and freeing the handled data
 // at destruction time. In practice this helper class is mainly useful to avoid memory leak in case of exceptions.
-template<typename T> class aligned_stack_memory_handler
+template<typename T> class aligned_stack_memory_handler : noncopyable
 {
   public:
     /* Creates a stack_memory_handler responsible for the buffer \a ptr of size \a size.
@@ -559,6 +657,30 @@ template<typename T> class aligned_stack_memory_handler
     bool m_deallocate;
 };
 
+template<typename T> class scoped_array : noncopyable
+{
+  T* m_ptr;
+public:
+  explicit scoped_array(std::ptrdiff_t size)
+  {
+    m_ptr = new T[size];
+  }
+  ~scoped_array()
+  {
+    delete[] m_ptr;
+  }
+  T& operator[](std::ptrdiff_t i) { return m_ptr[i]; }
+  const T& operator[](std::ptrdiff_t i) const { return m_ptr[i]; }
+  T* &ptr() { return m_ptr; }
+  const T* ptr() const { return m_ptr; }
+  operator const T*() const { return m_ptr; }
+};
+
+template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
+{
+  std::swap(a.ptr(),b.ptr());
+}
+    
 } // end namespace internal
 
 /** \internal
@@ -577,11 +699,13 @@ template<typename T> class aligned_stack_memory_handler
   * The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token.
   */
 #ifdef EIGEN_ALLOCA
-
-  #ifdef __arm__
-    #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast<void*>((reinterpret_cast<size_t>(EIGEN_ALLOCA(SIZE+16)) & ~(size_t(15))) + 16)
+  
+  #if EIGEN_DEFAULT_ALIGN_BYTES>0
+    // We always manually re-align the result of EIGEN_ALLOCA.
+    // If alloca is already aligned, the compiler should be smart enough to optimize away the re-alignment.
+    #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast<void*>((reinterpret_cast<std::size_t>(EIGEN_ALLOCA(SIZE+EIGEN_DEFAULT_ALIGN_BYTES-1)) + EIGEN_DEFAULT_ALIGN_BYTES-1) & ~(std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1)))
   #else
-    #define EIGEN_ALIGNED_ALLOCA EIGEN_ALLOCA
+    #define EIGEN_ALIGNED_ALLOCA(SIZE) EIGEN_ALLOCA(SIZE)
   #endif
 
   #define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \
@@ -606,21 +730,12 @@ template<typename T> class aligned_stack_memory_handler
 *** Implementation of EIGEN_MAKE_ALIGNED_OPERATOR_NEW [_IF]                ***
 *****************************************************************************/
 
-#if EIGEN_ALIGN
-  #ifdef EIGEN_EXCEPTIONS
-    #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
-      void* operator new(size_t size, const std::nothrow_t&) throw() { \
-        try { return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); } \
-        catch (...) { return 0; } \
-        return 0; \
-      }
-  #else
-    #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
-      void* operator new(size_t size, const std::nothrow_t&) throw() { \
-        return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
+#if EIGEN_MAX_ALIGN_BYTES!=0
+  #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
+      void* operator new(size_t size, const std::nothrow_t&) EIGEN_NO_THROW { \
+        EIGEN_TRY { return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); } \
+        EIGEN_CATCH (...) { return 0; } \
       }
-  #endif
-
   #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign) \
       void *operator new(size_t size) { \
         return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
@@ -628,16 +743,20 @@ template<typename T> class aligned_stack_memory_handler
       void *operator new[](size_t size) { \
         return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
       } \
-      void operator delete(void * ptr) throw() { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
-      void operator delete[](void * ptr) throw() { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
+      void operator delete(void * ptr) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
+      void operator delete[](void * ptr) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
+      void operator delete(void * ptr, std::size_t /* sz */) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
+      void operator delete[](void * ptr, std::size_t /* sz */) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
       /* in-place new and delete. since (at least afaik) there is no actual   */ \
       /* memory allocated we can safely let the default implementation handle */ \
       /* this particular case. */ \
       static void *operator new(size_t size, void *ptr) { return ::operator new(size,ptr); } \
-      void operator delete(void * memory, void *ptr) throw() { return ::operator delete(memory,ptr); } \
+      static void *operator new[](size_t size, void* ptr) { return ::operator new[](size,ptr); } \
+      void operator delete(void * memory, void *ptr) EIGEN_NO_THROW { return ::operator delete(memory,ptr); } \
+      void operator delete[](void * memory, void *ptr) EIGEN_NO_THROW { return ::operator delete[](memory,ptr); } \
       /* nothrow-new (returns zero instead of std::bad_alloc) */ \
       EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
-      void operator delete(void *ptr, const std::nothrow_t&) throw() { \
+      void operator delete(void *ptr, const std::nothrow_t&) EIGEN_NO_THROW { \
         Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); \
       } \
       typedef void eigen_aligned_operator_new_marker_type;
@@ -647,7 +766,7 @@ template<typename T> class aligned_stack_memory_handler
 
 #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(true)
 #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%16==0)))
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%EIGEN_MAX_ALIGN_BYTES==0)))
 
 /****************************************************************************/
 
@@ -668,102 +787,53 @@ template<typename T> class aligned_stack_memory_handler
 * \sa \ref TopicStlContainers.
 */
 template<class T>
-class aligned_allocator
+class aligned_allocator : public std::allocator<T>
 {
 public:
-    typedef size_t    size_type;
-    typedef std::ptrdiff_t difference_type;
-    typedef T*        pointer;
-    typedef const T*  const_pointer;
-    typedef T&        reference;
-    typedef const T&  const_reference;
-    typedef T         value_type;
-
-    template<class U>
-    struct rebind
-    {
-        typedef aligned_allocator<U> other;
-    };
-
-    pointer address( reference value ) const
-    {
-        return &value;
-    }
-
-    const_pointer address( const_reference value ) const
-    {
-        return &value;
-    }
-
-    aligned_allocator()
-    {
-    }
-
-    aligned_allocator( const aligned_allocator& )
-    {
-    }
-
-    template<class U>
-    aligned_allocator( const aligned_allocator<U>& )
-    {
-    }
-
-    ~aligned_allocator()
-    {
-    }
-
-    size_type max_size() const
-    {
-        return (std::numeric_limits<size_type>::max)();
-    }
-
-    pointer allocate( size_type num, const void* hint = 0 )
-    {
-        EIGEN_UNUSED_VARIABLE(hint);
-        internal::check_size_for_overflow<T>(num);
-        return static_cast<pointer>( internal::aligned_malloc( num * sizeof(T) ) );
-    }
+  typedef size_t          size_type;
+  typedef std::ptrdiff_t  difference_type;
+  typedef T*              pointer;
+  typedef const T*        const_pointer;
+  typedef T&              reference;
+  typedef const T&        const_reference;
+  typedef T               value_type;
+
+  template<class U>
+  struct rebind
+  {
+    typedef aligned_allocator<U> other;
+  };
 
-    void construct( pointer p, const T& value )
-    {
-        ::new( p ) T( value );
-    }
+  aligned_allocator() : std::allocator<T>() {}
 
-    // Support for c++11
-#if (__cplusplus >= 201103L)
-    template<typename... Args>
-    void  construct(pointer p, Args&&... args)
-    {
-      ::new(p) T(std::forward<Args>(args)...);
-    }
-#endif
+  aligned_allocator(const aligned_allocator& other) : std::allocator<T>(other) {}
 
-    void destroy( pointer p )
-    {
-        p->~T();
-    }
+  template<class U>
+  aligned_allocator(const aligned_allocator<U>& other) : std::allocator<T>(other) {}
 
-    void deallocate( pointer p, size_type /*num*/ )
-    {
-        internal::aligned_free( p );
-    }
+  ~aligned_allocator() {}
 
-    bool operator!=(const aligned_allocator<T>& ) const
-    { return false; }
+  pointer allocate(size_type num, const void* /*hint*/ = 0)
+  {
+    internal::check_size_for_overflow<T>(num);
+    return static_cast<pointer>( internal::aligned_malloc(num * sizeof(T)) );
+  }
 
-    bool operator==(const aligned_allocator<T>& ) const
-    { return true; }
+  void deallocate(pointer p, size_type /*num*/)
+  {
+    internal::aligned_free(p);
+  }
 };
 
 //---------- Cache sizes ----------
 
 #if !defined(EIGEN_NO_CPUID)
-#  if defined(__GNUC__) && ( defined(__i386__) || defined(__x86_64__) )
-#    if defined(__PIC__) && defined(__i386__)
+#  if EIGEN_COMP_GNUC && EIGEN_ARCH_i386_OR_x86_64
+#    if defined(__PIC__) && EIGEN_ARCH_i386
        // Case for x86 with PIC
 #      define EIGEN_CPUID(abcd,func,id) \
          __asm__ __volatile__ ("xchgl %%ebx, %k1;cpuid; xchgl %%ebx,%k1": "=a" (abcd[0]), "=&r" (abcd[1]), "=c" (abcd[2]), "=d" (abcd[3]) : "a" (func), "c" (id));
-#    elif defined(__PIC__) && defined(__x86_64__)
+#    elif defined(__PIC__) && EIGEN_ARCH_x86_64
        // Case for x64 with PIC. In theory this is only a problem with recent gcc and with medium or large code model, not with the default small code model.
        // However, we cannot detect which code model is used, and the xchg overhead is negligible anyway.
 #      define EIGEN_CPUID(abcd,func,id) \
@@ -773,8 +843,8 @@ class aligned_allocator
 #      define EIGEN_CPUID(abcd,func,id) \
          __asm__ __volatile__ ("cpuid": "=a" (abcd[0]), "=b" (abcd[1]), "=c" (abcd[2]), "=d" (abcd[3]) : "0" (func), "2" (id) );
 #    endif
-#  elif defined(_MSC_VER)
-#    if (_MSC_VER > 1500) && ( defined(_M_IX86) || defined(_M_X64) )
+#  elif EIGEN_COMP_MSVC
+#    if (EIGEN_COMP_MSVC > 1500) && EIGEN_ARCH_i386_OR_x86_64
 #      define EIGEN_CPUID(abcd,func,id) __cpuidex((int*)abcd,func,id)
 #    endif
 #  endif
@@ -784,9 +854,9 @@ namespace internal {
 
 #ifdef EIGEN_CPUID
 
-inline bool cpuid_is_vendor(int abcd[4], const char* vendor)
+inline bool cpuid_is_vendor(int abcd[4], const int vendor[3])
 {
-  return abcd[1]==(reinterpret_cast<const int*>(vendor))[0] && abcd[3]==(reinterpret_cast<const int*>(vendor))[1] && abcd[2]==(reinterpret_cast<const int*>(vendor))[2];
+  return abcd[1]==vendor[0] && abcd[3]==vendor[1] && abcd[2]==vendor[2];
 }
 
 inline void queryCacheSizes_intel_direct(int& l1, int& l2, int& l3)
@@ -928,13 +998,16 @@ inline void queryCacheSizes(int& l1, int& l2, int& l3)
 {
   #ifdef EIGEN_CPUID
   int abcd[4];
+  const int GenuineIntel[] = {0x756e6547, 0x49656e69, 0x6c65746e};
+  const int AuthenticAMD[] = {0x68747541, 0x69746e65, 0x444d4163};
+  const int AMDisbetter_[] = {0x69444d41, 0x74656273, 0x21726574}; // "AMDisbetter!"
 
   // identify the CPU vendor
   EIGEN_CPUID(abcd,0x0,0);
   int max_std_funcs = abcd[1];
-  if(cpuid_is_vendor(abcd,"GenuineIntel"))
+  if(cpuid_is_vendor(abcd,GenuineIntel))
     queryCacheSizes_intel(l1,l2,l3,max_std_funcs);
-  else if(cpuid_is_vendor(abcd,"AuthenticAMD") || cpuid_is_vendor(abcd,"AMDisbetter!"))
+  else if(cpuid_is_vendor(abcd,AuthenticAMD) || cpuid_is_vendor(abcd,AMDisbetter_))
     queryCacheSizes_amd(l1,l2,l3);
   else
     // by default let's use Intel's API
diff --git a/nuparu/include/Eigen/src/Core/util/Meta.h b/nuparu/include/Eigen/src/Core/util/Meta.h
index 71d58710..3dee2bd7 100644
--- a/nuparu/include/Eigen/src/Core/util/Meta.h
+++ b/nuparu/include/Eigen/src/Core/util/Meta.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -11,6 +11,11 @@
 #ifndef EIGEN_META_H
 #define EIGEN_META_H
 
+#if defined(__CUDA_ARCH__)
+#include <cfloat>
+#include <math_constants.h>
+#endif
+
 namespace Eigen {
 
 namespace internal {
@@ -68,6 +73,18 @@ template<> struct is_arithmetic<unsigned int>  { enum { value = true }; };
 template<> struct is_arithmetic<signed long>   { enum { value = true }; };
 template<> struct is_arithmetic<unsigned long> { enum { value = true }; };
 
+template<typename T> struct is_integral        { enum { value = false }; };
+template<> struct is_integral<bool>            { enum { value = true }; };
+template<> struct is_integral<char>            { enum { value = true }; };
+template<> struct is_integral<signed char>     { enum { value = true }; };
+template<> struct is_integral<unsigned char>   { enum { value = true }; };
+template<> struct is_integral<signed short>    { enum { value = true }; };
+template<> struct is_integral<unsigned short>  { enum { value = true }; };
+template<> struct is_integral<signed int>      { enum { value = true }; };
+template<> struct is_integral<unsigned int>    { enum { value = true }; };
+template<> struct is_integral<signed long>     { enum { value = true }; };
+template<> struct is_integral<unsigned long>   { enum { value = true }; };
+
 template <typename T> struct add_const { typedef const T type; };
 template <typename T> struct add_const<T&> { typedef T& type; };
 
@@ -80,6 +97,34 @@ template<typename T> struct add_const_on_value_type<T*>        { typedef T const
 template<typename T> struct add_const_on_value_type<T* const>  { typedef T const* const type; };
 template<typename T> struct add_const_on_value_type<T const* const>  { typedef T const* const type; };
 
+
+template<typename From, typename To>
+struct is_convertible_impl
+{
+private:
+  struct any_conversion
+  {
+    template <typename T> any_conversion(const volatile T&);
+    template <typename T> any_conversion(T&);
+  };
+  struct yes {int a[1];};
+  struct no  {int a[2];};
+
+  static yes test(const To&, int);
+  static no  test(any_conversion, ...);
+
+public:
+  static From ms_from;
+  enum { value = sizeof(test(ms_from, 0))==sizeof(yes) };
+};
+
+template<typename From, typename To>
+struct is_convertible
+{
+  enum { value = is_convertible_impl<typename remove_all<From>::type,
+                                     typename remove_all<To  >::type>::value };
+};
+
 /** \internal Allows to enable/disable an overload
   * according to a compile time condition.
   */
@@ -88,21 +133,110 @@ template<bool Condition, typename T> struct enable_if;
 template<typename T> struct enable_if<true,T>
 { typedef T type; };
 
+#if defined(__CUDA_ARCH__)
+#if !defined(__FLT_EPSILON__)
+#define __FLT_EPSILON__ FLT_EPSILON
+#define __DBL_EPSILON__ DBL_EPSILON
+#endif
+
+namespace device {
+
+template<typename T> struct numeric_limits
+{
+  EIGEN_DEVICE_FUNC
+  static T epsilon() { return 0; }
+  static T (max)() { assert(false && "Highest not supported for this type"); }
+  static T (min)() { assert(false && "Lowest not supported for this type"); }
+};
+template<> struct numeric_limits<float>
+{
+  EIGEN_DEVICE_FUNC
+  static float epsilon() { return __FLT_EPSILON__; }
+  EIGEN_DEVICE_FUNC
+  static float (max)() { return CUDART_MAX_NORMAL_F; }
+  EIGEN_DEVICE_FUNC
+  static float (min)() { return FLT_MIN; }
+};
+template<> struct numeric_limits<double>
+{
+  EIGEN_DEVICE_FUNC
+  static double epsilon() { return __DBL_EPSILON__; }
+  EIGEN_DEVICE_FUNC
+  static double (max)() { return DBL_MAX; }
+  EIGEN_DEVICE_FUNC
+  static double (min)() { return DBL_MIN; }
+};
+template<> struct numeric_limits<int>
+{
+  EIGEN_DEVICE_FUNC
+  static int epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC
+  static int (max)() { return INT_MAX; }
+  EIGEN_DEVICE_FUNC
+  static int (min)() { return INT_MIN; }
+};
+template<> struct numeric_limits<unsigned int>
+{
+  EIGEN_DEVICE_FUNC
+  static unsigned int epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC
+  static unsigned int (max)() { return UINT_MAX; }
+  EIGEN_DEVICE_FUNC
+  static unsigned int (min)() { return 0; }
+};
+template<> struct numeric_limits<long>
+{
+  EIGEN_DEVICE_FUNC
+  static long epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC
+  static long (max)() { return LONG_MAX; }
+  EIGEN_DEVICE_FUNC
+  static long (min)() { return LONG_MIN; }
+};
+template<> struct numeric_limits<unsigned long>
+{
+  EIGEN_DEVICE_FUNC
+  static unsigned long epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC
+  static unsigned long (max)() { return ULONG_MAX; }
+  EIGEN_DEVICE_FUNC
+  static unsigned long (min)() { return 0; }
+};
+template<> struct numeric_limits<long long>
+{
+  EIGEN_DEVICE_FUNC
+  static long long epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC
+  static long long (max)() { return LLONG_MAX; }
+  EIGEN_DEVICE_FUNC
+  static long long (min)() { return LLONG_MIN; }
+};
+template<> struct numeric_limits<unsigned long long>
+{
+  EIGEN_DEVICE_FUNC
+  static unsigned long long epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC
+  static unsigned long long (max)() { return ULLONG_MAX; }
+  EIGEN_DEVICE_FUNC
+  static unsigned long long (min)() { return 0; }
+};
+
+}
 
+#endif
 
 /** \internal
   * A base class do disable default copy ctor and copy assignement operator.
   */
 class noncopyable
 {
-  noncopyable(const noncopyable&);
-  const noncopyable& operator=(const noncopyable&);
+  EIGEN_DEVICE_FUNC noncopyable(const noncopyable&);
+  EIGEN_DEVICE_FUNC const noncopyable& operator=(const noncopyable&);
 protected:
-  noncopyable() {}
-  ~noncopyable() {}
+  EIGEN_DEVICE_FUNC noncopyable() {}
+  EIGEN_DEVICE_FUNC ~noncopyable() {}
 };
 
-
 /** \internal
   * Convenient struct to get the result type of a unary or binary functor.
   *
@@ -110,7 +244,13 @@ class noncopyable
   * upcoming next STL generation (using a templated result member).
   * If none of these members is provided, then the type of the first argument is returned. FIXME, that behavior is a pretty bad hack.
   */
-template<typename T> struct result_of {};
+#ifdef EIGEN_HAS_STD_RESULT_OF
+template<typename T> struct result_of {
+  typedef typename std::result_of<T>::type type1;
+  typedef typename remove_all<type1>::type type;
+};
+#else
+template<typename T> struct result_of { };
 
 struct has_none {int a[1];};
 struct has_std_result_type {int a[2];};
@@ -128,10 +268,10 @@ struct unary_result_of_select<Func, ArgType, sizeof(has_tr1_result)> {typedef ty
 template<typename Func, typename ArgType>
 struct result_of<Func(ArgType)> {
     template<typename T>
-    static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0);
+    static has_std_result_type    testFunctor(T const *, typename T::result_type const * = 0);
     template<typename T>
-    static has_tr1_result      testFunctor(T const *, typename T::template result<T(ArgType)>::type const * = 0);
-    static has_none            testFunctor(...);
+    static has_tr1_result         testFunctor(T const *, typename T::template result<T(ArgType)>::type const * = 0);
+    static has_none               testFunctor(...);
 
     // note that the following indirection is needed for gcc-3.3
     enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
@@ -152,15 +292,16 @@ struct binary_result_of_select<Func, ArgType0, ArgType1, sizeof(has_tr1_result)>
 template<typename Func, typename ArgType0, typename ArgType1>
 struct result_of<Func(ArgType0,ArgType1)> {
     template<typename T>
-    static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0);
+    static has_std_result_type    testFunctor(T const *, typename T::result_type const * = 0);
     template<typename T>
-    static has_tr1_result      testFunctor(T const *, typename T::template result<T(ArgType0,ArgType1)>::type const * = 0);
-    static has_none            testFunctor(...);
+    static has_tr1_result         testFunctor(T const *, typename T::template result<T(ArgType0,ArgType1)>::type const * = 0);
+    static has_none               testFunctor(...);
 
     // note that the following indirection is needed for gcc-3.3
     enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
     typedef typename binary_result_of_select<Func, ArgType0, ArgType1, FunctorType>::type type;
 };
+#endif
 
 /** \internal In short, it computes int(sqrt(\a Y)) with \a Y an integer.
   * Usage example: \code meta_sqrt<1023>::ret \endcode
@@ -224,19 +365,25 @@ template<typename T> struct scalar_product_traits<std::complex<T>, T>
 // typedef typename scalar_product_traits<typename remove_all<ArgType0>::type, typename remove_all<ArgType1>::type>::ReturnType type;
 // };
 
-template<typename T> struct is_diagonal
-{ enum { ret = false }; };
-
-template<typename T> struct is_diagonal<DiagonalBase<T> >
-{ enum { ret = true }; };
-
-template<typename T> struct is_diagonal<DiagonalWrapper<T> >
-{ enum { ret = true }; };
+} // end namespace internal
 
-template<typename T, int S> struct is_diagonal<DiagonalMatrix<T,S> >
-{ enum { ret = true }; };
+namespace numext {
+  
+#if defined(__CUDA_ARCH__)
+template<typename T> EIGEN_DEVICE_FUNC   void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; }
+#else
+template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); }
+#endif
+
+// Integer division with rounding up.
+// T is assumed to be an integer type with a>=0, and b>0
+template<typename T>
+T div_ceil(const T &a, const T &b)
+{
+  return (a+b-1) / b;
+}
 
-} // end namespace internal
+} // end namespace numext
 
 } // end namespace Eigen
 
diff --git a/nuparu/include/Eigen/src/Core/util/StaticAssert.h b/nuparu/include/Eigen/src/Core/util/StaticAssert.h
index 8872c5b6..1fe365aa 100644
--- a/nuparu/include/Eigen/src/Core/util/StaticAssert.h
+++ b/nuparu/include/Eigen/src/Core/util/StaticAssert.h
@@ -26,7 +26,7 @@
 
 #ifndef EIGEN_NO_STATIC_ASSERT
 
-  #if defined(__GXX_EXPERIMENTAL_CXX0X__) || (defined(_MSC_VER) && (_MSC_VER >= 1600))
+  #if defined(__GXX_EXPERIMENTAL_CXX0X__) || (EIGEN_COMP_MSVC >= 1600)
 
     // if native static_assert is enabled, let's use it
     #define EIGEN_STATIC_ASSERT(X,MSG) static_assert(X,#MSG);
@@ -84,13 +84,20 @@
         THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY,
         YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT,
         THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS,
+        THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS,
         THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL,
         THIS_METHOD_IS_ONLY_FOR_ARRAYS_NOT_MATRICES,
         YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED,
         YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED,
         THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE,
         THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH,
-        OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG
+        OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG,
+        IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY,
+        STORAGE_LAYOUT_DOES_NOT_MATCH,
+        EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT__INVALID_COST_VALUE,
+        THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS,
+        MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY,
+        THIS_TYPE_IS_NOT_SUPPORTED
       };
     };
 
@@ -101,15 +108,15 @@
     // Specialized implementation for MSVC to avoid "conditional
     // expression is constant" warnings.  This implementation doesn't
     // appear to work under GCC, hence the multiple implementations.
-    #ifdef _MSC_VER
+    #if EIGEN_COMP_MSVC
 
       #define EIGEN_STATIC_ASSERT(CONDITION,MSG) \
         {Eigen::internal::static_assertion<bool(CONDITION)>::MSG;}
 
     #else
-
+      // In some cases clang interprets bool(CONDITION) as a function declaration
       #define EIGEN_STATIC_ASSERT(CONDITION,MSG) \
-        if (Eigen::internal::static_assertion<bool(CONDITION)>::MSG) {}
+        if (Eigen::internal::static_assertion<static_cast<bool>(CONDITION)>::MSG) {}
 
     #endif
 
@@ -157,7 +164,7 @@
 
 #define EIGEN_PREDICATE_SAME_MATRIX_SIZE(TYPE0,TYPE1) \
      ( \
-        (int(TYPE0::SizeAtCompileTime)==0 && int(TYPE1::SizeAtCompileTime)==0) \
+        (int(internal::size_of_xpr_at_compile_time<TYPE0>::ret)==0 && int(internal::size_of_xpr_at_compile_time<TYPE1>::ret)==0) \
     || (\
           (int(TYPE0::RowsAtCompileTime)==Eigen::Dynamic \
         || int(TYPE1::RowsAtCompileTime)==Eigen::Dynamic \
@@ -168,13 +175,8 @@
        ) \
      )
 
-#ifdef EIGEN2_SUPPORT
-  #define EIGEN_STATIC_ASSERT_NON_INTEGER(TYPE) \
-    eigen_assert(!NumTraits<Scalar>::IsInteger);
-#else
-  #define EIGEN_STATIC_ASSERT_NON_INTEGER(TYPE) \
+#define EIGEN_STATIC_ASSERT_NON_INTEGER(TYPE) \
     EIGEN_STATIC_ASSERT(!NumTraits<TYPE>::IsInteger, THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES)
-#endif
 
 
 // static assertion failing if it is guaranteed at compile-time that the two matrix expression types have different sizes
@@ -202,5 +204,9 @@
                                             >::value), \
                           YOU_CANNOT_MIX_ARRAYS_AND_MATRICES)
 
+// Check that a cost value is non-negative and that it stays within a reasonable range
+// TODO this check could be enabled for internal debugging only
+#define EIGEN_INTERNAL_CHECK_COST_VALUE(C) \
+      EIGEN_STATIC_ASSERT((C)>=0 && (C)<=HugeCost*HugeCost, EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT__INVALID_COST_VALUE);
 
 #endif // EIGEN_STATIC_ASSERT_H
diff --git a/nuparu/include/Eigen/src/Core/util/XprHelper.h b/nuparu/include/Eigen/src/Core/util/XprHelper.h
index 3c477305..f9e2959c 100644
--- a/nuparu/include/Eigen/src/Core/util/XprHelper.h
+++ b/nuparu/include/Eigen/src/Core/util/XprHelper.h
@@ -14,10 +14,10 @@
 // just a workaround because GCC seems to not really like empty structs
 // FIXME: gcc 4.3 generates bad code when strict-aliasing is enabled
 // so currently we simply disable this optimization for gcc 4.3
-#if (defined __GNUG__) && !((__GNUC__==4) && (__GNUC_MINOR__==3))
+#if EIGEN_COMP_GNUC && !EIGEN_GNUC_AT(4,3)
   #define EIGEN_EMPTY_STRUCT_CTOR(X) \
-    EIGEN_STRONG_INLINE X() {} \
-    EIGEN_STRONG_INLINE X(const X& ) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE X() {} \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE X(const X& ) {}
 #else
   #define EIGEN_EMPTY_STRUCT_CTOR(X)
 #endif
@@ -26,8 +26,25 @@ namespace Eigen {
 
 typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE DenseIndex;
 
+/**
+ * \brief The Index type as used for the API.
+ * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE.
+ * \sa \ref TopicPreprocessorDirectives, StorageIndex.
+ */
+
+typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE Index;
+
 namespace internal {
 
+template<typename IndexDest, typename IndexSrc>
+EIGEN_DEVICE_FUNC
+inline IndexDest convert_index(const IndexSrc& idx) {
+  // for sizeof(IndexDest)>=sizeof(IndexSrc) compilers should be able to optimize this away:
+  eigen_internal_assert(idx <= NumTraits<IndexDest>::highest() && "Index value to big for target type");
+  return IndexDest(idx);
+}
+
+
 //classes inheriting no_assignment_operator don't generate a default operator=.
 class no_assignment_operator
 {
@@ -50,19 +67,19 @@ template<typename T, int Value> class variable_if_dynamic
 {
   public:
     EIGEN_EMPTY_STRUCT_CTOR(variable_if_dynamic)
-    explicit variable_if_dynamic(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); assert(v == T(Value)); }
-    static T value() { return T(Value); }
-    void setValue(T) {}
+    EIGEN_DEVICE_FUNC explicit variable_if_dynamic(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); }
+    EIGEN_DEVICE_FUNC static T value() { return T(Value); }
+    EIGEN_DEVICE_FUNC void setValue(T) {}
 };
 
 template<typename T> class variable_if_dynamic<T, Dynamic>
 {
     T m_value;
-    variable_if_dynamic() { assert(false); }
+    EIGEN_DEVICE_FUNC variable_if_dynamic() { eigen_assert(false); }
   public:
-    explicit variable_if_dynamic(T value) : m_value(value) {}
-    T value() const { return m_value; }
-    void setValue(T value) { m_value = value; }
+    EIGEN_DEVICE_FUNC explicit variable_if_dynamic(T value) : m_value(value) {}
+    EIGEN_DEVICE_FUNC T value() const { return m_value; }
+    EIGEN_DEVICE_FUNC void setValue(T value) { m_value = value; }
 };
 
 /** \internal like variable_if_dynamic but for DynamicIndex
@@ -71,19 +88,19 @@ template<typename T, int Value> class variable_if_dynamicindex
 {
   public:
     EIGEN_EMPTY_STRUCT_CTOR(variable_if_dynamicindex)
-    explicit variable_if_dynamicindex(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); assert(v == T(Value)); }
-    static T value() { return T(Value); }
-    void setValue(T) {}
+    EIGEN_DEVICE_FUNC explicit variable_if_dynamicindex(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); }
+    EIGEN_DEVICE_FUNC static T value() { return T(Value); }
+    EIGEN_DEVICE_FUNC void setValue(T) {}
 };
 
 template<typename T> class variable_if_dynamicindex<T, DynamicIndex>
 {
     T m_value;
-    variable_if_dynamicindex() { assert(false); }
+    EIGEN_DEVICE_FUNC variable_if_dynamicindex() { eigen_assert(false); }
   public:
-    explicit variable_if_dynamicindex(T value) : m_value(value) {}
-    T value() const { return m_value; }
-    void setValue(T value) { m_value = value; }
+    EIGEN_DEVICE_FUNC explicit variable_if_dynamicindex(T value) : m_value(value) {}
+    EIGEN_DEVICE_FUNC T value() const { return m_value; }
+    EIGEN_DEVICE_FUNC void setValue(T value) { m_value = value; }
 };
 
 template<typename T> struct functor_traits
@@ -101,7 +118,73 @@ template<typename T> struct packet_traits;
 template<typename T> struct unpacket_traits
 {
   typedef T type;
-  enum {size=1};
+  typedef T half;
+  enum
+  {
+    size = 1,
+    alignment = 1
+  };
+};
+
+template<int Size, typename PacketType,
+         bool Stop = Size==Dynamic || (Size%unpacket_traits<PacketType>::size)==0 || is_same<PacketType,typename unpacket_traits<PacketType>::half>::value>
+struct find_best_packet_helper;
+
+template< int Size, typename PacketType>
+struct find_best_packet_helper<Size,PacketType,true>
+{
+  typedef PacketType type;
+};
+
+template<int Size, typename PacketType>
+struct find_best_packet_helper<Size,PacketType,false>
+{
+  typedef typename find_best_packet_helper<Size,typename unpacket_traits<PacketType>::half>::type type;
+};
+
+template<typename T, int Size>
+struct find_best_packet
+{
+  typedef typename find_best_packet_helper<Size,typename packet_traits<T>::type>::type type;
+};
+
+#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
+template<int ArrayBytes, int AlignmentBytes,
+         bool Match     =  bool((ArrayBytes%AlignmentBytes)==0),
+         bool TryHalf   =  bool(AlignmentBytes>EIGEN_MIN_ALIGN_BYTES) >
+struct compute_default_alignment_helper
+{
+  enum { value = 0 };
+};
+
+template<int ArrayBytes, int AlignmentBytes, bool TryHalf>
+struct compute_default_alignment_helper<ArrayBytes, AlignmentBytes, true, TryHalf> // Match
+{
+  enum { value = AlignmentBytes };
+};
+
+template<int ArrayBytes, int AlignmentBytes>
+struct compute_default_alignment_helper<ArrayBytes, AlignmentBytes, false, true> // Try-half
+{
+  // current packet too large, try with a half-packet
+  enum { value = compute_default_alignment_helper<ArrayBytes, AlignmentBytes/2>::value };
+};
+#else
+// If static alignment is disabled, no need to bother.
+// This also avoids a division by zero in "bool Match =  bool((ArrayBytes%AlignmentBytes)==0)"
+template<int ArrayBytes, int AlignmentBytes>
+struct compute_default_alignment_helper
+{
+  enum { value = 0 };
+};
+#endif
+
+template<typename T, int Size> struct compute_default_alignment {
+  enum { value = compute_default_alignment_helper<Size*sizeof(T),EIGEN_MAX_STATIC_ALIGN_BYTES>::value };
+};
+
+template<typename T> struct compute_default_alignment<T,Dynamic> {
+  enum { value = EIGEN_MAX_ALIGN_BYTES };
 };
 
 template<typename _Scalar, int _Rows, int _Cols,
@@ -127,35 +210,12 @@ template<typename _Scalar, int _Rows, int _Cols,
 template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
 class compute_matrix_flags
 {
-    enum {
-      row_major_bit = Options&RowMajor ? RowMajorBit : 0,
-      is_dynamic_size_storage = MaxRows==Dynamic || MaxCols==Dynamic,
-
-      aligned_bit =
-      (
-            ((Options&DontAlign)==0)
-        && (
-#if EIGEN_ALIGN_STATICALLY
-             ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % 16) == 0))
-#else
-             0
-#endif
-
-          ||
-
-#if EIGEN_ALIGN
-             is_dynamic_size_storage
-#else
-             0
-#endif
-
-          )
-      ) ? AlignedBit : 0,
-      packet_access_bit = packet_traits<Scalar>::Vectorizable && aligned_bit ? PacketAccessBit : 0
-    };
-
+    enum { row_major_bit = Options&RowMajor ? RowMajorBit : 0 };
   public:
-    enum { ret = LinearAccessBit | LvalueBit | DirectAccessBit | NestByRefBit | packet_access_bit | row_major_bit | aligned_bit };
+    // FIXME currently we still have to handle DirectAccessBit at the expression level to handle DenseCoeffsBase<>
+    // and then propagate this information to the evaluator's flags.
+    // However, I (Gael) think that DirectAccessBit should only matter at the evaluation stage.
+    enum { ret = DirectAccessBit | LvalueBit | NestByRefBit | row_major_bit };
 };
 
 template<int _Rows, int _Cols> struct size_at_compile_time
@@ -163,34 +223,43 @@ template<int _Rows, int _Cols> struct size_at_compile_time
   enum { ret = (_Rows==Dynamic || _Cols==Dynamic) ? Dynamic : _Rows * _Cols };
 };
 
+template<typename XprType> struct size_of_xpr_at_compile_time
+{
+  enum { ret = size_at_compile_time<traits<XprType>::RowsAtCompileTime,traits<XprType>::ColsAtCompileTime>::ret };
+};
+
 /* plain_matrix_type : the difference from eval is that plain_matrix_type is always a plain matrix type,
  * whereas eval is a const reference in the case of a matrix
  */
 
 template<typename T, typename StorageKind = typename traits<T>::StorageKind> struct plain_matrix_type;
-template<typename T, typename BaseClassType> struct plain_matrix_type_dense;
+template<typename T, typename BaseClassType, int Flags> struct plain_matrix_type_dense;
 template<typename T> struct plain_matrix_type<T,Dense>
 {
-  typedef typename plain_matrix_type_dense<T,typename traits<T>::XprKind>::type type;
+  typedef typename plain_matrix_type_dense<T,typename traits<T>::XprKind, traits<T>::Flags>::type type;
+};
+template<typename T> struct plain_matrix_type<T,DiagonalShape>
+{
+  typedef typename T::PlainObject type;
 };
 
-template<typename T> struct plain_matrix_type_dense<T,MatrixXpr>
+template<typename T, int Flags> struct plain_matrix_type_dense<T,MatrixXpr,Flags>
 {
   typedef Matrix<typename traits<T>::Scalar,
                 traits<T>::RowsAtCompileTime,
                 traits<T>::ColsAtCompileTime,
-                AutoAlign | (traits<T>::Flags&RowMajorBit ? RowMajor : ColMajor),
+                AutoAlign | (Flags&RowMajorBit ? RowMajor : ColMajor),
                 traits<T>::MaxRowsAtCompileTime,
                 traits<T>::MaxColsAtCompileTime
           > type;
 };
 
-template<typename T> struct plain_matrix_type_dense<T,ArrayXpr>
+template<typename T, int Flags> struct plain_matrix_type_dense<T,ArrayXpr,Flags>
 {
   typedef Array<typename traits<T>::Scalar,
                 traits<T>::RowsAtCompileTime,
                 traits<T>::ColsAtCompileTime,
-                AutoAlign | (traits<T>::Flags&RowMajorBit ? RowMajor : ColMajor),
+                AutoAlign | (Flags&RowMajorBit ? RowMajor : ColMajor),
                 traits<T>::MaxRowsAtCompileTime,
                 traits<T>::MaxColsAtCompileTime
           > type;
@@ -215,6 +284,11 @@ template<typename T> struct eval<T,Dense>
 //           > type;
 };
 
+template<typename T> struct eval<T,DiagonalShape>
+{
+  typedef typename plain_matrix_type<T>::type type;
+};
+
 // for matrices, no need to evaluate, just use a const reference to avoid a useless copy
 template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
 struct eval<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>, Dense>
@@ -229,6 +303,15 @@ struct eval<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>, Dense>
 };
 
 
+/* similar to plain_matrix_type, but using the evaluator's Flags */
+template<typename T, typename StorageKind = typename traits<T>::StorageKind> struct plain_object_eval;
+
+template<typename T>
+struct plain_object_eval<T,Dense>
+{
+  typedef typename plain_matrix_type_dense<T,typename traits<T>::XprKind, evaluator<T>::Flags>::type type;
+};
+
 
 /* plain_matrix_type_column_major : same as plain_matrix_type but guaranteed to be column-major
  */
@@ -266,9 +349,6 @@ template<typename T> struct plain_matrix_type_row_major
           > type;
 };
 
-// we should be able to get rid of this one too
-template<typename T> struct must_nest_by_value { enum { ret = false }; };
-
 /** \internal The reference selector for template expressions. The idea is that we don't
   * need to use references for expressions since they are light weight proxy
   * objects which should generate no copying overhead. */
@@ -280,6 +360,12 @@ struct ref_selector
     T const&,
     const T
   >::type type;
+  
+  typedef typename conditional<
+    bool(traits<T>::Flags & NestByRefBit),
+    T &,
+    T
+  >::type non_const_type;
 };
 
 /** \internal Adds the const qualifier on the value-type of T2 if and only if T1 is a const type */
@@ -293,55 +379,46 @@ struct transfer_constness
   >::type type;
 };
 
-/** \internal Determines how a given expression should be nested into another one.
+
+// We still need a mechanism to detect whether an expression that is evaluated multiple times
+// has to be evaluated into a temporary.
+// That is the purpose of the nested_eval helper:
+/** \internal Determines how a given expression should be nested when evaluated multiple times.
   * For example, when you do a * (b+c), Eigen will determine how the expression b+c should be
-  * nested into the bigger product expression. The choice is between nesting the expression b+c as-is, or
+  * evaluated into the bigger product expression. The choice is between nesting the expression b+c as-is, or
   * evaluating that expression b+c into a temporary variable d, and nest d so that the resulting expression is
   * a*d. Evaluating can be beneficial for example if every coefficient access in the resulting expression causes
   * many coefficient accesses in the nested expressions -- as is the case with matrix product for example.
   *
-  * \param T the type of the expression being nested
+  * \param T the type of the expression being nested.
   * \param n the number of coefficient accesses in the nested expression for each coefficient access in the bigger expression.
-  *
-  * Note that if no evaluation occur, then the constness of T is preserved.
-  *
-  * Example. Suppose that a, b, and c are of type Matrix3d. The user forms the expression a*(b+c).
-  * b+c is an expression "sum of matrices", which we will denote by S. In order to determine how to nest it,
-  * the Product expression uses: nested<S, 3>::ret, which turns out to be Matrix3d because the internal logic of
-  * nested determined that in this case it was better to evaluate the expression b+c into a temporary. On the other hand,
-  * since a is of type Matrix3d, the Product expression nests it as nested<Matrix3d, 3>::ret, which turns out to be
-  * const Matrix3d&, because the internal logic of nested determined that since a was already a matrix, there was no point
-  * in copying it into another matrix.
+  * \param PlainObject the type of the temporary if needed.
   */
-template<typename T, int n=1, typename PlainObject = typename eval<T>::type> struct nested
+template<typename T, int n, typename PlainObject = typename plain_object_eval<T>::type> struct nested_eval
 {
   enum {
-    // for the purpose of this test, to keep it reasonably simple, we arbitrarily choose a value of Dynamic values.
-    // the choice of 10000 makes it larger than any practical fixed value and even most dynamic values.
-    // in extreme cases where these assumptions would be wrong, we would still at worst suffer performance issues
-    // (poor choice of temporaries).
-    // it's important that this value can still be squared without integer overflowing.
-    DynamicAsInteger = 10000,
     ScalarReadCost = NumTraits<typename traits<T>::Scalar>::ReadCost,
-    ScalarReadCostAsInteger = ScalarReadCost == Dynamic ? int(DynamicAsInteger) : int(ScalarReadCost),
-    CoeffReadCost = traits<T>::CoeffReadCost,
-    CoeffReadCostAsInteger = CoeffReadCost == Dynamic ? int(DynamicAsInteger) : int(CoeffReadCost),
-    NAsInteger = n == Dynamic ? int(DynamicAsInteger) : n,
-    CostEvalAsInteger   = (NAsInteger+1) * ScalarReadCostAsInteger + CoeffReadCostAsInteger,
-    CostNoEvalAsInteger = NAsInteger * CoeffReadCostAsInteger
+    CoeffReadCost = evaluator<T>::CoeffReadCost,  // NOTE What if an evaluator evaluates itself into a temporary?
+                                                  //      Then CoeffReadCost will be small (e.g., 1) but we still have to evaluate, especially if n>1.
+                                                  //      This situation is already taken care of by the EvalBeforeNestingBit flag, which is turned ON
+                                                  //      for all evaluators creating a temporary. This flag is then propagated by the parent evaluators.
+                                                  //      Another solution could be to count the number of temps?
+    NAsInteger = n == Dynamic ? HugeCost : n,
+    CostEval   = (NAsInteger+1) * ScalarReadCost + CoeffReadCost,
+    CostNoEval = NAsInteger * CoeffReadCost
   };
 
   typedef typename conditional<
-      ( (int(traits<T>::Flags) & EvalBeforeNestingBit) ||
-        int(CostEvalAsInteger) < int(CostNoEvalAsInteger)
-      ),
-      PlainObject,
-      typename ref_selector<T>::type
+        ( (int(evaluator<T>::Flags) & EvalBeforeNestingBit) ||
+          (int(CostEval) < int(CostNoEval)) ),
+        PlainObject,
+        typename ref_selector<T>::type
   >::type type;
 };
 
 template<typename T>
-T* const_cast_ptr(const T* ptr)
+EIGEN_DEVICE_FUNC
+inline T* const_cast_ptr(const T* ptr)
 {
   return const_cast<T*>(ptr);
 }
@@ -364,30 +441,59 @@ struct dense_xpr_base<Derived, ArrayXpr>
   typedef ArrayBase<Derived> type;
 };
 
+template<typename Derived, typename XprKind = typename traits<Derived>::XprKind, typename StorageKind = typename traits<Derived>::StorageKind>
+struct generic_xpr_base;
+
+template<typename Derived, typename XprKind>
+struct generic_xpr_base<Derived, XprKind, Dense>
+{
+  typedef typename dense_xpr_base<Derived,XprKind>::type type;
+};
+
 /** \internal Helper base class to add a scalar multiple operator
   * overloads for complex types */
-template<typename Derived,typename Scalar,typename OtherScalar,
+template<typename Derived, typename Scalar, typename OtherScalar, typename BaseType,
          bool EnableIt = !is_same<Scalar,OtherScalar>::value >
-struct special_scalar_op_base : public DenseCoeffsBase<Derived>
+struct special_scalar_op_base : public BaseType
 {
   // dummy operator* so that the
   // "using special_scalar_op_base::operator*" compiles
-  void operator*() const;
+  struct dummy {};
+  void operator*(dummy) const;
+  void operator/(dummy) const;
 };
 
-template<typename Derived,typename Scalar,typename OtherScalar>
-struct special_scalar_op_base<Derived,Scalar,OtherScalar,true>  : public DenseCoeffsBase<Derived>
+template<typename Derived,typename Scalar,typename OtherScalar, typename BaseType>
+struct special_scalar_op_base<Derived,Scalar,OtherScalar,BaseType,true>  : public BaseType
 {
   const CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, Derived>
   operator*(const OtherScalar& scalar) const
   {
+#ifdef EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN
+    EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN
+#endif
     return CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, Derived>
       (*static_cast<const Derived*>(this), scalar_multiple2_op<Scalar,OtherScalar>(scalar));
   }
 
   inline friend const CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, Derived>
   operator*(const OtherScalar& scalar, const Derived& matrix)
-  { return static_cast<const special_scalar_op_base&>(matrix).operator*(scalar); }
+  {
+#ifdef EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN
+    EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN
+#endif
+    return static_cast<const special_scalar_op_base&>(matrix).operator*(scalar);
+  }
+  
+  const CwiseUnaryOp<scalar_quotient2_op<Scalar,OtherScalar>, Derived>
+  operator/(const OtherScalar& scalar) const
+  {
+#ifdef EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN
+    EIGEN_SPECIAL_SCALAR_MULTIPLE_PLUGIN
+#endif
+    return CwiseUnaryOp<scalar_quotient2_op<Scalar,OtherScalar>, Derived>
+      (*static_cast<const Derived*>(this), scalar_quotient2_op<Scalar,OtherScalar>(scalar));
+  }
 };
 
 template<typename XprType, typename CastType> struct cast_return_type
@@ -405,6 +511,68 @@ template <typename A> struct promote_storage_type<A,A>
 {
   typedef A ret;
 };
+template <typename A> struct promote_storage_type<A, const A>
+{
+  typedef A ret;
+};
+template <typename A> struct promote_storage_type<const A, A>
+{
+  typedef A ret;
+};
+
+/** \internal Specify the "storage kind" of applying a coefficient-wise
+  * binary operation between two expressions of kinds A and B respectively.
+  * The template parameter Functor allows specializing the resulting storage kind with respect to
+  * the functor.
+  * The default rules are as follows:
+  * \code
+  * A     op A      -> A
+  * A     op dense  -> dense
+  * dense op B      -> dense
+  * A     *  dense  -> A
+  * dense *  B      -> B
+  * \endcode
+  */
+template <typename A, typename B, typename Functor> struct cwise_promote_storage_type;
+
+template <typename A, typename Functor>                   struct cwise_promote_storage_type<A,A,Functor>                                      { typedef A     ret; };
+template <typename Functor>                               struct cwise_promote_storage_type<Dense,Dense,Functor>                              { typedef Dense ret; };
+template <typename ScalarA, typename ScalarB>             struct cwise_promote_storage_type<Dense,Dense,scalar_product_op<ScalarA,ScalarB> >  { typedef Dense ret; };
+template <typename A, typename Functor>                   struct cwise_promote_storage_type<A,Dense,Functor>                                  { typedef Dense ret; };
+template <typename B, typename Functor>                   struct cwise_promote_storage_type<Dense,B,Functor>                                  { typedef Dense ret; };
+template <typename A, typename ScalarA, typename ScalarB> struct cwise_promote_storage_type<A,Dense,scalar_product_op<ScalarA,ScalarB> >      { typedef A     ret; };
+template <typename B, typename ScalarA, typename ScalarB> struct cwise_promote_storage_type<Dense,B,scalar_product_op<ScalarA,ScalarB> >      { typedef B     ret; };
+
+/** \internal Specify the "storage kind" of multiplying an expression of kind A with kind B.
+  * The template parameter ProductTag allows specializing the resulting storage kind with respect to
+  * some compile-time properties of the product: GemmProduct, GemvProduct, OuterProduct, InnerProduct.
+  * The default rules are as follows:
+  * \code
+  *  K * K            -> K
+  *  dense * K        -> dense
+  *  K * dense        -> dense
+  *  diag * K         -> K
+  *  K * diag         -> K
+  *  Perm * K         -> K
+  *  K * Perm         -> K
+  * \endcode
+  */
+template <typename A, typename B, int ProductTag> struct product_promote_storage_type;
+
+template <typename A, int ProductTag> struct product_promote_storage_type<A,                  A,                  ProductTag> { typedef A     ret;};
+template <int ProductTag>             struct product_promote_storage_type<Dense,              Dense,              ProductTag> { typedef Dense ret;};
+template <typename A, int ProductTag> struct product_promote_storage_type<A,                  Dense,              ProductTag> { typedef Dense ret; };
+template <typename B, int ProductTag> struct product_promote_storage_type<Dense,              B,                  ProductTag> { typedef Dense ret; };
+
+template <typename A, int ProductTag> struct product_promote_storage_type<A,                  DiagonalShape,      ProductTag> { typedef A ret; };
+template <typename B, int ProductTag> struct product_promote_storage_type<DiagonalShape,      B,                  ProductTag> { typedef B ret; };
+template <int ProductTag>             struct product_promote_storage_type<Dense,              DiagonalShape,      ProductTag> { typedef Dense ret; };
+template <int ProductTag>             struct product_promote_storage_type<DiagonalShape,      Dense,              ProductTag> { typedef Dense ret; };
+
+template <typename A, int ProductTag> struct product_promote_storage_type<A,                  PermutationStorage, ProductTag> { typedef A ret; };
+template <typename B, int ProductTag> struct product_promote_storage_type<PermutationStorage, B,                  ProductTag> { typedef B ret; };
+template <int ProductTag>             struct product_promote_storage_type<Dense,              PermutationStorage, ProductTag> { typedef Dense ret; };
+template <int ProductTag>             struct product_promote_storage_type<PermutationStorage, Dense,              ProductTag> { typedef Dense ret; };
 
 /** \internal gives the plain matrix or array type to store a row/column/diagonal of a matrix type.
   * \param Scalar optional parameter allowing to pass a different scalar type than the one of the MatrixType.
@@ -462,8 +630,85 @@ struct is_lvalue
                  bool(traits<ExpressionType>::Flags & LvalueBit) };
 };
 
+template<typename T> struct is_diagonal
+{ enum { ret = false }; };
+
+template<typename T> struct is_diagonal<DiagonalBase<T> >
+{ enum { ret = true }; };
+
+template<typename T> struct is_diagonal<DiagonalWrapper<T> >
+{ enum { ret = true }; };
+
+template<typename T, int S> struct is_diagonal<DiagonalMatrix<T,S> >
+{ enum { ret = true }; };
+
+template<typename S1, typename S2> struct glue_shapes;
+template<> struct glue_shapes<DenseShape,TriangularShape> { typedef TriangularShape type;  };
+
+template<typename T1, typename T2>
+bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if<has_direct_access<T1>::ret&&has_direct_access<T2>::ret, T1>::type * = 0)
+{
+  return (mat1.data()==mat2.data()) && (mat1.innerStride()==mat2.innerStride()) && (mat1.outerStride()==mat2.outerStride());
+}
+
+template<typename T1, typename T2>
+bool is_same_dense(const T1 &, const T2 &, typename enable_if<!(has_direct_access<T1>::ret&&has_direct_access<T2>::ret), T1>::type * = 0)
+{
+  return false;
+}
+
+template<typename T, typename U> struct is_same_or_void { enum { value = is_same<T,U>::value }; };
+template<typename T> struct is_same_or_void<void,T>     { enum { value = 1 }; };
+template<typename T> struct is_same_or_void<T,void>     { enum { value = 1 }; };
+template<>           struct is_same_or_void<void,void>  { enum { value = 1 }; };
+
+#ifdef EIGEN_DEBUG_ASSIGN
+inline std::string demangle_traversal(int t) // debug helper: name of traversal constant t; inline so this header can be included in several translation units
+{
+  if(t==DefaultTraversal) return "DefaultTraversal";
+  if(t==LinearTraversal) return "LinearTraversal";
+  if(t==InnerVectorizedTraversal) return "InnerVectorizedTraversal";
+  if(t==LinearVectorizedTraversal) return "LinearVectorizedTraversal";
+  if(t==SliceVectorizedTraversal) return "SliceVectorizedTraversal";
+  return "?"; // t matched none of the traversal constants tested above
+}
+inline std::string demangle_unrolling(int t) // debug helper: name of unrolling constant t; inline so this header can be included in several translation units
+{
+  if(t==NoUnrolling) return "NoUnrolling";
+  if(t==InnerUnrolling) return "InnerUnrolling";
+  if(t==CompleteUnrolling) return "CompleteUnrolling";
+  return "?"; // t matched none of the unrolling constants tested above
+}
+inline std::string demangle_flags(int f) // debug helper: lists the flag bits set in f; inline so this header can be included in several translation units
+{
+  std::string res; // stays empty when none of the bits tested below is set
+  if(f&RowMajorBit)                 res += " | RowMajor";
+  if(f&PacketAccessBit)             res += " | Packet";
+  if(f&LinearAccessBit)             res += " | Linear";
+  if(f&LvalueBit)                   res += " | Lvalue";
+  if(f&DirectAccessBit)             res += " | Direct";
+  if(f&NestByRefBit)                res += " | NestByRef";
+  if(f&NoPreferredStorageOrderBit)  res += " | NoPreferredStorageOrderBit";
+  
+  return res;
+}
+#endif
+
 } // end namespace internal
 
+// we require Lhs and Rhs to have the same scalar type. Currently there is no example of a binary functor
+// that would take two operands of different types. If there were such an example, then this check should be
+// moved to the BinaryOp functors, on a per-case basis. This would however require a change in the BinaryOp functors, as
+// currently they take only one typename Scalar template parameter.
+// It is tempting to always allow mixing different types but remember that this is often impossible in the vectorized paths.
+// So allowing mixing different types gives very unexpected errors when enabling vectorization, when the user tries to
+// add together a float matrix and a double matrix.
+#define EIGEN_CHECK_BINARY_COMPATIBILIY(BINOP,LHS,RHS) \
+  EIGEN_STATIC_ASSERT((internal::functor_is_product_like<BINOP>::ret \
+                        ? int(internal::scalar_product_traits<LHS, RHS>::Defined) \
+                        : int(internal::is_same_or_void<LHS, RHS>::value)), \
+    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+    
 } // end namespace Eigen
 
 #endif // EIGEN_XPRHELPER_H
diff --git a/nuparu/include/Eigen/src/Eigen2Support/Block.h b/nuparu/include/Eigen/src/Eigen2Support/Block.h
deleted file mode 100644
index 604456f4..00000000
--- a/nuparu/include/Eigen/src/Eigen2Support/Block.h
+++ /dev/null
@@ -1,126 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_BLOCK2_H
-#define EIGEN_BLOCK2_H
-
-namespace Eigen { 
-
-/** \returns a dynamic-size expression of a corner of *this.
-  *
-  * \param type the type of corner. Can be \a Eigen::TopLeft, \a Eigen::TopRight,
-  * \a Eigen::BottomLeft, \a Eigen::BottomRight.
-  * \param cRows the number of rows in the corner
-  * \param cCols the number of columns in the corner
-  *
-  * Example: \include MatrixBase_corner_enum_int_int.cpp
-  * Output: \verbinclude MatrixBase_corner_enum_int_int.out
-  *
-  * \note Even though the returned expression has dynamic size, in the case
-  * when it is applied to a fixed-size matrix, it inherits a fixed maximal size,
-  * which means that evaluating it does not cause a dynamic memory allocation.
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-template<typename Derived>
-inline Block<Derived> DenseBase<Derived>
-  ::corner(CornerType type, Index cRows, Index cCols)
-{
-  switch(type)
-  {
-    default:
-      eigen_assert(false && "Bad corner type.");
-    case TopLeft:
-      return Block<Derived>(derived(), 0, 0, cRows, cCols);
-    case TopRight:
-      return Block<Derived>(derived(), 0, cols() - cCols, cRows, cCols);
-    case BottomLeft:
-      return Block<Derived>(derived(), rows() - cRows, 0, cRows, cCols);
-    case BottomRight:
-      return Block<Derived>(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
-  }
-}
-
-/** This is the const version of corner(CornerType, Index, Index).*/
-template<typename Derived>
-inline const Block<Derived>
-DenseBase<Derived>::corner(CornerType type, Index cRows, Index cCols) const
-{
-  switch(type)
-  {
-    default:
-      eigen_assert(false && "Bad corner type.");
-    case TopLeft:
-      return Block<Derived>(derived(), 0, 0, cRows, cCols);
-    case TopRight:
-      return Block<Derived>(derived(), 0, cols() - cCols, cRows, cCols);
-    case BottomLeft:
-      return Block<Derived>(derived(), rows() - cRows, 0, cRows, cCols);
-    case BottomRight:
-      return Block<Derived>(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
-  }
-}
-
-/** \returns a fixed-size expression of a corner of *this.
-  *
-  * \param type the type of corner. Can be \a Eigen::TopLeft, \a Eigen::TopRight,
-  * \a Eigen::BottomLeft, \a Eigen::BottomRight.
-  *
-  * The template parameters CRows and CCols arethe number of rows and columns in the corner.
-  *
-  * Example: \include MatrixBase_template_int_int_corner_enum.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_corner_enum.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-template<typename Derived>
-template<int CRows, int CCols>
-inline Block<Derived, CRows, CCols>
-DenseBase<Derived>::corner(CornerType type)
-{
-  switch(type)
-  {
-    default:
-      eigen_assert(false && "Bad corner type.");
-    case TopLeft:
-      return Block<Derived, CRows, CCols>(derived(), 0, 0);
-    case TopRight:
-      return Block<Derived, CRows, CCols>(derived(), 0, cols() - CCols);
-    case BottomLeft:
-      return Block<Derived, CRows, CCols>(derived(), rows() - CRows, 0);
-    case BottomRight:
-      return Block<Derived, CRows, CCols>(derived(), rows() - CRows, cols() - CCols);
-  }
-}
-
-/** This is the const version of corner<int, int>(CornerType).*/
-template<typename Derived>
-template<int CRows, int CCols>
-inline const Block<Derived, CRows, CCols>
-DenseBase<Derived>::corner(CornerType type) const
-{
-  switch(type)
-  {
-    default:
-      eigen_assert(false && "Bad corner type.");
-    case TopLeft:
-      return Block<Derived, CRows, CCols>(derived(), 0, 0);
-    case TopRight:
-      return Block<Derived, CRows, CCols>(derived(), 0, cols() - CCols);
-    case BottomLeft:
-      return Block<Derived, CRows, CCols>(derived(), rows() - CRows, 0);
-    case BottomRight:
-      return Block<Derived, CRows, CCols>(derived(), rows() - CRows, cols() - CCols);
-  }
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_BLOCK2_H
diff --git a/nuparu/include/Eigen/src/Eigen2Support/CMakeLists.txt b/nuparu/include/Eigen/src/Eigen2Support/CMakeLists.txt
deleted file mode 100644
index 7ae41b3c..00000000
--- a/nuparu/include/Eigen/src/Eigen2Support/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-FILE(GLOB Eigen_Eigen2Support_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Eigen2Support_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Eigen2Support COMPONENT Devel
-  )
-
-ADD_SUBDIRECTORY(Geometry)
\ No newline at end of file
diff --git a/nuparu/include/Eigen/src/Eigen2Support/Cwise.h b/nuparu/include/Eigen/src/Eigen2Support/Cwise.h
deleted file mode 100644
index d95009b6..00000000
--- a/nuparu/include/Eigen/src/Eigen2Support/Cwise.h
+++ /dev/null
@@ -1,192 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CWISE_H
-#define EIGEN_CWISE_H
-
-namespace Eigen { 
-
-/** \internal
-  * convenient macro to defined the return type of a cwise binary operation */
-#define EIGEN_CWISE_BINOP_RETURN_TYPE(OP) \
-    CwiseBinaryOp<OP<typename internal::traits<ExpressionType>::Scalar>, ExpressionType, OtherDerived>
-
-/** \internal
-  * convenient macro to defined the return type of a cwise unary operation */
-#define EIGEN_CWISE_UNOP_RETURN_TYPE(OP) \
-    CwiseUnaryOp<OP<typename internal::traits<ExpressionType>::Scalar>, ExpressionType>
-
-/** \internal
-  * convenient macro to defined the return type of a cwise comparison to a scalar */
-#define EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(OP) \
-    CwiseBinaryOp<OP<typename internal::traits<ExpressionType>::Scalar>, ExpressionType, \
-        typename ExpressionType::ConstantReturnType >
-
-/** \class Cwise
-  *
-  * \brief Pseudo expression providing additional coefficient-wise operations
-  *
-  * \param ExpressionType the type of the object on which to do coefficient-wise operations
-  *
-  * This class represents an expression with additional coefficient-wise features.
-  * It is the return type of MatrixBase::cwise()
-  * and most of the time this is the only way it is used.
-  *
-  * Example: \include MatrixBase_cwise_const.cpp
-  * Output: \verbinclude MatrixBase_cwise_const.out
-  *
-  * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_CWISE_PLUGIN.
-  *
-  * \sa MatrixBase::cwise() const, MatrixBase::cwise()
-  */
-template<typename ExpressionType> class Cwise
-{
-  public:
-
-    typedef typename internal::traits<ExpressionType>::Scalar Scalar;
-    typedef typename internal::conditional<internal::must_nest_by_value<ExpressionType>::ret,
-        ExpressionType, const ExpressionType&>::type ExpressionTypeNested;
-    typedef CwiseUnaryOp<internal::scalar_add_op<Scalar>, ExpressionType> ScalarAddReturnType;
-
-    inline Cwise(const ExpressionType& matrix) : m_matrix(matrix) {}
-
-    /** \internal */
-    inline const ExpressionType& _expression() const { return m_matrix; }
-
-    template<typename OtherDerived>
-    const EIGEN_CWISE_PRODUCT_RETURN_TYPE(ExpressionType,OtherDerived)
-    operator*(const MatrixBase<OtherDerived> &other) const;
-
-    template<typename OtherDerived>
-    const EIGEN_CWISE_BINOP_RETURN_TYPE(internal::scalar_quotient_op)
-    operator/(const MatrixBase<OtherDerived> &other) const;
-
-    /** \deprecated ArrayBase::min() */
-    template<typename OtherDerived>
-    const EIGEN_CWISE_BINOP_RETURN_TYPE(internal::scalar_min_op)
-    (min)(const MatrixBase<OtherDerived> &other) const
-    { return EIGEN_CWISE_BINOP_RETURN_TYPE(internal::scalar_min_op)(_expression(), other.derived()); }
-
-    /** \deprecated ArrayBase::max() */
-    template<typename OtherDerived>
-    const EIGEN_CWISE_BINOP_RETURN_TYPE(internal::scalar_max_op)
-    (max)(const MatrixBase<OtherDerived> &other) const
-    { return EIGEN_CWISE_BINOP_RETURN_TYPE(internal::scalar_max_op)(_expression(), other.derived()); }
-
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_abs_op)      abs() const;
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_abs2_op)     abs2() const;
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_square_op)   square() const;
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_cube_op)     cube() const;
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_inverse_op)  inverse() const;
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_sqrt_op)     sqrt() const;
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_exp_op)      exp() const;
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_log_op)      log() const;
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_cos_op)      cos() const;
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_sin_op)      sin() const;
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_pow_op)      pow(const Scalar& exponent) const;
-
-    const ScalarAddReturnType
-    operator+(const Scalar& scalar) const;
-
-    /** \relates Cwise */
-    friend const ScalarAddReturnType
-    operator+(const Scalar& scalar, const Cwise& mat)
-    { return mat + scalar; }
-
-    ExpressionType& operator+=(const Scalar& scalar);
-
-    const ScalarAddReturnType
-    operator-(const Scalar& scalar) const;
-
-    ExpressionType& operator-=(const Scalar& scalar);
-
-    template<typename OtherDerived>
-    inline ExpressionType& operator*=(const MatrixBase<OtherDerived> &other);
-
-    template<typename OtherDerived>
-    inline ExpressionType& operator/=(const MatrixBase<OtherDerived> &other);
-
-    template<typename OtherDerived> const EIGEN_CWISE_BINOP_RETURN_TYPE(std::less)
-    operator<(const MatrixBase<OtherDerived>& other) const;
-
-    template<typename OtherDerived> const EIGEN_CWISE_BINOP_RETURN_TYPE(std::less_equal)
-    operator<=(const MatrixBase<OtherDerived>& other) const;
-
-    template<typename OtherDerived> const EIGEN_CWISE_BINOP_RETURN_TYPE(std::greater)
-    operator>(const MatrixBase<OtherDerived>& other) const;
-
-    template<typename OtherDerived> const EIGEN_CWISE_BINOP_RETURN_TYPE(std::greater_equal)
-    operator>=(const MatrixBase<OtherDerived>& other) const;
-
-    template<typename OtherDerived> const EIGEN_CWISE_BINOP_RETURN_TYPE(std::equal_to)
-    operator==(const MatrixBase<OtherDerived>& other) const;
-
-    template<typename OtherDerived> const EIGEN_CWISE_BINOP_RETURN_TYPE(std::not_equal_to)
-    operator!=(const MatrixBase<OtherDerived>& other) const;
-
-    // comparisons to a scalar value
-    const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::less)
-    operator<(Scalar s) const;
-
-    const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::less_equal)
-    operator<=(Scalar s) const;
-
-    const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::greater)
-    operator>(Scalar s) const;
-
-    const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::greater_equal)
-    operator>=(Scalar s) const;
-
-    const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::equal_to)
-    operator==(Scalar s) const;
-
-    const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::not_equal_to)
-    operator!=(Scalar s) const;
-
-    // allow to extend Cwise outside Eigen
-    #ifdef EIGEN_CWISE_PLUGIN
-    #include EIGEN_CWISE_PLUGIN
-    #endif
-
-  protected:
-    ExpressionTypeNested m_matrix;
-};
-
-
-/** \returns a Cwise wrapper of *this providing additional coefficient-wise operations
-  *
-  * Example: \include MatrixBase_cwise_const.cpp
-  * Output: \verbinclude MatrixBase_cwise_const.out
-  *
-  * \sa class Cwise, cwise()
-  */
-template<typename Derived>
-inline const Cwise<Derived> MatrixBase<Derived>::cwise() const
-{
-  return derived();
-}
-
-/** \returns a Cwise wrapper of *this providing additional coefficient-wise operations
-  *
-  * Example: \include MatrixBase_cwise.cpp
-  * Output: \verbinclude MatrixBase_cwise.out
-  *
-  * \sa class Cwise, cwise() const
-  */
-template<typename Derived>
-inline Cwise<Derived> MatrixBase<Derived>::cwise()
-{
-  return derived();
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_CWISE_H
diff --git a/nuparu/include/Eigen/src/Eigen2Support/CwiseOperators.h b/nuparu/include/Eigen/src/Eigen2Support/CwiseOperators.h
deleted file mode 100644
index 482f3064..00000000
--- a/nuparu/include/Eigen/src/Eigen2Support/CwiseOperators.h
+++ /dev/null
@@ -1,298 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_ARRAY_CWISE_OPERATORS_H
-#define EIGEN_ARRAY_CWISE_OPERATORS_H
-
-namespace Eigen { 
-
-/***************************************************************************
-* The following functions were defined in Core
-***************************************************************************/
-
-
-/** \deprecated ArrayBase::abs() */
-template<typename ExpressionType>
-EIGEN_STRONG_INLINE const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_abs_op)
-Cwise<ExpressionType>::abs() const
-{
-  return _expression();
-}
-
-/** \deprecated ArrayBase::abs2() */
-template<typename ExpressionType>
-EIGEN_STRONG_INLINE const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_abs2_op)
-Cwise<ExpressionType>::abs2() const
-{
-  return _expression();
-}
-
-/** \deprecated ArrayBase::exp() */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_exp_op)
-Cwise<ExpressionType>::exp() const
-{
-  return _expression();
-}
-
-/** \deprecated ArrayBase::log() */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_log_op)
-Cwise<ExpressionType>::log() const
-{
-  return _expression();
-}
-
-/** \deprecated ArrayBase::operator*() */
-template<typename ExpressionType>
-template<typename OtherDerived>
-EIGEN_STRONG_INLINE const EIGEN_CWISE_PRODUCT_RETURN_TYPE(ExpressionType,OtherDerived)
-Cwise<ExpressionType>::operator*(const MatrixBase<OtherDerived> &other) const
-{
-  return EIGEN_CWISE_PRODUCT_RETURN_TYPE(ExpressionType,OtherDerived)(_expression(), other.derived());
-}
-
-/** \deprecated ArrayBase::operator/() */
-template<typename ExpressionType>
-template<typename OtherDerived>
-EIGEN_STRONG_INLINE const EIGEN_CWISE_BINOP_RETURN_TYPE(internal::scalar_quotient_op)
-Cwise<ExpressionType>::operator/(const MatrixBase<OtherDerived> &other) const
-{
-  return EIGEN_CWISE_BINOP_RETURN_TYPE(internal::scalar_quotient_op)(_expression(), other.derived());
-}
-
-/** \deprecated ArrayBase::operator*=() */
-template<typename ExpressionType>
-template<typename OtherDerived>
-inline ExpressionType& Cwise<ExpressionType>::operator*=(const MatrixBase<OtherDerived> &other)
-{
-  return m_matrix.const_cast_derived() = *this * other;
-}
-
-/** \deprecated ArrayBase::operator/=() */
-template<typename ExpressionType>
-template<typename OtherDerived>
-inline ExpressionType& Cwise<ExpressionType>::operator/=(const MatrixBase<OtherDerived> &other)
-{
-  return m_matrix.const_cast_derived() = *this / other;
-}
-
-/***************************************************************************
-* The following functions were defined in Array
-***************************************************************************/
-
-// -- unary operators --
-
-/** \deprecated ArrayBase::sqrt() */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_sqrt_op)
-Cwise<ExpressionType>::sqrt() const
-{
-  return _expression();
-}
-
-/** \deprecated ArrayBase::cos() */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_cos_op)
-Cwise<ExpressionType>::cos() const
-{
-  return _expression();
-}
-
-
-/** \deprecated ArrayBase::sin() */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_sin_op)
-Cwise<ExpressionType>::sin() const
-{
-  return _expression();
-}
-
-
-/** \deprecated ArrayBase::log() */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_pow_op)
-Cwise<ExpressionType>::pow(const Scalar& exponent) const
-{
-  return EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_pow_op)(_expression(), internal::scalar_pow_op<Scalar>(exponent));
-}
-
-
-/** \deprecated ArrayBase::inverse() */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_inverse_op)
-Cwise<ExpressionType>::inverse() const
-{
-  return _expression();
-}
-
-/** \deprecated ArrayBase::square() */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_square_op)
-Cwise<ExpressionType>::square() const
-{
-  return _expression();
-}
-
-/** \deprecated ArrayBase::cube() */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_cube_op)
-Cwise<ExpressionType>::cube() const
-{
-  return _expression();
-}
-
-
-// -- binary operators --
-
-/** \deprecated ArrayBase::operator<() */
-template<typename ExpressionType>
-template<typename OtherDerived>
-inline const EIGEN_CWISE_BINOP_RETURN_TYPE(std::less)
-Cwise<ExpressionType>::operator<(const MatrixBase<OtherDerived> &other) const
-{
-  return EIGEN_CWISE_BINOP_RETURN_TYPE(std::less)(_expression(), other.derived());
-}
-
-/** \deprecated ArrayBase::<=() */
-template<typename ExpressionType>
-template<typename OtherDerived>
-inline const EIGEN_CWISE_BINOP_RETURN_TYPE(std::less_equal)
-Cwise<ExpressionType>::operator<=(const MatrixBase<OtherDerived> &other) const
-{
-  return EIGEN_CWISE_BINOP_RETURN_TYPE(std::less_equal)(_expression(), other.derived());
-}
-
-/** \deprecated ArrayBase::operator>() */
-template<typename ExpressionType>
-template<typename OtherDerived>
-inline const EIGEN_CWISE_BINOP_RETURN_TYPE(std::greater)
-Cwise<ExpressionType>::operator>(const MatrixBase<OtherDerived> &other) const
-{
-  return EIGEN_CWISE_BINOP_RETURN_TYPE(std::greater)(_expression(), other.derived());
-}
-
-/** \deprecated ArrayBase::operator>=() */
-template<typename ExpressionType>
-template<typename OtherDerived>
-inline const EIGEN_CWISE_BINOP_RETURN_TYPE(std::greater_equal)
-Cwise<ExpressionType>::operator>=(const MatrixBase<OtherDerived> &other) const
-{
-  return EIGEN_CWISE_BINOP_RETURN_TYPE(std::greater_equal)(_expression(), other.derived());
-}
-
-/** \deprecated ArrayBase::operator==() */
-template<typename ExpressionType>
-template<typename OtherDerived>
-inline const EIGEN_CWISE_BINOP_RETURN_TYPE(std::equal_to)
-Cwise<ExpressionType>::operator==(const MatrixBase<OtherDerived> &other) const
-{
-  return EIGEN_CWISE_BINOP_RETURN_TYPE(std::equal_to)(_expression(), other.derived());
-}
-
-/** \deprecated ArrayBase::operator!=() */
-template<typename ExpressionType>
-template<typename OtherDerived>
-inline const EIGEN_CWISE_BINOP_RETURN_TYPE(std::not_equal_to)
-Cwise<ExpressionType>::operator!=(const MatrixBase<OtherDerived> &other) const
-{
-  return EIGEN_CWISE_BINOP_RETURN_TYPE(std::not_equal_to)(_expression(), other.derived());
-}
-
-// comparisons to scalar value
-
-/** \deprecated ArrayBase::operator<(Scalar) */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::less)
-Cwise<ExpressionType>::operator<(Scalar s) const
-{
-  return EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::less)(_expression(),
-            typename ExpressionType::ConstantReturnType(_expression().rows(), _expression().cols(), s));
-}
-
-/** \deprecated ArrayBase::operator<=(Scalar) */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::less_equal)
-Cwise<ExpressionType>::operator<=(Scalar s) const
-{
-  return EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::less_equal)(_expression(),
-            typename ExpressionType::ConstantReturnType(_expression().rows(), _expression().cols(), s));
-}
-
-/** \deprecated ArrayBase::operator>(Scalar) */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::greater)
-Cwise<ExpressionType>::operator>(Scalar s) const
-{
-  return EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::greater)(_expression(),
-            typename ExpressionType::ConstantReturnType(_expression().rows(), _expression().cols(), s));
-}
-
-/** \deprecated ArrayBase::operator>=(Scalar) */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::greater_equal)
-Cwise<ExpressionType>::operator>=(Scalar s) const
-{
-  return EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::greater_equal)(_expression(),
-            typename ExpressionType::ConstantReturnType(_expression().rows(), _expression().cols(), s));
-}
-
-/** \deprecated ArrayBase::operator==(Scalar) */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::equal_to)
-Cwise<ExpressionType>::operator==(Scalar s) const
-{
-  return EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::equal_to)(_expression(),
-            typename ExpressionType::ConstantReturnType(_expression().rows(), _expression().cols(), s));
-}
-
-/** \deprecated ArrayBase::operator!=(Scalar) */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::not_equal_to)
-Cwise<ExpressionType>::operator!=(Scalar s) const
-{
-  return EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::not_equal_to)(_expression(),
-            typename ExpressionType::ConstantReturnType(_expression().rows(), _expression().cols(), s));
-}
-
-// scalar addition
-
-/** \deprecated ArrayBase::operator+(Scalar) */
-template<typename ExpressionType>
-inline const typename Cwise<ExpressionType>::ScalarAddReturnType
-Cwise<ExpressionType>::operator+(const Scalar& scalar) const
-{
-  return typename Cwise<ExpressionType>::ScalarAddReturnType(m_matrix, internal::scalar_add_op<Scalar>(scalar));
-}
-
-/** \deprecated ArrayBase::operator+=(Scalar) */
-template<typename ExpressionType>
-inline ExpressionType& Cwise<ExpressionType>::operator+=(const Scalar& scalar)
-{
-  return m_matrix.const_cast_derived() = *this + scalar;
-}
-
-/** \deprecated ArrayBase::operator-(Scalar) */
-template<typename ExpressionType>
-inline const typename Cwise<ExpressionType>::ScalarAddReturnType
-Cwise<ExpressionType>::operator-(const Scalar& scalar) const
-{
-  return *this + (-scalar);
-}
-
-/** \deprecated ArrayBase::operator-=(Scalar) */
-template<typename ExpressionType>
-inline ExpressionType& Cwise<ExpressionType>::operator-=(const Scalar& scalar)
-{
-  return m_matrix.const_cast_derived() = *this - scalar;
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_ARRAY_CWISE_OPERATORS_H
diff --git a/nuparu/include/Eigen/src/Eigen2Support/Geometry/AlignedBox.h b/nuparu/include/Eigen/src/Eigen2Support/Geometry/AlignedBox.h
deleted file mode 100644
index 2e4309dd..00000000
--- a/nuparu/include/Eigen/src/Eigen2Support/Geometry/AlignedBox.h
+++ /dev/null
@@ -1,159 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
-
-namespace Eigen { 
-
-/** \geometry_module \ingroup Geometry_Module
-  * \nonstableyet
-  *
-  * \class AlignedBox
-  *
-  * \brief An axis aligned box
-  *
-  * \param _Scalar the type of the scalar coefficients
-  * \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
-  *
-  * This class represents an axis aligned box as a pair of the minimal and maximal corners.
-  */
-template <typename _Scalar, int _AmbientDim>
-class AlignedBox
-{
-public:
-EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim==Dynamic ? Dynamic : _AmbientDim+1)
-  enum { AmbientDimAtCompileTime = _AmbientDim };
-  typedef _Scalar Scalar;
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef Matrix<Scalar,AmbientDimAtCompileTime,1> VectorType;
-
-  /** Default constructor initializing a null box. */
-  inline AlignedBox()
-  { if (AmbientDimAtCompileTime!=Dynamic) setNull(); }
-
-  /** Constructs a null box with \a _dim the dimension of the ambient space. */
-  inline explicit AlignedBox(int _dim) : m_min(_dim), m_max(_dim)
-  { setNull(); }
-
-  /** Constructs a box with extremities \a _min and \a _max. */
-  inline AlignedBox(const VectorType& _min, const VectorType& _max) : m_min(_min), m_max(_max) {}
-
-  /** Constructs a box containing a single point \a p. */
-  inline explicit AlignedBox(const VectorType& p) : m_min(p), m_max(p) {}
-
-  ~AlignedBox() {}
-
-  /** \returns the dimension in which the box holds */
-  inline int dim() const { return AmbientDimAtCompileTime==Dynamic ? m_min.size()-1 : AmbientDimAtCompileTime; }
-
-  /** \returns true if the box is null, i.e, empty. */
-  inline bool isNull() const { return (m_min.cwise() > m_max).any(); }
-
-  /** Makes \c *this a null/empty box. */
-  inline void setNull()
-  {
-    m_min.setConstant( (std::numeric_limits<Scalar>::max)());
-    m_max.setConstant(-(std::numeric_limits<Scalar>::max)());
-  }
-
-  /** \returns the minimal corner */
-  inline const VectorType& (min)() const { return m_min; }
-  /** \returns a non const reference to the minimal corner */
-  inline VectorType& (min)() { return m_min; }
-  /** \returns the maximal corner */
-  inline const VectorType& (max)() const { return m_max; }
-  /** \returns a non const reference to the maximal corner */
-  inline VectorType& (max)() { return m_max; }
-
-  /** \returns true if the point \a p is inside the box \c *this. */
-  inline bool contains(const VectorType& p) const
-  { return (m_min.cwise()<=p).all() && (p.cwise()<=m_max).all(); }
-
-  /** \returns true if the box \a b is entirely inside the box \c *this. */
-  inline bool contains(const AlignedBox& b) const
-  { return (m_min.cwise()<=(b.min)()).all() && ((b.max)().cwise()<=m_max).all(); }
-
-  /** Extends \c *this such that it contains the point \a p and returns a reference to \c *this. */
-  inline AlignedBox& extend(const VectorType& p)
-  { m_min = (m_min.cwise().min)(p); m_max = (m_max.cwise().max)(p); return *this; }
-
-  /** Extends \c *this such that it contains the box \a b and returns a reference to \c *this. */
-  inline AlignedBox& extend(const AlignedBox& b)
-  { m_min = (m_min.cwise().min)(b.m_min); m_max = (m_max.cwise().max)(b.m_max); return *this; }
-
-  /** Clamps \c *this by the box \a b and returns a reference to \c *this. */
-  inline AlignedBox& clamp(const AlignedBox& b)
-  { m_min = (m_min.cwise().max)(b.m_min); m_max = (m_max.cwise().min)(b.m_max); return *this; }
-
-  /** Translate \c *this by the vector \a t and returns a reference to \c *this. */
-  inline AlignedBox& translate(const VectorType& t)
-  { m_min += t; m_max += t; return *this; }
-
-  /** \returns the squared distance between the point \a p and the box \c *this,
-    * and zero if \a p is inside the box.
-    * \sa exteriorDistance()
-    */
-  inline Scalar squaredExteriorDistance(const VectorType& p) const;
-
-  /** \returns the distance between the point \a p and the box \c *this,
-    * and zero if \a p is inside the box.
-    * \sa squaredExteriorDistance()
-    */
-  inline Scalar exteriorDistance(const VectorType& p) const
-  { return ei_sqrt(squaredExteriorDistance(p)); }
-
-  /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<AlignedBox,
-           AlignedBox<NewScalarType,AmbientDimAtCompileTime> >::type cast() const
-  {
-    return typename internal::cast_return_type<AlignedBox,
-                    AlignedBox<NewScalarType,AmbientDimAtCompileTime> >::type(*this);
-  }
-
-  /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit AlignedBox(const AlignedBox<OtherScalarType,AmbientDimAtCompileTime>& other)
-  {
-    m_min = (other.min)().template cast<Scalar>();
-    m_max = (other.max)().template cast<Scalar>();
-  }
-
-  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const AlignedBox& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
-  { return m_min.isApprox(other.m_min, prec) && m_max.isApprox(other.m_max, prec); }
-
-protected:
-
-  VectorType m_min, m_max;
-};
-
-template<typename Scalar,int AmbiantDim>
-inline Scalar AlignedBox<Scalar,AmbiantDim>::squaredExteriorDistance(const VectorType& p) const
-{
-  Scalar dist2(0);
-  Scalar aux;
-  for (int k=0; k<dim(); ++k)
-  {
-    if ((aux = (p[k]-m_min[k]))<Scalar(0))
-      dist2 += aux*aux;
-    else if ( (aux = (m_max[k]-p[k]))<Scalar(0))
-      dist2 += aux*aux;
-  }
-  return dist2;
-}
-
-} // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Eigen2Support/Geometry/All.h b/nuparu/include/Eigen/src/Eigen2Support/Geometry/All.h
deleted file mode 100644
index e0b00fcc..00000000
--- a/nuparu/include/Eigen/src/Eigen2Support/Geometry/All.h
+++ /dev/null
@@ -1,115 +0,0 @@
-#ifndef EIGEN2_GEOMETRY_MODULE_H
-#define EIGEN2_GEOMETRY_MODULE_H
-
-#include <limits>
-
-#ifndef M_PI
-#define M_PI 3.14159265358979323846
-#endif
-
-#if EIGEN2_SUPPORT_STAGE < STAGE20_RESOLVE_API_CONFLICTS
-#include "RotationBase.h"
-#include "Rotation2D.h"
-#include "Quaternion.h"
-#include "AngleAxis.h"
-#include "Transform.h"
-#include "Translation.h"
-#include "Scaling.h"
-#include "AlignedBox.h"
-#include "Hyperplane.h"
-#include "ParametrizedLine.h"
-#endif
-
-
-#define RotationBase eigen2_RotationBase
-#define Rotation2D eigen2_Rotation2D
-#define Rotation2Df eigen2_Rotation2Df
-#define Rotation2Dd eigen2_Rotation2Dd
-
-#define Quaternion  eigen2_Quaternion
-#define Quaternionf eigen2_Quaternionf
-#define Quaterniond eigen2_Quaterniond
-
-#define AngleAxis eigen2_AngleAxis
-#define AngleAxisf eigen2_AngleAxisf
-#define AngleAxisd eigen2_AngleAxisd
-
-#define Transform   eigen2_Transform
-#define Transform2f eigen2_Transform2f
-#define Transform2d eigen2_Transform2d
-#define Transform3f eigen2_Transform3f
-#define Transform3d eigen2_Transform3d
-
-#define Translation eigen2_Translation
-#define Translation2f eigen2_Translation2f
-#define Translation2d eigen2_Translation2d
-#define Translation3f eigen2_Translation3f
-#define Translation3d eigen2_Translation3d
-
-#define Scaling eigen2_Scaling
-#define Scaling2f eigen2_Scaling2f
-#define Scaling2d eigen2_Scaling2d
-#define Scaling3f eigen2_Scaling3f
-#define Scaling3d eigen2_Scaling3d
-
-#define AlignedBox eigen2_AlignedBox
-
-#define Hyperplane eigen2_Hyperplane
-#define ParametrizedLine eigen2_ParametrizedLine
-
-#define ei_toRotationMatrix eigen2_ei_toRotationMatrix
-#define ei_quaternion_assign_impl eigen2_ei_quaternion_assign_impl
-#define ei_transform_product_impl eigen2_ei_transform_product_impl
-
-#include "RotationBase.h"
-#include "Rotation2D.h"
-#include "Quaternion.h"
-#include "AngleAxis.h"
-#include "Transform.h"
-#include "Translation.h"
-#include "Scaling.h"
-#include "AlignedBox.h"
-#include "Hyperplane.h"
-#include "ParametrizedLine.h"
-
-#undef ei_toRotationMatrix
-#undef ei_quaternion_assign_impl
-#undef ei_transform_product_impl
-
-#undef RotationBase
-#undef Rotation2D
-#undef Rotation2Df
-#undef Rotation2Dd
-
-#undef Quaternion
-#undef Quaternionf
-#undef Quaterniond
-
-#undef AngleAxis
-#undef AngleAxisf
-#undef AngleAxisd
-
-#undef Transform
-#undef Transform2f
-#undef Transform2d
-#undef Transform3f
-#undef Transform3d
-
-#undef Translation
-#undef Translation2f
-#undef Translation2d
-#undef Translation3f
-#undef Translation3d
-
-#undef Scaling
-#undef Scaling2f
-#undef Scaling2d
-#undef Scaling3f
-#undef Scaling3d
-
-#undef AlignedBox
-
-#undef Hyperplane
-#undef ParametrizedLine
-
-#endif // EIGEN2_GEOMETRY_MODULE_H
diff --git a/nuparu/include/Eigen/src/Eigen2Support/Geometry/AngleAxis.h b/nuparu/include/Eigen/src/Eigen2Support/Geometry/AngleAxis.h
deleted file mode 100644
index af598a40..00000000
--- a/nuparu/include/Eigen/src/Eigen2Support/Geometry/AngleAxis.h
+++ /dev/null
@@ -1,214 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
-
-namespace Eigen { 
-
-/** \geometry_module \ingroup Geometry_Module
-  *
-  * \class AngleAxis
-  *
-  * \brief Represents a 3D rotation as a rotation angle around an arbitrary 3D axis
-  *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients.
-  *
-  * The following two typedefs are provided for convenience:
-  * \li \c AngleAxisf for \c float
-  * \li \c AngleAxisd for \c double
-  *
-  * \addexample AngleAxisForEuler \label How to define a rotation from Euler-angles
-  *
-  * Combined with MatrixBase::Unit{X,Y,Z}, AngleAxis can be used to easily
-  * mimic Euler-angles. Here is an example:
-  * \include AngleAxis_mimic_euler.cpp
-  * Output: \verbinclude AngleAxis_mimic_euler.out
-  *
-  * \note This class is not aimed to be used to store a rotation transformation,
-  * but rather to make easier the creation of other rotation (Quaternion, rotation Matrix)
-  * and transformation objects.
-  *
-  * \sa class Quaternion, class Transform, MatrixBase::UnitX()
-  */
-
-template<typename _Scalar> struct ei_traits<AngleAxis<_Scalar> >
-{
-  typedef _Scalar Scalar;
-};
-
-template<typename _Scalar>
-class AngleAxis : public RotationBase<AngleAxis<_Scalar>,3>
-{
-  typedef RotationBase<AngleAxis<_Scalar>,3> Base;
-
-public:
-
-  using Base::operator*;
-
-  enum { Dim = 3 };
-  /** the scalar type of the coefficients */
-  typedef _Scalar Scalar;
-  typedef Matrix<Scalar,3,3> Matrix3;
-  typedef Matrix<Scalar,3,1> Vector3;
-  typedef Quaternion<Scalar> QuaternionType;
-
-protected:
-
-  Vector3 m_axis;
-  Scalar m_angle;
-
-public:
-
-  /** Default constructor without initialization. */
-  AngleAxis() {}
-  /** Constructs and initialize the angle-axis rotation from an \a angle in radian
-    * and an \a axis which must be normalized. */
-  template<typename Derived>
-  inline AngleAxis(Scalar angle, const MatrixBase<Derived>& axis) : m_axis(axis), m_angle(angle) {}
-  /** Constructs and initialize the angle-axis rotation from a quaternion \a q. */
-  inline AngleAxis(const QuaternionType& q) { *this = q; }
-  /** Constructs and initialize the angle-axis rotation from a 3x3 rotation matrix. */
-  template<typename Derived>
-  inline explicit AngleAxis(const MatrixBase<Derived>& m) { *this = m; }
-
-  Scalar angle() const { return m_angle; }
-  Scalar& angle() { return m_angle; }
-
-  const Vector3& axis() const { return m_axis; }
-  Vector3& axis() { return m_axis; }
-
-  /** Concatenates two rotations */
-  inline QuaternionType operator* (const AngleAxis& other) const
-  { return QuaternionType(*this) * QuaternionType(other); }
-
-  /** Concatenates two rotations */
-  inline QuaternionType operator* (const QuaternionType& other) const
-  { return QuaternionType(*this) * other; }
-
-  /** Concatenates two rotations */
-  friend inline QuaternionType operator* (const QuaternionType& a, const AngleAxis& b)
-  { return a * QuaternionType(b); }
-
-  /** Concatenates two rotations */
-  inline Matrix3 operator* (const Matrix3& other) const
-  { return toRotationMatrix() * other; }
-
-  /** Concatenates two rotations */
-  inline friend Matrix3 operator* (const Matrix3& a, const AngleAxis& b)
-  { return a * b.toRotationMatrix(); }
-
-  /** Applies rotation to vector */
-  inline Vector3 operator* (const Vector3& other) const
-  { return toRotationMatrix() * other; }
-
-  /** \returns the inverse rotation, i.e., an angle-axis with opposite rotation angle */
-  AngleAxis inverse() const
-  { return AngleAxis(-m_angle, m_axis); }
-
-  AngleAxis& operator=(const QuaternionType& q);
-  template<typename Derived>
-  AngleAxis& operator=(const MatrixBase<Derived>& m);
-
-  template<typename Derived>
-  AngleAxis& fromRotationMatrix(const MatrixBase<Derived>& m);
-  Matrix3 toRotationMatrix(void) const;
-
-  /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<AngleAxis,AngleAxis<NewScalarType> >::type cast() const
-  { return typename internal::cast_return_type<AngleAxis,AngleAxis<NewScalarType> >::type(*this); }
-
-  /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit AngleAxis(const AngleAxis<OtherScalarType>& other)
-  {
-    m_axis = other.axis().template cast<Scalar>();
-    m_angle = Scalar(other.angle());
-  }
-
-  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const AngleAxis& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
-  { return m_axis.isApprox(other.m_axis, prec) && ei_isApprox(m_angle,other.m_angle, prec); }
-};
-
-/** \ingroup Geometry_Module
-  * single precision angle-axis type */
-typedef AngleAxis<float> AngleAxisf;
-/** \ingroup Geometry_Module
-  * double precision angle-axis type */
-typedef AngleAxis<double> AngleAxisd;
-
-/** Set \c *this from a quaternion.
-  * The axis is normalized.
-  */
-template<typename Scalar>
-AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const QuaternionType& q)
-{
-  Scalar n2 = q.vec().squaredNorm();
-  if (n2 < precision<Scalar>()*precision<Scalar>())
-  {
-    m_angle = 0;
-    m_axis << 1, 0, 0;
-  }
-  else
-  {
-    m_angle = 2*std::acos(q.w());
-    m_axis = q.vec() / ei_sqrt(n2);
-  }
-  return *this;
-}
-
-/** Set \c *this from a 3x3 rotation matrix \a mat.
-  */
-template<typename Scalar>
-template<typename Derived>
-AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const MatrixBase<Derived>& mat)
-{
-  // Since a direct conversion would not be really faster,
-  // let's use the robust Quaternion implementation:
-  return *this = QuaternionType(mat);
-}
-
-/** Constructs and \returns an equivalent 3x3 rotation matrix.
-  */
-template<typename Scalar>
-typename AngleAxis<Scalar>::Matrix3
-AngleAxis<Scalar>::toRotationMatrix(void) const
-{
-  Matrix3 res;
-  Vector3 sin_axis  = ei_sin(m_angle) * m_axis;
-  Scalar c = ei_cos(m_angle);
-  Vector3 cos1_axis = (Scalar(1)-c) * m_axis;
-
-  Scalar tmp;
-  tmp = cos1_axis.x() * m_axis.y();
-  res.coeffRef(0,1) = tmp - sin_axis.z();
-  res.coeffRef(1,0) = tmp + sin_axis.z();
-
-  tmp = cos1_axis.x() * m_axis.z();
-  res.coeffRef(0,2) = tmp + sin_axis.y();
-  res.coeffRef(2,0) = tmp - sin_axis.y();
-
-  tmp = cos1_axis.y() * m_axis.z();
-  res.coeffRef(1,2) = tmp - sin_axis.x();
-  res.coeffRef(2,1) = tmp + sin_axis.x();
-
-  res.diagonal() = (cos1_axis.cwise() * m_axis).cwise() + c;
-
-  return res;
-}
-
-} // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Eigen2Support/Geometry/CMakeLists.txt b/nuparu/include/Eigen/src/Eigen2Support/Geometry/CMakeLists.txt
deleted file mode 100644
index c347a8f2..00000000
--- a/nuparu/include/Eigen/src/Eigen2Support/Geometry/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Eigen2Support_Geometry_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Eigen2Support_Geometry_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Eigen2Support/Geometry
-  )
diff --git a/nuparu/include/Eigen/src/Eigen2Support/Geometry/Hyperplane.h b/nuparu/include/Eigen/src/Eigen2Support/Geometry/Hyperplane.h
deleted file mode 100644
index b95bf00e..00000000
--- a/nuparu/include/Eigen/src/Eigen2Support/Geometry/Hyperplane.h
+++ /dev/null
@@ -1,254 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
-
-namespace Eigen { 
-
-/** \geometry_module \ingroup Geometry_Module
-  *
-  * \class Hyperplane
-  *
-  * \brief A hyperplane
-  *
-  * A hyperplane is an affine subspace of dimension n-1 in a space of dimension n.
-  * For example, a hyperplane in a plane is a line; a hyperplane in 3-space is a plane.
-  *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients
-  * \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
-  *             Notice that the dimension of the hyperplane is _AmbientDim-1.
-  *
-  * This class represents an hyperplane as the zero set of the implicit equation
-  * \f$ n \cdot x + d = 0 \f$ where \f$ n \f$ is a unit normal vector of the plane (linear part)
-  * and \f$ d \f$ is the distance (offset) to the origin.
-  */
-template <typename _Scalar, int _AmbientDim>
-class Hyperplane
-{
-public:
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim==Dynamic ? Dynamic : _AmbientDim+1)
-  enum { AmbientDimAtCompileTime = _AmbientDim };
-  typedef _Scalar Scalar;
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef Matrix<Scalar,AmbientDimAtCompileTime,1> VectorType;
-  typedef Matrix<Scalar,int(AmbientDimAtCompileTime)==Dynamic
-                        ? Dynamic
-                        : int(AmbientDimAtCompileTime)+1,1> Coefficients;
-  typedef Block<Coefficients,AmbientDimAtCompileTime,1> NormalReturnType;
-
-  /** Default constructor without initialization */
-  inline Hyperplane() {}
-
-  /** Constructs a dynamic-size hyperplane with \a _dim the dimension
-    * of the ambient space */
-  inline explicit Hyperplane(int _dim) : m_coeffs(_dim+1) {}
-
-  /** Construct a plane from its normal \a n and a point \a e onto the plane.
-    * \warning the vector normal is assumed to be normalized.
-    */
-  inline Hyperplane(const VectorType& n, const VectorType& e)
-    : m_coeffs(n.size()+1)
-  {
-    normal() = n;
-    offset() = -e.eigen2_dot(n);
-  }
-
-  /** Constructs a plane from its normal \a n and distance to the origin \a d
-    * such that the algebraic equation of the plane is \f$ n \cdot x + d = 0 \f$.
-    * \warning the vector normal is assumed to be normalized.
-    */
-  inline Hyperplane(const VectorType& n, Scalar d)
-    : m_coeffs(n.size()+1)
-  {
-    normal() = n;
-    offset() = d;
-  }
-
-  /** Constructs a hyperplane passing through the two points. If the dimension of the ambient space
-    * is greater than 2, then there isn't uniqueness, so an arbitrary choice is made.
-    */
-  static inline Hyperplane Through(const VectorType& p0, const VectorType& p1)
-  {
-    Hyperplane result(p0.size());
-    result.normal() = (p1 - p0).unitOrthogonal();
-    result.offset() = -result.normal().eigen2_dot(p0);
-    return result;
-  }
-
-  /** Constructs a hyperplane passing through the three points. The dimension of the ambient space
-    * is required to be exactly 3.
-    */
-  static inline Hyperplane Through(const VectorType& p0, const VectorType& p1, const VectorType& p2)
-  {
-    EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(VectorType, 3)
-    Hyperplane result(p0.size());
-    result.normal() = (p2 - p0).cross(p1 - p0).normalized();
-    result.offset() = -result.normal().eigen2_dot(p0);
-    return result;
-  }
-
-  /** Constructs a hyperplane passing through the parametrized line \a parametrized.
-    * If the dimension of the ambient space is greater than 2, then there isn't uniqueness,
-    * so an arbitrary choice is made.
-    */
-  // FIXME to be consitent with the rest this could be implemented as a static Through function ??
-  explicit Hyperplane(const ParametrizedLine<Scalar, AmbientDimAtCompileTime>& parametrized)
-  {
-    normal() = parametrized.direction().unitOrthogonal();
-    offset() = -normal().eigen2_dot(parametrized.origin());
-  }
-
-  ~Hyperplane() {}
-
-  /** \returns the dimension in which the plane holds */
-  inline int dim() const { return int(AmbientDimAtCompileTime)==Dynamic ? m_coeffs.size()-1 : int(AmbientDimAtCompileTime); }
-
-  /** normalizes \c *this */
-  void normalize(void)
-  {
-    m_coeffs /= normal().norm();
-  }
-
-  /** \returns the signed distance between the plane \c *this and a point \a p.
-    * \sa absDistance()
-    */
-  inline Scalar signedDistance(const VectorType& p) const { return p.eigen2_dot(normal()) + offset(); }
-
-  /** \returns the absolute distance between the plane \c *this and a point \a p.
-    * \sa signedDistance()
-    */
-  inline Scalar absDistance(const VectorType& p) const { return ei_abs(signedDistance(p)); }
-
-  /** \returns the projection of a point \a p onto the plane \c *this.
-    */
-  inline VectorType projection(const VectorType& p) const { return p - signedDistance(p) * normal(); }
-
-  /** \returns a constant reference to the unit normal vector of the plane, which corresponds
-    * to the linear part of the implicit equation.
-    */
-  inline const NormalReturnType normal() const { return NormalReturnType(*const_cast<Coefficients*>(&m_coeffs),0,0,dim(),1); }
-
-  /** \returns a non-constant reference to the unit normal vector of the plane, which corresponds
-    * to the linear part of the implicit equation.
-    */
-  inline NormalReturnType normal() { return NormalReturnType(m_coeffs,0,0,dim(),1); }
-
-  /** \returns the distance to the origin, which is also the "constant term" of the implicit equation
-    * \warning the vector normal is assumed to be normalized.
-    */
-  inline const Scalar& offset() const { return m_coeffs.coeff(dim()); }
-
-  /** \returns a non-constant reference to the distance to the origin, which is also the constant part
-    * of the implicit equation */
-  inline Scalar& offset() { return m_coeffs(dim()); }
-
-  /** \returns a constant reference to the coefficients c_i of the plane equation:
-    * \f$ c_0*x_0 + ... + c_{d-1}*x_{d-1} + c_d = 0 \f$
-    */
-  inline const Coefficients& coeffs() const { return m_coeffs; }
-
-  /** \returns a non-constant reference to the coefficients c_i of the plane equation:
-    * \f$ c_0*x_0 + ... + c_{d-1}*x_{d-1} + c_d = 0 \f$
-    */
-  inline Coefficients& coeffs() { return m_coeffs; }
-
-  /** \returns the intersection of *this with \a other.
-    *
-    * \warning The ambient space must be a plane, i.e. have dimension 2, so that \c *this and \a other are lines.
-    *
-    * \note If \a other is approximately parallel to *this, this method will return any point on *this.
-    */
-  VectorType intersection(const Hyperplane& other)
-  {
-    EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(VectorType, 2)
-    Scalar det = coeffs().coeff(0) * other.coeffs().coeff(1) - coeffs().coeff(1) * other.coeffs().coeff(0);
-    // since the line equations ax+by=c are normalized with a^2+b^2=1, the following tests
-    // whether the two lines are approximately parallel.
-    if(ei_isMuchSmallerThan(det, Scalar(1)))
-    {   // special case where the two lines are approximately parallel. Pick any point on the first line.
-        if(ei_abs(coeffs().coeff(1))>ei_abs(coeffs().coeff(0)))
-            return VectorType(coeffs().coeff(1), -coeffs().coeff(2)/coeffs().coeff(1)-coeffs().coeff(0));
-        else
-            return VectorType(-coeffs().coeff(2)/coeffs().coeff(0)-coeffs().coeff(1), coeffs().coeff(0));
-    }
-    else
-    {   // general case
-        Scalar invdet = Scalar(1) / det;
-        return VectorType(invdet*(coeffs().coeff(1)*other.coeffs().coeff(2)-other.coeffs().coeff(1)*coeffs().coeff(2)),
-                          invdet*(other.coeffs().coeff(0)*coeffs().coeff(2)-coeffs().coeff(0)*other.coeffs().coeff(2)));
-    }
-  }
-
-  /** Applies the transformation matrix \a mat to \c *this and returns a reference to \c *this.
-    *
-    * \param mat the Dim x Dim transformation matrix
-    * \param traits specifies whether the matrix \a mat represents an Isometry
-    *               or a more generic Affine transformation. The default is Affine.
-    */
-  template<typename XprType>
-  inline Hyperplane& transform(const MatrixBase<XprType>& mat, TransformTraits traits = Affine)
-  {
-    if (traits==Affine)
-      normal() = mat.inverse().transpose() * normal();
-    else if (traits==Isometry)
-      normal() = mat * normal();
-    else
-    {
-      ei_assert("invalid traits value in Hyperplane::transform()");
-    }
-    return *this;
-  }
-
-  /** Applies the transformation \a t to \c *this and returns a reference to \c *this.
-    *
-    * \param t the transformation of dimension Dim
-    * \param traits specifies whether the transformation \a t represents an Isometry
-    *               or a more generic Affine transformation. The default is Affine.
-    *               Other kind of transformations are not supported.
-    */
-  inline Hyperplane& transform(const Transform<Scalar,AmbientDimAtCompileTime>& t,
-                                TransformTraits traits = Affine)
-  {
-    transform(t.linear(), traits);
-    offset() -= t.translation().eigen2_dot(normal());
-    return *this;
-  }
-
-  /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<Hyperplane,
-           Hyperplane<NewScalarType,AmbientDimAtCompileTime> >::type cast() const
-  {
-    return typename internal::cast_return_type<Hyperplane,
-                    Hyperplane<NewScalarType,AmbientDimAtCompileTime> >::type(*this);
-  }
-
-  /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit Hyperplane(const Hyperplane<OtherScalarType,AmbientDimAtCompileTime>& other)
-  { m_coeffs = other.coeffs().template cast<Scalar>(); }
-
-  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const Hyperplane& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
-  { return m_coeffs.isApprox(other.m_coeffs, prec); }
-
-protected:
-
-  Coefficients m_coeffs;
-};
-
-} // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Eigen2Support/Geometry/ParametrizedLine.h b/nuparu/include/Eigen/src/Eigen2Support/Geometry/ParametrizedLine.h
deleted file mode 100644
index 9b57b7e0..00000000
--- a/nuparu/include/Eigen/src/Eigen2Support/Geometry/ParametrizedLine.h
+++ /dev/null
@@ -1,141 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
-
-namespace Eigen { 
-
-/** \geometry_module \ingroup Geometry_Module
-  *
-  * \class ParametrizedLine
-  *
-  * \brief A parametrized line
-  *
-  * A parametrized line is defined by an origin point \f$ \mathbf{o} \f$ and a unit
-  * direction vector \f$ \mathbf{d} \f$ such that the line corresponds to
-  * the set \f$ l(t) = \mathbf{o} + t \mathbf{d} \f$, \f$ l \in \mathbf{R} \f$.
-  *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients
-  * \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
-  */
-template <typename _Scalar, int _AmbientDim>
-class ParametrizedLine
-{
-public:
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
-  enum { AmbientDimAtCompileTime = _AmbientDim };
-  typedef _Scalar Scalar;
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef Matrix<Scalar,AmbientDimAtCompileTime,1> VectorType;
-
-  /** Default constructor without initialization */
-  inline ParametrizedLine() {}
-
-  /** Constructs a dynamic-size line with \a _dim the dimension
-    * of the ambient space */
-  inline explicit ParametrizedLine(int _dim) : m_origin(_dim), m_direction(_dim) {}
-
-  /** Initializes a parametrized line of direction \a direction and origin \a origin.
-    * \warning the vector direction is assumed to be normalized.
-    */
-  ParametrizedLine(const VectorType& origin, const VectorType& direction)
-    : m_origin(origin), m_direction(direction) {}
-
-  explicit ParametrizedLine(const Hyperplane<_Scalar, _AmbientDim>& hyperplane);
-
-  /** Constructs a parametrized line going from \a p0 to \a p1. */
-  static inline ParametrizedLine Through(const VectorType& p0, const VectorType& p1)
-  { return ParametrizedLine(p0, (p1-p0).normalized()); }
-
-  ~ParametrizedLine() {}
-
-  /** \returns the dimension in which the line holds */
-  inline int dim() const { return m_direction.size(); }
-
-  const VectorType& origin() const { return m_origin; }
-  VectorType& origin() { return m_origin; }
-
-  const VectorType& direction() const { return m_direction; }
-  VectorType& direction() { return m_direction; }
-
-  /** \returns the squared distance of a point \a p to its projection onto the line \c *this.
-    * \sa distance()
-    */
-  RealScalar squaredDistance(const VectorType& p) const
-  {
-    VectorType diff = p-origin();
-    return (diff - diff.eigen2_dot(direction())* direction()).squaredNorm();
-  }
-  /** \returns the distance of a point \a p to its projection onto the line \c *this.
-    * \sa squaredDistance()
-    */
-  RealScalar distance(const VectorType& p) const { return ei_sqrt(squaredDistance(p)); }
-
-  /** \returns the projection of a point \a p onto the line \c *this. */
-  VectorType projection(const VectorType& p) const
-  { return origin() + (p-origin()).eigen2_dot(direction()) * direction(); }
-
-  Scalar intersection(const Hyperplane<_Scalar, _AmbientDim>& hyperplane);
-
-  /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<ParametrizedLine,
-           ParametrizedLine<NewScalarType,AmbientDimAtCompileTime> >::type cast() const
-  {
-    return typename internal::cast_return_type<ParametrizedLine,
-                    ParametrizedLine<NewScalarType,AmbientDimAtCompileTime> >::type(*this);
-  }
-
-  /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit ParametrizedLine(const ParametrizedLine<OtherScalarType,AmbientDimAtCompileTime>& other)
-  {
-    m_origin = other.origin().template cast<Scalar>();
-    m_direction = other.direction().template cast<Scalar>();
-  }
-
-  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const ParametrizedLine& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
-  { return m_origin.isApprox(other.m_origin, prec) && m_direction.isApprox(other.m_direction, prec); }
-
-protected:
-
-  VectorType m_origin, m_direction;
-};
-
-/** Constructs a parametrized line from a 2D hyperplane
-  *
-  * \warning the ambient space must have dimension 2 such that the hyperplane actually describes a line
-  */
-template <typename _Scalar, int _AmbientDim>
-inline ParametrizedLine<_Scalar, _AmbientDim>::ParametrizedLine(const Hyperplane<_Scalar, _AmbientDim>& hyperplane)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(VectorType, 2)
-  direction() = hyperplane.normal().unitOrthogonal();
-  origin() = -hyperplane.normal()*hyperplane.offset();
-}
-
-/** \returns the parameter value of the intersection between \c *this and the given hyperplane
-  */
-template <typename _Scalar, int _AmbientDim>
-inline _Scalar ParametrizedLine<_Scalar, _AmbientDim>::intersection(const Hyperplane<_Scalar, _AmbientDim>& hyperplane)
-{
-  return -(hyperplane.offset()+origin().eigen2_dot(hyperplane.normal()))
-          /(direction().eigen2_dot(hyperplane.normal()));
-}
-
-} // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Eigen2Support/Geometry/Quaternion.h b/nuparu/include/Eigen/src/Eigen2Support/Geometry/Quaternion.h
deleted file mode 100644
index 4b6390cf..00000000
--- a/nuparu/include/Eigen/src/Eigen2Support/Geometry/Quaternion.h
+++ /dev/null
@@ -1,495 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
-
-namespace Eigen { 
-
-template<typename Other,
-         int OtherRows=Other::RowsAtCompileTime,
-         int OtherCols=Other::ColsAtCompileTime>
-struct ei_quaternion_assign_impl;
-
-/** \geometry_module \ingroup Geometry_Module
-  *
-  * \class Quaternion
-  *
-  * \brief The quaternion class used to represent 3D orientations and rotations
-  *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients
-  *
-  * This class represents a quaternion \f$ w+xi+yj+zk \f$ that is a convenient representation of
-  * orientations and rotations of objects in three dimensions. Compared to other representations
-  * like Euler angles or 3x3 matrices, quatertions offer the following advantages:
-  * \li \b compact storage (4 scalars)
-  * \li \b efficient to compose (28 flops),
-  * \li \b stable spherical interpolation
-  *
-  * The following two typedefs are provided for convenience:
-  * \li \c Quaternionf for \c float
-  * \li \c Quaterniond for \c double
-  *
-  * \sa  class AngleAxis, class Transform
-  */
-
-template<typename _Scalar> struct ei_traits<Quaternion<_Scalar> >
-{
-  typedef _Scalar Scalar;
-};
-
-template<typename _Scalar>
-class Quaternion : public RotationBase<Quaternion<_Scalar>,3>
-{
-  typedef RotationBase<Quaternion<_Scalar>,3> Base;
-
-public:
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,4)
-
-  using Base::operator*;
-
-  /** the scalar type of the coefficients */
-  typedef _Scalar Scalar;
-
-  /** the type of the Coefficients 4-vector */
-  typedef Matrix<Scalar, 4, 1> Coefficients;
-  /** the type of a 3D vector */
-  typedef Matrix<Scalar,3,1> Vector3;
-  /** the equivalent rotation matrix type */
-  typedef Matrix<Scalar,3,3> Matrix3;
-  /** the equivalent angle-axis type */
-  typedef AngleAxis<Scalar> AngleAxisType;
-
-  /** \returns the \c x coefficient */
-  inline Scalar x() const { return m_coeffs.coeff(0); }
-  /** \returns the \c y coefficient */
-  inline Scalar y() const { return m_coeffs.coeff(1); }
-  /** \returns the \c z coefficient */
-  inline Scalar z() const { return m_coeffs.coeff(2); }
-  /** \returns the \c w coefficient */
-  inline Scalar w() const { return m_coeffs.coeff(3); }
-
-  /** \returns a reference to the \c x coefficient */
-  inline Scalar& x() { return m_coeffs.coeffRef(0); }
-  /** \returns a reference to the \c y coefficient */
-  inline Scalar& y() { return m_coeffs.coeffRef(1); }
-  /** \returns a reference to the \c z coefficient */
-  inline Scalar& z() { return m_coeffs.coeffRef(2); }
-  /** \returns a reference to the \c w coefficient */
-  inline Scalar& w() { return m_coeffs.coeffRef(3); }
-
-  /** \returns a read-only vector expression of the imaginary part (x,y,z) */
-  inline const Block<const Coefficients,3,1> vec() const { return m_coeffs.template start<3>(); }
-
-  /** \returns a vector expression of the imaginary part (x,y,z) */
-  inline Block<Coefficients,3,1> vec() { return m_coeffs.template start<3>(); }
-
-  /** \returns a read-only vector expression of the coefficients (x,y,z,w) */
-  inline const Coefficients& coeffs() const { return m_coeffs; }
-
-  /** \returns a vector expression of the coefficients (x,y,z,w) */
-  inline Coefficients& coeffs() { return m_coeffs; }
-
-  /** Default constructor leaving the quaternion uninitialized. */
-  inline Quaternion() {}
-
-  /** Constructs and initializes the quaternion \f$ w+xi+yj+zk \f$ from
-    * its four coefficients \a w, \a x, \a y and \a z.
-    *
-    * \warning Note the order of the arguments: the real \a w coefficient first,
-    * while internally the coefficients are stored in the following order:
-    * [\c x, \c y, \c z, \c w]
-    */
-  inline Quaternion(Scalar w, Scalar x, Scalar y, Scalar z)
-  { m_coeffs << x, y, z, w; }
-
-  /** Copy constructor */
-  inline Quaternion(const Quaternion& other) { m_coeffs = other.m_coeffs; }
-
-  /** Constructs and initializes a quaternion from the angle-axis \a aa */
-  explicit inline Quaternion(const AngleAxisType& aa) { *this = aa; }
-
-  /** Constructs and initializes a quaternion from either:
-    *  - a rotation matrix expression,
-    *  - a 4D vector expression representing quaternion coefficients.
-    * \sa operator=(MatrixBase<Derived>)
-    */
-  template<typename Derived>
-  explicit inline Quaternion(const MatrixBase<Derived>& other) { *this = other; }
-
-  Quaternion& operator=(const Quaternion& other);
-  Quaternion& operator=(const AngleAxisType& aa);
-  template<typename Derived>
-  Quaternion& operator=(const MatrixBase<Derived>& m);
-
-  /** \returns a quaternion representing an identity rotation
-    * \sa MatrixBase::Identity()
-    */
-  static inline Quaternion Identity() { return Quaternion(1, 0, 0, 0); }
-
-  /** \sa Quaternion::Identity(), MatrixBase::setIdentity()
-    */
-  inline Quaternion& setIdentity() { m_coeffs << 0, 0, 0, 1; return *this; }
-
-  /** \returns the squared norm of the quaternion's coefficients
-    * \sa Quaternion::norm(), MatrixBase::squaredNorm()
-    */
-  inline Scalar squaredNorm() const { return m_coeffs.squaredNorm(); }
-
-  /** \returns the norm of the quaternion's coefficients
-    * \sa Quaternion::squaredNorm(), MatrixBase::norm()
-    */
-  inline Scalar norm() const { return m_coeffs.norm(); }
-
-  /** Normalizes the quaternion \c *this
-    * \sa normalized(), MatrixBase::normalize() */
-  inline void normalize() { m_coeffs.normalize(); }
-  /** \returns a normalized version of \c *this
-    * \sa normalize(), MatrixBase::normalized() */
-  inline Quaternion normalized() const { return Quaternion(m_coeffs.normalized()); }
-
-  /** \returns the dot product of \c *this and \a other
-    * Geometrically speaking, the dot product of two unit quaternions
-    * corresponds to the cosine of half the angle between the two rotations.
-    * \sa angularDistance()
-    */
-  inline Scalar eigen2_dot(const Quaternion& other) const { return m_coeffs.eigen2_dot(other.m_coeffs); }
-
-  inline Scalar angularDistance(const Quaternion& other) const;
-
-  Matrix3 toRotationMatrix(void) const;
-
-  template<typename Derived1, typename Derived2>
-  Quaternion& setFromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b);
-
-  inline Quaternion operator* (const Quaternion& q) const;
-  inline Quaternion& operator*= (const Quaternion& q);
-
-  Quaternion inverse(void) const;
-  Quaternion conjugate(void) const;
-
-  Quaternion slerp(Scalar t, const Quaternion& other) const;
-
-  template<typename Derived>
-  Vector3 operator* (const MatrixBase<Derived>& vec) const;
-
-  /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<Quaternion,Quaternion<NewScalarType> >::type cast() const
-  { return typename internal::cast_return_type<Quaternion,Quaternion<NewScalarType> >::type(*this); }
-
-  /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit Quaternion(const Quaternion<OtherScalarType>& other)
-  { m_coeffs = other.coeffs().template cast<Scalar>(); }
-
-  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const Quaternion& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
-  { return m_coeffs.isApprox(other.m_coeffs, prec); }
-
-protected:
-  Coefficients m_coeffs;
-};
-
-/** \ingroup Geometry_Module
-  * single precision quaternion type */
-typedef Quaternion<float> Quaternionf;
-/** \ingroup Geometry_Module
-  * double precision quaternion type */
-typedef Quaternion<double> Quaterniond;
-
-// Generic Quaternion * Quaternion product
-template<typename Scalar> inline Quaternion<Scalar>
-ei_quaternion_product(const Quaternion<Scalar>& a, const Quaternion<Scalar>& b)
-{
-  return Quaternion<Scalar>
-  (
-    a.w() * b.w() - a.x() * b.x() - a.y() * b.y() - a.z() * b.z(),
-    a.w() * b.x() + a.x() * b.w() + a.y() * b.z() - a.z() * b.y(),
-    a.w() * b.y() + a.y() * b.w() + a.z() * b.x() - a.x() * b.z(),
-    a.w() * b.z() + a.z() * b.w() + a.x() * b.y() - a.y() * b.x()
-  );
-}
-
-/** \returns the concatenation of two rotations as a quaternion-quaternion product */
-template <typename Scalar>
-inline Quaternion<Scalar> Quaternion<Scalar>::operator* (const Quaternion& other) const
-{
-  return ei_quaternion_product(*this,other);
-}
-
-/** \sa operator*(Quaternion) */
-template <typename Scalar>
-inline Quaternion<Scalar>& Quaternion<Scalar>::operator*= (const Quaternion& other)
-{
-  return (*this = *this * other);
-}
-
-/** Rotation of a vector by a quaternion.
-  * \remarks If the quaternion is used to rotate several points (>1)
-  * then it is much more efficient to first convert it to a 3x3 Matrix.
-  * Comparison of the operation cost for n transformations:
-  *   - Quaternion:    30n
-  *   - Via a Matrix3: 24 + 15n
-  */
-template <typename Scalar>
-template<typename Derived>
-inline typename Quaternion<Scalar>::Vector3
-Quaternion<Scalar>::operator* (const MatrixBase<Derived>& v) const
-{
-    // Note that this algorithm comes from the optimization by hand
-    // of the conversion to a Matrix followed by a Matrix/Vector product.
-    // It appears to be much faster than the common algorithm found
-    // in the litterature (30 versus 39 flops). It also requires two
-    // Vector3 as temporaries.
-    Vector3 uv;
-    uv = 2 * this->vec().cross(v);
-    return v + this->w() * uv + this->vec().cross(uv);
-}
-
-template<typename Scalar>
-inline Quaternion<Scalar>& Quaternion<Scalar>::operator=(const Quaternion& other)
-{
-  m_coeffs = other.m_coeffs;
-  return *this;
-}
-
-/** Set \c *this from an angle-axis \a aa and returns a reference to \c *this
-  */
-template<typename Scalar>
-inline Quaternion<Scalar>& Quaternion<Scalar>::operator=(const AngleAxisType& aa)
-{
-  Scalar ha = Scalar(0.5)*aa.angle(); // Scalar(0.5) to suppress precision loss warnings
-  this->w() = ei_cos(ha);
-  this->vec() = ei_sin(ha) * aa.axis();
-  return *this;
-}
-
-/** Set \c *this from the expression \a xpr:
-  *   - if \a xpr is a 4x1 vector, then \a xpr is assumed to be a quaternion
-  *   - if \a xpr is a 3x3 matrix, then \a xpr is assumed to be rotation matrix
-  *     and \a xpr is converted to a quaternion
-  */
-template<typename Scalar>
-template<typename Derived>
-inline Quaternion<Scalar>& Quaternion<Scalar>::operator=(const MatrixBase<Derived>& xpr)
-{
-  ei_quaternion_assign_impl<Derived>::run(*this, xpr.derived());
-  return *this;
-}
-
-/** Convert the quaternion to a 3x3 rotation matrix */
-template<typename Scalar>
-inline typename Quaternion<Scalar>::Matrix3
-Quaternion<Scalar>::toRotationMatrix(void) const
-{
-  // NOTE if inlined, then gcc 4.2 and 4.4 get rid of the temporary (not gcc 4.3 !!)
-  // if not inlined then the cost of the return by value is huge ~ +35%,
-  // however, not inlining this function is an order of magnitude slower, so
-  // it has to be inlined, and so the return by value is not an issue
-  Matrix3 res;
-
-  const Scalar tx  = Scalar(2)*this->x();
-  const Scalar ty  = Scalar(2)*this->y();
-  const Scalar tz  = Scalar(2)*this->z();
-  const Scalar twx = tx*this->w();
-  const Scalar twy = ty*this->w();
-  const Scalar twz = tz*this->w();
-  const Scalar txx = tx*this->x();
-  const Scalar txy = ty*this->x();
-  const Scalar txz = tz*this->x();
-  const Scalar tyy = ty*this->y();
-  const Scalar tyz = tz*this->y();
-  const Scalar tzz = tz*this->z();
-
-  res.coeffRef(0,0) = Scalar(1)-(tyy+tzz);
-  res.coeffRef(0,1) = txy-twz;
-  res.coeffRef(0,2) = txz+twy;
-  res.coeffRef(1,0) = txy+twz;
-  res.coeffRef(1,1) = Scalar(1)-(txx+tzz);
-  res.coeffRef(1,2) = tyz-twx;
-  res.coeffRef(2,0) = txz-twy;
-  res.coeffRef(2,1) = tyz+twx;
-  res.coeffRef(2,2) = Scalar(1)-(txx+tyy);
-
-  return res;
-}
-
-/** Sets *this to be a quaternion representing a rotation sending the vector \a a to the vector \a b.
-  *
-  * \returns a reference to *this.
-  *
-  * Note that the two input vectors do \b not have to be normalized.
-  */
-template<typename Scalar>
-template<typename Derived1, typename Derived2>
-inline Quaternion<Scalar>& Quaternion<Scalar>::setFromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b)
-{
-  Vector3 v0 = a.normalized();
-  Vector3 v1 = b.normalized();
-  Scalar c = v0.eigen2_dot(v1);
-
-  // if dot == 1, vectors are the same
-  if (ei_isApprox(c,Scalar(1)))
-  {
-    // set to identity
-    this->w() = 1; this->vec().setZero();
-    return *this;
-  }
-  // if dot == -1, vectors are opposites
-  if (ei_isApprox(c,Scalar(-1)))
-  {
-    this->vec() = v0.unitOrthogonal();
-    this->w() = 0;
-    return *this;
-  }
-
-  Vector3 axis = v0.cross(v1);
-  Scalar s = ei_sqrt((Scalar(1)+c)*Scalar(2));
-  Scalar invs = Scalar(1)/s;
-  this->vec() = axis * invs;
-  this->w() = s * Scalar(0.5);
-
-  return *this;
-}
-
-/** \returns the multiplicative inverse of \c *this
-  * Note that in most cases, i.e., if you simply want the opposite rotation,
-  * and/or the quaternion is normalized, then it is enough to use the conjugate.
-  *
-  * \sa Quaternion::conjugate()
-  */
-template <typename Scalar>
-inline Quaternion<Scalar> Quaternion<Scalar>::inverse() const
-{
-  // FIXME should this function be called multiplicativeInverse and conjugate() be called inverse() or opposite()  ??
-  Scalar n2 = this->squaredNorm();
-  if (n2 > 0)
-    return Quaternion(conjugate().coeffs() / n2);
-  else
-  {
-    // return an invalid result to flag the error
-    return Quaternion(Coefficients::Zero());
-  }
-}
-
-/** \returns the conjugate of the \c *this which is equal to the multiplicative inverse
-  * if the quaternion is normalized.
-  * The conjugate of a quaternion represents the opposite rotation.
-  *
-  * \sa Quaternion::inverse()
-  */
-template <typename Scalar>
-inline Quaternion<Scalar> Quaternion<Scalar>::conjugate() const
-{
-  return Quaternion(this->w(),-this->x(),-this->y(),-this->z());
-}
-
-/** \returns the angle (in radian) between two rotations
-  * \sa eigen2_dot()
-  */
-template <typename Scalar>
-inline Scalar Quaternion<Scalar>::angularDistance(const Quaternion& other) const
-{
-  double d = ei_abs(this->eigen2_dot(other));
-  if (d>=1.0)
-    return 0;
-  return Scalar(2) * std::acos(d);
-}
-
-/** \returns the spherical linear interpolation between the two quaternions
-  * \c *this and \a other at the parameter \a t
-  */
-template <typename Scalar>
-Quaternion<Scalar> Quaternion<Scalar>::slerp(Scalar t, const Quaternion& other) const
-{
-  static const Scalar one = Scalar(1) - machine_epsilon<Scalar>();
-  Scalar d = this->eigen2_dot(other);
-  Scalar absD = ei_abs(d);
-
-  Scalar scale0;
-  Scalar scale1;
-
-  if (absD>=one)
-  {
-    scale0 = Scalar(1) - t;
-    scale1 = t;
-  }
-  else
-  {
-    // theta is the angle between the 2 quaternions
-    Scalar theta = std::acos(absD);
-    Scalar sinTheta = ei_sin(theta);
-
-    scale0 = ei_sin( ( Scalar(1) - t ) * theta) / sinTheta;
-    scale1 = ei_sin( ( t * theta) ) / sinTheta;
-    if (d<0)
-      scale1 = -scale1;
-  }
-
-  return Quaternion<Scalar>(scale0 * coeffs() + scale1 * other.coeffs());
-}
-
-// set from a rotation matrix
-template<typename Other>
-struct ei_quaternion_assign_impl<Other,3,3>
-{
-  typedef typename Other::Scalar Scalar;
-  static inline void run(Quaternion<Scalar>& q, const Other& mat)
-  {
-    // This algorithm comes from  "Quaternion Calculus and Fast Animation",
-    // Ken Shoemake, 1987 SIGGRAPH course notes
-    Scalar t = mat.trace();
-    if (t > 0)
-    {
-      t = ei_sqrt(t + Scalar(1.0));
-      q.w() = Scalar(0.5)*t;
-      t = Scalar(0.5)/t;
-      q.x() = (mat.coeff(2,1) - mat.coeff(1,2)) * t;
-      q.y() = (mat.coeff(0,2) - mat.coeff(2,0)) * t;
-      q.z() = (mat.coeff(1,0) - mat.coeff(0,1)) * t;
-    }
-    else
-    {
-      int i = 0;
-      if (mat.coeff(1,1) > mat.coeff(0,0))
-        i = 1;
-      if (mat.coeff(2,2) > mat.coeff(i,i))
-        i = 2;
-      int j = (i+1)%3;
-      int k = (j+1)%3;
-
-      t = ei_sqrt(mat.coeff(i,i)-mat.coeff(j,j)-mat.coeff(k,k) + Scalar(1.0));
-      q.coeffs().coeffRef(i) = Scalar(0.5) * t;
-      t = Scalar(0.5)/t;
-      q.w() = (mat.coeff(k,j)-mat.coeff(j,k))*t;
-      q.coeffs().coeffRef(j) = (mat.coeff(j,i)+mat.coeff(i,j))*t;
-      q.coeffs().coeffRef(k) = (mat.coeff(k,i)+mat.coeff(i,k))*t;
-    }
-  }
-};
-
-// set from a vector of coefficients assumed to be a quaternion
-template<typename Other>
-struct ei_quaternion_assign_impl<Other,4,1>
-{
-  typedef typename Other::Scalar Scalar;
-  static inline void run(Quaternion<Scalar>& q, const Other& vec)
-  {
-    q.coeffs() = vec;
-  }
-};
-
-} // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Eigen2Support/Geometry/Rotation2D.h b/nuparu/include/Eigen/src/Eigen2Support/Geometry/Rotation2D.h
deleted file mode 100644
index 19b8582a..00000000
--- a/nuparu/include/Eigen/src/Eigen2Support/Geometry/Rotation2D.h
+++ /dev/null
@@ -1,145 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
-
-namespace Eigen { 
-
-/** \geometry_module \ingroup Geometry_Module
-  *
-  * \class Rotation2D
-  *
-  * \brief Represents a rotation/orientation in a 2 dimensional space.
-  *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients
-  *
-  * This class is equivalent to a single scalar representing a counter clock wise rotation
-  * as a single angle in radian. It provides some additional features such as the automatic
-  * conversion from/to a 2x2 rotation matrix. Moreover this class aims to provide a similar
-  * interface to Quaternion in order to facilitate the writing of generic algorithms
-  * dealing with rotations.
-  *
-  * \sa class Quaternion, class Transform
-  */
-template<typename _Scalar> struct ei_traits<Rotation2D<_Scalar> >
-{
-  typedef _Scalar Scalar;
-};
-
-template<typename _Scalar>
-class Rotation2D : public RotationBase<Rotation2D<_Scalar>,2>
-{
-  typedef RotationBase<Rotation2D<_Scalar>,2> Base;
-
-public:
-
-  using Base::operator*;
-
-  enum { Dim = 2 };
-  /** the scalar type of the coefficients */
-  typedef _Scalar Scalar;
-  typedef Matrix<Scalar,2,1> Vector2;
-  typedef Matrix<Scalar,2,2> Matrix2;
-
-protected:
-
-  Scalar m_angle;
-
-public:
-
-  /** Construct a 2D counter clock wise rotation from the angle \a a in radian. */
-  inline Rotation2D(Scalar a) : m_angle(a) {}
-
-  /** \returns the rotation angle */
-  inline Scalar angle() const { return m_angle; }
-
-  /** \returns a read-write reference to the rotation angle */
-  inline Scalar& angle() { return m_angle; }
-
-  /** \returns the inverse rotation */
-  inline Rotation2D inverse() const { return -m_angle; }
-
-  /** Concatenates two rotations */
-  inline Rotation2D operator*(const Rotation2D& other) const
-  { return m_angle + other.m_angle; }
-
-  /** Concatenates two rotations */
-  inline Rotation2D& operator*=(const Rotation2D& other)
-  { return m_angle += other.m_angle; return *this; }
-
-  /** Applies the rotation to a 2D vector */
-  Vector2 operator* (const Vector2& vec) const
-  { return toRotationMatrix() * vec; }
-
-  template<typename Derived>
-  Rotation2D& fromRotationMatrix(const MatrixBase<Derived>& m);
-  Matrix2 toRotationMatrix(void) const;
-
-  /** \returns the spherical interpolation between \c *this and \a other using
-    * parameter \a t. It is in fact equivalent to a linear interpolation.
-    */
-  inline Rotation2D slerp(Scalar t, const Rotation2D& other) const
-  { return m_angle * (1-t) + other.angle() * t; }
-
-  /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<Rotation2D,Rotation2D<NewScalarType> >::type cast() const
-  { return typename internal::cast_return_type<Rotation2D,Rotation2D<NewScalarType> >::type(*this); }
-
-  /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit Rotation2D(const Rotation2D<OtherScalarType>& other)
-  {
-    m_angle = Scalar(other.angle());
-  }
-
-  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const Rotation2D& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
-  { return ei_isApprox(m_angle,other.m_angle, prec); }
-};
-
-/** \ingroup Geometry_Module
-  * single precision 2D rotation type */
-typedef Rotation2D<float> Rotation2Df;
-/** \ingroup Geometry_Module
-  * double precision 2D rotation type */
-typedef Rotation2D<double> Rotation2Dd;
-
-/** Set \c *this from a 2x2 rotation matrix \a mat.
-  * In other words, this function extract the rotation angle
-  * from the rotation matrix.
-  */
-template<typename Scalar>
-template<typename Derived>
-Rotation2D<Scalar>& Rotation2D<Scalar>::fromRotationMatrix(const MatrixBase<Derived>& mat)
-{
-  EIGEN_STATIC_ASSERT(Derived::RowsAtCompileTime==2 && Derived::ColsAtCompileTime==2,YOU_MADE_A_PROGRAMMING_MISTAKE)
-  m_angle = ei_atan2(mat.coeff(1,0), mat.coeff(0,0));
-  return *this;
-}
-
-/** Constructs and \returns an equivalent 2x2 rotation matrix.
-  */
-template<typename Scalar>
-typename Rotation2D<Scalar>::Matrix2
-Rotation2D<Scalar>::toRotationMatrix(void) const
-{
-  Scalar sinA = ei_sin(m_angle);
-  Scalar cosA = ei_cos(m_angle);
-  return (Matrix2() << cosA, -sinA, sinA, cosA).finished();
-}
-
-} // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Eigen2Support/Geometry/RotationBase.h b/nuparu/include/Eigen/src/Eigen2Support/Geometry/RotationBase.h
deleted file mode 100644
index b1c8f38d..00000000
--- a/nuparu/include/Eigen/src/Eigen2Support/Geometry/RotationBase.h
+++ /dev/null
@@ -1,123 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
-
-namespace Eigen { 
-
-// this file aims to contains the various representations of rotation/orientation
-// in 2D and 3D space excepted Matrix and Quaternion.
-
-/** \class RotationBase
-  *
-  * \brief Common base class for compact rotation representations
-  *
-  * \param Derived is the derived type, i.e., a rotation type
-  * \param _Dim the dimension of the space
-  */
-template<typename Derived, int _Dim>
-class RotationBase
-{
-  public:
-    enum { Dim = _Dim };
-    /** the scalar type of the coefficients */
-    typedef typename ei_traits<Derived>::Scalar Scalar;
-    
-    /** corresponding linear transformation matrix type */
-    typedef Matrix<Scalar,Dim,Dim> RotationMatrixType;
-
-    inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
-    inline Derived& derived() { return *static_cast<Derived*>(this); }
-
-    /** \returns an equivalent rotation matrix */
-    inline RotationMatrixType toRotationMatrix() const { return derived().toRotationMatrix(); }
-
-    /** \returns the inverse rotation */
-    inline Derived inverse() const { return derived().inverse(); }
-
-    /** \returns the concatenation of the rotation \c *this with a translation \a t */
-    inline Transform<Scalar,Dim> operator*(const Translation<Scalar,Dim>& t) const
-    { return toRotationMatrix() * t; }
-
-    /** \returns the concatenation of the rotation \c *this with a scaling \a s */
-    inline RotationMatrixType operator*(const Scaling<Scalar,Dim>& s) const
-    { return toRotationMatrix() * s; }
-
-    /** \returns the concatenation of the rotation \c *this with an affine transformation \a t */
-    inline Transform<Scalar,Dim> operator*(const Transform<Scalar,Dim>& t) const
-    { return toRotationMatrix() * t; }
-};
-
-/** \geometry_module
-  *
-  * Constructs a Dim x Dim rotation matrix from the rotation \a r
-  */
-template<typename _Scalar, int _Rows, int _Cols, int _Storage, int _MaxRows, int _MaxCols>
-template<typename OtherDerived>
-Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols>
-::Matrix(const RotationBase<OtherDerived,ColsAtCompileTime>& r)
-{
-  EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Matrix,int(OtherDerived::Dim),int(OtherDerived::Dim))
-  *this = r.toRotationMatrix();
-}
-
-/** \geometry_module
-  *
-  * Set a Dim x Dim rotation matrix from the rotation \a r
-  */
-template<typename _Scalar, int _Rows, int _Cols, int _Storage, int _MaxRows, int _MaxCols>
-template<typename OtherDerived>
-Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols>&
-Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols>
-::operator=(const RotationBase<OtherDerived,ColsAtCompileTime>& r)
-{
-  EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Matrix,int(OtherDerived::Dim),int(OtherDerived::Dim))
-  return *this = r.toRotationMatrix();
-}
-
-/** \internal
-  *
-  * Helper function to return an arbitrary rotation object to a rotation matrix.
-  *
-  * \param Scalar the numeric type of the matrix coefficients
-  * \param Dim the dimension of the current space
-  *
-  * It returns a Dim x Dim fixed size matrix.
-  *
-  * Default specializations are provided for:
-  *   - any scalar type (2D),
-  *   - any matrix expression,
-  *   - any type based on RotationBase (e.g., Quaternion, AngleAxis, Rotation2D)
-  *
-  * Currently ei_toRotationMatrix is only used by Transform.
-  *
-  * \sa class Transform, class Rotation2D, class Quaternion, class AngleAxis
-  */
-template<typename Scalar, int Dim>
-static inline Matrix<Scalar,2,2> ei_toRotationMatrix(const Scalar& s)
-{
-  EIGEN_STATIC_ASSERT(Dim==2,YOU_MADE_A_PROGRAMMING_MISTAKE)
-  return Rotation2D<Scalar>(s).toRotationMatrix();
-}
-
-template<typename Scalar, int Dim, typename OtherDerived>
-static inline Matrix<Scalar,Dim,Dim> ei_toRotationMatrix(const RotationBase<OtherDerived,Dim>& r)
-{
-  return r.toRotationMatrix();
-}
-
-template<typename Scalar, int Dim, typename OtherDerived>
-static inline const MatrixBase<OtherDerived>& ei_toRotationMatrix(const MatrixBase<OtherDerived>& mat)
-{
-  EIGEN_STATIC_ASSERT(OtherDerived::RowsAtCompileTime==Dim && OtherDerived::ColsAtCompileTime==Dim,
-    YOU_MADE_A_PROGRAMMING_MISTAKE)
-  return mat;
-}
-
-} // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Eigen2Support/Geometry/Scaling.h b/nuparu/include/Eigen/src/Eigen2Support/Geometry/Scaling.h
deleted file mode 100644
index b8fa6cd3..00000000
--- a/nuparu/include/Eigen/src/Eigen2Support/Geometry/Scaling.h
+++ /dev/null
@@ -1,167 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
-
-namespace Eigen { 
-
-/** \geometry_module \ingroup Geometry_Module
-  *
-  * \class Scaling
-  *
-  * \brief Represents a possibly non uniform scaling transformation
-  *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients.
-  * \param _Dim the  dimension of the space, can be a compile time value or Dynamic
-  *
-  * \note This class is not aimed to be used to store a scaling transformation,
-  * but rather to make easier the constructions and updates of Transform objects.
-  *
-  * \sa class Translation, class Transform
-  */
-template<typename _Scalar, int _Dim>
-class Scaling
-{
-public:
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Dim)
-  /** dimension of the space */
-  enum { Dim = _Dim };
-  /** the scalar type of the coefficients */
-  typedef _Scalar Scalar;
-  /** corresponding vector type */
-  typedef Matrix<Scalar,Dim,1> VectorType;
-  /** corresponding linear transformation matrix type */
-  typedef Matrix<Scalar,Dim,Dim> LinearMatrixType;
-  /** corresponding translation type */
-  typedef Translation<Scalar,Dim> TranslationType;
-  /** corresponding affine transformation type */
-  typedef Transform<Scalar,Dim> TransformType;
-
-protected:
-
-  VectorType m_coeffs;
-
-public:
-
-  /** Default constructor without initialization. */
-  Scaling() {}
-  /** Constructs and initialize a uniform scaling transformation */
-  explicit inline Scaling(const Scalar& s) { m_coeffs.setConstant(s); }
-  /** 2D only */
-  inline Scaling(const Scalar& sx, const Scalar& sy)
-  {
-    ei_assert(Dim==2);
-    m_coeffs.x() = sx;
-    m_coeffs.y() = sy;
-  }
-  /** 3D only */
-  inline Scaling(const Scalar& sx, const Scalar& sy, const Scalar& sz)
-  {
-    ei_assert(Dim==3);
-    m_coeffs.x() = sx;
-    m_coeffs.y() = sy;
-    m_coeffs.z() = sz;
-  }
-  /** Constructs and initialize the scaling transformation from a vector of scaling coefficients */
-  explicit inline Scaling(const VectorType& coeffs) : m_coeffs(coeffs) {}
-
-  const VectorType& coeffs() const { return m_coeffs; }
-  VectorType& coeffs() { return m_coeffs; }
-
-  /** Concatenates two scaling */
-  inline Scaling operator* (const Scaling& other) const
-  { return Scaling(coeffs().cwise() * other.coeffs()); }
-
-  /** Concatenates a scaling and a translation */
-  inline TransformType operator* (const TranslationType& t) const;
-
-  /** Concatenates a scaling and an affine transformation */
-  inline TransformType operator* (const TransformType& t) const;
-
-  /** Concatenates a scaling and a linear transformation matrix */
-  // TODO returns an expression
-  inline LinearMatrixType operator* (const LinearMatrixType& other) const
-  { return coeffs().asDiagonal() * other; }
-
-  /** Concatenates a linear transformation matrix and a scaling */
-  // TODO returns an expression
-  friend inline LinearMatrixType operator* (const LinearMatrixType& other, const Scaling& s)
-  { return other * s.coeffs().asDiagonal(); }
-
-  template<typename Derived>
-  inline LinearMatrixType operator*(const RotationBase<Derived,Dim>& r) const
-  { return *this * r.toRotationMatrix(); }
-
-  /** Applies scaling to vector */
-  inline VectorType operator* (const VectorType& other) const
-  { return coeffs().asDiagonal() * other; }
-
-  /** \returns the inverse scaling */
-  inline Scaling inverse() const
-  { return Scaling(coeffs().cwise().inverse()); }
-
-  inline Scaling& operator=(const Scaling& other)
-  {
-    m_coeffs = other.m_coeffs;
-    return *this;
-  }
-
-  /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<Scaling,Scaling<NewScalarType,Dim> >::type cast() const
-  { return typename internal::cast_return_type<Scaling,Scaling<NewScalarType,Dim> >::type(*this); }
-
-  /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit Scaling(const Scaling<OtherScalarType,Dim>& other)
-  { m_coeffs = other.coeffs().template cast<Scalar>(); }
-
-  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const Scaling& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
-  { return m_coeffs.isApprox(other.m_coeffs, prec); }
-
-};
-
-/** \addtogroup Geometry_Module */
-//@{
-typedef Scaling<float, 2> Scaling2f;
-typedef Scaling<double,2> Scaling2d;
-typedef Scaling<float, 3> Scaling3f;
-typedef Scaling<double,3> Scaling3d;
-//@}
-
-template<typename Scalar, int Dim>
-inline typename Scaling<Scalar,Dim>::TransformType
-Scaling<Scalar,Dim>::operator* (const TranslationType& t) const
-{
-  TransformType res;
-  res.matrix().setZero();
-  res.linear().diagonal() = coeffs();
-  res.translation() = m_coeffs.cwise() * t.vector();
-  res(Dim,Dim) = Scalar(1);
-  return res;
-}
-
-template<typename Scalar, int Dim>
-inline typename Scaling<Scalar,Dim>::TransformType
-Scaling<Scalar,Dim>::operator* (const TransformType& t) const
-{
-  TransformType res = t;
-  res.prescale(m_coeffs);
-  return res;
-}
-
-} // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Eigen2Support/Geometry/Transform.h b/nuparu/include/Eigen/src/Eigen2Support/Geometry/Transform.h
deleted file mode 100644
index fab60b25..00000000
--- a/nuparu/include/Eigen/src/Eigen2Support/Geometry/Transform.h
+++ /dev/null
@@ -1,786 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
-
-namespace Eigen { 
-
-// Note that we have to pass Dim and HDim because it is not allowed to use a template
-// parameter to define a template specialization. To be more precise, in the following
-// specializations, it is not allowed to use Dim+1 instead of HDim.
-template< typename Other,
-          int Dim,
-          int HDim,
-          int OtherRows=Other::RowsAtCompileTime,
-          int OtherCols=Other::ColsAtCompileTime>
-struct ei_transform_product_impl;
-
-/** \geometry_module \ingroup Geometry_Module
-  *
-  * \class Transform
-  *
-  * \brief Represents an homogeneous transformation in a N dimensional space
-  *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients
-  * \param _Dim the dimension of the space
-  *
-  * The homography is internally represented and stored as a (Dim+1)^2 matrix which
-  * is available through the matrix() method.
-  *
-  * Conversion methods from/to Qt's QMatrix and QTransform are available if the
-  * preprocessor token EIGEN_QT_SUPPORT is defined.
-  *
-  * \sa class Matrix, class Quaternion
-  */
-template<typename _Scalar, int _Dim>
-class Transform
-{
-public:
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Dim==Dynamic ? Dynamic : (_Dim+1)*(_Dim+1))
-  enum {
-    Dim = _Dim,     ///< space dimension in which the transformation holds
-    HDim = _Dim+1   ///< size of a respective homogeneous vector
-  };
-  /** the scalar type of the coefficients */
-  typedef _Scalar Scalar;
-  /** type of the matrix used to represent the transformation */
-  typedef Matrix<Scalar,HDim,HDim> MatrixType;
-  /** type of the matrix used to represent the linear part of the transformation */
-  typedef Matrix<Scalar,Dim,Dim> LinearMatrixType;
-  /** type of read/write reference to the linear part of the transformation */
-  typedef Block<MatrixType,Dim,Dim> LinearPart;
-  /** type of read/write reference to the linear part of the transformation */
-  typedef const Block<const MatrixType,Dim,Dim> ConstLinearPart;
-  /** type of a vector */
-  typedef Matrix<Scalar,Dim,1> VectorType;
-  /** type of a read/write reference to the translation part of the rotation */
-  typedef Block<MatrixType,Dim,1> TranslationPart;
-  /** type of a read/write reference to the translation part of the rotation */
-  typedef const Block<const MatrixType,Dim,1> ConstTranslationPart;
-  /** corresponding translation type */
-  typedef Translation<Scalar,Dim> TranslationType;
-  /** corresponding scaling transformation type */
-  typedef Scaling<Scalar,Dim> ScalingType;
-
-protected:
-
-  MatrixType m_matrix;
-
-public:
-
-  /** Default constructor without initialization of the coefficients. */
-  inline Transform() { }
-
-  inline Transform(const Transform& other)
-  {
-    m_matrix = other.m_matrix;
-  }
-
-  inline explicit Transform(const TranslationType& t) { *this = t; }
-  inline explicit Transform(const ScalingType& s) { *this = s; }
-  template<typename Derived>
-  inline explicit Transform(const RotationBase<Derived, Dim>& r) { *this = r; }
-
-  inline Transform& operator=(const Transform& other)
-  { m_matrix = other.m_matrix; return *this; }
-
-  template<typename OtherDerived, bool BigMatrix> // MSVC 2005 will commit suicide if BigMatrix has a default value
-  struct construct_from_matrix
-  {
-    static inline void run(Transform *transform, const MatrixBase<OtherDerived>& other)
-    {
-      transform->matrix() = other;
-    }
-  };
-
-  template<typename OtherDerived> struct construct_from_matrix<OtherDerived, true>
-  {
-    static inline void run(Transform *transform, const MatrixBase<OtherDerived>& other)
-    {
-      transform->linear() = other;
-      transform->translation().setZero();
-      transform->matrix()(Dim,Dim) = Scalar(1);
-      transform->matrix().template block<1,Dim>(Dim,0).setZero();
-    }
-  };
-
-  /** Constructs and initializes a transformation from a Dim^2 or a (Dim+1)^2 matrix. */
-  template<typename OtherDerived>
-  inline explicit Transform(const MatrixBase<OtherDerived>& other)
-  {
-    construct_from_matrix<OtherDerived, int(OtherDerived::RowsAtCompileTime) == Dim>::run(this, other);
-  }
-
-  /** Set \c *this from a (Dim+1)^2 matrix. */
-  template<typename OtherDerived>
-  inline Transform& operator=(const MatrixBase<OtherDerived>& other)
-  { m_matrix = other; return *this; }
-
-  #ifdef EIGEN_QT_SUPPORT
-  inline Transform(const QMatrix& other);
-  inline Transform& operator=(const QMatrix& other);
-  inline QMatrix toQMatrix(void) const;
-  inline Transform(const QTransform& other);
-  inline Transform& operator=(const QTransform& other);
-  inline QTransform toQTransform(void) const;
-  #endif
-
-  /** shortcut for m_matrix(row,col);
-    * \sa MatrixBase::operaror(int,int) const */
-  inline Scalar operator() (int row, int col) const { return m_matrix(row,col); }
-  /** shortcut for m_matrix(row,col);
-    * \sa MatrixBase::operaror(int,int) */
-  inline Scalar& operator() (int row, int col) { return m_matrix(row,col); }
-
-  /** \returns a read-only expression of the transformation matrix */
-  inline const MatrixType& matrix() const { return m_matrix; }
-  /** \returns a writable expression of the transformation matrix */
-  inline MatrixType& matrix() { return m_matrix; }
-
-  /** \returns a read-only expression of the linear (linear) part of the transformation */
-  inline ConstLinearPart linear() const { return m_matrix.template block<Dim,Dim>(0,0); }
-  /** \returns a writable expression of the linear (linear) part of the transformation */
-  inline LinearPart linear() { return m_matrix.template block<Dim,Dim>(0,0); }
-
-  /** \returns a read-only expression of the translation vector of the transformation */
-  inline ConstTranslationPart translation() const { return m_matrix.template block<Dim,1>(0,Dim); }
-  /** \returns a writable expression of the translation vector of the transformation */
-  inline TranslationPart translation() { return m_matrix.template block<Dim,1>(0,Dim); }
-
-  /** \returns an expression of the product between the transform \c *this and a matrix expression \a other
-  *
-  * The right hand side \a other might be either:
-  * \li a vector of size Dim,
-  * \li an homogeneous vector of size Dim+1,
-  * \li a transformation matrix of size Dim+1 x Dim+1.
-  */
-  // note: this function is defined here because some compilers cannot find the respective declaration
-  template<typename OtherDerived>
-  inline const typename ei_transform_product_impl<OtherDerived,_Dim,_Dim+1>::ResultType
-  operator * (const MatrixBase<OtherDerived> &other) const
-  { return ei_transform_product_impl<OtherDerived,Dim,HDim>::run(*this,other.derived()); }
-
-  /** \returns the product expression of a transformation matrix \a a times a transform \a b
-    * The transformation matrix \a a must have a Dim+1 x Dim+1 sizes. */
-  template<typename OtherDerived>
-  friend inline const typename ProductReturnType<OtherDerived,MatrixType>::Type
-  operator * (const MatrixBase<OtherDerived> &a, const Transform &b)
-  { return a.derived() * b.matrix(); }
-
-  /** Contatenates two transformations */
-  inline const Transform
-  operator * (const Transform& other) const
-  { return Transform(m_matrix * other.matrix()); }
-
-  /** \sa MatrixBase::setIdentity() */
-  void setIdentity() { m_matrix.setIdentity(); }
-  static const typename MatrixType::IdentityReturnType Identity()
-  {
-    return MatrixType::Identity();
-  }
-
-  template<typename OtherDerived>
-  inline Transform& scale(const MatrixBase<OtherDerived> &other);
-
-  template<typename OtherDerived>
-  inline Transform& prescale(const MatrixBase<OtherDerived> &other);
-
-  inline Transform& scale(Scalar s);
-  inline Transform& prescale(Scalar s);
-
-  template<typename OtherDerived>
-  inline Transform& translate(const MatrixBase<OtherDerived> &other);
-
-  template<typename OtherDerived>
-  inline Transform& pretranslate(const MatrixBase<OtherDerived> &other);
-
-  template<typename RotationType>
-  inline Transform& rotate(const RotationType& rotation);
-
-  template<typename RotationType>
-  inline Transform& prerotate(const RotationType& rotation);
-
-  Transform& shear(Scalar sx, Scalar sy);
-  Transform& preshear(Scalar sx, Scalar sy);
-
-  inline Transform& operator=(const TranslationType& t);
-  inline Transform& operator*=(const TranslationType& t) { return translate(t.vector()); }
-  inline Transform operator*(const TranslationType& t) const;
-
-  inline Transform& operator=(const ScalingType& t);
-  inline Transform& operator*=(const ScalingType& s) { return scale(s.coeffs()); }
-  inline Transform operator*(const ScalingType& s) const;
-  friend inline Transform operator*(const LinearMatrixType& mat, const Transform& t)
-  {
-    Transform res = t;
-    res.matrix().row(Dim) = t.matrix().row(Dim);
-    res.matrix().template block<Dim,HDim>(0,0) = (mat * t.matrix().template block<Dim,HDim>(0,0)).lazy();
-    return res;
-  }
-
-  template<typename Derived>
-  inline Transform& operator=(const RotationBase<Derived,Dim>& r);
-  template<typename Derived>
-  inline Transform& operator*=(const RotationBase<Derived,Dim>& r) { return rotate(r.toRotationMatrix()); }
-  template<typename Derived>
-  inline Transform operator*(const RotationBase<Derived,Dim>& r) const;
-
-  LinearMatrixType rotation() const;
-  template<typename RotationMatrixType, typename ScalingMatrixType>
-  void computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const;
-  template<typename ScalingMatrixType, typename RotationMatrixType>
-  void computeScalingRotation(ScalingMatrixType *scaling, RotationMatrixType *rotation) const;
-
-  template<typename PositionDerived, typename OrientationType, typename ScaleDerived>
-  Transform& fromPositionOrientationScale(const MatrixBase<PositionDerived> &position,
-    const OrientationType& orientation, const MatrixBase<ScaleDerived> &scale);
-
-  inline const MatrixType inverse(TransformTraits traits = Affine) const;
-
-  /** \returns a const pointer to the column major internal matrix */
-  const Scalar* data() const { return m_matrix.data(); }
-  /** \returns a non-const pointer to the column major internal matrix */
-  Scalar* data() { return m_matrix.data(); }
-
-  /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<Transform,Transform<NewScalarType,Dim> >::type cast() const
-  { return typename internal::cast_return_type<Transform,Transform<NewScalarType,Dim> >::type(*this); }
-
-  /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit Transform(const Transform<OtherScalarType,Dim>& other)
-  { m_matrix = other.matrix().template cast<Scalar>(); }
-
-  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const Transform& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
-  { return m_matrix.isApprox(other.m_matrix, prec); }
-
-  #ifdef EIGEN_TRANSFORM_PLUGIN
-  #include EIGEN_TRANSFORM_PLUGIN
-  #endif
-
-protected:
-
-};
-
-/** \ingroup Geometry_Module */
-typedef Transform<float,2> Transform2f;
-/** \ingroup Geometry_Module */
-typedef Transform<float,3> Transform3f;
-/** \ingroup Geometry_Module */
-typedef Transform<double,2> Transform2d;
-/** \ingroup Geometry_Module */
-typedef Transform<double,3> Transform3d;
-
-/**************************
-*** Optional QT support ***
-**************************/
-
-#ifdef EIGEN_QT_SUPPORT
-/** Initialises \c *this from a QMatrix assuming the dimension is 2.
-  *
-  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
-  */
-template<typename Scalar, int Dim>
-Transform<Scalar,Dim>::Transform(const QMatrix& other)
-{
-  *this = other;
-}
-
-/** Set \c *this from a QMatrix assuming the dimension is 2.
-  *
-  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
-  */
-template<typename Scalar, int Dim>
-Transform<Scalar,Dim>& Transform<Scalar,Dim>::operator=(const QMatrix& other)
-{
-  EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  m_matrix << other.m11(), other.m21(), other.dx(),
-              other.m12(), other.m22(), other.dy(),
-              0, 0, 1;
-   return *this;
-}
-
-/** \returns a QMatrix from \c *this assuming the dimension is 2.
-  *
-  * \warning this convertion might loss data if \c *this is not affine
-  *
-  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
-  */
-template<typename Scalar, int Dim>
-QMatrix Transform<Scalar,Dim>::toQMatrix(void) const
-{
-  EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  return QMatrix(m_matrix.coeff(0,0), m_matrix.coeff(1,0),
-                 m_matrix.coeff(0,1), m_matrix.coeff(1,1),
-                 m_matrix.coeff(0,2), m_matrix.coeff(1,2));
-}
-
-/** Initialises \c *this from a QTransform assuming the dimension is 2.
-  *
-  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
-  */
-template<typename Scalar, int Dim>
-Transform<Scalar,Dim>::Transform(const QTransform& other)
-{
-  *this = other;
-}
-
-/** Set \c *this from a QTransform assuming the dimension is 2.
-  *
-  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
-  */
-template<typename Scalar, int Dim>
-Transform<Scalar,Dim>& Transform<Scalar,Dim>::operator=(const QTransform& other)
-{
-  EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  m_matrix << other.m11(), other.m21(), other.dx(),
-              other.m12(), other.m22(), other.dy(),
-              other.m13(), other.m23(), other.m33();
-   return *this;
-}
-
-/** \returns a QTransform from \c *this assuming the dimension is 2.
-  *
-  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
-  */
-template<typename Scalar, int Dim>
-QTransform Transform<Scalar,Dim>::toQTransform(void) const
-{
-  EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  return QTransform(m_matrix.coeff(0,0), m_matrix.coeff(1,0), m_matrix.coeff(2,0),
-                    m_matrix.coeff(0,1), m_matrix.coeff(1,1), m_matrix.coeff(2,1),
-                    m_matrix.coeff(0,2), m_matrix.coeff(1,2), m_matrix.coeff(2,2));
-}
-#endif
-
-/*********************
-*** Procedural API ***
-*********************/
-
-/** Applies on the right the non uniform scale transformation represented
-  * by the vector \a other to \c *this and returns a reference to \c *this.
-  * \sa prescale()
-  */
-template<typename Scalar, int Dim>
-template<typename OtherDerived>
-Transform<Scalar,Dim>&
-Transform<Scalar,Dim>::scale(const MatrixBase<OtherDerived> &other)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
-  linear() = (linear() * other.asDiagonal()).lazy();
-  return *this;
-}
-
-/** Applies on the right a uniform scale of a factor \a c to \c *this
-  * and returns a reference to \c *this.
-  * \sa prescale(Scalar)
-  */
-template<typename Scalar, int Dim>
-inline Transform<Scalar,Dim>& Transform<Scalar,Dim>::scale(Scalar s)
-{
-  linear() *= s;
-  return *this;
-}
-
-/** Applies on the left the non uniform scale transformation represented
-  * by the vector \a other to \c *this and returns a reference to \c *this.
-  * \sa scale()
-  */
-template<typename Scalar, int Dim>
-template<typename OtherDerived>
-Transform<Scalar,Dim>&
-Transform<Scalar,Dim>::prescale(const MatrixBase<OtherDerived> &other)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
-  m_matrix.template block<Dim,HDim>(0,0) = (other.asDiagonal() * m_matrix.template block<Dim,HDim>(0,0)).lazy();
-  return *this;
-}
-
-/** Applies on the left a uniform scale of a factor \a c to \c *this
-  * and returns a reference to \c *this.
-  * \sa scale(Scalar)
-  */
-template<typename Scalar, int Dim>
-inline Transform<Scalar,Dim>& Transform<Scalar,Dim>::prescale(Scalar s)
-{
-  m_matrix.template corner<Dim,HDim>(TopLeft) *= s;
-  return *this;
-}
-
-/** Applies on the right the translation matrix represented by the vector \a other
-  * to \c *this and returns a reference to \c *this.
-  * \sa pretranslate()
-  */
-template<typename Scalar, int Dim>
-template<typename OtherDerived>
-Transform<Scalar,Dim>&
-Transform<Scalar,Dim>::translate(const MatrixBase<OtherDerived> &other)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
-  translation() += linear() * other;
-  return *this;
-}
-
-/** Applies on the left the translation matrix represented by the vector \a other
-  * to \c *this and returns a reference to \c *this.
-  * \sa translate()
-  */
-template<typename Scalar, int Dim>
-template<typename OtherDerived>
-Transform<Scalar,Dim>&
-Transform<Scalar,Dim>::pretranslate(const MatrixBase<OtherDerived> &other)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
-  translation() += other;
-  return *this;
-}
-
-/** Applies on the right the rotation represented by the rotation \a rotation
-  * to \c *this and returns a reference to \c *this.
-  *
-  * The template parameter \a RotationType is the type of the rotation which
-  * must be known by ei_toRotationMatrix<>.
-  *
-  * Natively supported types includes:
-  *   - any scalar (2D),
-  *   - a Dim x Dim matrix expression,
-  *   - a Quaternion (3D),
-  *   - a AngleAxis (3D)
-  *
-  * This mechanism is easily extendable to support user types such as Euler angles,
-  * or a pair of Quaternion for 4D rotations.
-  *
-  * \sa rotate(Scalar), class Quaternion, class AngleAxis, prerotate(RotationType)
-  */
-template<typename Scalar, int Dim>
-template<typename RotationType>
-Transform<Scalar,Dim>&
-Transform<Scalar,Dim>::rotate(const RotationType& rotation)
-{
-  linear() *= ei_toRotationMatrix<Scalar,Dim>(rotation);
-  return *this;
-}
-
-/** Applies on the left the rotation represented by the rotation \a rotation
-  * to \c *this and returns a reference to \c *this.
-  *
-  * See rotate() for further details.
-  *
-  * \sa rotate()
-  */
-template<typename Scalar, int Dim>
-template<typename RotationType>
-Transform<Scalar,Dim>&
-Transform<Scalar,Dim>::prerotate(const RotationType& rotation)
-{
-  m_matrix.template block<Dim,HDim>(0,0) = ei_toRotationMatrix<Scalar,Dim>(rotation)
-                                         * m_matrix.template block<Dim,HDim>(0,0);
-  return *this;
-}
-
-/** Applies on the right the shear transformation represented
-  * by the vector \a other to \c *this and returns a reference to \c *this.
-  * \warning 2D only.
-  * \sa preshear()
-  */
-template<typename Scalar, int Dim>
-Transform<Scalar,Dim>&
-Transform<Scalar,Dim>::shear(Scalar sx, Scalar sy)
-{
-  EIGEN_STATIC_ASSERT(int(Dim)==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  VectorType tmp = linear().col(0)*sy + linear().col(1);
-  linear() << linear().col(0) + linear().col(1)*sx, tmp;
-  return *this;
-}
-
-/** Applies on the left the shear transformation represented
-  * by the vector \a other to \c *this and returns a reference to \c *this.
-  * \warning 2D only.
-  * \sa shear()
-  */
-template<typename Scalar, int Dim>
-Transform<Scalar,Dim>&
-Transform<Scalar,Dim>::preshear(Scalar sx, Scalar sy)
-{
-  EIGEN_STATIC_ASSERT(int(Dim)==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  m_matrix.template block<Dim,HDim>(0,0) = LinearMatrixType(1, sx, sy, 1) * m_matrix.template block<Dim,HDim>(0,0);
-  return *this;
-}
-
-/******************************************************
-*** Scaling, Translation and Rotation compatibility ***
-******************************************************/
-
-template<typename Scalar, int Dim>
-inline Transform<Scalar,Dim>& Transform<Scalar,Dim>::operator=(const TranslationType& t)
-{
-  linear().setIdentity();
-  translation() = t.vector();
-  m_matrix.template block<1,Dim>(Dim,0).setZero();
-  m_matrix(Dim,Dim) = Scalar(1);
-  return *this;
-}
-
-template<typename Scalar, int Dim>
-inline Transform<Scalar,Dim> Transform<Scalar,Dim>::operator*(const TranslationType& t) const
-{
-  Transform res = *this;
-  res.translate(t.vector());
-  return res;
-}
-
-template<typename Scalar, int Dim>
-inline Transform<Scalar,Dim>& Transform<Scalar,Dim>::operator=(const ScalingType& s)
-{
-  m_matrix.setZero();
-  linear().diagonal() = s.coeffs();
-  m_matrix.coeffRef(Dim,Dim) = Scalar(1);
-  return *this;
-}
-
-template<typename Scalar, int Dim>
-inline Transform<Scalar,Dim> Transform<Scalar,Dim>::operator*(const ScalingType& s) const
-{
-  Transform res = *this;
-  res.scale(s.coeffs());
-  return res;
-}
-
-template<typename Scalar, int Dim>
-template<typename Derived>
-inline Transform<Scalar,Dim>& Transform<Scalar,Dim>::operator=(const RotationBase<Derived,Dim>& r)
-{
-  linear() = ei_toRotationMatrix<Scalar,Dim>(r);
-  translation().setZero();
-  m_matrix.template block<1,Dim>(Dim,0).setZero();
-  m_matrix.coeffRef(Dim,Dim) = Scalar(1);
-  return *this;
-}
-
-template<typename Scalar, int Dim>
-template<typename Derived>
-inline Transform<Scalar,Dim> Transform<Scalar,Dim>::operator*(const RotationBase<Derived,Dim>& r) const
-{
-  Transform res = *this;
-  res.rotate(r.derived());
-  return res;
-}
-
-/************************
-*** Special functions ***
-************************/
-
-/** \returns the rotation part of the transformation
-  * \nonstableyet
-  *
-  * \svd_module
-  *
-  * \sa computeRotationScaling(), computeScalingRotation(), class SVD
-  */
-template<typename Scalar, int Dim>
-typename Transform<Scalar,Dim>::LinearMatrixType
-Transform<Scalar,Dim>::rotation() const
-{
-  LinearMatrixType result;
-  computeRotationScaling(&result, (LinearMatrixType*)0);
-  return result;
-}
-
-
-/** decomposes the linear part of the transformation as a product rotation x scaling, the scaling being
-  * not necessarily positive.
-  *
-  * If either pointer is zero, the corresponding computation is skipped.
-  *
-  * \nonstableyet
-  *
-  * \svd_module
-  *
-  * \sa computeScalingRotation(), rotation(), class SVD
-  */
-template<typename Scalar, int Dim>
-template<typename RotationMatrixType, typename ScalingMatrixType>
-void Transform<Scalar,Dim>::computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const
-{
-  JacobiSVD<LinearMatrixType> svd(linear(), ComputeFullU|ComputeFullV);
-  Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant(); // so x has absolute value 1
-  Matrix<Scalar, Dim, 1> sv(svd.singularValues());
-  sv.coeffRef(0) *= x;
-  if(scaling)
-  {
-    scaling->noalias() = svd.matrixV() * sv.asDiagonal() * svd.matrixV().adjoint();
-  }
-  if(rotation)
-  {
-    LinearMatrixType m(svd.matrixU());
-    m.col(0) /= x;
-    rotation->noalias() = m * svd.matrixV().adjoint();
-  }
-}
-
-/** decomposes the linear part of the transformation as a product rotation x scaling, the scaling being
-  * not necessarily positive.
-  *
-  * If either pointer is zero, the corresponding computation is skipped.
-  *
-  * \nonstableyet
-  *
-  * \svd_module
-  *
-  * \sa computeRotationScaling(), rotation(), class SVD
-  */
-template<typename Scalar, int Dim>
-template<typename ScalingMatrixType, typename RotationMatrixType>
-void Transform<Scalar,Dim>::computeScalingRotation(ScalingMatrixType *scaling, RotationMatrixType *rotation) const
-{
-  JacobiSVD<LinearMatrixType> svd(linear(), ComputeFullU|ComputeFullV);
-  Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant(); // so x has absolute value 1
-  Matrix<Scalar, Dim, 1> sv(svd.singularValues());
-  sv.coeffRef(0) *= x;
-  if(scaling)
-  {
-    scaling->noalias() = svd.matrixU() * sv.asDiagonal() * svd.matrixU().adjoint();
-  }
-  if(rotation)
-  {
-    LinearMatrixType m(svd.matrixU());
-    m.col(0) /= x;
-    rotation->noalias() = m * svd.matrixV().adjoint();
-  }
-}
-
-/** Convenient method to set \c *this from a position, orientation and scale
-  * of a 3D object.
-  */
-template<typename Scalar, int Dim>
-template<typename PositionDerived, typename OrientationType, typename ScaleDerived>
-Transform<Scalar,Dim>&
-Transform<Scalar,Dim>::fromPositionOrientationScale(const MatrixBase<PositionDerived> &position,
-  const OrientationType& orientation, const MatrixBase<ScaleDerived> &scale)
-{
-  linear() = ei_toRotationMatrix<Scalar,Dim>(orientation);
-  linear() *= scale.asDiagonal();
-  translation() = position;
-  m_matrix.template block<1,Dim>(Dim,0).setZero();
-  m_matrix(Dim,Dim) = Scalar(1);
-  return *this;
-}
-
-/** \nonstableyet
-  *
-  * \returns the inverse transformation matrix according to some given knowledge
-  * on \c *this.
-  *
-  * \param traits allows to optimize the inversion process when the transformion
-  * is known to be not a general transformation. The possible values are:
-  *  - Projective if the transformation is not necessarily affine, i.e., if the
-  *    last row is not guaranteed to be [0 ... 0 1]
-  *  - Affine is the default, the last row is assumed to be [0 ... 0 1]
-  *  - Isometry if the transformation is only a concatenations of translations
-  *    and rotations.
-  *
-  * \warning unless \a traits is always set to NoShear or NoScaling, this function
-  * requires the generic inverse method of MatrixBase defined in the LU module. If
-  * you forget to include this module, then you will get hard to debug linking errors.
-  *
-  * \sa MatrixBase::inverse()
-  */
-template<typename Scalar, int Dim>
-inline const typename Transform<Scalar,Dim>::MatrixType
-Transform<Scalar,Dim>::inverse(TransformTraits traits) const
-{
-  if (traits == Projective)
-  {
-    return m_matrix.inverse();
-  }
-  else
-  {
-    MatrixType res;
-    if (traits == Affine)
-    {
-      res.template corner<Dim,Dim>(TopLeft) = linear().inverse();
-    }
-    else if (traits == Isometry)
-    {
-      res.template corner<Dim,Dim>(TopLeft) = linear().transpose();
-    }
-    else
-    {
-      ei_assert("invalid traits value in Transform::inverse()");
-    }
-    // translation and remaining parts
-    res.template corner<Dim,1>(TopRight) = - res.template corner<Dim,Dim>(TopLeft) * translation();
-    res.template corner<1,Dim>(BottomLeft).setZero();
-    res.coeffRef(Dim,Dim) = Scalar(1);
-    return res;
-  }
-}
-
-/*****************************************************
-*** Specializations of operator* with a MatrixBase ***
-*****************************************************/
-
-template<typename Other, int Dim, int HDim>
-struct ei_transform_product_impl<Other,Dim,HDim, HDim,HDim>
-{
-  typedef Transform<typename Other::Scalar,Dim> TransformType;
-  typedef typename TransformType::MatrixType MatrixType;
-  typedef typename ProductReturnType<MatrixType,Other>::Type ResultType;
-  static ResultType run(const TransformType& tr, const Other& other)
-  { return tr.matrix() * other; }
-};
-
-template<typename Other, int Dim, int HDim>
-struct ei_transform_product_impl<Other,Dim,HDim, Dim,Dim>
-{
-  typedef Transform<typename Other::Scalar,Dim> TransformType;
-  typedef typename TransformType::MatrixType MatrixType;
-  typedef TransformType ResultType;
-  static ResultType run(const TransformType& tr, const Other& other)
-  {
-    TransformType res;
-    res.translation() = tr.translation();
-    res.matrix().row(Dim) = tr.matrix().row(Dim);
-    res.linear() = (tr.linear() * other).lazy();
-    return res;
-  }
-};
-
-template<typename Other, int Dim, int HDim>
-struct ei_transform_product_impl<Other,Dim,HDim, HDim,1>
-{
-  typedef Transform<typename Other::Scalar,Dim> TransformType;
-  typedef typename TransformType::MatrixType MatrixType;
-  typedef typename ProductReturnType<MatrixType,Other>::Type ResultType;
-  static ResultType run(const TransformType& tr, const Other& other)
-  { return tr.matrix() * other; }
-};
-
-template<typename Other, int Dim, int HDim>
-struct ei_transform_product_impl<Other,Dim,HDim, Dim,1>
-{
-  typedef typename Other::Scalar Scalar;
-  typedef Transform<Scalar,Dim> TransformType;
-  typedef Matrix<Scalar,Dim,1> ResultType;
-  static ResultType run(const TransformType& tr, const Other& other)
-  { return ((tr.linear() * other) + tr.translation())
-          * (Scalar(1) / ( (tr.matrix().template block<1,Dim>(Dim,0) * other).coeff(0) + tr.matrix().coeff(Dim,Dim))); }
-};
-
-} // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Eigen2Support/Geometry/Translation.h b/nuparu/include/Eigen/src/Eigen2Support/Geometry/Translation.h
deleted file mode 100644
index 2b9859f6..00000000
--- a/nuparu/include/Eigen/src/Eigen2Support/Geometry/Translation.h
+++ /dev/null
@@ -1,184 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
-
-namespace Eigen { 
-
-/** \geometry_module \ingroup Geometry_Module
-  *
-  * \class Translation
-  *
-  * \brief Represents a translation transformation
-  *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients.
-  * \param _Dim the  dimension of the space, can be a compile time value or Dynamic
-  *
-  * \note This class is not aimed to be used to store a translation transformation,
-  * but rather to make easier the constructions and updates of Transform objects.
-  *
-  * \sa class Scaling, class Transform
-  */
-template<typename _Scalar, int _Dim>
-class Translation
-{
-public:
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Dim)
-  /** dimension of the space */
-  enum { Dim = _Dim };
-  /** the scalar type of the coefficients */
-  typedef _Scalar Scalar;
-  /** corresponding vector type */
-  typedef Matrix<Scalar,Dim,1> VectorType;
-  /** corresponding linear transformation matrix type */
-  typedef Matrix<Scalar,Dim,Dim> LinearMatrixType;
-  /** corresponding scaling transformation type */
-  typedef Scaling<Scalar,Dim> ScalingType;
-  /** corresponding affine transformation type */
-  typedef Transform<Scalar,Dim> TransformType;
-
-protected:
-
-  VectorType m_coeffs;
-
-public:
-
-  /** Default constructor without initialization. */
-  Translation() {}
-  /**  */
-  inline Translation(const Scalar& sx, const Scalar& sy)
-  {
-    ei_assert(Dim==2);
-    m_coeffs.x() = sx;
-    m_coeffs.y() = sy;
-  }
-  /**  */
-  inline Translation(const Scalar& sx, const Scalar& sy, const Scalar& sz)
-  {
-    ei_assert(Dim==3);
-    m_coeffs.x() = sx;
-    m_coeffs.y() = sy;
-    m_coeffs.z() = sz;
-  }
-  /** Constructs and initialize the scaling transformation from a vector of scaling coefficients */
-  explicit inline Translation(const VectorType& vector) : m_coeffs(vector) {}
-
-  const VectorType& vector() const { return m_coeffs; }
-  VectorType& vector() { return m_coeffs; }
-
-  /** Concatenates two translation */
-  inline Translation operator* (const Translation& other) const
-  { return Translation(m_coeffs + other.m_coeffs); }
-
-  /** Concatenates a translation and a scaling */
-  inline TransformType operator* (const ScalingType& other) const;
-
-  /** Concatenates a translation and a linear transformation */
-  inline TransformType operator* (const LinearMatrixType& linear) const;
-
-  template<typename Derived>
-  inline TransformType operator*(const RotationBase<Derived,Dim>& r) const
-  { return *this * r.toRotationMatrix(); }
-
-  /** Concatenates a linear transformation and a translation */
-  // its a nightmare to define a templated friend function outside its declaration
-  friend inline TransformType operator* (const LinearMatrixType& linear, const Translation& t)
-  {
-    TransformType res;
-    res.matrix().setZero();
-    res.linear() = linear;
-    res.translation() = linear * t.m_coeffs;
-    res.matrix().row(Dim).setZero();
-    res(Dim,Dim) = Scalar(1);
-    return res;
-  }
-
-  /** Concatenates a translation and an affine transformation */
-  inline TransformType operator* (const TransformType& t) const;
-
-  /** Applies translation to vector */
-  inline VectorType operator* (const VectorType& other) const
-  { return m_coeffs + other; }
-
-  /** \returns the inverse translation (opposite) */
-  Translation inverse() const { return Translation(-m_coeffs); }
-
-  Translation& operator=(const Translation& other)
-  {
-    m_coeffs = other.m_coeffs;
-    return *this;
-  }
-
-  /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<Translation,Translation<NewScalarType,Dim> >::type cast() const
-  { return typename internal::cast_return_type<Translation,Translation<NewScalarType,Dim> >::type(*this); }
-
-  /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit Translation(const Translation<OtherScalarType,Dim>& other)
-  { m_coeffs = other.vector().template cast<Scalar>(); }
-
-  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const Translation& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
-  { return m_coeffs.isApprox(other.m_coeffs, prec); }
-
-};
-
-/** \addtogroup Geometry_Module */
-//@{
-typedef Translation<float, 2> Translation2f;
-typedef Translation<double,2> Translation2d;
-typedef Translation<float, 3> Translation3f;
-typedef Translation<double,3> Translation3d;
-//@}
-
-
-template<typename Scalar, int Dim>
-inline typename Translation<Scalar,Dim>::TransformType
-Translation<Scalar,Dim>::operator* (const ScalingType& other) const
-{
-  TransformType res;
-  res.matrix().setZero();
-  res.linear().diagonal() = other.coeffs();
-  res.translation() = m_coeffs;
-  res(Dim,Dim) = Scalar(1);
-  return res;
-}
-
-template<typename Scalar, int Dim>
-inline typename Translation<Scalar,Dim>::TransformType
-Translation<Scalar,Dim>::operator* (const LinearMatrixType& linear) const
-{
-  TransformType res;
-  res.matrix().setZero();
-  res.linear() = linear;
-  res.translation() = m_coeffs;
-  res.matrix().row(Dim).setZero();
-  res(Dim,Dim) = Scalar(1);
-  return res;
-}
-
-template<typename Scalar, int Dim>
-inline typename Translation<Scalar,Dim>::TransformType
-Translation<Scalar,Dim>::operator* (const TransformType& t) const
-{
-  TransformType res = t;
-  res.pretranslate(m_coeffs);
-  return res;
-}
-
-} // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Eigen2Support/LU.h b/nuparu/include/Eigen/src/Eigen2Support/LU.h
deleted file mode 100644
index 49f19ad7..00000000
--- a/nuparu/include/Eigen/src/Eigen2Support/LU.h
+++ /dev/null
@@ -1,120 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN2_LU_H
-#define EIGEN2_LU_H
-
-namespace Eigen { 
-
-template<typename MatrixType>
-class LU : public FullPivLU<MatrixType>
-{
-  public:
-
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef Matrix<int, 1, MatrixType::ColsAtCompileTime, MatrixType::Options, 1, MatrixType::MaxColsAtCompileTime> IntRowVectorType;
-    typedef Matrix<int, MatrixType::RowsAtCompileTime, 1, MatrixType::Options, MatrixType::MaxRowsAtCompileTime, 1> IntColVectorType;
-    typedef Matrix<Scalar, 1, MatrixType::ColsAtCompileTime, MatrixType::Options, 1, MatrixType::MaxColsAtCompileTime> RowVectorType;
-    typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1, MatrixType::Options, MatrixType::MaxRowsAtCompileTime, 1> ColVectorType;
-
-    typedef Matrix<typename MatrixType::Scalar,
-                  MatrixType::ColsAtCompileTime, // the number of rows in the "kernel matrix" is the number of cols of the original matrix
-                                                 // so that the product "matrix * kernel = zero" makes sense
-                  Dynamic,                       // we don't know at compile-time the dimension of the kernel
-                  MatrixType::Options,
-                  MatrixType::MaxColsAtCompileTime, // see explanation for 2nd template parameter
-                  MatrixType::MaxColsAtCompileTime // the kernel is a subspace of the domain space, whose dimension is the number
-                                                   // of columns of the original matrix
-    > KernelResultType;
-
-    typedef Matrix<typename MatrixType::Scalar,
-                   MatrixType::RowsAtCompileTime, // the image is a subspace of the destination space, whose dimension is the number
-                                                  // of rows of the original matrix
-                   Dynamic,                       // we don't know at compile time the dimension of the image (the rank)
-                   MatrixType::Options,
-                   MatrixType::MaxRowsAtCompileTime, // the image matrix will consist of columns from the original matrix,
-                   MatrixType::MaxColsAtCompileTime  // so it has the same number of rows and at most as many columns.
-    > ImageResultType;
-
-    typedef FullPivLU<MatrixType> Base;
-
-    template<typename T>
-    explicit LU(const T& t) : Base(t), m_originalMatrix(t) {}
-
-    template<typename OtherDerived, typename ResultType>
-    bool solve(const MatrixBase<OtherDerived>& b, ResultType *result) const
-    {
-      *result = static_cast<const Base*>(this)->solve(b);
-      return true;
-    }
-
-    template<typename ResultType>
-    inline void computeInverse(ResultType *result) const
-    {
-      solve(MatrixType::Identity(this->rows(), this->cols()), result);
-    }
-    
-    template<typename KernelMatrixType>
-    void computeKernel(KernelMatrixType *result) const
-    {
-      *result = static_cast<const Base*>(this)->kernel();
-    }
-    
-    template<typename ImageMatrixType>
-    void computeImage(ImageMatrixType *result) const
-    {
-      *result = static_cast<const Base*>(this)->image(m_originalMatrix);
-    }
-    
-    const ImageResultType image() const
-    {
-      return static_cast<const Base*>(this)->image(m_originalMatrix);
-    }
-    
-    const MatrixType& m_originalMatrix;
-};
-
-#if EIGEN2_SUPPORT_STAGE < STAGE20_RESOLVE_API_CONFLICTS
-/** \lu_module
-  *
-  * Synonym of partialPivLu().
-  *
-  * \return the partial-pivoting LU decomposition of \c *this.
-  *
-  * \sa class PartialPivLU
-  */
-template<typename Derived>
-inline const LU<typename MatrixBase<Derived>::PlainObject>
-MatrixBase<Derived>::lu() const
-{
-  return LU<PlainObject>(eval());
-}
-#endif
-
-#ifdef EIGEN2_SUPPORT
-/** \lu_module
-  *
-  * Synonym of partialPivLu().
-  *
-  * \return the partial-pivoting LU decomposition of \c *this.
-  *
-  * \sa class PartialPivLU
-  */
-template<typename Derived>
-inline const LU<typename MatrixBase<Derived>::PlainObject>
-MatrixBase<Derived>::eigen2_lu() const
-{
-  return LU<PlainObject>(eval());
-}
-#endif
-
-} // end namespace Eigen
-
-#endif // EIGEN2_LU_H
diff --git a/nuparu/include/Eigen/src/Eigen2Support/Lazy.h b/nuparu/include/Eigen/src/Eigen2Support/Lazy.h
deleted file mode 100644
index 593fc78e..00000000
--- a/nuparu/include/Eigen/src/Eigen2Support/Lazy.h
+++ /dev/null
@@ -1,71 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_LAZY_H
-#define EIGEN_LAZY_H
-
-namespace Eigen { 
-
-/** \deprecated it is only used by lazy() which is deprecated
-  *
-  * \returns an expression of *this with added flags
-  *
-  * Example: \include MatrixBase_marked.cpp
-  * Output: \verbinclude MatrixBase_marked.out
-  *
-  * \sa class Flagged, extract(), part()
-  */
-template<typename Derived>
-template<unsigned int Added>
-inline const Flagged<Derived, Added, 0>
-MatrixBase<Derived>::marked() const
-{
-  return derived();
-}
-
-/** \deprecated use MatrixBase::noalias()
-  *
-  * \returns an expression of *this with the EvalBeforeAssigningBit flag removed.
-  *
-  * Example: \include MatrixBase_lazy.cpp
-  * Output: \verbinclude MatrixBase_lazy.out
-  *
-  * \sa class Flagged, marked()
-  */
-template<typename Derived>
-inline const Flagged<Derived, 0, EvalBeforeAssigningBit>
-MatrixBase<Derived>::lazy() const
-{
-  return derived();
-}
-
-
-/** \internal
-  * Overloaded to perform an efficient C += (A*B).lazy() */
-template<typename Derived>
-template<typename ProductDerived, typename Lhs, typename Rhs>
-Derived& MatrixBase<Derived>::operator+=(const Flagged<ProductBase<ProductDerived, Lhs,Rhs>, 0,
-                                                       EvalBeforeAssigningBit>& other)
-{
-  other._expression().derived().addTo(derived()); return derived();
-}
-
-/** \internal
-  * Overloaded to perform an efficient C -= (A*B).lazy() */
-template<typename Derived>
-template<typename ProductDerived, typename Lhs, typename Rhs>
-Derived& MatrixBase<Derived>::operator-=(const Flagged<ProductBase<ProductDerived, Lhs,Rhs>, 0,
-                                                       EvalBeforeAssigningBit>& other)
-{
-  other._expression().derived().subTo(derived()); return derived();
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_LAZY_H
diff --git a/nuparu/include/Eigen/src/Eigen2Support/LeastSquares.h b/nuparu/include/Eigen/src/Eigen2Support/LeastSquares.h
deleted file mode 100644
index 0e6fdb48..00000000
--- a/nuparu/include/Eigen/src/Eigen2Support/LeastSquares.h
+++ /dev/null
@@ -1,170 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2006-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN2_LEASTSQUARES_H
-#define EIGEN2_LEASTSQUARES_H
-
-namespace Eigen { 
-
-/** \ingroup LeastSquares_Module
-  *
-  * \leastsquares_module
-  *
-  * For a set of points, this function tries to express
-  * one of the coords as a linear (affine) function of the other coords.
-  *
-  * This is best explained by an example. This function works in full
-  * generality, for points in a space of arbitrary dimension, and also over
-  * the complex numbers, but for this example we will work in dimension 3
-  * over the real numbers (doubles).
-  *
-  * So let us work with the following set of 5 points given by their
-  * \f$(x,y,z)\f$ coordinates:
-  * @code
-    Vector3d points[5];
-    points[0] = Vector3d( 3.02, 6.89, -4.32 );
-    points[1] = Vector3d( 2.01, 5.39, -3.79 );
-    points[2] = Vector3d( 2.41, 6.01, -4.01 );
-    points[3] = Vector3d( 2.09, 5.55, -3.86 );
-    points[4] = Vector3d( 2.58, 6.32, -4.10 );
-  * @endcode
-  * Suppose that we want to express the second coordinate (\f$y\f$) as a linear
-  * expression in \f$x\f$ and \f$z\f$, that is,
-  * \f[ y=ax+bz+c \f]
-  * for some constants \f$a,b,c\f$. Thus, we want to find the best possible
-  * constants \f$a,b,c\f$ so that the plane of equation \f$y=ax+bz+c\f$ fits
-  * best the five above points. To do that, call this function as follows:
-  * @code
-    Vector3d coeffs; // will store the coefficients a, b, c
-    linearRegression(
-      5,
-      &points,
-      &coeffs,
-      1 // the coord to express as a function of
-        // the other ones. 0 means x, 1 means y, 2 means z.
-    );
-  * @endcode
-  * Now the vector \a coeffs is approximately
-  * \f$( 0.495 ,  -1.927 ,  -2.906 )\f$.
-  * Thus, we get \f$a=0.495, b = -1.927, c = -2.906\f$. Let us check for
-  * instance how near points[0] is from the plane of equation \f$y=ax+bz+c\f$.
-  * Looking at the coords of points[0], we see that:
-  * \f[ax+bz+c = 0.495 * 3.02 + (-1.927) * (-4.32) + (-2.906) = 6.91.\f]
-  * On the other hand, we have \f$y=6.89\f$. We see that the values
-  * \f$6.91\f$ and \f$6.89\f$
-  * are near, so points[0] is very near the plane of equation \f$y=ax+bz+c\f$.
-  *
-  * Let's now describe precisely the parameters:
-  * @param numPoints the number of points
-  * @param points the array of pointers to the points on which to perform the linear regression
-  * @param result pointer to the vector in which to store the result.
-                  This vector must be of the same type and size as the
-                  data points. The meaning of its coords is as follows.
-                  For brevity, let \f$n=Size\f$,
-                  \f$r_i=result[i]\f$,
-                  and \f$f=funcOfOthers\f$. Denote by
-                  \f$x_0,\ldots,x_{n-1}\f$
-                  the n coordinates in the n-dimensional space.
-                  Then the resulting equation is:
-                  \f[ x_f = r_0 x_0 + \cdots + r_{f-1}x_{f-1}
-                   + r_{f+1}x_{f+1} + \cdots + r_{n-1}x_{n-1} + r_n. \f]
-  * @param funcOfOthers Determines which coord to express as a function of the
-                        others. Coords are numbered starting from 0, so that a
-                        value of 0 means \f$x\f$, 1 means \f$y\f$,
-                        2 means \f$z\f$, ...
-  *
-  * \sa fitHyperplane()
-  */
-template<typename VectorType>
-void linearRegression(int numPoints,
-                      VectorType **points,
-                      VectorType *result,
-                      int funcOfOthers )
-{
-  typedef typename VectorType::Scalar Scalar;
-  typedef Hyperplane<Scalar, VectorType::SizeAtCompileTime> HyperplaneType;
-  const int size = points[0]->size();
-  result->resize(size);
-  HyperplaneType h(size);
-  fitHyperplane(numPoints, points, &h);
-  for(int i = 0; i < funcOfOthers; i++)
-    result->coeffRef(i) = - h.coeffs()[i] / h.coeffs()[funcOfOthers];
-  for(int i = funcOfOthers; i < size; i++)
-    result->coeffRef(i) = - h.coeffs()[i+1] / h.coeffs()[funcOfOthers];
-}
-
-/** \ingroup LeastSquares_Module
-  *
-  * \leastsquares_module
-  *
-  * This function is quite similar to linearRegression(), so we refer to the
-  * documentation of this function and only list here the differences.
-  *
-  * The main difference from linearRegression() is that this function doesn't
-  * take a \a funcOfOthers argument. Instead, it finds a general equation
-  * of the form
-  * \f[ r_0 x_0 + \cdots + r_{n-1}x_{n-1} + r_n = 0, \f]
-  * where \f$n=Size\f$, \f$r_i=retCoefficients[i]\f$, and we denote by
-  * \f$x_0,\ldots,x_{n-1}\f$ the n coordinates in the n-dimensional space.
-  *
-  * Thus, the vector \a retCoefficients has size \f$n+1\f$, which is another
-  * difference from linearRegression().
-  *
-  * In practice, this function performs an hyper-plane fit in a total least square sense
-  * via the following steps:
-  *  1 - center the data to the mean
-  *  2 - compute the covariance matrix
-  *  3 - pick the eigenvector corresponding to the smallest eigenvalue of the covariance matrix
-  * The ratio of the smallest eigenvalue and the second one gives us a hint about the relevance
-  * of the solution. This value is optionally returned in \a soundness.
-  *
-  * \sa linearRegression()
-  */
-template<typename VectorType, typename HyperplaneType>
-void fitHyperplane(int numPoints,
-                   VectorType **points,
-                   HyperplaneType *result,
-                   typename NumTraits<typename VectorType::Scalar>::Real* soundness = 0)
-{
-  typedef typename VectorType::Scalar Scalar;
-  typedef Matrix<Scalar,VectorType::SizeAtCompileTime,VectorType::SizeAtCompileTime> CovMatrixType;
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorType)
-  ei_assert(numPoints >= 1);
-  int size = points[0]->size();
-  ei_assert(size+1 == result->coeffs().size());
-
-  // compute the mean of the data
-  VectorType mean = VectorType::Zero(size);
-  for(int i = 0; i < numPoints; ++i)
-    mean += *(points[i]);
-  mean /= numPoints;
-
-  // compute the covariance matrix
-  CovMatrixType covMat = CovMatrixType::Zero(size, size);
-  VectorType remean = VectorType::Zero(size);
-  for(int i = 0; i < numPoints; ++i)
-  {
-    VectorType diff = (*(points[i]) - mean).conjugate();
-    covMat += diff * diff.adjoint();
-  }
-
-  // now we just have to pick the eigen vector with smallest eigen value
-  SelfAdjointEigenSolver<CovMatrixType> eig(covMat);
-  result->normal() = eig.eigenvectors().col(0);
-  if (soundness)
-    *soundness = eig.eigenvalues().coeff(0)/eig.eigenvalues().coeff(1);
-
-  // let's compute the constant coefficient such that the
-  // plane pass trough the mean point:
-  result->offset() = - (result->normal().cwise()* mean).sum();
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN2_LEASTSQUARES_H
diff --git a/nuparu/include/Eigen/src/Eigen2Support/Macros.h b/nuparu/include/Eigen/src/Eigen2Support/Macros.h
deleted file mode 100644
index 351c32af..00000000
--- a/nuparu/include/Eigen/src/Eigen2Support/Macros.h
+++ /dev/null
@@ -1,20 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN2_MACROS_H
-#define EIGEN2_MACROS_H
-
-#define ei_assert eigen_assert
-#define ei_internal_assert eigen_internal_assert
-
-#define EIGEN_ALIGN_128 EIGEN_ALIGN16
-
-#define EIGEN_ARCH_WANTS_ALIGNMENT EIGEN_ALIGN_STATICALLY
-
-#endif // EIGEN2_MACROS_H
diff --git a/nuparu/include/Eigen/src/Eigen2Support/MathFunctions.h b/nuparu/include/Eigen/src/Eigen2Support/MathFunctions.h
deleted file mode 100644
index 3544af25..00000000
--- a/nuparu/include/Eigen/src/Eigen2Support/MathFunctions.h
+++ /dev/null
@@ -1,57 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN2_MATH_FUNCTIONS_H
-#define EIGEN2_MATH_FUNCTIONS_H
-
-namespace Eigen { 
-
-template<typename T> inline typename NumTraits<T>::Real ei_real(const T& x) { return numext::real(x); }
-template<typename T> inline typename NumTraits<T>::Real ei_imag(const T& x) { return numext::imag(x); }
-template<typename T> inline T ei_conj(const T& x) { return numext::conj(x); }
-template<typename T> inline typename NumTraits<T>::Real ei_abs (const T& x) { using std::abs; return abs(x); }
-template<typename T> inline typename NumTraits<T>::Real ei_abs2(const T& x) { return numext::abs2(x); }
-template<typename T> inline T ei_sqrt(const T& x) { using std::sqrt; return sqrt(x); }
-template<typename T> inline T ei_exp (const T& x) { using std::exp;  return exp(x); }
-template<typename T> inline T ei_log (const T& x) { using std::log;  return log(x); }
-template<typename T> inline T ei_sin (const T& x) { using std::sin;  return sin(x); }
-template<typename T> inline T ei_cos (const T& x) { using std::cos;  return cos(x); }
-template<typename T> inline T ei_atan2(const T& x,const T& y) { using std::atan2; return atan2(x,y); }
-template<typename T> inline T ei_pow (const T& x,const T& y) { return numext::pow(x,y); }
-template<typename T> inline T ei_random () { return internal::random<T>(); }
-template<typename T> inline T ei_random (const T& x, const T& y) { return internal::random(x, y); }
-
-template<typename T> inline T precision () { return NumTraits<T>::dummy_precision(); }
-template<typename T> inline T machine_epsilon () { return NumTraits<T>::epsilon(); }
-
-
-template<typename Scalar, typename OtherScalar>
-inline bool ei_isMuchSmallerThan(const Scalar& x, const OtherScalar& y,
-                                   typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
-{
-  return internal::isMuchSmallerThan(x, y, precision);
-}
-
-template<typename Scalar>
-inline bool ei_isApprox(const Scalar& x, const Scalar& y,
-                          typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
-{
-  return internal::isApprox(x, y, precision);
-}
-
-template<typename Scalar>
-inline bool ei_isApproxOrLessThan(const Scalar& x, const Scalar& y,
-                                    typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
-{
-  return internal::isApproxOrLessThan(x, y, precision);
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN2_MATH_FUNCTIONS_H
diff --git a/nuparu/include/Eigen/src/Eigen2Support/Memory.h b/nuparu/include/Eigen/src/Eigen2Support/Memory.h
deleted file mode 100644
index f86372b6..00000000
--- a/nuparu/include/Eigen/src/Eigen2Support/Memory.h
+++ /dev/null
@@ -1,45 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN2_MEMORY_H
-#define EIGEN2_MEMORY_H
-
-namespace Eigen { 
-
-inline void* ei_aligned_malloc(size_t size) { return internal::aligned_malloc(size); }
-inline void  ei_aligned_free(void *ptr) { internal::aligned_free(ptr); }
-inline void* ei_aligned_realloc(void *ptr, size_t new_size, size_t old_size) { return internal::aligned_realloc(ptr, new_size, old_size); }
-inline void* ei_handmade_aligned_malloc(size_t size) { return internal::handmade_aligned_malloc(size); }
-inline void  ei_handmade_aligned_free(void *ptr) { internal::handmade_aligned_free(ptr); }
-
-template<bool Align> inline void* ei_conditional_aligned_malloc(size_t size)
-{
-  return internal::conditional_aligned_malloc<Align>(size);
-}
-template<bool Align> inline void ei_conditional_aligned_free(void *ptr)
-{
-  internal::conditional_aligned_free<Align>(ptr);
-}
-template<bool Align> inline void* ei_conditional_aligned_realloc(void* ptr, size_t new_size, size_t old_size)
-{
-  return internal::conditional_aligned_realloc<Align>(ptr, new_size, old_size);
-}
-
-template<typename T> inline T* ei_aligned_new(size_t size)
-{
-  return internal::aligned_new<T>(size);
-}
-template<typename T> inline void ei_aligned_delete(T *ptr, size_t size)
-{
-  return internal::aligned_delete(ptr, size);
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN2_MACROS_H
diff --git a/nuparu/include/Eigen/src/Eigen2Support/Meta.h b/nuparu/include/Eigen/src/Eigen2Support/Meta.h
deleted file mode 100644
index fa37cfc9..00000000
--- a/nuparu/include/Eigen/src/Eigen2Support/Meta.h
+++ /dev/null
@@ -1,75 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN2_META_H
-#define EIGEN2_META_H
-
-namespace Eigen { 
-
-template<typename T>
-struct ei_traits : internal::traits<T>
-{};
-
-struct ei_meta_true {  enum { ret = 1 }; };
-struct ei_meta_false { enum { ret = 0 }; };
-
-template<bool Condition, typename Then, typename Else>
-struct ei_meta_if { typedef Then ret; };
-
-template<typename Then, typename Else>
-struct ei_meta_if <false, Then, Else> { typedef Else ret; };
-
-template<typename T, typename U> struct ei_is_same_type { enum { ret = 0 }; };
-template<typename T> struct ei_is_same_type<T,T> { enum { ret = 1 }; };
-
-template<typename T> struct ei_unref { typedef T type; };
-template<typename T> struct ei_unref<T&> { typedef T type; };
-
-template<typename T> struct ei_unpointer { typedef T type; };
-template<typename T> struct ei_unpointer<T*> { typedef T type; };
-template<typename T> struct ei_unpointer<T*const> { typedef T type; };
-
-template<typename T> struct ei_unconst { typedef T type; };
-template<typename T> struct ei_unconst<const T> { typedef T type; };
-template<typename T> struct ei_unconst<T const &> { typedef T & type; };
-template<typename T> struct ei_unconst<T const *> { typedef T * type; };
-
-template<typename T> struct ei_cleantype { typedef T type; };
-template<typename T> struct ei_cleantype<const T>   { typedef typename ei_cleantype<T>::type type; };
-template<typename T> struct ei_cleantype<const T&>  { typedef typename ei_cleantype<T>::type type; };
-template<typename T> struct ei_cleantype<T&>        { typedef typename ei_cleantype<T>::type type; };
-template<typename T> struct ei_cleantype<const T*>  { typedef typename ei_cleantype<T>::type type; };
-template<typename T> struct ei_cleantype<T*>        { typedef typename ei_cleantype<T>::type type; };
-
-/** \internal In short, it computes int(sqrt(\a Y)) with \a Y an integer.
-  * Usage example: \code ei_meta_sqrt<1023>::ret \endcode
-  */
-template<int Y,
-         int InfX = 0,
-         int SupX = ((Y==1) ? 1 : Y/2),
-         bool Done = ((SupX-InfX)<=1 ? true : ((SupX*SupX <= Y) && ((SupX+1)*(SupX+1) > Y))) >
-                                // use ?: instead of || just to shut up a stupid gcc 4.3 warning
-class ei_meta_sqrt
-{
-    enum {
-      MidX = (InfX+SupX)/2,
-      TakeInf = MidX*MidX > Y ? 1 : 0,
-      NewInf = int(TakeInf) ? InfX : int(MidX),
-      NewSup = int(TakeInf) ? int(MidX) : SupX
-    };
-  public:
-    enum { ret = ei_meta_sqrt<Y,NewInf,NewSup>::ret };
-};
-
-template<int Y, int InfX, int SupX>
-class ei_meta_sqrt<Y, InfX, SupX, true> { public:  enum { ret = (SupX*SupX <= Y) ? SupX : InfX }; };
-
-} // end namespace Eigen
-
-#endif // EIGEN2_META_H
diff --git a/nuparu/include/Eigen/src/Eigen2Support/Minor.h b/nuparu/include/Eigen/src/Eigen2Support/Minor.h
deleted file mode 100644
index 4cded573..00000000
--- a/nuparu/include/Eigen/src/Eigen2Support/Minor.h
+++ /dev/null
@@ -1,117 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2006-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_MINOR_H
-#define EIGEN_MINOR_H
-
-namespace Eigen { 
-
-/**
-  * \class Minor
-  *
-  * \brief Expression of a minor
-  *
-  * \param MatrixType the type of the object in which we are taking a minor
-  *
-  * This class represents an expression of a minor. It is the return
-  * type of MatrixBase::minor() and most of the time this is the only way it
-  * is used.
-  *
-  * \sa MatrixBase::minor()
-  */
-
-namespace internal {
-template<typename MatrixType>
-struct traits<Minor<MatrixType> >
- : traits<MatrixType>
-{
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
-  typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
-  typedef typename MatrixType::StorageKind StorageKind;
-  enum {
-    RowsAtCompileTime = (MatrixType::RowsAtCompileTime != Dynamic) ?
-                          int(MatrixType::RowsAtCompileTime) - 1 : Dynamic,
-    ColsAtCompileTime = (MatrixType::ColsAtCompileTime != Dynamic) ?
-                          int(MatrixType::ColsAtCompileTime) - 1 : Dynamic,
-    MaxRowsAtCompileTime = (MatrixType::MaxRowsAtCompileTime != Dynamic) ?
-                             int(MatrixType::MaxRowsAtCompileTime) - 1 : Dynamic,
-    MaxColsAtCompileTime = (MatrixType::MaxColsAtCompileTime != Dynamic) ?
-                             int(MatrixType::MaxColsAtCompileTime) - 1 : Dynamic,
-    Flags = _MatrixTypeNested::Flags & (HereditaryBits | LvalueBit),
-    CoeffReadCost = _MatrixTypeNested::CoeffReadCost // minor is used typically on tiny matrices,
-      // where loops are unrolled and the 'if' evaluates at compile time
-  };
-};
-}
-
-template<typename MatrixType> class Minor
-  : public MatrixBase<Minor<MatrixType> >
-{
-  public:
-
-    typedef MatrixBase<Minor> Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(Minor)
-
-    inline Minor(const MatrixType& matrix,
-                       Index row, Index col)
-      : m_matrix(matrix), m_row(row), m_col(col)
-    {
-      eigen_assert(row >= 0 && row < matrix.rows()
-          && col >= 0 && col < matrix.cols());
-    }
-
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Minor)
-
-    inline Index rows() const { return m_matrix.rows() - 1; }
-    inline Index cols() const { return m_matrix.cols() - 1; }
-
-    inline Scalar& coeffRef(Index row, Index col)
-    {
-      return m_matrix.const_cast_derived().coeffRef(row + (row >= m_row), col + (col >= m_col));
-    }
-
-    inline const Scalar coeff(Index row, Index col) const
-    {
-      return m_matrix.coeff(row + (row >= m_row), col + (col >= m_col));
-    }
-
-  protected:
-    const typename MatrixType::Nested m_matrix;
-    const Index m_row, m_col;
-};
-
-/**
-  * \return an expression of the (\a row, \a col)-minor of *this,
-  * i.e. an expression constructed from *this by removing the specified
-  * row and column.
-  *
-  * Example: \include MatrixBase_minor.cpp
-  * Output: \verbinclude MatrixBase_minor.out
-  *
-  * \sa class Minor
-  */
-template<typename Derived>
-inline Minor<Derived>
-MatrixBase<Derived>::minor(Index row, Index col)
-{
-  return Minor<Derived>(derived(), row, col);
-}
-
-/**
-  * This is the const version of minor(). */
-template<typename Derived>
-inline const Minor<Derived>
-MatrixBase<Derived>::minor(Index row, Index col) const
-{
-  return Minor<Derived>(derived(), row, col);
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_MINOR_H
diff --git a/nuparu/include/Eigen/src/Eigen2Support/QR.h b/nuparu/include/Eigen/src/Eigen2Support/QR.h
deleted file mode 100644
index 2042c985..00000000
--- a/nuparu/include/Eigen/src/Eigen2Support/QR.h
+++ /dev/null
@@ -1,67 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN2_QR_H
-#define EIGEN2_QR_H
-
-namespace Eigen { 
-
-template<typename MatrixType>
-class QR : public HouseholderQR<MatrixType>
-{
-  public:
-
-    typedef HouseholderQR<MatrixType> Base;
-    typedef Block<const MatrixType, MatrixType::ColsAtCompileTime, MatrixType::ColsAtCompileTime> MatrixRBlockType;
-
-    QR() : Base() {}
-
-    template<typename T>
-    explicit QR(const T& t) : Base(t) {}
-
-    template<typename OtherDerived, typename ResultType>
-    bool solve(const MatrixBase<OtherDerived>& b, ResultType *result) const
-    {
-      *result = static_cast<const Base*>(this)->solve(b);
-      return true;
-    }
-
-    MatrixType matrixQ(void) const {
-      MatrixType ret = MatrixType::Identity(this->rows(), this->cols());
-      ret = this->householderQ() * ret;
-      return ret;
-    }
-
-    bool isFullRank() const {
-      return true;
-    }
-    
-    const TriangularView<MatrixRBlockType, UpperTriangular>
-    matrixR(void) const
-    {
-      int cols = this->cols();
-      return MatrixRBlockType(this->matrixQR(), 0, 0, cols, cols).template triangularView<UpperTriangular>();
-    }
-};
-
-/** \return the QR decomposition of \c *this.
-  *
-  * \sa class QR
-  */
-template<typename Derived>
-const QR<typename MatrixBase<Derived>::PlainObject>
-MatrixBase<Derived>::qr() const
-{
-  return QR<PlainObject>(eval());
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN2_QR_H
diff --git a/nuparu/include/Eigen/src/Eigen2Support/SVD.h b/nuparu/include/Eigen/src/Eigen2Support/SVD.h
deleted file mode 100644
index 077d26d5..00000000
--- a/nuparu/include/Eigen/src/Eigen2Support/SVD.h
+++ /dev/null
@@ -1,638 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN2_SVD_H
-#define EIGEN2_SVD_H
-
-namespace Eigen {
-
-/** \ingroup SVD_Module
-  * \nonstableyet
-  *
-  * \class SVD
-  *
-  * \brief Standard SVD decomposition of a matrix and associated features
-  *
-  * \param MatrixType the type of the matrix of which we are computing the SVD decomposition
-  *
-  * This class performs a standard SVD decomposition of a real matrix A of size \c M x \c N
-  * with \c M \>= \c N.
-  *
-  *
-  * \sa MatrixBase::SVD()
-  */
-template<typename MatrixType> class SVD
-{
-  private:
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-
-    enum {
-      PacketSize = internal::packet_traits<Scalar>::size,
-      AlignmentMask = int(PacketSize)-1,
-      MinSize = EIGEN_SIZE_MIN_PREFER_DYNAMIC(MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime)
-    };
-
-    typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> ColVector;
-    typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, 1> RowVector;
-
-    typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MinSize> MatrixUType;
-    typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, MatrixType::ColsAtCompileTime> MatrixVType;
-    typedef Matrix<Scalar, MinSize, 1> SingularValuesType;
-
-  public:
-
-    SVD() {} // a user who relied on compiler-generated default compiler reported problems with MSVC in 2.0.7
-    
-    SVD(const MatrixType& matrix)
-      : m_matU(matrix.rows(), (std::min)(matrix.rows(), matrix.cols())),
-        m_matV(matrix.cols(),matrix.cols()),
-        m_sigma((std::min)(matrix.rows(),matrix.cols()))
-    {
-      compute(matrix);
-    }
-
-    template<typename OtherDerived, typename ResultType>
-    bool solve(const MatrixBase<OtherDerived> &b, ResultType* result) const;
-
-    const MatrixUType& matrixU() const { return m_matU; }
-    const SingularValuesType& singularValues() const { return m_sigma; }
-    const MatrixVType& matrixV() const { return m_matV; }
-
-    void compute(const MatrixType& matrix);
-    SVD& sort();
-
-    template<typename UnitaryType, typename PositiveType>
-    void computeUnitaryPositive(UnitaryType *unitary, PositiveType *positive) const;
-    template<typename PositiveType, typename UnitaryType>
-    void computePositiveUnitary(PositiveType *positive, UnitaryType *unitary) const;
-    template<typename RotationType, typename ScalingType>
-    void computeRotationScaling(RotationType *unitary, ScalingType *positive) const;
-    template<typename ScalingType, typename RotationType>
-    void computeScalingRotation(ScalingType *positive, RotationType *unitary) const;
-
-  protected:
-    /** \internal */
-    MatrixUType m_matU;
-    /** \internal */
-    MatrixVType m_matV;
-    /** \internal */
-    SingularValuesType m_sigma;
-};
-
-/** Computes / recomputes the SVD decomposition A = U S V^* of \a matrix
-  *
-  * \note this code has been adapted from JAMA (public domain)
-  */
-template<typename MatrixType>
-void SVD<MatrixType>::compute(const MatrixType& matrix)
-{
-  const int m = matrix.rows();
-  const int n = matrix.cols();
-  const int nu = (std::min)(m,n);
-  ei_assert(m>=n && "In Eigen 2.0, SVD only works for MxN matrices with M>=N. Sorry!");
-  ei_assert(m>1 && "In Eigen 2.0, SVD doesn't work on 1x1 matrices");
-
-  m_matU.resize(m, nu);
-  m_matU.setZero();
-  m_sigma.resize((std::min)(m,n));
-  m_matV.resize(n,n);
-
-  RowVector e(n);
-  ColVector work(m);
-  MatrixType matA(matrix);
-  const bool wantu = true;
-  const bool wantv = true;
-  int i=0, j=0, k=0;
-
-  // Reduce A to bidiagonal form, storing the diagonal elements
-  // in s and the super-diagonal elements in e.
-  int nct = (std::min)(m-1,n);
-  int nrt = (std::max)(0,(std::min)(n-2,m));
-  for (k = 0; k < (std::max)(nct,nrt); ++k)
-  {
-    if (k < nct)
-    {
-      // Compute the transformation for the k-th column and
-      // place the k-th diagonal in m_sigma[k].
-      m_sigma[k] = matA.col(k).end(m-k).norm();
-      if (m_sigma[k] != 0.0) // FIXME
-      {
-        if (matA(k,k) < 0.0)
-          m_sigma[k] = -m_sigma[k];
-        matA.col(k).end(m-k) /= m_sigma[k];
-        matA(k,k) += 1.0;
-      }
-      m_sigma[k] = -m_sigma[k];
-    }
-
-    for (j = k+1; j < n; ++j)
-    {
-      if ((k < nct) && (m_sigma[k] != 0.0))
-      {
-        // Apply the transformation.
-        Scalar t = matA.col(k).end(m-k).eigen2_dot(matA.col(j).end(m-k)); // FIXME dot product or cwise prod + .sum() ??
-        t = -t/matA(k,k);
-        matA.col(j).end(m-k) += t * matA.col(k).end(m-k);
-      }
-
-      // Place the k-th row of A into e for the
-      // subsequent calculation of the row transformation.
-      e[j] = matA(k,j);
-    }
-
-    // Place the transformation in U for subsequent back multiplication.
-    if (wantu & (k < nct))
-      m_matU.col(k).end(m-k) = matA.col(k).end(m-k);
-
-    if (k < nrt)
-    {
-      // Compute the k-th row transformation and place the
-      // k-th super-diagonal in e[k].
-      e[k] = e.end(n-k-1).norm();
-      if (e[k] != 0.0)
-      {
-          if (e[k+1] < 0.0)
-            e[k] = -e[k];
-          e.end(n-k-1) /= e[k];
-          e[k+1] += 1.0;
-      }
-      e[k] = -e[k];
-      if ((k+1 < m) & (e[k] != 0.0))
-      {
-        // Apply the transformation.
-        work.end(m-k-1) = matA.corner(BottomRight,m-k-1,n-k-1) * e.end(n-k-1);
-        for (j = k+1; j < n; ++j)
-          matA.col(j).end(m-k-1) += (-e[j]/e[k+1]) * work.end(m-k-1);
-      }
-
-      // Place the transformation in V for subsequent back multiplication.
-      if (wantv)
-        m_matV.col(k).end(n-k-1) = e.end(n-k-1);
-    }
-  }
-
-
-  // Set up the final bidiagonal matrix or order p.
-  int p = (std::min)(n,m+1);
-  if (nct < n)
-    m_sigma[nct] = matA(nct,nct);
-  if (m < p)
-    m_sigma[p-1] = 0.0;
-  if (nrt+1 < p)
-    e[nrt] = matA(nrt,p-1);
-  e[p-1] = 0.0;
-
-  // If required, generate U.
-  if (wantu)
-  {
-    for (j = nct; j < nu; ++j)
-    {
-      m_matU.col(j).setZero();
-      m_matU(j,j) = 1.0;
-    }
-    for (k = nct-1; k >= 0; k--)
-    {
-      if (m_sigma[k] != 0.0)
-      {
-        for (j = k+1; j < nu; ++j)
-        {
-          Scalar t = m_matU.col(k).end(m-k).eigen2_dot(m_matU.col(j).end(m-k)); // FIXME is it really a dot product we want ?
-          t = -t/m_matU(k,k);
-          m_matU.col(j).end(m-k) += t * m_matU.col(k).end(m-k);
-        }
-        m_matU.col(k).end(m-k) = - m_matU.col(k).end(m-k);
-        m_matU(k,k) = Scalar(1) + m_matU(k,k);
-        if (k-1>0)
-          m_matU.col(k).start(k-1).setZero();
-      }
-      else
-      {
-        m_matU.col(k).setZero();
-        m_matU(k,k) = 1.0;
-      }
-    }
-  }
-
-  // If required, generate V.
-  if (wantv)
-  {
-    for (k = n-1; k >= 0; k--)
-    {
-      if ((k < nrt) & (e[k] != 0.0))
-      {
-        for (j = k+1; j < nu; ++j)
-        {
-          Scalar t = m_matV.col(k).end(n-k-1).eigen2_dot(m_matV.col(j).end(n-k-1)); // FIXME is it really a dot product we want ?
-          t = -t/m_matV(k+1,k);
-          m_matV.col(j).end(n-k-1) += t * m_matV.col(k).end(n-k-1);
-        }
-      }
-      m_matV.col(k).setZero();
-      m_matV(k,k) = 1.0;
-    }
-  }
-
-  // Main iteration loop for the singular values.
-  int pp = p-1;
-  int iter = 0;
-  Scalar eps = ei_pow(Scalar(2),ei_is_same_type<Scalar,float>::ret ? Scalar(-23) : Scalar(-52));
-  while (p > 0)
-  {
-    int k=0;
-    int kase=0;
-
-    // Here is where a test for too many iterations would go.
-
-    // This section of the program inspects for
-    // negligible elements in the s and e arrays.  On
-    // completion the variables kase and k are set as follows.
-
-    // kase = 1     if s(p) and e[k-1] are negligible and k<p
-    // kase = 2     if s(k) is negligible and k<p
-    // kase = 3     if e[k-1] is negligible, k<p, and
-    //              s(k), ..., s(p) are not negligible (qr step).
-    // kase = 4     if e(p-1) is negligible (convergence).
-
-    for (k = p-2; k >= -1; --k)
-    {
-      if (k == -1)
-          break;
-      if (ei_abs(e[k]) <= eps*(ei_abs(m_sigma[k]) + ei_abs(m_sigma[k+1])))
-      {
-          e[k] = 0.0;
-          break;
-      }
-    }
-    if (k == p-2)
-    {
-      kase = 4;
-    }
-    else
-    {
-      int ks;
-      for (ks = p-1; ks >= k; --ks)
-      {
-        if (ks == k)
-          break;
-        Scalar t = (ks != p ? ei_abs(e[ks]) : Scalar(0)) + (ks != k+1 ? ei_abs(e[ks-1]) : Scalar(0));
-        if (ei_abs(m_sigma[ks]) <= eps*t)
-        {
-          m_sigma[ks] = 0.0;
-          break;
-        }
-      }
-      if (ks == k)
-      {
-        kase = 3;
-      }
-      else if (ks == p-1)
-      {
-        kase = 1;
-      }
-      else
-      {
-        kase = 2;
-        k = ks;
-      }
-    }
-    ++k;
-
-    // Perform the task indicated by kase.
-    switch (kase)
-    {
-
-      // Deflate negligible s(p).
-      case 1:
-      {
-        Scalar f(e[p-2]);
-        e[p-2] = 0.0;
-        for (j = p-2; j >= k; --j)
-        {
-          Scalar t(numext::hypot(m_sigma[j],f));
-          Scalar cs(m_sigma[j]/t);
-          Scalar sn(f/t);
-          m_sigma[j] = t;
-          if (j != k)
-          {
-            f = -sn*e[j-1];
-            e[j-1] = cs*e[j-1];
-          }
-          if (wantv)
-          {
-            for (i = 0; i < n; ++i)
-            {
-              t = cs*m_matV(i,j) + sn*m_matV(i,p-1);
-              m_matV(i,p-1) = -sn*m_matV(i,j) + cs*m_matV(i,p-1);
-              m_matV(i,j) = t;
-            }
-          }
-        }
-      }
-      break;
-
-      // Split at negligible s(k).
-      case 2:
-      {
-        Scalar f(e[k-1]);
-        e[k-1] = 0.0;
-        for (j = k; j < p; ++j)
-        {
-          Scalar t(numext::hypot(m_sigma[j],f));
-          Scalar cs( m_sigma[j]/t);
-          Scalar sn(f/t);
-          m_sigma[j] = t;
-          f = -sn*e[j];
-          e[j] = cs*e[j];
-          if (wantu)
-          {
-            for (i = 0; i < m; ++i)
-            {
-              t = cs*m_matU(i,j) + sn*m_matU(i,k-1);
-              m_matU(i,k-1) = -sn*m_matU(i,j) + cs*m_matU(i,k-1);
-              m_matU(i,j) = t;
-            }
-          }
-        }
-      }
-      break;
-
-      // Perform one qr step.
-      case 3:
-      {
-        // Calculate the shift.
-        Scalar scale = (std::max)((std::max)((std::max)((std::max)(
-                        ei_abs(m_sigma[p-1]),ei_abs(m_sigma[p-2])),ei_abs(e[p-2])),
-                        ei_abs(m_sigma[k])),ei_abs(e[k]));
-        Scalar sp = m_sigma[p-1]/scale;
-        Scalar spm1 = m_sigma[p-2]/scale;
-        Scalar epm1 = e[p-2]/scale;
-        Scalar sk = m_sigma[k]/scale;
-        Scalar ek = e[k]/scale;
-        Scalar b = ((spm1 + sp)*(spm1 - sp) + epm1*epm1)/Scalar(2);
-        Scalar c = (sp*epm1)*(sp*epm1);
-        Scalar shift(0);
-        if ((b != 0.0) || (c != 0.0))
-        {
-          shift = ei_sqrt(b*b + c);
-          if (b < 0.0)
-            shift = -shift;
-          shift = c/(b + shift);
-        }
-        Scalar f = (sk + sp)*(sk - sp) + shift;
-        Scalar g = sk*ek;
-
-        // Chase zeros.
-
-        for (j = k; j < p-1; ++j)
-        {
-          Scalar t = numext::hypot(f,g);
-          Scalar cs = f/t;
-          Scalar sn = g/t;
-          if (j != k)
-            e[j-1] = t;
-          f = cs*m_sigma[j] + sn*e[j];
-          e[j] = cs*e[j] - sn*m_sigma[j];
-          g = sn*m_sigma[j+1];
-          m_sigma[j+1] = cs*m_sigma[j+1];
-          if (wantv)
-          {
-            for (i = 0; i < n; ++i)
-            {
-              t = cs*m_matV(i,j) + sn*m_matV(i,j+1);
-              m_matV(i,j+1) = -sn*m_matV(i,j) + cs*m_matV(i,j+1);
-              m_matV(i,j) = t;
-            }
-          }
-          t = numext::hypot(f,g);
-          cs = f/t;
-          sn = g/t;
-          m_sigma[j] = t;
-          f = cs*e[j] + sn*m_sigma[j+1];
-          m_sigma[j+1] = -sn*e[j] + cs*m_sigma[j+1];
-          g = sn*e[j+1];
-          e[j+1] = cs*e[j+1];
-          if (wantu && (j < m-1))
-          {
-            for (i = 0; i < m; ++i)
-            {
-              t = cs*m_matU(i,j) + sn*m_matU(i,j+1);
-              m_matU(i,j+1) = -sn*m_matU(i,j) + cs*m_matU(i,j+1);
-              m_matU(i,j) = t;
-            }
-          }
-        }
-        e[p-2] = f;
-        iter = iter + 1;
-      }
-      break;
-
-      // Convergence.
-      case 4:
-      {
-        // Make the singular values positive.
-        if (m_sigma[k] <= 0.0)
-        {
-          m_sigma[k] = m_sigma[k] < Scalar(0) ? -m_sigma[k] : Scalar(0);
-          if (wantv)
-            m_matV.col(k).start(pp+1) = -m_matV.col(k).start(pp+1);
-        }
-
-        // Order the singular values.
-        while (k < pp)
-        {
-          if (m_sigma[k] >= m_sigma[k+1])
-            break;
-          Scalar t = m_sigma[k];
-          m_sigma[k] = m_sigma[k+1];
-          m_sigma[k+1] = t;
-          if (wantv && (k < n-1))
-            m_matV.col(k).swap(m_matV.col(k+1));
-          if (wantu && (k < m-1))
-            m_matU.col(k).swap(m_matU.col(k+1));
-          ++k;
-        }
-        iter = 0;
-        p--;
-      }
-      break;
-    } // end big switch
-  } // end iterations
-}
-
-template<typename MatrixType>
-SVD<MatrixType>& SVD<MatrixType>::sort()
-{
-  int mu = m_matU.rows();
-  int mv = m_matV.rows();
-  int n  = m_matU.cols();
-
-  for (int i=0; i<n; ++i)
-  {
-    int  k = i;
-    Scalar p = m_sigma.coeff(i);
-
-    for (int j=i+1; j<n; ++j)
-    {
-      if (m_sigma.coeff(j) > p)
-      {
-        k = j;
-        p = m_sigma.coeff(j);
-      }
-    }
-    if (k != i)
-    {
-      m_sigma.coeffRef(k) = m_sigma.coeff(i);  // i.e.
-      m_sigma.coeffRef(i) = p;                 // swaps the i-th and the k-th elements
-
-      int j = mu;
-      for(int s=0; j!=0; ++s, --j)
-        std::swap(m_matU.coeffRef(s,i), m_matU.coeffRef(s,k));
-
-      j = mv;
-      for (int s=0; j!=0; ++s, --j)
-        std::swap(m_matV.coeffRef(s,i), m_matV.coeffRef(s,k));
-    }
-  }
-  return *this;
-}
-
-/** \returns the solution of \f$ A x = b \f$ using the current SVD decomposition of A.
-  * The parts of the solution corresponding to zero singular values are ignored.
-  *
-  * \sa MatrixBase::svd(), LU::solve(), LLT::solve()
-  */
-template<typename MatrixType>
-template<typename OtherDerived, typename ResultType>
-bool SVD<MatrixType>::solve(const MatrixBase<OtherDerived> &b, ResultType* result) const
-{
-  const int rows = m_matU.rows();
-  ei_assert(b.rows() == rows);
-
-  Scalar maxVal = m_sigma.cwise().abs().maxCoeff();
-  for (int j=0; j<b.cols(); ++j)
-  {
-    Matrix<Scalar,MatrixUType::RowsAtCompileTime,1> aux = m_matU.transpose() * b.col(j);
-
-    for (int i = 0; i <m_matU.cols(); ++i)
-    {
-      Scalar si = m_sigma.coeff(i);
-      if (ei_isMuchSmallerThan(ei_abs(si),maxVal))
-        aux.coeffRef(i) = 0;
-      else
-        aux.coeffRef(i) /= si;
-    }
-
-    result->col(j) = m_matV * aux;
-  }
-  return true;
-}
-
-/** Computes the polar decomposition of the matrix, as a product unitary x positive.
-  *
-  * If either pointer is zero, the corresponding computation is skipped.
-  *
-  * Only for square matrices.
-  *
-  * \sa computePositiveUnitary(), computeRotationScaling()
-  */
-template<typename MatrixType>
-template<typename UnitaryType, typename PositiveType>
-void SVD<MatrixType>::computeUnitaryPositive(UnitaryType *unitary,
-                                             PositiveType *positive) const
-{
-  ei_assert(m_matU.cols() == m_matV.cols() && "Polar decomposition is only for square matrices");
-  if(unitary) *unitary = m_matU * m_matV.adjoint();
-  if(positive) *positive = m_matV * m_sigma.asDiagonal() * m_matV.adjoint();
-}
-
-/** Computes the polar decomposition of the matrix, as a product positive x unitary.
-  *
-  * If either pointer is zero, the corresponding computation is skipped.
-  *
-  * Only for square matrices.
-  *
-  * \sa computeUnitaryPositive(), computeRotationScaling()
-  */
-template<typename MatrixType>
-template<typename UnitaryType, typename PositiveType>
-void SVD<MatrixType>::computePositiveUnitary(UnitaryType *positive,
-                                             PositiveType *unitary) const
-{
-  ei_assert(m_matU.rows() == m_matV.rows() && "Polar decomposition is only for square matrices");
-  if(unitary) *unitary = m_matU * m_matV.adjoint();
-  if(positive) *positive = m_matU * m_sigma.asDiagonal() * m_matU.adjoint();
-}
-
-/** decomposes the matrix as a product rotation x scaling, the scaling being
-  * not necessarily positive.
-  *
-  * If either pointer is zero, the corresponding computation is skipped.
-  *
-  * This method requires the Geometry module.
-  *
-  * \sa computeScalingRotation(), computeUnitaryPositive()
-  */
-template<typename MatrixType>
-template<typename RotationType, typename ScalingType>
-void SVD<MatrixType>::computeRotationScaling(RotationType *rotation, ScalingType *scaling) const
-{
-  ei_assert(m_matU.rows() == m_matV.rows() && "Polar decomposition is only for square matrices");
-  Scalar x = (m_matU * m_matV.adjoint()).determinant(); // so x has absolute value 1
-  Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> sv(m_sigma);
-  sv.coeffRef(0) *= x;
-  if(scaling) scaling->lazyAssign(m_matV * sv.asDiagonal() * m_matV.adjoint());
-  if(rotation)
-  {
-    MatrixType m(m_matU);
-    m.col(0) /= x;
-    rotation->lazyAssign(m * m_matV.adjoint());
-  }
-}
-
-/** decomposes the matrix as a product scaling x rotation, the scaling being
-  * not necessarily positive.
-  *
-  * If either pointer is zero, the corresponding computation is skipped.
-  *
-  * This method requires the Geometry module.
-  *
-  * \sa computeRotationScaling(), computeUnitaryPositive()
-  */
-template<typename MatrixType>
-template<typename ScalingType, typename RotationType>
-void SVD<MatrixType>::computeScalingRotation(ScalingType *scaling, RotationType *rotation) const
-{
-  ei_assert(m_matU.rows() == m_matV.rows() && "Polar decomposition is only for square matrices");
-  Scalar x = (m_matU * m_matV.adjoint()).determinant(); // so x has absolute value 1
-  Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> sv(m_sigma);
-  sv.coeffRef(0) *= x;
-  if(scaling) scaling->lazyAssign(m_matU * sv.asDiagonal() * m_matU.adjoint());
-  if(rotation)
-  {
-    MatrixType m(m_matU);
-    m.col(0) /= x;
-    rotation->lazyAssign(m * m_matV.adjoint());
-  }
-}
-
-
-/** \svd_module
-  * \returns the SVD decomposition of \c *this
-  */
-template<typename Derived>
-inline SVD<typename MatrixBase<Derived>::PlainObject>
-MatrixBase<Derived>::svd() const
-{
-  return SVD<PlainObject>(derived());
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN2_SVD_H
diff --git a/nuparu/include/Eigen/src/Eigen2Support/TriangularSolver.h b/nuparu/include/Eigen/src/Eigen2Support/TriangularSolver.h
deleted file mode 100644
index ebbeb3b4..00000000
--- a/nuparu/include/Eigen/src/Eigen2Support/TriangularSolver.h
+++ /dev/null
@@ -1,42 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_TRIANGULAR_SOLVER2_H
-#define EIGEN_TRIANGULAR_SOLVER2_H
-
-namespace Eigen { 
-
-const unsigned int UnitDiagBit = UnitDiag;
-const unsigned int SelfAdjointBit = SelfAdjoint;
-const unsigned int UpperTriangularBit = Upper;
-const unsigned int LowerTriangularBit = Lower;
-
-const unsigned int UpperTriangular = Upper;
-const unsigned int LowerTriangular = Lower;
-const unsigned int UnitUpperTriangular = UnitUpper;
-const unsigned int UnitLowerTriangular = UnitLower;
-
-template<typename ExpressionType, unsigned int Added, unsigned int Removed>
-template<typename OtherDerived>
-typename ExpressionType::PlainObject
-Flagged<ExpressionType,Added,Removed>::solveTriangular(const MatrixBase<OtherDerived>& other) const
-{
-  return m_matrix.template triangularView<Added>().solve(other.derived());
-}
-
-template<typename ExpressionType, unsigned int Added, unsigned int Removed>
-template<typename OtherDerived>
-void Flagged<ExpressionType,Added,Removed>::solveTriangularInPlace(const MatrixBase<OtherDerived>& other) const
-{
-  m_matrix.template triangularView<Added>().solveInPlace(other.derived());
-}
-
-} // end namespace Eigen
-    
-#endif // EIGEN_TRIANGULAR_SOLVER2_H
diff --git a/nuparu/include/Eigen/src/Eigen2Support/VectorBlock.h b/nuparu/include/Eigen/src/Eigen2Support/VectorBlock.h
deleted file mode 100644
index 71a8080a..00000000
--- a/nuparu/include/Eigen/src/Eigen2Support/VectorBlock.h
+++ /dev/null
@@ -1,94 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN2_VECTORBLOCK_H
-#define EIGEN2_VECTORBLOCK_H
-
-namespace Eigen { 
-
-/** \deprecated use DenseMase::head(Index) */
-template<typename Derived>
-inline VectorBlock<Derived>
-MatrixBase<Derived>::start(Index size)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return VectorBlock<Derived>(derived(), 0, size);
-}
-
-/** \deprecated use DenseMase::head(Index) */
-template<typename Derived>
-inline const VectorBlock<const Derived>
-MatrixBase<Derived>::start(Index size) const
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return VectorBlock<const Derived>(derived(), 0, size);
-}
-
-/** \deprecated use DenseMase::tail(Index) */
-template<typename Derived>
-inline VectorBlock<Derived>
-MatrixBase<Derived>::end(Index size)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return VectorBlock<Derived>(derived(), this->size() - size, size);
-}
-
-/** \deprecated use DenseMase::tail(Index) */
-template<typename Derived>
-inline const VectorBlock<const Derived>
-MatrixBase<Derived>::end(Index size) const
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return VectorBlock<const Derived>(derived(), this->size() - size, size);
-}
-
-/** \deprecated use DenseMase::head() */
-template<typename Derived>
-template<int Size>
-inline VectorBlock<Derived,Size>
-MatrixBase<Derived>::start()
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return VectorBlock<Derived,Size>(derived(), 0);
-}
-
-/** \deprecated use DenseMase::head() */
-template<typename Derived>
-template<int Size>
-inline const VectorBlock<const Derived,Size>
-MatrixBase<Derived>::start() const
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return VectorBlock<const Derived,Size>(derived(), 0);
-}
-
-/** \deprecated use DenseMase::tail() */
-template<typename Derived>
-template<int Size>
-inline VectorBlock<Derived,Size>
-MatrixBase<Derived>::end()
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return VectorBlock<Derived, Size>(derived(), size() - Size);
-}
-
-/** \deprecated use DenseMase::tail() */
-template<typename Derived>
-template<int Size>
-inline const VectorBlock<const Derived,Size>
-MatrixBase<Derived>::end() const
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return VectorBlock<const Derived, Size>(derived(), size() - Size);
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN2_VECTORBLOCK_H
diff --git a/nuparu/include/Eigen/src/Eigenvalues/ComplexEigenSolver.h b/nuparu/include/Eigen/src/Eigenvalues/ComplexEigenSolver.h
index af434bc9..ec3b1633 100644
--- a/nuparu/include/Eigen/src/Eigenvalues/ComplexEigenSolver.h
+++ b/nuparu/include/Eigen/src/Eigenvalues/ComplexEigenSolver.h
@@ -60,7 +60,7 @@ template<typename _MatrixType> class ComplexEigenSolver
     /** \brief Scalar type for matrices of type #MatrixType. */
     typedef typename MatrixType::Scalar Scalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
 
     /** \brief Complex scalar type for #MatrixType.
       *
@@ -104,7 +104,7 @@ template<typename _MatrixType> class ComplexEigenSolver
       * according to the specified problem \a size.
       * \sa ComplexEigenSolver()
       */
-    ComplexEigenSolver(Index size)
+    explicit ComplexEigenSolver(Index size)
             : m_eivec(size, size),
               m_eivalues(size),
               m_schur(size),
@@ -122,7 +122,8 @@ template<typename _MatrixType> class ComplexEigenSolver
       *
       * This constructor calls compute() to compute the eigendecomposition.
       */
-      ComplexEigenSolver(const MatrixType& matrix, bool computeEigenvectors = true)
+    template<typename InputType>
+    explicit ComplexEigenSolver(const EigenBase<InputType>& matrix, bool computeEigenvectors = true)
             : m_eivec(matrix.rows(),matrix.cols()),
               m_eivalues(matrix.cols()),
               m_schur(matrix.rows()),
@@ -130,7 +131,7 @@ template<typename _MatrixType> class ComplexEigenSolver
               m_eigenvectorsOk(false),
               m_matX(matrix.rows(),matrix.cols())
     {
-      compute(matrix, computeEigenvectors);
+      compute(matrix.derived(), computeEigenvectors);
     }
 
     /** \brief Returns the eigenvectors of given matrix.
@@ -208,7 +209,8 @@ template<typename _MatrixType> class ComplexEigenSolver
       * Example: \include ComplexEigenSolver_compute.cpp
       * Output: \verbinclude ComplexEigenSolver_compute.out
       */
-    ComplexEigenSolver& compute(const MatrixType& matrix, bool computeEigenvectors = true);
+    template<typename InputType>
+    ComplexEigenSolver& compute(const EigenBase<InputType>& matrix, bool computeEigenvectors = true);
 
     /** \brief Reports whether previous computation was successful.
       *
@@ -234,6 +236,12 @@ template<typename _MatrixType> class ComplexEigenSolver
     }
 
   protected:
+    
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    }
+    
     EigenvectorType m_eivec;
     EigenvalueType m_eivalues;
     ComplexSchur<MatrixType> m_schur;
@@ -248,21 +256,24 @@ template<typename _MatrixType> class ComplexEigenSolver
 
 
 template<typename MatrixType>
+template<typename InputType>
 ComplexEigenSolver<MatrixType>& 
-ComplexEigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvectors)
+ComplexEigenSolver<MatrixType>::compute(const EigenBase<InputType>& matrix, bool computeEigenvectors)
 {
+  check_template_parameters();
+  
   // this code is inspired from Jampack
   eigen_assert(matrix.cols() == matrix.rows());
 
   // Do a complex Schur decomposition, A = U T U^*
   // The eigenvalues are on the diagonal of T.
-  m_schur.compute(matrix, computeEigenvectors);
+  m_schur.compute(matrix.derived(), computeEigenvectors);
 
   if(m_schur.info() == Success)
   {
     m_eivalues = m_schur.matrixT().diagonal();
     if(computeEigenvectors)
-      doComputeEigenvectors(matrix.norm());
+      doComputeEigenvectors(m_schur.matrixT().norm());
     sortEigenvalues(computeEigenvectors);
   }
 
diff --git a/nuparu/include/Eigen/src/Eigenvalues/ComplexSchur.h b/nuparu/include/Eigen/src/Eigenvalues/ComplexSchur.h
index 89e6cade..7f38919f 100644
--- a/nuparu/include/Eigen/src/Eigenvalues/ComplexSchur.h
+++ b/nuparu/include/Eigen/src/Eigenvalues/ComplexSchur.h
@@ -63,7 +63,7 @@ template<typename _MatrixType> class ComplexSchur
     /** \brief Scalar type for matrices of type \p _MatrixType. */
     typedef typename MatrixType::Scalar Scalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
 
     /** \brief Complex scalar type for \p _MatrixType. 
       *
@@ -91,7 +91,7 @@ template<typename _MatrixType> class ComplexSchur
       *
       * \sa compute() for an example.
       */
-    ComplexSchur(Index size = RowsAtCompileTime==Dynamic ? 1 : RowsAtCompileTime)
+    explicit ComplexSchur(Index size = RowsAtCompileTime==Dynamic ? 1 : RowsAtCompileTime)
       : m_matT(size,size),
         m_matU(size,size),
         m_hess(size),
@@ -109,7 +109,8 @@ template<typename _MatrixType> class ComplexSchur
       *
       * \sa matrixT() and matrixU() for examples.
       */
-    ComplexSchur(const MatrixType& matrix, bool computeU = true)
+    template<typename InputType>
+    explicit ComplexSchur(const EigenBase<InputType>& matrix, bool computeU = true)
       : m_matT(matrix.rows(),matrix.cols()),
         m_matU(matrix.rows(),matrix.cols()),
         m_hess(matrix.rows()),
@@ -117,7 +118,7 @@ template<typename _MatrixType> class ComplexSchur
         m_matUisUptodate(false),
         m_maxIters(-1)
     {
-      compute(matrix, computeU);
+      compute(matrix.derived(), computeU);
     }
 
     /** \brief Returns the unitary matrix in the Schur decomposition. 
@@ -186,7 +187,8 @@ template<typename _MatrixType> class ComplexSchur
       *
       * \sa compute(const MatrixType&, bool, Index)
       */
-    ComplexSchur& compute(const MatrixType& matrix, bool computeU = true);
+    template<typename InputType>
+    ComplexSchur& compute(const EigenBase<InputType>& matrix, bool computeU = true);
     
     /** \brief Compute Schur decomposition from a given Hessenberg matrix
      *  \param[in] matrixH Matrix in Hessenberg form H
@@ -313,14 +315,15 @@ typename ComplexSchur<MatrixType>::ComplexScalar ComplexSchur<MatrixType>::compu
 
 
 template<typename MatrixType>
-ComplexSchur<MatrixType>& ComplexSchur<MatrixType>::compute(const MatrixType& matrix, bool computeU)
+template<typename InputType>
+ComplexSchur<MatrixType>& ComplexSchur<MatrixType>::compute(const EigenBase<InputType>& matrix, bool computeU)
 {
   m_matUisUptodate = false;
   eigen_assert(matrix.cols() == matrix.rows());
 
   if(matrix.cols() == 1)
   {
-    m_matT = matrix.template cast<ComplexScalar>();
+    m_matT = matrix.derived().template cast<ComplexScalar>();
     if(computeU)  m_matU = ComplexMatrixType::Identity(1,1);
     m_info = Success;
     m_isInitialized = true;
@@ -328,7 +331,7 @@ ComplexSchur<MatrixType>& ComplexSchur<MatrixType>::compute(const MatrixType& ma
     return *this;
   }
 
-  internal::complex_schur_reduce_to_hessenberg<MatrixType, NumTraits<Scalar>::IsComplex>::run(*this, matrix, computeU);
+  internal::complex_schur_reduce_to_hessenberg<MatrixType, NumTraits<Scalar>::IsComplex>::run(*this, matrix.derived(), computeU);
   computeFromHessenberg(m_matT, m_matU, computeU);
   return *this;
 }
diff --git a/nuparu/include/Eigen/src/Eigenvalues/ComplexSchur_MKL.h b/nuparu/include/Eigen/src/Eigenvalues/ComplexSchur_MKL.h
index 91496ae5..e20c3725 100644
--- a/nuparu/include/Eigen/src/Eigenvalues/ComplexSchur_MKL.h
+++ b/nuparu/include/Eigen/src/Eigenvalues/ComplexSchur_MKL.h
@@ -40,12 +40,11 @@ namespace Eigen {
 /** \internal Specialization for the data types supported by MKL */
 
 #define EIGEN_MKL_SCHUR_COMPLEX(EIGTYPE, MKLTYPE, MKLPREFIX, MKLPREFIX_U, EIGCOLROW, MKLCOLROW) \
-template<> inline \
+template<> template<typename InputType> inline \
 ComplexSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >& \
-ComplexSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW>& matrix, bool computeU) \
+ComplexSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const EigenBase<InputType>& matrix, bool computeU) \
 { \
   typedef Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> MatrixType; \
-  typedef MatrixType::Scalar Scalar; \
   typedef MatrixType::RealScalar RealScalar; \
   typedef std::complex<RealScalar> ComplexScalar; \
 \
@@ -54,7 +53,7 @@ ComplexSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const Matri
   m_matUisUptodate = false; \
   if(matrix.cols() == 1) \
   { \
-    m_matT = matrix.cast<ComplexScalar>(); \
+    m_matT = matrix.derived().template cast<ComplexScalar>(); \
     if(computeU)  m_matU = ComplexMatrixType::Identity(1,1); \
       m_info = Success; \
       m_isInitialized = true; \
@@ -62,7 +61,6 @@ ComplexSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const Matri
       return *this; \
   } \
   lapack_int n = matrix.cols(), sdim, info; \
-  lapack_int lda = matrix.outerStride(); \
   lapack_int matrix_order = MKLCOLROW; \
   char jobvs, sort='N'; \
   LAPACK_##MKLPREFIX_U##_SELECT1 select = 0; \
@@ -70,6 +68,7 @@ ComplexSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const Matri
   m_matU.resize(n, n); \
   lapack_int ldvs  = m_matU.outerStride(); \
   m_matT = matrix; \
+  lapack_int lda = m_matT.outerStride(); \
   Matrix<EIGTYPE, Dynamic, Dynamic> w; \
   w.resize(n, 1);\
   info = LAPACKE_##MKLPREFIX##gees( matrix_order, jobvs, sort, select, n, (MKLTYPE*)m_matT.data(), lda, &sdim, (MKLTYPE*)w.data(), (MKLTYPE*)m_matU.data(), ldvs ); \
diff --git a/nuparu/include/Eigen/src/Eigenvalues/EigenSolver.h b/nuparu/include/Eigen/src/Eigenvalues/EigenSolver.h
index 6e715068..532ca7d6 100644
--- a/nuparu/include/Eigen/src/Eigenvalues/EigenSolver.h
+++ b/nuparu/include/Eigen/src/Eigenvalues/EigenSolver.h
@@ -79,7 +79,7 @@ template<typename _MatrixType> class EigenSolver
     /** \brief Scalar type for matrices of type #MatrixType. */
     typedef typename MatrixType::Scalar Scalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
 
     /** \brief Complex scalar type for #MatrixType. 
       *
@@ -110,7 +110,7 @@ template<typename _MatrixType> class EigenSolver
       *
       * \sa compute() for an example.
       */
- EigenSolver() : m_eivec(), m_eivalues(), m_isInitialized(false), m_realSchur(), m_matT(), m_tmp() {}
+    EigenSolver() : m_eivec(), m_eivalues(), m_isInitialized(false), m_realSchur(), m_matT(), m_tmp() {}
 
     /** \brief Default constructor with memory preallocation
       *
@@ -118,7 +118,7 @@ template<typename _MatrixType> class EigenSolver
       * according to the specified problem \a size.
       * \sa EigenSolver()
       */
-    EigenSolver(Index size)
+    explicit EigenSolver(Index size)
       : m_eivec(size, size),
         m_eivalues(size),
         m_isInitialized(false),
@@ -143,7 +143,8 @@ template<typename _MatrixType> class EigenSolver
       *
       * \sa compute()
       */
-    EigenSolver(const MatrixType& matrix, bool computeEigenvectors = true)
+    template<typename InputType>
+    explicit EigenSolver(const EigenBase<InputType>& matrix, bool computeEigenvectors = true)
       : m_eivec(matrix.rows(), matrix.cols()),
         m_eivalues(matrix.cols()),
         m_isInitialized(false),
@@ -152,7 +153,7 @@ template<typename _MatrixType> class EigenSolver
         m_matT(matrix.rows(), matrix.cols()), 
         m_tmp(matrix.cols())
     {
-      compute(matrix, computeEigenvectors);
+      compute(matrix.derived(), computeEigenvectors);
     }
 
     /** \brief Returns the eigenvectors of given matrix. 
@@ -273,12 +274,14 @@ template<typename _MatrixType> class EigenSolver
       * Example: \include EigenSolver_compute.cpp
       * Output: \verbinclude EigenSolver_compute.out
       */
-    EigenSolver& compute(const MatrixType& matrix, bool computeEigenvectors = true);
+    template<typename InputType>
+    EigenSolver& compute(const EigenBase<InputType>& matrix, bool computeEigenvectors = true);
 
+    /** \returns NumericalIssue if the input contains INF or NaN values or overflow occurred. Returns Success otherwise. */
     ComputationInfo info() const
     {
       eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
-      return m_realSchur.info();
+      return m_info;
     }
 
     /** \brief Sets the maximum number of iterations allowed. */
@@ -298,10 +301,18 @@ template<typename _MatrixType> class EigenSolver
     void doComputeEigenvectors();
 
   protected:
+    
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+      EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsComplex, NUMERIC_TYPE_MUST_BE_REAL);
+    }
+    
     MatrixType m_eivec;
     EigenvalueType m_eivalues;
     bool m_isInitialized;
     bool m_eigenvectorsOk;
+    ComputationInfo m_info;
     RealSchur<MatrixType> m_realSchur;
     MatrixType m_matT;
 
@@ -361,17 +372,23 @@ typename EigenSolver<MatrixType>::EigenvectorsType EigenSolver<MatrixType>::eige
 }
 
 template<typename MatrixType>
+template<typename InputType>
 EigenSolver<MatrixType>& 
-EigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvectors)
+EigenSolver<MatrixType>::compute(const EigenBase<InputType>& matrix, bool computeEigenvectors)
 {
+  check_template_parameters();
+  
   using std::sqrt;
   using std::abs;
+  using numext::isfinite;
   eigen_assert(matrix.cols() == matrix.rows());
 
   // Reduce to real Schur form.
-  m_realSchur.compute(matrix, computeEigenvectors);
+  m_realSchur.compute(matrix.derived(), computeEigenvectors);
+  
+  m_info = m_realSchur.info();
 
-  if (m_realSchur.info() == Success)
+  if (m_info == Success)
   {
     m_matT = m_realSchur.matrixT();
     if (computeEigenvectors)
@@ -385,14 +402,40 @@ EigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvect
       if (i == matrix.cols() - 1 || m_matT.coeff(i+1, i) == Scalar(0)) 
       {
         m_eivalues.coeffRef(i) = m_matT.coeff(i, i);
+        if(!(isfinite)(m_eivalues.coeffRef(i)))
+        {
+          m_isInitialized = true;
+          m_eigenvectorsOk = false;
+          m_info = NumericalIssue;
+          return *this;
+        }
         ++i;
       }
       else
       {
         Scalar p = Scalar(0.5) * (m_matT.coeff(i, i) - m_matT.coeff(i+1, i+1));
-        Scalar z = sqrt(abs(p * p + m_matT.coeff(i+1, i) * m_matT.coeff(i, i+1)));
+        Scalar z;
+        // Compute z = sqrt(abs(p * p + m_matT.coeff(i+1, i) * m_matT.coeff(i, i+1)));
+        // without overflow
+        {
+          Scalar t0 = m_matT.coeff(i+1, i);
+          Scalar t1 = m_matT.coeff(i, i+1);
+          Scalar maxval = numext::maxi<Scalar>(abs(p),numext::maxi<Scalar>(abs(t0),abs(t1)));
+          t0 /= maxval;
+          t1 /= maxval;
+          Scalar p0 = p/maxval;
+          z = maxval * sqrt(abs(p0 * p0 + t0 * t1));
+        }
+        
         m_eivalues.coeffRef(i)   = ComplexScalar(m_matT.coeff(i+1, i+1) + p, z);
         m_eivalues.coeffRef(i+1) = ComplexScalar(m_matT.coeff(i+1, i+1) + p, -z);
+        if(!((isfinite)(m_eivalues.coeffRef(i)) && (isfinite)(m_eivalues.coeffRef(i+1))))
+        {
+          m_isInitialized = true;
+          m_eigenvectorsOk = false;
+          m_info = NumericalIssue;
+          return *this;
+        }
         i += 2;
       }
     }
@@ -444,7 +487,7 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
   }
   
   // Backsubstitute to find vectors of upper triangular form
-  if (norm == 0.0)
+  if (norm == Scalar(0))
   {
     return;
   }
@@ -466,7 +509,7 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
         Scalar w = m_matT.coeff(i,i) - p;
         Scalar r = m_matT.row(i).segment(l,n-l+1).dot(m_matT.col(n).segment(l, n-l+1));
 
-        if (m_eivalues.coeff(i).imag() < 0.0)
+        if (m_eivalues.coeff(i).imag() < Scalar(0))
         {
           lastw = w;
           lastr = r;
@@ -474,9 +517,9 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
         else
         {
           l = i;
-          if (m_eivalues.coeff(i).imag() == 0.0)
+          if (m_eivalues.coeff(i).imag() == Scalar(0))
           {
-            if (w != 0.0)
+            if (w != Scalar(0))
               m_matT.coeffRef(i,n) = -r / w;
             else
               m_matT.coeffRef(i,n) = -r / (eps * norm);
@@ -514,19 +557,19 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
       }
       else
       {
-        std::complex<Scalar> cc = cdiv<Scalar>(0.0,-m_matT.coeff(n-1,n),m_matT.coeff(n-1,n-1)-p,q);
+        std::complex<Scalar> cc = cdiv<Scalar>(Scalar(0),-m_matT.coeff(n-1,n),m_matT.coeff(n-1,n-1)-p,q);
         m_matT.coeffRef(n-1,n-1) = numext::real(cc);
         m_matT.coeffRef(n-1,n) = numext::imag(cc);
       }
-      m_matT.coeffRef(n,n-1) = 0.0;
-      m_matT.coeffRef(n,n) = 1.0;
+      m_matT.coeffRef(n,n-1) = Scalar(0);
+      m_matT.coeffRef(n,n) = Scalar(1);
       for (Index i = n-2; i >= 0; i--)
       {
         Scalar ra = m_matT.row(i).segment(l, n-l+1).dot(m_matT.col(n-1).segment(l, n-l+1));
         Scalar sa = m_matT.row(i).segment(l, n-l+1).dot(m_matT.col(n).segment(l, n-l+1));
         Scalar w = m_matT.coeff(i,i) - p;
 
-        if (m_eivalues.coeff(i).imag() < 0.0)
+        if (m_eivalues.coeff(i).imag() < Scalar(0))
         {
           lastw = w;
           lastra = ra;
@@ -548,7 +591,7 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
             Scalar y = m_matT.coeff(i+1,i);
             Scalar vr = (m_eivalues.coeff(i).real() - p) * (m_eivalues.coeff(i).real() - p) + m_eivalues.coeff(i).imag() * m_eivalues.coeff(i).imag() - q * q;
             Scalar vi = (m_eivalues.coeff(i).real() - p) * Scalar(2) * q;
-            if ((vr == 0.0) && (vi == 0.0))
+            if ((vr == Scalar(0)) && (vi == Scalar(0)))
               vr = eps * norm * (abs(w) + abs(q) + abs(x) + abs(y) + abs(lastw));
 
             std::complex<Scalar> cc = cdiv(x*lastra-lastw*ra+q*sa,x*lastsa-lastw*sa-q*ra,vr,vi);
@@ -568,8 +611,7 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
           }
 
           // Overflow control
-          using std::max;
-          Scalar t = (max)(abs(m_matT.coeff(i,n-1)),abs(m_matT.coeff(i,n)));
+          Scalar t = numext::maxi<Scalar>(abs(m_matT.coeff(i,n-1)),abs(m_matT.coeff(i,n)));
           if ((eps * t) * t > Scalar(1))
             m_matT.block(i, n-1, size-i, 2) /= t;
 
@@ -581,7 +623,7 @@ void EigenSolver<MatrixType>::doComputeEigenvectors()
     }
     else
     {
-      eigen_assert(0 && "Internal bug in EigenSolver"); // this should not happen
+      eigen_assert(0 && "Internal bug in EigenSolver (INF or NaN has not been detected)"); // this should not happen
     }
   }
 
diff --git a/nuparu/include/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/nuparu/include/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
index dc240e13..a9d6790d 100644
--- a/nuparu/include/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
+++ b/nuparu/include/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
@@ -72,7 +72,7 @@ template<typename _MatrixType> class GeneralizedEigenSolver
     /** \brief Scalar type for matrices of type #MatrixType. */
     typedef typename MatrixType::Scalar Scalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
 
     /** \brief Complex scalar type for #MatrixType. 
       *
@@ -122,7 +122,7 @@ template<typename _MatrixType> class GeneralizedEigenSolver
       * according to the specified problem \a size.
       * \sa GeneralizedEigenSolver()
       */
-    GeneralizedEigenSolver(Index size)
+    explicit GeneralizedEigenSolver(Index size)
       : m_eivec(size, size),
         m_alphas(size),
         m_betas(size),
@@ -263,6 +263,13 @@ template<typename _MatrixType> class GeneralizedEigenSolver
     }
 
   protected:
+    
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+      EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsComplex, NUMERIC_TYPE_MUST_BE_REAL);
+    }
+    
     MatrixType m_eivec;
     ComplexVectorType m_alphas;
     VectorType m_betas;
@@ -290,6 +297,8 @@ template<typename MatrixType>
 GeneralizedEigenSolver<MatrixType>&
 GeneralizedEigenSolver<MatrixType>::compute(const MatrixType& A, const MatrixType& B, bool computeEigenvectors)
 {
+  check_template_parameters();
+  
   using std::sqrt;
   using std::abs;
   eigen_assert(A.cols() == A.rows() && B.cols() == A.rows() && B.cols() == B.rows());
diff --git a/nuparu/include/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h b/nuparu/include/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h
index 07bf1ea0..5f6bb828 100644
--- a/nuparu/include/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h
+++ b/nuparu/include/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h
@@ -50,7 +50,6 @@ class GeneralizedSelfAdjointEigenSolver : public SelfAdjointEigenSolver<_MatrixT
     typedef SelfAdjointEigenSolver<_MatrixType> Base;
   public:
 
-    typedef typename Base::Index Index;
     typedef _MatrixType MatrixType;
 
     /** \brief Default constructor for fixed-size matrices.
@@ -74,7 +73,7 @@ class GeneralizedSelfAdjointEigenSolver : public SelfAdjointEigenSolver<_MatrixT
       *
       * \sa compute() for an example
       */
-    GeneralizedSelfAdjointEigenSolver(Index size)
+    explicit GeneralizedSelfAdjointEigenSolver(Index size)
         : Base(size)
     {}
 
diff --git a/nuparu/include/Eigen/src/Eigenvalues/HessenbergDecomposition.h b/nuparu/include/Eigen/src/Eigenvalues/HessenbergDecomposition.h
index 3db0c010..f647f69b 100644
--- a/nuparu/include/Eigen/src/Eigenvalues/HessenbergDecomposition.h
+++ b/nuparu/include/Eigen/src/Eigenvalues/HessenbergDecomposition.h
@@ -71,7 +71,7 @@ template<typename _MatrixType> class HessenbergDecomposition
 
     /** \brief Scalar type for matrices of type #MatrixType. */
     typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
 
     /** \brief Type for vector of Householder coefficients.
       *
@@ -97,7 +97,7 @@ template<typename _MatrixType> class HessenbergDecomposition
       *
       * \sa compute() for an example.
       */
-    HessenbergDecomposition(Index size = Size==Dynamic ? 2 : Size)
+    explicit HessenbergDecomposition(Index size = Size==Dynamic ? 2 : Size)
       : m_matrix(size,size),
         m_temp(size),
         m_isInitialized(false)
@@ -115,8 +115,9 @@ template<typename _MatrixType> class HessenbergDecomposition
       *
       * \sa matrixH() for an example.
       */
-    HessenbergDecomposition(const MatrixType& matrix)
-      : m_matrix(matrix),
+    template<typename InputType>
+    explicit HessenbergDecomposition(const EigenBase<InputType>& matrix)
+      : m_matrix(matrix.derived()),
         m_temp(matrix.rows()),
         m_isInitialized(false)
     {
@@ -147,9 +148,10 @@ template<typename _MatrixType> class HessenbergDecomposition
       * Example: \include HessenbergDecomposition_compute.cpp
       * Output: \verbinclude HessenbergDecomposition_compute.out
       */
-    HessenbergDecomposition& compute(const MatrixType& matrix)
+    template<typename InputType>
+    HessenbergDecomposition& compute(const EigenBase<InputType>& matrix)
     {
-      m_matrix = matrix;
+      m_matrix = matrix.derived();
       if(matrix.rows()<2)
       {
         m_isInitialized = true;
@@ -337,7 +339,6 @@ namespace internal {
 template<typename MatrixType> struct HessenbergDecompositionMatrixHReturnType
 : public ReturnByValue<HessenbergDecompositionMatrixHReturnType<MatrixType> >
 {
-    typedef typename MatrixType::Index Index;
   public:
     /** \brief Constructor.
       *
diff --git a/nuparu/include/Eigen/src/Eigenvalues/RealQZ.h b/nuparu/include/Eigen/src/Eigenvalues/RealQZ.h
index 5706eeeb..a62071d4 100644
--- a/nuparu/include/Eigen/src/Eigenvalues/RealQZ.h
+++ b/nuparu/include/Eigen/src/Eigenvalues/RealQZ.h
@@ -67,7 +67,7 @@ namespace Eigen {
       };
       typedef typename MatrixType::Scalar Scalar;
       typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
-      typedef typename MatrixType::Index Index;
+      typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
 
       typedef Matrix<ComplexScalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> EigenvalueType;
       typedef Matrix<Scalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> ColumnVectorType;
@@ -83,7 +83,7 @@ namespace Eigen {
        *
        * \sa compute() for an example.
        */
-      RealQZ(Index size = RowsAtCompileTime==Dynamic ? 1 : RowsAtCompileTime) : 
+      explicit RealQZ(Index size = RowsAtCompileTime==Dynamic ? 1 : RowsAtCompileTime) :
         m_S(size, size),
         m_T(size, size),
         m_Q(size, size),
@@ -240,10 +240,10 @@ namespace Eigen {
             m_S.coeffRef(i,j) = Scalar(0.0);
             m_S.rightCols(dim-j-1).applyOnTheLeft(i-1,i,G.adjoint());
             m_T.rightCols(dim-i+1).applyOnTheLeft(i-1,i,G.adjoint());
+            // update Q
+            if (m_computeQZ)
+              m_Q.applyOnTheRight(i-1,i,G);
           }
-          // update Q
-          if (m_computeQZ)
-            m_Q.applyOnTheRight(i-1,i,G);
           // kill T(i,i-1)
           if(m_T.coeff(i,i-1)!=Scalar(0))
           {
@@ -251,10 +251,10 @@ namespace Eigen {
             m_T.coeffRef(i,i-1) = Scalar(0.0);
             m_S.applyOnTheRight(i,i-1,G);
             m_T.topRows(i).applyOnTheRight(i,i-1,G);
+            // update Z
+            if (m_computeQZ)
+              m_Z.applyOnTheLeft(i,i-1,G.adjoint());
           }
-          // update Z
-          if (m_computeQZ)
-            m_Z.applyOnTheLeft(i,i-1,G.adjoint());
         }
       }
     }
@@ -276,7 +276,7 @@ namespace Eigen {
 
   /** \internal Look for single small sub-diagonal element S(res, res-1) and return res (or 0) */
   template<typename MatrixType>
-    inline typename MatrixType::Index RealQZ<MatrixType>::findSmallSubdiagEntry(Index iu)
+    inline Index RealQZ<MatrixType>::findSmallSubdiagEntry(Index iu)
     {
       using std::abs;
       Index res = iu;
@@ -294,7 +294,7 @@ namespace Eigen {
 
   /** \internal Look for single small diagonal element T(res, res) for res between f and l, and return res (or f-1)  */
   template<typename MatrixType>
-    inline typename MatrixType::Index RealQZ<MatrixType>::findSmallDiagEntry(Index f, Index l)
+    inline Index RealQZ<MatrixType>::findSmallDiagEntry(Index f, Index l)
     {
       using std::abs;
       Index res = l;
@@ -313,10 +313,10 @@ namespace Eigen {
       using std::abs;
       using std::sqrt;
       const Index dim=m_S.cols();
-      if (abs(m_S.coeff(i+1,i)==Scalar(0)))
+      if (abs(m_S.coeff(i+1,i))==Scalar(0))
         return;
-      Index z = findSmallDiagEntry(i,i+1);
-      if (z==i-1)
+      Index j = findSmallDiagEntry(i,i+1);
+      if (j==i-1)
       {
         // block of (S T^{-1})
         Matrix2s STi = m_T.template block<2,2>(i,i).template triangularView<Upper>().
@@ -352,7 +352,7 @@ namespace Eigen {
       }
       else
       {
-        pushDownZero(z,i,i+1);
+        pushDownZero(j,i,i+1);
       }
     }
 
diff --git a/nuparu/include/Eigen/src/Eigenvalues/RealSchur.h b/nuparu/include/Eigen/src/Eigenvalues/RealSchur.h
index 64d13634..f4ded69b 100644
--- a/nuparu/include/Eigen/src/Eigenvalues/RealSchur.h
+++ b/nuparu/include/Eigen/src/Eigenvalues/RealSchur.h
@@ -64,7 +64,7 @@ template<typename _MatrixType> class RealSchur
     };
     typedef typename MatrixType::Scalar Scalar;
     typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
-    typedef typename MatrixType::Index Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
 
     typedef Matrix<ComplexScalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> EigenvalueType;
     typedef Matrix<Scalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> ColumnVectorType;
@@ -80,7 +80,7 @@ template<typename _MatrixType> class RealSchur
       *
       * \sa compute() for an example.
       */
-    RealSchur(Index size = RowsAtCompileTime==Dynamic ? 1 : RowsAtCompileTime)
+    explicit RealSchur(Index size = RowsAtCompileTime==Dynamic ? 1 : RowsAtCompileTime)
             : m_matT(size, size),
               m_matU(size, size),
               m_workspaceVector(size),
@@ -100,7 +100,8 @@ template<typename _MatrixType> class RealSchur
       * Example: \include RealSchur_RealSchur_MatrixType.cpp
       * Output: \verbinclude RealSchur_RealSchur_MatrixType.out
       */
-    RealSchur(const MatrixType& matrix, bool computeU = true)
+    template<typename InputType>
+    explicit RealSchur(const EigenBase<InputType>& matrix, bool computeU = true)
             : m_matT(matrix.rows(),matrix.cols()),
               m_matU(matrix.rows(),matrix.cols()),
               m_workspaceVector(matrix.rows()),
@@ -109,7 +110,7 @@ template<typename _MatrixType> class RealSchur
               m_matUisUptodate(false),
               m_maxIters(-1)
     {
-      compute(matrix, computeU);
+      compute(matrix.derived(), computeU);
     }
 
     /** \brief Returns the orthogonal matrix in the Schur decomposition. 
@@ -165,7 +166,8 @@ template<typename _MatrixType> class RealSchur
       *
       * \sa compute(const MatrixType&, bool, Index)
       */
-    RealSchur& compute(const MatrixType& matrix, bool computeU = true);
+    template<typename InputType>
+    RealSchur& compute(const EigenBase<InputType>& matrix, bool computeU = true);
 
     /** \brief Computes Schur decomposition of a Hessenberg matrix H = Z T Z^T
      *  \param[in] matrixH Matrix in Hessenberg form H
@@ -234,7 +236,7 @@ template<typename _MatrixType> class RealSchur
     typedef Matrix<Scalar,3,1> Vector3s;
 
     Scalar computeNormOfT();
-    Index findSmallSubdiagEntry(Index iu, const Scalar& norm);
+    Index findSmallSubdiagEntry(Index iu);
     void splitOffTwoRows(Index iu, bool computeU, const Scalar& exshift);
     void computeShift(Index iu, Index iter, Scalar& exshift, Vector3s& shiftInfo);
     void initFrancisQRStep(Index il, Index iu, const Vector3s& shiftInfo, Index& im, Vector3s& firstHouseholderVector);
@@ -243,7 +245,8 @@ template<typename _MatrixType> class RealSchur
 
 
 template<typename MatrixType>
-RealSchur<MatrixType>& RealSchur<MatrixType>::compute(const MatrixType& matrix, bool computeU)
+template<typename InputType>
+RealSchur<MatrixType>& RealSchur<MatrixType>::compute(const EigenBase<InputType>& matrix, bool computeU)
 {
   eigen_assert(matrix.cols() == matrix.rows());
   Index maxIters = m_maxIters;
@@ -251,7 +254,7 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::compute(const MatrixType& matrix,
     maxIters = m_maxIterationsPerRow * matrix.rows();
 
   // Step 1. Reduce to Hessenberg form
-  m_hess.compute(matrix);
+  m_hess.compute(matrix.derived());
 
   // Step 2. Reduce to real Schur form  
   computeFromHessenberg(m_hess.matrixH(), m_hess.matrixQ(), computeU);
@@ -286,7 +289,7 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMa
   {
     while (iu >= 0)
     {
-      Index il = findSmallSubdiagEntry(iu, norm);
+      Index il = findSmallSubdiagEntry(iu);
 
       // Check for convergence
       if (il == iu) // One root found
@@ -343,16 +346,14 @@ inline typename MatrixType::Scalar RealSchur<MatrixType>::computeNormOfT()
 
 /** \internal Look for single small sub-diagonal element and returns its index */
 template<typename MatrixType>
-inline typename MatrixType::Index RealSchur<MatrixType>::findSmallSubdiagEntry(Index iu, const Scalar& norm)
+inline Index RealSchur<MatrixType>::findSmallSubdiagEntry(Index iu)
 {
   using std::abs;
   Index res = iu;
   while (res > 0)
   {
     Scalar s = abs(m_matT.coeff(res-1,res-1)) + abs(m_matT.coeff(res,res));
-    if (s == 0.0)
-      s = norm;
-    if (abs(m_matT.coeff(res,res-1)) < NumTraits<Scalar>::epsilon() * s)
+    if (abs(m_matT.coeff(res,res-1)) <= NumTraits<Scalar>::epsilon() * s)
       break;
     res--;
   }
@@ -457,9 +458,7 @@ inline void RealSchur<MatrixType>::initFrancisQRStep(Index il, Index iu, const V
     const Scalar lhs = m_matT.coeff(im,im-1) * (abs(v.coeff(1)) + abs(v.coeff(2)));
     const Scalar rhs = v.coeff(0) * (abs(m_matT.coeff(im-1,im-1)) + abs(Tmm) + abs(m_matT.coeff(im+1,im+1)));
     if (abs(lhs) < NumTraits<Scalar>::epsilon() * rhs)
-    {
       break;
-    }
   }
 }
 
diff --git a/nuparu/include/Eigen/src/Eigenvalues/RealSchur_MKL.h b/nuparu/include/Eigen/src/Eigenvalues/RealSchur_MKL.h
index ad973646..e8092640 100644
--- a/nuparu/include/Eigen/src/Eigenvalues/RealSchur_MKL.h
+++ b/nuparu/include/Eigen/src/Eigenvalues/RealSchur_MKL.h
@@ -40,18 +40,13 @@ namespace Eigen {
 /** \internal Specialization for the data types supported by MKL */
 
 #define EIGEN_MKL_SCHUR_REAL(EIGTYPE, MKLTYPE, MKLPREFIX, MKLPREFIX_U, EIGCOLROW, MKLCOLROW) \
-template<> inline \
+template<> template<typename InputType> inline \
 RealSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >& \
-RealSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW>& matrix, bool computeU) \
+RealSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const EigenBase<InputType>& matrix, bool computeU) \
 { \
-  typedef Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> MatrixType; \
-  typedef MatrixType::Scalar Scalar; \
-  typedef MatrixType::RealScalar RealScalar; \
-\
   eigen_assert(matrix.cols() == matrix.rows()); \
 \
   lapack_int n = matrix.cols(), sdim, info; \
-  lapack_int lda = matrix.outerStride(); \
   lapack_int matrix_order = MKLCOLROW; \
   char jobvs, sort='N'; \
   LAPACK_##MKLPREFIX_U##_SELECT2 select = 0; \
@@ -59,6 +54,7 @@ RealSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const Matrix<E
   m_matU.resize(n, n); \
   lapack_int ldvs  = m_matU.outerStride(); \
   m_matT = matrix; \
+  lapack_int lda = m_matT.outerStride(); \
   Matrix<EIGTYPE, Dynamic, Dynamic> wr, wi; \
   wr.resize(n, 1); wi.resize(n, 1); \
   info = LAPACKE_##MKLPREFIX##gees( matrix_order, jobvs, sort, select, n, (MKLTYPE*)m_matT.data(), lda, &sdim, (MKLTYPE*)wr.data(), (MKLTYPE*)wi.data(), (MKLTYPE*)m_matU.data(), ldvs ); \
diff --git a/nuparu/include/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/nuparu/include/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
index 3993046a..c6455509 100644
--- a/nuparu/include/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
+++ b/nuparu/include/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
@@ -20,6 +20,8 @@ class GeneralizedSelfAdjointEigenSolver;
 
 namespace internal {
 template<typename SolverType,int Size,bool IsComplex> struct direct_selfadjoint_eigenvalues;
+template<typename MatrixType, typename DiagType, typename SubDiagType>
+ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec);
 }
 
 /** \eigenvalues_module \ingroup Eigenvalues_Module
@@ -79,7 +81,9 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
     
     /** \brief Scalar type for matrices of type \p _MatrixType. */
     typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
+    
+    typedef Matrix<Scalar,Size,Size,ColMajor,MaxColsAtCompileTime,MaxColsAtCompileTime> EigenvectorsType;
 
     /** \brief Real scalar type for \p _MatrixType.
       *
@@ -98,6 +102,7 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
       */
     typedef typename internal::plain_col_type<MatrixType, RealScalar>::type RealVectorType;
     typedef Tridiagonalization<MatrixType> TridiagonalizationType;
+    typedef typename TridiagonalizationType::SubDiagonalType SubDiagonalType;
 
     /** \brief Default constructor for fixed-size matrices.
       *
@@ -109,6 +114,7 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
       * Example: \include SelfAdjointEigenSolver_SelfAdjointEigenSolver.cpp
       * Output: \verbinclude SelfAdjointEigenSolver_SelfAdjointEigenSolver.out
       */
+    EIGEN_DEVICE_FUNC
     SelfAdjointEigenSolver()
         : m_eivec(),
           m_eivalues(),
@@ -128,7 +134,8 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
       *
       * \sa compute() for an example
       */
-    SelfAdjointEigenSolver(Index size)
+    EIGEN_DEVICE_FUNC
+    explicit SelfAdjointEigenSolver(Index size)
         : m_eivec(size, size),
           m_eivalues(size),
           m_subdiag(size > 1 ? size - 1 : 1),
@@ -150,13 +157,15 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
       *
       * \sa compute(const MatrixType&, int)
       */
-    SelfAdjointEigenSolver(const MatrixType& matrix, int options = ComputeEigenvectors)
+    template<typename InputType>
+    EIGEN_DEVICE_FUNC
+    explicit SelfAdjointEigenSolver(const EigenBase<InputType>& matrix, int options = ComputeEigenvectors)
       : m_eivec(matrix.rows(), matrix.cols()),
         m_eivalues(matrix.cols()),
         m_subdiag(matrix.rows() > 1 ? matrix.rows() - 1 : 1),
         m_isInitialized(false)
     {
-      compute(matrix, options);
+      compute(matrix.derived(), options);
     }
 
     /** \brief Computes eigendecomposition of given matrix.
@@ -189,24 +198,44 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
       *
       * \sa SelfAdjointEigenSolver(const MatrixType&, int)
       */
-    SelfAdjointEigenSolver& compute(const MatrixType& matrix, int options = ComputeEigenvectors);
+    template<typename InputType>
+    EIGEN_DEVICE_FUNC
+    SelfAdjointEigenSolver& compute(const EigenBase<InputType>& matrix, int options = ComputeEigenvectors);
     
-    /** \brief Computes eigendecomposition of given matrix using a direct algorithm
+    /** \brief Computes eigendecomposition of given matrix using a closed-form algorithm
       *
       * This is a variant of compute(const MatrixType&, int options) which
       * directly solves the underlying polynomial equation.
       * 
-      * Currently only 3x3 matrices for which the sizes are known at compile time are supported (e.g., Matrix3d).
+      * Currently only 2x2 and 3x3 matrices for which the sizes are known at compile time are supported (e.g., Matrix3d).
       * 
-      * This method is usually significantly faster than the QR algorithm
+      * This method is usually significantly faster than the QR iterative algorithm
       * but it might also be less accurate. It is also worth noting that
       * for 3x3 matrices it involves trigonometric operations which are
       * not necessarily available for all scalar types.
+      * 
+      * For the 3x3 case, we observed the following worst case relative error regarding the eigenvalues:
+      *   - double: 1e-8
+      *   - float:  1e-3
       *
       * \sa compute(const MatrixType&, int options)
       */
+    EIGEN_DEVICE_FUNC
     SelfAdjointEigenSolver& computeDirect(const MatrixType& matrix, int options = ComputeEigenvectors);
 
+    /**
+      *\brief Computes the eigen decomposition from a tridiagonal symmetric matrix
+      *
+      * \param[in] diag The vector containing the diagonal of the matrix.
+      * \param[in] subdiag The subdiagonal of the matrix.
+      * \returns Reference to \c *this
+      *
+      * This function assumes that the matrix has been reduced to tridiagonal form.
+      *
+      * \sa compute(const MatrixType&, int) for more information
+      */
+    SelfAdjointEigenSolver& computeFromTridiagonal(const RealVectorType& diag, const SubDiagonalType& subdiag , int options=ComputeEigenvectors);
+
     /** \brief Returns the eigenvectors of given matrix.
       *
       * \returns  A const reference to the matrix whose columns are the eigenvectors.
@@ -225,7 +254,8 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
       *
       * \sa eigenvalues()
       */
-    const MatrixType& eigenvectors() const
+    EIGEN_DEVICE_FUNC
+    const EigenvectorsType& eigenvectors() const
     {
       eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
       eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
@@ -247,6 +277,7 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
       *
       * \sa eigenvectors(), MatrixBase::eigenvalues()
       */
+    EIGEN_DEVICE_FUNC
     const RealVectorType& eigenvalues() const
     {
       eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
@@ -271,6 +302,7 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
       * \sa operatorInverseSqrt(),
       *     \ref MatrixFunctions_Module "MatrixFunctions Module"
       */
+    EIGEN_DEVICE_FUNC
     MatrixType operatorSqrt() const
     {
       eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
@@ -296,6 +328,7 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
       * \sa operatorSqrt(), MatrixBase::inverse(),
       *     \ref MatrixFunctions_Module "MatrixFunctions Module"
       */
+    EIGEN_DEVICE_FUNC
     MatrixType operatorInverseSqrt() const
     {
       eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
@@ -307,6 +340,7 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
       *
       * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
       */
+    EIGEN_DEVICE_FUNC
     ComputationInfo info() const
     {
       eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
@@ -320,38 +354,13 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
       */
     static const int m_maxIterations = 30;
 
-    #ifdef EIGEN2_SUPPORT
-    SelfAdjointEigenSolver(const MatrixType& matrix, bool computeEigenvectors)
-      : m_eivec(matrix.rows(), matrix.cols()),
-        m_eivalues(matrix.cols()),
-        m_subdiag(matrix.rows() > 1 ? matrix.rows() - 1 : 1),
-        m_isInitialized(false)
-    {
-      compute(matrix, computeEigenvectors);
-    }
-    
-    SelfAdjointEigenSolver(const MatrixType& matA, const MatrixType& matB, bool computeEigenvectors = true)
-        : m_eivec(matA.cols(), matA.cols()),
-          m_eivalues(matA.cols()),
-          m_subdiag(matA.cols() > 1 ? matA.cols() - 1 : 1),
-          m_isInitialized(false)
+  protected:
+    static void check_template_parameters()
     {
-      static_cast<GeneralizedSelfAdjointEigenSolver<MatrixType>*>(this)->compute(matA, matB, computeEigenvectors ? ComputeEigenvectors : EigenvaluesOnly);
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
     }
     
-    void compute(const MatrixType& matrix, bool computeEigenvectors)
-    {
-      compute(matrix, computeEigenvectors ? ComputeEigenvectors : EigenvaluesOnly);
-    }
-
-    void compute(const MatrixType& matA, const MatrixType& matB, bool computeEigenvectors = true)
-    {
-      compute(matA, matB, computeEigenvectors ? ComputeEigenvectors : EigenvaluesOnly);
-    }
-    #endif // EIGEN2_SUPPORT
-
-  protected:
-    MatrixType m_eivec;
+    EigenvectorsType m_eivec;
     RealVectorType m_eivalues;
     typename TridiagonalizationType::SubDiagonalType m_subdiag;
     ComputationInfo m_info;
@@ -359,6 +368,7 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
     bool m_eigenvectorsOk;
 };
 
+namespace internal {
 /** \internal
   *
   * \eigenvalues_module \ingroup Eigenvalues_Module
@@ -375,15 +385,21 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
   * Implemented from Golub's "Matrix Computations", algorithm 8.3.2:
   * "implicit symmetric QR step with Wilkinson shift"
   */
-namespace internal {
 template<int StorageOrder,typename RealScalar, typename Scalar, typename Index>
+EIGEN_DEVICE_FUNC
 static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index start, Index end, Scalar* matrixQ, Index n);
 }
 
 template<typename MatrixType>
+template<typename InputType>
+EIGEN_DEVICE_FUNC
 SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>
-::compute(const MatrixType& matrix, int options)
+::compute(const EigenBase<InputType>& a_matrix, int options)
 {
+  check_template_parameters();
+  
+  const InputType &matrix(a_matrix.derived());
+  
   using std::abs;
   eigen_assert(matrix.cols() == matrix.rows());
   eigen_assert((options&~(EigVecMask|GenEigMask))==0
@@ -395,7 +411,7 @@ ::compute(const MatrixType& matrix, int options)
 
   if(n==1)
   {
-    m_eivalues.coeffRef(0,0) = numext::real(matrix.coeff(0,0));
+    m_eivalues.coeffRef(0,0) = numext::real(matrix(0,0));
     if(computeEigenvectors)
       m_eivec.setOnes(n,n);
     m_info = Success;
@@ -406,7 +422,7 @@ ::compute(const MatrixType& matrix, int options)
 
   // declare some aliases
   RealVectorType& diag = m_eivalues;
-  MatrixType& mat = m_eivec;
+  EigenvectorsType& mat = m_eivec;
 
   // map the matrix coefficients to [-1:1] to avoid over- and underflow.
   mat = matrix.template triangularView<Lower>();
@@ -415,19 +431,72 @@ ::compute(const MatrixType& matrix, int options)
   mat.template triangularView<Lower>() /= scale;
   m_subdiag.resize(n-1);
   internal::tridiagonalization_inplace(mat, diag, m_subdiag, computeEigenvectors);
+
+  m_info = internal::computeFromTridiagonal_impl(diag, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);
   
+  // scale back the eigen values
+  m_eivalues *= scale;
+
+  m_isInitialized = true;
+  m_eigenvectorsOk = computeEigenvectors;
+  return *this;
+}
+
+// Eigendecomposition of a symmetric tridiagonal matrix given by its diagonal \a diag and subdiagonal \a subdiag.
+template<typename MatrixType>
+SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>
+::computeFromTridiagonal(const RealVectorType& diag, const SubDiagonalType& subdiag , int options)
+{
+  //TODO : Add an option to scale the values beforehand
+  bool computeEigenvectors = (options&ComputeEigenvectors)==ComputeEigenvectors;
+  // Work on member copies: the helper below overwrites its diag/subdiag arguments in place.
+  m_eivalues = diag;
+  m_subdiag = subdiag;
+  if (computeEigenvectors)
+    m_eivec.setIdentity(diag.size(), diag.size());
+  // The helper is declared in namespace internal, so it has to be qualified here exactly as compute() does.
+  m_info = internal::computeFromTridiagonal_impl(m_eivalues, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);
+
+  m_isInitialized = true;
+  m_eigenvectorsOk = computeEigenvectors;
+  return *this;
+}
+
+namespace internal {
+/**
+  * \internal
+  * \brief Compute the eigendecomposition from a tridiagonal matrix
+  *
+  * \param[in,out] diag : On input, the diagonal of the matrix, on output the eigenvalues
+  * \param[in] subdiag : The subdiagonal part of the matrix.
+  * \param[in] maxIterations : The iteration budget; the solver gives up once more than maxIterations * diag.size() QR steps were spent.
+  * \param[out] eivec : The matrix to store the eigenvectors... if needed. allocated on input
+  * \returns \c Success or \c NoConvergence
+  */
+template<typename MatrixType, typename DiagType, typename SubDiagType>
+ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec)
+{
+  using std::abs;
+
+  ComputationInfo info;
+  typedef typename MatrixType::Scalar Scalar;
+
+  Index n = diag.size();
   Index end = n-1;
   Index start = 0;
   Index iter = 0; // total number of iterations
-
+  
+  typedef typename DiagType::RealScalar RealScalar;
+  const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
+  
   while (end>0)
   {
     for (Index i = start; i<end; ++i)
-      if (internal::isMuchSmallerThan(abs(m_subdiag[i]),(abs(diag[i])+abs(diag[i+1]))))
-        m_subdiag[i] = 0;
+      if (internal::isMuchSmallerThan(abs(subdiag[i]),(abs(diag[i])+abs(diag[i+1]))) || abs(subdiag[i]) <= considerAsZero)
+        subdiag[i] = 0;
 
     // find the largest unreduced block
-    while (end>0 && m_subdiag[end-1]==0)
+    while (end>0 && subdiag[end-1]==0)
     {
       end--;
     }
@@ -436,51 +505,42 @@ ::compute(const MatrixType& matrix, int options)
 
     // if we spent too many iterations, we give up
     iter++;
-    if(iter > m_maxIterations * n) break;
+    if(iter > maxIterations * n) break;
 
     start = end - 1;
-    while (start>0 && m_subdiag[start-1]!=0)
+    while (start>0 && subdiag[start-1]!=0)
       start--;
 
-    internal::tridiagonal_qr_step<MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor>(diag.data(), m_subdiag.data(), start, end, computeEigenvectors ? m_eivec.data() : (Scalar*)0, n);
+    internal::tridiagonal_qr_step<MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor>(diag.data(), subdiag.data(), start, end, computeEigenvectors ? eivec.data() : (Scalar*)0, n);
   }
-
-  if (iter <= m_maxIterations * n)
-    m_info = Success;
+  if (iter <= maxIterations * n)
+    info = Success;
   else
-    m_info = NoConvergence;
+    info = NoConvergence;
 
   // Sort eigenvalues and corresponding vectors.
   // TODO make the sort optional ?
   // TODO use a better sort algorithm !!
-  if (m_info == Success)
+  if (info == Success)
   {
     for (Index i = 0; i < n-1; ++i)
     {
       Index k;
-      m_eivalues.segment(i,n-i).minCoeff(&k);
+      diag.segment(i,n-i).minCoeff(&k);
       if (k > 0)
       {
-        std::swap(m_eivalues[i], m_eivalues[k+i]);
+        std::swap(diag[i], diag[k+i]);
         if(computeEigenvectors)
-          m_eivec.col(i).swap(m_eivec.col(k+i));
+          eivec.col(i).swap(eivec.col(k+i));
       }
     }
   }
-  
-  // scale back the eigen values
-  m_eivalues *= scale;
-
-  m_isInitialized = true;
-  m_eigenvectorsOk = computeEigenvectors;
-  return *this;
+  return info;
 }
-
-
-namespace internal {
   
 template<typename SolverType,int Size,bool IsComplex> struct direct_selfadjoint_eigenvalues
 {
+  EIGEN_DEVICE_FUNC
   static inline void run(SolverType& eig, const typename SolverType::MatrixType& A, int options)
   { eig.compute(A,options); }
 };
@@ -490,13 +550,20 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
   typedef typename SolverType::MatrixType MatrixType;
   typedef typename SolverType::RealVectorType VectorType;
   typedef typename SolverType::Scalar Scalar;
+  typedef typename SolverType::EigenvectorsType EigenvectorsType;
   
+
+  /** \internal
+   * Computes the roots of the characteristic polynomial of \a m.
+   * For numerical stability m.trace() should be near zero and to avoid over- or underflow m should be normalized.
+   */
+  EIGEN_DEVICE_FUNC
   static inline void computeRoots(const MatrixType& m, VectorType& roots)
   {
-    using std::sqrt;
-    using std::atan2;
-    using std::cos;
-    using std::sin;
+    EIGEN_USING_STD_MATH(sqrt)
+    EIGEN_USING_STD_MATH(atan2)
+    EIGEN_USING_STD_MATH(cos)
+    EIGEN_USING_STD_MATH(sin)
     const Scalar s_inv3 = Scalar(1.0)/Scalar(3.0);
     const Scalar s_sqrt3 = sqrt(Scalar(3.0));
 
@@ -510,148 +577,123 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
     // Construct the parameters used in classifying the roots of the equation
     // and in solving the equation for the roots in closed form.
     Scalar c2_over_3 = c2*s_inv3;
-    Scalar a_over_3 = (c1 - c2*c2_over_3)*s_inv3;
-    if (a_over_3 > Scalar(0))
-      a_over_3 = Scalar(0);
+    Scalar a_over_3 = (c2*c2_over_3 - c1)*s_inv3;
+    a_over_3 = numext::maxi(a_over_3, Scalar(0));
 
     Scalar half_b = Scalar(0.5)*(c0 + c2_over_3*(Scalar(2)*c2_over_3*c2_over_3 - c1));
 
-    Scalar q = half_b*half_b + a_over_3*a_over_3*a_over_3;
-    if (q > Scalar(0))
-      q = Scalar(0);
+    Scalar q = a_over_3*a_over_3*a_over_3 - half_b*half_b;
+    q = numext::maxi(q, Scalar(0));
 
     // Compute the eigenvalues by solving for the roots of the polynomial.
-    Scalar rho = sqrt(-a_over_3);
-    Scalar theta = atan2(sqrt(-q),half_b)*s_inv3;
+    Scalar rho = sqrt(a_over_3);
+    Scalar theta = atan2(sqrt(q),half_b)*s_inv3;  // since sqrt(q) > 0, atan2 is in [0, pi] and theta is in [0, pi/3]
     Scalar cos_theta = cos(theta);
     Scalar sin_theta = sin(theta);
-    roots(0) = c2_over_3 + Scalar(2)*rho*cos_theta;
-    roots(1) = c2_over_3 - rho*(cos_theta + s_sqrt3*sin_theta);
-    roots(2) = c2_over_3 - rho*(cos_theta - s_sqrt3*sin_theta);
-
-    // Sort in increasing order.
-    if (roots(0) >= roots(1))
-      std::swap(roots(0),roots(1));
-    if (roots(1) >= roots(2))
-    {
-      std::swap(roots(1),roots(2));
-      if (roots(0) >= roots(1))
-        std::swap(roots(0),roots(1));
-    }
+    // roots are already sorted: the angles theta <= 2pi/3-theta <= theta+2pi/3 all lie in [0, pi], where cos is monotonically decreasing
+    roots(0) = c2_over_3 - rho*(cos_theta + s_sqrt3*sin_theta); // == c2_over_3 + 2*rho*cos(theta+2pi/3)
+    roots(1) = c2_over_3 - rho*(cos_theta - s_sqrt3*sin_theta); // == c2_over_3 + 2*rho*cos(2pi/3-theta)
+    roots(2) = c2_over_3 + Scalar(2)*rho*cos_theta;
   }
-  
+
+  EIGEN_DEVICE_FUNC
+  static inline bool extract_kernel(MatrixType& mat, Ref<VectorType> res, Ref<VectorType> representative)
+  {
+    using std::abs;
+    Index i0;
+    // Find non-zero column i0 (by construction, there must exist a non zero coefficient on the diagonal):
+    mat.diagonal().cwiseAbs().maxCoeff(&i0);
+    // mat.col(i0) is a good candidate for an orthogonal vector to the current eigenvector,
+    // so let's save it:
+    representative = mat.col(i0);
+    Scalar n0, n1;
+    VectorType c0, c1;
+    n0 = (c0 = representative.cross(mat.col((i0+1)%3))).squaredNorm();
+    n1 = (c1 = representative.cross(mat.col((i0+2)%3))).squaredNorm();
+    if(n0>n1) res = c0/std::sqrt(n0);
+    else      res = c1/std::sqrt(n1);
+
+    return true;
+  }
+
+  EIGEN_DEVICE_FUNC
   static inline void run(SolverType& solver, const MatrixType& mat, int options)
   {
-    using std::sqrt;
     eigen_assert(mat.cols() == 3 && mat.cols() == mat.rows());
     eigen_assert((options&~(EigVecMask|GenEigMask))==0
             && (options&EigVecMask)!=EigVecMask
             && "invalid option parameter");
     bool computeEigenvectors = (options&ComputeEigenvectors)==ComputeEigenvectors;
     
-    MatrixType& eivecs = solver.m_eivec;
+    EigenvectorsType& eivecs = solver.m_eivec;
     VectorType& eivals = solver.m_eivalues;
   
-    // map the matrix coefficients to [-1:1] to avoid over- and underflow.
-    Scalar scale = mat.cwiseAbs().maxCoeff();
-    MatrixType scaledMat = mat / scale;
+    // Shift the matrix to the mean eigenvalue and map the matrix coefficients to [-1:1] to avoid over- and underflow.
+    Scalar shift = mat.trace() / Scalar(3);
+    // TODO Avoid this copy. Currently it is necessary to suppress bogus values when determining maxCoeff and for computing the eigenvectors later
+    MatrixType scaledMat = mat.template selfadjointView<Lower>();
+    scaledMat.diagonal().array() -= shift;
+    Scalar scale = scaledMat.cwiseAbs().maxCoeff();
+    if(scale > 0) scaledMat /= scale;   // TODO for scale==0 we could save the remaining operations
 
     // compute the eigenvalues
     computeRoots(scaledMat,eivals);
 
-    // compute the eigen vectors
+    // compute the eigenvectors
     if(computeEigenvectors)
     {
-      Scalar safeNorm2 = Eigen::NumTraits<Scalar>::epsilon();
-      safeNorm2 *= safeNorm2;
       if((eivals(2)-eivals(0))<=Eigen::NumTraits<Scalar>::epsilon())
       {
+        // All three eigenvalues are numerically the same
         eivecs.setIdentity();
       }
       else
       {
-        scaledMat = scaledMat.template selfadjointView<Lower>();
         MatrixType tmp;
         tmp = scaledMat;
 
+        // Compute the eigenvector of the most distinct eigenvalue
         Scalar d0 = eivals(2) - eivals(1);
         Scalar d1 = eivals(1) - eivals(0);
-        int k =  d0 > d1 ? 2 : 0;
-        d0 = d0 > d1 ? d1 : d0;
-
-        tmp.diagonal().array () -= eivals(k);
-        VectorType cross;
-        Scalar n;
-        n = (cross = tmp.row(0).cross(tmp.row(1))).squaredNorm();
-
-        if(n>safeNorm2)
-          eivecs.col(k) = cross / sqrt(n);
-        else
+        Index k(0), l(2);
+        if(d0 > d1)
         {
-          n = (cross = tmp.row(0).cross(tmp.row(2))).squaredNorm();
-
-          if(n>safeNorm2)
-            eivecs.col(k) = cross / sqrt(n);
-          else
-          {
-            n = (cross = tmp.row(1).cross(tmp.row(2))).squaredNorm();
-
-            if(n>safeNorm2)
-              eivecs.col(k) = cross / sqrt(n);
-            else
-            {
-              // the input matrix and/or the eigenvaues probably contains some inf/NaN,
-              // => exit
-              // scale back to the original size.
-              eivals *= scale;
-
-              solver.m_info = NumericalIssue;
-              solver.m_isInitialized = true;
-              solver.m_eigenvectorsOk = computeEigenvectors;
-              return;
-            }
-          }
+          numext::swap(k,l);
+          d0 = d1;
         }
 
-        tmp = scaledMat;
-        tmp.diagonal().array() -= eivals(1);
+        // Compute the eigenvector of index k
+        {
+          tmp.diagonal().array () -= eivals(k);
+          // By construction, 'tmp' is of rank 2, and its kernel corresponds to the respective eigenvector.
+          extract_kernel(tmp, eivecs.col(k), eivecs.col(l));
+        }
 
-        if(d0<=Eigen::NumTraits<Scalar>::epsilon())
-          eivecs.col(1) = eivecs.col(k).unitOrthogonal();
+        // Compute eigenvector of index l
+        if(d0<=2*Eigen::NumTraits<Scalar>::epsilon()*d1)
+        {
+          // If d0 is too small, then the two other eigenvalues are numerically the same, and we only have to
+          // ortho-normalize the saved near-orthogonal vector: remove its component along col(k), then normalize.
+          eivecs.col(l) -= eivecs.col(k).dot(eivecs.col(l))*eivecs.col(k);
+          eivecs.col(l).normalize();
+        }
         else
         {
-          n = (cross = eivecs.col(k).cross(tmp.row(0).normalized())).squaredNorm();
-          if(n>safeNorm2)
-            eivecs.col(1) = cross / sqrt(n);
-          else
-          {
-            n = (cross = eivecs.col(k).cross(tmp.row(1))).squaredNorm();
-            if(n>safeNorm2)
-              eivecs.col(1) = cross / sqrt(n);
-            else
-            {
-              n = (cross = eivecs.col(k).cross(tmp.row(2))).squaredNorm();
-              if(n>safeNorm2)
-                eivecs.col(1) = cross / sqrt(n);
-              else
-              {
-                // we should never reach this point,
-                // if so the last two eigenvalues are likely to ve very closed to each other
-                eivecs.col(1) = eivecs.col(k).unitOrthogonal();
-              }
-            }
-          }
-
-          // make sure that eivecs[1] is orthogonal to eivecs[2]
-          Scalar d = eivecs.col(1).dot(eivecs.col(k));
-          eivecs.col(1) = (eivecs.col(1) - d * eivecs.col(k)).normalized();
+          tmp = scaledMat;
+          tmp.diagonal().array () -= eivals(l);
+
+          VectorType dummy;
+          extract_kernel(tmp, eivecs.col(l), dummy);
         }
 
-        eivecs.col(k==2 ? 0 : 2) = eivecs.col(k).cross(eivecs.col(1)).normalized();
+        // Compute last eigenvector from the other two
+        eivecs.col(1) = eivecs.col(2).cross(eivecs.col(0)).normalized();
       }
     }
+
     // Rescale back to the original size.
     eivals *= scale;
+    eivals.array() += shift;
     
     solver.m_info = Success;
     solver.m_isInitialized = true;
@@ -660,36 +702,42 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
 };
 
 // 2x2 direct eigenvalues decomposition, code from Hauke Heibel
-template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,2,false>
+template<typename SolverType> 
+struct direct_selfadjoint_eigenvalues<SolverType,2,false>
 {
   typedef typename SolverType::MatrixType MatrixType;
   typedef typename SolverType::RealVectorType VectorType;
   typedef typename SolverType::Scalar Scalar;
+  typedef typename SolverType::EigenvectorsType EigenvectorsType;
   
+  EIGEN_DEVICE_FUNC
   static inline void computeRoots(const MatrixType& m, VectorType& roots)
   {
     using std::sqrt;
-    const Scalar t0 = Scalar(0.5) * sqrt( numext::abs2(m(0,0)-m(1,1)) + Scalar(4)*m(1,0)*m(1,0));
+    const Scalar t0 = Scalar(0.5) * sqrt( numext::abs2(m(0,0)-m(1,1)) + Scalar(4)*numext::abs2(m(1,0)));
     const Scalar t1 = Scalar(0.5) * (m(0,0) + m(1,1));
     roots(0) = t1 - t0;
     roots(1) = t1 + t0;
   }
   
+  EIGEN_DEVICE_FUNC
   static inline void run(SolverType& solver, const MatrixType& mat, int options)
   {
-    using std::sqrt;
+    EIGEN_USING_STD_MATH(sqrt);
+    EIGEN_USING_STD_MATH(abs);
+    
     eigen_assert(mat.cols() == 2 && mat.cols() == mat.rows());
     eigen_assert((options&~(EigVecMask|GenEigMask))==0
             && (options&EigVecMask)!=EigVecMask
             && "invalid option parameter");
     bool computeEigenvectors = (options&ComputeEigenvectors)==ComputeEigenvectors;
     
-    MatrixType& eivecs = solver.m_eivec;
+    EigenvectorsType& eivecs = solver.m_eivec;
     VectorType& eivals = solver.m_eivalues;
   
     // map the matrix coefficients to [-1:1] to avoid over- and underflow.
     Scalar scale = mat.cwiseAbs().maxCoeff();
-    scale = (std::max)(scale,Scalar(1));
+    scale = numext::maxi(scale,Scalar(1));
     MatrixType scaledMat = mat / scale;
     
     // Compute the eigenvalues
@@ -698,22 +746,29 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,2
     // compute the eigen vectors
     if(computeEigenvectors)
     {
-      scaledMat.diagonal().array () -= eivals(1);
-      Scalar a2 = numext::abs2(scaledMat(0,0));
-      Scalar c2 = numext::abs2(scaledMat(1,1));
-      Scalar b2 = numext::abs2(scaledMat(1,0));
-      if(a2>c2)
+      if((eivals(1)-eivals(0))<=abs(eivals(1))*Eigen::NumTraits<Scalar>::epsilon())
       {
-        eivecs.col(1) << -scaledMat(1,0), scaledMat(0,0);
-        eivecs.col(1) /= sqrt(a2+b2);
+        eivecs.setIdentity();
       }
       else
       {
-        eivecs.col(1) << -scaledMat(1,1), scaledMat(1,0);
-        eivecs.col(1) /= sqrt(c2+b2);
-      }
+        scaledMat.diagonal().array () -= eivals(1);
+        Scalar a2 = numext::abs2(scaledMat(0,0));
+        Scalar c2 = numext::abs2(scaledMat(1,1));
+        Scalar b2 = numext::abs2(scaledMat(1,0));
+        if(a2>c2)
+        {
+          eivecs.col(1) << -scaledMat(1,0), scaledMat(0,0);
+          eivecs.col(1) /= sqrt(a2+b2);
+        }
+        else
+        {
+          eivecs.col(1) << -scaledMat(1,1), scaledMat(1,0);
+          eivecs.col(1) /= sqrt(c2+b2);
+        }
 
-      eivecs.col(0) << eivecs.col(1).unitOrthogonal();
+        eivecs.col(0) << eivecs.col(1).unitOrthogonal();
+      }
     }
     
     // Rescale back to the original size.
@@ -728,6 +783,7 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,2
 }
 
 template<typename MatrixType>
+EIGEN_DEVICE_FUNC
 SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>
 ::computeDirect(const MatrixType& matrix, int options)
 {
@@ -737,6 +793,7 @@ ::computeDirect(const MatrixType& matrix, int options)
 
 namespace internal {
 template<int StorageOrder,typename RealScalar, typename Scalar, typename Index>
+EIGEN_DEVICE_FUNC
 static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index start, Index end, Scalar* matrixQ, Index n)
 {
   using std::abs;
diff --git a/nuparu/include/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h b/nuparu/include/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h
index 17c0dadd..3499dc78 100644
--- a/nuparu/include/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h
+++ b/nuparu/include/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h
@@ -40,9 +40,9 @@ namespace Eigen {
 /** \internal Specialization for the data types supported by MKL */
 
 #define EIGEN_MKL_EIG_SELFADJ(EIGTYPE, MKLTYPE, MKLRTYPE, MKLNAME, EIGCOLROW, MKLCOLROW ) \
-template<> inline \
+template<> template<typename InputType> inline \
 SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >& \
-SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW>& matrix, int options) \
+SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const EigenBase<InputType>& matrix, int options) \
 { \
   eigen_assert(matrix.cols() == matrix.rows()); \
   eigen_assert((options&~(EigVecMask|GenEigMask))==0 \
@@ -56,7 +56,7 @@ SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(c
 \
   if(n==1) \
   { \
-    m_eivalues.coeffRef(0,0) = numext::real(matrix.coeff(0,0)); \
+    m_eivalues.coeffRef(0,0) = numext::real(m_eivec.coeff(0,0)); \
     if(computeEigenvectors) m_eivec.setOnes(n,n); \
     m_info = Success; \
     m_isInitialized = true; \
@@ -64,7 +64,7 @@ SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(c
     return *this; \
   } \
 \
-  lda = matrix.outerStride(); \
+  lda = m_eivec.outerStride(); \
   matrix_order=MKLCOLROW; \
   char jobz, uplo='L'/*, range='A'*/; \
   jobz = computeEigenvectors ? 'V' : 'N'; \
diff --git a/nuparu/include/Eigen/src/Eigenvalues/Tridiagonalization.h b/nuparu/include/Eigen/src/Eigenvalues/Tridiagonalization.h
index 192278d6..2030b5be 100644
--- a/nuparu/include/Eigen/src/Eigenvalues/Tridiagonalization.h
+++ b/nuparu/include/Eigen/src/Eigenvalues/Tridiagonalization.h
@@ -18,8 +18,10 @@ namespace internal {
 template<typename MatrixType> struct TridiagonalizationMatrixTReturnType;
 template<typename MatrixType>
 struct traits<TridiagonalizationMatrixTReturnType<MatrixType> >
+  : public traits<typename MatrixType::PlainObject>
 {
-  typedef typename MatrixType::PlainObject ReturnType;
+  typedef typename MatrixType::PlainObject ReturnType; // FIXME shall it be a BandMatrix?
+  enum { Flags = 0 };
 };
 
 template<typename MatrixType, typename CoeffVectorType>
@@ -67,7 +69,7 @@ template<typename _MatrixType> class Tridiagonalization
 
     typedef typename MatrixType::Scalar Scalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
 
     enum {
       Size = MatrixType::RowsAtCompileTime,
@@ -89,10 +91,8 @@ template<typename _MatrixType> class Tridiagonalization
             >::type DiagonalReturnType;
 
     typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
-              typename internal::add_const_on_value_type<typename Diagonal<
-                Block<const MatrixType,SizeMinusOne,SizeMinusOne> >::RealReturnType>::type,
-              const Diagonal<
-                Block<const MatrixType,SizeMinusOne,SizeMinusOne> >
+              typename internal::add_const_on_value_type<typename Diagonal<const MatrixType, -1>::RealReturnType>::type,
+              const Diagonal<const MatrixType, -1>
             >::type SubDiagonalReturnType;
 
     /** \brief Return type of matrixQ() */
@@ -110,7 +110,7 @@ template<typename _MatrixType> class Tridiagonalization
       *
       * \sa compute() for an example.
       */
-    Tridiagonalization(Index size = Size==Dynamic ? 2 : Size)
+    explicit Tridiagonalization(Index size = Size==Dynamic ? 2 : Size)
       : m_matrix(size,size),
         m_hCoeffs(size > 1 ? size-1 : 1),
         m_isInitialized(false)
@@ -126,8 +126,9 @@ template<typename _MatrixType> class Tridiagonalization
       * Example: \include Tridiagonalization_Tridiagonalization_MatrixType.cpp
       * Output: \verbinclude Tridiagonalization_Tridiagonalization_MatrixType.out
       */
-    Tridiagonalization(const MatrixType& matrix)
-      : m_matrix(matrix),
+    template<typename InputType>
+    explicit Tridiagonalization(const EigenBase<InputType>& matrix)
+      : m_matrix(matrix.derived()),
         m_hCoeffs(matrix.cols() > 1 ? matrix.cols()-1 : 1),
         m_isInitialized(false)
     {
@@ -152,9 +153,10 @@ template<typename _MatrixType> class Tridiagonalization
       * Example: \include Tridiagonalization_compute.cpp
       * Output: \verbinclude Tridiagonalization_compute.out
       */
-    Tridiagonalization& compute(const MatrixType& matrix)
+    template<typename InputType>
+    Tridiagonalization& compute(const EigenBase<InputType>& matrix)
     {
-      m_matrix = matrix;
+      m_matrix = matrix.derived();
       m_hCoeffs.resize(matrix.rows()-1, 1);
       internal::tridiagonalization_inplace(m_matrix, m_hCoeffs);
       m_isInitialized = true;
@@ -305,7 +307,7 @@ typename Tridiagonalization<MatrixType>::DiagonalReturnType
 Tridiagonalization<MatrixType>::diagonal() const
 {
   eigen_assert(m_isInitialized && "Tridiagonalization is not initialized.");
-  return m_matrix.diagonal();
+  return m_matrix.diagonal().real();
 }
 
 template<typename MatrixType>
@@ -313,8 +315,7 @@ typename Tridiagonalization<MatrixType>::SubDiagonalReturnType
 Tridiagonalization<MatrixType>::subDiagonal() const
 {
   eigen_assert(m_isInitialized && "Tridiagonalization is not initialized.");
-  Index n = m_matrix.rows();
-  return Block<const MatrixType,SizeMinusOne,SizeMinusOne>(m_matrix, 1, 0, n-1,n-1).diagonal();
+  return m_matrix.template diagonal<-1>().real();
 }
 
 namespace internal {
@@ -346,7 +347,6 @@ template<typename MatrixType, typename CoeffVectorType>
 void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs)
 {
   using numext::conj;
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
   Index n = matA.rows();
@@ -438,7 +438,6 @@ struct tridiagonalization_inplace_selector
 {
   typedef typename Tridiagonalization<MatrixType>::CoeffVectorType CoeffVectorType;
   typedef typename Tridiagonalization<MatrixType>::HouseholderSequenceType HouseholderSequenceType;
-  typedef typename MatrixType::Index Index;
   template<typename DiagonalType, typename SubDiagonalType>
   static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ)
   {
@@ -467,9 +466,10 @@ struct tridiagonalization_inplace_selector<MatrixType,3,false>
   static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ)
   {
     using std::sqrt;
+    const RealScalar tol = (std::numeric_limits<RealScalar>::min)();
     diag[0] = mat(0,0);
     RealScalar v1norm2 = numext::abs2(mat(2,0));
-    if(v1norm2 == RealScalar(0))
+    if(v1norm2 <= tol)
     {
       diag[1] = mat(1,1);
       diag[2] = mat(2,2);
@@ -526,7 +526,6 @@ struct tridiagonalization_inplace_selector<MatrixType,1,IsComplex>
 template<typename MatrixType> struct TridiagonalizationMatrixTReturnType
 : public ReturnByValue<TridiagonalizationMatrixTReturnType<MatrixType> >
 {
-    typedef typename MatrixType::Index Index;
   public:
     /** \brief Constructor.
       *
diff --git a/nuparu/include/Eigen/src/Geometry/AlignedBox.h b/nuparu/include/Eigen/src/Geometry/AlignedBox.h
index 8e186d57..03f1a11f 100644
--- a/nuparu/include/Eigen/src/Geometry/AlignedBox.h
+++ b/nuparu/include/Eigen/src/Geometry/AlignedBox.h
@@ -19,10 +19,12 @@ namespace Eigen {
   *
   * \brief An axis aligned box
   *
-  * \param _Scalar the type of the scalar coefficients
-  * \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
+  * \tparam _Scalar the type of the scalar coefficients
+  * \tparam _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
   *
   * This class represents an axis aligned box as a pair of the minimal and maximal corners.
+  * \warning The result of most methods is undefined when applied to an empty box. You can check for empty boxes using isEmpty().
+  * \sa alignedboxtypedefs
   */
 template <typename _Scalar, int _AmbientDim>
 class AlignedBox
@@ -32,7 +34,7 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
   enum { AmbientDimAtCompileTime = _AmbientDim };
   typedef _Scalar                                   Scalar;
   typedef NumTraits<Scalar>                         ScalarTraits;
-  typedef DenseIndex                                Index;
+  typedef Eigen::Index                              Index; ///< \deprecated since Eigen 3.3
   typedef typename ScalarTraits::Real               RealScalar;
   typedef typename ScalarTraits::NonInteger      NonInteger;
   typedef Matrix<Scalar,AmbientDimAtCompileTime,1>  VectorType;
@@ -40,18 +42,21 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
   /** Define constants to name the corners of a 1D, 2D or 3D axis aligned bounding box */
   enum CornerType
   {
-    /** 1D names */
+    /** 1D names @{ */
     Min=0, Max=1,
+    /** @} */
 
-    /** Added names for 2D */
+    /** Identifier for 2D corner @{ */
     BottomLeft=0, BottomRight=1,
     TopLeft=2, TopRight=3,
+    /** @} */
 
-    /** Added names for 3D */
+    /** Identifier for 3D corner  @{ */
     BottomLeftFloor=0, BottomRightFloor=1,
     TopLeftFloor=2, TopRightFloor=3,
     BottomLeftCeil=4, BottomRightCeil=5,
     TopLeftCeil=6, TopRightCeil=7
+    /** @} */
   };
 
 
@@ -63,34 +68,33 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
   inline explicit AlignedBox(Index _dim) : m_min(_dim), m_max(_dim)
   { setEmpty(); }
 
-  /** Constructs a box with extremities \a _min and \a _max. */
+  /** Constructs a box with extremities \a _min and \a _max.
+   * \warning If any component of \a _min is larger than the same component of \a _max, the constructed box is empty. */
   template<typename OtherVectorType1, typename OtherVectorType2>
   inline AlignedBox(const OtherVectorType1& _min, const OtherVectorType2& _max) : m_min(_min), m_max(_max) {}
 
   /** Constructs a box containing a single point \a p. */
   template<typename Derived>
-  inline explicit AlignedBox(const MatrixBase<Derived>& a_p)
-  {
-    typename internal::nested<Derived,2>::type p(a_p.derived());
-    m_min = p;
-    m_max = p;
-  }
+  inline explicit AlignedBox(const MatrixBase<Derived>& p) : m_min(p), m_max(m_min)
+  { }
 
   ~AlignedBox() {}
 
   /** \returns the dimension in which the box holds */
   inline Index dim() const { return AmbientDimAtCompileTime==Dynamic ? m_min.size() : Index(AmbientDimAtCompileTime); }
 
-  /** \deprecated use isEmpty */
+  /** \deprecated use isEmpty() */
   inline bool isNull() const { return isEmpty(); }
 
-  /** \deprecated use setEmpty */
+  /** \deprecated use setEmpty() */
   inline void setNull() { setEmpty(); }
 
-  /** \returns true if the box is empty. */
+  /** \returns true if the box is empty.
+   * \sa setEmpty */
   inline bool isEmpty() const { return (m_min.array() > m_max.array()).any(); }
 
-  /** Makes \c *this an empty box. */
+  /** Makes \c *this an empty box.
+   * \sa isEmpty */
   inline void setEmpty()
   {
     m_min.setConstant( ScalarTraits::highest() );
@@ -159,7 +163,7 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
    * a uniform distribution */
   inline VectorType sample() const
   {
-    VectorType r;
+    VectorType r(dim());
     for(Index d=0; d<dim(); ++d)
     {
       if(!ScalarTraits::IsInteger)
@@ -175,27 +179,34 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
 
   /** \returns true if the point \a p is inside the box \c *this. */
   template<typename Derived>
-  inline bool contains(const MatrixBase<Derived>& a_p) const
+  inline bool contains(const MatrixBase<Derived>& p) const
   {
-    typename internal::nested<Derived,2>::type p(a_p.derived());
-    return (m_min.array()<=p.array()).all() && (p.array()<=m_max.array()).all();
+    typename internal::nested_eval<Derived,2>::type p_n(p.derived());
+    return (m_min.array()<=p_n.array()).all() && (p_n.array()<=m_max.array()).all();
   }
 
   /** \returns true if the box \a b is entirely inside the box \c *this. */
   inline bool contains(const AlignedBox& b) const
   { return (m_min.array()<=(b.min)().array()).all() && ((b.max)().array()<=m_max.array()).all(); }
 
-  /** Extends \c *this such that it contains the point \a p and returns a reference to \c *this. */
+  /** \returns true if the box \a b intersects the box \c *this.
+   * \sa intersection, clamp */
+  inline bool intersects(const AlignedBox& b) const
+  { return (m_min.array()<=(b.max)().array()).all() && ((b.min)().array()<=m_max.array()).all(); }
+
+  /** Extends \c *this such that it contains the point \a p and returns a reference to \c *this.
+   * \sa extend(const AlignedBox&) */
   template<typename Derived>
-  inline AlignedBox& extend(const MatrixBase<Derived>& a_p)
+  inline AlignedBox& extend(const MatrixBase<Derived>& p)
   {
-    typename internal::nested<Derived,2>::type p(a_p.derived());
-    m_min = m_min.cwiseMin(p);
-    m_max = m_max.cwiseMax(p);
+    typename internal::nested_eval<Derived,2>::type p_n(p.derived());
+    m_min = m_min.cwiseMin(p_n);
+    m_max = m_max.cwiseMax(p_n);
     return *this;
   }
 
-  /** Extends \c *this such that it contains the box \a b and returns a reference to \c *this. */
+  /** Extends \c *this such that it contains the box \a b and returns a reference to \c *this.
+   * \sa merged, extend(const MatrixBase&) */
   inline AlignedBox& extend(const AlignedBox& b)
   {
     m_min = m_min.cwiseMin(b.m_min);
@@ -203,7 +214,9 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
     return *this;
   }
 
-  /** Clamps \c *this by the box \a b and returns a reference to \c *this. */
+  /** Clamps \c *this by the box \a b and returns a reference to \c *this.
+   * \note If the boxes don't intersect, the resulting box is empty.
+   * \sa intersection(), intersects() */
   inline AlignedBox& clamp(const AlignedBox& b)
   {
     m_min = m_min.cwiseMax(b.m_min);
@@ -211,11 +224,15 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
     return *this;
   }
 
-  /** Returns an AlignedBox that is the intersection of \a b and \c *this */
+  /** Returns an AlignedBox that is the intersection of \a b and \c *this
+   * \note If the boxes don't intersect, the resulting box is empty.
+   * \sa intersects(), clamp, contains()  */
   inline AlignedBox intersection(const AlignedBox& b) const
   {return AlignedBox(m_min.cwiseMax(b.m_min), m_max.cwiseMin(b.m_max)); }
 
-  /** Returns an AlignedBox that is the union of \a b and \c *this */
+  /** Returns an AlignedBox that is the union of \a b and \c *this.
+   * \note Merging with an empty box may result in a box bigger than \c *this. 
+   * \sa extend(const AlignedBox&) */
   inline AlignedBox merged(const AlignedBox& b) const
   { return AlignedBox(m_min.cwiseMin(b.m_min), m_max.cwiseMax(b.m_max)); }
 
@@ -223,7 +240,7 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
   template<typename Derived>
   inline AlignedBox& translate(const MatrixBase<Derived>& a_t)
   {
-    const typename internal::nested<Derived,2>::type t(a_t.derived());
+    const typename internal::nested_eval<Derived,2>::type t(a_t.derived());
     m_min += t;
     m_max += t;
     return *this;
@@ -231,20 +248,20 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
 
   /** \returns the squared distance between the point \a p and the box \c *this,
     * and zero if \a p is inside the box.
-    * \sa exteriorDistance()
+    * \sa exteriorDistance(const MatrixBase&), squaredExteriorDistance(const AlignedBox&)
     */
   template<typename Derived>
-  inline Scalar squaredExteriorDistance(const MatrixBase<Derived>& a_p) const;
+  inline Scalar squaredExteriorDistance(const MatrixBase<Derived>& p) const;
 
   /** \returns the squared distance between the boxes \a b and \c *this,
     * and zero if the boxes intersect.
-    * \sa exteriorDistance()
+    * \sa exteriorDistance(const AlignedBox&), squaredExteriorDistance(const MatrixBase&)
     */
   inline Scalar squaredExteriorDistance(const AlignedBox& b) const;
 
   /** \returns the distance between the point \a p and the box \c *this,
     * and zero if \a p is inside the box.
-    * \sa squaredExteriorDistance()
+    * \sa squaredExteriorDistance(const MatrixBase&), exteriorDistance(const AlignedBox&)
     */
   template<typename Derived>
   inline NonInteger exteriorDistance(const MatrixBase<Derived>& p) const
@@ -252,7 +269,7 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
 
   /** \returns the distance between the boxes \a b and \c *this,
     * and zero if the boxes intersect.
-    * \sa squaredExteriorDistance()
+    * \sa squaredExteriorDistance(const AlignedBox&), exteriorDistance(const MatrixBase&)
     */
   inline NonInteger exteriorDistance(const AlignedBox& b) const
   { using std::sqrt; return sqrt(NonInteger(squaredExteriorDistance(b))); }
@@ -296,7 +313,7 @@ template<typename Scalar,int AmbientDim>
 template<typename Derived>
 inline Scalar AlignedBox<Scalar,AmbientDim>::squaredExteriorDistance(const MatrixBase<Derived>& a_p) const
 {
-  typename internal::nested<Derived,2*AmbientDim>::type p(a_p.derived());
+  typename internal::nested_eval<Derived,2*AmbientDim>::type p(a_p.derived());
   Scalar dist2(0);
   Scalar aux;
   for (Index k=0; k<dim(); ++k)
diff --git a/nuparu/include/Eigen/src/Geometry/AngleAxis.h b/nuparu/include/Eigen/src/Geometry/AngleAxis.h
index 553d38c7..7fdb8ae8 100644
--- a/nuparu/include/Eigen/src/Geometry/AngleAxis.h
+++ b/nuparu/include/Eigen/src/Geometry/AngleAxis.h
@@ -77,16 +77,25 @@ class AngleAxis : public RotationBase<AngleAxis<_Scalar>,3>
     *          represents an invalid rotation. */
   template<typename Derived>
   inline AngleAxis(const Scalar& angle, const MatrixBase<Derived>& axis) : m_axis(axis), m_angle(angle) {}
-  /** Constructs and initialize the angle-axis rotation from a quaternion \a q. */
+  /** Constructs and initializes the angle-axis rotation from a quaternion \a q.
+    * This function implicitly normalizes the quaternion \a q.
+    */
   template<typename QuatDerived> inline explicit AngleAxis(const QuaternionBase<QuatDerived>& q) { *this = q; }
   /** Constructs and initialize the angle-axis rotation from a 3x3 rotation matrix. */
   template<typename Derived>
   inline explicit AngleAxis(const MatrixBase<Derived>& m) { *this = m; }
 
+  /** \returns the value of the rotation angle in radians */
   Scalar angle() const { return m_angle; }
+  /** \returns a read-write reference to the stored angle in radians */
   Scalar& angle() { return m_angle; }
 
+  /** \returns the rotation axis */
   const Vector3& axis() const { return m_axis; }
+  /** \returns a read-write reference to the stored rotation axis.
+    *
+    * \warning The rotation axis must remain a \b unit vector.
+    */
   Vector3& axis() { return m_axis; }
 
   /** Concatenates two rotations */
@@ -131,7 +140,7 @@ class AngleAxis : public RotationBase<AngleAxis<_Scalar>,3>
     m_angle = Scalar(other.angle());
   }
 
-  static inline const AngleAxis Identity() { return AngleAxis(0, Vector3::UnitX()); }
+  static inline const AngleAxis Identity() { return AngleAxis(Scalar(0), Vector3::UnitX()); }
 
   /** \returns \c true if \c *this is approximately equal to \a other, within the precision
     * determined by \a prec.
@@ -149,29 +158,27 @@ typedef AngleAxis<float> AngleAxisf;
 typedef AngleAxis<double> AngleAxisd;
 
 /** Set \c *this from a \b unit quaternion.
-  * The axis is normalized.
+  * The resulting axis is normalized.
   * 
-  * \warning As any other method dealing with quaternion, if the input quaternion
-  *          is not normalized then the result is undefined.
+  * This function implicitly normalizes the quaternion \a q.
   */
 template<typename Scalar>
 template<typename QuatDerived>
 AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const QuaternionBase<QuatDerived>& q)
 {
-  using std::acos;
-  using std::min;
-  using std::max;
-  using std::sqrt;
-  Scalar n2 = q.vec().squaredNorm();
-  if (n2 < NumTraits<Scalar>::dummy_precision()*NumTraits<Scalar>::dummy_precision())
+  using std::atan2;
+  Scalar n = q.vec().norm();
+  if(n<NumTraits<Scalar>::epsilon())
+    n = q.vec().stableNorm();
+  if (n > Scalar(0))
   {
-    m_angle = 0;
-    m_axis << 1, 0, 0;
+    m_angle = Scalar(2)*atan2(n, q.w());
+    m_axis  = q.vec() / n;
   }
   else
   {
-    m_angle = Scalar(2)*acos((min)((max)(Scalar(-1),q.w()),Scalar(1)));
-    m_axis = q.vec() / sqrt(n2);
+    m_angle = Scalar(0);
+    m_axis << Scalar(1), Scalar(0), Scalar(0);
   }
   return *this;
 }
diff --git a/nuparu/include/Eigen/src/Geometry/EulerAngles.h b/nuparu/include/Eigen/src/Geometry/EulerAngles.h
index 97984d59..b875b7a1 100644
--- a/nuparu/include/Eigen/src/Geometry/EulerAngles.h
+++ b/nuparu/include/Eigen/src/Geometry/EulerAngles.h
@@ -28,7 +28,7 @@ namespace Eigen {
   *      * AngleAxisf(ea[2], Vector3f::UnitZ()); \endcode
   * This corresponds to the right-multiply conventions (with right hand side frames).
   * 
-  * The returned angles are in the ranges [0:pi]x[0:pi]x[-pi:pi].
+  * The returned angles are in the ranges [0:pi]x[-pi:pi]x[-pi:pi].
   * 
   * \sa class AngleAxis
   */
@@ -55,7 +55,7 @@ MatrixBase<Derived>::eulerAngles(Index a0, Index a1, Index a2) const
     res[0] = atan2(coeff(j,i), coeff(k,i));
     if((odd && res[0]<Scalar(0)) || ((!odd) && res[0]>Scalar(0)))
     {
-      res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(M_PI) : res[0] + Scalar(M_PI);
+      res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(EIGEN_PI) : res[0] + Scalar(EIGEN_PI);
       Scalar s2 = Vector2(coeff(j,i), coeff(k,i)).norm();
       res[1] = -atan2(s2, coeff(i,i));
     }
@@ -84,7 +84,7 @@ MatrixBase<Derived>::eulerAngles(Index a0, Index a1, Index a2) const
     res[0] = atan2(coeff(j,k), coeff(k,k));
     Scalar c2 = Vector2(coeff(i,i), coeff(i,j)).norm();
     if((odd && res[0]<Scalar(0)) || ((!odd) && res[0]>Scalar(0))) {
-      res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(M_PI) : res[0] + Scalar(M_PI);
+      res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(EIGEN_PI) : res[0] + Scalar(EIGEN_PI);
       res[1] = atan2(-coeff(i,k), -c2);
     }
     else
diff --git a/nuparu/include/Eigen/src/Geometry/Homogeneous.h b/nuparu/include/Eigen/src/Geometry/Homogeneous.h
index 00e71d19..4107fba4 100644
--- a/nuparu/include/Eigen/src/Geometry/Homogeneous.h
+++ b/nuparu/include/Eigen/src/Geometry/Homogeneous.h
@@ -34,7 +34,7 @@ struct traits<Homogeneous<MatrixType,Direction> >
  : traits<MatrixType>
 {
   typedef typename traits<MatrixType>::StorageKind StorageKind;
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
   typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
   enum {
     RowsPlusOne = (MatrixType::RowsAtCompileTime != Dynamic) ?
@@ -48,8 +48,7 @@ struct traits<Homogeneous<MatrixType,Direction> >
     TmpFlags = _MatrixTypeNested::Flags & HereditaryBits,
     Flags = ColsAtCompileTime==1 ? (TmpFlags & ~RowMajorBit)
           : RowsAtCompileTime==1 ? (TmpFlags | RowMajorBit)
-          : TmpFlags,
-    CoeffReadCost = _MatrixTypeNested::CoeffReadCost
+          : TmpFlags
   };
 };
 
@@ -59,52 +58,54 @@ template<typename MatrixType,typename Rhs> struct homogeneous_right_product_impl
 } // end namespace internal
 
 template<typename MatrixType,int _Direction> class Homogeneous
-  : internal::no_assignment_operator, public MatrixBase<Homogeneous<MatrixType,_Direction> >
+  : public MatrixBase<Homogeneous<MatrixType,_Direction> >, internal::no_assignment_operator
 {
   public:
 
+    typedef MatrixType NestedExpression;
     enum { Direction = _Direction };
 
     typedef MatrixBase<Homogeneous> Base;
     EIGEN_DENSE_PUBLIC_INTERFACE(Homogeneous)
 
-    inline Homogeneous(const MatrixType& matrix)
+    explicit inline Homogeneous(const MatrixType& matrix)
       : m_matrix(matrix)
     {}
 
     inline Index rows() const { return m_matrix.rows() + (int(Direction)==Vertical   ? 1 : 0); }
     inline Index cols() const { return m_matrix.cols() + (int(Direction)==Horizontal ? 1 : 0); }
-
-    inline Scalar coeff(Index row, Index col) const
-    {
-      if(  (int(Direction)==Vertical   && row==m_matrix.rows())
-        || (int(Direction)==Horizontal && col==m_matrix.cols()))
-        return 1;
-      return m_matrix.coeff(row, col);
-    }
+    
+    const NestedExpression& nestedExpression() const { return m_matrix; }
 
     template<typename Rhs>
-    inline const internal::homogeneous_right_product_impl<Homogeneous,Rhs>
+    inline const Product<Homogeneous,Rhs>
     operator* (const MatrixBase<Rhs>& rhs) const
     {
       eigen_assert(int(Direction)==Horizontal);
-      return internal::homogeneous_right_product_impl<Homogeneous,Rhs>(m_matrix,rhs.derived());
+      return Product<Homogeneous,Rhs>(*this,rhs.derived());
     }
 
     template<typename Lhs> friend
-    inline const internal::homogeneous_left_product_impl<Homogeneous,Lhs>
+    inline const Product<Lhs,Homogeneous>
     operator* (const MatrixBase<Lhs>& lhs, const Homogeneous& rhs)
     {
       eigen_assert(int(Direction)==Vertical);
-      return internal::homogeneous_left_product_impl<Homogeneous,Lhs>(lhs.derived(),rhs.m_matrix);
+      return Product<Lhs,Homogeneous>(lhs.derived(),rhs);
     }
 
     template<typename Scalar, int Dim, int Mode, int Options> friend
-    inline const internal::homogeneous_left_product_impl<Homogeneous,Transform<Scalar,Dim,Mode,Options> >
+    inline const Product<Transform<Scalar,Dim,Mode,Options>, Homogeneous >
     operator* (const Transform<Scalar,Dim,Mode,Options>& lhs, const Homogeneous& rhs)
     {
       eigen_assert(int(Direction)==Vertical);
-      return internal::homogeneous_left_product_impl<Homogeneous,Transform<Scalar,Dim,Mode,Options> >(lhs,rhs.m_matrix);
+      return Product<Transform<Scalar,Dim,Mode,Options>, Homogeneous>(lhs,rhs);
+    }
+
+    template<typename Func>
+    EIGEN_STRONG_INLINE typename internal::result_of<Func(Scalar,Scalar)>::type
+    redux(const Func& func) const
+    {
+      return func(m_matrix.redux(func), Scalar(1));
     }
 
   protected:
@@ -120,14 +121,14 @@ template<typename MatrixType,int _Direction> class Homogeneous
   * Example: \include MatrixBase_homogeneous.cpp
   * Output: \verbinclude MatrixBase_homogeneous.out
   *
-  * \sa class Homogeneous
+  * \sa VectorwiseOp::homogeneous(), class Homogeneous
   */
 template<typename Derived>
 inline typename MatrixBase<Derived>::HomogeneousReturnType
 MatrixBase<Derived>::homogeneous() const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
-  return derived();
+  return HomogeneousReturnType(derived());
 }
 
 /** \geometry_module
@@ -137,12 +138,12 @@ MatrixBase<Derived>::homogeneous() const
   * Example: \include VectorwiseOp_homogeneous.cpp
   * Output: \verbinclude VectorwiseOp_homogeneous.out
   *
-  * \sa MatrixBase::homogeneous() */
+  * \sa MatrixBase::homogeneous(), class Homogeneous */
 template<typename ExpressionType, int Direction>
 inline Homogeneous<ExpressionType,Direction>
 VectorwiseOp<ExpressionType,Direction>::homogeneous() const
 {
-  return _expression();
+  return HomogeneousReturnType(_expression());
 }
 
 /** \geometry_module
@@ -237,7 +238,6 @@ struct homogeneous_left_product_impl<Homogeneous<MatrixType,Vertical>,Lhs>
   typedef typename traits<homogeneous_left_product_impl>::LhsMatrixType LhsMatrixType;
   typedef typename remove_all<LhsMatrixType>::type LhsMatrixTypeCleaned;
   typedef typename remove_all<typename LhsMatrixTypeCleaned::Nested>::type LhsMatrixTypeNested;
-  typedef typename MatrixType::Index Index;
   homogeneous_left_product_impl(const Lhs& lhs, const MatrixType& rhs)
     : m_lhs(take_matrix_for_product<Lhs>::run(lhs)),
       m_rhs(rhs)
@@ -277,7 +277,6 @@ struct homogeneous_right_product_impl<Homogeneous<MatrixType,Horizontal>,Rhs>
   : public ReturnByValue<homogeneous_right_product_impl<Homogeneous<MatrixType,Horizontal>,Rhs> >
 {
   typedef typename remove_all<typename Rhs::Nested>::type RhsNested;
-  typedef typename MatrixType::Index Index;
   homogeneous_right_product_impl(const MatrixType& lhs, const Rhs& rhs)
     : m_lhs(lhs), m_rhs(rhs)
   {}
@@ -300,6 +299,157 @@ struct homogeneous_right_product_impl<Homogeneous<MatrixType,Horizontal>,Rhs>
   typename Rhs::Nested m_rhs;
 };
 
+template<typename ArgType,int Direction>
+struct evaluator_traits<Homogeneous<ArgType,Direction> >
+{
+  typedef typename storage_kind_to_evaluator_kind<typename ArgType::StorageKind>::Kind Kind;
+  typedef HomogeneousShape Shape;  
+  static const int AssumeAliasing = 0;
+};
+
+template<> struct AssignmentKind<DenseShape,HomogeneousShape> { typedef Dense2Dense Kind; };
+
+
+template<typename ArgType,int Direction>
+struct unary_evaluator<Homogeneous<ArgType,Direction>, IndexBased>
+  : evaluator<typename Homogeneous<ArgType,Direction>::PlainObject >
+{
+  typedef Homogeneous<ArgType,Direction> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  explicit unary_evaluator(const XprType& op)
+    : Base(), m_temp(op)
+  {
+    ::new (static_cast<Base*>(this)) Base(m_temp);
+  }
+
+protected:
+  PlainObject m_temp;
+};
+
+// dense = homogeneous (Vertical): copy the nested expression into the top rows, then set the last row to ones
+template< typename DstXprType, typename ArgType, typename Scalar>
+struct Assignment<DstXprType, Homogeneous<ArgType,Vertical>, internal::assign_op<Scalar>, Dense2Dense, Scalar>
+{
+  typedef Homogeneous<ArgType,Vertical> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  {
+    dst.template topRows<ArgType::RowsAtCompileTime>(src.nestedExpression().rows()) = src.nestedExpression();
+    dst.row(dst.rows()-1).setOnes();
+  }
+};
+
+// dense = homogeneous (Horizontal): copy the nested expression into the left columns, then set the last column to ones
+template< typename DstXprType, typename ArgType, typename Scalar>
+struct Assignment<DstXprType, Homogeneous<ArgType,Horizontal>, internal::assign_op<Scalar>, Dense2Dense, Scalar>
+{
+  typedef Homogeneous<ArgType,Horizontal> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  {
+    dst.template leftCols<ArgType::ColsAtCompileTime>(src.nestedExpression().cols()) = src.nestedExpression();
+    dst.col(dst.cols()-1).setOnes();
+  }
+};
+
+template<typename LhsArg, typename Rhs, int ProductTag>
+struct generic_product_impl<Homogeneous<LhsArg,Horizontal>, Rhs, HomogeneousShape, DenseShape, ProductTag>
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Homogeneous<LhsArg,Horizontal>& lhs, const Rhs& rhs)
+  {
+    homogeneous_right_product_impl<Homogeneous<LhsArg,Horizontal>, Rhs>(lhs.nestedExpression(), rhs).evalTo(dst);
+  }
+};
+
+template<typename Lhs,typename Rhs>
+struct homogeneous_right_product_refactoring_helper
+{
+  enum {
+    Dim  = Lhs::ColsAtCompileTime,
+    Rows = Lhs::RowsAtCompileTime
+  };
+  typedef typename Rhs::template ConstNRowsBlockXpr<Dim>::Type          LinearBlockConst;
+  typedef typename remove_const<LinearBlockConst>::type                 LinearBlock;
+  typedef typename Rhs::ConstRowXpr                                     ConstantColumn;
+  typedef Replicate<const ConstantColumn,Rows,1>                        ConstantBlock;
+  typedef Product<Lhs,LinearBlock,LazyProduct>                          LinearProduct;
+  typedef CwiseBinaryOp<internal::scalar_sum_op<typename Lhs::Scalar>, const LinearProduct, const ConstantBlock> Xpr;
+};
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, HomogeneousShape, DenseShape>
+ : public evaluator<typename homogeneous_right_product_refactoring_helper<typename Lhs::NestedExpression,Rhs>::Xpr>
+{
+  typedef Product<Lhs, Rhs, LazyProduct> XprType;
+  typedef homogeneous_right_product_refactoring_helper<typename Lhs::NestedExpression,Rhs> helper;
+  typedef typename helper::ConstantBlock ConstantBlock;
+  typedef typename helper::Xpr RefactoredXpr;
+  typedef evaluator<RefactoredXpr> Base;
+  
+  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
+    : Base(  xpr.lhs().nestedExpression() .lazyProduct(  xpr.rhs().template topRows<helper::Dim>(xpr.lhs().nestedExpression().cols()) )
+            + ConstantBlock(xpr.rhs().row(xpr.rhs().rows()-1),xpr.lhs().rows(), 1) )
+  {}
+};
+
+template<typename Lhs, typename RhsArg, int ProductTag>
+struct generic_product_impl<Lhs, Homogeneous<RhsArg,Vertical>, DenseShape, HomogeneousShape, ProductTag>
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Homogeneous<RhsArg,Vertical>& rhs)
+  {
+    homogeneous_left_product_impl<Homogeneous<RhsArg,Vertical>, Lhs>(lhs, rhs.nestedExpression()).evalTo(dst);
+  }
+};
+
+template<typename Lhs,typename Rhs>
+struct homogeneous_left_product_refactoring_helper
+{
+  enum {
+    Dim = Rhs::RowsAtCompileTime,
+    Cols = Rhs::ColsAtCompileTime
+  };
+  typedef typename Lhs::template ConstNColsBlockXpr<Dim>::Type          LinearBlockConst;
+  typedef typename remove_const<LinearBlockConst>::type                 LinearBlock;
+  typedef typename Lhs::ConstColXpr                                     ConstantColumn;
+  typedef Replicate<const ConstantColumn,1,Cols>                        ConstantBlock;
+  typedef Product<LinearBlock,Rhs,LazyProduct>                          LinearProduct;
+  typedef CwiseBinaryOp<internal::scalar_sum_op<typename Lhs::Scalar>, const LinearProduct, const ConstantBlock> Xpr;
+};
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape, HomogeneousShape>
+ : public evaluator<typename homogeneous_left_product_refactoring_helper<Lhs,typename Rhs::NestedExpression>::Xpr>
+{
+  typedef Product<Lhs, Rhs, LazyProduct> XprType;
+  typedef homogeneous_left_product_refactoring_helper<Lhs,typename Rhs::NestedExpression> helper;
+  typedef typename helper::ConstantBlock ConstantBlock;
+  typedef typename helper::Xpr RefactoredXpr;
+  typedef evaluator<RefactoredXpr> Base;
+  
+  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
+    : Base(   xpr.lhs().template leftCols<helper::Dim>(xpr.rhs().nestedExpression().rows()) .lazyProduct( xpr.rhs().nestedExpression() )
+            + ConstantBlock(xpr.lhs().col(xpr.lhs().cols()-1),1,xpr.rhs().cols()) )
+  {}
+};
+
+template<typename Scalar, int Dim, int Mode,int Options, typename RhsArg, int ProductTag>
+struct generic_product_impl<Transform<Scalar,Dim,Mode,Options>, Homogeneous<RhsArg,Vertical>, DenseShape, HomogeneousShape, ProductTag>
+{
+  typedef Transform<Scalar,Dim,Mode,Options> TransformType;
+  template<typename Dest>
+  static void evalTo(Dest& dst, const TransformType& lhs, const Homogeneous<RhsArg,Vertical>& rhs)
+  {
+    homogeneous_left_product_impl<Homogeneous<RhsArg,Vertical>, TransformType>(lhs, rhs.nestedExpression()).evalTo(dst);
+  }
+};
+
+template<typename ExpressionType, int Side, bool Transposed>
+struct permutation_matrix_product<ExpressionType, Side, Transposed, HomogeneousShape>
+  : public permutation_matrix_product<ExpressionType, Side, Transposed, DenseShape>
+{};
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Geometry/Hyperplane.h b/nuparu/include/Eigen/src/Geometry/Hyperplane.h
index aeff43fe..2d076d7f 100644
--- a/nuparu/include/Eigen/src/Geometry/Hyperplane.h
+++ b/nuparu/include/Eigen/src/Geometry/Hyperplane.h
@@ -41,7 +41,7 @@ class Hyperplane
   };
   typedef _Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef DenseIndex Index;
+  typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
   typedef Matrix<Scalar,AmbientDimAtCompileTime,1> VectorType;
   typedef Matrix<Scalar,Index(AmbientDimAtCompileTime)==Dynamic
                         ? Dynamic
@@ -100,7 +100,17 @@ class Hyperplane
   {
     EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(VectorType, 3)
     Hyperplane result(p0.size());
-    result.normal() = (p2 - p0).cross(p1 - p0).normalized();
+    VectorType v0(p2 - p0), v1(p1 - p0);
+    result.normal() = v0.cross(v1);
+    RealScalar norm = result.normal().norm();
+    if(norm <= v0.norm() * v1.norm() * NumTraits<RealScalar>::epsilon())
+    {
+      Matrix<Scalar,2,3> m; m << v0.transpose(), v1.transpose();
+      JacobiSVD<Matrix<Scalar,2,3> > svd(m, ComputeFullV);
+      result.normal() = svd.matrixV().col(2);
+    }
+    else
+      result.normal() /= norm;
     result.offset() = -p0.dot(result.normal());
     return result;
   }
diff --git a/nuparu/include/Eigen/src/Geometry/OrthoMethods.h b/nuparu/include/Eigen/src/Geometry/OrthoMethods.h
index 556bc816..39b64b86 100644
--- a/nuparu/include/Eigen/src/Geometry/OrthoMethods.h
+++ b/nuparu/include/Eigen/src/Geometry/OrthoMethods.h
@@ -18,6 +18,10 @@ namespace Eigen {
   * \returns the cross product of \c *this and \a other
   *
   * Here is a very good explanation of cross-product: http://xkcd.com/199/
+  * 
+  * With complex numbers, the cross product is implemented as
+  * \f$ (\mathbf{a}+i\mathbf{b}) \times (\mathbf{c}+i\mathbf{d}) = (\mathbf{a} \times \mathbf{c} - \mathbf{b} \times \mathbf{d}) - i(\mathbf{a} \times \mathbf{d} + \mathbf{b} \times \mathbf{c})\f$
+  * 
   * \sa MatrixBase::cross3()
   */
 template<typename Derived>
@@ -30,8 +34,8 @@ MatrixBase<Derived>::cross(const MatrixBase<OtherDerived>& other) const
 
   // Note that there is no need for an expression here since the compiler
   // optimize such a small temporary very well (even within a complex expression)
-  typename internal::nested<Derived,2>::type lhs(derived());
-  typename internal::nested<OtherDerived,2>::type rhs(other.derived());
+  typename internal::nested_eval<Derived,2>::type lhs(derived());
+  typename internal::nested_eval<OtherDerived,2>::type rhs(other.derived());
   return typename cross_product_return_type<OtherDerived>::type(
     numext::conj(lhs.coeff(1) * rhs.coeff(2) - lhs.coeff(2) * rhs.coeff(1)),
     numext::conj(lhs.coeff(2) * rhs.coeff(0) - lhs.coeff(0) * rhs.coeff(2)),
@@ -76,8 +80,8 @@ MatrixBase<Derived>::cross3(const MatrixBase<OtherDerived>& other) const
   EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Derived,4)
   EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,4)
 
-  typedef typename internal::nested<Derived,2>::type DerivedNested;
-  typedef typename internal::nested<OtherDerived,2>::type OtherDerivedNested;
+  typedef typename internal::nested_eval<Derived,2>::type DerivedNested;
+  typedef typename internal::nested_eval<OtherDerived,2>::type OtherDerivedNested;
   DerivedNested lhs(derived());
   OtherDerivedNested rhs(other.derived());
 
@@ -103,21 +107,24 @@ VectorwiseOp<ExpressionType,Direction>::cross(const MatrixBase<OtherDerived>& ot
   EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,3)
   EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
     YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+  
+  typename internal::nested_eval<ExpressionType,2>::type mat(_expression());
+  typename internal::nested_eval<OtherDerived,2>::type vec(other.derived());
 
   CrossReturnType res(_expression().rows(),_expression().cols());
   if(Direction==Vertical)
   {
     eigen_assert(CrossReturnType::RowsAtCompileTime==3 && "the matrix must have exactly 3 rows");
-    res.row(0) = (_expression().row(1) * other.coeff(2) - _expression().row(2) * other.coeff(1)).conjugate();
-    res.row(1) = (_expression().row(2) * other.coeff(0) - _expression().row(0) * other.coeff(2)).conjugate();
-    res.row(2) = (_expression().row(0) * other.coeff(1) - _expression().row(1) * other.coeff(0)).conjugate();
+    res.row(0) = (mat.row(1) * vec.coeff(2) - mat.row(2) * vec.coeff(1)).conjugate();
+    res.row(1) = (mat.row(2) * vec.coeff(0) - mat.row(0) * vec.coeff(2)).conjugate();
+    res.row(2) = (mat.row(0) * vec.coeff(1) - mat.row(1) * vec.coeff(0)).conjugate();
   }
   else
   {
     eigen_assert(CrossReturnType::ColsAtCompileTime==3 && "the matrix must have exactly 3 columns");
-    res.col(0) = (_expression().col(1) * other.coeff(2) - _expression().col(2) * other.coeff(1)).conjugate();
-    res.col(1) = (_expression().col(2) * other.coeff(0) - _expression().col(0) * other.coeff(2)).conjugate();
-    res.col(2) = (_expression().col(0) * other.coeff(1) - _expression().col(1) * other.coeff(0)).conjugate();
+    res.col(0) = (mat.col(1) * vec.coeff(2) - mat.col(2) * vec.coeff(1)).conjugate();
+    res.col(1) = (mat.col(2) * vec.coeff(0) - mat.col(0) * vec.coeff(2)).conjugate();
+    res.col(2) = (mat.col(0) * vec.coeff(1) - mat.col(1) * vec.coeff(0)).conjugate();
   }
   return res;
 }
@@ -130,8 +137,8 @@ struct unitOrthogonal_selector
   typedef typename plain_matrix_type<Derived>::type VectorType;
   typedef typename traits<Derived>::Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef typename Derived::Index Index;
   typedef Matrix<Scalar,2,1> Vector2;
+  EIGEN_DEVICE_FUNC
   static inline VectorType run(const Derived& src)
   {
     VectorType perp = VectorType::Zero(src.size());
@@ -154,6 +161,7 @@ struct unitOrthogonal_selector<Derived,3>
   typedef typename plain_matrix_type<Derived>::type VectorType;
   typedef typename traits<Derived>::Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
   static inline VectorType run(const Derived& src)
   {
     VectorType perp;
@@ -192,6 +200,7 @@ template<typename Derived>
 struct unitOrthogonal_selector<Derived,2>
 {
   typedef typename plain_matrix_type<Derived>::type VectorType;
+  EIGEN_DEVICE_FUNC
   static inline VectorType run(const Derived& src)
   { return VectorType(-numext::conj(src.y()), numext::conj(src.x())).normalized(); }
 };
diff --git a/nuparu/include/Eigen/src/Geometry/ParametrizedLine.h b/nuparu/include/Eigen/src/Geometry/ParametrizedLine.h
index 77fa228e..93edd914 100644
--- a/nuparu/include/Eigen/src/Geometry/ParametrizedLine.h
+++ b/nuparu/include/Eigen/src/Geometry/ParametrizedLine.h
@@ -37,7 +37,7 @@ class ParametrizedLine
   };
   typedef _Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef DenseIndex Index;
+  typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
   typedef Matrix<Scalar,AmbientDimAtCompileTime,1,Options> VectorType;
 
   /** Default constructor without initialization */
diff --git a/nuparu/include/Eigen/src/Geometry/Quaternion.h b/nuparu/include/Eigen/src/Geometry/Quaternion.h
index e135f2b6..32e7e76f 100644
--- a/nuparu/include/Eigen/src/Geometry/Quaternion.h
+++ b/nuparu/include/Eigen/src/Geometry/Quaternion.h
@@ -34,8 +34,9 @@ struct quaternionbase_assign_impl;
 template<class Derived>
 class QuaternionBase : public RotationBase<Derived, 3>
 {
+ public:
   typedef RotationBase<Derived, 3> Base;
-public:
+
   using Base::operator*;
   using Base::derived;
 
@@ -102,11 +103,11 @@ class QuaternionBase : public RotationBase<Derived, 3>
   /** \returns a quaternion representing an identity rotation
     * \sa MatrixBase::Identity()
     */
-  static inline Quaternion<Scalar> Identity() { return Quaternion<Scalar>(1, 0, 0, 0); }
+  static inline Quaternion<Scalar> Identity() { return Quaternion<Scalar>(Scalar(1), Scalar(0), Scalar(0), Scalar(0)); }
 
   /** \sa QuaternionBase::Identity(), MatrixBase::setIdentity()
     */
-  inline QuaternionBase& setIdentity() { coeffs() << 0, 0, 0, 1; return *this; }
+  inline QuaternionBase& setIdentity() { coeffs() << Scalar(0), Scalar(0), Scalar(0), Scalar(1); return *this; }
 
   /** \returns the squared norm of the quaternion's coefficients
     * \sa QuaternionBase::norm(), MatrixBase::squaredNorm()
@@ -150,10 +151,6 @@ class QuaternionBase : public RotationBase<Derived, 3>
   /** \returns the conjugated quaternion */
   Quaternion<Scalar> conjugate() const;
 
-  /** \returns an interpolation for a constant motion between \a other and \c *this
-    * \a t in [0;1]
-    * see http://en.wikipedia.org/wiki/Slerp
-    */
   template<class OtherDerived> Quaternion<Scalar> slerp(const Scalar& t, const QuaternionBase<OtherDerived>& other) const;
 
   /** \returns \c true if \c *this is approximately equal to \a other, within the precision
@@ -164,8 +161,8 @@ class QuaternionBase : public RotationBase<Derived, 3>
   bool isApprox(const QuaternionBase<OtherDerived>& other, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const
   { return coeffs().isApprox(other.coeffs(), prec); }
 
-	/** return the result vector of \a v through the rotation*/
-  EIGEN_STRONG_INLINE Vector3 _transformVector(Vector3 v) const;
+  /** return the result vector of \a v through the rotation*/
+  EIGEN_STRONG_INLINE Vector3 _transformVector(const Vector3& v) const;
 
   /** \returns \c *this with scalar type casted to \a NewScalarType
     *
@@ -194,11 +191,11 @@ class QuaternionBase : public RotationBase<Derived, 3>
   * \brief The quaternion class used to represent 3D orientations and rotations
   *
   * \tparam _Scalar the scalar type, i.e., the type of the coefficients
-  * \tparam _Options controls the memory alignement of the coeffecients. Can be \# AutoAlign or \# DontAlign. Default is AutoAlign.
+  * \tparam _Options controls the memory alignment of the coefficients. Can be \# AutoAlign or \# DontAlign. Default is AutoAlign.
   *
   * This class represents a quaternion \f$ w+xi+yj+zk \f$ that is a convenient representation of
   * orientations and rotations of objects in three dimensions. Compared to other representations
-  * like Euler angles or 3x3 matrices, quatertions offer the following advantages:
+  * like Euler angles or 3x3 matrices, quaternions offer the following advantages:
   * \li \b compact storage (4 scalars)
   * \li \b efficient to compose (28 flops),
   * \li \b stable spherical interpolation
@@ -207,6 +204,8 @@ class QuaternionBase : public RotationBase<Derived, 3>
   * \li \c Quaternionf for \c float
   * \li \c Quaterniond for \c double
   *
+  * \warning Operations interpreting the quaternion as rotation have undefined behavior if the quaternion is not normalized.
+  *
   * \sa  class AngleAxis, class Transform
   */
 
@@ -218,8 +217,8 @@ struct traits<Quaternion<_Scalar,_Options> >
   typedef _Scalar Scalar;
   typedef Matrix<_Scalar,4,1,_Options> Coefficients;
   enum{
-    IsAligned = internal::traits<Coefficients>::Flags & AlignedBit,
-    Flags = IsAligned ? (AlignedBit | LvalueBit) : LvalueBit
+    Alignment = internal::traits<Coefficients>::Alignment,
+    Flags = LvalueBit
   };
 };
 }
@@ -227,13 +226,13 @@ struct traits<Quaternion<_Scalar,_Options> >
 template<typename _Scalar, int _Options>
 class Quaternion : public QuaternionBase<Quaternion<_Scalar,_Options> >
 {
+public:
   typedef QuaternionBase<Quaternion<_Scalar,_Options> > Base;
-  enum { IsAligned = internal::traits<Quaternion>::IsAligned };
+  enum { NeedsAlignment = internal::traits<Quaternion>::Alignment>0 };
 
-public:
   typedef _Scalar Scalar;
 
-  EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Quaternion)
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Quaternion)
   using Base::operator*=;
 
   typedef typename internal::traits<Quaternion>::Coefficients Coefficients;
@@ -252,7 +251,7 @@ class Quaternion : public QuaternionBase<Quaternion<_Scalar,_Options> >
   inline Quaternion(const Scalar& w, const Scalar& x, const Scalar& y, const Scalar& z) : m_coeffs(x, y, z, w){}
 
   /** Constructs and initialize a quaternion from the array data */
-  inline Quaternion(const Scalar* data) : m_coeffs(data) {}
+  explicit inline Quaternion(const Scalar* data) : m_coeffs(data) {}
 
   /** Copy constructor */
   template<class Derived> EIGEN_STRONG_INLINE Quaternion(const QuaternionBase<Derived>& other) { this->Base::operator=(other); }
@@ -278,7 +277,11 @@ class Quaternion : public QuaternionBase<Quaternion<_Scalar,_Options> >
   inline Coefficients& coeffs() { return m_coeffs;}
   inline const Coefficients& coeffs() const { return m_coeffs;}
 
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(IsAligned)
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsAlignment)
+  
+#ifdef EIGEN_QUATERNION_PLUGIN
+# include EIGEN_QUATERNION_PLUGIN
+#endif
 
 protected:
   Coefficients m_coeffs;
@@ -338,21 +341,21 @@ template<typename _Scalar, int _Options>
 class Map<const Quaternion<_Scalar>, _Options >
   : public QuaternionBase<Map<const Quaternion<_Scalar>, _Options> >
 {
+  public:
     typedef QuaternionBase<Map<const Quaternion<_Scalar>, _Options> > Base;
 
-  public:
     typedef _Scalar Scalar;
     typedef typename internal::traits<Map>::Coefficients Coefficients;
-    EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Map)
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Map)
     using Base::operator*=;
 
     /** Constructs a Mapped Quaternion object from the pointer \a coeffs
       *
-      * The pointer \a coeffs must reference the four coeffecients of Quaternion in the following order:
+      * The pointer \a coeffs must reference the four coefficients of Quaternion in the following order:
       * \code *coeffs == {x, y, z, w} \endcode
       *
       * If the template parameter _Options is set to #Aligned, then the pointer coeffs must be aligned. */
-    EIGEN_STRONG_INLINE Map(const Scalar* coeffs) : m_coeffs(coeffs) {}
+    explicit EIGEN_STRONG_INLINE Map(const Scalar* coeffs) : m_coeffs(coeffs) {}
 
     inline const Coefficients& coeffs() const { return m_coeffs;}
 
@@ -375,21 +378,21 @@ template<typename _Scalar, int _Options>
 class Map<Quaternion<_Scalar>, _Options >
   : public QuaternionBase<Map<Quaternion<_Scalar>, _Options> >
 {
+  public:
     typedef QuaternionBase<Map<Quaternion<_Scalar>, _Options> > Base;
 
-  public:
     typedef _Scalar Scalar;
     typedef typename internal::traits<Map>::Coefficients Coefficients;
-    EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Map)
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Map)
     using Base::operator*=;
 
     /** Constructs a Mapped Quaternion object from the pointer \a coeffs
       *
-      * The pointer \a coeffs must reference the four coeffecients of Quaternion in the following order:
+      * The pointer \a coeffs must reference the four coefficients of Quaternion in the following order:
       * \code *coeffs == {x, y, z, w} \endcode
       *
       * If the template parameter _Options is set to #Aligned, then the pointer coeffs must be aligned. */
-    EIGEN_STRONG_INLINE Map(Scalar* coeffs) : m_coeffs(coeffs) {}
+    explicit EIGEN_STRONG_INLINE Map(Scalar* coeffs) : m_coeffs(coeffs) {}
 
     inline Coefficients& coeffs() { return m_coeffs; }
     inline const Coefficients& coeffs() const { return m_coeffs; }
@@ -399,16 +402,16 @@ class Map<Quaternion<_Scalar>, _Options >
 };
 
 /** \ingroup Geometry_Module
-  * Map an unaligned array of single precision scalar as a quaternion */
+  * Map an unaligned array of single precision scalars as a quaternion */
 typedef Map<Quaternion<float>, 0>         QuaternionMapf;
 /** \ingroup Geometry_Module
-  * Map an unaligned array of double precision scalar as a quaternion */
+  * Map an unaligned array of double precision scalars as a quaternion */
 typedef Map<Quaternion<double>, 0>        QuaternionMapd;
 /** \ingroup Geometry_Module
-  * Map a 16-bits aligned array of double precision scalars as a quaternion */
+  * Map a 16-byte aligned array of single precision scalars as a quaternion */
 typedef Map<Quaternion<float>, Aligned>   QuaternionMapAlignedf;
 /** \ingroup Geometry_Module
-  * Map a 16-bits aligned array of double precision scalars as a quaternion */
+  * Map a 16-byte aligned array of double precision scalars as a quaternion */
 typedef Map<Quaternion<double>, Aligned>  QuaternionMapAlignedd;
 
 /***************************************************************************
@@ -442,7 +445,7 @@ QuaternionBase<Derived>::operator* (const QuaternionBase<OtherDerived>& other) c
    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
   return internal::quat_product<Architecture::Target, Derived, OtherDerived,
                          typename internal::traits<Derived>::Scalar,
-                         internal::traits<Derived>::IsAligned && internal::traits<OtherDerived>::IsAligned>::run(*this, other);
+                         EIGEN_PLAIN_ENUM_MIN(internal::traits<Derived>::Alignment, internal::traits<OtherDerived>::Alignment)>::run(*this, other);
 }
 
 /** \sa operator*(Quaternion) */
@@ -463,12 +466,12 @@ EIGEN_STRONG_INLINE Derived& QuaternionBase<Derived>::operator*= (const Quaterni
   */
 template <class Derived>
 EIGEN_STRONG_INLINE typename QuaternionBase<Derived>::Vector3
-QuaternionBase<Derived>::_transformVector(Vector3 v) const
+QuaternionBase<Derived>::_transformVector(const Vector3& v) const
 {
     // Note that this algorithm comes from the optimization by hand
     // of the conversion to a Matrix followed by a Matrix/Vector product.
     // It appears to be much faster than the common algorithm found
-    // in the litterature (30 versus 39 flops). It also requires two
+    // in the literature (30 versus 39 flops). It also requires two
     // Vector3 as temporaries.
     Vector3 uv = this->vec().cross(v);
     uv += uv;
@@ -572,14 +575,13 @@ template<class Derived>
 template<typename Derived1, typename Derived2>
 inline Derived& QuaternionBase<Derived>::setFromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b)
 {
-  using std::max;
   using std::sqrt;
   Vector3 v0 = a.normalized();
   Vector3 v1 = b.normalized();
   Scalar c = v1.dot(v0);
 
   // if dot == -1, vectors are nearly opposites
-  // => accuraletly compute the rotation axis by computing the
+  // => accurately compute the rotation axis by computing the
   //    intersection of the two planes. This is done by solving:
   //       x^T v0 = 0
   //       x^T v1 = 0
@@ -588,7 +590,7 @@ inline Derived& QuaternionBase<Derived>::setFromTwoVectors(const MatrixBase<Deri
   //    which yields a singular value problem
   if (c < Scalar(-1)+NumTraits<Scalar>::dummy_precision())
   {
-    c = max<Scalar>(c,-1);
+    c = numext::maxi(c,Scalar(-1));
     Matrix<Scalar,2,3> m; m << v0.transpose(), v1.transpose();
     JacobiSVD<Matrix<Scalar,2,3> > svd(m, ComputeFullV);
     Vector3 axis = svd.matrixV().col(2);
@@ -639,7 +641,7 @@ inline Quaternion<typename internal::traits<Derived>::Scalar> QuaternionBase<Der
 {
   // FIXME should this function be called multiplicativeInverse and conjugate() be called inverse() or opposite()  ??
   Scalar n2 = this->squaredNorm();
-  if (n2 > 0)
+  if (n2 > Scalar(0))
     return Quaternion<Scalar>(conjugate().coeffs() / n2);
   else
   {
@@ -648,6 +650,16 @@ inline Quaternion<typename internal::traits<Derived>::Scalar> QuaternionBase<Der
   }
 }
 
+// Generic conjugate of a Quaternion
+namespace internal {
+template<int Arch, class Derived, typename Scalar, int _Options> struct quat_conj
+{
+  static EIGEN_STRONG_INLINE Quaternion<Scalar> run(const QuaternionBase<Derived>& q){
+    return Quaternion<Scalar>(q.w(),-q.x(),-q.y(),-q.z());
+  }
+};
+}
+                         
 /** \returns the conjugate of the \c *this which is equal to the multiplicative inverse
   * if the quaternion is normalized.
   * The conjugate of a quaternion represents the opposite rotation.
@@ -658,7 +670,10 @@ template <class Derived>
 inline Quaternion<typename internal::traits<Derived>::Scalar>
 QuaternionBase<Derived>::conjugate() const
 {
-  return Quaternion<Scalar>(this->w(),-this->x(),-this->y(),-this->z());
+  return internal::quat_conj<Architecture::Target, Derived,
+                         typename internal::traits<Derived>::Scalar,
+                         internal::traits<Derived>::Alignment>::run(*this);
+                         
 }
 
 /** \returns the angle (in radian) between two rotations
@@ -669,16 +684,19 @@ template <class OtherDerived>
 inline typename internal::traits<Derived>::Scalar
 QuaternionBase<Derived>::angularDistance(const QuaternionBase<OtherDerived>& other) const
 {
-  using std::acos;
+  using std::atan2;
   using std::abs;
-  double d = abs(this->dot(other));
-  if (d>=1.0)
-    return Scalar(0);
-  return static_cast<Scalar>(2 * acos(d));
+  Quaternion<Scalar> d = (*this) * other.conjugate();
+  return Scalar(2) * atan2( d.vec().norm(), abs(d.w()) );
 }
 
+ 
+    
 /** \returns the spherical linear interpolation between the two quaternions
-  * \c *this and \a other at the parameter \a t
+  * \c *this and \a other at the parameter \a t in [0;1].
+  * 
+  * This represents an interpolation for a constant motion between \c *this and \a other,
+  * see also http://en.wikipedia.org/wiki/Slerp.
   */
 template <class Derived>
 template <class OtherDerived>
@@ -709,7 +727,7 @@ QuaternionBase<Derived>::slerp(const Scalar& t, const QuaternionBase<OtherDerive
     scale0 = sin( ( Scalar(1) - t ) * theta) / sinTheta;
     scale1 = sin( ( t * theta) ) / sinTheta;
   }
-  if(d<0) scale1 = -scale1;
+  if(d<Scalar(0)) scale1 = -scale1;
 
   return Quaternion<Scalar>(scale0 * coeffs() + scale1 * other.coeffs());
 }
@@ -721,9 +739,9 @@ template<typename Other>
 struct quaternionbase_assign_impl<Other,3,3>
 {
   typedef typename Other::Scalar Scalar;
-  typedef DenseIndex Index;
-  template<class Derived> static inline void run(QuaternionBase<Derived>& q, const Other& mat)
+  template<class Derived> static inline void run(QuaternionBase<Derived>& q, const Other& a_mat)
   {
+    const typename internal::nested_eval<Other,2>::type mat(a_mat);
     using std::sqrt;
     // This algorithm comes from  "Quaternion Calculus and Fast Animation",
     // Ken Shoemake, 1987 SIGGRAPH course notes
@@ -739,13 +757,13 @@ struct quaternionbase_assign_impl<Other,3,3>
     }
     else
     {
-      DenseIndex i = 0;
+      Index i = 0;
       if (mat.coeff(1,1) > mat.coeff(0,0))
         i = 1;
       if (mat.coeff(2,2) > mat.coeff(i,i))
         i = 2;
-      DenseIndex j = (i+1)%3;
-      DenseIndex k = (j+1)%3;
+      Index j = (i+1)%3;
+      Index k = (j+1)%3;
 
       t = sqrt(mat.coeff(i,i)-mat.coeff(j,j)-mat.coeff(k,k) + Scalar(1.0));
       q.coeffs().coeffRef(i) = Scalar(0.5) * t;
diff --git a/nuparu/include/Eigen/src/Geometry/Rotation2D.h b/nuparu/include/Eigen/src/Geometry/Rotation2D.h
index 1cac343a..8b0ddcfb 100644
--- a/nuparu/include/Eigen/src/Geometry/Rotation2D.h
+++ b/nuparu/include/Eigen/src/Geometry/Rotation2D.h
@@ -59,20 +59,47 @@ class Rotation2D : public RotationBase<Rotation2D<_Scalar>,2>
 public:
 
   /** Construct a 2D counter clock wise rotation from the angle \a a in radian. */
-  inline Rotation2D(const Scalar& a) : m_angle(a) {}
+  explicit inline Rotation2D(const Scalar& a) : m_angle(a) {}
+  
+  /** Default constructor without initialization. The represented rotation is undefined. */
+  Rotation2D() {}
+
+  /** Construct a 2D rotation from a 2x2 rotation matrix \a m.
+    *
+    * \sa fromRotationMatrix()
+    */
+  template<typename Derived>
+  explicit Rotation2D(const MatrixBase<Derived>& m)
+  {
+    fromRotationMatrix(m.derived());
+  }
 
   /** \returns the rotation angle */
   inline Scalar angle() const { return m_angle; }
 
   /** \returns a read-write reference to the rotation angle */
   inline Scalar& angle() { return m_angle; }
+  
+  /** \returns the rotation angle in [0,2pi] */
+  inline Scalar smallestPositiveAngle() const {
+    Scalar tmp = fmod(m_angle,Scalar(2)*EIGEN_PI);
+    return tmp<Scalar(0) ? tmp + Scalar(2)*EIGEN_PI : tmp;
+  }
+  
+  /** \returns the rotation angle in [-pi,pi] */
+  inline Scalar smallestAngle() const {
+    Scalar tmp = fmod(m_angle,Scalar(2)*EIGEN_PI);
+    if(tmp>Scalar(EIGEN_PI))       tmp -= Scalar(2)*Scalar(EIGEN_PI);
+    else if(tmp<-Scalar(EIGEN_PI)) tmp += Scalar(2)*Scalar(EIGEN_PI);
+    return tmp;
+  }
 
   /** \returns the inverse rotation */
-  inline Rotation2D inverse() const { return -m_angle; }
+  inline Rotation2D inverse() const { return Rotation2D(-m_angle); }
 
   /** Concatenates two rotations */
   inline Rotation2D operator*(const Rotation2D& other) const
-  { return m_angle + other.m_angle; }
+  { return Rotation2D(m_angle + other.m_angle); }
 
   /** Concatenates two rotations */
   inline Rotation2D& operator*=(const Rotation2D& other)
@@ -81,16 +108,30 @@ class Rotation2D : public RotationBase<Rotation2D<_Scalar>,2>
   /** Applies the rotation to a 2D vector */
   Vector2 operator* (const Vector2& vec) const
   { return toRotationMatrix() * vec; }
-
+  
   template<typename Derived>
   Rotation2D& fromRotationMatrix(const MatrixBase<Derived>& m);
-  Matrix2 toRotationMatrix(void) const;
+  Matrix2 toRotationMatrix() const;
+
+  /** Set \c *this from a 2x2 rotation matrix \a m.
+    * In other words, this function extracts the rotation angle from the rotation matrix.
+    *
+    * This method is an alias for fromRotationMatrix()
+    *
+    * \sa fromRotationMatrix()
+    */
+  template<typename Derived>
+  Rotation2D& operator=(const  MatrixBase<Derived>& m)
+  { return fromRotationMatrix(m.derived()); }
 
   /** \returns the spherical interpolation between \c *this and \a other using
     * parameter \a t. It is in fact equivalent to a linear interpolation.
     */
   inline Rotation2D slerp(const Scalar& t, const Rotation2D& other) const
-  { return m_angle * (1-t) + other.angle() * t; }
+  {
+    Scalar dist = Rotation2D(other.m_angle-m_angle).smallestAngle();
+    return Rotation2D(m_angle + dist*t);
+  }
 
   /** \returns \c *this with scalar type casted to \a NewScalarType
     *
@@ -116,6 +157,7 @@ class Rotation2D : public RotationBase<Rotation2D<_Scalar>,2>
     * \sa MatrixBase::isApprox() */
   bool isApprox(const Rotation2D& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
   { return internal::isApprox(m_angle,other.m_angle, prec); }
+  
 };
 
 /** \ingroup Geometry_Module
diff --git a/nuparu/include/Eigen/src/Geometry/Scaling.h b/nuparu/include/Eigen/src/Geometry/Scaling.h
index 1c25f36f..023fba2e 100644
--- a/nuparu/include/Eigen/src/Geometry/Scaling.h
+++ b/nuparu/include/Eigen/src/Geometry/Scaling.h
@@ -62,10 +62,10 @@ class UniformScaling
   template<int Dim, int Mode, int Options>
   inline Transform<Scalar,Dim,(int(Mode)==int(Isometry)?Affine:Mode)> operator* (const Transform<Scalar,Dim, Mode, Options>& t) const
   {
-   Transform<Scalar,Dim,(int(Mode)==int(Isometry)?Affine:Mode)> res = t;
-   res.prescale(factor());
-   return res;
-}
+    Transform<Scalar,Dim,(int(Mode)==int(Isometry)?Affine:Mode)> res = t;
+    res.prescale(factor());
+    return res;
+  }
 
   /** Concatenates a uniform scaling and a linear transformation matrix */
   // TODO returns an expression
diff --git a/nuparu/include/Eigen/src/Geometry/Transform.h b/nuparu/include/Eigen/src/Geometry/Transform.h
index 887e718d..75f20bda 100644
--- a/nuparu/include/Eigen/src/Geometry/Transform.h
+++ b/nuparu/include/Eigen/src/Geometry/Transform.h
@@ -62,6 +62,24 @@ struct transform_construct_from_matrix;
 
 template<typename TransformType> struct transform_take_affine_part;
 
+template<typename _Scalar, int _Dim, int _Mode, int _Options>
+struct traits<Transform<_Scalar,_Dim,_Mode,_Options> >
+{
+  typedef _Scalar Scalar;
+  typedef Eigen::Index StorageIndex;
+  typedef Dense StorageKind;
+  enum {
+    Dim1 = _Dim==Dynamic ? _Dim : _Dim + 1,
+    RowsAtCompileTime = _Mode==Projective ? Dim1 : _Dim,
+    ColsAtCompileTime = Dim1,
+    MaxRowsAtCompileTime = RowsAtCompileTime,
+    MaxColsAtCompileTime = ColsAtCompileTime,
+    Flags = 0
+  };
+};
+
+template<int Mode> struct transform_make_affine;
+
 } // end namespace internal
 
 /** \geometry_module \ingroup Geometry_Module
@@ -100,15 +118,15 @@ template<typename TransformType> struct transform_take_affine_part;
   *
   * However, unlike a plain matrix, the Transform class provides many features
   * simplifying both its assembly and usage. In particular, it can be composed
-  * with any other transformations (Transform,Translation,RotationBase,Matrix)
+  * with any other transformations (Transform,Translation,RotationBase,DiagonalMatrix)
   * and can be directly used to transform implicit homogeneous vectors. All these
   * operations are handled via the operator*. For the composition of transformations,
   * its principle consists to first convert the right/left hand sides of the product
   * to a compatible (Dim+1)^2 matrix and then perform a pure matrix product.
   * Of course, internally, operator* tries to perform the minimal number of operations
   * according to the nature of each terms. Likewise, when applying the transform
-  * to non homogeneous vectors, the latters are automatically promoted to homogeneous
-  * one before doing the matrix product. The convertions to homogeneous representations
+  * to points, the latter are automatically promoted to homogeneous vectors
+  * before doing the matrix product. The conversions to homogeneous representations
   * are performed as follow:
   *
   * \b Translation t (Dim)x(1):
@@ -122,7 +140,7 @@ template<typename TransformType> struct transform_take_affine_part;
   * R & 0\\
   * 0\,...\,0 & 1
   * \end{array} \right) \f$
-  *
+  *<!--
   * \b Linear \b Matrix L (Dim)x(Dim):
   * \f$ \left( \begin{array}{cc}
   * L & 0\\
@@ -134,14 +152,20 @@ template<typename TransformType> struct transform_take_affine_part;
   * A\\
   * 0\,...\,0\,1
   * \end{array} \right) \f$
+  *-->
+  * \b Scaling \b DiagonalMatrix S (Dim)x(Dim):
+  * \f$ \left( \begin{array}{cc}
+  * S & 0\\
+  * 0\,...\,0 & 1
+  * \end{array} \right) \f$
   *
-  * \b Column \b vector v (Dim)x(1):
+  * \b Column \b point v (Dim)x(1):
   * \f$ \left( \begin{array}{c}
   * v\\
   * 1
   * \end{array} \right) \f$
   *
-  * \b Set \b of \b column \b vectors V1...Vn (Dim)x(n):
+  * \b Set \b of \b column \b points V1...Vn (Dim)x(n):
   * \f$ \left( \begin{array}{ccc}
   * v_1 & ... & v_n\\
   * 1 & ... & 1
@@ -186,7 +210,8 @@ class Transform
   };
   /** the scalar type of the coefficients */
   typedef _Scalar Scalar;
-  typedef DenseIndex Index;
+  typedef Eigen::Index StorageIndex;
+  typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
   /** type of the matrix used to represent the transformation */
   typedef typename internal::make_proper_matrix_type<Scalar,Rows,HDim,Options>::type MatrixType;
   /** constified MatrixType */
@@ -194,9 +219,9 @@ class Transform
   /** type of the matrix used to represent the linear part of the transformation */
   typedef Matrix<Scalar,Dim,Dim,Options> LinearMatrixType;
   /** type of read/write reference to the linear part of the transformation */
-  typedef Block<MatrixType,Dim,Dim,int(Mode)==(AffineCompact)> LinearPart;
+  typedef Block<MatrixType,Dim,Dim,int(Mode)==(AffineCompact) && (Options&RowMajor)==0> LinearPart;
   /** type of read reference to the linear part of the transformation */
-  typedef const Block<ConstMatrixType,Dim,Dim,int(Mode)==(AffineCompact)> ConstLinearPart;
+  typedef const Block<ConstMatrixType,Dim,Dim,int(Mode)==(AffineCompact) && (Options&RowMajor)==0> ConstLinearPart;
   /** type of read/write reference to the affine part of the transformation */
   typedef typename internal::conditional<int(Mode)==int(AffineCompact),
                               MatrixType&,
@@ -208,9 +233,9 @@ class Transform
   /** type of a vector */
   typedef Matrix<Scalar,Dim,1> VectorType;
   /** type of a read/write reference to the translation part of the rotation */
-  typedef Block<MatrixType,Dim,1,int(Mode)==(AffineCompact)> TranslationPart;
+  typedef Block<MatrixType,Dim,1,!(internal::traits<MatrixType>::Flags & RowMajorBit)> TranslationPart;
   /** type of a read reference to the translation part of the rotation */
-  typedef const Block<ConstMatrixType,Dim,1,int(Mode)==(AffineCompact)> ConstTranslationPart;
+  typedef const Block<ConstMatrixType,Dim,1,!(internal::traits<MatrixType>::Flags & RowMajorBit)> ConstTranslationPart;
   /** corresponding translation type */
   typedef Translation<Scalar,Dim> TranslationType;
   
@@ -230,8 +255,7 @@ class Transform
   inline Transform()
   {
     check_template_params();
-    if (int(Mode)==Affine)
-      makeAffine();
+    internal::transform_make_affine<(int(Mode)==Affine) ? Affine : AffineCompact>::run(m_matrix);
   }
 
   inline Transform(const Transform& other)
@@ -355,6 +379,9 @@ class Transform
   inline Transform& operator=(const QTransform& other);
   inline QTransform toQTransform(void) const;
   #endif
+  
+  Index rows() const { return int(Mode)==int(Projective) ? m_matrix.cols() : (m_matrix.cols()-1); }
+  Index cols() const { return m_matrix.cols(); }
 
   /** shortcut for m_matrix(row,col);
     * \sa MatrixBase::operator(Index,Index) const */
@@ -383,26 +410,39 @@ class Transform
   /** \returns a writable expression of the translation vector of the transformation */
   inline TranslationPart translation() { return TranslationPart(m_matrix,0,Dim); }
 
-  /** \returns an expression of the product between the transform \c *this and a matrix expression \a other
+  /** \returns an expression of the product between the transform \c *this and a matrix expression \a other.
     *
-    * The right hand side \a other might be either:
-    * \li a vector of size Dim,
+    * The right-hand-side \a other can be either:
     * \li an homogeneous vector of size Dim+1,
-    * \li a set of vectors of size Dim x Dynamic,
-    * \li a set of homogeneous vectors of size Dim+1 x Dynamic,
-    * \li a linear transformation matrix of size Dim x Dim,
-    * \li an affine transformation matrix of size Dim x Dim+1,
+    * \li a set of homogeneous vectors of size Dim+1 x N,
     * \li a transformation matrix of size Dim+1 x Dim+1.
+    *
+    * Moreover, if \c *this represents an affine transformation (i.e., Mode!=Projective), then \a other can also be:
+    * \li a point of size Dim (computes: \code this->linear() * other + this->translation()\endcode),
+    * \li a set of N points as a Dim x N matrix (computes: \code (this->linear() * other).colwise() + this->translation()\endcode),
+    *
+    * In all cases, the return type is a matrix or vector of same sizes as the right-hand-side \a other.
+    *
+    * If you want to interpret \a other as a linear or affine transformation, then first convert it to a Transform<> type,
+    * or do your own cooking.
+    *
+    * Finally, if you want to apply Affine transformations to vectors, then explicitly apply the linear part only:
+    * \code
+    * Affine3f A;
+    * Vector3f v1, v2;
+    * v2 = A.linear() * v1;
+    * \endcode
+    *
     */
   // note: this function is defined here because some compilers cannot find the respective declaration
   template<typename OtherDerived>
-  EIGEN_STRONG_INLINE const typename internal::transform_right_product_impl<Transform, OtherDerived>::ResultType
+  EIGEN_STRONG_INLINE const typename OtherDerived::PlainObject
   operator * (const EigenBase<OtherDerived> &other) const
   { return internal::transform_right_product_impl<Transform, OtherDerived>::run(*this,other.derived()); }
 
   /** \returns the product expression of a transformation matrix \a a times a transform \a b
     *
-    * The left hand side \a other might be either:
+    * The left hand side \a other can be either:
     * \li a linear transformation matrix of size Dim x Dim,
     * \li an affine transformation matrix of size Dim x Dim+1,
     * \li a general transformation matrix of size Dim+1 x Dim+1.
@@ -454,7 +494,7 @@ class Transform
     return internal::transform_transform_product_impl<Transform,Transform>::run(*this,other);
   }
   
-  #ifdef __INTEL_COMPILER
+  #if EIGEN_COMP_ICC
 private:
   // this intermediate structure permits to workaround a bug in ICC 11:
   //   error: template instantiation resulted in unexpected function type of "Eigen::Transform<double, 3, 32, 0>
@@ -530,9 +570,9 @@ class Transform
 
   inline Transform& operator=(const UniformScaling<Scalar>& t);
   inline Transform& operator*=(const UniformScaling<Scalar>& s) { return scale(s.factor()); }
-  inline Transform<Scalar,Dim,(int(Mode)==int(Isometry)?Affine:Isometry)> operator*(const UniformScaling<Scalar>& s) const
+  inline TransformTimeDiagonalReturnType operator*(const UniformScaling<Scalar>& s) const
   {
-    Transform<Scalar,Dim,(int(Mode)==int(Isometry)?Affine:Isometry),Options> res = *this;
+    TransformTimeDiagonalReturnType res = *this;
     res.scale(s.factor());
     return res;
   }
@@ -591,11 +631,7 @@ class Transform
     */
   void makeAffine()
   {
-    if(int(Mode)!=int(AffineCompact))
-    {
-      matrix().template block<1,Dim>(Dim,0).setZero();
-      matrix().coeffRef(Dim,Dim) = Scalar(1);
-    }
+    internal::transform_make_affine<int(Mode)>::run(m_matrix);
   }
 
   /** \internal
@@ -699,9 +735,13 @@ template<typename Scalar, int Dim, int Mode,int Options>
 Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::operator=(const QMatrix& other)
 {
   EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  m_matrix << other.m11(), other.m21(), other.dx(),
-              other.m12(), other.m22(), other.dy(),
-              0, 0, 1;
+  if (Mode == int(AffineCompact))
+    m_matrix << other.m11(), other.m21(), other.dx(),
+                other.m12(), other.m22(), other.dy();
+  else
+    m_matrix << other.m11(), other.m21(), other.dx(),
+                other.m12(), other.m22(), other.dy(),
+                0, 0, 1;
   return *this;
 }
 
@@ -1079,6 +1119,24 @@ Transform<Scalar,Dim,Mode,Options>::fromPositionOrientationScale(const MatrixBas
 
 namespace internal {
 
+template<int Mode>
+struct transform_make_affine // generic case: writes the row (0,...,0,1) at row index Dim of mat
+{
+  template<typename MatrixType>
+  static void run(MatrixType &mat)
+  {
+    static const int Dim = MatrixType::ColsAtCompileTime-1; // compile-time column count minus one
+    mat.template block<1,Dim>(Dim,0).setZero(); // row Dim, columns [0,Dim) are set to zero
+    mat.coeffRef(Dim,Dim) = typename MatrixType::Scalar(1); // entry (Dim,Dim) is set to one
+  }
+};
+
+template<>
+struct transform_make_affine<AffineCompact> // AffineCompact case: run() is a no-op and mat is left untouched
+{
+  template<typename MatrixType> static void run(MatrixType &) { }
+};
+    
 // selector needed to avoid taking the inverse of a 3x4 matrix
 template<typename TransformType, int Mode=TransformType::Mode>
 struct projective_transform_inverse
diff --git a/nuparu/include/Eigen/src/Geometry/Umeyama.h b/nuparu/include/Eigen/src/Geometry/Umeyama.h
index 345b47e0..8d9b7a15 100644
--- a/nuparu/include/Eigen/src/Geometry/Umeyama.h
+++ b/nuparu/include/Eigen/src/Geometry/Umeyama.h
@@ -97,7 +97,6 @@ umeyama(const MatrixBase<Derived>& src, const MatrixBase<OtherDerived>& dst, boo
   typedef typename internal::umeyama_transform_matrix_type<Derived, OtherDerived>::type TransformationMatrixType;
   typedef typename internal::traits<TransformationMatrixType>::Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef typename Derived::Index Index;
 
   EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsComplex, NUMERIC_TYPE_MUST_BE_REAL)
   EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename internal::traits<OtherDerived>::Scalar>::value),
@@ -113,7 +112,7 @@ umeyama(const MatrixBase<Derived>& src, const MatrixBase<OtherDerived>& dst, boo
   const Index n = src.cols(); // number of measurements
 
   // required for demeaning ...
-  const RealScalar one_over_n = 1 / static_cast<RealScalar>(n);
+  const RealScalar one_over_n = RealScalar(1) / static_cast<RealScalar>(n);
 
   // computation of mean
   const VectorType src_mean = src.rowwise().sum() * one_over_n;
@@ -136,16 +135,16 @@ umeyama(const MatrixBase<Derived>& src, const MatrixBase<OtherDerived>& dst, boo
 
   // Eq. (39)
   VectorType S = VectorType::Ones(m);
-  if (sigma.determinant()<0) S(m-1) = -1;
+  if (sigma.determinant()<Scalar(0)) S(m-1) = Scalar(-1);
 
   // Eq. (40) and (43)
   const VectorType& d = svd.singularValues();
   Index rank = 0; for (Index i=0; i<m; ++i) if (!internal::isMuchSmallerThan(d.coeff(i),d.coeff(0))) ++rank;
   if (rank == m-1) {
-    if ( svd.matrixU().determinant() * svd.matrixV().determinant() > 0 ) {
+    if ( svd.matrixU().determinant() * svd.matrixV().determinant() > Scalar(0) ) {
       Rt.block(0,0,m,m).noalias() = svd.matrixU()*svd.matrixV().transpose();
     } else {
-      const Scalar s = S(m-1); S(m-1) = -1;
+      const Scalar s = S(m-1); S(m-1) = Scalar(-1);
       Rt.block(0,0,m,m).noalias() = svd.matrixU() * S.asDiagonal() * svd.matrixV().transpose();
       S(m-1) = s;
     }
@@ -156,7 +155,7 @@ umeyama(const MatrixBase<Derived>& src, const MatrixBase<OtherDerived>& dst, boo
   if (with_scaling)
   {
     // Eq. (42)
-    const Scalar c = 1/src_var * svd.singularValues().dot(S);
+    const Scalar c = Scalar(1)/src_var * svd.singularValues().dot(S);
 
     // Eq. (41)
     Rt.col(m).head(m) = dst_mean;
diff --git a/nuparu/include/Eigen/src/Geometry/arch/Geometry_SSE.h b/nuparu/include/Eigen/src/Geometry/arch/Geometry_SSE.h
index 3d8284f2..1a86ff83 100644
--- a/nuparu/include/Eigen/src/Geometry/arch/Geometry_SSE.h
+++ b/nuparu/include/Eigen/src/Geometry/arch/Geometry_SSE.h
@@ -16,35 +16,47 @@ namespace Eigen {
 namespace internal {
 
 template<class Derived, class OtherDerived>
-struct quat_product<Architecture::SSE, Derived, OtherDerived, float, Aligned>
+struct quat_product<Architecture::SSE, Derived, OtherDerived, float, Aligned16>
 {
   static inline Quaternion<float> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b)
   {
-    const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0,0,0,0x80000000));
     Quaternion<float> res;
-    __m128 a = _a.coeffs().template packet<Aligned>(0);
-    __m128 b = _b.coeffs().template packet<Aligned>(0);
-    __m128 flip1 = _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a,1,2,0,2),
-                                         vec4f_swizzle1(b,2,0,1,2)),mask);
-    __m128 flip2 = _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a,3,3,3,1),
-                                         vec4f_swizzle1(b,0,1,2,1)),mask);
+    const __m128 mask = _mm_setr_ps(0.f,0.f,0.f,-0.f);
+    __m128 a = _a.coeffs().template packet<Aligned16>(0);
+    __m128 b = _b.coeffs().template packet<Aligned16>(0);
+    __m128 s1 = _mm_mul_ps(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2));
+    __m128 s2 = _mm_mul_ps(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1));
     pstore(&res.x(),
               _mm_add_ps(_mm_sub_ps(_mm_mul_ps(a,vec4f_swizzle1(b,3,3,3,3)),
                                     _mm_mul_ps(vec4f_swizzle1(a,2,0,1,0),
                                                vec4f_swizzle1(b,1,2,0,0))),
-                         _mm_add_ps(flip1,flip2)));
+                         _mm_xor_ps(mask,_mm_add_ps(s1,s2))));
+    
     return res;
   }
 };
 
+template<class Derived, int Alignment>
+struct quat_conj<Architecture::SSE, Derived, float, Alignment> // SSE specialization for float coefficients
+{
+  static inline Quaternion<float> run(const QuaternionBase<Derived>& q)
+  {
+    Quaternion<float> res;
+    const __m128 mask = _mm_setr_ps(-0.f,-0.f,-0.f,0.f); // sign bit set in lanes 0..2 only
+    pstore(&res.x(), _mm_xor_ps(mask, q.coeffs().template packet<Alignment>(0))); // XOR negates lanes 0..2; lane 3 is copied unchanged
+    return res;
+  }
+};
+
+
 template<typename VectorLhs,typename VectorRhs>
 struct cross3_impl<Architecture::SSE,VectorLhs,VectorRhs,float,true>
 {
   static inline typename plain_matrix_type<VectorLhs>::type
   run(const VectorLhs& lhs, const VectorRhs& rhs)
   {
-    __m128 a = lhs.template packet<VectorLhs::Flags&AlignedBit ? Aligned : Unaligned>(0);
-    __m128 b = rhs.template packet<VectorRhs::Flags&AlignedBit ? Aligned : Unaligned>(0);
+    __m128 a = lhs.template packet<traits<VectorLhs>::Alignment>(0);
+    __m128 b = rhs.template packet<traits<VectorRhs>::Alignment>(0);
     __m128 mul1=_mm_mul_ps(vec4f_swizzle1(a,1,2,0,3),vec4f_swizzle1(b,2,0,1,3));
     __m128 mul2=_mm_mul_ps(vec4f_swizzle1(a,2,0,1,3),vec4f_swizzle1(b,1,2,0,3));
     typename plain_matrix_type<VectorLhs>::type res;
@@ -56,8 +68,8 @@ struct cross3_impl<Architecture::SSE,VectorLhs,VectorRhs,float,true>
 
 
 
-template<class Derived, class OtherDerived>
-struct quat_product<Architecture::SSE, Derived, OtherDerived, double, Aligned>
+template<class Derived, class OtherDerived, int Alignment>
+struct quat_product<Architecture::SSE, Derived, OtherDerived, double, Alignment>
 {
   static inline Quaternion<double> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b)
   {
@@ -66,8 +78,8 @@ struct quat_product<Architecture::SSE, Derived, OtherDerived, double, Aligned>
   Quaternion<double> res;
 
   const double* a = _a.coeffs().data();
-  Packet2d b_xy = _b.coeffs().template packet<Aligned>(0);
-  Packet2d b_zw = _b.coeffs().template packet<Aligned>(2);
+  Packet2d b_xy = _b.coeffs().template packet<Alignment>(0);
+  Packet2d b_zw = _b.coeffs().template packet<Alignment>(2);
   Packet2d a_xx = pset1<Packet2d>(a[0]);
   Packet2d a_yy = pset1<Packet2d>(a[1]);
   Packet2d a_zz = pset1<Packet2d>(a[2]);
@@ -108,6 +120,20 @@ struct quat_product<Architecture::SSE, Derived, OtherDerived, double, Aligned>
 }
 };
 
+template<class Derived, int Alignment>
+struct quat_conj<Architecture::SSE, Derived, double, Alignment> // SSE specialization for double coefficients
+{
+  static inline Quaternion<double> run(const QuaternionBase<Derived>& q)
+  {
+    Quaternion<double> res;
+    const __m128d mask0 = _mm_setr_pd(-0.,-0.); // sign bit set in both lanes
+    const __m128d mask2 = _mm_setr_pd(-0.,0.); // sign bit set in lane 0 only
+    pstore(&res.x(), _mm_xor_pd(mask0, q.coeffs().template packet<Alignment>(0))); // coefficients 0 and 1: both negated
+    pstore(&res.z(), _mm_xor_pd(mask2, q.coeffs().template packet<Alignment>(2))); // coefficients 2 and 3: only coefficient 2 negated
+    return res;
+  }
+};
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/Householder/BlockHouseholder.h b/nuparu/include/Eigen/src/Householder/BlockHouseholder.h
index 1991c652..39bf8c83 100644
--- a/nuparu/include/Eigen/src/Householder/BlockHouseholder.h
+++ b/nuparu/include/Eigen/src/Householder/BlockHouseholder.h
@@ -16,48 +16,82 @@
 namespace Eigen { 
 
 namespace internal {
+  
+/** \internal */
+// template<typename TriangularFactorType,typename VectorsType,typename CoeffsType>
+// void make_block_householder_triangular_factor(TriangularFactorType& triFactor, const VectorsType& vectors, const CoeffsType& hCoeffs)
+// {
+//   typedef typename VectorsType::Scalar Scalar;
+//   const Index nbVecs = vectors.cols();
+//   eigen_assert(triFactor.rows() == nbVecs && triFactor.cols() == nbVecs && vectors.rows()>=nbVecs);
+// 
+//   for(Index i = 0; i < nbVecs; i++)
+//   {
+//     Index rs = vectors.rows() - i;
+//     // Warning, note that hCoeffs may alias with vectors.
+//     // It is then necessary to copy it before modifying vectors(i,i). 
+//     typename CoeffsType::Scalar h = hCoeffs(i);
+//     // This hack permits to pass through nested Block<> and Transpose<> expressions.
+//     Scalar *Vii_ptr = const_cast<Scalar*>(vectors.data() + vectors.outerStride()*i + vectors.innerStride()*i);
+//     Scalar Vii = *Vii_ptr;
+//     *Vii_ptr = Scalar(1);
+//     triFactor.col(i).head(i).noalias() = -h * vectors.block(i, 0, rs, i).adjoint()
+//                                        * vectors.col(i).tail(rs);
+//     *Vii_ptr = Vii;
+//     // FIXME add .noalias() once the triangular product can work inplace
+//     triFactor.col(i).head(i) = triFactor.block(0,0,i,i).template triangularView<Upper>()
+//                              * triFactor.col(i).head(i);
+//     triFactor(i,i) = hCoeffs(i);
+//   }
+// }
 
 /** \internal */
+// This variant avoids modifications in vectors
 template<typename TriangularFactorType,typename VectorsType,typename CoeffsType>
 void make_block_householder_triangular_factor(TriangularFactorType& triFactor, const VectorsType& vectors, const CoeffsType& hCoeffs)
 {
-  typedef typename TriangularFactorType::Index Index;
-  typedef typename VectorsType::Scalar Scalar;
   const Index nbVecs = vectors.cols();
   eigen_assert(triFactor.rows() == nbVecs && triFactor.cols() == nbVecs && vectors.rows()>=nbVecs);
 
-  for(Index i = 0; i < nbVecs; i++)
+  for(Index i = nbVecs-1; i >=0 ; --i)
   {
-    Index rs = vectors.rows() - i;
-    Scalar Vii = vectors(i,i);
-    vectors.const_cast_derived().coeffRef(i,i) = Scalar(1);
-    triFactor.col(i).head(i).noalias() = -hCoeffs(i) * vectors.block(i, 0, rs, i).adjoint()
-                                       * vectors.col(i).tail(rs);
-    vectors.const_cast_derived().coeffRef(i, i) = Vii;
-    // FIXME add .noalias() once the triangular product can work inplace
-    triFactor.col(i).head(i) = triFactor.block(0,0,i,i).template triangularView<Upper>()
-                             * triFactor.col(i).head(i);
+    Index rs = vectors.rows() - i - 1;
+    Index rt = nbVecs-i-1;
+
+    if(rt>0)
+    {
+      triFactor.row(i).tail(rt).noalias() = -hCoeffs(i) * vectors.col(i).tail(rs).adjoint()
+                                                        * vectors.bottomRightCorner(rs, rt).template triangularView<UnitLower>();
+            
+      // FIXME add .noalias() once the triangular product can work inplace
+      triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template triangularView<Upper>();
+      
+    }
     triFactor(i,i) = hCoeffs(i);
   }
 }
 
-/** \internal */
+/** \internal
+  * if forward then perform   mat = H0 * H1 * H2 * mat
+  * otherwise perform         mat = H2 * H1 * H0 * mat
+  */
 template<typename MatrixType,typename VectorsType,typename CoeffsType>
-void apply_block_householder_on_the_left(MatrixType& mat, const VectorsType& vectors, const CoeffsType& hCoeffs)
+void apply_block_householder_on_the_left(MatrixType& mat, const VectorsType& vectors, const CoeffsType& hCoeffs, bool forward)
 {
-  typedef typename MatrixType::Index Index;
   enum { TFactorSize = MatrixType::ColsAtCompileTime };
   Index nbVecs = vectors.cols();
-  Matrix<typename MatrixType::Scalar, TFactorSize, TFactorSize> T(nbVecs,nbVecs);
-  make_block_householder_triangular_factor(T, vectors, hCoeffs);
-
-  const TriangularView<const VectorsType, UnitLower>& V(vectors);
+  Matrix<typename MatrixType::Scalar, TFactorSize, TFactorSize, RowMajor> T(nbVecs,nbVecs);
+  
+  if(forward) make_block_householder_triangular_factor(T, vectors, hCoeffs);
+  else        make_block_householder_triangular_factor(T, vectors, hCoeffs.conjugate());  
+  const TriangularView<const VectorsType, UnitLower> V(vectors);
 
   // A -= V T V^* A
   Matrix<typename MatrixType::Scalar,VectorsType::ColsAtCompileTime,MatrixType::ColsAtCompileTime,0,
          VectorsType::MaxColsAtCompileTime,MatrixType::MaxColsAtCompileTime> tmp = V.adjoint() * mat;
   // FIXME add .noalias() once the triangular product can work inplace
-  tmp = T.template triangularView<Upper>().adjoint() * tmp;
+  if(forward) tmp = T.template triangularView<Upper>()           * tmp;
+  else        tmp = T.template triangularView<Upper>().adjoint() * tmp;
   mat.noalias() -= V * tmp;
 }
 
diff --git a/nuparu/include/Eigen/src/Householder/Householder.h b/nuparu/include/Eigen/src/Householder/Householder.h
index 32112af9..4c1f499a 100644
--- a/nuparu/include/Eigen/src/Householder/Householder.h
+++ b/nuparu/include/Eigen/src/Householder/Householder.h
@@ -75,8 +75,9 @@ void MatrixBase<Derived>::makeHouseholder(
   
   RealScalar tailSqNorm = size()==1 ? RealScalar(0) : tail.squaredNorm();
   Scalar c0 = coeff(0);
+  const RealScalar tol = (std::numeric_limits<RealScalar>::min)();
 
-  if(tailSqNorm == RealScalar(0) && numext::imag(c0)==RealScalar(0))
+  if(tailSqNorm <= tol && numext::abs2(numext::imag(c0))<=tol)
   {
     tau = RealScalar(0);
     beta = numext::real(c0);
diff --git a/nuparu/include/Eigen/src/Householder/HouseholderSequence.h b/nuparu/include/Eigen/src/Householder/HouseholderSequence.h
index d800ca1f..74cd0a47 100644
--- a/nuparu/include/Eigen/src/Householder/HouseholderSequence.h
+++ b/nuparu/include/Eigen/src/Householder/HouseholderSequence.h
@@ -60,7 +60,7 @@ template<typename VectorsType, typename CoeffsType, int Side>
 struct traits<HouseholderSequence<VectorsType,CoeffsType,Side> >
 {
   typedef typename VectorsType::Scalar Scalar;
-  typedef typename VectorsType::Index Index;
+  typedef typename VectorsType::StorageIndex StorageIndex;
   typedef typename VectorsType::StorageKind StorageKind;
   enum {
     RowsAtCompileTime = Side==OnTheLeft ? traits<VectorsType>::RowsAtCompileTime
@@ -73,12 +73,20 @@ struct traits<HouseholderSequence<VectorsType,CoeffsType,Side> >
   };
 };
 
+struct HouseholderSequenceShape {};
+
+template<typename VectorsType, typename CoeffsType, int Side>
+struct evaluator_traits<HouseholderSequence<VectorsType,CoeffsType,Side> >
+  : public evaluator_traits_base<HouseholderSequence<VectorsType,CoeffsType,Side> >
+{
+  typedef HouseholderSequenceShape Shape;
+};
+
 template<typename VectorsType, typename CoeffsType, int Side>
 struct hseq_side_dependent_impl
 {
   typedef Block<const VectorsType, Dynamic, 1> EssentialVectorType;
   typedef HouseholderSequence<VectorsType, CoeffsType, OnTheLeft> HouseholderSequenceType;
-  typedef typename VectorsType::Index Index;
   static inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k)
   {
     Index start = k+1+h.m_shift;
@@ -91,7 +99,6 @@ struct hseq_side_dependent_impl<VectorsType, CoeffsType, OnTheRight>
 {
   typedef Transpose<Block<const VectorsType, 1, Dynamic> > EssentialVectorType;
   typedef HouseholderSequence<VectorsType, CoeffsType, OnTheRight> HouseholderSequenceType;
-  typedef typename VectorsType::Index Index;
   static inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k)
   {
     Index start = k+1+h.m_shift;
@@ -122,7 +129,6 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
       MaxColsAtCompileTime = internal::traits<HouseholderSequence>::MaxColsAtCompileTime
     };
     typedef typename internal::traits<HouseholderSequence>::Scalar Scalar;
-    typedef typename VectorsType::Index Index;
 
     typedef HouseholderSequence<
       typename internal::conditional<NumTraits<Scalar>::IsComplex,
@@ -307,12 +313,36 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
     template<typename Dest, typename Workspace>
     inline void applyThisOnTheLeft(Dest& dst, Workspace& workspace) const
     {
-      workspace.resize(dst.cols());
-      for(Index k = 0; k < m_length; ++k)
+      const Index BlockSize = 48;
+      // if the entries are large enough, then apply the reflectors by block
+      if(m_length>=BlockSize && dst.cols()>1)
       {
-        Index actual_k = m_trans ? k : m_length-k-1;
-        dst.bottomRows(rows()-m_shift-actual_k)
-           .applyHouseholderOnTheLeft(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data());
+        for(Index i = 0; i < m_length; i+=BlockSize)
+        {
+          Index end = m_trans ? (std::min)(m_length,i+BlockSize) : m_length-i;
+          Index k = m_trans ? i : (std::max)(Index(0),end-BlockSize);
+          Index bs = end-k;
+          Index start = k + m_shift;
+          
+          typedef Block<typename internal::remove_all<VectorsType>::type,Dynamic,Dynamic> SubVectorsType;
+          SubVectorsType sub_vecs1(m_vectors.const_cast_derived(), Side==OnTheRight ? k : start,
+                                                                   Side==OnTheRight ? start : k,
+                                                                   Side==OnTheRight ? bs : m_vectors.rows()-start,
+                                                                   Side==OnTheRight ? m_vectors.cols()-start : bs);
+          typename internal::conditional<Side==OnTheRight, Transpose<SubVectorsType>, SubVectorsType&>::type sub_vecs(sub_vecs1);
+          Block<Dest,Dynamic,Dynamic> sub_dst(dst,dst.rows()-rows()+m_shift+k,0, rows()-m_shift-k,dst.cols());
+          apply_block_householder_on_the_left(sub_dst, sub_vecs, m_coeffs.segment(k, bs), !m_trans);
+        }
+      }
+      else
+      {
+        workspace.resize(dst.cols());
+        for(Index k = 0; k < m_length; ++k)
+        {
+          Index actual_k = m_trans ? k : m_length-k-1;
+          dst.bottomRows(rows()-m_shift-actual_k)
+            .applyHouseholderOnTheLeft(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data());
+        }
       }
     }
 
diff --git a/nuparu/include/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/nuparu/include/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
index 73ca9bfd..358444af 100644
--- a/nuparu/include/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
+++ b/nuparu/include/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -17,33 +17,37 @@ namespace Eigen {
   *
   * This class allows to approximately solve for A.x = b problems assuming A is a diagonal matrix.
   * In other words, this preconditioner neglects all off diagonal entries and, in Eigen's language, solves for:
-  * \code
-  * A.diagonal().asDiagonal() . x = b
-  * \endcode
+    \code
+    A.diagonal().asDiagonal() . x = b
+    \endcode
   *
   * \tparam _Scalar the type of the scalar.
   *
+  * \implsparsesolverconcept
+  *
   * This preconditioner is suitable for both selfadjoint and general problems.
   * The diagonal entries are pre-inverted and stored into a dense vector.
   *
   * \note A variant that has yet to be implemented would attempt to preserve the norm of each column.
   *
+  * \sa class LeastSquareDiagonalPreconditioner, class ConjugateGradient
   */
 template <typename _Scalar>
 class DiagonalPreconditioner
 {
     typedef _Scalar Scalar;
     typedef Matrix<Scalar,Dynamic,1> Vector;
-    typedef typename Vector::Index Index;
-
   public:
-    // this typedef is only to export the scalar type and compile-time dimensions to solve_retval
-    typedef Matrix<Scalar,Dynamic,Dynamic> MatrixType;
+    typedef typename Vector::StorageIndex StorageIndex;
+    enum {
+      ColsAtCompileTime = Dynamic,
+      MaxColsAtCompileTime = Dynamic
+    };
 
     DiagonalPreconditioner() : m_isInitialized(false) {}
 
     template<typename MatType>
-    DiagonalPreconditioner(const MatType& mat) : m_invdiag(mat.cols())
+    explicit DiagonalPreconditioner(const MatType& mat) : m_invdiag(mat.cols())
     {
       compute(mat);
     }
@@ -65,10 +69,10 @@ class DiagonalPreconditioner
       {
         typename MatType::InnerIterator it(mat,j);
         while(it && it.index()!=j) ++it;
-        if(it && it.index()==j)
+        if(it && it.index()==j && it.value()!=Scalar(0))
           m_invdiag(j) = Scalar(1)/it.value();
         else
-          m_invdiag(j) = 0;
+          m_invdiag(j) = Scalar(1);
       }
       m_isInitialized = true;
       return *this;
@@ -80,46 +84,102 @@ class DiagonalPreconditioner
       return factorize(mat);
     }
 
+    /** \internal */
     template<typename Rhs, typename Dest>
-    void _solve(const Rhs& b, Dest& x) const
+    void _solve_impl(const Rhs& b, Dest& x) const
     {
       x = m_invdiag.array() * b.array() ;
     }
 
-    template<typename Rhs> inline const internal::solve_retval<DiagonalPreconditioner, Rhs>
+    template<typename Rhs> inline const Solve<DiagonalPreconditioner, Rhs>
     solve(const MatrixBase<Rhs>& b) const
     {
       eigen_assert(m_isInitialized && "DiagonalPreconditioner is not initialized.");
       eigen_assert(m_invdiag.size()==b.rows()
                 && "DiagonalPreconditioner::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<DiagonalPreconditioner, Rhs>(*this, b.derived());
+      return Solve<DiagonalPreconditioner, Rhs>(*this, b.derived());
     }
+    
+    ComputationInfo info() { return Success; }
 
   protected:
     Vector m_invdiag;
     bool m_isInitialized;
 };
 
-namespace internal {
-
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<DiagonalPreconditioner<_MatrixType>, Rhs>
-  : solve_retval_base<DiagonalPreconditioner<_MatrixType>, Rhs>
+/** \ingroup IterativeLinearSolvers_Module
+  * \brief Jacobi preconditioner for LeastSquaresConjugateGradient
+  *
+  * This class allows to approximately solve for A' A x  = A' b problems assuming A' A is a diagonal matrix.
+  * In other words, this preconditioner neglects all off diagonal entries and, in Eigen's language, solves for:
+    \code
+    (A.adjoint() * A).diagonal().asDiagonal() * x = b
+    \endcode
+  *
+  * \tparam _Scalar the type of the scalar.
+  *
+  * \implsparsesolverconcept
+  *
+  * The diagonal entries are pre-inverted and stored into a dense vector.
+  * 
+  * \sa class LeastSquaresConjugateGradient, class DiagonalPreconditioner
+  */
+template <typename _Scalar>
+class LeastSquareDiagonalPreconditioner : public DiagonalPreconditioner<_Scalar>
 {
-  typedef DiagonalPreconditioner<_MatrixType> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
+    typedef _Scalar Scalar;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef DiagonalPreconditioner<_Scalar> Base;
+    using Base::m_invdiag;
+  public:
 
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
+    LeastSquareDiagonalPreconditioner() : Base() {}
+
+    template<typename MatType>
+    explicit LeastSquareDiagonalPreconditioner(const MatType& mat) : Base()
+    {
+      compute(mat);
+    }
+
+    template<typename MatType>
+    LeastSquareDiagonalPreconditioner& analyzePattern(const MatType& )
+    {
+      return *this;
+    }
+    
+    template<typename MatType>
+    LeastSquareDiagonalPreconditioner& factorize(const MatType& mat)
+    {
+      // Compute the inverse squared-norm of each column of mat
+      m_invdiag.resize(mat.cols());
+      for(Index j=0; j<mat.outerSize(); ++j) // NOTE(review): loops over outerSize()/innerVector(j) but m_invdiag has cols() entries - presumably mat is column-major; confirm
+      {
+        RealScalar sum = mat.innerVector(j).squaredNorm();
+        if(sum>0)
+          m_invdiag(j) = RealScalar(1)/sum;
+        else
+          m_invdiag(j) = RealScalar(1); // zero squared norm: store 1 instead of dividing by zero
+      }
+      Base::m_isInitialized = true; // flag is a member of the DiagonalPreconditioner base class
+      return *this;
+    }
+    
+    template<typename MatType>
+    LeastSquareDiagonalPreconditioner& compute(const MatType& mat)
+    {
+      return factorize(mat);
+    }
+    
+    ComputationInfo info() { return Success; }
 
-}
+  protected:
+};
 
 /** \ingroup IterativeLinearSolvers_Module
   * \brief A naive preconditioner which approximates any matrix as the identity matrix
   *
+  * \implsparsesolverconcept
+  *
   * \sa class DiagonalPreconditioner
   */
 class IdentityPreconditioner
@@ -129,7 +189,7 @@ class IdentityPreconditioner
     IdentityPreconditioner() {}
 
     template<typename MatrixType>
-    IdentityPreconditioner(const MatrixType& ) {}
+    explicit IdentityPreconditioner(const MatrixType& ) {}
     
     template<typename MatrixType>
     IdentityPreconditioner& analyzePattern(const MatrixType& ) { return *this; }
@@ -142,6 +202,8 @@ class IdentityPreconditioner
     
     template<typename Rhs>
     inline const Rhs& solve(const Rhs& b) const { return b; }
+    
+    ComputationInfo info() const { return Success; } // unconditionally Success; const so it can be queried on const objects
 };
 
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h b/nuparu/include/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
index 6fc6ab85..454f4681 100644
--- a/nuparu/include/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
+++ b/nuparu/include/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -27,7 +27,7 @@ namespace internal {
   */
 template<typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
 bool bicgstab(const MatrixType& mat, const Rhs& rhs, Dest& x,
-              const Preconditioner& precond, int& iters,
+              const Preconditioner& precond, Index& iters,
               typename Dest::RealScalar& tol_error)
 {
   using std::sqrt;
@@ -36,10 +36,9 @@ bool bicgstab(const MatrixType& mat, const Rhs& rhs, Dest& x,
   typedef typename Dest::Scalar Scalar;
   typedef Matrix<Scalar,Dynamic,1> VectorType;
   RealScalar tol = tol_error;
-  int maxIters = iters;
+  Index maxIters = iters;
 
-  int n = mat.cols();
-  x = precond.solve(x);
+  Index n = mat.cols();
   VectorType r  = rhs - mat * x;
   VectorType r0 = r;
   
@@ -60,19 +59,21 @@ bool bicgstab(const MatrixType& mat, const Rhs& rhs, Dest& x,
 
   VectorType s(n), t(n);
 
-  RealScalar tol2 = tol*tol;
-  int i = 0;
-  int restarts = 0;
+  RealScalar tol2 = tol*tol*rhs_sqnorm;
+  RealScalar eps2 = NumTraits<Scalar>::epsilon()*NumTraits<Scalar>::epsilon();
+  Index i = 0;
+  Index restarts = 0;
 
-  while ( r.squaredNorm()/rhs_sqnorm > tol2 && i<maxIters )
+  while ( r.squaredNorm() > tol2 && i<maxIters )
   {
     Scalar rho_old = rho;
 
     rho = r0.dot(r);
-    if (internal::isMuchSmallerThan(rho,r0_sqnorm))
+    if (abs(rho) < eps2*r0_sqnorm)
     {
-      // The new residual vector became too orthogonal to the arbitrarily choosen direction r0
+      // The new residual vector became too orthogonal to the arbitrarily chosen direction r0
       // Let's restart with a new r0:
+      r  = rhs - mat * x;
       r0 = r;
       rho = r0_sqnorm = r.squaredNorm();
       if(restarts++ == 0)
@@ -131,48 +132,33 @@ struct traits<BiCGSTAB<_MatrixType,_Preconditioner> >
   * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix.
   * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
   *
+  * \implsparsesolverconcept
+  *
   * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
   * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
   * and NumTraits<Scalar>::epsilon() for the tolerance.
   * 
+  * The tolerance corresponds to the relative residual error: |Ax-b|/|b|
+  * 
+  * \b Performance: when using sparse matrices, best performance is achieved for a row-major sparse matrix format.
+  * Moreover, in this case multi-threading can be exploited if the user code is compiled with OpenMP enabled.
+  * See \ref TopicMultiThreading for details.
+  * 
   * This class can be used as the direct solver classes. Here is a typical usage example:
-  * \code
-  * int n = 10000;
-  * VectorXd x(n), b(n);
-  * SparseMatrix<double> A(n,n);
-  * // fill A and b
-  * BiCGSTAB<SparseMatrix<double> > solver;
-  * solver(A);
-  * x = solver.solve(b);
-  * std::cout << "#iterations:     " << solver.iterations() << std::endl;
-  * std::cout << "estimated error: " << solver.error()      << std::endl;
-  * // update b, and solve again
-  * x = solver.solve(b);
-  * \endcode
+  * \include BiCGSTAB_simple.cpp
   * 
   * By default the iterations start with x=0 as an initial guess of the solution.
-  * One can control the start using the solveWithGuess() method. Here is a step by
-  * step execution example starting with a random guess and printing the evolution
-  * of the estimated error:
-  * * \code
-  * x = VectorXd::Random(n);
-  * solver.setMaxIterations(1);
-  * int i = 0;
-  * do {
-  *   x = solver.solveWithGuess(b,x);
-  *   std::cout << i << " : " << solver.error() << std::endl;
-  *   ++i;
-  * } while (solver.info()!=Success && i<100);
-  * \endcode
-  * Note that such a step by step excution is slightly slower.
+  * One can control the start using the solveWithGuess() method.
   * 
+  * BiCGSTAB can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink.
+  *
   * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
   */
 template< typename _MatrixType, typename _Preconditioner>
 class BiCGSTAB : public IterativeSolverBase<BiCGSTAB<_MatrixType,_Preconditioner> >
 {
   typedef IterativeSolverBase<BiCGSTAB> Base;
-  using Base::mp_matrix;
+  using Base::matrix;
   using Base::m_error;
   using Base::m_iterations;
   using Base::m_info;
@@ -180,7 +166,6 @@ class BiCGSTAB : public IterativeSolverBase<BiCGSTAB<_MatrixType,_Preconditioner
 public:
   typedef _MatrixType MatrixType;
   typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::RealScalar RealScalar;
   typedef _Preconditioner Preconditioner;
 
@@ -199,38 +184,23 @@ class BiCGSTAB : public IterativeSolverBase<BiCGSTAB<_MatrixType,_Preconditioner
     * this class becomes invalid. Call compute() to update it with the new
     * matrix A, or modify a copy of A.
     */
-  BiCGSTAB(const MatrixType& A) : Base(A) {}
+  template<typename MatrixDerived>
+  explicit BiCGSTAB(const EigenBase<MatrixDerived>& A) : Base(A.derived()) {}
 
   ~BiCGSTAB() {}
-  
-  /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A
-    * \a x0 as an initial solution.
-    *
-    * \sa compute()
-    */
-  template<typename Rhs,typename Guess>
-  inline const internal::solve_retval_with_guess<BiCGSTAB, Rhs, Guess>
-  solveWithGuess(const MatrixBase<Rhs>& b, const Guess& x0) const
-  {
-    eigen_assert(m_isInitialized && "BiCGSTAB is not initialized.");
-    eigen_assert(Base::rows()==b.rows()
-              && "BiCGSTAB::solve(): invalid number of rows of the right hand side matrix b");
-    return internal::solve_retval_with_guess
-            <BiCGSTAB, Rhs, Guess>(*this, b.derived(), x0);
-  }
-  
+
   /** \internal */
   template<typename Rhs,typename Dest>
-  void _solveWithGuess(const Rhs& b, Dest& x) const
+  void _solve_with_guess_impl(const Rhs& b, Dest& x) const
   {    
     bool failed = false;
-    for(int j=0; j<b.cols(); ++j)
+    for(Index j=0; j<b.cols(); ++j)
     {
       m_iterations = Base::maxIterations();
       m_error = Base::m_tolerance;
       
       typename Dest::ColXpr xj(x,j);
-      if(!internal::bicgstab(*mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_error))
+      if(!internal::bicgstab(matrix(), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error))
         failed = true;
     }
     m_info = failed ? NumericalIssue
@@ -240,36 +210,19 @@ class BiCGSTAB : public IterativeSolverBase<BiCGSTAB<_MatrixType,_Preconditioner
   }
 
   /** \internal */
+  using Base::_solve_impl;
   template<typename Rhs,typename Dest>
-  void _solve(const Rhs& b, Dest& x) const
+  void _solve_impl(const MatrixBase<Rhs>& b, Dest& x) const
   {
-//     x.setZero();
-  x = b;
-    _solveWithGuess(b,x);
+    x.resize(this->rows(),b.cols());
+    x.setZero();
+    _solve_with_guess_impl(b,x);
   }
 
 protected:
 
 };
 
-
-namespace internal {
-
-  template<typename _MatrixType, typename _Preconditioner, typename Rhs>
-struct solve_retval<BiCGSTAB<_MatrixType, _Preconditioner>, Rhs>
-  : solve_retval_base<BiCGSTAB<_MatrixType, _Preconditioner>, Rhs>
-{
-  typedef BiCGSTAB<_MatrixType, _Preconditioner> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-} // end namespace internal
-
 } // end namespace Eigen
 
 #endif // EIGEN_BICGSTAB_H
diff --git a/nuparu/include/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/nuparu/include/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
index a74a8155..395daa8e 100644
--- a/nuparu/include/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
+++ b/nuparu/include/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -26,7 +26,7 @@ namespace internal {
 template<typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
 EIGEN_DONT_INLINE
 void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x,
-                        const Preconditioner& precond, int& iters,
+                        const Preconditioner& precond, Index& iters,
                         typename Dest::RealScalar& tol_error)
 {
   using std::sqrt;
@@ -36,9 +36,9 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x,
   typedef Matrix<Scalar,Dynamic,1> VectorType;
   
   RealScalar tol = tol_error;
-  int maxIters = iters;
+  Index maxIters = iters;
   
-  int n = mat.cols();
+  Index n = mat.cols();
 
   VectorType residual = rhs - mat * x; //initial residual
 
@@ -60,29 +60,29 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x,
   }
   
   VectorType p(n);
-  p = precond.solve(residual);      //initial search direction
+  p = precond.solve(residual);      // initial search direction
 
   VectorType z(n), tmp(n);
   RealScalar absNew = numext::real(residual.dot(p));  // the square of the absolute value of r scaled by invM
-  int i = 0;
+  Index i = 0;
   while(i < maxIters)
   {
-    tmp.noalias() = mat * p;              // the bottleneck of the algorithm
+    tmp.noalias() = mat * p;                    // the bottleneck of the algorithm
 
-    Scalar alpha = absNew / p.dot(tmp);   // the amount we travel on dir
-    x += alpha * p;                       // update solution
-    residual -= alpha * tmp;              // update residue
+    Scalar alpha = absNew / p.dot(tmp);         // the amount we travel on dir
+    x += alpha * p;                             // update solution
+    residual -= alpha * tmp;                    // update residual
     
     residualNorm2 = residual.squaredNorm();
     if(residualNorm2 < threshold)
       break;
     
-    z = precond.solve(residual);          // approximately solve for "A z = residual"
+    z = precond.solve(residual);                // approximately solve for "A z = residual"
 
     RealScalar absOld = absNew;
     absNew = numext::real(residual.dot(z));     // update the absolute value of r
-    RealScalar beta = absNew / absOld;            // calculate the Gram-Schmidt value used to create the new search direction
-    p = z + beta * p;                             // update search direction
+    RealScalar beta = absNew / absOld;          // calculate the Gram-Schmidt value used to create the new search direction
+    p = z + beta * p;                           // update search direction
     i++;
   }
   tol_error = sqrt(residualNorm2 / rhsNorm2);
@@ -107,58 +107,57 @@ struct traits<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner> >
 }
 
 /** \ingroup IterativeLinearSolvers_Module
-  * \brief A conjugate gradient solver for sparse self-adjoint problems
+  * \brief A conjugate gradient solver for sparse (or dense) self-adjoint problems
   *
-  * This class allows to solve for A.x = b sparse linear problems using a conjugate gradient algorithm.
-  * The sparse matrix A must be selfadjoint. The vectors x and b can be either dense or sparse.
+  * This class allows to solve for A.x = b linear problems using an iterative conjugate gradient algorithm.
+  * The matrix A must be selfadjoint. The matrix A and the vectors x and b can be either dense or sparse.
   *
-  * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix.
-  * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
-  *               or Upper. Default is Lower.
+  * \tparam _MatrixType the type of the matrix A, can be a dense or a sparse matrix.
+  * \tparam _UpLo the triangular part that will be used for the computations. It can be \c Lower,
+  *               \c Upper, or \c Lower|Upper, in which case the full matrix entries will be considered.
+  *               Default is \c Lower; best performance is obtained with \c Lower|Upper.
   * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
   *
+  * \implsparsesolverconcept
+  *
   * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
   * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
   * and NumTraits<Scalar>::epsilon() for the tolerance.
   * 
+  * The tolerance corresponds to the relative residual error: |Ax-b|/|b|
+  * 
+  * \b Performance: Even though the default value of \c _UpLo is \c Lower, significantly higher performance is
+  * achieved when using a complete matrix and \b Lower|Upper as the \a _UpLo template parameter. Moreover, in this
+  * case multi-threading can be exploited if the user code is compiled with OpenMP enabled.
+  * See \ref TopicMultiThreading for details.
+  * 
   * This class can be used as the direct solver classes. Here is a typical usage example:
-  * \code
-  * int n = 10000;
-  * VectorXd x(n), b(n);
-  * SparseMatrix<double> A(n,n);
-  * // fill A and b
-  * ConjugateGradient<SparseMatrix<double> > cg;
-  * cg.compute(A);
-  * x = cg.solve(b);
-  * std::cout << "#iterations:     " << cg.iterations() << std::endl;
-  * std::cout << "estimated error: " << cg.error()      << std::endl;
-  * // update b, and solve again
-  * x = cg.solve(b);
-  * \endcode
+    \code
+    int n = 10000;
+    VectorXd x(n), b(n);
+    SparseMatrix<double> A(n,n);
+    // fill A and b
+    ConjugateGradient<SparseMatrix<double>, Lower|Upper> cg;
+    cg.compute(A);
+    x = cg.solve(b);
+    std::cout << "#iterations:     " << cg.iterations() << std::endl;
+    std::cout << "estimated error: " << cg.error()      << std::endl;
+    // update b, and solve again
+    x = cg.solve(b);
+    \endcode
   * 
   * By default the iterations start with x=0 as an initial guess of the solution.
-  * One can control the start using the solveWithGuess() method. Here is a step by
-  * step execution example starting with a random guess and printing the evolution
-  * of the estimated error:
-  * * \code
-  * x = VectorXd::Random(n);
-  * cg.setMaxIterations(1);
-  * int i = 0;
-  * do {
-  *   x = cg.solveWithGuess(b,x);
-  *   std::cout << i << " : " << cg.error() << std::endl;
-  *   ++i;
-  * } while (cg.info()!=Success && i<100);
-  * \endcode
-  * Note that such a step by step excution is slightly slower.
+  * One can control the start using the solveWithGuess() method.
   * 
-  * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
+  * ConjugateGradient can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink.
+  *
+  * \sa class LeastSquaresConjugateGradient, class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
   */
 template< typename _MatrixType, int _UpLo, typename _Preconditioner>
 class ConjugateGradient : public IterativeSolverBase<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner> >
 {
   typedef IterativeSolverBase<ConjugateGradient> Base;
-  using Base::mp_matrix;
+  using Base::matrix;
   using Base::m_error;
   using Base::m_iterations;
   using Base::m_info;
@@ -166,7 +165,6 @@ class ConjugateGradient : public IterativeSolverBase<ConjugateGradient<_MatrixTy
 public:
   typedef _MatrixType MatrixType;
   typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::RealScalar RealScalar;
   typedef _Preconditioner Preconditioner;
 
@@ -189,41 +187,40 @@ class ConjugateGradient : public IterativeSolverBase<ConjugateGradient<_MatrixTy
     * this class becomes invalid. Call compute() to update it with the new
     * matrix A, or modify a copy of A.
     */
-  ConjugateGradient(const MatrixType& A) : Base(A) {}
+  template<typename MatrixDerived>
+  explicit ConjugateGradient(const EigenBase<MatrixDerived>& A) : Base(A.derived()) {}
 
   ~ConjugateGradient() {}
-  
-  /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A
-    * \a x0 as an initial solution.
-    *
-    * \sa compute()
-    */
-  template<typename Rhs,typename Guess>
-  inline const internal::solve_retval_with_guess<ConjugateGradient, Rhs, Guess>
-  solveWithGuess(const MatrixBase<Rhs>& b, const Guess& x0) const
-  {
-    eigen_assert(m_isInitialized && "ConjugateGradient is not initialized.");
-    eigen_assert(Base::rows()==b.rows()
-              && "ConjugateGradient::solve(): invalid number of rows of the right hand side matrix b");
-    return internal::solve_retval_with_guess
-            <ConjugateGradient, Rhs, Guess>(*this, b.derived(), x0);
-  }
 
   /** \internal */
   template<typename Rhs,typename Dest>
-  void _solveWithGuess(const Rhs& b, Dest& x) const
+  void _solve_with_guess_impl(const Rhs& b, Dest& x) const
   {
+    typedef typename Base::MatrixWrapper MatrixWrapper;
+    typedef typename Base::ActualMatrixType ActualMatrixType;
+    enum {
+      TransposeInput  =   (!MatrixWrapper::MatrixFree)
+                      &&  (UpLo==(Lower|Upper))
+                      &&  (!MatrixType::IsRowMajor)
+                      &&  (!NumTraits<Scalar>::IsComplex)
+    };
+    typedef typename internal::conditional<TransposeInput,Transpose<const ActualMatrixType>, ActualMatrixType const&>::type RowMajorWrapper;
+    EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(MatrixWrapper::MatrixFree,UpLo==(Lower|Upper)),MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY);
+    typedef typename internal::conditional<UpLo==(Lower|Upper),
+                                           RowMajorWrapper,
+                                           typename MatrixWrapper::template ConstSelfAdjointViewReturnType<UpLo>::Type
+                                          >::type SelfAdjointWrapper;
     m_iterations = Base::maxIterations();
     m_error = Base::m_tolerance;
 
-    for(int j=0; j<b.cols(); ++j)
+    for(Index j=0; j<b.cols(); ++j)
     {
       m_iterations = Base::maxIterations();
       m_error = Base::m_tolerance;
 
       typename Dest::ColXpr xj(x,j);
-      internal::conjugate_gradient(mp_matrix->template selfadjointView<UpLo>(), b.col(j), xj,
-                                   Base::m_preconditioner, m_iterations, m_error);
+      RowMajorWrapper row_mat(matrix());
+      internal::conjugate_gradient(SelfAdjointWrapper(row_mat), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error);
     }
 
     m_isInitialized = true;
@@ -231,35 +228,18 @@ class ConjugateGradient : public IterativeSolverBase<ConjugateGradient<_MatrixTy
   }
   
   /** \internal */
+  using Base::_solve_impl;
   template<typename Rhs,typename Dest>
-  void _solve(const Rhs& b, Dest& x) const
+  void _solve_impl(const MatrixBase<Rhs>& b, Dest& x) const
   {
-    x.setOnes();
-    _solveWithGuess(b,x);
+    x.setZero();
+    _solve_with_guess_impl(b.derived(),x);
   }
 
 protected:
 
 };
 
-
-namespace internal {
-
-template<typename _MatrixType, int _UpLo, typename _Preconditioner, typename Rhs>
-struct solve_retval<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner>, Rhs>
-  : solve_retval_base<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner>, Rhs>
-{
-  typedef ConjugateGradient<_MatrixType,_UpLo,_Preconditioner> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-} // end namespace internal
-
 } // end namespace Eigen
 
 #endif // EIGEN_CONJUGATE_GRADIENT_H
diff --git a/nuparu/include/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/nuparu/include/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
new file mode 100644
index 00000000..284e37f1
--- /dev/null
+++ b/nuparu/include/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
@@ -0,0 +1,368 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_INCOMPLETE_CHOlESKY_H
+#define EIGEN_INCOMPLETE_CHOlESKY_H
+
+#include <vector>
+#include <list>
+
+namespace Eigen {  
+/** 
+  * \brief Modified Incomplete Cholesky with dual threshold
+  *
+  * References : C-J. Lin and J. J. Moré, Incomplete Cholesky Factorizations with
+  *              Limited memory, SIAM J. Sci. Comput.  21(1), pp. 24-45, 1999
+  *
+  * \tparam Scalar The scalar type of the input sparse matrix. It is advised to give a row-oriented sparse matrix
+  * \tparam _UpLo The triangular part that will be used for the computations. It can be Lower
+    *               or Upper. Default is Lower.
+  * \tparam _OrderingType The ordering method to use, either AMDOrdering<> or NaturalOrdering<>. Default is AMDOrdering<int>,
+  *                       unless EIGEN_MPL2_ONLY is defined, in which case the default is NaturalOrdering<int>.
+  *
+  * \implsparsesolverconcept
+  *
+  * It performs the following incomplete factorization: \f$ S P A P' S \approx L L' \f$
+  * where L is a lower triangular factor, S is a diagonal scaling matrix, and P is a
+  * fill-in reducing permutation as computed by the ordering method.
+  *
+  * \b Shifting \b strategy: Let \f$ B = S P A P' S \f$  be the scaled matrix on which the factorization is carried out,
+  * and \f$ \beta \f$ be the minimum value of the diagonal. If \f$ \beta > 0 \f$ then, the factorization is directly performed
+  * on the matrix B. Otherwise, the factorization is performed on the shifted matrix \f$ B + (\sigma+|\beta|) I \f$ where
+  * \f$ \sigma \f$ is the initial shift value as set by the setInitialShift() method. The default value is \f$ \sigma = 10^{-3} \f$.
+  *
+  */
+template <typename Scalar, int _UpLo = Lower, typename _OrderingType =
+#ifndef EIGEN_MPL2_ONLY
+AMDOrdering<int>
+#else
+NaturalOrdering<int>
+#endif
+>
+class IncompleteCholesky : public SparseSolverBase<IncompleteCholesky<Scalar,_UpLo,_OrderingType> >
+{
+  protected:
+    typedef SparseSolverBase<IncompleteCholesky<Scalar,_UpLo,_OrderingType> > Base;
+    using Base::m_isInitialized;
+  public:
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    typedef _OrderingType OrderingType;
+    typedef typename OrderingType::PermutationType PermutationType;
+    typedef typename PermutationType::StorageIndex StorageIndex;
+    typedef SparseMatrix<Scalar,ColMajor,StorageIndex> FactorType;
+    typedef Matrix<Scalar,Dynamic,1> VectorSx;
+    typedef Matrix<RealScalar,Dynamic,1> VectorRx;
+    typedef Matrix<StorageIndex,Dynamic, 1> VectorIx;
+    typedef std::vector<std::list<StorageIndex> > VectorList;
+    enum { UpLo = _UpLo };
+    enum {
+      ColsAtCompileTime = Dynamic,
+      MaxColsAtCompileTime = Dynamic
+    };
+  public:
+
+    /** Default constructor leaving the object in a partly non-initialized state.
+      *
+      * You must call compute() or the pair analyzePattern()/factorize() to make it valid.
+      *
+      * \sa IncompleteCholesky(const MatrixType&)
+      */
+    IncompleteCholesky() : m_initialShift(1e-3),m_analysisIsOk(false),m_factorizationIsOk(false) {}
+
+    /** Constructor computing the incomplete factorization for the given matrix \a matrix.
+      */
+    template<typename MatrixType>
+    IncompleteCholesky(const MatrixType& matrix) : m_initialShift(1e-3),m_analysisIsOk(false),m_factorizationIsOk(false)
+    {
+      compute(matrix);
+    }
+
+    /** \returns number of rows of the factored matrix */
+    Index rows() const { return m_L.rows(); }
+
+    /** \returns number of columns of the factored matrix */
+    Index cols() const { return m_L.cols(); }
+
+    /** \brief Reports whether previous computation was successful.
+      *
+      * It triggers an assertion if \c *this has not been initialized through the respective constructor,
+      * or a call to compute() or analyzePattern().
+      *
+      * \returns \c Success if computation was successful,
+      *          \c NumericalIssue if the matrix appears to be negative.
+      */
+    ComputationInfo info() const
+    {
+      eigen_assert(m_isInitialized && "IncompleteCholesky is not initialized.");
+      return m_info;
+    }
+
+    /** \brief Set the initial shift parameter \f$ \sigma \f$.
+      */
+    void setInitialShift(RealScalar shift) { m_initialShift = shift; }
+
+    /** \brief Computes the fill reducing permutation vector using the sparsity pattern of \a mat
+      */
+    template<typename MatrixType>
+    void analyzePattern(const MatrixType& mat)
+    {
+      OrderingType ord;
+      PermutationType pinv;
+      ord(mat.template selfadjointView<UpLo>(), pinv);
+      if(pinv.size()>0) m_perm = pinv.inverse();
+      else              m_perm.resize(0);
+      m_L.resize(mat.rows(), mat.cols());
+      m_factorizationIsOk = false; // m_L was just resized: a factor computed earlier is no longer usable
+      m_analysisIsOk = true;
+      m_isInitialized = true;
+      m_info = Success;
+    }
+
+    /** \brief Performs the numerical factorization of the input matrix \a mat
+      *
+      * The method analyzePattern() or compute() must have been called beforehand
+      * with a matrix having the same pattern.
+      *
+      * \sa compute(), analyzePattern()
+      */
+    template<typename MatrixType>
+    void factorize(const MatrixType& mat);
+
+    /** Computes or re-computes the incomplete Cholesky factorization of the input matrix \a mat
+      *
+      * It is a shortcut for a sequential call to the analyzePattern() and factorize() methods.
+      *
+      * \sa analyzePattern(), factorize()
+      */
+    template<typename MatrixType>
+    void compute(const MatrixType& mat)
+    {
+      analyzePattern(mat);
+      factorize(mat);
+    }
+
+    // internal: x = P' S L'^-1 L^-1 S P b (the permutation is skipped when m_perm does not match b.rows())
+    template<typename Rhs, typename Dest>
+    void _solve_impl(const Rhs& b, Dest& x) const
+    {
+      eigen_assert(m_factorizationIsOk && "factorize() should be called first");
+      if (m_perm.rows() == b.rows())  x = m_perm * b;
+      else                            x = b;
+      x = m_scale.asDiagonal() * x;
+      x = m_L.template triangularView<Lower>().solve(x);
+      x = m_L.adjoint().template triangularView<Upper>().solve(x);
+      x = m_scale.asDiagonal() * x;
+      if (m_perm.rows() == b.rows())
+        x = m_perm.inverse() * x;
+    }
+
+    /** \returns the sparse lower triangular factor L */
+    const FactorType& matrixL() const { eigen_assert(m_factorizationIsOk && "factorize() should be called first"); return m_L; }
+
+    /** \returns a vector representing the scaling factor S */
+    const VectorRx& scalingS() const { eigen_assert(m_factorizationIsOk && "factorize() should be called first"); return m_scale; }
+
+    /** \returns the fill-in reducing permutation P (can be empty for a natural ordering) */
+    const PermutationType& permutationP() const { eigen_assert(m_analysisIsOk && "analyzePattern() should be called first"); return m_perm; }
+
+  protected:
+    FactorType m_L;              // The lower part stored in CSC
+    VectorRx m_scale;            // The vector for scaling the matrix
+    RealScalar m_initialShift;   // The initial shift parameter
+    bool m_analysisIsOk;         // Set by analyzePattern(), required by factorize()
+    bool m_factorizationIsOk;    // Set by a successful factorize(), required by _solve_impl()
+    ComputationInfo m_info;
+    PermutationType m_perm;
+
+  private:
+    inline void updateList(Ref<const VectorIx> colPtr, Ref<VectorIx> rowIdx, Ref<VectorSx> vals, const Index& col, const Index& jk, VectorIx& firstElt, VectorList& listCol);
+};
+
+// Numerical phase: symmetrically scales (and, if the smallest diagonal entry is not positive, shifts) the permuted
+// matrix, then runs a left-looking factorization keeping per column as many off-diagonal entries as the input had.
+template<typename Scalar, int _UpLo, typename OrderingType>
+template<typename _MatrixType>
+void IncompleteCholesky<Scalar,_UpLo, OrderingType>::factorize(const _MatrixType& mat)
+{
+  using std::sqrt;
+  eigen_assert(m_analysisIsOk && "analyzePattern() should be called first");
+  m_factorizationIsOk = false; // stays false on the early NumericalIssue return below, even after an earlier success
+  // Dropping strategy : Keep only the p largest elements per column, where p is the number of elements in the column of the original matrix. Other strategies will be added
+  // Apply the fill-reducing permutation computed in analyzePattern()
+  if (m_perm.rows() == mat.rows() ) // To detect the null permutation
+  {
+    // The temporary is needed to make sure that the diagonal entry is properly sorted
+    FactorType tmp(mat.rows(), mat.cols());
+    tmp = mat.template selfadjointView<_UpLo>().twistedBy(m_perm);
+    m_L.template selfadjointView<Lower>() = tmp.template selfadjointView<Lower>();
+  }
+  else
+  {
+    m_L.template selfadjointView<Lower>() = mat.template selfadjointView<_UpLo>();
+  }
+
+  Index n = m_L.cols();
+  Index nnz = m_L.nonZeros();
+  Map<VectorSx> vals(m_L.valuePtr(), nnz);         //values
+  Map<VectorIx> rowIdx(m_L.innerIndexPtr(), nnz);  //Row indices
+  Map<VectorIx> colPtr( m_L.outerIndexPtr(), n+1); // Pointer to the beginning of each column (m_L is column-major)
+  VectorIx firstElt(n-1); // for each j, points to the next entry in vals that will be used in the factorization
+  VectorList listCol(n);  // listCol(j) is a linked list of columns to update column j
+  VectorSx col_vals(n);   // Store a  nonzero values in each column
+  VectorIx col_irow(n);   // Row indices of nonzero elements in each column
+  VectorIx col_pattern(n); // col_pattern(l) is the slot of row l in col_vals/col_irow, or -1 if row l is absent
+  col_pattern.fill(-1);
+  StorageIndex col_nnz;
+
+  // Computes the scaling factors: m_scale(j) accumulates the squared entries of row/column j of the full symmetric matrix
+  m_scale.resize(n);
+  m_scale.setZero();
+  for (Index j = 0; j < n; j++)
+    for (Index k = colPtr[j]; k < colPtr[j+1]; k++)
+    {
+      m_scale(j) += numext::abs2(vals(k));
+      if(rowIdx[k]!=j)
+        m_scale(rowIdx[k]) += numext::abs2(vals(k));
+    }
+
+  m_scale = m_scale.cwiseSqrt().cwiseSqrt();
+
+  for (Index j = 0; j < n; ++j)
+    if(m_scale(j)>(std::numeric_limits<RealScalar>::min)())
+      m_scale(j) = RealScalar(1)/m_scale(j);
+    else
+      m_scale(j) = 1;
+
+  // FIXME disable scaling if not needed, i.e., if it is roughly uniform? (this will make solve() faster)
+
+  // Scale and compute the shift for the matrix
+  RealScalar mindiag = NumTraits<RealScalar>::highest();
+  for (Index j = 0; j < n; j++)
+  {
+    for (Index k = colPtr[j]; k < colPtr[j+1]; k++)
+      vals[k] *= (m_scale(j)*m_scale(rowIdx[k]));
+    eigen_internal_assert(rowIdx[colPtr[j]]==j && "IncompleteCholesky: only the lower triangular part must be stored");
+    mindiag = numext::mini(numext::real(vals[colPtr[j]]), mindiag);
+  }
+
+  RealScalar shift = 0;
+  if(mindiag <= RealScalar(0.))
+    shift = m_initialShift - mindiag;
+
+  // Apply the shift to the diagonal elements of the matrix
+  for (Index j = 0; j < n; j++)
+    vals[colPtr[j]] += shift;
+
+  // jki version of the Cholesky factorization
+  for (Index j=0; j < n; ++j)
+  {
+    // Left-looking factorization of the j-th column
+    // First, load the j-th column into col_vals
+    Scalar diag = vals[colPtr[j]];  // It is assumed that only the lower part is stored
+    col_nnz = 0;
+    for (Index i = colPtr[j] + 1; i < colPtr[j+1]; i++)
+    {
+      StorageIndex l = rowIdx[i];
+      col_vals(col_nnz) = vals[i];
+      col_irow(col_nnz) = l;
+      col_pattern(l) = col_nnz;
+      col_nnz++;
+    }
+    {
+      typename std::list<StorageIndex>::iterator k;
+      // Browse all previous columns that will update column j
+      for(k = listCol[j].begin(); k != listCol[j].end(); k++)
+      {
+        Index jk = firstElt(*k); // First element to use in the column
+        eigen_internal_assert(rowIdx[jk]==j);
+        Scalar v_j_jk = numext::conj(vals[jk]);
+
+        jk += 1;
+        for (Index i = jk; i < colPtr[*k+1]; i++)
+        {
+          StorageIndex l = rowIdx[i];
+          if(col_pattern[l]<0)
+          {
+            col_vals(col_nnz) = vals[i] * v_j_jk; // NOTE(review): new fill-in is stored with '+' while the else branch subtracts - confirm the sign is intended
+            col_irow[col_nnz] = l;
+            col_pattern(l) = col_nnz;
+            col_nnz++;
+          }
+          else
+            col_vals(col_pattern[l]) -= vals[i] * v_j_jk;
+        }
+        updateList(colPtr,rowIdx,vals, *k, jk, firstElt, listCol);
+      }
+    }
+
+    // A non-positive pivot stops the factorization: report a numerical issue and give up
+    if(numext::real(diag) <= 0)
+    {
+      m_info = NumericalIssue;
+      return;
+    }
+
+    RealScalar rdiag = sqrt(numext::real(diag));
+    vals[colPtr[j]] = rdiag;
+    for (Index k = 0; k<col_nnz; ++k)
+    {
+      Index i = col_irow[k];
+      //Scale
+      col_vals(k) /= rdiag;
+      //Update the remaining diagonals with col_vals
+      vals[colPtr[i]] -= numext::abs2(col_vals(k));
+    }
+    // Select the largest p elements
+    // p is the original number of elements in the column (without the diagonal)
+    Index p = colPtr[j+1] - colPtr[j] - 1 ;
+    Ref<VectorSx> cvals = col_vals.head(col_nnz);
+    Ref<VectorIx> cirow = col_irow.head(col_nnz);
+    internal::QuickSplit(cvals,cirow, p);
+    // Insert the largest p elements in the matrix
+    Index cpt = 0;
+    for (Index i = colPtr[j]+1; i < colPtr[j+1]; i++, cpt++)
+    {
+      vals[i] = col_vals(cpt);
+      rowIdx[i] = col_irow(cpt);
+    }
+    // Restore col_pattern for every row touched in this column, including the dropped fill-in entries
+    for (Index k = 0; k < col_nnz; ++k)
+      col_pattern(col_irow(k)) = -1;
+    // Get the first smallest row index and put it after the diagonal element
+    Index jk = colPtr(j)+1;
+    updateList(colPtr,rowIdx,vals,j,jk,firstElt,listCol);
+  }
+  m_factorizationIsOk = true;
+  m_info = Success;
+}
+
+template<typename Scalar, int _UpLo, typename OrderingType>
+inline void IncompleteCholesky<Scalar,_UpLo, OrderingType>::updateList(Ref<const VectorIx> colPtr, Ref<VectorIx> rowIdx, Ref<VectorSx> vals, const Index& col, const Index& jk, VectorIx& firstElt, VectorList& listCol)
+{
+  // Column exhausted (no entry at or after position jk): nothing to register.
+  const Index colEnd = colPtr(col+1);
+  if (jk >= colEnd) return;
+  // Find the entry with the smallest row index among positions [jk, colEnd) ...
+  Index best;
+  rowIdx.segment(jk, colEnd - jk).minCoeff(&best);
+  best += jk;
+  if (rowIdx(best) != rowIdx(jk))
+  {
+    // ... and move it (index and value) to position jk.
+    std::swap(rowIdx(jk), rowIdx(best));
+    std::swap(vals(jk), vals(best));
+  }
+  firstElt(col) = internal::convert_index<StorageIndex,Index>(jk);
+  listCol[rowIdx(jk)].push_back(internal::convert_index<StorageIndex,Index>(col));
+}
+
+} // end namespace Eigen 
+
+#endif
diff --git a/nuparu/include/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h b/nuparu/include/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
index b55afc13..338e6f10 100644
--- a/nuparu/include/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
+++ b/nuparu/include/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -24,7 +25,7 @@ namespace internal {
   * \param ind The array of index for the elements in @p row
   * \param ncut  The number of largest elements to keep
   **/ 
-template <typename VectorV, typename VectorI, typename Index>
+template <typename VectorV, typename VectorI>
 Index QuickSplit(VectorV &row, VectorI &ind, Index ncut)
 {
   typedef typename VectorV::RealScalar RealScalar;
@@ -66,6 +67,8 @@ Index QuickSplit(VectorV &row, VectorI &ind, Index ncut)
   * \class IncompleteLUT
   * \brief Incomplete LU factorization with dual-threshold strategy
   *
+  * \implsparsesolverconcept
+  *
   * During the numerical factorization, two dropping rules are used :
   *  1) any element whose magnitude is less than some tolerance is dropped.
   *    This tolerance is obtained by multiplying the input tolerance @p droptol 
@@ -92,28 +95,36 @@ Index QuickSplit(VectorV &row, VectorI &ind, Index ncut)
   * alternatively, on GMANE:
   *   http://comments.gmane.org/gmane.comp.lib.eigen/3302
   */
-template <typename _Scalar>
-class IncompleteLUT : internal::noncopyable
+template <typename _Scalar, typename _StorageIndex = int>
+class IncompleteLUT : public SparseSolverBase<IncompleteLUT<_Scalar, _StorageIndex> >
 {
+  protected:
+    typedef SparseSolverBase<IncompleteLUT> Base;
+    using Base::m_isInitialized;
+  public:
     typedef _Scalar Scalar;
+    typedef _StorageIndex StorageIndex;
     typedef typename NumTraits<Scalar>::Real RealScalar;
     typedef Matrix<Scalar,Dynamic,1> Vector;
-    typedef SparseMatrix<Scalar,RowMajor> FactorType;
-    typedef SparseMatrix<Scalar,ColMajor> PermutType;
-    typedef typename FactorType::Index Index;
+    typedef Matrix<StorageIndex,Dynamic,1> VectorI;
+    typedef SparseMatrix<Scalar,RowMajor,StorageIndex> FactorType;
+
+    enum {
+      ColsAtCompileTime = Dynamic,
+      MaxColsAtCompileTime = Dynamic
+    };
 
   public:
-    typedef Matrix<Scalar,Dynamic,Dynamic> MatrixType;
     
     IncompleteLUT()
       : m_droptol(NumTraits<Scalar>::dummy_precision()), m_fillfactor(10),
-        m_analysisIsOk(false), m_factorizationIsOk(false), m_isInitialized(false)
+        m_analysisIsOk(false), m_factorizationIsOk(false)
     {}
     
     template<typename MatrixType>
-    IncompleteLUT(const MatrixType& mat, const RealScalar& droptol=NumTraits<Scalar>::dummy_precision(), int fillfactor = 10)
+    explicit IncompleteLUT(const MatrixType& mat, const RealScalar& droptol=NumTraits<Scalar>::dummy_precision(), int fillfactor = 10)
       : m_droptol(droptol),m_fillfactor(fillfactor),
-        m_analysisIsOk(false),m_factorizationIsOk(false),m_isInitialized(false)
+        m_analysisIsOk(false),m_factorizationIsOk(false)
     {
       eigen_assert(fillfactor != 0);
       compute(mat); 
@@ -146,11 +157,10 @@ class IncompleteLUT : internal::noncopyable
       * 
       **/
     template<typename MatrixType>
-    IncompleteLUT<Scalar>& compute(const MatrixType& amat)
+    IncompleteLUT& compute(const MatrixType& amat)
     {
       analyzePattern(amat); 
       factorize(amat);
-      m_isInitialized = m_factorizationIsOk;
       return *this;
     }
 
@@ -158,23 +168,14 @@ class IncompleteLUT : internal::noncopyable
     void setFillfactor(int fillfactor); 
     
     template<typename Rhs, typename Dest>
-    void _solve(const Rhs& b, Dest& x) const
+    void _solve_impl(const Rhs& b, Dest& x) const
     {
-      x = m_Pinv * b;  
+      x = m_Pinv * b;
       x = m_lu.template triangularView<UnitLower>().solve(x);
       x = m_lu.template triangularView<Upper>().solve(x);
       x = m_P * x; 
     }
 
-    template<typename Rhs> inline const internal::solve_retval<IncompleteLUT, Rhs>
-     solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "IncompleteLUT is not initialized.");
-      eigen_assert(cols()==b.rows()
-                && "IncompleteLUT::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<IncompleteLUT, Rhs>(*this, b.derived());
-    }
-
 protected:
 
     /** keeps off-diagonal entries; drops diagonal entries */
@@ -192,18 +193,17 @@ class IncompleteLUT : internal::noncopyable
     int m_fillfactor;
     bool m_analysisIsOk;
     bool m_factorizationIsOk;
-    bool m_isInitialized;
     ComputationInfo m_info;
-    PermutationMatrix<Dynamic,Dynamic,Index> m_P;     // Fill-reducing permutation
-    PermutationMatrix<Dynamic,Dynamic,Index> m_Pinv;  // Inverse permutation
+    PermutationMatrix<Dynamic,Dynamic,StorageIndex> m_P;     // Fill-reducing permutation
+    PermutationMatrix<Dynamic,Dynamic,StorageIndex> m_Pinv;  // Inverse permutation
 };
 
 /**
  * Set control parameter droptol
  *  \param droptol   Drop any element whose magnitude is less than this tolerance 
  **/ 
-template<typename Scalar>
-void IncompleteLUT<Scalar>::setDroptol(const RealScalar& droptol)
+template<typename Scalar, typename StorageIndex>
+void IncompleteLUT<Scalar,StorageIndex>::setDroptol(const RealScalar& droptol)
 {
   this->m_droptol = droptol;   
 }
@@ -212,50 +212,62 @@ void IncompleteLUT<Scalar>::setDroptol(const RealScalar& droptol)
  * Set control parameter fillfactor
  * \param fillfactor  This is used to compute the  number @p fill_in of largest elements to keep on each row. 
  **/ 
-template<typename Scalar>
-void IncompleteLUT<Scalar>::setFillfactor(int fillfactor)
+template<typename Scalar, typename StorageIndex>
+void IncompleteLUT<Scalar,StorageIndex>::setFillfactor(int fillfactor)
 {
   this->m_fillfactor = fillfactor;   
 }
 
-template <typename Scalar>
+template <typename Scalar, typename StorageIndex>
 template<typename _MatrixType>
-void IncompleteLUT<Scalar>::analyzePattern(const _MatrixType& amat)
+void IncompleteLUT<Scalar,StorageIndex>::analyzePattern(const _MatrixType& amat)
 {
   // Compute the Fill-reducing permutation
-  SparseMatrix<Scalar,ColMajor, Index> mat1 = amat;
-  SparseMatrix<Scalar,ColMajor, Index> mat2 = amat.transpose();
-  // Symmetrize the pattern
+  // Since ILUT does not perform any numerical pivoting,
+  // it is highly preferable to keep the diagonal through symmetric permutations.
+#ifndef EIGEN_MPL2_ONLY
+  // To this end, let's symmetrize the pattern and perform AMD on it.
+  SparseMatrix<Scalar,ColMajor, StorageIndex> mat1 = amat;
+  SparseMatrix<Scalar,ColMajor, StorageIndex> mat2 = amat.transpose();
   // FIXME for a matrix with nearly symmetric pattern, mat2+mat1 is the appropriate choice.
   //       on the other hand for a really non-symmetric pattern, mat2*mat1 should be prefered...
-  SparseMatrix<Scalar,ColMajor, Index> AtA = mat2 + mat1;
-  AtA.prune(keep_diag());
-  internal::minimum_degree_ordering<Scalar, Index>(AtA, m_P);  // Then compute the AMD ordering...
-
-  m_Pinv  = m_P.inverse(); // ... and the inverse permutation
+  SparseMatrix<Scalar,ColMajor, StorageIndex> AtA = mat2 + mat1;
+  AMDOrdering<StorageIndex> ordering;
+  ordering(AtA,m_P);
+  m_Pinv  = m_P.inverse(); // cache the inverse permutation
+#else
+  // If AMD is not available (MPL2-only), then let's use the slower COLAMD routine.
+  SparseMatrix<Scalar,ColMajor, StorageIndex> mat1 = amat;
+  COLAMDOrdering<StorageIndex> ordering;
+  ordering(mat1,m_Pinv);
+  m_P = m_Pinv.inverse();
+#endif
 
   m_analysisIsOk = true;
+  m_factorizationIsOk = false;
+  m_isInitialized = true;
 }
 
-template <typename Scalar>
+template <typename Scalar, typename StorageIndex>
 template<typename _MatrixType>
-void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
+void IncompleteLUT<Scalar,StorageIndex>::factorize(const _MatrixType& amat)
 {
   using std::sqrt;
   using std::swap;
   using std::abs;
+  using internal::convert_index;
 
   eigen_assert((amat.rows() == amat.cols()) && "The factorization should be done on a square matrix");
   Index n = amat.cols();  // Size of the matrix
   m_lu.resize(n,n);
   // Declare Working vectors and variables
   Vector u(n) ;     // real values of the row -- maximum size is n --
-  VectorXi ju(n);   // column position of the values in u -- maximum size  is n
-  VectorXi jr(n);   // Indicate the position of the nonzero elements in the vector u -- A zero location is indicated by -1
+  VectorI ju(n);   // column position of the values in u -- maximum size  is n
+  VectorI jr(n);   // Indicate the position of the nonzero elements in the vector u -- A zero location is indicated by -1
 
   // Apply the fill-reducing permutation
   eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
-  SparseMatrix<Scalar,RowMajor, Index> mat;
+  SparseMatrix<Scalar,RowMajor, StorageIndex> mat;
   mat = amat.twistedBy(m_Pinv);
 
   // Initialization
@@ -264,7 +276,7 @@ void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
   u.fill(0);
 
   // number of largest elements to keep in each row:
-  Index fill_in =   static_cast<Index> (amat.nonZeros()*m_fillfactor)/n+1;
+  Index fill_in = (amat.nonZeros()*m_fillfactor)/n + 1;
   if (fill_in > n) fill_in = n;
 
   // number of largest nonzero elements to keep in the L and the U part of the current row:
@@ -279,9 +291,9 @@ void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
 
     Index sizeu = 1; // number of nonzero elements in the upper part of the current row
     Index sizel = 0; // number of nonzero elements in the lower part of the current row
-    ju(ii)    = ii;
+    ju(ii)    = convert_index<StorageIndex>(ii);
     u(ii)     = 0;
-    jr(ii)    = ii;
+    jr(ii)    = convert_index<StorageIndex>(ii);
     RealScalar rownorm = 0;
 
     typename FactorType::InnerIterator j_it(mat, ii); // Iterate through the current row ii
@@ -291,9 +303,9 @@ void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
       if (k < ii)
       {
         // copy the lower part
-        ju(sizel) = k;
+        ju(sizel) = convert_index<StorageIndex>(k);
         u(sizel) = j_it.value();
-        jr(k) = sizel;
+        jr(k) = convert_index<StorageIndex>(sizel);
         ++sizel;
       }
       else if (k == ii)
@@ -304,9 +316,9 @@ void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
       {
         // copy the upper part
         Index jpos = ii + sizeu;
-        ju(jpos) = k;
+        ju(jpos) = convert_index<StorageIndex>(k);
         u(jpos) = j_it.value();
-        jr(k) = jpos;
+        jr(k) = convert_index<StorageIndex>(jpos);
         ++sizeu;
       }
       rownorm += numext::abs2(j_it.value());
@@ -336,7 +348,8 @@ void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
         // swap the two locations
         Index j = ju(jj);
         swap(ju(jj), ju(k));
-        jr(minrow) = jj;   jr(j) = k;
+        jr(minrow) = convert_index<StorageIndex>(jj);
+        jr(j) = convert_index<StorageIndex>(k);
         swap(u(jj), u(k));
       }
       // Reset this location
@@ -360,8 +373,8 @@ void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
       for (; ki_it; ++ki_it)
       {
         Scalar prod = fact * ki_it.value();
-        Index j       = ki_it.index();
-        Index jpos    = jr(j);
+        Index j     = ki_it.index();
+        Index jpos  = jr(j);
         if (jpos == -1) // fill-in element
         {
           Index newpos;
@@ -377,16 +390,16 @@ void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
             sizel++;
             eigen_internal_assert(sizel<=ii);
           }
-          ju(newpos) = j;
+          ju(newpos) = convert_index<StorageIndex>(j);
           u(newpos) = -prod;
-          jr(j) = newpos;
+          jr(j) = convert_index<StorageIndex>(newpos);
         }
         else
           u(jpos) -= prod;
       }
       // store the pivot element
-      u(len) = fact;
-      ju(len) = minrow;
+      u(len)  = fact;
+      ju(len) = convert_index<StorageIndex>(minrow);
       ++len;
 
       jj++;
@@ -401,7 +414,7 @@ void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
     sizel = len;
     len = (std::min)(sizel, nnzL);
     typename Vector::SegmentReturnType ul(u.segment(0, sizel));
-    typename VectorXi::SegmentReturnType jul(ju.segment(0, sizel));
+    typename VectorI::SegmentReturnType jul(ju.segment(0, sizel));
     internal::QuickSplit(ul, jul, len);
 
     // store the largest m_fill elements of the L part
@@ -430,14 +443,13 @@ void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
     sizeu = len + 1; // +1 to take into account the diagonal element
     len = (std::min)(sizeu, nnzU);
     typename Vector::SegmentReturnType uu(u.segment(ii+1, sizeu-1));
-    typename VectorXi::SegmentReturnType juu(ju.segment(ii+1, sizeu-1));
+    typename VectorI::SegmentReturnType juu(ju.segment(ii+1, sizeu-1));
     internal::QuickSplit(uu, juu, len);
 
     // store the largest elements of the U part
     for(Index k = ii + 1; k < ii + len; k++)
       m_lu.insertBackByOuterInnerUnordered(ii,ju(k)) = u(k);
   }
-
   m_lu.finalize();
   m_lu.makeCompressed();
 
@@ -445,23 +457,6 @@ void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
   m_info = Success;
 }
 
-namespace internal {
-
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<IncompleteLUT<_MatrixType>, Rhs>
-  : solve_retval_base<IncompleteLUT<_MatrixType>, Rhs>
-{
-  typedef IncompleteLUT<_MatrixType> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-} // end namespace internal
-
 } // end namespace Eigen
 
 #endif // EIGEN_INCOMPLETE_LUT_H
diff --git a/nuparu/include/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/nuparu/include/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
index 2036922d..3d62fef6 100644
--- a/nuparu/include/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
+++ b/nuparu/include/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -12,29 +12,158 @@
 
 namespace Eigen { 
 
+namespace internal {
+
+template<typename MatrixType>
+struct is_ref_compatible_impl // value is nonzero when test() resolves to the Ref<const MatrixType> overload for a MatrixType argument
+{
+private:
+  template <typename T0>
+  struct any_conversion // catch-all parameter type: declares converting constructors from any T
+  {
+    template <typename T> any_conversion(const volatile T&);
+    template <typename T> any_conversion(T&);
+  };
+  struct yes {int a[1];}; // yes and no differ in size so that sizeof can tell which test() overload was chosen
+  struct no  {int a[2];};
+
+  template<typename T>
+  static yes test(const Ref<const T>&, int); // candidate that requires the argument to bind to a Ref<const T>
+  template<typename T>
+  static no  test(any_conversion<T>, ...); // fallback candidate; declared only, like the overload above
+
+public:
+  static MatrixType ms_from; // only named inside the sizeof expression below; no definition appears in this hunk
+  enum { value = sizeof(test<MatrixType>(ms_from, 0))==sizeof(yes) }; // compares the size of the selected overload's return type
+};
+
+template<typename MatrixType>
+struct is_ref_compatible // public entry point of the trait: forwards to is_ref_compatible_impl
+{
+  enum { value = is_ref_compatible_impl<typename remove_all<MatrixType>::type>::value }; // MatrixType is passed through remove_all before the test
+};
+
+template<typename MatrixType, bool MatrixFree = !internal::is_ref_compatible<MatrixType>::value> // MatrixFree defaults to true exactly when the trait reports 0
+class generic_matrix_wrapper; // declaration only; the two specializations below provide the definitions
+
+// We have an explicit matrix at hand, compatible with Ref<>: the wrapper holds a Ref<const MatrixType> to it
+template<typename MatrixType>
+class generic_matrix_wrapper<MatrixType,false>
+{
+public:
+  typedef Ref<const MatrixType> ActualMatrixType; // type handed out by matrix()
+  template<int UpLo> struct ConstSelfAdjointViewReturnType { // forwards to the nested type of the same name in ActualMatrixType
+    typedef typename ActualMatrixType::template ConstSelfAdjointViewReturnType<UpLo>::Type Type;
+  };
+
+  enum {
+    MatrixFree = false
+  };
+
+  generic_matrix_wrapper() // default state: the Ref refers to the 0x0 member m_dummy
+    : m_dummy(0,0), m_matrix(m_dummy)
+  {}
+
+  template<typename InputType>
+  generic_matrix_wrapper(const InputType &mat) // the Ref is constructed directly from mat; m_dummy is default-constructed
+    : m_matrix(mat)
+  {}
+
+  const ActualMatrixType& matrix() const // read-only access to the wrapped Ref
+  {
+    return m_matrix;
+  }
+
+  template<typename MatrixDerived>
+  void grab(const EigenBase<MatrixDerived> &mat) // re-targets the wrapper at mat
+  {
+    m_matrix.~Ref<const MatrixType>(); // destroy the current Ref in place ...
+    ::new (&m_matrix) Ref<const MatrixType>(mat.derived()); // ... and construct a new one in the same storage
+  }
+
+  void grab(const Ref<const MatrixType> &mat) // same as above for an argument that already is a Ref
+  {
+    if(&(mat.derived()) != &m_matrix) // skip when mat is m_matrix itself: it would be destroyed before being copied from
+    {
+      m_matrix.~Ref<const MatrixType>();
+      ::new (&m_matrix) Ref<const MatrixType>(mat);
+    }
+  }
+
+protected:
+  MatrixType m_dummy; // used to default initialize the Ref<> object; declared before m_matrix, hence constructed first
+  ActualMatrixType m_matrix; // destroyed and re-created in place by grab()
+};
+
+// MatrixType is not compatible with Ref<> -> matrix-free wrapper that only stores a pointer to the caller's object
+template<typename MatrixType>
+class generic_matrix_wrapper<MatrixType,true>
+{
+public:
+  typedef MatrixType ActualMatrixType; // matrix() hands out the user type itself
+  template<int UpLo> struct ConstSelfAdjointViewReturnType // Type is ActualMatrixType whatever UpLo is
+  {
+    typedef ActualMatrixType Type;
+  };
+
+  enum {
+    MatrixFree = true
+  };
+
+  generic_matrix_wrapper() // default state: no matrix attached (null pointer)
+    : mp_matrix(0)
+  {}
+
+  generic_matrix_wrapper(const MatrixType &mat) // keeps the address of mat; no copy is made
+    : mp_matrix(&mat)
+  {}
+
+  const ActualMatrixType& matrix() const // dereferences mp_matrix without a null check
+  {
+    return *mp_matrix;
+  }
+
+  void grab(const MatrixType &mat) // re-targets the wrapper at mat by storing its address
+  {
+    mp_matrix = &mat;
+  }
+
+protected:
+  const ActualMatrixType *mp_matrix; // not owned; 0 until a matrix is supplied through the constructor or grab()
+};
+
+}
+
 /** \ingroup IterativeLinearSolvers_Module
   * \brief Base class for linear iterative solvers
   *
   * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
   */
 template< typename Derived>
-class IterativeSolverBase : internal::noncopyable
+class IterativeSolverBase : public SparseSolverBase<Derived>
 {
+protected:
+  typedef SparseSolverBase<Derived> Base;
+  using Base::m_isInitialized;
+  
 public:
   typedef typename internal::traits<Derived>::MatrixType MatrixType;
   typedef typename internal::traits<Derived>::Preconditioner Preconditioner;
   typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::Index Index;
+  typedef typename MatrixType::StorageIndex StorageIndex;
   typedef typename MatrixType::RealScalar RealScalar;
 
+  enum {
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+  };
+
 public:
 
-  Derived& derived() { return *static_cast<Derived*>(this); }
-  const Derived& derived() const { return *static_cast<const Derived*>(this); }
+  using Base::derived;
 
   /** Default constructor. */
   IterativeSolverBase()
-    : mp_matrix(0)
   {
     init();
   }
@@ -49,77 +178,90 @@ class IterativeSolverBase : internal::noncopyable
     * this class becomes invalid. Call compute() to update it with the new
     * matrix A, or modify a copy of A.
     */
-  IterativeSolverBase(const MatrixType& A)
+  template<typename MatrixDerived>
+  explicit IterativeSolverBase(const EigenBase<MatrixDerived>& A)
+    : m_matrixWrapper(A.derived())
   {
     init();
-    compute(A);
+    compute(matrix());
   }
 
   ~IterativeSolverBase() {}
   
-  /** Initializes the iterative solver for the sparcity pattern of the matrix \a A for further solving \c Ax=b problems.
+  /** Initializes the iterative solver for the sparsity pattern of the matrix \a A for further solving \c Ax=b problems.
     *
-    * Currently, this function mostly call analyzePattern on the preconditioner. In the future
-    * we might, for instance, implement column reodering for faster matrix vector products.
+    * Currently, this function mostly calls analyzePattern on the preconditioner. In the future
+    * we might, for instance, implement column reordering for faster matrix vector products.
     */
-  Derived& analyzePattern(const MatrixType& A)
+  template<typename MatrixDerived>
+  Derived& analyzePattern(const EigenBase<MatrixDerived>& A)
   {
-    m_preconditioner.analyzePattern(A);
+    grab(A.derived());
+    m_preconditioner.analyzePattern(matrix());
     m_isInitialized = true;
     m_analysisIsOk = true;
-    m_info = Success;
+    m_info = m_preconditioner.info();
     return derived();
   }
   
   /** Initializes the iterative solver with the numerical values of the matrix \a A for further solving \c Ax=b problems.
     *
-    * Currently, this function mostly call factorize on the preconditioner.
+    * Currently, this function mostly calls factorize on the preconditioner.
     *
     * \warning this class stores a reference to the matrix A as well as some
     * precomputed values that depend on it. Therefore, if \a A is changed
     * this class becomes invalid. Call compute() to update it with the new
     * matrix A, or modify a copy of A.
     */
-  Derived& factorize(const MatrixType& A)
+  template<typename MatrixDerived>
+  Derived& factorize(const EigenBase<MatrixDerived>& A)
   {
     eigen_assert(m_analysisIsOk && "You must first call analyzePattern()"); 
-    mp_matrix = &A;
-    m_preconditioner.factorize(A);
+    grab(A.derived());
+    m_preconditioner.factorize(matrix());
     m_factorizationIsOk = true;
-    m_info = Success;
+    m_info = m_preconditioner.info();
     return derived();
   }
 
   /** Initializes the iterative solver with the matrix \a A for further solving \c Ax=b problems.
     *
-    * Currently, this function mostly initialized/compute the preconditioner. In the future
-    * we might, for instance, implement column reodering for faster matrix vector products.
+    * Currently, this function mostly initializes/computes the preconditioner. In the future
+    * we might, for instance, implement column reordering for faster matrix vector products.
     *
     * \warning this class stores a reference to the matrix A as well as some
     * precomputed values that depend on it. Therefore, if \a A is changed
     * this class becomes invalid. Call compute() to update it with the new
     * matrix A, or modify a copy of A.
     */
-  Derived& compute(const MatrixType& A)
+  template<typename MatrixDerived>
+  Derived& compute(const EigenBase<MatrixDerived>& A)
   {
-    mp_matrix = &A;
-    m_preconditioner.compute(A);
+    grab(A.derived());
+    m_preconditioner.compute(matrix());
     m_isInitialized = true;
     m_analysisIsOk = true;
     m_factorizationIsOk = true;
-    m_info = Success;
+    m_info = m_preconditioner.info();
     return derived();
   }
 
   /** \internal */
-  Index rows() const { return mp_matrix ? mp_matrix->rows() : 0; }
+  Index rows() const { return matrix().rows(); }
+
   /** \internal */
-  Index cols() const { return mp_matrix ? mp_matrix->cols() : 0; }
+  Index cols() const { return matrix().cols(); }
 
-  /** \returns the tolerance threshold used by the stopping criteria */
+  /** \returns the tolerance threshold used by the stopping criteria.
+    * \sa setTolerance()
+    */
   RealScalar tolerance() const { return m_tolerance; }
   
-  /** Sets the tolerance threshold used by the stopping criteria */
+  /** Sets the tolerance threshold used by the stopping criteria.
+    *
+    * This value is used as an upper bound to the relative residual error: |Ax-b|/|b|.
+    * The default value is the machine precision given by NumTraits<Scalar>::epsilon()
+    */
   Derived& setTolerance(const RealScalar& tolerance)
   {
     m_tolerance = tolerance;
@@ -132,58 +274,52 @@ class IterativeSolverBase : internal::noncopyable
   /** \returns a read-only reference to the preconditioner. */
   const Preconditioner& preconditioner() const { return m_preconditioner; }
 
-  /** \returns the max number of iterations */
-  int maxIterations() const
+  /** \returns the max number of iterations.
+    * It is either the value set by setMaxIterations() or, by default,
+    * twice the number of columns of the matrix.
+    */
+  Index maxIterations() const
   {
-    return (mp_matrix && m_maxIterations<0) ? mp_matrix->cols() : m_maxIterations;
+    return (m_maxIterations<0) ? 2*matrix().cols() : m_maxIterations;
   }
   
-  /** Sets the max number of iterations */
-  Derived& setMaxIterations(int maxIters)
+  /** Sets the max number of iterations.
+    * Default is twice the number of columns of the matrix.
+    */
+  Derived& setMaxIterations(Index maxIters)
   {
     m_maxIterations = maxIters;
     return derived();
   }
 
   /** \returns the number of iterations performed during the last solve */
-  int iterations() const
+  Index iterations() const
   {
     eigen_assert(m_isInitialized && "ConjugateGradient is not initialized.");
     return m_iterations;
   }
 
-  /** \returns the tolerance error reached during the last solve */
+  /** \returns the tolerance error reached during the last solve.
+    * It is a close approximation of the true relative residual error |Ax-b|/|b|.
+    */
   RealScalar error() const
   {
     eigen_assert(m_isInitialized && "ConjugateGradient is not initialized.");
     return m_error;
   }
 
-  /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-    *
-    * \sa compute()
-    */
-  template<typename Rhs> inline const internal::solve_retval<Derived, Rhs>
-  solve(const MatrixBase<Rhs>& b) const
-  {
-    eigen_assert(m_isInitialized && "IterativeSolverBase is not initialized.");
-    eigen_assert(rows()==b.rows()
-              && "IterativeSolverBase::solve(): invalid number of rows of the right hand side matrix b");
-    return internal::solve_retval<Derived, Rhs>(derived(), b.derived());
-  }
-  
-  /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
+  /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A
+    * and \a x0 as an initial solution.
     *
-    * \sa compute()
+    * \sa solve(), compute()
     */
-  template<typename Rhs>
-  inline const internal::sparse_solve_retval<IterativeSolverBase, Rhs>
-  solve(const SparseMatrixBase<Rhs>& b) const
+  template<typename Rhs,typename Guess>
+  inline const SolveWithGuess<Derived, Rhs, Guess>
+  solveWithGuess(const MatrixBase<Rhs>& b, const Guess& x0) const
   {
-    eigen_assert(m_isInitialized && "IterativeSolverBase is not initialized.");
-    eigen_assert(rows()==b.rows()
-              && "IterativeSolverBase::solve(): invalid number of rows of the right hand side matrix b");
-    return internal::sparse_solve_retval<IterativeSolverBase, Rhs>(*this, b.derived());
+    eigen_assert(m_isInitialized && "Solver is not initialized.");
+    eigen_assert(derived().rows()==b.rows() && "solve(): invalid number of rows of the right hand side matrix b");
+    return SolveWithGuess<Derived, Rhs, Guess>(derived(), b.derived(), x0);
   }
 
   /** \returns Success if the iterations converged, and NoConvergence otherwise. */
@@ -195,20 +331,24 @@ class IterativeSolverBase : internal::noncopyable
   
   /** \internal */
   template<typename Rhs, typename DestScalar, int DestOptions, typename DestIndex>
-  void _solve_sparse(const Rhs& b, SparseMatrix<DestScalar,DestOptions,DestIndex> &dest) const
+  void _solve_impl(const Rhs& b, SparseMatrix<DestScalar,DestOptions,DestIndex> &dest) const
   {
     eigen_assert(rows()==b.rows());
     
-    int rhsCols = b.cols();
-    int size = b.rows();
+    Index rhsCols = b.cols();
+    Index size = b.rows();
     Eigen::Matrix<DestScalar,Dynamic,1> tb(size);
-    Eigen::Matrix<DestScalar,Dynamic,1> tx(size);
-    for(int k=0; k<rhsCols; ++k)
+    Eigen::Matrix<DestScalar,Dynamic,1> tx(cols());
+    // We do not directly fill dest because sparse expressions have to be free of aliasing issues.
+    // For non-square least-square problems, b and dest might not have the same size whereas they might alias each other.
+    SparseMatrix<DestScalar,DestOptions,DestIndex> tmp(cols(),rhsCols);
+    for(Index k=0; k<rhsCols; ++k)
     {
       tb = b.col(k);
       tx = derived().solve(tb);
-      dest.col(k) = tx.sparseView(0);
+      tmp.col(k) = tx.sparseView(0);
     }
+    tmp.swap(dest);
   }
 
 protected:
@@ -220,35 +360,33 @@ class IterativeSolverBase : internal::noncopyable
     m_maxIterations = -1;
     m_tolerance = NumTraits<Scalar>::epsilon();
   }
-  const MatrixType* mp_matrix;
+
+  typedef internal::generic_matrix_wrapper<MatrixType> MatrixWrapper;
+  typedef typename MatrixWrapper::ActualMatrixType ActualMatrixType;
+
+  const ActualMatrixType& matrix() const
+  {
+    return m_matrixWrapper.matrix();
+  }
+  
+  template<typename InputType>
+  void grab(const InputType &A)
+  {
+    m_matrixWrapper.grab(A);
+  }
+  
+  MatrixWrapper m_matrixWrapper;
   Preconditioner m_preconditioner;
 
-  int m_maxIterations;
+  Index m_maxIterations;
   RealScalar m_tolerance;
   
   mutable RealScalar m_error;
-  mutable int m_iterations;
+  mutable Index m_iterations;
   mutable ComputationInfo m_info;
-  mutable bool m_isInitialized, m_analysisIsOk, m_factorizationIsOk;
-};
-
-namespace internal {
- 
-template<typename Derived, typename Rhs>
-struct sparse_solve_retval<IterativeSolverBase<Derived>, Rhs>
-  : sparse_solve_retval_base<IterativeSolverBase<Derived>, Rhs>
-{
-  typedef IterativeSolverBase<Derived> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec().derived()._solve_sparse(rhs(),dst);
-  }
+  mutable bool m_analysisIsOk, m_factorizationIsOk;
 };
 
-} // end namespace internal
-
 } // end namespace Eigen
 
 #endif // EIGEN_ITERATIVE_SOLVER_BASE_H
diff --git a/nuparu/include/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h b/nuparu/include/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h
new file mode 100644
index 00000000..0aea0e09
--- /dev/null
+++ b/nuparu/include/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h
@@ -0,0 +1,216 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_LEAST_SQUARE_CONJUGATE_GRADIENT_H
+#define EIGEN_LEAST_SQUARE_CONJUGATE_GRADIENT_H
+
+namespace Eigen { 
+
+namespace internal {
+
+/** \internal Low-level conjugate gradient algorithm for least-square problems
+  * \param mat The matrix A
+  * \param rhs The right hand side vector b
+  * \param x On input an initial solution, on output the computed solution.
+  * \param precond A preconditioner being able to efficiently solve for an
+  *                approximation of A'Ax=b (regardless of b)
+  * \param iters On input the max number of iterations, on output the number of performed iterations.
+  * \param tol_error On input the tolerance error, on output an estimation of the relative error.
+  */
+template<typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
+EIGEN_DONT_INLINE
+void least_square_conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x,
+                                     const Preconditioner& precond, Index& iters,
+                                     typename Dest::RealScalar& tol_error)
+{
+  using std::sqrt;
+  using std::abs;
+  typedef typename Dest::RealScalar RealScalar;
+  typedef typename Dest::Scalar Scalar;
+  typedef Matrix<Scalar,Dynamic,1> VectorType;
+  
+  RealScalar tol = tol_error;                       // copy the inputs: tol_error and iters are overwritten with outputs below
+  Index maxIters = iters;
+  
+  Index m = mat.rows(), n = mat.cols();
+
+  VectorType residual        = rhs - mat * x;              // residual of the original system for the initial content of x
+  VectorType normal_residual = mat.adjoint() * residual;   // residual of the normal equation A'A x = A'b
+
+  RealScalar rhsNorm2 = (mat.adjoint()*rhs).squaredNorm();
+  if(rhsNorm2 == 0)                                 // A'b is exactly zero: return x = 0 without iterating
+  {
+    x.setZero();
+    iters = 0;
+    tol_error = 0;
+    return;
+  }
+  RealScalar threshold = tol*tol*rhsNorm2;          // squared tolerance, so squared norms can be compared directly
+  RealScalar residualNorm2 = normal_residual.squaredNorm();
+  if (residualNorm2 < threshold)                    // the initial content of x already satisfies the tolerance
+  {
+    iters = 0;
+    tol_error = sqrt(residualNorm2 / rhsNorm2);
+    return;
+  }
+  
+  VectorType p(n);
+  p = precond.solve(normal_residual);                         // initial search direction
+
+  VectorType z(n), tmp(m);
+  RealScalar absNew = numext::real(normal_residual.dot(p));  // the square of the absolute value of r scaled by invM
+  Index i = 0;
+  while(i < maxIters)
+  {
+    tmp.noalias() = mat * p;
+
+    Scalar alpha = absNew / tmp.squaredNorm();      // the amount we travel on dir
+    x += alpha * p;                                 // update solution
+    residual -= alpha * tmp;                        // update residual
+    normal_residual = mat.adjoint() * residual;     // update residual of the normal equation
+    
+    residualNorm2 = normal_residual.squaredNorm();
+    if(residualNorm2 < threshold)                   // converged: leave before i++, so this last step is not counted in iters
+      break;
+    
+    z = precond.solve(normal_residual);             // approximately solve for "A'A z = normal_residual"
+
+    RealScalar absOld = absNew;
+    absNew = numext::real(normal_residual.dot(z));  // update the absolute value of r
+    RealScalar beta = absNew / absOld;              // calculate the Gram-Schmidt value used to create the new search direction
+    p = z + beta * p;                               // update search direction
+    i++;
+  }
+  tol_error = sqrt(residualNorm2 / rhsNorm2);       // relative norm of the normal-equation residual
+  iters = i;
+}
+
+}
+
+template< typename _MatrixType,
+          typename _Preconditioner = LeastSquareDiagonalPreconditioner<typename _MatrixType::Scalar> >
+class LeastSquaresConjugateGradient;
+
+namespace internal {
+
+template< typename _MatrixType, typename _Preconditioner>
+struct traits<LeastSquaresConjugateGradient<_MatrixType,_Preconditioner> >  // re-exports the solver's two template arguments
+{
+  typedef _MatrixType MatrixType;
+  typedef _Preconditioner Preconditioner;
+};
+
+}
+
+/** \ingroup IterativeLinearSolvers_Module
+  * \brief A conjugate gradient solver for sparse (or dense) least-square problems
+  *
+  * This class allows to solve for A x = b linear problems using an iterative conjugate gradient algorithm.
+  * The matrix A can be non symmetric and rectangular, but the matrix A' A should be positive-definite to guarantee stability.
+  * Otherwise, the SparseLU or SparseQR classes might be preferable.
+  * The matrix A and the vectors x and b can be either dense or sparse.
+  *
+  * \tparam _MatrixType the type of the matrix A, can be a dense or a sparse matrix.
+  * \tparam _Preconditioner the type of the preconditioner. Default is LeastSquareDiagonalPreconditioner
+  *
+  * \implsparsesolverconcept
+  * 
+  * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
+  * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
+  * and NumTraits<Scalar>::epsilon() for the tolerance.
+  * 
+  * This class can be used as the direct solver classes. Here is a typical usage example:
+    \code
+    int m=1000000, n = 10000;
+    VectorXd x(n), b(m);
+    SparseMatrix<double> A(m,n);
+    // fill A and b
+    LeastSquaresConjugateGradient<SparseMatrix<double> > lscg;
+    lscg.compute(A);
+    x = lscg.solve(b);
+    std::cout << "#iterations:     " << lscg.iterations() << std::endl;
+    std::cout << "estimated error: " << lscg.error()      << std::endl;
+    // update b, and solve again
+    x = lscg.solve(b);
+    \endcode
+  * 
+  * By default the iterations start with x=0 as an initial guess of the solution.
+  * One can control the start using the solveWithGuess() method.
+  * 
+  * \sa class ConjugateGradient, SparseLU, SparseQR
+  */
+template< typename _MatrixType, typename _Preconditioner>
+class LeastSquaresConjugateGradient : public IterativeSolverBase<LeastSquaresConjugateGradient<_MatrixType,_Preconditioner> >
+{
+  typedef IterativeSolverBase<LeastSquaresConjugateGradient> Base;
+  using Base::matrix;
+  using Base::m_error;
+  using Base::m_iterations;
+  using Base::m_info;
+  using Base::m_isInitialized;
+public:
+  typedef _MatrixType MatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef _Preconditioner Preconditioner;
+
+public:
+
+  /** Default constructor. */
+  LeastSquaresConjugateGradient() : Base() {}
+
+  /** Initialize the solver with matrix \a A for further \c Ax=b solving.
+    * 
+    * This constructor is a shortcut for the default constructor followed
+    * by a call to compute().
+    * 
+    * \warning this class stores a reference to the matrix A as well as some
+    * precomputed values that depend on it. Therefore, if \a A is changed
+    * this class becomes invalid. Call compute() to update it with the new
+    * matrix A, or modify a copy of A.
+    */
+  template<typename MatrixDerived>
+  explicit LeastSquaresConjugateGradient(const EigenBase<MatrixDerived>& A) : Base(A.derived()) {}
+
+  ~LeastSquaresConjugateGradient() {}
+
+  /** \internal Solves column by column; \a x holds the initial guess on input and the solution on output. */
+  template<typename Rhs,typename Dest>
+  void _solve_with_guess_impl(const Rhs& b, Dest& x) const
+  {
+    m_iterations = Base::maxIterations();   // these two values are what gets reported when b has no columns
+    m_error = Base::m_tolerance;
+
+    for(Index j=0; j<b.cols(); ++j)
+    {
+      m_iterations = Base::maxIterations(); // reset for every column: both are in/out arguments overwritten by the solver
+      m_error = Base::m_tolerance;
+
+      typename Dest::ColXpr xj(x,j);
+      internal::least_square_conjugate_gradient(matrix(), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error);
+    }
+
+    m_isInitialized = true;
+    m_info = m_error <= Base::m_tolerance ? Success : NoConvergence;  // NOTE: m_error/m_iterations/m_info reflect the last column only
+  }
+  
+  /** \internal Solves with a zero initial guess: \a x is cleared and then refined by _solve_with_guess_impl(). */
+  using Base::_solve_impl;
+  template<typename Rhs,typename Dest>
+  void _solve_impl(const MatrixBase<Rhs>& b, Dest& x) const
+  {
+    x.setZero();
+    _solve_with_guess_impl(b.derived(),x);
+  }
+
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_LEAST_SQUARE_CONJUGATE_GRADIENT_H
diff --git a/nuparu/include/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h b/nuparu/include/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
new file mode 100644
index 00000000..35923be3
--- /dev/null
+++ b/nuparu/include/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
@@ -0,0 +1,109 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SOLVEWITHGUESS_H
+#define EIGEN_SOLVEWITHGUESS_H
+
+namespace Eigen {
+
+template<typename Decomposition, typename RhsType, typename GuessType> class SolveWithGuess;
+  
+/** \class SolveWithGuess
+  * \ingroup IterativeLinearSolvers_Module
+  *
+  * \brief Pseudo expression representing a solving operation
+  *
+  * \tparam Decomposition the type of the matrix or decomposition object
+  * \tparam RhsType the type of the right-hand side
+  *
+  * This class represents an expression of A.solve(B)
+  * and most of the time this is the only way it is used.
+  *
+  */
+namespace internal {
+
+
+template<typename Decomposition, typename RhsType, typename GuessType>
+struct traits<SolveWithGuess<Decomposition, RhsType, GuessType> >
+  : traits<Solve<Decomposition,RhsType> >  // same traits as the plain Solve expression; GuessType does not take part
+{};
+
+}
+
+
+template<typename Decomposition, typename RhsType, typename GuessType>
+class SolveWithGuess : public internal::generic_xpr_base<SolveWithGuess<Decomposition,RhsType,GuessType>, MatrixXpr, typename internal::traits<RhsType>::StorageKind>::type
+{
+public:
+  typedef typename internal::traits<SolveWithGuess>::Scalar Scalar;
+  typedef typename internal::traits<SolveWithGuess>::PlainObject PlainObject;
+  typedef typename internal::generic_xpr_base<SolveWithGuess<Decomposition,RhsType,GuessType>, MatrixXpr, typename internal::traits<RhsType>::StorageKind>::type Base;
+  
+  SolveWithGuess(const Decomposition &dec, const RhsType &rhs, const GuessType &guess)  // only binds references, nothing is copied or solved here
+    : m_dec(dec), m_rhs(rhs), m_guess(guess)
+  {}
+  
+  EIGEN_DEVICE_FUNC Index rows() const { return m_dec.cols(); }  // the result has one row per column of the decomposition
+  EIGEN_DEVICE_FUNC Index cols() const { return m_rhs.cols(); }  // and one column per right-hand side
+
+  EIGEN_DEVICE_FUNC const Decomposition& dec()   const { return m_dec; }
+  EIGEN_DEVICE_FUNC const RhsType&       rhs()   const { return m_rhs; }
+  EIGEN_DEVICE_FUNC const GuessType&     guess() const { return m_guess; }
+
+protected:
+  const Decomposition &m_dec;    // all three operands are held by reference: they must outlive this expression
+  const RhsType       &m_rhs;
+  const GuessType     &m_guess;
+  
+private:
+  Scalar coeff(Index row, Index col) const;  // declared without a visible definition; presumably to forbid coefficient access (TODO confirm)
+  Scalar coeff(Index i) const;
+};
+
+namespace internal {
+
+// Evaluator of SolveWithGuess -> eval into a temporary that is seeded with the guess and then refined by the solver
+template<typename Decomposition, typename RhsType, typename GuessType>
+struct evaluator<SolveWithGuess<Decomposition,RhsType, GuessType> >
+  : public evaluator<typename SolveWithGuess<Decomposition,RhsType,GuessType>::PlainObject>
+{
+  typedef SolveWithGuess<Decomposition,RhsType,GuessType> SolveType;
+  typedef typename SolveType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  evaluator(const SolveType& solve)
+    : m_result(solve.rows(), solve.cols())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);            // re-construct the base evaluator on top of m_result
+    m_result = solve.guess();                                   // _solve_with_guess_impl(rhs, x) reads the initial guess from x
+    solve.dec()._solve_with_guess_impl(solve.rhs(), m_result);  // two-argument form, as in the Assignment specialization
+  }
+protected:  
+  PlainObject m_result;
+};
+
+// Specialization for "dst = dec.solveWithGuess(rhs)"
+// NOTE we need to specialize it for Dense2Dense to avoid ambiguous specialization error and a Sparse2Sparse specialization must exist somewhere
+template<typename DstXprType, typename DecType, typename RhsType, typename GuessType, typename Scalar>
+struct Assignment<DstXprType, SolveWithGuess<DecType,RhsType,GuessType>, internal::assign_op<Scalar>, Dense2Dense, Scalar>
+{
+  typedef SolveWithGuess<DecType,RhsType,GuessType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  {
+    // FIXME shall we resize dst here?
+    dst = src.guess();                                                  // seed dst with the guess: the solver below uses dst as in/out
+    src.dec()._solve_with_guess_impl(src.rhs(), dst/*, src.guess()*/);  // refines dst in place, so no temporary is needed
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SOLVEWITHGUESS_H
diff --git a/nuparu/include/Eigen/src/Jacobi/Jacobi.h b/nuparu/include/Eigen/src/Jacobi/Jacobi.h
index 956f72d5..55de15e8 100644
--- a/nuparu/include/Eigen/src/Jacobi/Jacobi.h
+++ b/nuparu/include/Eigen/src/Jacobi/Jacobi.h
@@ -62,7 +62,7 @@ template<typename Scalar> class JacobiRotation
     JacobiRotation adjoint() const { using numext::conj; return JacobiRotation(conj(m_c), -m_s); }
 
     template<typename Derived>
-    bool makeJacobi(const MatrixBase<Derived>&, typename Derived::Index p, typename Derived::Index q);
+    bool makeJacobi(const MatrixBase<Derived>&, Index p, Index q);
     bool makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z);
 
     void makeGivens(const Scalar& p, const Scalar& q, Scalar* z=0);
@@ -123,7 +123,7 @@ bool JacobiRotation<Scalar>::makeJacobi(const RealScalar& x, const Scalar& y, co
   */
 template<typename Scalar>
 template<typename Derived>
-inline bool JacobiRotation<Scalar>::makeJacobi(const MatrixBase<Derived>& m, typename Derived::Index p, typename Derived::Index q)
+inline bool JacobiRotation<Scalar>::makeJacobi(const MatrixBase<Derived>& m, Index p, Index q)
 {
   return makeJacobi(numext::real(m.coeff(p,p)), m.coeff(p,q), numext::real(m.coeff(q,q)));
 }
@@ -255,15 +255,15 @@ void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar
 *   Implementation of MatrixBase methods
 ****************************************************************************************/
 
+namespace internal {
 /** \jacobi_module
   * Applies the clock wise 2D rotation \a j to the set of 2D vectors of cordinates \a x and \a y:
   * \f$ \left ( \begin{array}{cc} x \\ y \end{array} \right )  =  J \left ( \begin{array}{cc} x \\ y \end{array} \right ) \f$
   *
   * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
   */
-namespace internal {
 template<typename VectorX, typename VectorY, typename OtherScalar>
-void apply_rotation_in_the_plane(VectorX& _x, VectorY& _y, const JacobiRotation<OtherScalar>& j);
+void apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& xpr_y, const JacobiRotation<OtherScalar>& j);
 }
 
 /** \jacobi_module
@@ -298,19 +298,18 @@ inline void MatrixBase<Derived>::applyOnTheRight(Index p, Index q, const JacobiR
 
 namespace internal {
 template<typename VectorX, typename VectorY, typename OtherScalar>
-void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(VectorX& _x, VectorY& _y, const JacobiRotation<OtherScalar>& j)
+void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& xpr_y, const JacobiRotation<OtherScalar>& j)
 {
-  typedef typename VectorX::Index Index;
   typedef typename VectorX::Scalar Scalar;
   enum { PacketSize = packet_traits<Scalar>::size };
   typedef typename packet_traits<Scalar>::type Packet;
-  eigen_assert(_x.size() == _y.size());
-  Index size = _x.size();
-  Index incrx = _x.innerStride();
-  Index incry = _y.innerStride();
+  eigen_assert(xpr_x.size() == xpr_y.size());
+  Index size = xpr_x.size();
+  Index incrx = xpr_x.derived().innerStride();
+  Index incry = xpr_y.derived().innerStride();
 
-  Scalar* EIGEN_RESTRICT x = &_x.coeffRef(0);
-  Scalar* EIGEN_RESTRICT y = &_y.coeffRef(0);
+  Scalar* EIGEN_RESTRICT x = &xpr_x.derived().coeffRef(0);
+  Scalar* EIGEN_RESTRICT y = &xpr_y.derived().coeffRef(0);
   
   OtherScalar c = j.c();
   OtherScalar s = j.s();
@@ -326,7 +325,7 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(VectorX& _x, VectorY& _y,
     // both vectors are sequentially stored in memory => vectorization
     enum { Peeling = 2 };
 
-    Index alignedStart = internal::first_aligned(y, size);
+    Index alignedStart = internal::first_default_aligned(y, size);
     Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize;
 
     const Packet pc = pset1<Packet>(c);
@@ -344,7 +343,7 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(VectorX& _x, VectorY& _y,
     Scalar* EIGEN_RESTRICT px = x + alignedStart;
     Scalar* EIGEN_RESTRICT py = y + alignedStart;
 
-    if(internal::first_aligned(x, size)==alignedStart)
+    if(internal::first_default_aligned(x, size)==alignedStart)
     {
       for(Index i=alignedStart; i<alignedEnd; i+=PacketSize)
       {
@@ -393,7 +392,7 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(VectorX& _x, VectorY& _y,
   /*** fixed-size vectorized path ***/
   else if(VectorX::SizeAtCompileTime != Dynamic &&
           (VectorX::Flags & VectorY::Flags & PacketAccessBit) &&
-          (VectorX::Flags & VectorY::Flags & AlignedBit))
+          (EIGEN_PLAIN_ENUM_MIN(evaluator<VectorX>::Alignment, evaluator<VectorY>::Alignment)>0)) // FIXME should be compared to the required alignment
   {
     const Packet pc = pset1<Packet>(c);
     const Packet ps = pset1<Packet>(s);
diff --git a/nuparu/include/Eigen/src/LU/Determinant.h b/nuparu/include/Eigen/src/LU/Determinant.h
index bb8e78a8..d6a3c1e5 100644
--- a/nuparu/include/Eigen/src/LU/Determinant.h
+++ b/nuparu/include/Eigen/src/LU/Determinant.h
@@ -92,7 +92,7 @@ template<typename Derived>
 inline typename internal::traits<Derived>::Scalar MatrixBase<Derived>::determinant() const
 {
   eigen_assert(rows() == cols());
-  typedef typename internal::nested<Derived,Base::RowsAtCompileTime>::type Nested;
+  typedef typename internal::nested_eval<Derived,Base::RowsAtCompileTime>::type Nested;
   return internal::determinant_impl<typename internal::remove_all<Nested>::type>::run(derived());
 }
 
diff --git a/nuparu/include/Eigen/src/LU/FullPivLU.h b/nuparu/include/Eigen/src/LU/FullPivLU.h
index dfe25f42..0c4d6392 100644
--- a/nuparu/include/Eigen/src/LU/FullPivLU.h
+++ b/nuparu/include/Eigen/src/LU/FullPivLU.h
@@ -10,7 +10,18 @@
 #ifndef EIGEN_LU_H
 #define EIGEN_LU_H
 
-namespace Eigen { 
+namespace Eigen {
+
+namespace internal {
+template<typename _MatrixType> struct traits<FullPivLU<_MatrixType> >
+ : traits<_MatrixType>                 // start from the matrix type's traits, then override the three entries below
+{
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  enum { Flags = 0 };
+};
+
+} // end namespace internal
 
 /** \ingroup LU_Module
   *
@@ -20,10 +31,11 @@ namespace Eigen {
   *
   * \param MatrixType the type of the matrix of which we are computing the LU decomposition
   *
-  * This class represents a LU decomposition of any matrix, with complete pivoting: the matrix A
-  * is decomposed as A = PLUQ where L is unit-lower-triangular, U is upper-triangular, and P and Q
-  * are permutation matrices. This is a rank-revealing LU decomposition. The eigenvalues (diagonal
-  * coefficients) of U are sorted in such a way that any zeros are at the end.
+  * This class represents a LU decomposition of any matrix, with complete pivoting: the matrix A is
+  * decomposed as \f$ A = P^{-1} L U Q^{-1} \f$ where L is unit-lower-triangular, U is
+  * upper-triangular, and P and Q are permutation matrices. This is a rank-revealing LU
+  * decomposition. The eigenvalues (diagonal coefficients) of U are sorted in such a way that any
+  * zeros are at the end.
   *
   * This decomposition provides the generic approach to solving systems of linear equations, computing
   * the rank, invertibility, inverse, kernel, and determinant.
@@ -43,24 +55,23 @@ namespace Eigen {
   * \sa MatrixBase::fullPivLu(), MatrixBase::determinant(), MatrixBase::inverse()
   */
 template<typename _MatrixType> class FullPivLU
+  : public SolverBase<FullPivLU<_MatrixType> >
 {
   public:
     typedef _MatrixType MatrixType;
+    typedef SolverBase<FullPivLU> Base;
+
+    EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivLU)
+    // FIXME StorageIndex defined in EIGEN_GENERIC_PUBLIC_INTERFACE should be int
     enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef typename internal::traits<MatrixType>::StorageKind StorageKind;
-    typedef typename MatrixType::Index Index;
-    typedef typename internal::plain_row_type<MatrixType, Index>::type IntRowVectorType;
-    typedef typename internal::plain_col_type<MatrixType, Index>::type IntColVectorType;
+    typedef typename internal::plain_row_type<MatrixType, StorageIndex>::type IntRowVectorType;
+    typedef typename internal::plain_col_type<MatrixType, StorageIndex>::type IntColVectorType;
     typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime> PermutationQType;
     typedef PermutationMatrix<RowsAtCompileTime, MaxRowsAtCompileTime> PermutationPType;
+    typedef typename MatrixType::PlainObject PlainObject;
 
     /**
       * \brief Default Constructor.
@@ -83,7 +94,8 @@ template<typename _MatrixType> class FullPivLU
       * \param matrix the matrix of which to compute the LU decomposition.
       *               It is required to be nonzero.
       */
-    FullPivLU(const MatrixType& matrix);
+    template<typename InputType>
+    explicit FullPivLU(const EigenBase<InputType>& matrix);
 
     /** Computes the LU decomposition of the given matrix.
       *
@@ -92,7 +104,8 @@ template<typename _MatrixType> class FullPivLU
       *
       * \returns a reference to *this
       */
-    FullPivLU& compute(const MatrixType& matrix);
+    template<typename InputType>
+    FullPivLU& compute(const EigenBase<InputType>& matrix);
 
     /** \returns the LU decomposition matrix: the upper-triangular part is U, the
       * unit-lower-triangular part is L (at least for square matrices; in the non-square
@@ -165,7 +178,7 @@ template<typename _MatrixType> class FullPivLU
     }
 
     /** \returns the image of the matrix, also called its column-space. The columns of the returned matrix
-      * will form a basis of the kernel.
+      * will form a basis of the image (column-space).
       *
       * \param originalMatrix the original matrix, of which *this is the LU decomposition.
       *                       The reason why it is needed to pass it here, is that this allows
@@ -209,12 +222,13 @@ template<typename _MatrixType> class FullPivLU
       *
       * \sa TriangularView::solve(), kernel(), inverse()
       */
+    // FIXME this is a copy-paste of the base-class member to add the isInitialized assertion.
     template<typename Rhs>
-    inline const internal::solve_retval<FullPivLU, Rhs>
+    inline const Solve<FullPivLU, Rhs>
     solve(const MatrixBase<Rhs>& b) const
     {
       eigen_assert(m_isInitialized && "LU is not initialized.");
-      return internal::solve_retval<FullPivLU, Rhs>(*this, b.derived());
+      return Solve<FullPivLU, Rhs>(*this, b.derived());
     }
 
     /** \returns the determinant of the matrix of which
@@ -359,12 +373,11 @@ template<typename _MatrixType> class FullPivLU
       *
       * \sa MatrixBase::inverse()
       */
-    inline const internal::solve_retval<FullPivLU,typename MatrixType::IdentityReturnType> inverse() const
+    inline const Inverse<FullPivLU> inverse() const
     {
       eigen_assert(m_isInitialized && "LU is not initialized.");
       eigen_assert(m_lu.rows() == m_lu.cols() && "You can't take the inverse of a non-square matrix!");
-      return internal::solve_retval<FullPivLU,typename MatrixType::IdentityReturnType>
-               (*this, MatrixType::Identity(m_lu.rows(), m_lu.cols()));
+      return Inverse<FullPivLU>(*this);
     }
 
     MatrixType reconstructedMatrix() const;
@@ -372,7 +385,25 @@ template<typename _MatrixType> class FullPivLU
     inline Index rows() const { return m_lu.rows(); }
     inline Index cols() const { return m_lu.cols(); }
 
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
+    void _solve_impl(const RhsType &rhs, DstType &dst) const;
+
+    template<bool Conjugate, typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
+    void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
+    #endif
+
   protected:
+
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    }
+
+    void computeInPlace();
+
     MatrixType m_lu;
     PermutationPType m_p;
     PermutationQType m_q;
@@ -402,7 +433,8 @@ FullPivLU<MatrixType>::FullPivLU(Index rows, Index cols)
 }
 
 template<typename MatrixType>
-FullPivLU<MatrixType>::FullPivLU(const MatrixType& matrix)
+template<typename InputType>
+FullPivLU<MatrixType>::FullPivLU(const EigenBase<InputType>& matrix)
   : m_lu(matrix.rows(), matrix.cols()),
     m_p(matrix.rows()),
     m_q(matrix.cols()),
@@ -411,26 +443,37 @@ FullPivLU<MatrixType>::FullPivLU(const MatrixType& matrix)
     m_isInitialized(false),
     m_usePrescribedThreshold(false)
 {
-  compute(matrix);
+  compute(matrix.derived());
 }
 
 template<typename MatrixType>
-FullPivLU<MatrixType>& FullPivLU<MatrixType>::compute(const MatrixType& matrix)
+template<typename InputType>
+FullPivLU<MatrixType>& FullPivLU<MatrixType>::compute(const EigenBase<InputType>& matrix)
 {
+  check_template_parameters();
+
   // the permutations are stored as int indices, so just to be sure:
   eigen_assert(matrix.rows()<=NumTraits<int>::highest() && matrix.cols()<=NumTraits<int>::highest());
-  
+
   m_isInitialized = true;
-  m_lu = matrix;
+  m_lu = matrix.derived();
 
-  const Index size = matrix.diagonalSize();
-  const Index rows = matrix.rows();
-  const Index cols = matrix.cols();
+  computeInPlace();
+
+  return *this;
+}
+
+template<typename MatrixType>
+void FullPivLU<MatrixType>::computeInPlace()
+{
+  const Index size = m_lu.diagonalSize();
+  const Index rows = m_lu.rows();
+  const Index cols = m_lu.cols();
 
   // will store the transpositions, before we accumulate them at the end.
   // can't accumulate on-the-fly because that will be done in reverse order for the rows.
-  m_rowsTranspositions.resize(matrix.rows());
-  m_colsTranspositions.resize(matrix.cols());
+  m_rowsTranspositions.resize(m_lu.rows());
+  m_colsTranspositions.resize(m_lu.cols());
   Index number_of_transpositions = 0; // number of NONTRIVIAL transpositions, i.e. m_rowsTranspositions[i]!=i
 
   m_nonzero_pivots = size; // the generic case is that in which all pivots are nonzero (invertible case)
@@ -442,14 +485,16 @@ FullPivLU<MatrixType>& FullPivLU<MatrixType>::compute(const MatrixType& matrix)
 
     // biggest coefficient in the remaining bottom-right corner (starting at row k, col k)
     Index row_of_biggest_in_corner, col_of_biggest_in_corner;
-    RealScalar biggest_in_corner;
+    typedef internal::scalar_score_coeff_op<Scalar> Scoring;
+    typedef typename Scoring::result_type Score;
+    Score biggest_in_corner;
     biggest_in_corner = m_lu.bottomRightCorner(rows-k, cols-k)
-                        .cwiseAbs()
+                        .unaryExpr(Scoring())
                         .maxCoeff(&row_of_biggest_in_corner, &col_of_biggest_in_corner);
     row_of_biggest_in_corner += k; // correct the values! since they were computed in the corner,
     col_of_biggest_in_corner += k; // need to add k to them.
 
-    if(biggest_in_corner==RealScalar(0))
+    if(biggest_in_corner==Score(0))
     {
       // before exiting, make sure to initialize the still uninitialized transpositions
       // in a sane state without destroying what we already have.
@@ -462,7 +507,8 @@ FullPivLU<MatrixType>& FullPivLU<MatrixType>::compute(const MatrixType& matrix)
       break;
     }
 
-    if(biggest_in_corner > m_maxpivot) m_maxpivot = biggest_in_corner;
+    RealScalar abs_pivot = internal::abs_knowing_score<Scalar>()(m_lu(row_of_biggest_in_corner, col_of_biggest_in_corner), biggest_in_corner);
+    if(abs_pivot > m_maxpivot) m_maxpivot = abs_pivot;
 
     // Now that we've found the pivot, we need to apply the row/col swaps to
     // bring it to the location (k,k).
@@ -499,7 +545,6 @@ FullPivLU<MatrixType>& FullPivLU<MatrixType>::compute(const MatrixType& matrix)
     m_q.applyTranspositionOnTheRight(k, m_colsTranspositions.coeff(k));
 
   m_det_pq = (number_of_transpositions%2) ? -1 : 1;
-  return *this;
 }
 
 template<typename MatrixType>
@@ -511,8 +556,8 @@ typename internal::traits<MatrixType>::Scalar FullPivLU<MatrixType>::determinant
 }
 
 /** \returns the matrix represented by the decomposition,
- * i.e., it returns the product: P^{-1} L U Q^{-1}.
- * This function is provided for debug purpose. */
+ * i.e., it returns the product: \f$ P^{-1} L U Q^{-1} \f$.
+ * This function is provided for debug purposes. */
 template<typename MatrixType>
 MatrixType FullPivLU<MatrixType>::reconstructedMatrix() const
 {
@@ -662,64 +707,136 @@ struct image_retval<FullPivLU<_MatrixType> >
 
 /***** Implementation of solve() *****************************************************/
 
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<FullPivLU<_MatrixType>, Rhs>
-  : solve_retval_base<FullPivLU<_MatrixType>, Rhs>
+} // end namespace internal
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename _MatrixType>
+template<typename RhsType, typename DstType>
+void FullPivLU<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
-  EIGEN_MAKE_SOLVE_HELPERS(FullPivLU<_MatrixType>,Rhs)
+  /* The decomposition PAQ = LU can be rewritten as A = P^{-1} L U Q^{-1}.
+  * So we proceed as follows:
+  * Step 1: compute c = P * rhs.
+  * Step 2: replace c by the solution x to Lx = c. Exists because L is invertible.
+  * Step 3: replace c by the solution x to Ux = c. May or may not exist.
+  * Step 4: result = Q * c;
+  */
 
-  template<typename Dest> void evalTo(Dest& dst) const
+  const Index rows = this->rows(),
+              cols = this->cols(),
+              nonzero_pivots = this->rank();
+  eigen_assert(rhs.rows() == rows);
+  const Index smalldim = (std::min)(rows, cols);
+
+  if(nonzero_pivots == 0)
   {
-    /* The decomposition PAQ = LU can be rewritten as A = P^{-1} L U Q^{-1}.
-     * So we proceed as follows:
-     * Step 1: compute c = P * rhs.
-     * Step 2: replace c by the solution x to Lx = c. Exists because L is invertible.
-     * Step 3: replace c by the solution x to Ux = c. May or may not exist.
-     * Step 4: result = Q * c;
-     */
-
-    const Index rows = dec().rows(), cols = dec().cols(),
-              nonzero_pivots = dec().nonzeroPivots();
-    eigen_assert(rhs().rows() == rows);
-    const Index smalldim = (std::min)(rows, cols);
-
-    if(nonzero_pivots == 0)
-    {
-      dst.setZero();
-      return;
-    }
+    dst.setZero();
+    return;
+  }
+
+  typename RhsType::PlainObject c(rhs.rows(), rhs.cols());
+
+  // Step 1
+  c = permutationP() * rhs;
 
-    typename Rhs::PlainObject c(rhs().rows(), rhs().cols());
+  // Step 2
+  m_lu.topLeftCorner(smalldim,smalldim)
+      .template triangularView<UnitLower>()
+      .solveInPlace(c.topRows(smalldim));
+  if(rows>cols)
+    c.bottomRows(rows-cols) -= m_lu.bottomRows(rows-cols) * c.topRows(cols);
 
-    // Step 1
-    c = dec().permutationP() * rhs();
+  // Step 3
+  m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots)
+      .template triangularView<Upper>()
+      .solveInPlace(c.topRows(nonzero_pivots));
+
+  // Step 4
+  for(Index i = 0; i < nonzero_pivots; ++i)
+    dst.row(permutationQ().indices().coeff(i)) = c.row(i);
+  for(Index i = nonzero_pivots; i < m_lu.cols(); ++i)
+    dst.row(permutationQ().indices().coeff(i)).setZero();
+}
 
+template<typename _MatrixType>
+template<bool Conjugate, typename RhsType, typename DstType>
+void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
+{
+  /* The decomposition PAQ = LU can be rewritten as A = P^{-1} L U Q^{-1},
+   * and since permutations are real and unitary, we can write this
+   * as   A^T = Q U^T L^T P,
+   * So we proceed as follows:
+   * Step 1: compute c = Q^T rhs.
+   * Step 2: replace c by the solution x to U^T x = c. May or may not exist.
+   * Step 3: replace c by the solution x to L^T x = c.
+   * Step 4: result = P^T c.
+   * If Conjugate is true, replace "^T" by "^*" above.
+   */
+
+  const Index rows = this->rows(), cols = this->cols(),
+    nonzero_pivots = this->rank();
+   eigen_assert(rhs.rows() == cols);
+  const Index smalldim = (std::min)(rows, cols);
+
+  if(nonzero_pivots == 0)
+  {
+    dst.setZero();
+    return;
+  }
+
+  typename RhsType::PlainObject c(rhs.rows(), rhs.cols());
+
+  // Step 1
+  c = permutationQ().inverse() * rhs;
+
+  if (Conjugate) {
     // Step 2
-    dec().matrixLU()
-        .topLeftCorner(smalldim,smalldim)
+    m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots)
+        .template triangularView<Upper>()
+        .adjoint()
+        .solveInPlace(c.topRows(nonzero_pivots));
+    // Step 3
+    m_lu.topLeftCorner(smalldim, smalldim)
         .template triangularView<UnitLower>()
+        .adjoint()
         .solveInPlace(c.topRows(smalldim));
-    if(rows>cols)
-    {
-      c.bottomRows(rows-cols)
-        -= dec().matrixLU().bottomRows(rows-cols)
-         * c.topRows(cols);
-    }
-
-    // Step 3
-    dec().matrixLU()
-        .topLeftCorner(nonzero_pivots, nonzero_pivots)
+  } else {
+    // Step 2
+    m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots)
         .template triangularView<Upper>()
+        .transpose()
         .solveInPlace(c.topRows(nonzero_pivots));
+    // Step 3
+    m_lu.topLeftCorner(smalldim, smalldim)
+        .template triangularView<UnitLower>()
+        .transpose()
+        .solveInPlace(c.topRows(smalldim));
+  }
+
+  // Step 4
+  PermutationPType invp = permutationP().inverse().eval();
+  for(Index i = 0; i < smalldim; ++i)
+    dst.row(invp.indices().coeff(i)) = c.row(i);
+  for(Index i = smalldim; i < rows; ++i)
+    dst.row(invp.indices().coeff(i)).setZero();
+}
+
+#endif
+
+namespace internal {
 
-    // Step 4
-    for(Index i = 0; i < nonzero_pivots; ++i)
-      dst.row(dec().permutationQ().indices().coeff(i)) = c.row(i);
-    for(Index i = nonzero_pivots; i < dec().matrixLU().cols(); ++i)
-      dst.row(dec().permutationQ().indices().coeff(i)).setZero();
+
+/***** Implementation of inverse() *****************************************************/
+template<typename DstXprType, typename MatrixType, typename Scalar>
+struct Assignment<DstXprType, Inverse<FullPivLU<MatrixType> >, internal::assign_op<Scalar>, Dense2Dense, Scalar>
+{
+  typedef FullPivLU<MatrixType> LuType;
+  typedef Inverse<LuType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  {
+    dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.cols()));
   }
 };
-
 } // end namespace internal
 
 /******* MatrixBase methods *****************************************************************/
@@ -730,12 +847,14 @@ struct solve_retval<FullPivLU<_MatrixType>, Rhs>
   *
   * \sa class FullPivLU
   */
+#ifndef __CUDACC__
 template<typename Derived>
 inline const FullPivLU<typename MatrixBase<Derived>::PlainObject>
 MatrixBase<Derived>::fullPivLu() const
 {
   return FullPivLU<PlainObject>(eval());
 }
+#endif
 
 } // end namespace Eigen
 
diff --git a/nuparu/include/Eigen/src/LU/Inverse.h b/nuparu/include/Eigen/src/LU/InverseImpl.h
similarity index 87%
rename from nuparu/include/Eigen/src/LU/Inverse.h
rename to nuparu/include/Eigen/src/LU/InverseImpl.h
index 3cf88719..e202a55c 100644
--- a/nuparu/include/Eigen/src/LU/Inverse.h
+++ b/nuparu/include/Eigen/src/LU/InverseImpl.h
@@ -2,13 +2,14 @@
 // for linear algebra.
 //
 // Copyright (C) 2008-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_INVERSE_H
-#define EIGEN_INVERSE_H
+#ifndef EIGEN_INVERSE_IMPL_H
+#define EIGEN_INVERSE_IMPL_H
 
 namespace Eigen { 
 
@@ -21,6 +22,7 @@ namespace internal {
 template<typename MatrixType, typename ResultType, int Size = MatrixType::RowsAtCompileTime>
 struct compute_inverse
 {
+  EIGEN_DEVICE_FUNC
   static inline void run(const MatrixType& matrix, ResultType& result)
   {
     result = matrix.partialPivLu().inverse();
@@ -37,16 +39,19 @@ struct compute_inverse_and_det_with_check { /* nothing! general case not support
 template<typename MatrixType, typename ResultType>
 struct compute_inverse<MatrixType, ResultType, 1>
 {
+  EIGEN_DEVICE_FUNC
   static inline void run(const MatrixType& matrix, ResultType& result)
   {
     typedef typename MatrixType::Scalar Scalar;
-    result.coeffRef(0,0) = Scalar(1) / matrix.coeff(0,0);
+    internal::evaluator<MatrixType> matrixEval(matrix);
+    result.coeffRef(0,0) = Scalar(1) / matrixEval.coeff(0,0);
   }
 };
 
 template<typename MatrixType, typename ResultType>
 struct compute_inverse_and_det_with_check<MatrixType, ResultType, 1>
 {
+  EIGEN_DEVICE_FUNC
   static inline void run(
     const MatrixType& matrix,
     const typename MatrixType::RealScalar& absDeterminantThreshold,
@@ -67,19 +72,21 @@ struct compute_inverse_and_det_with_check<MatrixType, ResultType, 1>
 ****************************/
 
 template<typename MatrixType, typename ResultType>
+EIGEN_DEVICE_FUNC 
 inline void compute_inverse_size2_helper(
     const MatrixType& matrix, const typename ResultType::Scalar& invdet,
     ResultType& result)
 {
-  result.coeffRef(0,0) = matrix.coeff(1,1) * invdet;
+  result.coeffRef(0,0) =  matrix.coeff(1,1) * invdet;
   result.coeffRef(1,0) = -matrix.coeff(1,0) * invdet;
   result.coeffRef(0,1) = -matrix.coeff(0,1) * invdet;
-  result.coeffRef(1,1) = matrix.coeff(0,0) * invdet;
+  result.coeffRef(1,1) =  matrix.coeff(0,0) * invdet;
 }
 
 template<typename MatrixType, typename ResultType>
 struct compute_inverse<MatrixType, ResultType, 2>
 {
+  EIGEN_DEVICE_FUNC
   static inline void run(const MatrixType& matrix, ResultType& result)
   {
     typedef typename ResultType::Scalar Scalar;
@@ -91,6 +98,7 @@ struct compute_inverse<MatrixType, ResultType, 2>
 template<typename MatrixType, typename ResultType>
 struct compute_inverse_and_det_with_check<MatrixType, ResultType, 2>
 {
+  EIGEN_DEVICE_FUNC
   static inline void run(
     const MatrixType& matrix,
     const typename MatrixType::RealScalar& absDeterminantThreshold,
@@ -114,6 +122,7 @@ struct compute_inverse_and_det_with_check<MatrixType, ResultType, 2>
 ****************************/
 
 template<typename MatrixType, int i, int j>
+EIGEN_DEVICE_FUNC 
 inline typename MatrixType::Scalar cofactor_3x3(const MatrixType& m)
 {
   enum {
@@ -127,6 +136,7 @@ inline typename MatrixType::Scalar cofactor_3x3(const MatrixType& m)
 }
 
 template<typename MatrixType, typename ResultType>
+EIGEN_DEVICE_FUNC
 inline void compute_inverse_size3_helper(
     const MatrixType& matrix,
     const typename ResultType::Scalar& invdet,
@@ -145,6 +155,7 @@ inline void compute_inverse_size3_helper(
 template<typename MatrixType, typename ResultType>
 struct compute_inverse<MatrixType, ResultType, 3>
 {
+  EIGEN_DEVICE_FUNC
   static inline void run(const MatrixType& matrix, ResultType& result)
   {
     typedef typename ResultType::Scalar Scalar;
@@ -161,6 +172,7 @@ struct compute_inverse<MatrixType, ResultType, 3>
 template<typename MatrixType, typename ResultType>
 struct compute_inverse_and_det_with_check<MatrixType, ResultType, 3>
 {
+  EIGEN_DEVICE_FUNC
   static inline void run(
     const MatrixType& matrix,
     const typename MatrixType::RealScalar& absDeterminantThreshold,
@@ -188,6 +200,7 @@ struct compute_inverse_and_det_with_check<MatrixType, ResultType, 3>
 ****************************/
 
 template<typename Derived>
+EIGEN_DEVICE_FUNC 
 inline const typename Derived::Scalar general_det3_helper
 (const MatrixBase<Derived>& matrix, int i1, int i2, int i3, int j1, int j2, int j3)
 {
@@ -196,6 +209,7 @@ inline const typename Derived::Scalar general_det3_helper
 }
 
 template<typename MatrixType, int i, int j>
+EIGEN_DEVICE_FUNC 
 inline typename MatrixType::Scalar cofactor_4x4(const MatrixType& matrix)
 {
   enum {
@@ -214,6 +228,7 @@ inline typename MatrixType::Scalar cofactor_4x4(const MatrixType& matrix)
 template<int Arch, typename Scalar, typename MatrixType, typename ResultType>
 struct compute_inverse_size4
 {
+  EIGEN_DEVICE_FUNC
   static void run(const MatrixType& matrix, ResultType& result)
   {
     result.coeffRef(0,0) =  cofactor_4x4<MatrixType,0,0>(matrix);
@@ -246,6 +261,7 @@ struct compute_inverse<MatrixType, ResultType, 4>
 template<typename MatrixType, typename ResultType>
 struct compute_inverse_and_det_with_check<MatrixType, ResultType, 4>
 {
+  EIGEN_DEVICE_FUNC
   static inline void run(
     const MatrixType& matrix,
     const typename MatrixType::RealScalar& absDeterminantThreshold,
@@ -265,38 +281,33 @@ struct compute_inverse_and_det_with_check<MatrixType, ResultType, 4>
 *** MatrixBase methods ***
 *************************/
 
-template<typename MatrixType>
-struct traits<inverse_impl<MatrixType> >
-{
-  typedef typename MatrixType::PlainObject ReturnType;
-};
-
-template<typename MatrixType>
-struct inverse_impl : public ReturnByValue<inverse_impl<MatrixType> >
-{
-  typedef typename MatrixType::Index Index;
-  typedef typename internal::eval<MatrixType>::type MatrixTypeNested;
-  typedef typename remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
-  MatrixTypeNested m_matrix;
-
-  inverse_impl(const MatrixType& matrix)
-    : m_matrix(matrix)
-  {}
+} // end namespace internal
 
-  inline Index rows() const { return m_matrix.rows(); }
-  inline Index cols() const { return m_matrix.cols(); }
+namespace internal {
 
-  template<typename Dest> inline void evalTo(Dest& dst) const
+// Specialization for "dense = dense_xpr.inverse()"
+template<typename DstXprType, typename XprType, typename Scalar>
+struct Assignment<DstXprType, Inverse<XprType>, internal::assign_op<Scalar>, Dense2Dense, Scalar>
+{
+  typedef Inverse<XprType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
   {
-    const int Size = EIGEN_PLAIN_ENUM_MIN(MatrixType::ColsAtCompileTime,Dest::ColsAtCompileTime);
+    // FIXME shall we resize dst here?
+    const int Size = EIGEN_PLAIN_ENUM_MIN(XprType::ColsAtCompileTime,DstXprType::ColsAtCompileTime);
     EIGEN_ONLY_USED_FOR_DEBUG(Size);
-    eigen_assert(( (Size<=1) || (Size>4) || (extract_data(m_matrix)!=extract_data(dst)))
+    eigen_assert(( (Size<=1) || (Size>4) || (extract_data(src.nestedExpression())!=extract_data(dst)))
               && "Aliasing problem detected in inverse(), you need to do inverse().eval() here.");
 
-    compute_inverse<MatrixTypeNestedCleaned, Dest>::run(m_matrix, dst);
+    typedef typename internal::nested_eval<XprType,XprType::ColsAtCompileTime>::type  ActualXprType;
+    typedef typename internal::remove_all<ActualXprType>::type                        ActualXprTypeCleanded;
+    
+    ActualXprType actual_xpr(src.nestedExpression());
+    
+    compute_inverse<ActualXprTypeCleanded, DstXprType>::run(actual_xpr, dst);
   }
 };
 
+  
 } // end namespace internal
 
 /** \lu_module
@@ -317,11 +328,11 @@ struct inverse_impl : public ReturnByValue<inverse_impl<MatrixType> >
   * \sa computeInverseAndDetWithCheck()
   */
 template<typename Derived>
-inline const internal::inverse_impl<Derived> MatrixBase<Derived>::inverse() const
+inline const Inverse<Derived> MatrixBase<Derived>::inverse() const
 {
   EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsInteger,THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES)
   eigen_assert(rows() == cols());
-  return internal::inverse_impl<Derived>(derived());
+  return Inverse<Derived>(derived());
 }
 
 /** \lu_module
@@ -357,7 +368,7 @@ inline void MatrixBase<Derived>::computeInverseAndDetWithCheck(
   // for larger sizes, evaluating has negligible cost and limits code size.
   typedef typename internal::conditional<
     RowsAtCompileTime == 2,
-    typename internal::remove_all<typename internal::nested<Derived, 2>::type>::type,
+    typename internal::remove_all<typename internal::nested_eval<Derived, 2>::type>::type,
     PlainObject
   >::type MatrixType;
   internal::compute_inverse_and_det_with_check<MatrixType, ResultType>::run
@@ -397,4 +408,4 @@ inline void MatrixBase<Derived>::computeInverseWithCheck(
 
 } // end namespace Eigen
 
-#endif // EIGEN_INVERSE_H
+#endif // EIGEN_INVERSE_IMPL_H
diff --git a/nuparu/include/Eigen/src/LU/PartialPivLU.h b/nuparu/include/Eigen/src/LU/PartialPivLU.h
index 740ee694..50e92060 100644
--- a/nuparu/include/Eigen/src/LU/PartialPivLU.h
+++ b/nuparu/include/Eigen/src/LU/PartialPivLU.h
@@ -11,7 +11,22 @@
 #ifndef EIGEN_PARTIALLU_H
 #define EIGEN_PARTIALLU_H
 
-namespace Eigen { 
+namespace Eigen {
+
+namespace internal {
+template<typename _MatrixType> struct traits<PartialPivLU<_MatrixType> >
+ : traits<_MatrixType>
+{
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef traits<_MatrixType> BaseTraits;
+  enum {
+    Flags = BaseTraits::Flags & RowMajorBit,
+    CoeffReadCost = Dynamic
+  };
+};
+
+} // end namespace internal
 
 /** \ingroup LU_Module
   *
@@ -45,31 +60,29 @@ namespace Eigen {
   * \sa MatrixBase::partialPivLu(), MatrixBase::determinant(), MatrixBase::inverse(), MatrixBase::computeInverse(), class FullPivLU
   */
 template<typename _MatrixType> class PartialPivLU
+  : public SolverBase<PartialPivLU<_MatrixType> >
 {
   public:
 
     typedef _MatrixType MatrixType;
+    typedef SolverBase<PartialPivLU> Base;
+    EIGEN_GENERIC_PUBLIC_INTERFACE(PartialPivLU)
+    // FIXME StorageIndex defined in EIGEN_GENERIC_PUBLIC_INTERFACE should be int
     enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef typename internal::traits<MatrixType>::StorageKind StorageKind;
-    typedef typename MatrixType::Index Index;
     typedef PermutationMatrix<RowsAtCompileTime, MaxRowsAtCompileTime> PermutationType;
     typedef Transpositions<RowsAtCompileTime, MaxRowsAtCompileTime> TranspositionType;
+    typedef typename MatrixType::PlainObject PlainObject;
 
 
     /**
-    * \brief Default Constructor.
-    *
-    * The default constructor is useful in cases in which the user intends to
-    * perform decompositions via PartialPivLU::compute(const MatrixType&).
-    */
+      * \brief Default Constructor.
+      *
+      * The default constructor is useful in cases in which the user intends to
+      * perform decompositions via PartialPivLU::compute(const MatrixType&).
+      */
     PartialPivLU();
 
     /** \brief Default Constructor with memory preallocation
@@ -78,7 +91,7 @@ template<typename _MatrixType> class PartialPivLU
       * according to the specified problem \a size.
       * \sa PartialPivLU()
       */
-    PartialPivLU(Index size);
+    explicit PartialPivLU(Index size);
 
     /** Constructor.
       *
@@ -87,9 +100,11 @@ template<typename _MatrixType> class PartialPivLU
       * \warning The matrix should have full rank (e.g. if it's square, it should be invertible).
       * If you need to deal with non-full rank, use class FullPivLU instead.
       */
-    PartialPivLU(const MatrixType& matrix);
+    template<typename InputType>
+    explicit PartialPivLU(const EigenBase<InputType>& matrix);
 
-    PartialPivLU& compute(const MatrixType& matrix);
+    template<typename InputType>
+    PartialPivLU& compute(const EigenBase<InputType>& matrix);
 
     /** \returns the LU decomposition matrix: the upper-triangular part is U, the
       * unit-lower-triangular part is L (at least for square matrices; in the non-square
@@ -128,12 +143,13 @@ template<typename _MatrixType> class PartialPivLU
       *
       * \sa TriangularView::solve(), inverse(), computeInverse()
       */
+    // FIXME this is a copy-paste of the base-class member to add the isInitialized assertion.
     template<typename Rhs>
-    inline const internal::solve_retval<PartialPivLU, Rhs>
+    inline const Solve<PartialPivLU, Rhs>
     solve(const MatrixBase<Rhs>& b) const
     {
       eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
-      return internal::solve_retval<PartialPivLU, Rhs>(*this, b.derived());
+      return Solve<PartialPivLU, Rhs>(*this, b.derived());
     }
 
     /** \returns the inverse of the matrix of which *this is the LU decomposition.
@@ -143,11 +159,10 @@ template<typename _MatrixType> class PartialPivLU
       *
       * \sa MatrixBase::inverse(), LU::inverse()
       */
-    inline const internal::solve_retval<PartialPivLU,typename MatrixType::IdentityReturnType> inverse() const
+    inline const Inverse<PartialPivLU> inverse() const
     {
       eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
-      return internal::solve_retval<PartialPivLU,typename MatrixType::IdentityReturnType>
-               (*this, MatrixType::Identity(m_lu.rows(), m_lu.cols()));
+      return Inverse<PartialPivLU>(*this);
     }
 
     /** \returns the determinant of the matrix of which
@@ -170,7 +185,64 @@ template<typename _MatrixType> class PartialPivLU
     inline Index rows() const { return m_lu.rows(); }
     inline Index cols() const { return m_lu.cols(); }
 
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
+    void _solve_impl(const RhsType &rhs, DstType &dst) const {
+     /* The decomposition PA = LU can be rewritten as A = P^{-1} L U.
+      * So we proceed as follows:
+      * Step 1: compute c = Pb.
+      * Step 2: replace c by the solution x to Lx = c.
+      * Step 3: replace c by the solution x to Ux = c.
+      */
+
+      eigen_assert(rhs.rows() == m_lu.rows());
+
+      // Step 1
+      dst = permutationP() * rhs;
+
+      // Step 2
+      m_lu.template triangularView<UnitLower>().solveInPlace(dst);
+
+      // Step 3
+      m_lu.template triangularView<Upper>().solveInPlace(dst);
+    }
+
+    template<bool Conjugate, typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
+    void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const {
+     /* The decomposition PA = LU can be rewritten as A^T = U^T L^T P.
+      * So we proceed as follows ("^T" reads as "^*" when Conjugate is true):
+      * Step 1: compute c as the solution to U^T c = b.
+      * Step 2: replace c by the solution x to L^T x = c.
+      * Step 3: replace c by P^{-1} c (equal to P^T c, as P is a permutation).
+      */
+
+      eigen_assert(rhs.rows() == m_lu.cols());
+
+      if (Conjugate) {
+        // Step 1
+        dst = m_lu.template triangularView<Upper>().adjoint().solve(rhs);
+        // Step 2
+        m_lu.template triangularView<UnitLower>().adjoint().solveInPlace(dst);
+      } else {
+        // Step 1
+        dst = m_lu.template triangularView<Upper>().transpose().solve(rhs);
+        // Step 2
+        m_lu.template triangularView<UnitLower>().transpose().solveInPlace(dst);
+      }
+      // Step 3
+      dst = permutationP().transpose() * dst;
+    }
+    #endif
+
   protected:
+
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    }
+
     MatrixType m_lu;
     PermutationType m_p;
     TranspositionType m_rowsTranspositions;
@@ -199,14 +271,15 @@ PartialPivLU<MatrixType>::PartialPivLU(Index size)
 }
 
 template<typename MatrixType>
-PartialPivLU<MatrixType>::PartialPivLU(const MatrixType& matrix)
+template<typename InputType>
+PartialPivLU<MatrixType>::PartialPivLU(const EigenBase<InputType>& matrix)
   : m_lu(matrix.rows(), matrix.rows()),
     m_p(matrix.rows()),
     m_rowsTranspositions(matrix.rows()),
     m_det_p(0),
     m_isInitialized(false)
 {
-  compute(matrix);
+  compute(matrix.derived());
 }
 
 namespace internal {
@@ -224,7 +297,6 @@ struct partial_lu_impl
   typedef Block<MapLU, Dynamic, Dynamic> MatrixType;
   typedef Block<MatrixType,Dynamic,Dynamic> BlockType;
   typedef typename MatrixType::RealScalar RealScalar;
-  typedef typename MatrixType::Index Index;
 
   /** \internal performs the LU decomposition in-place of the matrix \a lu
     * using an unblocked algorithm.
@@ -238,6 +310,8 @@ struct partial_lu_impl
     */
   static Index unblocked_lu(MatrixType& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions)
   {
+    typedef scalar_score_coeff_op<Scalar> Scoring;
+    typedef typename Scoring::result_type Score;
     const Index rows = lu.rows();
     const Index cols = lu.cols();
     const Index size = (std::min)(rows,cols);
@@ -247,15 +321,15 @@ struct partial_lu_impl
     {
       Index rrows = rows-k-1;
       Index rcols = cols-k-1;
-        
+
       Index row_of_biggest_in_col;
-      RealScalar biggest_in_corner
-        = lu.col(k).tail(rows-k).cwiseAbs().maxCoeff(&row_of_biggest_in_col);
+      Score biggest_in_corner
+        = lu.col(k).tail(rows-k).unaryExpr(Scoring()).maxCoeff(&row_of_biggest_in_col);
       row_of_biggest_in_col += k;
 
       row_transpositions[k] = PivIndex(row_of_biggest_in_col);
 
-      if(biggest_in_corner != RealScalar(0))
+      if(biggest_in_corner != Score(0))
       {
         if(k != row_of_biggest_in_col)
         {
@@ -371,32 +445,35 @@ struct partial_lu_impl
 /** \internal performs the LU decomposition with partial pivoting in-place.
   */
 template<typename MatrixType, typename TranspositionType>
-void partial_lu_inplace(MatrixType& lu, TranspositionType& row_transpositions, typename TranspositionType::Index& nb_transpositions)
+void partial_lu_inplace(MatrixType& lu, TranspositionType& row_transpositions, typename TranspositionType::StorageIndex& nb_transpositions)
 {
   eigen_assert(lu.cols() == row_transpositions.size());
   eigen_assert((&row_transpositions.coeffRef(1)-&row_transpositions.coeffRef(0)) == 1);
 
   partial_lu_impl
-    <typename MatrixType::Scalar, MatrixType::Flags&RowMajorBit?RowMajor:ColMajor, typename TranspositionType::Index>
+    <typename MatrixType::Scalar, MatrixType::Flags&RowMajorBit?RowMajor:ColMajor, typename TranspositionType::StorageIndex>
     ::blocked_lu(lu.rows(), lu.cols(), &lu.coeffRef(0,0), lu.outerStride(), &row_transpositions.coeffRef(0), nb_transpositions);
 }
 
 } // end namespace internal
 
 template<typename MatrixType>
-PartialPivLU<MatrixType>& PartialPivLU<MatrixType>::compute(const MatrixType& matrix)
+template<typename InputType>
+PartialPivLU<MatrixType>& PartialPivLU<MatrixType>::compute(const EigenBase<InputType>& matrix)
 {
+  check_template_parameters();
+
   // the row permutation is stored as int indices, so just to be sure:
   eigen_assert(matrix.rows()<NumTraits<int>::highest());
-  
-  m_lu = matrix;
+
+  m_lu = matrix.derived();
 
   eigen_assert(matrix.rows() == matrix.cols() && "PartialPivLU is only for square (and moreover invertible) matrices");
   const Index size = matrix.rows();
 
   m_rowsTranspositions.resize(size);
 
-  typename TranspositionType::Index nb_transpositions;
+  typename TranspositionType::StorageIndex nb_transpositions;
   internal::partial_lu_inplace(m_lu, m_rowsTranspositions, nb_transpositions);
   m_det_p = (nb_transpositions%2) ? -1 : 1;
 
@@ -430,38 +507,21 @@ MatrixType PartialPivLU<MatrixType>::reconstructedMatrix() const
   return res;
 }
 
-/***** Implementation of solve() *****************************************************/
+/***** Implementation details *****************************************************/
 
 namespace internal {
 
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<PartialPivLU<_MatrixType>, Rhs>
-  : solve_retval_base<PartialPivLU<_MatrixType>, Rhs>
+/***** Implementation of inverse() *****************************************************/
+template<typename DstXprType, typename MatrixType, typename Scalar>
+struct Assignment<DstXprType, Inverse<PartialPivLU<MatrixType> >, internal::assign_op<Scalar>, Dense2Dense, Scalar>
 {
-  EIGEN_MAKE_SOLVE_HELPERS(PartialPivLU<_MatrixType>,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
+  typedef PartialPivLU<MatrixType> LuType;
+  typedef Inverse<LuType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
   {
-    /* The decomposition PA = LU can be rewritten as A = P^{-1} L U.
-    * So we proceed as follows:
-    * Step 1: compute c = Pb.
-    * Step 2: replace c by the solution x to Lx = c.
-    * Step 3: replace c by the solution x to Ux = c.
-    */
-
-    eigen_assert(rhs().rows() == dec().matrixLU().rows());
-
-    // Step 1
-    dst = dec().permutationP() * rhs();
-
-    // Step 2
-    dec().matrixLU().template triangularView<UnitLower>().solveInPlace(dst);
-
-    // Step 3
-    dec().matrixLU().template triangularView<Upper>().solveInPlace(dst);
+    dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.cols()));
   }
 };
-
 } // end namespace internal
 
 /******** MatrixBase methods *******/
@@ -472,14 +532,15 @@ struct solve_retval<PartialPivLU<_MatrixType>, Rhs>
   *
   * \sa class PartialPivLU
   */
+#ifndef __CUDACC__
 template<typename Derived>
 inline const PartialPivLU<typename MatrixBase<Derived>::PlainObject>
 MatrixBase<Derived>::partialPivLu() const
 {
   return PartialPivLU<PlainObject>(eval());
 }
+#endif
 
-#if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
 /** \lu_module
   *
   * Synonym of partialPivLu().
@@ -488,6 +549,7 @@ MatrixBase<Derived>::partialPivLu() const
   *
   * \sa class PartialPivLU
   */
+#ifndef __CUDACC__
 template<typename Derived>
 inline const PartialPivLU<typename MatrixBase<Derived>::PlainObject>
 MatrixBase<Derived>::lu() const
diff --git a/nuparu/include/Eigen/src/LU/arch/Inverse_SSE.h b/nuparu/include/Eigen/src/LU/arch/Inverse_SSE.h
index 60b7a237..e1470c66 100644
--- a/nuparu/include/Eigen/src/LU/arch/Inverse_SSE.h
+++ b/nuparu/include/Eigen/src/LU/arch/Inverse_SSE.h
@@ -35,13 +35,15 @@ template<typename MatrixType, typename ResultType>
 struct compute_inverse_size4<Architecture::SSE, float, MatrixType, ResultType>
 {
   enum {
-    MatrixAlignment     = bool(MatrixType::Flags&AlignedBit),
-    ResultAlignment     = bool(ResultType::Flags&AlignedBit),
+    MatrixAlignment     = traits<MatrixType>::Alignment,
+    ResultAlignment     = traits<ResultType>::Alignment,
     StorageOrdersMatch  = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit)
   };
+  typedef typename conditional<(MatrixType::Flags&LinearAccessBit),MatrixType const &,typename MatrixType::PlainObject>::type ActualMatrixType;
   
-  static void run(const MatrixType& matrix, ResultType& result)
+  static void run(const MatrixType& mat, ResultType& result)
   {
+    ActualMatrixType matrix(mat);
     EIGEN_ALIGN16 const unsigned int _Sign_PNNP[4] = { 0x00000000, 0x80000000, 0x80000000, 0x00000000 };
 
     // Load the full matrix into registers
@@ -163,18 +165,21 @@ template<typename MatrixType, typename ResultType>
 struct compute_inverse_size4<Architecture::SSE, double, MatrixType, ResultType>
 {
   enum {
-    MatrixAlignment = bool(MatrixType::Flags&AlignedBit),
-    ResultAlignment = bool(ResultType::Flags&AlignedBit),
+    MatrixAlignment     = traits<MatrixType>::Alignment,
+    ResultAlignment     = traits<ResultType>::Alignment,
     StorageOrdersMatch  = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit)
   };
-  static void run(const MatrixType& matrix, ResultType& result)
+  typedef typename conditional<(MatrixType::Flags&LinearAccessBit),MatrixType const &,typename MatrixType::PlainObject>::type ActualMatrixType;
+  
+  static void run(const MatrixType& mat, ResultType& result)
   {
+    ActualMatrixType matrix(mat);
     const __m128d _Sign_NP = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
     const __m128d _Sign_PN = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
 
     // The inverse is calculated using "Divide and Conquer" technique. The
     // original matrix is divide into four 2x2 sub-matrices. Since each
-    // register of the matrix holds two element, the smaller matrices are
+    // register of the matrix holds two elements, the smaller matrices are
     // consisted of two registers. Hence we get a better locality of the
     // calculations.
 
diff --git a/nuparu/include/Eigen/src/MetisSupport/MetisSupport.h b/nuparu/include/Eigen/src/MetisSupport/MetisSupport.h
index f2bbef20..4c15304a 100644
--- a/nuparu/include/Eigen/src/MetisSupport/MetisSupport.h
+++ b/nuparu/include/Eigen/src/MetisSupport/MetisSupport.h
@@ -18,12 +18,12 @@ namespace Eigen {
  * Row (column) i of A is the matperm(i) row (column) of Ap. 
  * WARNING: As computed by METIS, this corresponds to the vector iperm (instead of perm)
  */
-template <typename Index>
+template <typename StorageIndex>
 class MetisOrdering
 {
 public:
-  typedef PermutationMatrix<Dynamic,Dynamic,Index> PermutationType;
-  typedef Matrix<Index,Dynamic,1> IndexVector; 
+  typedef PermutationMatrix<Dynamic,Dynamic,StorageIndex> PermutationType;
+  typedef Matrix<StorageIndex,Dynamic,1> IndexVector; 
   
   template <typename MatrixType>
   void get_symmetrized_graph(const MatrixType& A)
@@ -36,7 +36,7 @@ class MetisOrdering
     Index TotNz = 0; 
     IndexVector visited(m); 
     visited.setConstant(-1); 
-    for (int j = 0; j < m; j++)
+    for (StorageIndex j = 0; j < m; j++)
     {
       // Compute the union structure of of A(j,:) and At(j,:)
       visited(j) = j; // Do not include the diagonal element
@@ -67,8 +67,8 @@ class MetisOrdering
 
     // Now compute the real adjacency list of each column/row 
     visited.setConstant(-1); 
-    Index CurNz = 0; 
-    for (int j = 0; j < m; j++)
+    StorageIndex CurNz = 0; 
+    for (StorageIndex j = 0; j < m; j++)
     {
       m_indexPtr(j) = CurNz; 
       
@@ -76,7 +76,7 @@ class MetisOrdering
       // Add the pattern of row/column j of A to A+At
       for (typename MatrixType::InnerIterator it(A,j); it; ++it)
       {
-        Index idx = it.index(); // Get the row index (for column major) or column index (for row major)
+        StorageIndex idx = it.index(); // Get the row index (for column major) or column index (for row major)
         if (visited(idx) != j ) 
         {
           visited(idx) = j; 
@@ -87,7 +87,7 @@ class MetisOrdering
       //Add the pattern of row/column j of At to A+At
       for (typename MatrixType::InnerIterator it(At, j); it; ++it)
       {
-        Index idx = it.index(); 
+        StorageIndex idx = it.index(); 
         if(visited(idx) != j)
         {
           visited(idx) = j; 
@@ -102,7 +102,7 @@ class MetisOrdering
   template <typename MatrixType>
   void operator() (const MatrixType& A, PermutationType& matperm)
   {
-     Index m = A.cols();
+     StorageIndex m = internal::convert_index<StorageIndex>(A.cols()); // must be StorageIndex, because it is passed by address to METIS
      IndexVector perm(m),iperm(m); 
     // First, symmetrize the matrix graph. 
      get_symmetrized_graph(A); 
diff --git a/nuparu/include/Eigen/src/OrderingMethods/Amd.h b/nuparu/include/Eigen/src/OrderingMethods/Amd.h
index 41b4fd7e..323255e0 100644
--- a/nuparu/include/Eigen/src/OrderingMethods/Amd.h
+++ b/nuparu/include/Eigen/src/OrderingMethods/Amd.h
@@ -41,10 +41,10 @@ template<typename T0, typename T1> inline bool amd_marked(const T0* w, const T1&
 template<typename T0, typename T1> inline void amd_mark(const T0* w, const T1& j) { return w[j] = amd_flip(w[j]); }
 
 /* clear w */
-template<typename Index>
-static int cs_wclear (Index mark, Index lemax, Index *w, Index n)
+template<typename StorageIndex>
+static StorageIndex cs_wclear (StorageIndex mark, StorageIndex lemax, StorageIndex *w, StorageIndex n)
 {
-  Index k;
+  StorageIndex k;
   if(mark < 2 || (mark + lemax < 0))
   {
     for(k = 0; k < n; k++)
@@ -56,10 +56,10 @@ static int cs_wclear (Index mark, Index lemax, Index *w, Index n)
 }
 
 /* depth-first search and postorder of a tree rooted at node j */
-template<typename Index>
-Index cs_tdfs(Index j, Index k, Index *head, const Index *next, Index *post, Index *stack)
+template<typename StorageIndex>
+StorageIndex cs_tdfs(StorageIndex j, StorageIndex k, StorageIndex *head, const StorageIndex *next, StorageIndex *post, StorageIndex *stack)
 {
-  int i, p, top = 0;
+  StorageIndex i, p, top = 0;
   if(!head || !next || !post || !stack) return (-1);    /* check inputs */
   stack[0] = j;                 /* place j on the stack */
   while (top >= 0)                /* while (stack is not empty) */
@@ -87,39 +87,39 @@ Index cs_tdfs(Index j, Index k, Index *head, const Index *next, Index *post, Ind
   * \returns the permutation P reducing the fill-in of the input matrix \a C
   * The input matrix \a C must be a selfadjoint compressed column major SparseMatrix object. Both the upper and lower parts have to be stored, but the diagonal entries are optional.
   * On exit the values of C are destroyed */
-template<typename Scalar, typename Index>
-void minimum_degree_ordering(SparseMatrix<Scalar,ColMajor,Index>& C, PermutationMatrix<Dynamic,Dynamic,Index>& perm)
+template<typename Scalar, typename StorageIndex>
+void minimum_degree_ordering(SparseMatrix<Scalar,ColMajor,StorageIndex>& C, PermutationMatrix<Dynamic,Dynamic,StorageIndex>& perm)
 {
   using std::sqrt;
   
-  int d, dk, dext, lemax = 0, e, elenk, eln, i, j, k, k1,
-      k2, k3, jlast, ln, dense, nzmax, mindeg = 0, nvi, nvj, nvk, mark, wnvi,
-      ok, nel = 0, p, p1, p2, p3, p4, pj, pk, pk1, pk2, pn, q, t;
-  unsigned int h;
+  StorageIndex d, dk, dext, lemax = 0, e, elenk, eln, i, j, k, k1,
+                k2, k3, jlast, ln, dense, nzmax, mindeg = 0, nvi, nvj, nvk, mark, wnvi,
+                ok, nel = 0, p, p1, p2, p3, p4, pj, pk, pk1, pk2, pn, q, t, h;
   
-  Index n = C.cols();
-  dense = std::max<Index> (16, Index(10 * sqrt(double(n))));   /* find dense threshold */
-  dense = std::min<Index> (n-2, dense);
+  StorageIndex n = StorageIndex(C.cols());
+  dense = std::max<StorageIndex> (16, StorageIndex(10 * sqrt(double(n))));   /* find dense threshold */
+  dense = (std::min)(n-2, dense);
   
-  Index cnz = C.nonZeros();
+  StorageIndex cnz = StorageIndex(C.nonZeros());
   perm.resize(n+1);
   t = cnz + cnz/5 + 2*n;                 /* add elbow room to C */
   C.resizeNonZeros(t);
   
-  Index* W       = new Index[8*(n+1)]; /* get workspace */
-  Index* len     = W;
-  Index* nv      = W +   (n+1);
-  Index* next    = W + 2*(n+1);
-  Index* head    = W + 3*(n+1);
-  Index* elen    = W + 4*(n+1);
-  Index* degree  = W + 5*(n+1);
-  Index* w       = W + 6*(n+1);
-  Index* hhead   = W + 7*(n+1);
-  Index* last    = perm.indices().data();                              /* use P as workspace for last */
+  // get workspace
+  ei_declare_aligned_stack_constructed_variable(StorageIndex,W,8*(n+1),0);
+  StorageIndex* len     = W;
+  StorageIndex* nv      = W +   (n+1);
+  StorageIndex* next    = W + 2*(n+1);
+  StorageIndex* head    = W + 3*(n+1);
+  StorageIndex* elen    = W + 4*(n+1);
+  StorageIndex* degree  = W + 5*(n+1);
+  StorageIndex* w       = W + 6*(n+1);
+  StorageIndex* hhead   = W + 7*(n+1);
+  StorageIndex* last    = perm.indices().data();                              /* use P as workspace for last */
   
   /* --- Initialize quotient graph ---------------------------------------- */
-  Index* Cp = C.outerIndexPtr();
-  Index* Ci = C.innerIndexPtr();
+  StorageIndex* Cp = C.outerIndexPtr();
+  StorageIndex* Ci = C.innerIndexPtr();
   for(k = 0; k < n; k++)
     len[k] = Cp[k+1] - Cp[k];
   len[n] = 0;
@@ -136,23 +136,28 @@ void minimum_degree_ordering(SparseMatrix<Scalar,ColMajor,Index>& C, Permutation
     elen[i]   = 0;                      // Ek of node i is empty
     degree[i] = len[i];                 // degree of node i
   }
-  mark = internal::cs_wclear<Index>(0, 0, w, n);         /* clear w */
-  elen[n] = -2;                         /* n is a dead element */
-  Cp[n] = -1;                           /* n is a root of assembly tree */
-  w[n] = 0;                             /* n is a dead element */
+  mark = internal::cs_wclear<StorageIndex>(0, 0, w, n);         /* clear w */
   
   /* --- Initialize degree lists ------------------------------------------ */
   for(i = 0; i < n; i++)
   {
+    bool has_diag = false;
+    for(p = Cp[i]; p<Cp[i+1]; ++p)
+      if(Ci[p]==i)
+      {
+        has_diag = true;
+        break;
+      }
+   
     d = degree[i];
-    if(d == 0)                         /* node i is empty */
+    if(d == 1 && has_diag)           /* node i holds only its diagonal entry: treat it as empty */
     {
       elen[i] = -2;                 /* element i is dead */
       nel++;
       Cp[i] = -1;                   /* i is a root of assembly tree */
       w[i] = 0;
     }
-    else if(d > dense)                 /* node i is dense */
+    else if(d > dense || !has_diag)  /* node i is dense or has no structural diagonal element */
     {
       nv[i] = 0;                    /* absorb i into element n */
       elen[i] = -1;                 /* node i is dead */
@@ -168,6 +173,10 @@ void minimum_degree_ordering(SparseMatrix<Scalar,ColMajor,Index>& C, Permutation
     }
   }
   
+  elen[n] = -2;                         /* n is a dead element */
+  Cp[n] = -1;                           /* n is a root of assembly tree */
+  w[n] = 0;                             /* n is a dead element */
+  
   while (nel < n)                         /* while (selecting pivots) do */
   {
     /* --- Select node of minimum approximate degree -------------------- */
@@ -251,7 +260,7 @@ void minimum_degree_ordering(SparseMatrix<Scalar,ColMajor,Index>& C, Permutation
     elen[k] = -2;                     /* k is now an element */
     
     /* --- Find set differences ----------------------------------------- */
-    mark = internal::cs_wclear<Index>(mark, lemax, w, n);  /* clear w if necessary */
+    mark = internal::cs_wclear<StorageIndex>(mark, lemax, w, n);  /* clear w if necessary */
     for(pk = pk1; pk < pk2; pk++)    /* scan 1: find |Le\Lk| */
     {
       i = Ci[pk];
@@ -321,7 +330,7 @@ void minimum_degree_ordering(SparseMatrix<Scalar,ColMajor,Index>& C, Permutation
       }
       else
       {
-        degree[i] = std::min<Index> (degree[i], d);   /* update degree(i) */
+        degree[i] = std::min<StorageIndex> (degree[i], d);   /* update degree(i) */
         Ci[pn] = Ci[p3];         /* move first node to end */
         Ci[p3] = Ci[p1];         /* move 1st el. to end of Ei */
         Ci[p1] = k;               /* add k as 1st element in of Ei */
@@ -329,12 +338,12 @@ void minimum_degree_ordering(SparseMatrix<Scalar,ColMajor,Index>& C, Permutation
         h %= n;                    /* finalize hash of i */
         next[i] = hhead[h];      /* place i in hash bucket */
         hhead[h] = i;
-        last[i] = h;              /* save hash of i in last[i] */
+        last[i] = h;      /* save hash of i in last[i] */
       }
     }                                   /* scan2 is done */
     degree[k] = dk;                   /* finalize |Lk| */
-    lemax = std::max<Index>(lemax, dk);
-    mark = internal::cs_wclear<Index>(mark+lemax, lemax, w, n);    /* clear w */
+    lemax = std::max<StorageIndex>(lemax, dk);
+    mark = internal::cs_wclear<StorageIndex>(mark+lemax, lemax, w, n);    /* clear w */
     
     /* --- Supernode detection ------------------------------------------ */
     for(pk = pk1; pk < pk2; pk++)
@@ -382,12 +391,12 @@ void minimum_degree_ordering(SparseMatrix<Scalar,ColMajor,Index>& C, Permutation
       if((nvi = -nv[i]) <= 0) continue;/* skip if i is dead */
       nv[i] = nvi;                      /* restore nv[i] */
       d = degree[i] + dk - nvi;         /* compute external degree(i) */
-      d = std::min<Index> (d, n - nel - nvi);
+      d = std::min<StorageIndex> (d, n - nel - nvi);
       if(head[d] != -1) last[head[d]] = i;
       next[i] = head[d];               /* put i back in degree list */
       last[i] = -1;
       head[d] = i;
-      mindeg = std::min<Index> (mindeg, d);       /* find new minimum degree */
+      mindeg = std::min<StorageIndex> (mindeg, d);       /* find new minimum degree */
       degree[i] = d;
       Ci[p++] = i;                      /* place i in Lk */
     }
@@ -420,12 +429,10 @@ void minimum_degree_ordering(SparseMatrix<Scalar,ColMajor,Index>& C, Permutation
   }
   for(k = 0, i = 0; i <= n; i++)       /* postorder the assembly tree */
   {
-    if(Cp[i] == -1) k = internal::cs_tdfs<Index>(i, k, head, next, perm.indices().data(), w);
+    if(Cp[i] == -1) k = internal::cs_tdfs<StorageIndex>(i, k, head, next, perm.indices().data(), w);
   }
   
   perm.indices().conservativeResize(n);
-
-  delete[] W;
 }
 
 } // namespace internal
diff --git a/nuparu/include/Eigen/src/OrderingMethods/Eigen_Colamd.h b/nuparu/include/Eigen/src/OrderingMethods/Eigen_Colamd.h
index 44548f66..6238676e 100644
--- a/nuparu/include/Eigen/src/OrderingMethods/Eigen_Colamd.h
+++ b/nuparu/include/Eigen/src/OrderingMethods/Eigen_Colamd.h
@@ -135,54 +135,54 @@ namespace internal {
 /* ========================================================================== */
 
 // == Row and Column structures ==
-template <typename Index>
+template <typename IndexType>
 struct colamd_col
 {
-  Index start ;   /* index for A of first row in this column, or DEAD */
+  IndexType start ;   /* index for A of first row in this column, or DEAD */
   /* if column is dead */
-  Index length ;  /* number of rows in this column */
+  IndexType length ;  /* number of rows in this column */
   union
   {
-    Index thickness ; /* number of original columns represented by this */
+    IndexType thickness ; /* number of original columns represented by this */
     /* col, if the column is alive */
-    Index parent ;  /* parent in parent tree super-column structure, if */
+    IndexType parent ;  /* parent in parent tree super-column structure, if */
     /* the column is dead */
   } shared1 ;
   union
   {
-    Index score ; /* the score used to maintain heap, if col is alive */
-    Index order ; /* pivot ordering of this column, if col is dead */
+    IndexType score ; /* the score used to maintain heap, if col is alive */
+    IndexType order ; /* pivot ordering of this column, if col is dead */
   } shared2 ;
   union
   {
-    Index headhash ;  /* head of a hash bucket, if col is at the head of */
+    IndexType headhash ;  /* head of a hash bucket, if col is at the head of */
     /* a degree list */
-    Index hash ;  /* hash value, if col is not in a degree list */
-    Index prev ;  /* previous column in degree list, if col is in a */
+    IndexType hash ;  /* hash value, if col is not in a degree list */
+    IndexType prev ;  /* previous column in degree list, if col is in a */
     /* degree list (but not at the head of a degree list) */
   } shared3 ;
   union
   {
-    Index degree_next ; /* next column, if col is in a degree list */
-    Index hash_next ;   /* next column, if col is in a hash list */
+    IndexType degree_next ; /* next column, if col is in a degree list */
+    IndexType hash_next ;   /* next column, if col is in a hash list */
   } shared4 ;
   
 };
  
-template <typename Index>
+template <typename IndexType>
 struct Colamd_Row
 {
-  Index start ;   /* index for A of first col in this row */
-  Index length ;  /* number of principal columns in this row */
+  IndexType start ;   /* index for A of first col in this row */
+  IndexType length ;  /* number of principal columns in this row */
   union
   {
-    Index degree ;  /* number of principal & non-principal columns in row */
-    Index p ;   /* used as a row pointer in init_rows_cols () */
+    IndexType degree ;  /* number of principal & non-principal columns in row */
+    IndexType p ;   /* used as a row pointer in init_rows_cols () */
   } shared1 ;
   union
   {
-    Index mark ;  /* for computing set differences and marking dead rows*/
-    Index first_column ;/* first column in row (used in garbage collection) */
+    IndexType mark ;  /* for computing set differences and marking dead rows*/
+    IndexType first_column ;/* first column in row (used in garbage collection) */
   } shared2 ;
   
 };
@@ -202,38 +202,38 @@ struct Colamd_Row
   
   This macro is not needed when using symamd.
   
-  Explicit typecast to Index added Sept. 23, 2002, COLAMD version 2.2, to avoid
+  Explicit typecast to IndexType added Sept. 23, 2002, COLAMD version 2.2, to avoid
   gcc -pedantic warning messages.
 */
-template <typename Index>
-inline Index colamd_c(Index n_col) 
-{ return Index( ((n_col) + 1) * sizeof (colamd_col<Index>) / sizeof (Index) ) ; }
+template <typename IndexType>
+inline IndexType colamd_c(IndexType n_col) 
+{ return IndexType( ((n_col) + 1) * sizeof (colamd_col<IndexType>) / sizeof (IndexType) ) ; }
 
-template <typename Index>
-inline Index  colamd_r(Index n_row)
-{ return Index(((n_row) + 1) * sizeof (Colamd_Row<Index>) / sizeof (Index)); }
+template <typename IndexType>
+inline IndexType  colamd_r(IndexType n_row)
+{ return IndexType(((n_row) + 1) * sizeof (Colamd_Row<IndexType>) / sizeof (IndexType)); }
 
 // Prototypes of non-user callable routines
-template <typename Index>
-static Index init_rows_cols (Index n_row, Index n_col, Colamd_Row<Index> Row [], colamd_col<Index> col [], Index A [], Index p [], Index stats[COLAMD_STATS] ); 
+template <typename IndexType>
+static IndexType init_rows_cols (IndexType n_row, IndexType n_col, Colamd_Row<IndexType> Row [], colamd_col<IndexType> col [], IndexType A [], IndexType p [], IndexType stats[COLAMD_STATS] ); 
 
-template <typename Index>
-static void init_scoring (Index n_row, Index n_col, Colamd_Row<Index> Row [], colamd_col<Index> Col [], Index A [], Index head [], double knobs[COLAMD_KNOBS], Index *p_n_row2, Index *p_n_col2, Index *p_max_deg);
+template <typename IndexType>
+static void init_scoring (IndexType n_row, IndexType n_col, Colamd_Row<IndexType> Row [], colamd_col<IndexType> Col [], IndexType A [], IndexType head [], double knobs[COLAMD_KNOBS], IndexType *p_n_row2, IndexType *p_n_col2, IndexType *p_max_deg);
 
-template <typename Index>
-static Index find_ordering (Index n_row, Index n_col, Index Alen, Colamd_Row<Index> Row [], colamd_col<Index> Col [], Index A [], Index head [], Index n_col2, Index max_deg, Index pfree);
+template <typename IndexType>
+static IndexType find_ordering (IndexType n_row, IndexType n_col, IndexType Alen, Colamd_Row<IndexType> Row [], colamd_col<IndexType> Col [], IndexType A [], IndexType head [], IndexType n_col2, IndexType max_deg, IndexType pfree);
 
-template <typename Index>
-static void order_children (Index n_col, colamd_col<Index> Col [], Index p []);
+template <typename IndexType>
+static void order_children (IndexType n_col, colamd_col<IndexType> Col [], IndexType p []);
 
-template <typename Index>
-static void detect_super_cols (colamd_col<Index> Col [], Index A [], Index head [], Index row_start, Index row_length ) ;
+template <typename IndexType>
+static void detect_super_cols (colamd_col<IndexType> Col [], IndexType A [], IndexType head [], IndexType row_start, IndexType row_length ) ;
 
-template <typename Index>
-static Index garbage_collection (Index n_row, Index n_col, Colamd_Row<Index> Row [], colamd_col<Index> Col [], Index A [], Index *pfree) ;
+template <typename IndexType>
+static IndexType garbage_collection (IndexType n_row, IndexType n_col, Colamd_Row<IndexType> Row [], colamd_col<IndexType> Col [], IndexType A [], IndexType *pfree) ;
 
-template <typename Index>
-static inline  Index clear_mark (Index n_row, Colamd_Row<Index> Row [] ) ;
+template <typename IndexType>
+static inline  IndexType clear_mark (IndexType n_row, Colamd_Row<IndexType> Row [] ) ;
 
 /* === No debugging ========================================================= */
 
@@ -260,8 +260,8 @@ static inline  Index clear_mark (Index n_row, Colamd_Row<Index> Row [] ) ;
  * \param n_col number of columns in A
  * \return recommended value of Alen for use by colamd
  */
-template <typename Index>
-inline Index colamd_recommended ( Index nnz, Index n_row, Index n_col)
+template <typename IndexType>
+inline IndexType colamd_recommended ( IndexType nnz, IndexType n_row, IndexType n_col)
 {
   if ((nnz) < 0 || (n_row) < 0 || (n_col) < 0)
     return (-1);
@@ -325,22 +325,22 @@ static inline void colamd_set_defaults(double knobs[COLAMD_KNOBS])
  * \param knobs parameter settings for colamd
  * \param stats colamd output statistics and error codes
  */
-template <typename Index>
-static bool colamd(Index n_row, Index n_col, Index Alen, Index *A, Index *p, double knobs[COLAMD_KNOBS], Index stats[COLAMD_STATS])
+template <typename IndexType>
+static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType *A, IndexType *p, double knobs[COLAMD_KNOBS], IndexType stats[COLAMD_STATS])
 {
   /* === Local variables ================================================== */
   
-  Index i ;     /* loop index */
-  Index nnz ;     /* nonzeros in A */
-  Index Row_size ;    /* size of Row [], in integers */
-  Index Col_size ;    /* size of Col [], in integers */
-  Index need ;      /* minimum required length of A */
-  Colamd_Row<Index> *Row ;   /* pointer into A of Row [0..n_row] array */
-  colamd_col<Index> *Col ;   /* pointer into A of Col [0..n_col] array */
-  Index n_col2 ;    /* number of non-dense, non-empty columns */
-  Index n_row2 ;    /* number of non-dense, non-empty rows */
-  Index ngarbage ;    /* number of garbage collections performed */
-  Index max_deg ;   /* maximum row degree */
+  IndexType i ;     /* loop index */
+  IndexType nnz ;     /* nonzeros in A */
+  IndexType Row_size ;    /* size of Row [], in integers */
+  IndexType Col_size ;    /* size of Col [], in integers */
+  IndexType need ;      /* minimum required length of A */
+  Colamd_Row<IndexType> *Row ;   /* pointer into A of Row [0..n_row] array */
+  colamd_col<IndexType> *Col ;   /* pointer into A of Col [0..n_col] array */
+  IndexType n_col2 ;    /* number of non-dense, non-empty columns */
+  IndexType n_row2 ;    /* number of non-dense, non-empty rows */
+  IndexType ngarbage ;    /* number of garbage collections performed */
+  IndexType max_deg ;   /* maximum row degree */
   double default_knobs [COLAMD_KNOBS] ; /* default knobs array */
   
   
@@ -431,8 +431,8 @@ static bool colamd(Index n_row, Index n_col, Index Alen, Index *A, Index *p, dou
   }
   
   Alen -= Col_size + Row_size ;
-  Col = (colamd_col<Index> *) &A [Alen] ;
-  Row = (Colamd_Row<Index> *) &A [Alen + Col_size] ;
+  Col = (colamd_col<IndexType> *) &A [Alen] ;
+  Row = (Colamd_Row<IndexType> *) &A [Alen + Col_size] ;
 
   /* === Construct the row and column data structures ===================== */
   
@@ -485,29 +485,29 @@ static bool colamd(Index n_row, Index n_col, Index Alen, Index *A, Index *p, dou
   column form of the matrix.  Returns false if the matrix is invalid,
   true otherwise.  Not user-callable.
 */
-template <typename Index>
-static Index init_rows_cols  /* returns true if OK, or false otherwise */
+template <typename IndexType>
+static IndexType init_rows_cols  /* returns true if OK, or false otherwise */
   (
     /* === Parameters ======================================================= */
 
-    Index n_row,      /* number of rows of A */
-    Index n_col,      /* number of columns of A */
-    Colamd_Row<Index> Row [],    /* of size n_row+1 */
-    colamd_col<Index> Col [],    /* of size n_col+1 */
-    Index A [],     /* row indices of A, of size Alen */
-    Index p [],     /* pointers to columns in A, of size n_col+1 */
-    Index stats [COLAMD_STATS]  /* colamd statistics */ 
+    IndexType n_row,      /* number of rows of A */
+    IndexType n_col,      /* number of columns of A */
+    Colamd_Row<IndexType> Row [],    /* of size n_row+1 */
+    colamd_col<IndexType> Col [],    /* of size n_col+1 */
+    IndexType A [],     /* row indices of A, of size Alen */
+    IndexType p [],     /* pointers to columns in A, of size n_col+1 */
+    IndexType stats [COLAMD_STATS]  /* colamd statistics */ 
     )
 {
   /* === Local variables ================================================== */
 
-  Index col ;     /* a column index */
-  Index row ;     /* a row index */
-  Index *cp ;     /* a column pointer */
-  Index *cp_end ;   /* a pointer to the end of a column */
-  Index *rp ;     /* a row pointer */
-  Index *rp_end ;   /* a pointer to the end of a row */
-  Index last_row ;    /* previous row */
+  IndexType col ;     /* a column index */
+  IndexType row ;     /* a row index */
+  IndexType *cp ;     /* a column pointer */
+  IndexType *cp_end ;   /* a pointer to the end of a column */
+  IndexType *rp ;     /* a row pointer */
+  IndexType *rp_end ;   /* a pointer to the end of a row */
+  IndexType last_row ;    /* previous row */
 
   /* === Initialize columns, and check column pointers ==================== */
 
@@ -701,40 +701,40 @@ static Index init_rows_cols  /* returns true if OK, or false otherwise */
   Kills dense or empty columns and rows, calculates an initial score for
   each column, and places all columns in the degree lists.  Not user-callable.
 */
-template <typename Index>
+template <typename IndexType>
 static void init_scoring
   (
     /* === Parameters ======================================================= */
 
-    Index n_row,      /* number of rows of A */
-    Index n_col,      /* number of columns of A */
-    Colamd_Row<Index> Row [],    /* of size n_row+1 */
-    colamd_col<Index> Col [],    /* of size n_col+1 */
-    Index A [],     /* column form and row form of A */
-    Index head [],    /* of size n_col+1 */
+    IndexType n_row,      /* number of rows of A */
+    IndexType n_col,      /* number of columns of A */
+    Colamd_Row<IndexType> Row [],    /* of size n_row+1 */
+    colamd_col<IndexType> Col [],    /* of size n_col+1 */
+    IndexType A [],     /* column form and row form of A */
+    IndexType head [],    /* of size n_col+1 */
     double knobs [COLAMD_KNOBS],/* parameters */
-    Index *p_n_row2,    /* number of non-dense, non-empty rows */
-    Index *p_n_col2,    /* number of non-dense, non-empty columns */
-    Index *p_max_deg    /* maximum row degree */
+    IndexType *p_n_row2,    /* number of non-dense, non-empty rows */
+    IndexType *p_n_col2,    /* number of non-dense, non-empty columns */
+    IndexType *p_max_deg    /* maximum row degree */
     )
 {
   /* === Local variables ================================================== */
 
-  Index c ;     /* a column index */
-  Index r, row ;    /* a row index */
-  Index *cp ;     /* a column pointer */
-  Index deg ;     /* degree of a row or column */
-  Index *cp_end ;   /* a pointer to the end of a column */
-  Index *new_cp ;   /* new column pointer */
-  Index col_length ;    /* length of pruned column */
-  Index score ;     /* current column score */
-  Index n_col2 ;    /* number of non-dense, non-empty columns */
-  Index n_row2 ;    /* number of non-dense, non-empty rows */
-  Index dense_row_count ; /* remove rows with more entries than this */
-  Index dense_col_count ; /* remove cols with more entries than this */
-  Index min_score ;   /* smallest column score */
-  Index max_deg ;   /* maximum row degree */
-  Index next_col ;    /* Used to add to degree list.*/
+  IndexType c ;     /* a column index */
+  IndexType r, row ;    /* a row index */
+  IndexType *cp ;     /* a column pointer */
+  IndexType deg ;     /* degree of a row or column */
+  IndexType *cp_end ;   /* a pointer to the end of a column */
+  IndexType *new_cp ;   /* new column pointer */
+  IndexType col_length ;    /* length of pruned column */
+  IndexType score ;     /* current column score */
+  IndexType n_col2 ;    /* number of non-dense, non-empty columns */
+  IndexType n_row2 ;    /* number of non-dense, non-empty rows */
+  IndexType dense_row_count ; /* remove rows with more entries than this */
+  IndexType dense_col_count ; /* remove cols with more entries than this */
+  IndexType min_score ;   /* smallest column score */
+  IndexType max_deg ;   /* maximum row degree */
+  IndexType next_col ;    /* Used to add to degree list.*/
 
 
   /* === Extract knobs ==================================================== */
@@ -845,7 +845,7 @@ static void init_scoring
       score = COLAMD_MIN (score, n_col) ;
     }
     /* determine pruned column length */
-    col_length = (Index) (new_cp - &A [Col [c].start]) ;
+    col_length = (IndexType) (new_cp - &A [Col [c].start]) ;
     if (col_length == 0)
     {
       /* a newly-made null column (all rows in this col are "dense" */
@@ -938,56 +938,56 @@ static void init_scoring
   (no supercolumns on input).  Uses a minimum approximate column minimum
   degree ordering method.  Not user-callable.
 */
-template <typename Index>
-static Index find_ordering /* return the number of garbage collections */
+template <typename IndexType>
+static IndexType find_ordering /* return the number of garbage collections */
   (
     /* === Parameters ======================================================= */
 
-    Index n_row,      /* number of rows of A */
-    Index n_col,      /* number of columns of A */
-    Index Alen,     /* size of A, 2*nnz + n_col or larger */
-    Colamd_Row<Index> Row [],    /* of size n_row+1 */
-    colamd_col<Index> Col [],    /* of size n_col+1 */
-    Index A [],     /* column form and row form of A */
-    Index head [],    /* of size n_col+1 */
-    Index n_col2,     /* Remaining columns to order */
-    Index max_deg,    /* Maximum row degree */
-    Index pfree     /* index of first free slot (2*nnz on entry) */
+    IndexType n_row,      /* number of rows of A */
+    IndexType n_col,      /* number of columns of A */
+    IndexType Alen,     /* size of A, 2*nnz + n_col or larger */
+    Colamd_Row<IndexType> Row [],    /* of size n_row+1 */
+    colamd_col<IndexType> Col [],    /* of size n_col+1 */
+    IndexType A [],     /* column form and row form of A */
+    IndexType head [],    /* of size n_col+1 */
+    IndexType n_col2,     /* Remaining columns to order */
+    IndexType max_deg,    /* Maximum row degree */
+    IndexType pfree     /* index of first free slot (2*nnz on entry) */
     )
 {
   /* === Local variables ================================================== */
 
-  Index k ;     /* current pivot ordering step */
-  Index pivot_col ;   /* current pivot column */
-  Index *cp ;     /* a column pointer */
-  Index *rp ;     /* a row pointer */
-  Index pivot_row ;   /* current pivot row */
-  Index *new_cp ;   /* modified column pointer */
-  Index *new_rp ;   /* modified row pointer */
-  Index pivot_row_start ; /* pointer to start of pivot row */
-  Index pivot_row_degree ;  /* number of columns in pivot row */
-  Index pivot_row_length ;  /* number of supercolumns in pivot row */
-  Index pivot_col_score ; /* score of pivot column */
-  Index needed_memory ;   /* free space needed for pivot row */
-  Index *cp_end ;   /* pointer to the end of a column */
-  Index *rp_end ;   /* pointer to the end of a row */
-  Index row ;     /* a row index */
-  Index col ;     /* a column index */
-  Index max_score ;   /* maximum possible score */
-  Index cur_score ;   /* score of current column */
+  IndexType k ;     /* current pivot ordering step */
+  IndexType pivot_col ;   /* current pivot column */
+  IndexType *cp ;     /* a column pointer */
+  IndexType *rp ;     /* a row pointer */
+  IndexType pivot_row ;   /* current pivot row */
+  IndexType *new_cp ;   /* modified column pointer */
+  IndexType *new_rp ;   /* modified row pointer */
+  IndexType pivot_row_start ; /* pointer to start of pivot row */
+  IndexType pivot_row_degree ;  /* number of columns in pivot row */
+  IndexType pivot_row_length ;  /* number of supercolumns in pivot row */
+  IndexType pivot_col_score ; /* score of pivot column */
+  IndexType needed_memory ;   /* free space needed for pivot row */
+  IndexType *cp_end ;   /* pointer to the end of a column */
+  IndexType *rp_end ;   /* pointer to the end of a row */
+  IndexType row ;     /* a row index */
+  IndexType col ;     /* a column index */
+  IndexType max_score ;   /* maximum possible score */
+  IndexType cur_score ;   /* score of current column */
   unsigned int hash ;   /* hash value for supernode detection */
-  Index head_column ;   /* head of hash bucket */
-  Index first_col ;   /* first column in hash bucket */
-  Index tag_mark ;    /* marker value for mark array */
-  Index row_mark ;    /* Row [row].shared2.mark */
-  Index set_difference ;  /* set difference size of row with pivot row */
-  Index min_score ;   /* smallest column score */
-  Index col_thickness ;   /* "thickness" (no. of columns in a supercol) */
-  Index max_mark ;    /* maximum value of tag_mark */
-  Index pivot_col_thickness ; /* number of columns represented by pivot col */
-  Index prev_col ;    /* Used by Dlist operations. */
-  Index next_col ;    /* Used by Dlist operations. */
-  Index ngarbage ;    /* number of garbage collections performed */
+  IndexType head_column ;   /* head of hash bucket */
+  IndexType first_col ;   /* first column in hash bucket */
+  IndexType tag_mark ;    /* marker value for mark array */
+  IndexType row_mark ;    /* Row [row].shared2.mark */
+  IndexType set_difference ;  /* set difference size of row with pivot row */
+  IndexType min_score ;   /* smallest column score */
+  IndexType col_thickness ;   /* "thickness" (no. of columns in a supercol) */
+  IndexType max_mark ;    /* maximum value of tag_mark */
+  IndexType pivot_col_thickness ; /* number of columns represented by pivot col */
+  IndexType prev_col ;    /* Used by Dlist operations. */
+  IndexType next_col ;    /* Used by Dlist operations. */
+  IndexType ngarbage ;    /* number of garbage collections performed */
 
 
   /* === Initialization and clear mark ==================================== */
@@ -1277,7 +1277,7 @@ static Index find_ordering /* return the number of garbage collections */
       }
 
       /* recompute the column's length */
-      Col [col].length = (Index) (new_cp - &A [Col [col].start]) ;
+      Col [col].length = (IndexType) (new_cp - &A [Col [col].start]) ;
 
       /* === Further mass elimination ================================= */
 
@@ -1325,7 +1325,7 @@ static Index find_ordering /* return the number of garbage collections */
 	Col [col].shared4.hash_next = first_col ;
 
 	/* save hash function in Col [col].shared3.hash */
-	Col [col].shared3.hash = (Index) hash ;
+	Col [col].shared3.hash = (IndexType) hash ;
 	COLAMD_ASSERT (COL_IS_ALIVE (col)) ;
       }
     }
@@ -1420,7 +1420,7 @@ static Index find_ordering /* return the number of garbage collections */
       /* update pivot row length to reflect any cols that were killed */
       /* during super-col detection and mass elimination */
       Row [pivot_row].start  = pivot_row_start ;
-      Row [pivot_row].length = (Index) (new_rp - &A[pivot_row_start]) ;
+      Row [pivot_row].length = (IndexType) (new_rp - &A[pivot_row_start]) ;
       Row [pivot_row].shared1.degree = pivot_row_degree ;
       Row [pivot_row].shared2.mark = 0 ;
       /* pivot row is no longer dead */
@@ -1449,22 +1449,22 @@ static Index find_ordering /* return the number of garbage collections */
   taken by this routine is O (n_col), that is, linear in the number of
   columns.  Not user-callable.
 */
-template <typename Index>
+template <typename IndexType>
 static inline  void order_children
 (
   /* === Parameters ======================================================= */
 
-  Index n_col,      /* number of columns of A */
-  colamd_col<Index> Col [],    /* of size n_col+1 */
-  Index p []      /* p [0 ... n_col-1] is the column permutation*/
+  IndexType n_col,      /* number of columns of A */
+  colamd_col<IndexType> Col [],    /* of size n_col+1 */
+  IndexType p []      /* p [0 ... n_col-1] is the column permutation*/
   )
 {
   /* === Local variables ================================================== */
 
-  Index i ;     /* loop counter for all columns */
-  Index c ;     /* column index */
-  Index parent ;    /* index of column's parent */
-  Index order ;     /* column's order */
+  IndexType i ;     /* loop counter for all columns */
+  IndexType c ;     /* column index */
+  IndexType parent ;    /* index of column's parent */
+  IndexType order ;     /* column's order */
 
   /* === Order each non-principal column ================================== */
 
@@ -1550,33 +1550,33 @@ static inline  void order_children
   just been computed in the approximate degree computation.
   Not user-callable.
 */
-template <typename Index>
+template <typename IndexType>
 static void detect_super_cols
 (
   /* === Parameters ======================================================= */
   
-  colamd_col<Index> Col [],    /* of size n_col+1 */
-  Index A [],     /* row indices of A */
-  Index head [],    /* head of degree lists and hash buckets */
-  Index row_start,    /* pointer to set of columns to check */
-  Index row_length    /* number of columns to check */
+  colamd_col<IndexType> Col [],    /* of size n_col+1 */
+  IndexType A [],     /* row indices of A */
+  IndexType head [],    /* head of degree lists and hash buckets */
+  IndexType row_start,    /* pointer to set of columns to check */
+  IndexType row_length    /* number of columns to check */
 )
 {
   /* === Local variables ================================================== */
 
-  Index hash ;      /* hash value for a column */
-  Index *rp ;     /* pointer to a row */
-  Index c ;     /* a column index */
-  Index super_c ;   /* column index of the column to absorb into */
-  Index *cp1 ;      /* column pointer for column super_c */
-  Index *cp2 ;      /* column pointer for column c */
-  Index length ;    /* length of column super_c */
-  Index prev_c ;    /* column preceding c in hash bucket */
-  Index i ;     /* loop counter */
-  Index *rp_end ;   /* pointer to the end of the row */
-  Index col ;     /* a column index in the row to check */
-  Index head_column ;   /* first column in hash bucket or degree list */
-  Index first_col ;   /* first column in hash bucket */
+  IndexType hash ;      /* hash value for a column */
+  IndexType *rp ;     /* pointer to a row */
+  IndexType c ;     /* a column index */
+  IndexType super_c ;   /* column index of the column to absorb into */
+  IndexType *cp1 ;      /* column pointer for column super_c */
+  IndexType *cp2 ;      /* column pointer for column c */
+  IndexType length ;    /* length of column super_c */
+  IndexType prev_c ;    /* column preceding c in hash bucket */
+  IndexType i ;     /* loop counter */
+  IndexType *rp_end ;   /* pointer to the end of the row */
+  IndexType col ;     /* a column index in the row to check */
+  IndexType head_column ;   /* first column in hash bucket or degree list */
+  IndexType first_col ;   /* first column in hash bucket */
 
   /* === Consider each column in the row ================================== */
 
@@ -1701,27 +1701,27 @@ static void detect_super_cols
   itself linear in the number of nonzeros in the input matrix.
   Not user-callable.
 */
-template <typename Index>
-static Index garbage_collection  /* returns the new value of pfree */
+template <typename IndexType>
+static IndexType garbage_collection  /* returns the new value of pfree */
   (
     /* === Parameters ======================================================= */
     
-    Index n_row,      /* number of rows */
-    Index n_col,      /* number of columns */
-    Colamd_Row<Index> Row [],    /* row info */
-    colamd_col<Index> Col [],    /* column info */
-    Index A [],     /* A [0 ... Alen-1] holds the matrix */
-    Index *pfree      /* &A [0] ... pfree is in use */
+    IndexType n_row,      /* number of rows */
+    IndexType n_col,      /* number of columns */
+    Colamd_Row<IndexType> Row [],    /* row info */
+    colamd_col<IndexType> Col [],    /* column info */
+    IndexType A [],     /* A [0 ... Alen-1] holds the matrix */
+    IndexType *pfree      /* &A [0] ... pfree is in use */
     )
 {
   /* === Local variables ================================================== */
 
-  Index *psrc ;     /* source pointer */
-  Index *pdest ;    /* destination pointer */
-  Index j ;     /* counter */
-  Index r ;     /* a row index */
-  Index c ;     /* a column index */
-  Index length ;    /* length of a row or column */
+  IndexType *psrc ;     /* source pointer */
+  IndexType *pdest ;    /* destination pointer */
+  IndexType j ;     /* counter */
+  IndexType r ;     /* a row index */
+  IndexType c ;     /* a column index */
+  IndexType length ;    /* length of a row or column */
 
   /* === Defragment the columns =========================================== */
 
@@ -1734,7 +1734,7 @@ static Index garbage_collection  /* returns the new value of pfree */
 
       /* move and compact the column */
       COLAMD_ASSERT (pdest <= psrc) ;
-      Col [c].start = (Index) (pdest - &A [0]) ;
+      Col [c].start = (IndexType) (pdest - &A [0]) ;
       length = Col [c].length ;
       for (j = 0 ; j < length ; j++)
       {
@@ -1744,7 +1744,7 @@ static Index garbage_collection  /* returns the new value of pfree */
 	  *pdest++ = r ;
 	}
       }
-      Col [c].length = (Index) (pdest - &A [Col [c].start]) ;
+      Col [c].length = (IndexType) (pdest - &A [Col [c].start]) ;
     }
   }
 
@@ -1791,7 +1791,7 @@ static Index garbage_collection  /* returns the new value of pfree */
 
       /* move and compact the row */
       COLAMD_ASSERT (pdest <= psrc) ;
-      Row [r].start = (Index) (pdest - &A [0]) ;
+      Row [r].start = (IndexType) (pdest - &A [0]) ;
       length = Row [r].length ;
       for (j = 0 ; j < length ; j++)
       {
@@ -1801,7 +1801,7 @@ static Index garbage_collection  /* returns the new value of pfree */
 	  *pdest++ = c ;
 	}
       }
-      Row [r].length = (Index) (pdest - &A [Row [r].start]) ;
+      Row [r].length = (IndexType) (pdest - &A [Row [r].start]) ;
 
     }
   }
@@ -1810,7 +1810,7 @@ static Index garbage_collection  /* returns the new value of pfree */
 
   /* === Return the new value of pfree ==================================== */
 
-  return ((Index) (pdest - &A [0])) ;
+  return ((IndexType) (pdest - &A [0])) ;
 }
 
 
@@ -1822,18 +1822,18 @@ static Index garbage_collection  /* returns the new value of pfree */
   Clears the Row [].shared2.mark array, and returns the new tag_mark.
   Return value is the new tag_mark.  Not user-callable.
 */
-template <typename Index>
-static inline  Index clear_mark  /* return the new value for tag_mark */
+template <typename IndexType>
+static inline  IndexType clear_mark  /* return the new value for tag_mark */
   (
       /* === Parameters ======================================================= */
 
-    Index n_row,    /* number of rows in A */
-    Colamd_Row<Index> Row [] /* Row [0 ... n_row-1].shared2.mark is set to zero */
+    IndexType n_row,    /* number of rows in A */
+    Colamd_Row<IndexType> Row [] /* Row [0 ... n_row-1].shared2.mark is set to zero */
     )
 {
   /* === Local variables ================================================== */
 
-  Index r ;
+  IndexType r ;
 
   for (r = 0 ; r < n_row ; r++)
   {
diff --git a/nuparu/include/Eigen/src/OrderingMethods/Ordering.h b/nuparu/include/Eigen/src/OrderingMethods/Ordering.h
index b4da6531..25792a82 100644
--- a/nuparu/include/Eigen/src/OrderingMethods/Ordering.h
+++ b/nuparu/include/Eigen/src/OrderingMethods/Ordering.h
@@ -44,14 +44,14 @@ void ordering_helper_at_plus_a(const MatrixType& mat, MatrixType& symmat)
   *
   * Functor computing the \em approximate \em minimum \em degree ordering
   * If the matrix is not structurally symmetric, an ordering of A^T+A is computed
-  * \tparam  Index The type of indices of the matrix 
+  * \tparam  StorageIndex The type of indices of the matrix 
   * \sa COLAMDOrdering
   */
-template <typename Index>
+template <typename StorageIndex>
 class AMDOrdering
 {
   public:
-    typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType;
+    typedef PermutationMatrix<Dynamic, Dynamic, StorageIndex> PermutationType;
     
     /** Compute the permutation vector from a sparse matrix
      * This routine is much faster if the input matrix is column-major     
@@ -60,7 +60,7 @@ class AMDOrdering
     void operator()(const MatrixType& mat, PermutationType& perm)
     {
       // Compute the symmetric pattern
-      SparseMatrix<typename MatrixType::Scalar, ColMajor, Index> symm;
+      SparseMatrix<typename MatrixType::Scalar, ColMajor, StorageIndex> symm;
       internal::ordering_helper_at_plus_a(mat,symm); 
     
       // Call the AMD routine 
@@ -72,7 +72,7 @@ class AMDOrdering
     template <typename SrcType, unsigned int SrcUpLo> 
     void operator()(const SparseSelfAdjointView<SrcType, SrcUpLo>& mat, PermutationType& perm)
     { 
-      SparseMatrix<typename SrcType::Scalar, ColMajor, Index> C; C = mat;
+      SparseMatrix<typename SrcType::Scalar, ColMajor, StorageIndex> C; C = mat;
       
       // Call the AMD routine 
       // m_mat.prune(keep_diag()); //Remove the diagonal elements 
@@ -88,13 +88,13 @@ class AMDOrdering
   * Functor computing the natural ordering (identity)
   * 
   * \note Returns an empty permutation matrix
-  * \tparam  Index The type of indices of the matrix 
+  * \tparam  StorageIndex The type of indices of the matrix 
   */
-template <typename Index>
+template <typename StorageIndex>
 class NaturalOrdering
 {
   public:
-    typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType;
+    typedef PermutationMatrix<Dynamic, Dynamic, StorageIndex> PermutationType;
     
     /** Compute the permutation vector from a column-major sparse matrix */
     template <typename MatrixType>
@@ -108,40 +108,46 @@ class NaturalOrdering
 /** \ingroup OrderingMethods_Module
   * \class COLAMDOrdering
   *
+  * \tparam  StorageIndex The type of indices of the matrix 
+  * 
   * Functor computing the \em column \em approximate \em minimum \em degree ordering 
-  * The matrix should be in column-major format
+  * The matrix should be in column-major and \b compressed format (see SparseMatrix::makeCompressed()).
   */
-template<typename Index>
+template<typename StorageIndex>
 class COLAMDOrdering
 {
   public:
-    typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType; 
-    typedef Matrix<Index, Dynamic, 1> IndexVector;
+    typedef PermutationMatrix<Dynamic, Dynamic, StorageIndex> PermutationType; 
+    typedef Matrix<StorageIndex, Dynamic, 1> IndexVector;
     
-    /** Compute the permutation vector form a sparse matrix */
+    /** Compute the permutation vector \a perm from the sparse matrix \a mat
+      * \warning The input sparse matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
+      */
     template <typename MatrixType>
     void operator() (const MatrixType& mat, PermutationType& perm)
     {
-      Index m = mat.rows();
-      Index n = mat.cols();
-      Index nnz = mat.nonZeros();
+      eigen_assert(mat.isCompressed() && "COLAMDOrdering requires a sparse matrix in compressed mode. Call .makeCompressed() before passing it to COLAMDOrdering");
+      
+      StorageIndex m = StorageIndex(mat.rows());
+      StorageIndex n = StorageIndex(mat.cols());
+      StorageIndex nnz = StorageIndex(mat.nonZeros());
       // Get the recommended value of Alen to be used by colamd
-      Index Alen = internal::colamd_recommended(nnz, m, n); 
+      StorageIndex Alen = internal::colamd_recommended(nnz, m, n); 
       // Set the default parameters
       double knobs [COLAMD_KNOBS]; 
-      Index stats [COLAMD_STATS];
+      StorageIndex stats [COLAMD_STATS];
       internal::colamd_set_defaults(knobs);
       
-      Index info;
       IndexVector p(n+1), A(Alen); 
-      for(Index i=0; i <= n; i++)   p(i) = mat.outerIndexPtr()[i];
-      for(Index i=0; i < nnz; i++)  A(i) = mat.innerIndexPtr()[i];
+      for(StorageIndex i=0; i <= n; i++)   p(i) = mat.outerIndexPtr()[i];
+      for(StorageIndex i=0; i < nnz; i++)  A(i) = mat.innerIndexPtr()[i];
       // Call Colamd routine to compute the ordering 
-      info = internal::colamd(m, n, Alen, A.data(), p.data(), knobs, stats); 
+      StorageIndex info = internal::colamd(m, n, Alen, A.data(), p.data(), knobs, stats); 
+      EIGEN_UNUSED_VARIABLE(info);
       eigen_assert( info && "COLAMD failed " );
       
       perm.resize(n);
-      for (Index i = 0; i < n; i++) perm.indices()(p(i)) = i;
+      for (StorageIndex i = 0; i < n; i++) perm.indices()(p(i)) = i;
     }
 };
 
diff --git a/nuparu/include/Eigen/src/PaStiXSupport/PaStiXSupport.h b/nuparu/include/Eigen/src/PaStiXSupport/PaStiXSupport.h
index a955287d..1999fd28 100644
--- a/nuparu/include/Eigen/src/PaStiXSupport/PaStiXSupport.h
+++ b/nuparu/include/Eigen/src/PaStiXSupport/PaStiXSupport.h
@@ -12,6 +12,14 @@
 
 namespace Eigen { 
 
+#if defined(DCOMPLEX)
+  #define PASTIX_COMPLEX  COMPLEX
+  #define PASTIX_DCOMPLEX DCOMPLEX
+#else
+  #define PASTIX_COMPLEX  std::complex<float>
+  #define PASTIX_DCOMPLEX std::complex<double>
+#endif
+
 /** \ingroup PaStiXSupport_Module
   * \brief Interface to the PaStix solver
   * 
@@ -35,7 +43,7 @@ namespace internal
     typedef _MatrixType MatrixType;
     typedef typename _MatrixType::Scalar Scalar;
     typedef typename _MatrixType::RealScalar RealScalar;
-    typedef typename _MatrixType::Index Index;
+    typedef typename _MatrixType::StorageIndex StorageIndex;
   };
 
   template<typename _MatrixType, int Options>
@@ -44,7 +52,7 @@ namespace internal
     typedef _MatrixType MatrixType;
     typedef typename _MatrixType::Scalar Scalar;
     typedef typename _MatrixType::RealScalar RealScalar;
-    typedef typename _MatrixType::Index Index;
+    typedef typename _MatrixType::StorageIndex StorageIndex;
   };
 
   template<typename _MatrixType, int Options>
@@ -53,7 +61,7 @@ namespace internal
     typedef _MatrixType MatrixType;
     typedef typename _MatrixType::Scalar Scalar;
     typedef typename _MatrixType::RealScalar RealScalar;
-    typedef typename _MatrixType::Index Index;
+    typedef typename _MatrixType::StorageIndex StorageIndex;
   };
   
   void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, float *vals, int *perm, int * invp, float *x, int nbrhs, int *iparm, double *dparm)
@@ -74,14 +82,14 @@ namespace internal
   {
     if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; }
     if (nbrhs == 0) {x = NULL; nbrhs=1;}
-    c_pastix(pastix_data, pastix_comm, n, ptr, idx, reinterpret_cast<COMPLEX*>(vals), perm, invp, reinterpret_cast<COMPLEX*>(x), nbrhs, iparm, dparm); 
+    c_pastix(pastix_data, pastix_comm, n, ptr, idx, reinterpret_cast<PASTIX_COMPLEX*>(vals), perm, invp, reinterpret_cast<PASTIX_COMPLEX*>(x), nbrhs, iparm, dparm); 
   }
   
   void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex<double> *vals, int *perm, int * invp, std::complex<double> *x, int nbrhs, int *iparm, double *dparm)
   {
     if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; }
     if (nbrhs == 0) {x = NULL; nbrhs=1;}
-    z_pastix(pastix_data, pastix_comm, n, ptr, idx, reinterpret_cast<DCOMPLEX*>(vals), perm, invp, reinterpret_cast<DCOMPLEX*>(x), nbrhs, iparm, dparm); 
+    z_pastix(pastix_data, pastix_comm, n, ptr, idx, reinterpret_cast<PASTIX_DCOMPLEX*>(vals), perm, invp, reinterpret_cast<PASTIX_DCOMPLEX*>(x), nbrhs, iparm, dparm); 
   }
 
   // Convert the matrix  to Fortran-style Numbering
@@ -117,20 +125,30 @@ namespace internal
 // This is the base class to interface with PaStiX functions. 
 // Users should not used this class directly. 
 template <class Derived>
-class PastixBase : internal::noncopyable
+class PastixBase : public SparseSolverBase<Derived>
 {
+  protected:
+    typedef SparseSolverBase<Derived> Base;
+    using Base::derived;
+    using Base::m_isInitialized;
   public:
+    using Base::_solve_impl;
+    
     typedef typename internal::pastix_traits<Derived>::MatrixType _MatrixType;
     typedef _MatrixType MatrixType;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef typename MatrixType::StorageIndex StorageIndex;
     typedef Matrix<Scalar,Dynamic,1> Vector;
     typedef SparseMatrix<Scalar, ColMajor> ColSpMatrix;
+    enum {
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
     
   public:
     
-    PastixBase() : m_initisOk(false), m_analysisIsOk(false), m_factorizationIsOk(false), m_isInitialized(false), m_pastixdata(0), m_size(0)
+    PastixBase() : m_initisOk(false), m_analysisIsOk(false), m_factorizationIsOk(false), m_pastixdata(0), m_size(0)
     {
       init();
     }
@@ -139,39 +157,16 @@ class PastixBase : internal::noncopyable
     {
       clean();
     }
-
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<PastixBase, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "Pastix solver is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "PastixBase::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<PastixBase, Rhs>(*this, b.derived());
-    }
     
     template<typename Rhs,typename Dest>
-    bool _solve (const MatrixBase<Rhs> &b, MatrixBase<Dest> &x) const;
+    bool _solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &x) const;
     
-    Derived& derived()
-    {
-      return *static_cast<Derived*>(this);
-    }
-    const Derived& derived() const
-    {
-      return *static_cast<const Derived*>(this);
-    }
-
     /** Returns a reference to the integer vector IPARM of PaStiX parameters
       * to modify the default parameters. 
       * The statistics related to the different phases of factorization and solve are saved here as well
       * \sa analyzePattern() factorize()
       */
-    Array<Index,IPARM_SIZE,1>& iparm()
+    Array<StorageIndex,IPARM_SIZE,1>& iparm()
     {
       return m_iparm; 
     }
@@ -220,20 +215,6 @@ class PastixBase : internal::noncopyable
       return m_info;
     }
     
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::sparse_solve_retval<PastixBase, Rhs>
-    solve(const SparseMatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "Pastix LU, LLT or LDLT is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "PastixBase::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::sparse_solve_retval<PastixBase, Rhs>(*this, b.derived());
-    }
-    
   protected:
 
     // Initialize the Pastix data structure, check the matrix
@@ -260,14 +241,13 @@ class PastixBase : internal::noncopyable
     int m_initisOk; 
     int m_analysisIsOk;
     int m_factorizationIsOk;
-    bool m_isInitialized;
     mutable ComputationInfo m_info; 
     mutable pastix_data_t *m_pastixdata; // Data structure for pastix
     mutable int m_comm; // The MPI communicator identifier
     mutable Matrix<int,IPARM_SIZE,1> m_iparm; // integer vector for the input parameters
     mutable Matrix<double,DPARM_SIZE,1> m_dparm; // Scalar vector for the input parameters
-    mutable Matrix<Index,Dynamic,1> m_perm;  // Permutation vector
-    mutable Matrix<Index,Dynamic,1> m_invp;  // Inverse permutation vector
+    mutable Matrix<StorageIndex,Dynamic,1> m_perm;  // Permutation vector
+    mutable Matrix<StorageIndex,Dynamic,1> m_invp;  // Inverse permutation vector
     mutable int m_size; // Size of the matrix 
 }; 
 
@@ -320,7 +300,6 @@ void PastixBase<Derived>::compute(ColSpMatrix& mat)
   factorize(mat);
   
   m_iparm(IPARM_MATRIX_VERIFICATION) = API_NO;
-  m_isInitialized = m_factorizationIsOk;
 }
 
 
@@ -333,7 +312,7 @@ void PastixBase<Derived>::analyzePattern(ColSpMatrix& mat)
   if(m_size>0)
     clean();
   
-  m_size = mat.rows();
+  m_size = internal::convert_index<int>(mat.rows());
   m_perm.resize(m_size);
   m_invp.resize(m_size);
   
@@ -362,7 +341,7 @@ void PastixBase<Derived>::factorize(ColSpMatrix& mat)
   eigen_assert(m_analysisIsOk && "The analysis phase should be called before the factorization phase");
   m_iparm(IPARM_START_TASK) = API_TASK_NUMFACT;
   m_iparm(IPARM_END_TASK) = API_TASK_NUMFACT;
-  m_size = mat.rows();
+  m_size = internal::convert_index<int>(mat.rows());
   
   internal::eigen_pastix(&m_pastixdata, MPI_COMM_WORLD, m_size, mat.outerIndexPtr(), mat.innerIndexPtr(),
                mat.valuePtr(), m_perm.data(), m_invp.data(), 0, 0, m_iparm.data(), m_dparm.data());
@@ -385,7 +364,7 @@ void PastixBase<Derived>::factorize(ColSpMatrix& mat)
 /* Solve the system */
 template<typename Base>
 template<typename Rhs,typename Dest>
-bool PastixBase<Base>::_solve (const MatrixBase<Rhs> &b, MatrixBase<Dest> &x) const
+bool PastixBase<Base>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &x) const
 {
   eigen_assert(m_isInitialized && "The matrix should be factorized first");
   EIGEN_STATIC_ASSERT((Dest::Flags&RowMajorBit)==0,
@@ -398,7 +377,7 @@ bool PastixBase<Base>::_solve (const MatrixBase<Rhs> &b, MatrixBase<Dest> &x) co
     m_iparm[IPARM_START_TASK]          = API_TASK_SOLVE;
     m_iparm[IPARM_END_TASK]            = API_TASK_REFINE;
   
-    internal::eigen_pastix(&m_pastixdata, MPI_COMM_WORLD, x.rows(), 0, 0, 0,
+    internal::eigen_pastix(&m_pastixdata, MPI_COMM_WORLD, internal::convert_index<int>(x.rows()), 0, 0, 0,
                            m_perm.data(), m_invp.data(), &x(0, i), rhs, m_iparm.data(), m_dparm.data());
   }
   
@@ -423,7 +402,9 @@ bool PastixBase<Base>::_solve (const MatrixBase<Rhs> &b, MatrixBase<Dest> &x) co
   * NOTE : Note that if the analysis and factorization phase are called separately, 
   * the input matrix will be symmetrized at each call, hence it is advised to 
   * symmetrize the matrix in a end-user program and set \p IsStrSym to true
-  * 
+  *
+  * \implsparsesolverconcept
+  *
   * \sa \ref TutorialSparseDirectSolvers
   * 
   */
@@ -434,7 +415,7 @@ class PastixLU : public PastixBase< PastixLU<_MatrixType> >
     typedef _MatrixType MatrixType;
     typedef PastixBase<PastixLU<MatrixType> > Base;
     typedef typename Base::ColSpMatrix ColSpMatrix;
-    typedef typename MatrixType::Index Index;
+    typedef typename MatrixType::StorageIndex StorageIndex;
     
   public:
     PastixLU() : Base()
@@ -442,7 +423,7 @@ class PastixLU : public PastixBase< PastixLU<_MatrixType> >
       init();
     }
     
-    PastixLU(const MatrixType& matrix):Base()
+    explicit PastixLU(const MatrixType& matrix):Base()
     {
       init();
       compute(matrix);
@@ -534,7 +515,9 @@ class PastixLU : public PastixBase< PastixLU<_MatrixType> >
   * 
   * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   * \tparam UpLo The part of the matrix to use : Lower or Upper. The default is Lower as required by PaStiX
-  * 
+  *
+  * \implsparsesolverconcept
+  *
   * \sa \ref TutorialSparseDirectSolvers
   */
 template<typename _MatrixType, int _UpLo>
@@ -552,7 +535,7 @@ class PastixLLT : public PastixBase< PastixLLT<_MatrixType, _UpLo> >
       init();
     }
     
-    PastixLLT(const MatrixType& matrix):Base()
+    explicit PastixLLT(const MatrixType& matrix):Base()
     {
       init();
       compute(matrix);
@@ -615,7 +598,9 @@ class PastixLLT : public PastixBase< PastixLLT<_MatrixType, _UpLo> >
   * 
   * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   * \tparam UpLo The part of the matrix to use : Lower or Upper. The default is Lower as required by PaStiX
-  * 
+  *
+  * \implsparsesolverconcept
+  *
   * \sa \ref TutorialSparseDirectSolvers
   */
 template<typename _MatrixType, int _UpLo>
@@ -633,7 +618,7 @@ class PastixLDLT : public PastixBase< PastixLDLT<_MatrixType, _UpLo> >
       init();
     }
     
-    PastixLDLT(const MatrixType& matrix):Base()
+    explicit PastixLDLT(const MatrixType& matrix):Base()
     {
       init();
       compute(matrix);
@@ -686,36 +671,6 @@ class PastixLDLT : public PastixBase< PastixLDLT<_MatrixType, _UpLo> >
     }
 };
 
-namespace internal {
-
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<PastixBase<_MatrixType>, Rhs>
-  : solve_retval_base<PastixBase<_MatrixType>, Rhs>
-{
-  typedef PastixBase<_MatrixType> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-template<typename _MatrixType, typename Rhs>
-struct sparse_solve_retval<PastixBase<_MatrixType>, Rhs>
-  : sparse_solve_retval_base<PastixBase<_MatrixType>, Rhs>
-{
-  typedef PastixBase<_MatrixType> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    this->defaultEvalTo(dst);
-  }
-};
-
-} // end namespace internal
-
 } // end namespace Eigen
 
 #endif
diff --git a/nuparu/include/Eigen/src/PardisoSupport/PardisoSupport.h b/nuparu/include/Eigen/src/PardisoSupport/PardisoSupport.h
index 1c48f0df..7c238ce3 100644
--- a/nuparu/include/Eigen/src/PardisoSupport/PardisoSupport.h
+++ b/nuparu/include/Eigen/src/PardisoSupport/PardisoSupport.h
@@ -40,13 +40,13 @@ template<typename _MatrixType, int Options=Upper> class PardisoLDLT;
 
 namespace internal
 {
-  template<typename Index>
+  template<typename IndexType>
   struct pardiso_run_selector
   {
-    static Index run( _MKL_DSS_HANDLE_t pt, Index maxfct, Index mnum, Index type, Index phase, Index n, void *a,
-                      Index *ia, Index *ja, Index *perm, Index nrhs, Index *iparm, Index msglvl, void *b, void *x)
+    static IndexType run( _MKL_DSS_HANDLE_t pt, IndexType maxfct, IndexType mnum, IndexType type, IndexType phase, IndexType n, void *a,
+                      IndexType *ia, IndexType *ja, IndexType *perm, IndexType nrhs, IndexType *iparm, IndexType msglvl, void *b, void *x)
     {
-      Index error = 0;
+      IndexType error = 0;
       ::pardiso(pt, &maxfct, &mnum, &type, &phase, &n, a, ia, ja, perm, &nrhs, iparm, &msglvl, b, x, &error);
       return error;
     }
@@ -54,11 +54,11 @@ namespace internal
   template<>
   struct pardiso_run_selector<long long int>
   {
-    typedef long long int Index;
-    static Index run( _MKL_DSS_HANDLE_t pt, Index maxfct, Index mnum, Index type, Index phase, Index n, void *a,
-                      Index *ia, Index *ja, Index *perm, Index nrhs, Index *iparm, Index msglvl, void *b, void *x)
+    typedef long long int IndexType;
+    static IndexType run( _MKL_DSS_HANDLE_t pt, IndexType maxfct, IndexType mnum, IndexType type, IndexType phase, IndexType n, void *a,
+                      IndexType *ia, IndexType *ja, IndexType *perm, IndexType nrhs, IndexType *iparm, IndexType msglvl, void *b, void *x)
     {
-      Index error = 0;
+      IndexType error = 0;
       ::pardiso_64(pt, &maxfct, &mnum, &type, &phase, &n, a, ia, ja, perm, &nrhs, iparm, &msglvl, b, x, &error);
       return error;
     }
@@ -72,7 +72,7 @@ namespace internal
     typedef _MatrixType MatrixType;
     typedef typename _MatrixType::Scalar Scalar;
     typedef typename _MatrixType::RealScalar RealScalar;
-    typedef typename _MatrixType::Index Index;
+    typedef typename _MatrixType::StorageIndex StorageIndex;
   };
 
   template<typename _MatrixType, int Options>
@@ -81,7 +81,7 @@ namespace internal
     typedef _MatrixType MatrixType;
     typedef typename _MatrixType::Scalar Scalar;
     typedef typename _MatrixType::RealScalar RealScalar;
-    typedef typename _MatrixType::Index Index;
+    typedef typename _MatrixType::StorageIndex StorageIndex;
   };
 
   template<typename _MatrixType, int Options>
@@ -90,35 +90,44 @@ namespace internal
     typedef _MatrixType MatrixType;
     typedef typename _MatrixType::Scalar Scalar;
     typedef typename _MatrixType::RealScalar RealScalar;
-    typedef typename _MatrixType::Index Index;    
+    typedef typename _MatrixType::StorageIndex StorageIndex;    
   };
 
-}
+} // end namespace internal
 
 template<class Derived>
-class PardisoImpl
+class PardisoImpl : public SparseSolverBase<Derived>
 {
+  protected:
+    typedef SparseSolverBase<Derived> Base;
+    using Base::derived;
+    using Base::m_isInitialized;
+    
     typedef internal::pardiso_traits<Derived> Traits;
   public:
+    using Base::_solve_impl;
+    
     typedef typename Traits::MatrixType MatrixType;
     typedef typename Traits::Scalar Scalar;
     typedef typename Traits::RealScalar RealScalar;
-    typedef typename Traits::Index Index;
-    typedef SparseMatrix<Scalar,RowMajor,Index> SparseMatrixType;
+    typedef typename Traits::StorageIndex StorageIndex;
+    typedef SparseMatrix<Scalar,RowMajor,StorageIndex> SparseMatrixType;
     typedef Matrix<Scalar,Dynamic,1> VectorType;
-    typedef Matrix<Index, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
-    typedef Matrix<Index, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
-    typedef Array<Index,64,1,DontAlign> ParameterType;
+    typedef Matrix<StorageIndex, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
+    typedef Matrix<StorageIndex, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
+    typedef Array<StorageIndex,64,1,DontAlign> ParameterType;
     enum {
-      ScalarIsComplex = NumTraits<Scalar>::IsComplex
+      ScalarIsComplex = NumTraits<Scalar>::IsComplex,
+      ColsAtCompileTime = Dynamic,
+      MaxColsAtCompileTime = Dynamic
     };
 
     PardisoImpl()
     {
-      eigen_assert((sizeof(Index) >= sizeof(_INTEGER_t) && sizeof(Index) <= 8) && "Non-supported index type");
+      eigen_assert((sizeof(StorageIndex) >= sizeof(_INTEGER_t) && sizeof(StorageIndex) <= 8) && "Non-supported index type");
       m_iparm.setZero();
       m_msglvl = 0; // No output
-      m_initialized = false;
+      m_isInitialized = false;
     }
 
     ~PardisoImpl()
@@ -136,7 +145,7 @@ class PardisoImpl
       */
     ComputationInfo info() const
     {
-      eigen_assert(m_initialized && "Decomposition is not initialized.");
+      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
       return m_info;
     }
 
@@ -165,61 +174,25 @@ class PardisoImpl
     Derived& factorize(const MatrixType& matrix);
 
     Derived& compute(const MatrixType& matrix);
-    
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<PardisoImpl, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_initialized && "Pardiso solver is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "PardisoImpl::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<PardisoImpl, Rhs>(*this, b.derived());
-    }
-
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::sparse_solve_retval<PardisoImpl, Rhs>
-    solve(const SparseMatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_initialized && "Pardiso solver is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "PardisoImpl::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::sparse_solve_retval<PardisoImpl, Rhs>(*this, b.derived());
-    }
-
-    Derived& derived()
-    {
-      return *static_cast<Derived*>(this);
-    }
-    const Derived& derived() const
-    {
-      return *static_cast<const Derived*>(this);
-    }
 
-    template<typename BDerived, typename XDerived>
-    bool _solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived>& x) const;
+    template<typename Rhs,typename Dest>
+    void _solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const;
 
   protected:
     void pardisoRelease()
     {
-      if(m_initialized) // Factorization ran at least once
+      if(m_isInitialized) // Factorization ran at least once
       {
-        internal::pardiso_run_selector<Index>::run(m_pt, 1, 1, m_type, -1, m_size, 0, 0, 0, m_perm.data(), 0,
-                                                   m_iparm.data(), m_msglvl, 0, 0);
+        internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, -1, m_size,0, 0, 0, m_perm.data(), 0,
+                                                          m_iparm.data(), m_msglvl, NULL, NULL);
+        m_isInitialized = false;
       }
     }
 
     void pardisoInit(int type)
     {
       m_type = type;
-      bool symmetric = abs(m_type) < 10;
+      bool symmetric = std::abs(m_type) < 10;
       m_iparm[0] = 1;   // No solver default
       m_iparm[1] = 3;   // use Metis for the ordering
       m_iparm[2] = 1;   // Numbers of processors, value of OMP_NUM_THREADS
@@ -247,12 +220,14 @@ class PardisoImpl
       m_iparm[27] = (sizeof(RealScalar) == 4) ? 1 : 0;
       m_iparm[34] = 1;  // C indexing
       m_iparm[59] = 1;  // Automatic switch between In-Core and Out-of-Core modes
+      
+      memset(m_pt, 0, sizeof(m_pt));
     }
 
   protected:
     // cached data to reduce reallocation, etc.
     
-    void manageErrorCode(Index error)
+    void manageErrorCode(Index error) const
     {
       switch(error)
       {
@@ -269,16 +244,14 @@ class PardisoImpl
     }
 
     mutable SparseMatrixType m_matrix;
-    ComputationInfo m_info;
-    bool m_initialized, m_analysisIsOk, m_factorizationIsOk;
+    mutable ComputationInfo m_info;
+    bool m_analysisIsOk, m_factorizationIsOk;
     Index m_type, m_msglvl;
     mutable void *m_pt[64];
     mutable ParameterType m_iparm;
     mutable IntColVectorType m_perm;
     Index m_size;
     
-  private:
-    PardisoImpl(PardisoImpl &) {}
 };
 
 template<class Derived>
@@ -288,19 +261,18 @@ Derived& PardisoImpl<Derived>::compute(const MatrixType& a)
   eigen_assert(a.rows() == a.cols());
 
   pardisoRelease();
-  memset(m_pt, 0, sizeof(m_pt));
   m_perm.setZero(m_size);
   derived().getMatrix(a);
   
   Index error;
-  error = internal::pardiso_run_selector<Index>::run(m_pt, 1, 1, m_type, 12, m_size,
-                                                     m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
-                                                     m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
+  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 12, m_size,
+                                                            m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
+                                                            m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
 
   manageErrorCode(error);
   m_analysisIsOk = true;
   m_factorizationIsOk = true;
-  m_initialized = true;
+  m_isInitialized = true;
   return derived();
 }
 
@@ -311,19 +283,18 @@ Derived& PardisoImpl<Derived>::analyzePattern(const MatrixType& a)
   eigen_assert(m_size == a.cols());
 
   pardisoRelease();
-  memset(m_pt, 0, sizeof(m_pt));
   m_perm.setZero(m_size);
   derived().getMatrix(a);
   
   Index error;
-  error = internal::pardiso_run_selector<Index>::run(m_pt, 1, 1, m_type, 11, m_size,
-                                                     m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
-                                                     m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
+  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 11, m_size,
+                                                            m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
+                                                            m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
   
   manageErrorCode(error);
   m_analysisIsOk = true;
   m_factorizationIsOk = false;
-  m_initialized = true;
+  m_isInitialized = true;
   return derived();
 }
 
@@ -336,21 +307,24 @@ Derived& PardisoImpl<Derived>::factorize(const MatrixType& a)
   derived().getMatrix(a);
 
   Index error;  
-  error = internal::pardiso_run_selector<Index>::run(m_pt, 1, 1, m_type, 22, m_size,
-                                                     m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
-                                                     m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
+  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 22, m_size,
+                                                            m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
+                                                            m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
   
   manageErrorCode(error);
   m_factorizationIsOk = true;
   return derived();
 }
 
-template<class Base>
+template<class Derived>
 template<typename BDerived,typename XDerived>
-bool PardisoImpl<Base>::_solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived>& x) const
+void PardisoImpl<Derived>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBase<XDerived>& x) const
 {
   if(m_iparm[0] == 0) // Factorization was not computed
-    return false;
+  {
+    m_info = InvalidInput;
+    return;
+  }
 
   //Index n = m_matrix.rows();
   Index nrhs = Index(b.cols());
@@ -380,12 +354,12 @@ bool PardisoImpl<Base>::_solve(const MatrixBase<BDerived> &b, MatrixBase<XDerive
   }
   
   Index error;
-  error = internal::pardiso_run_selector<Index>::run(m_pt, 1, 1, m_type, 33, m_size,
-                                                     m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
-                                                     m_perm.data(), nrhs, m_iparm.data(), m_msglvl,
-                                                     rhs_ptr, x.derived().data());
+  error = internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, 33, m_size,
+                                                            m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
+                                                            m_perm.data(), nrhs, m_iparm.data(), m_msglvl,
+                                                            rhs_ptr, x.derived().data());
 
-  return error==0;
+  manageErrorCode(error);
 }
 
 
@@ -399,13 +373,15 @@ bool PardisoImpl<Base>::_solve(const MatrixBase<BDerived> &b, MatrixBase<XDerive
   *
   * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   *
+  * \implsparsesolverconcept
+  *
   * \sa \ref TutorialSparseDirectSolvers
   */
 template<typename MatrixType>
 class PardisoLU : public PardisoImpl< PardisoLU<MatrixType> >
 {
   protected:
-    typedef PardisoImpl< PardisoLU<MatrixType> > Base;
+    typedef PardisoImpl<PardisoLU> Base;
     typedef typename Base::Scalar Scalar;
     typedef typename Base::RealScalar RealScalar;
     using Base::pardisoInit;
@@ -423,7 +399,7 @@ class PardisoLU : public PardisoImpl< PardisoLU<MatrixType> >
       pardisoInit(Base::ScalarIsComplex ? 13 : 11);
     }
 
-    PardisoLU(const MatrixType& matrix)
+    explicit PardisoLU(const MatrixType& matrix)
       : Base()
     {
       pardisoInit(Base::ScalarIsComplex ? 13 : 11);
@@ -433,10 +409,8 @@ class PardisoLU : public PardisoImpl< PardisoLU<MatrixType> >
     void getMatrix(const MatrixType& matrix)
     {
       m_matrix = matrix;
+      m_matrix.makeCompressed();
     }
-    
-  private:
-    PardisoLU(PardisoLU& ) {}
 };
 
 /** \ingroup PardisoSupport_Module
@@ -451,6 +425,8 @@ class PardisoLU : public PardisoImpl< PardisoLU<MatrixType> >
   * \tparam UpLo can be any bitwise combination of Upper, Lower. The default is Upper, meaning only the upper triangular part has to be used.
   *         Upper|Lower can be used to tell both triangular parts can be used as input.
   *
+  * \implsparsesolverconcept
+  *
   * \sa \ref TutorialSparseDirectSolvers
   */
 template<typename MatrixType, int _UpLo>
@@ -459,7 +435,6 @@ class PardisoLLT : public PardisoImpl< PardisoLLT<MatrixType,_UpLo> >
   protected:
     typedef PardisoImpl< PardisoLLT<MatrixType,_UpLo> > Base;
     typedef typename Base::Scalar Scalar;
-    typedef typename Base::Index Index;
     typedef typename Base::RealScalar RealScalar;
     using Base::pardisoInit;
     using Base::m_matrix;
@@ -467,9 +442,9 @@ class PardisoLLT : public PardisoImpl< PardisoLLT<MatrixType,_UpLo> >
 
   public:
 
+    typedef typename Base::StorageIndex StorageIndex;
     enum { UpLo = _UpLo };
     using Base::compute;
-    using Base::solve;
 
     PardisoLLT()
       : Base()
@@ -477,7 +452,7 @@ class PardisoLLT : public PardisoImpl< PardisoLLT<MatrixType,_UpLo> >
       pardisoInit(Base::ScalarIsComplex ? 4 : 2);
     }
 
-    PardisoLLT(const MatrixType& matrix)
+    explicit PardisoLLT(const MatrixType& matrix)
       : Base()
     {
       pardisoInit(Base::ScalarIsComplex ? 4 : 2);
@@ -489,13 +464,11 @@ class PardisoLLT : public PardisoImpl< PardisoLLT<MatrixType,_UpLo> >
     void getMatrix(const MatrixType& matrix)
     {
       // PARDISO supports only upper, row-major matrices
-      PermutationMatrix<Dynamic,Dynamic,Index> p_null;
+      PermutationMatrix<Dynamic,Dynamic,StorageIndex> p_null;
       m_matrix.resize(matrix.rows(), matrix.cols());
       m_matrix.template selfadjointView<Upper>() = matrix.template selfadjointView<UpLo>().twistedBy(p_null);
+      m_matrix.makeCompressed();
     }
-    
-  private:
-    PardisoLLT(PardisoLLT& ) {}
 };
 
 /** \ingroup PardisoSupport_Module
@@ -512,6 +485,8 @@ class PardisoLLT : public PardisoImpl< PardisoLLT<MatrixType,_UpLo> >
   *         Symmetric can be used for symmetric, non-selfadjoint complex matrices, the default being to assume a selfadjoint matrix.
   *         Upper|Lower can be used to tell both triangular parts can be used as input.
   *
+  * \implsparsesolverconcept
+  *
   * \sa \ref TutorialSparseDirectSolvers
   */
 template<typename MatrixType, int Options>
@@ -520,7 +495,6 @@ class PardisoLDLT : public PardisoImpl< PardisoLDLT<MatrixType,Options> >
   protected:
     typedef PardisoImpl< PardisoLDLT<MatrixType,Options> > Base;
     typedef typename Base::Scalar Scalar;
-    typedef typename Base::Index Index;
     typedef typename Base::RealScalar RealScalar;
     using Base::pardisoInit;
     using Base::m_matrix;
@@ -528,8 +502,8 @@ class PardisoLDLT : public PardisoImpl< PardisoLDLT<MatrixType,Options> >
 
   public:
 
+    typedef typename Base::StorageIndex StorageIndex;
     using Base::compute;
-    using Base::solve;
     enum { UpLo = Options&(Upper|Lower) };
 
     PardisoLDLT()
@@ -538,7 +512,7 @@ class PardisoLDLT : public PardisoImpl< PardisoLDLT<MatrixType,Options> >
       pardisoInit(Base::ScalarIsComplex ? ( bool(Options&Symmetric) ? 6 : -4 ) : -2);
     }
 
-    PardisoLDLT(const MatrixType& matrix)
+    explicit PardisoLDLT(const MatrixType& matrix)
       : Base()
     {
       pardisoInit(Base::ScalarIsComplex ? ( bool(Options&Symmetric) ? 6 : -4 ) : -2);
@@ -548,45 +522,13 @@ class PardisoLDLT : public PardisoImpl< PardisoLDLT<MatrixType,Options> >
     void getMatrix(const MatrixType& matrix)
     {
       // PARDISO supports only upper, row-major matrices
-      PermutationMatrix<Dynamic,Dynamic,Index> p_null;
+      PermutationMatrix<Dynamic,Dynamic,StorageIndex> p_null;
       m_matrix.resize(matrix.rows(), matrix.cols());
       m_matrix.template selfadjointView<Upper>() = matrix.template selfadjointView<UpLo>().twistedBy(p_null);
+      m_matrix.makeCompressed();
     }
-    
-  private:
-    PardisoLDLT(PardisoLDLT& ) {}
-};
-
-namespace internal {
-  
-template<typename _Derived, typename Rhs>
-struct solve_retval<PardisoImpl<_Derived>, Rhs>
-  : solve_retval_base<PardisoImpl<_Derived>, Rhs>
-{
-  typedef PardisoImpl<_Derived> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-template<typename Derived, typename Rhs>
-struct sparse_solve_retval<PardisoImpl<Derived>, Rhs>
-  : sparse_solve_retval_base<PardisoImpl<Derived>, Rhs>
-{
-  typedef PardisoImpl<Derived> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    this->defaultEvalTo(dst);
-  }
 };
 
-} // end namespace internal
-
 } // end namespace Eigen
 
 #endif // EIGEN_PARDISOSUPPORT_H
diff --git a/nuparu/include/Eigen/src/QR/ColPivHouseholderQR.h b/nuparu/include/Eigen/src/QR/ColPivHouseholderQR.h
index 8b01f817..172e4a89 100644
--- a/nuparu/include/Eigen/src/QR/ColPivHouseholderQR.h
+++ b/nuparu/include/Eigen/src/QR/ColPivHouseholderQR.h
@@ -13,6 +13,15 @@
 
 namespace Eigen { 
 
+namespace internal {
+template<typename _MatrixType> struct traits<ColPivHouseholderQR<_MatrixType> >
+ : traits<_MatrixType>
+{
+  enum { Flags = 0 };
+};
+
+} // end namespace internal
+
 /** \ingroup QR_Module
   *
   * \class ColPivHouseholderQR
@@ -48,7 +57,8 @@ template<typename _MatrixType> class ColPivHouseholderQR
     };
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
+    // FIXME should be int
+    typedef typename MatrixType::StorageIndex StorageIndex;
     typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime, Options, MaxRowsAtCompileTime, MaxRowsAtCompileTime> MatrixQType;
     typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
     typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime> PermutationType;
@@ -56,10 +66,11 @@ template<typename _MatrixType> class ColPivHouseholderQR
     typedef typename internal::plain_row_type<MatrixType>::type RowVectorType;
     typedef typename internal::plain_row_type<MatrixType, RealScalar>::type RealRowVectorType;
     typedef HouseholderSequence<MatrixType,typename internal::remove_all<typename HCoeffsType::ConjugateReturnType>::type> HouseholderSequenceType;
+    typedef typename MatrixType::PlainObject PlainObject;
     
   private:
     
-    typedef typename PermutationType::Index PermIndexType;
+    typedef typename PermutationType::StorageIndex PermIndexType;
     
   public:
 
@@ -76,7 +87,8 @@ template<typename _MatrixType> class ColPivHouseholderQR
         m_colsTranspositions(),
         m_temp(),
         m_colSqNorms(),
-        m_isInitialized(false) {}
+        m_isInitialized(false),
+        m_usePrescribedThreshold(false) {}
 
     /** \brief Default Constructor with memory preallocation
       *
@@ -106,7 +118,8 @@ template<typename _MatrixType> class ColPivHouseholderQR
       * 
       * \sa compute()
       */
-    ColPivHouseholderQR(const MatrixType& matrix)
+    template<typename InputType>
+    explicit ColPivHouseholderQR(const EigenBase<InputType>& matrix)
       : m_qr(matrix.rows(), matrix.cols()),
         m_hCoeffs((std::min)(matrix.rows(),matrix.cols())),
         m_colsPermutation(PermIndexType(matrix.cols())),
@@ -116,7 +129,7 @@ template<typename _MatrixType> class ColPivHouseholderQR
         m_isInitialized(false),
         m_usePrescribedThreshold(false)
     {
-      compute(matrix);
+      compute(matrix.derived());
     }
 
     /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
@@ -137,15 +150,15 @@ template<typename _MatrixType> class ColPivHouseholderQR
       * Output: \verbinclude ColPivHouseholderQR_solve.out
       */
     template<typename Rhs>
-    inline const internal::solve_retval<ColPivHouseholderQR, Rhs>
+    inline const Solve<ColPivHouseholderQR, Rhs>
     solve(const MatrixBase<Rhs>& b) const
     {
       eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
-      return internal::solve_retval<ColPivHouseholderQR, Rhs>(*this, b.derived());
+      return Solve<ColPivHouseholderQR, Rhs>(*this, b.derived());
     }
 
-    HouseholderSequenceType householderQ(void) const;
-    HouseholderSequenceType matrixQ(void) const
+    HouseholderSequenceType householderQ() const;
+    HouseholderSequenceType matrixQ() const
     {
       return householderQ(); 
     }
@@ -173,7 +186,8 @@ template<typename _MatrixType> class ColPivHouseholderQR
       return m_qr;
     }
     
-    ColPivHouseholderQR& compute(const MatrixType& matrix);
+    template<typename InputType>
+    ColPivHouseholderQR& compute(const EigenBase<InputType>& matrix);
 
     /** \returns a const reference to the column permutation matrix */
     const PermutationType& colsPermutation() const
@@ -283,13 +297,10 @@ template<typename _MatrixType> class ColPivHouseholderQR
       * \note If this matrix is not invertible, the returned matrix has undefined coefficients.
       *       Use isInvertible() to first determine whether this matrix is invertible.
       */
-    inline const
-    internal::solve_retval<ColPivHouseholderQR, typename MatrixType::IdentityReturnType>
-    inverse() const
+    inline const Inverse<ColPivHouseholderQR> inverse() const
     {
       eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
-      return internal::solve_retval<ColPivHouseholderQR,typename MatrixType::IdentityReturnType>
-               (*this, MatrixType::Identity(m_qr.rows(), m_qr.cols()));
+      return Inverse<ColPivHouseholderQR>(*this);
     }
 
     inline Index rows() const { return m_qr.rows(); }
@@ -349,7 +360,7 @@ template<typename _MatrixType> class ColPivHouseholderQR
       return m_usePrescribedThreshold ? m_prescribedThreshold
       // this formula comes from experimenting (see "LU precision tuning" thread on the list)
       // and turns out to be identical to Higham's formula used already in LDLt.
-                                      : NumTraits<Scalar>::epsilon() * m_qr.diagonalSize();
+                                      : NumTraits<Scalar>::epsilon() * RealScalar(m_qr.diagonalSize());
     }
 
     /** \returns the number of nonzero pivots in the QR decomposition.
@@ -381,8 +392,22 @@ template<typename _MatrixType> class ColPivHouseholderQR
       eigen_assert(m_isInitialized && "Decomposition is not initialized.");
       return Success;
     }
+    
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
+    void _solve_impl(const RhsType &rhs, DstType &dst) const;
+    #endif
 
   protected:
+    
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    }
+    
+    void computeInPlace();
+    
     MatrixType m_qr;
     HCoeffsType m_hCoeffs;
     PermutationType m_colsPermutation;
@@ -419,22 +444,34 @@ typename MatrixType::RealScalar ColPivHouseholderQR<MatrixType>::logAbsDetermina
   * \sa class ColPivHouseholderQR, ColPivHouseholderQR(const MatrixType&)
   */
 template<typename MatrixType>
-ColPivHouseholderQR<MatrixType>& ColPivHouseholderQR<MatrixType>::compute(const MatrixType& matrix)
+template<typename InputType>
+ColPivHouseholderQR<MatrixType>& ColPivHouseholderQR<MatrixType>::compute(const EigenBase<InputType>& matrix)
 {
-  using std::abs;
-  Index rows = matrix.rows();
-  Index cols = matrix.cols();
-  Index size = matrix.diagonalSize();
+  check_template_parameters();
   
   // the column permutation is stored as int indices, so just to be sure:
-  eigen_assert(cols<=NumTraits<int>::highest());
+  eigen_assert(matrix.cols()<=NumTraits<int>::highest());
 
   m_qr = matrix;
+  
+  computeInPlace();
+  
+  return *this;
+}
+
+template<typename MatrixType>
+void ColPivHouseholderQR<MatrixType>::computeInPlace()
+{
+  using std::abs;
+  Index rows = m_qr.rows();
+  Index cols = m_qr.cols();
+  Index size = m_qr.diagonalSize();
+  
   m_hCoeffs.resize(size);
 
   m_temp.resize(cols);
 
-  m_colsTranspositions.resize(matrix.cols());
+  m_colsTranspositions.resize(m_qr.cols());
   Index number_of_transpositions = 0;
 
   m_colSqNorms.resize(cols);
@@ -462,20 +499,10 @@ ColPivHouseholderQR<MatrixType>& ColPivHouseholderQR<MatrixType>::compute(const
     // we store that back into our table: it can't hurt to correct our table.
     m_colSqNorms.coeffRef(biggest_col_index) = biggest_col_sq_norm;
 
-    // if the current biggest column is smaller than epsilon times the initial biggest column,
-    // terminate to avoid generating nan/inf values.
-    // Note that here, if we test instead for "biggest == 0", we get a failure every 1000 (or so)
-    // repetitions of the unit test, with the result of solve() filled with large values of the order
-    // of 1/(size*epsilon).
-    if(biggest_col_sq_norm < threshold_helper * RealScalar(rows-k))
-    {
+    // Track the number of meaningful pivots but do not stop the decomposition to make
+    // sure that the initial matrix is properly reproduced. See bug 941.
+    if(m_nonzero_pivots==size && biggest_col_sq_norm < threshold_helper * RealScalar(rows-k))
       m_nonzero_pivots = k;
-      m_hCoeffs.tail(size-k).setZero();
-      m_qr.bottomRightCorner(rows-k,cols-k)
-          .template triangularView<StrictlyLower>()
-          .setZero();
-      break;
-    }
 
     // apply the transposition to the columns
     m_colsTranspositions.coeffRef(k) = biggest_col_index;
@@ -504,65 +531,72 @@ ColPivHouseholderQR<MatrixType>& ColPivHouseholderQR<MatrixType>::compute(const
   }
 
   m_colsPermutation.setIdentity(PermIndexType(cols));
-  for(PermIndexType k = 0; k < m_nonzero_pivots; ++k)
+  for(PermIndexType k = 0; k < size/*m_nonzero_pivots*/; ++k)
     m_colsPermutation.applyTranspositionOnTheRight(k, PermIndexType(m_colsTranspositions.coeff(k)));
 
   m_det_pq = (number_of_transpositions%2) ? -1 : 1;
   m_isInitialized = true;
-
-  return *this;
 }
 
-namespace internal {
-
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<ColPivHouseholderQR<_MatrixType>, Rhs>
-  : solve_retval_base<ColPivHouseholderQR<_MatrixType>, Rhs>
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename _MatrixType>
+template<typename RhsType, typename DstType>
+void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
-  EIGEN_MAKE_SOLVE_HELPERS(ColPivHouseholderQR<_MatrixType>,Rhs)
+  eigen_assert(rhs.rows() == rows());
+
+  const Index nonzero_pivots = nonzeroPivots();
 
-  template<typename Dest> void evalTo(Dest& dst) const
+  if(nonzero_pivots == 0)
   {
-    eigen_assert(rhs().rows() == dec().rows());
+    dst.setZero();
+    return;
+  }
 
-    const Index cols = dec().cols(),
-				nonzero_pivots = dec().nonzeroPivots();
+  typename RhsType::PlainObject c(rhs);
 
-    if(nonzero_pivots == 0)
-    {
-      dst.setZero();
-      return;
-    }
+  // Note that the matrix Q = H_0^* H_1^*... so its inverse is Q^* = (H_0 H_1 ...)^T
+  c.applyOnTheLeft(householderSequence(m_qr, m_hCoeffs)
+                    .setLength(nonzero_pivots)
+                    .transpose()
+    );
 
-    typename Rhs::PlainObject c(rhs());
+  m_qr.topLeftCorner(nonzero_pivots, nonzero_pivots)
+      .template triangularView<Upper>()
+      .solveInPlace(c.topRows(nonzero_pivots));
 
-    // Note that the matrix Q = H_0^* H_1^*... so its inverse is Q^* = (H_0 H_1 ...)^T
-    c.applyOnTheLeft(householderSequence(dec().matrixQR(), dec().hCoeffs())
-                     .setLength(dec().nonzeroPivots())
-		     .transpose()
-      );
+  for(Index i = 0; i < nonzero_pivots; ++i) dst.row(m_colsPermutation.indices().coeff(i)) = c.row(i);
+  for(Index i = nonzero_pivots; i < cols(); ++i) dst.row(m_colsPermutation.indices().coeff(i)).setZero();
+}
+#endif
 
-    dec().matrixR()
-       .topLeftCorner(nonzero_pivots, nonzero_pivots)
-       .template triangularView<Upper>()
-       .solveInPlace(c.topRows(nonzero_pivots));
+namespace internal {
 
-    for(Index i = 0; i < nonzero_pivots; ++i) dst.row(dec().colsPermutation().indices().coeff(i)) = c.row(i);
-    for(Index i = nonzero_pivots; i < cols; ++i) dst.row(dec().colsPermutation().indices().coeff(i)).setZero();
+template<typename DstXprType, typename MatrixType, typename Scalar>
+struct Assignment<DstXprType, Inverse<ColPivHouseholderQR<MatrixType> >, internal::assign_op<Scalar>, Dense2Dense, Scalar>
+{
+  typedef ColPivHouseholderQR<MatrixType> QrType;
+  typedef Inverse<QrType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  {    
+    dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.cols()));
   }
 };
 
 } // end namespace internal
 
-/** \returns the matrix Q as a sequence of householder transformations */
+/** \returns the matrix Q as a sequence of Householder transformations.
+  * To extract only the meaningful part, use:
+  * \code qr.householderQ().setLength(qr.nonzeroPivots()) \endcode*/
 template<typename MatrixType>
 typename ColPivHouseholderQR<MatrixType>::HouseholderSequenceType ColPivHouseholderQR<MatrixType>
   ::householderQ() const
 {
   eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
-  return HouseholderSequenceType(m_qr, m_hCoeffs.conjugate()).setLength(m_nonzero_pivots);
+  return HouseholderSequenceType(m_qr, m_hCoeffs.conjugate());
 }
 
+#ifndef __CUDACC__
 /** \return the column-pivoting Householder QR decomposition of \c *this.
   *
   * \sa class ColPivHouseholderQR
@@ -573,6 +607,7 @@ MatrixBase<Derived>::colPivHouseholderQr() const
 {
   return ColPivHouseholderQR<PlainObject>(eval());
 }
+#endif // __CUDACC__
 
 } // end namespace Eigen
 
diff --git a/nuparu/include/Eigen/src/QR/ColPivHouseholderQR_MKL.h b/nuparu/include/Eigen/src/QR/ColPivHouseholderQR_MKL.h
index b5b19832..1203d0d3 100644
--- a/nuparu/include/Eigen/src/QR/ColPivHouseholderQR_MKL.h
+++ b/nuparu/include/Eigen/src/QR/ColPivHouseholderQR_MKL.h
@@ -41,21 +41,20 @@ namespace Eigen {
 /** \internal Specialization for the data types supported by MKL */
 
 #define EIGEN_MKL_QR_COLPIV(EIGTYPE, MKLTYPE, MKLPREFIX, EIGCOLROW, MKLCOLROW) \
-template<> inline \
+template<> template<typename InputType> inline \
 ColPivHouseholderQR<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> >& \
 ColPivHouseholderQR<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> >::compute( \
-              const Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>& matrix) \
+              const EigenBase<InputType>& matrix) \
 \
 { \
   using std::abs; \
   typedef Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> MatrixType; \
-  typedef MatrixType::Scalar Scalar; \
   typedef MatrixType::RealScalar RealScalar; \
   Index rows = matrix.rows();\
   Index cols = matrix.cols();\
-  Index size = matrix.diagonalSize();\
 \
   m_qr = matrix;\
+  Index size = m_qr.diagonalSize();\
   m_hCoeffs.resize(size);\
 \
   m_colsTranspositions.resize(cols);\
diff --git a/nuparu/include/Eigen/src/QR/FullPivHouseholderQR.h b/nuparu/include/Eigen/src/QR/FullPivHouseholderQR.h
index 0dd5ad34..64fe6b7b 100644
--- a/nuparu/include/Eigen/src/QR/FullPivHouseholderQR.h
+++ b/nuparu/include/Eigen/src/QR/FullPivHouseholderQR.h
@@ -15,6 +15,12 @@ namespace Eigen {
 
 namespace internal {
 
+template<typename _MatrixType> struct traits<FullPivHouseholderQR<_MatrixType> >
+ : traits<_MatrixType>
+{
+  enum { Flags = 0 };
+};
+
 template<typename MatrixType> struct FullPivHouseholderQRMatrixQReturnType;
 
 template<typename MatrixType>
@@ -23,7 +29,7 @@ struct traits<FullPivHouseholderQRMatrixQReturnType<MatrixType> >
   typedef typename MatrixType::PlainObject ReturnType;
 };
 
-}
+} // end namespace internal
 
 /** \ingroup QR_Module
   *
@@ -33,13 +39,13 @@ struct traits<FullPivHouseholderQRMatrixQReturnType<MatrixType> >
   *
   * \param MatrixType the type of the matrix of which we are computing the QR decomposition
   *
-  * This class performs a rank-revealing QR decomposition of a matrix \b A into matrices \b P, \b Q and \b R
+  * This class performs a rank-revealing QR decomposition of a matrix \b A into matrices \b P, \b P', \b Q and \b R
   * such that 
   * \f[
-  *  \mathbf{A} \, \mathbf{P} = \mathbf{Q} \, \mathbf{R}
+  *  \mathbf{P} \, \mathbf{A} \, \mathbf{P}' = \mathbf{Q} \, \mathbf{R}
   * \f]
-  * by using Householder transformations. Here, \b P is a permutation matrix, \b Q a unitary matrix and \b R an 
-  * upper triangular matrix.
+  * by using Householder transformations. Here, \b P and \b P' are permutation matrices, \b Q a unitary matrix 
+  * and \b R an upper triangular matrix.
   *
   * This decomposition performs a very prudent full pivoting in order to be rank-revealing and achieve optimal
   * numerical stability. The trade-off is that it is slower than HouseholderQR and ColPivHouseholderQR.
@@ -60,14 +66,17 @@ template<typename _MatrixType> class FullPivHouseholderQR
     };
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
+    // FIXME should be int
+    typedef typename MatrixType::StorageIndex StorageIndex;
     typedef internal::FullPivHouseholderQRMatrixQReturnType<MatrixType> MatrixQReturnType;
     typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
-    typedef Matrix<Index, 1, ColsAtCompileTime, RowMajor, 1, MaxColsAtCompileTime> IntRowVectorType;
+    typedef Matrix<StorageIndex, 1,
+                   EIGEN_SIZE_MIN_PREFER_DYNAMIC(ColsAtCompileTime,RowsAtCompileTime), RowMajor, 1,
+                   EIGEN_SIZE_MIN_PREFER_FIXED(MaxColsAtCompileTime,MaxRowsAtCompileTime)> IntDiagSizeVectorType;
     typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime> PermutationType;
-    typedef typename internal::plain_col_type<MatrixType, Index>::type IntColVectorType;
     typedef typename internal::plain_row_type<MatrixType>::type RowVectorType;
     typedef typename internal::plain_col_type<MatrixType>::type ColVectorType;
+    typedef typename MatrixType::PlainObject PlainObject;
 
     /** \brief Default Constructor.
       *
@@ -93,10 +102,10 @@ template<typename _MatrixType> class FullPivHouseholderQR
     FullPivHouseholderQR(Index rows, Index cols)
       : m_qr(rows, cols),
         m_hCoeffs((std::min)(rows,cols)),
-        m_rows_transpositions(rows),
-        m_cols_transpositions(cols),
+        m_rows_transpositions((std::min)(rows,cols)),
+        m_cols_transpositions((std::min)(rows,cols)),
         m_cols_permutation(cols),
-        m_temp((std::min)(rows,cols)),
+        m_temp(cols),
         m_isInitialized(false),
         m_usePrescribedThreshold(false) {}
 
@@ -112,25 +121,27 @@ template<typename _MatrixType> class FullPivHouseholderQR
       * 
       * \sa compute()
       */
-    FullPivHouseholderQR(const MatrixType& matrix)
+    template<typename InputType>
+    explicit FullPivHouseholderQR(const EigenBase<InputType>& matrix)
       : m_qr(matrix.rows(), matrix.cols()),
         m_hCoeffs((std::min)(matrix.rows(), matrix.cols())),
-        m_rows_transpositions(matrix.rows()),
-        m_cols_transpositions(matrix.cols()),
+        m_rows_transpositions((std::min)(matrix.rows(), matrix.cols())),
+        m_cols_transpositions((std::min)(matrix.rows(), matrix.cols())),
         m_cols_permutation(matrix.cols()),
-        m_temp((std::min)(matrix.rows(), matrix.cols())),
+        m_temp(matrix.cols()),
         m_isInitialized(false),
         m_usePrescribedThreshold(false)
     {
-      compute(matrix);
+      compute(matrix.derived());
     }
 
     /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
-      * *this is the QR decomposition, if any exists.
+      * \c *this is the QR decomposition.
       *
       * \param b the right-hand-side of the equation to solve.
       *
-      * \returns a solution.
+      * \returns the exact or least-squares solution if the rank is greater than or equal to the number of columns of A,
+      * and an arbitrary solution otherwise.
       *
       * \note The case where b is a matrix is not yet implemented. Also, this
       *       code is space inefficient.
@@ -143,11 +154,11 @@ template<typename _MatrixType> class FullPivHouseholderQR
       * Output: \verbinclude FullPivHouseholderQR_solve.out
       */
     template<typename Rhs>
-    inline const internal::solve_retval<FullPivHouseholderQR, Rhs>
+    inline const Solve<FullPivHouseholderQR, Rhs>
     solve(const MatrixBase<Rhs>& b) const
     {
       eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
-      return internal::solve_retval<FullPivHouseholderQR, Rhs>(*this, b.derived());
+      return Solve<FullPivHouseholderQR, Rhs>(*this, b.derived());
     }
 
     /** \returns Expression object representing the matrix Q
@@ -162,7 +173,8 @@ template<typename _MatrixType> class FullPivHouseholderQR
       return m_qr;
     }
 
-    FullPivHouseholderQR& compute(const MatrixType& matrix);
+    template<typename InputType>
+    FullPivHouseholderQR& compute(const EigenBase<InputType>& matrix);
 
     /** \returns a const reference to the column permutation matrix */
     const PermutationType& colsPermutation() const
@@ -172,7 +184,7 @@ template<typename _MatrixType> class FullPivHouseholderQR
     }
 
     /** \returns a const reference to the vector of indices representing the rows transpositions */
-    const IntColVectorType& rowsTranspositions() const
+    const IntDiagSizeVectorType& rowsTranspositions() const
     {
       eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
       return m_rows_transpositions;
@@ -278,13 +290,11 @@ template<typename _MatrixType> class FullPivHouseholderQR
       *
       * \note If this matrix is not invertible, the returned matrix has undefined coefficients.
       *       Use isInvertible() to first determine whether this matrix is invertible.
-      */    inline const
-    internal::solve_retval<FullPivHouseholderQR, typename MatrixType::IdentityReturnType>
-    inverse() const
+      */
+    inline const Inverse<FullPivHouseholderQR> inverse() const
     {
       eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
-      return internal::solve_retval<FullPivHouseholderQR,typename MatrixType::IdentityReturnType>
-               (*this, MatrixType::Identity(m_qr.rows(), m_qr.cols()));
+      return Inverse<FullPivHouseholderQR>(*this);
     }
 
     inline Index rows() const { return m_qr.rows(); }
@@ -344,7 +354,7 @@ template<typename _MatrixType> class FullPivHouseholderQR
       return m_usePrescribedThreshold ? m_prescribedThreshold
       // this formula comes from experimenting (see "LU precision tuning" thread on the list)
       // and turns out to be identical to Higham's formula used already in LDLt.
-                                      : NumTraits<Scalar>::epsilon() * m_qr.diagonalSize();
+                                      : NumTraits<Scalar>::epsilon() * RealScalar(m_qr.diagonalSize());
     }
 
     /** \returns the number of nonzero pivots in the QR decomposition.
@@ -364,12 +374,26 @@ template<typename _MatrixType> class FullPivHouseholderQR
       *          diagonal coefficient of U.
       */
     RealScalar maxPivot() const { return m_maxpivot; }
+    
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
+    void _solve_impl(const RhsType &rhs, DstType &dst) const;
+    #endif
 
   protected:
+    
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    }
+    
+    void computeInPlace();
+    
     MatrixType m_qr;
     HCoeffsType m_hCoeffs;
-    IntColVectorType m_rows_transpositions;
-    IntRowVectorType m_cols_transpositions;
+    IntDiagSizeVectorType m_rows_transpositions;
+    IntDiagSizeVectorType m_cols_transpositions;
     PermutationType m_cols_permutation;
     RowVectorType m_temp;
     bool m_isInitialized, m_usePrescribedThreshold;
@@ -403,22 +427,35 @@ typename MatrixType::RealScalar FullPivHouseholderQR<MatrixType>::logAbsDetermin
   * \sa class FullPivHouseholderQR, FullPivHouseholderQR(const MatrixType&)
   */
 template<typename MatrixType>
-FullPivHouseholderQR<MatrixType>& FullPivHouseholderQR<MatrixType>::compute(const MatrixType& matrix)
+template<typename InputType>
+FullPivHouseholderQR<MatrixType>& FullPivHouseholderQR<MatrixType>::compute(const EigenBase<InputType>& matrix)
+{
+  check_template_parameters();
+  
+  m_qr = matrix.derived();
+  
+  computeInPlace();
+  
+  return *this;
+}
+
+template<typename MatrixType>
+void FullPivHouseholderQR<MatrixType>::computeInPlace()
 {
   using std::abs;
-  Index rows = matrix.rows();
-  Index cols = matrix.cols();
+  Index rows = m_qr.rows();
+  Index cols = m_qr.cols();
   Index size = (std::min)(rows,cols);
 
-  m_qr = matrix;
+  
   m_hCoeffs.resize(size);
 
   m_temp.resize(cols);
 
-  m_precision = NumTraits<Scalar>::epsilon() * size;
+  m_precision = NumTraits<Scalar>::epsilon() * RealScalar(size);
 
-  m_rows_transpositions.resize(matrix.rows());
-  m_cols_transpositions.resize(matrix.cols());
+  m_rows_transpositions.resize(size);
+  m_cols_transpositions.resize(size);
   Index number_of_transpositions = 0;
 
   RealScalar biggest(0);
@@ -429,13 +466,15 @@ FullPivHouseholderQR<MatrixType>& FullPivHouseholderQR<MatrixType>::compute(cons
   for (Index k = 0; k < size; ++k)
   {
     Index row_of_biggest_in_corner, col_of_biggest_in_corner;
-    RealScalar biggest_in_corner;
+    typedef internal::scalar_score_coeff_op<Scalar> Scoring;
+    typedef typename Scoring::result_type Score;
 
-    biggest_in_corner = m_qr.bottomRightCorner(rows-k, cols-k)
-                            .cwiseAbs()
-                            .maxCoeff(&row_of_biggest_in_corner, &col_of_biggest_in_corner);
+    Score score = m_qr.bottomRightCorner(rows-k, cols-k)
+                      .unaryExpr(Scoring())
+                      .maxCoeff(&row_of_biggest_in_corner, &col_of_biggest_in_corner);
     row_of_biggest_in_corner += k;
     col_of_biggest_in_corner += k;
+    RealScalar biggest_in_corner = internal::abs_knowing_score<Scalar>()(m_qr(row_of_biggest_in_corner, col_of_biggest_in_corner), score);
     if(k==0) biggest = biggest_in_corner;
 
     // if the corner is negligible, then we have less than full rank, and we can finish early
@@ -479,61 +518,55 @@ FullPivHouseholderQR<MatrixType>& FullPivHouseholderQR<MatrixType>::compute(cons
 
   m_det_pq = (number_of_transpositions%2) ? -1 : 1;
   m_isInitialized = true;
-
-  return *this;
 }
 
-namespace internal {
-
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<FullPivHouseholderQR<_MatrixType>, Rhs>
-  : solve_retval_base<FullPivHouseholderQR<_MatrixType>, Rhs>
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename _MatrixType>
+template<typename RhsType, typename DstType>
+void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
-  EIGEN_MAKE_SOLVE_HELPERS(FullPivHouseholderQR<_MatrixType>,Rhs)
+  eigen_assert(rhs.rows() == rows());
+  const Index l_rank = rank();
 
-  template<typename Dest> void evalTo(Dest& dst) const
+  // FIXME introduce nonzeroPivots() and use it here. and more generally,
+  // make the same improvements in this dec as in FullPivLU.
+  if(l_rank==0)
   {
-    const Index rows = dec().rows(), cols = dec().cols();
-    eigen_assert(rhs().rows() == rows);
+    dst.setZero();
+    return;
+  }
 
-    // FIXME introduce nonzeroPivots() and use it here. and more generally,
-    // make the same improvements in this dec as in FullPivLU.
-    if(dec().rank()==0)
-    {
-      dst.setZero();
-      return;
-    }
+  typename RhsType::PlainObject c(rhs);
 
-    typename Rhs::PlainObject c(rhs());
+  Matrix<Scalar,1,RhsType::ColsAtCompileTime> temp(rhs.cols());
+  for (Index k = 0; k < l_rank; ++k)
+  {
+    Index remainingSize = rows()-k;
+    c.row(k).swap(c.row(m_rows_transpositions.coeff(k)));
+    c.bottomRightCorner(remainingSize, rhs.cols())
+      .applyHouseholderOnTheLeft(m_qr.col(k).tail(remainingSize-1),
+                               m_hCoeffs.coeff(k), &temp.coeffRef(0));
+  }
 
-    Matrix<Scalar,1,Rhs::ColsAtCompileTime> temp(rhs().cols());
-    for (Index k = 0; k < dec().rank(); ++k)
-    {
-      Index remainingSize = rows-k;
-      c.row(k).swap(c.row(dec().rowsTranspositions().coeff(k)));
-      c.bottomRightCorner(remainingSize, rhs().cols())
-       .applyHouseholderOnTheLeft(dec().matrixQR().col(k).tail(remainingSize-1),
-                                  dec().hCoeffs().coeff(k), &temp.coeffRef(0));
-    }
+  m_qr.topLeftCorner(l_rank, l_rank)
+      .template triangularView<Upper>()
+      .solveInPlace(c.topRows(l_rank));
 
-    if(!dec().isSurjective())
-    {
-      // is c is in the image of R ?
-      RealScalar biggest_in_upper_part_of_c = c.topRows(   dec().rank()     ).cwiseAbs().maxCoeff();
-      RealScalar biggest_in_lower_part_of_c = c.bottomRows(rows-dec().rank()).cwiseAbs().maxCoeff();
-      // FIXME brain dead
-      const RealScalar m_precision = NumTraits<Scalar>::epsilon() * (std::min)(rows,cols);
-      // this internal:: prefix is needed by at least gcc 3.4 and ICC
-      if(!internal::isMuchSmallerThan(biggest_in_lower_part_of_c, biggest_in_upper_part_of_c, m_precision))
-        return;
-    }
-    dec().matrixQR()
-       .topLeftCorner(dec().rank(), dec().rank())
-       .template triangularView<Upper>()
-       .solveInPlace(c.topRows(dec().rank()));
+  for(Index i = 0; i < l_rank; ++i) dst.row(m_cols_permutation.indices().coeff(i)) = c.row(i);
+  for(Index i = l_rank; i < cols(); ++i) dst.row(m_cols_permutation.indices().coeff(i)).setZero();
+}
+#endif
 
-    for(Index i = 0; i < dec().rank(); ++i) dst.row(dec().colsPermutation().indices().coeff(i)) = c.row(i);
-    for(Index i = dec().rank(); i < cols; ++i) dst.row(dec().colsPermutation().indices().coeff(i)).setZero();
+namespace internal {
+  
+template<typename DstXprType, typename MatrixType, typename Scalar>
+struct Assignment<DstXprType, Inverse<FullPivHouseholderQR<MatrixType> >, internal::assign_op<Scalar>, Dense2Dense, Scalar>
+{
+  typedef FullPivHouseholderQR<MatrixType> QrType;
+  typedef Inverse<QrType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  {    
+    dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.cols()));
   }
 };
 
@@ -547,19 +580,18 @@ template<typename MatrixType> struct FullPivHouseholderQRMatrixQReturnType
   : public ReturnByValue<FullPivHouseholderQRMatrixQReturnType<MatrixType> >
 {
 public:
-  typedef typename MatrixType::Index Index;
-  typedef typename internal::plain_col_type<MatrixType, Index>::type IntColVectorType;
+  typedef typename FullPivHouseholderQR<MatrixType>::IntDiagSizeVectorType IntDiagSizeVectorType;
   typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
   typedef Matrix<typename MatrixType::Scalar, 1, MatrixType::RowsAtCompileTime, RowMajor, 1,
                  MatrixType::MaxRowsAtCompileTime> WorkVectorType;
 
   FullPivHouseholderQRMatrixQReturnType(const MatrixType&       qr,
                                         const HCoeffsType&      hCoeffs,
-                                        const IntColVectorType& rowsTranspositions)
+                                        const IntDiagSizeVectorType& rowsTranspositions)
     : m_qr(qr),
       m_hCoeffs(hCoeffs),
       m_rowsTranspositions(rowsTranspositions)
-      {}
+  {}
 
   template <typename ResultType>
   void evalTo(ResultType& result) const
@@ -589,15 +621,20 @@ template<typename MatrixType> struct FullPivHouseholderQRMatrixQReturnType
     }
   }
 
-    Index rows() const { return m_qr.rows(); }
-    Index cols() const { return m_qr.rows(); }
+  Index rows() const { return m_qr.rows(); }
+  Index cols() const { return m_qr.rows(); }
 
 protected:
   typename MatrixType::Nested m_qr;
   typename HCoeffsType::Nested m_hCoeffs;
-  typename IntColVectorType::Nested m_rowsTranspositions;
+  typename IntDiagSizeVectorType::Nested m_rowsTranspositions;
 };
 
+// template<typename MatrixType>
+// struct evaluator<FullPivHouseholderQRMatrixQReturnType<MatrixType> >
+//  : public evaluator<ReturnByValue<FullPivHouseholderQRMatrixQReturnType<MatrixType> > >
+// {};
+
 } // end namespace internal
 
 template<typename MatrixType>
@@ -607,6 +644,7 @@ inline typename FullPivHouseholderQR<MatrixType>::MatrixQReturnType FullPivHouse
   return MatrixQReturnType(m_qr, m_hCoeffs, m_rows_transpositions);
 }
 
+#ifndef __CUDACC__
 /** \return the full-pivoting Householder QR decomposition of \c *this.
   *
   * \sa class FullPivHouseholderQR
@@ -617,6 +655,7 @@ MatrixBase<Derived>::fullPivHouseholderQr() const
 {
   return FullPivHouseholderQR<PlainObject>(eval());
 }
+#endif // __CUDACC__
 
 } // end namespace Eigen
 
diff --git a/nuparu/include/Eigen/src/QR/HouseholderQR.h b/nuparu/include/Eigen/src/QR/HouseholderQR.h
index abc61bcb..1eb86102 100644
--- a/nuparu/include/Eigen/src/QR/HouseholderQR.h
+++ b/nuparu/include/Eigen/src/QR/HouseholderQR.h
@@ -53,7 +53,8 @@ template<typename _MatrixType> class HouseholderQR
     };
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
+    // FIXME should be int
+    typedef typename MatrixType::StorageIndex StorageIndex;
     typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime, (MatrixType::Flags&RowMajorBit) ? RowMajor : ColMajor, MaxRowsAtCompileTime, MaxRowsAtCompileTime> MatrixQType;
     typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
     typedef typename internal::plain_row_type<MatrixType>::type RowVectorType;
@@ -91,13 +92,14 @@ template<typename _MatrixType> class HouseholderQR
       * 
       * \sa compute()
       */
-    HouseholderQR(const MatrixType& matrix)
+    template<typename InputType>
+    explicit HouseholderQR(const EigenBase<InputType>& matrix)
       : m_qr(matrix.rows(), matrix.cols()),
         m_hCoeffs((std::min)(matrix.rows(),matrix.cols())),
         m_temp(matrix.cols()),
         m_isInitialized(false)
     {
-      compute(matrix);
+      compute(matrix.derived());
     }
 
     /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
@@ -118,11 +120,11 @@ template<typename _MatrixType> class HouseholderQR
       * Output: \verbinclude HouseholderQR_solve.out
       */
     template<typename Rhs>
-    inline const internal::solve_retval<HouseholderQR, Rhs>
+    inline const Solve<HouseholderQR, Rhs>
     solve(const MatrixBase<Rhs>& b) const
     {
       eigen_assert(m_isInitialized && "HouseholderQR is not initialized.");
-      return internal::solve_retval<HouseholderQR, Rhs>(*this, b.derived());
+      return Solve<HouseholderQR, Rhs>(*this, b.derived());
     }
 
     /** This method returns an expression of the unitary matrix Q as a sequence of Householder transformations.
@@ -148,7 +150,8 @@ template<typename _MatrixType> class HouseholderQR
         return m_qr;
     }
 
-    HouseholderQR& compute(const MatrixType& matrix);
+    template<typename InputType>
+    HouseholderQR& compute(const EigenBase<InputType>& matrix);
 
     /** \returns the absolute value of the determinant of the matrix of which
       * *this is the QR decomposition. It has only linear complexity
@@ -187,8 +190,20 @@ template<typename _MatrixType> class HouseholderQR
       * For advanced uses only.
       */
     const HCoeffsType& hCoeffs() const { return m_hCoeffs; }
+    
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
+    void _solve_impl(const RhsType &rhs, DstType &dst) const;
+    #endif
 
   protected:
+    
+    static void check_template_parameters()
+    {
+      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    }
+    
     MatrixType m_qr;
     HCoeffsType m_hCoeffs;
     RowVectorType m_temp;
@@ -218,7 +233,6 @@ namespace internal {
 template<typename MatrixQR, typename HCoeffs>
 void householder_qr_inplace_unblocked(MatrixQR& mat, HCoeffs& hCoeffs, typename MatrixQR::Scalar* tempData = 0)
 {
-  typedef typename MatrixQR::Index Index;
   typedef typename MatrixQR::Scalar Scalar;
   typedef typename MatrixQR::RealScalar RealScalar;
   Index rows = mat.rows();
@@ -251,88 +265,87 @@ void householder_qr_inplace_unblocked(MatrixQR& mat, HCoeffs& hCoeffs, typename
 }
 
 /** \internal */
-template<typename MatrixQR, typename HCoeffs>
-void householder_qr_inplace_blocked(MatrixQR& mat, HCoeffs& hCoeffs,
-                                       typename MatrixQR::Index maxBlockSize=32,
-                                       typename MatrixQR::Scalar* tempData = 0)
+template<typename MatrixQR, typename HCoeffs,
+  typename MatrixQRScalar = typename MatrixQR::Scalar,
+  bool InnerStrideIsOne = (MatrixQR::InnerStrideAtCompileTime == 1 && HCoeffs::InnerStrideAtCompileTime == 1)>
+struct householder_qr_inplace_blocked
 {
-  typedef typename MatrixQR::Index Index;
-  typedef typename MatrixQR::Scalar Scalar;
-  typedef Block<MatrixQR,Dynamic,Dynamic> BlockType;
-
-  Index rows = mat.rows();
-  Index cols = mat.cols();
-  Index size = (std::min)(rows, cols);
-
-  typedef Matrix<Scalar,Dynamic,1,ColMajor,MatrixQR::MaxColsAtCompileTime,1> TempType;
-  TempType tempVector;
-  if(tempData==0)
-  {
-    tempVector.resize(cols);
-    tempData = tempVector.data();
-  }
-
-  Index blockSize = (std::min)(maxBlockSize,size);
-
-  Index k = 0;
-  for (k = 0; k < size; k += blockSize)
+  // This is specialized for MKL-supported Scalar types in HouseholderQR_MKL.h
+  static void run(MatrixQR& mat, HCoeffs& hCoeffs, Index maxBlockSize=32,
+      typename MatrixQR::Scalar* tempData = 0)
   {
-    Index bs = (std::min)(size-k,blockSize);  // actual size of the block
-    Index tcols = cols - k - bs;            // trailing columns
-    Index brows = rows-k;                   // rows of the block
+    typedef typename MatrixQR::Scalar Scalar;
+    typedef Block<MatrixQR,Dynamic,Dynamic> BlockType;
 
-    // partition the matrix:
-    //        A00 | A01 | A02
-    // mat  = A10 | A11 | A12
-    //        A20 | A21 | A22
-    // and performs the qr dec of [A11^T A12^T]^T
-    // and update [A21^T A22^T]^T using level 3 operations.
-    // Finally, the algorithm continue on A22
+    Index rows = mat.rows();
+    Index cols = mat.cols();
+    Index size = (std::min)(rows, cols);
 
-    BlockType A11_21 = mat.block(k,k,brows,bs);
-    Block<HCoeffs,Dynamic,1> hCoeffsSegment = hCoeffs.segment(k,bs);
+    typedef Matrix<Scalar,Dynamic,1,ColMajor,MatrixQR::MaxColsAtCompileTime,1> TempType;
+    TempType tempVector;
+    if(tempData==0)
+    {
+      tempVector.resize(cols);
+      tempData = tempVector.data();
+    }
 
-    householder_qr_inplace_unblocked(A11_21, hCoeffsSegment, tempData);
+    Index blockSize = (std::min)(maxBlockSize,size);
 
-    if(tcols)
+    Index k = 0;
+    for (k = 0; k < size; k += blockSize)
     {
-      BlockType A21_22 = mat.block(k,k+bs,brows,tcols);
-      apply_block_householder_on_the_left(A21_22,A11_21,hCoeffsSegment.adjoint());
+      Index bs = (std::min)(size-k,blockSize);  // actual size of the block
+      Index tcols = cols - k - bs;              // trailing columns
+      Index brows = rows-k;                     // rows of the block
+
+      // partition the matrix:
+      //        A00 | A01 | A02
+      // mat  = A10 | A11 | A12
+      //        A20 | A21 | A22
+      // and perform the QR decomposition of [A11^T A21^T]^T (the block A11_21 below),
+      // then update [A12^T A22^T]^T (the block named A21_22 below) using level 3 operations.
+      // Finally, the algorithm continues on A22.
+
+      BlockType A11_21 = mat.block(k,k,brows,bs);
+      Block<HCoeffs,Dynamic,1> hCoeffsSegment = hCoeffs.segment(k,bs);
+
+      householder_qr_inplace_unblocked(A11_21, hCoeffsSegment, tempData);
+
+      if(tcols)
+      {
+        BlockType A21_22 = mat.block(k,k+bs,brows,tcols);
+        apply_block_householder_on_the_left(A21_22,A11_21,hCoeffsSegment, false); // false == backward
+      }
     }
   }
-}
-
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<HouseholderQR<_MatrixType>, Rhs>
-  : solve_retval_base<HouseholderQR<_MatrixType>, Rhs>
-{
-  EIGEN_MAKE_SOLVE_HELPERS(HouseholderQR<_MatrixType>,Rhs)
+};
 
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    const Index rows = dec().rows(), cols = dec().cols();
-    const Index rank = (std::min)(rows, cols);
-    eigen_assert(rhs().rows() == rows);
+} // end namespace internal
 
-    typename Rhs::PlainObject c(rhs());
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename _MatrixType>
+template<typename RhsType, typename DstType>
+void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
+{
+  const Index rank = (std::min)(rows(), cols());
+  eigen_assert(rhs.rows() == rows());
 
-    // Note that the matrix Q = H_0^* H_1^*... so its inverse is Q^* = (H_0 H_1 ...)^T
-    c.applyOnTheLeft(householderSequence(
-      dec().matrixQR().leftCols(rank),
-      dec().hCoeffs().head(rank)).transpose()
-    );
+  typename RhsType::PlainObject c(rhs);
 
-    dec().matrixQR()
-       .topLeftCorner(rank, rank)
-       .template triangularView<Upper>()
-       .solveInPlace(c.topRows(rank));
+  // Note that the matrix Q = H_0^* H_1^*... so its inverse is Q^* = (H_0 H_1 ...)^T
+  c.applyOnTheLeft(householderSequence(
+    m_qr.leftCols(rank),
+    m_hCoeffs.head(rank)).transpose()
+  );
 
-    dst.topRows(rank) = c.topRows(rank);
-    dst.bottomRows(cols-rank).setZero();
-  }
-};
+  m_qr.topLeftCorner(rank, rank)
+      .template triangularView<Upper>()
+      .solveInPlace(c.topRows(rank));
 
-} // end namespace internal
+  dst.topRows(rank) = c.topRows(rank);
+  dst.bottomRows(cols()-rank).setZero();
+}
+#endif
 
 /** Performs the QR factorization of the given matrix \a matrix. The result of
   * the factorization is stored into \c *this, and a reference to \c *this
@@ -341,23 +354,27 @@ struct solve_retval<HouseholderQR<_MatrixType>, Rhs>
   * \sa class HouseholderQR, HouseholderQR(const MatrixType&)
   */
 template<typename MatrixType>
-HouseholderQR<MatrixType>& HouseholderQR<MatrixType>::compute(const MatrixType& matrix)
+template<typename InputType>
+HouseholderQR<MatrixType>& HouseholderQR<MatrixType>::compute(const EigenBase<InputType>& matrix)
 {
+  check_template_parameters();
+  
   Index rows = matrix.rows();
   Index cols = matrix.cols();
   Index size = (std::min)(rows,cols);
 
-  m_qr = matrix;
+  m_qr = matrix.derived();
   m_hCoeffs.resize(size);
 
   m_temp.resize(cols);
 
-  internal::householder_qr_inplace_blocked(m_qr, m_hCoeffs, 48, m_temp.data());
+  internal::householder_qr_inplace_blocked<MatrixType, HCoeffsType>::run(m_qr, m_hCoeffs, 48, m_temp.data());
 
   m_isInitialized = true;
   return *this;
 }
 
+#ifndef __CUDACC__
 /** \return the Householder QR decomposition of \c *this.
   *
   * \sa class HouseholderQR
@@ -368,6 +385,7 @@ MatrixBase<Derived>::householderQr() const
 {
   return HouseholderQR<PlainObject>(eval());
 }
+#endif // __CUDACC__
 
 } // end namespace Eigen
 
diff --git a/nuparu/include/Eigen/src/QR/HouseholderQR_MKL.h b/nuparu/include/Eigen/src/QR/HouseholderQR_MKL.h
index 5313de60..84ab640a 100644
--- a/nuparu/include/Eigen/src/QR/HouseholderQR_MKL.h
+++ b/nuparu/include/Eigen/src/QR/HouseholderQR_MKL.h
@@ -34,7 +34,7 @@
 #ifndef EIGEN_QR_MKL_H
 #define EIGEN_QR_MKL_H
 
-#include "Eigen/src/Core/util/MKL_support.h"
+#include "../Core/util/MKL_support.h"
 
 namespace Eigen { 
 
@@ -44,18 +44,19 @@ namespace internal {
 
 #define EIGEN_MKL_QR_NOPIV(EIGTYPE, MKLTYPE, MKLPREFIX) \
 template<typename MatrixQR, typename HCoeffs> \
-void householder_qr_inplace_blocked(MatrixQR& mat, HCoeffs& hCoeffs, \
-                                       typename MatrixQR::Index maxBlockSize=32, \
-                                       EIGTYPE* tempData = 0) \
+struct householder_qr_inplace_blocked<MatrixQR, HCoeffs, EIGTYPE, true> \
 { \
-  lapack_int m = mat.rows(); \
-  lapack_int n = mat.cols(); \
-  lapack_int lda = mat.outerStride(); \
-  lapack_int matrix_order = (MatrixQR::IsRowMajor) ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR; \
-  LAPACKE_##MKLPREFIX##geqrf( matrix_order, m, n, (MKLTYPE*)mat.data(), lda, (MKLTYPE*)hCoeffs.data()); \
-  hCoeffs.adjointInPlace(); \
-\
-}
+  static void run(MatrixQR& mat, HCoeffs& hCoeffs, Index = 32, \
+      typename MatrixQR::Scalar* = 0) \
+  { \
+    lapack_int m = (lapack_int) mat.rows(); \
+    lapack_int n = (lapack_int) mat.cols(); \
+    lapack_int lda = (lapack_int) mat.outerStride(); \
+    lapack_int matrix_order = (MatrixQR::IsRowMajor) ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR; \
+    LAPACKE_##MKLPREFIX##geqrf( matrix_order, m, n, (MKLTYPE*)mat.data(), lda, (MKLTYPE*)hCoeffs.data()); \
+    hCoeffs.adjointInPlace(); \
+  } \
+};
 
 EIGEN_MKL_QR_NOPIV(double, double, d)
 EIGEN_MKL_QR_NOPIV(float, float, s)
diff --git a/nuparu/include/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h b/nuparu/include/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
index aa41f434..d9c3113e 100644
--- a/nuparu/include/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
+++ b/nuparu/include/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2012 Desire Nuentsa <desire.nuentsa_wakam@inria.fr>
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -32,49 +33,54 @@ namespace Eigen {
   } // End namespace internal
   
 /**
- * \ingroup SPQRSupport_Module
- * \class SPQR
- * \brief Sparse QR factorization based on SuiteSparseQR library
- * 
- * This class is used to perform a multithreaded and multifrontal rank-revealing QR decomposition 
- * of sparse matrices. The result is then used to solve linear leasts_square systems.
- * Clearly, a QR factorization is returned such that A*P = Q*R where :
- * 
- * P is the column permutation. Use colsPermutation() to get it.
- * 
- * Q is the orthogonal matrix represented as Householder reflectors. 
- * Use matrixQ() to get an expression and matrixQ().transpose() to get the transpose.
- * You can then apply it to a vector.
- * 
- * R is the sparse triangular factor. Use matrixQR() to get it as SparseMatrix.
- * NOTE : The Index type of R is always UF_long. You can get it with SPQR::Index
- * 
- * \tparam _MatrixType The type of the sparse matrix A, must be a column-major SparseMatrix<>
- * NOTE 
- * 
- */
+  * \ingroup SPQRSupport_Module
+  * \class SPQR
+  * \brief Sparse QR factorization based on SuiteSparseQR library
+  *
+  * This class is used to perform a multithreaded and multifrontal rank-revealing QR decomposition
+  * of sparse matrices. The result is then used to solve linear least-squares systems.
+  * Specifically, a QR factorization is returned such that A*P = Q*R where:
+  *
+  * P is the column permutation. Use colsPermutation() to get it.
+  *
+  * Q is the orthogonal matrix represented as Householder reflectors.
+  * Use matrixQ() to get an expression and matrixQ().transpose() to get the transpose.
+  * You can then apply it to a vector.
+  *
+  * R is the sparse triangular factor. Use matrixQR() to get it as SparseMatrix.
+  * NOTE : The index type of R is always SuiteSparse_long. You can get it with SPQR::StorageIndex
+  *
+  * \tparam _MatrixType The type of the sparse matrix A, must be a column-major SparseMatrix<>
+  *
+  * \implsparsesolverconcept
+  *
+  *
+  */
 template<typename _MatrixType>
-class SPQR
+class SPQR : public SparseSolverBase<SPQR<_MatrixType> >
 {
+  protected:
+    typedef SparseSolverBase<SPQR<_MatrixType> > Base;
+    using Base::m_isInitialized;
   public:
     typedef typename _MatrixType::Scalar Scalar;
     typedef typename _MatrixType::RealScalar RealScalar;
-    typedef UF_long Index ; 
-    typedef SparseMatrix<Scalar, ColMajor, Index> MatrixType;
-    typedef PermutationMatrix<Dynamic, Dynamic> PermutationType;
+    typedef SuiteSparse_long StorageIndex ;
+    typedef SparseMatrix<Scalar, ColMajor, StorageIndex> MatrixType;
+    typedef Map<PermutationMatrix<Dynamic, Dynamic, StorageIndex> > PermutationType;
+    enum {
+      ColsAtCompileTime = Dynamic,
+      MaxColsAtCompileTime = Dynamic
+    };
   public:
     SPQR() 
-    : m_ordering(SPQR_ORDERING_DEFAULT),
-      m_allow_tol(SPQR_DEFAULT_TOL),
-      m_tolerance (NumTraits<Scalar>::epsilon())
+      : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon()), m_useDefaultThreshold(true)
     { 
       cholmod_l_start(&m_cc);
     }
     
-    SPQR(const _MatrixType& matrix) 
-    : m_ordering(SPQR_ORDERING_DEFAULT),
-      m_allow_tol(SPQR_DEFAULT_TOL),
-      m_tolerance (NumTraits<Scalar>::epsilon())
+    explicit SPQR(const _MatrixType& matrix)
+    : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon()), m_useDefaultThreshold(true)
     {
       cholmod_l_start(&m_cc);
       compute(matrix);
@@ -82,21 +88,42 @@ class SPQR
     
     ~SPQR()
     {
-      // Calls SuiteSparseQR_free()
+      SPQR_free();
+      cholmod_l_finish(&m_cc);
+    }
+    void SPQR_free()
+    {
       cholmod_l_free_sparse(&m_H, &m_cc);
       cholmod_l_free_sparse(&m_cR, &m_cc);
       cholmod_l_free_dense(&m_HTau, &m_cc);
       std::free(m_E);
       std::free(m_HPinv);
-      cholmod_l_finish(&m_cc);
     }
+
     void compute(const _MatrixType& matrix)
     {
+      if(m_isInitialized) SPQR_free();
+
       MatrixType mat(matrix);
+      
+      /* Compute the default threshold as in MATLAB, see:
+       * Tim Davis, "Algorithm 915, SuiteSparseQR: Multifrontal Multithreaded Rank-Revealing
+       * Sparse QR Factorization", ACM Trans. on Math. Soft. 38(1), 2011, Page 8:3 
+       */
+      RealScalar pivotThreshold = m_tolerance;
+      if(m_useDefaultThreshold) 
+      {
+        RealScalar max2Norm = 0.0;
+        for (int j = 0; j < mat.cols(); j++) max2Norm = numext::maxi(max2Norm, mat.col(j).norm());
+        if(max2Norm==RealScalar(0))
+          max2Norm = RealScalar(1);
+        pivotThreshold = 20 * (mat.rows() + mat.cols()) * max2Norm * NumTraits<RealScalar>::epsilon();
+      }
+      
       cholmod_sparse A; 
       A = viewAsCholmod(mat);
       Index col = matrix.cols();
-      m_rank = SuiteSparseQR<Scalar>(m_ordering, m_tolerance, col, &A, 
+      m_rank = SuiteSparseQR<Scalar>(m_ordering, pivotThreshold, col, &A, 
                              &m_cR, &m_E, &m_H, &m_HPinv, &m_HTau, &m_cc);
 
       if (!m_cR)
@@ -112,41 +139,37 @@ class SPQR
     /** 
      * Get the number of rows of the input matrix and the Q matrix
      */
-    inline Index rows() const {return m_H->nrow; }
+    inline Index rows() const {return m_cR->nrow; }
     
     /** 
      * Get the number of columns of the input matrix. 
      */
     inline Index cols() const { return m_cR->ncol; }
-   
-      /** \returns the solution X of \f$ A X = B \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<SPQR, Rhs> solve(const MatrixBase<Rhs>& B) const 
-    {
-      eigen_assert(m_isInitialized && " The QR factorization should be computed first, call compute()");
-      eigen_assert(this->rows()==B.rows()
-                    && "SPQR::solve(): invalid number of rows of the right hand side matrix B");
-          return internal::solve_retval<SPQR, Rhs>(*this, B.derived());
-    }
     
     template<typename Rhs, typename Dest>
-    void _solve(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
+    void _solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
     {
       eigen_assert(m_isInitialized && " The QR factorization should be computed first, call compute()");
       eigen_assert(b.cols()==1 && "This method is for vectors only");
-      
+
       //Compute Q^T * b
-      Dest y; 
+      typename Dest::PlainObject y, y2;
       y = matrixQ().transpose() * b;
-        // Solves with the triangular matrix R
+      
+      // Solves with the triangular matrix R
       Index rk = this->rank();
-      y.topRows(rk) = this->matrixR().topLeftCorner(rk, rk).template triangularView<Upper>().solve(y.topRows(rk));
-      y.bottomRows(cols()-rk).setZero();
+      y2 = y;
+      y.resize((std::max)(cols(),Index(y.rows())),y.cols());
+      y.topRows(rk) = this->matrixR().topLeftCorner(rk, rk).template triangularView<Upper>().solve(y2.topRows(rk));
+
       // Apply the column permutation 
-      dest.topRows(cols()) = colsPermutation() * y.topRows(cols());
+      // colsPermutation() performs a copy of the permutation,
+      // so let's apply it manually:
+      for(Index i = 0; i < rk; ++i) dest.row(m_E[i]) = y.row(i);
+      for(Index i = rk; i < cols(); ++i) dest.row(m_E[i]).setZero();
+      
+//       y.bottomRows(y.rows()-rk).setZero();
+//       dest = colsPermutation() * y.topRows(cols());
       
       m_info = Success;
     }
@@ -157,7 +180,7 @@ class SPQR
     {
       eigen_assert(m_isInitialized && " The QR factorization should be computed first, call compute()");
       if(!m_isRUpToDate) {
-        m_R = viewAsEigen<Scalar,ColMajor, typename MatrixType::Index>(*m_cR);
+        m_R = viewAsEigen<Scalar,ColMajor, typename MatrixType::StorageIndex>(*m_cR);
         m_isRUpToDate = true;
       }
       return m_R;
@@ -171,11 +194,7 @@ class SPQR
     PermutationType colsPermutation() const
     { 
       eigen_assert(m_isInitialized && "Decomposition is not initialized.");
-      Index n = m_cR->ncol;
-      PermutationType colsPerm(n);
-      for(Index j = 0; j <n; j++) colsPerm.indices()(j) = m_E[j];
-      return colsPerm; 
-      
+      return PermutationType(m_E, m_cR->ncol);
     }
     /**
      * Gets the rank of the matrix. 
@@ -189,7 +208,11 @@ class SPQR
     /// Set the fill-reducing ordering method to be used
     void setSPQROrdering(int ord) { m_ordering = ord;}
     /// Set the tolerance tol to treat columns with 2-norm < =tol as zero
-    void setPivotThreshold(const RealScalar& tol) { m_tolerance = tol; }
+    void setPivotThreshold(const RealScalar& tol)
+    {
+      m_useDefaultThreshold = false;
+      m_tolerance = tol;
+    }
     
     /** \returns a pointer to the SPQR workspace */
     cholmod_common *cholmodCommon() const { return &m_cc; }
@@ -206,7 +229,6 @@ class SPQR
       return m_info;
     }
   protected:
-    bool m_isInitialized;
     bool m_analysisIsOk;
     bool m_factorizationIsOk;
     mutable bool m_isRUpToDate;
@@ -216,12 +238,13 @@ class SPQR
     RealScalar m_tolerance; // treat columns with 2-norm below this tolerance as zero
     mutable cholmod_sparse *m_cR; // The sparse R factor in cholmod format
     mutable MatrixType m_R; // The sparse matrix R in Eigen format
-    mutable Index *m_E; // The permutation applied to columns
+    mutable StorageIndex *m_E; // The permutation applied to columns
     mutable cholmod_sparse *m_H;  //The householder vectors
-    mutable Index *m_HPinv; // The row permutation of H
+    mutable StorageIndex *m_HPinv; // The row permutation of H
     mutable cholmod_dense *m_HTau; // The Householder coefficients
     mutable Index m_rank; // The rank of the matrix
     mutable cholmod_common m_cc; // Workspace and parameters
+    bool m_useDefaultThreshold;     // Use default threshold
     template<typename ,typename > friend struct SPQR_QProduct;
 };
 
@@ -229,7 +252,7 @@ template <typename SPQRType, typename Derived>
 struct SPQR_QProduct : ReturnByValue<SPQR_QProduct<SPQRType,Derived> >
 {
   typedef typename SPQRType::Scalar Scalar;
-  typedef typename SPQRType::Index Index;
+  typedef typename SPQRType::StorageIndex StorageIndex;
   //Define the constructor to get reference to argument types
   SPQR_QProduct(const SPQRType& spqr, const Derived& other, bool transpose) : m_spqr(spqr),m_other(other),m_transpose(transpose) {}
   
@@ -285,22 +308,5 @@ struct SPQRMatrixQTransposeReturnType{
   const SPQRType& m_spqr;
 };
 
-namespace internal {
-  
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<SPQR<_MatrixType>, Rhs>
-  : solve_retval_base<SPQR<_MatrixType>, Rhs>
-{
-  typedef SPQR<_MatrixType> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-} // end namespace internal
-
 }// End namespace Eigen
 #endif
diff --git a/nuparu/include/Eigen/src/SVD/BDCSVD.h b/nuparu/include/Eigen/src/SVD/BDCSVD.h
new file mode 100644
index 00000000..896246e4
--- /dev/null
+++ b/nuparu/include/Eigen/src/SVD/BDCSVD.h
@@ -0,0 +1,1208 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+// 
+// We used the "A Divide-And-Conquer Algorithm for the Bidiagonal SVD"
+// research report written by Ming Gu and Stanley C.Eisenstat
+// The code variable names correspond to the names they used in their 
+// report
+//
+// Copyright (C) 2013 Gauthier Brun <brun.gauthier@gmail.com>
+// Copyright (C) 2013 Nicolas Carre <nicolas.carre@ensimag.fr>
+// Copyright (C) 2013 Jean Ceccato <jean.ceccato@ensimag.fr>
+// Copyright (C) 2013 Pierre Zoppitelli <pierre.zoppitelli@ensimag.fr>
+// Copyright (C) 2013 Jitse Niesen <jitse@maths.leeds.ac.uk>
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BDCSVD_H
+#define EIGEN_BDCSVD_H
+// #define EIGEN_BDCSVD_DEBUG_VERBOSE
+// #define EIGEN_BDCSVD_SANITY_CHECKS
+namespace Eigen {
+
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+static IOFormat bdcsvdfmt(8, 0, ", ", "\n", "  [", "]"); // debug-only print format; internal linkage so several translation units can include this header
+#endif
+  
+template<typename _MatrixType> class BDCSVD;
+
+namespace internal {
+
+template<typename _MatrixType> 
+struct traits<BDCSVD<_MatrixType> >
+{
+  typedef _MatrixType MatrixType;
+};  
+
+} // end namespace internal
+  
+  
+/** \ingroup SVD_Module
+ *
+ *
+ * \class BDCSVD
+ *
+ * \brief class Bidiagonal Divide and Conquer SVD
+ *
+ * \tparam _MatrixType the type of the matrix of which we are computing the SVD decomposition
+ * We plan to have a very similar interface to JacobiSVD on this class.
+ * It should be used to speed up the computation of the SVD for big matrices.
+ */
+template<typename _MatrixType> 
+class BDCSVD : public SVDBase<BDCSVD<_MatrixType> >
+{
+  typedef SVDBase<BDCSVD> Base;
+    
+public:
+  using Base::rows;
+  using Base::cols;
+  using Base::computeU;
+  using Base::computeV;
+  
+  typedef _MatrixType MatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
+  enum {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime, 
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime, 
+    DiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_DYNAMIC(RowsAtCompileTime, ColsAtCompileTime), 
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, 
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, 
+    MaxDiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(MaxRowsAtCompileTime, MaxColsAtCompileTime), 
+    MatrixOptions = MatrixType::Options
+  };
+
+  typedef typename Base::MatrixUType MatrixUType;
+  typedef typename Base::MatrixVType MatrixVType;
+  typedef typename Base::SingularValuesType SingularValuesType;
+  
+  typedef Matrix<Scalar, Dynamic, Dynamic, ColMajor> MatrixX;
+  typedef Matrix<RealScalar, Dynamic, Dynamic, ColMajor> MatrixXr;
+  typedef Matrix<RealScalar, Dynamic, 1> VectorType;
+  typedef Array<RealScalar, Dynamic, 1> ArrayXr;
+  typedef Array<Index,1,Dynamic> ArrayXi;
+  typedef Ref<ArrayXr> ArrayRef;
+  typedef Ref<ArrayXi> IndicesRef;
+
+  /** \brief Default Constructor.
+   *
+   * The default constructor is useful in cases in which the user intends to
+   * perform decompositions via BDCSVD::compute(const MatrixType&).
+   */
+  BDCSVD() : m_algoswap(16), m_numIters(0)
+  {}
+
+
+  /** \brief Default Constructor with memory preallocation
+   *
+   * Like the default constructor but with preallocation of the internal data
+   * according to the specified problem size.
+   * \sa BDCSVD()
+   */
+  BDCSVD(Index rows, Index cols, unsigned int computationOptions = 0)
+    : m_algoswap(16), m_numIters(0)
+  {
+    allocate(rows, cols, computationOptions);
+  }
+
+  /** \brief Constructor performing the decomposition of given matrix.
+   *
+   * \param matrix the matrix to decompose
+   * \param computationOptions optional parameter allowing to specify if you want full or thin U or V unitaries to be computed.
+   *                           By default, none is computed. This is a bit-field, the possible bits are #ComputeFullU, #ComputeThinU,
+   *                           #ComputeFullV, #ComputeThinV.
+   *
+   * Thin unitaries are only available if your matrix type has a Dynamic number of columns (for example MatrixXf).
+   * NOTE(review): earlier wording also cited a FullPivHouseholderQR preconditioner; this class declares no preconditioner parameter - confirm.
+   */
+  BDCSVD(const MatrixType& matrix, unsigned int computationOptions = 0)
+    : m_algoswap(16), m_numIters(0)
+  {
+    compute(matrix, computationOptions);
+  }
+
+  ~BDCSVD() 
+  {
+  }
+  
+  /** \brief Method performing the decomposition of given matrix using custom options.
+   *
+   * \param matrix the matrix to decompose
+   * \param computationOptions optional parameter allowing to specify if you want full or thin U or V unitaries to be computed.
+   *                           By default, none is computed. This is a bit-field, the possible bits are #ComputeFullU, #ComputeThinU,
+   *                           #ComputeFullV, #ComputeThinV.
+   *
+   * Thin unitaries are only available if your matrix type has a Dynamic number of columns (for example MatrixXf).
+   * NOTE(review): earlier wording also cited a FullPivHouseholderQR preconditioner; this class declares no preconditioner parameter - confirm.
+   */
+  BDCSVD& compute(const MatrixType& matrix, unsigned int computationOptions);
+
+  /** \brief Method performing the decomposition of given matrix using current options.
+   *
+   * \param matrix the matrix to decompose
+   *
+   * This method uses the current \a computationOptions, as already passed to the constructor or to compute(const MatrixType&, unsigned int).
+   */
+  BDCSVD& compute(const MatrixType& matrix)
+  {
+    return compute(matrix, this->m_computationOptions);
+  }
+
+  void setSwitchSize(int s) // sets the size below which compute()/divide() hand the problem to JacobiSVD; s must be > 3 (asserted)
+  {
+    eigen_assert(s>3 && "BDCSVD the size of the algo switch has to be greater than 3");
+    m_algoswap = s;
+  }
+ 
+private:
+  void allocate(Index rows, Index cols, unsigned int computationOptions);
+  void divide(Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift);
+  void computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V);
+  void computeSingVals(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm, VectorType& singVals, ArrayRef shifts, ArrayRef mus);
+  void perturbCol0(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm, const VectorType& singVals, const ArrayRef& shifts, const ArrayRef& mus, ArrayRef zhat);
+  void computeSingVecs(const ArrayRef& zhat, const ArrayRef& diag, const IndicesRef& perm, const VectorType& singVals, const ArrayRef& shifts, const ArrayRef& mus, MatrixXr& U, MatrixXr& V);
+  void deflation43(Index firstCol, Index shift, Index i, Index size);
+  void deflation44(Index firstColu , Index firstColm, Index firstRowW, Index firstColW, Index i, Index j, Index size);
+  void deflation(Index firstCol, Index lastCol, Index k, Index firstRowW, Index firstColW, Index shift);
+  template<typename HouseholderU, typename HouseholderV, typename NaiveU, typename NaiveV>
+  void copyUV(const HouseholderU &householderU, const HouseholderV &householderV, const NaiveU &naiveU, const NaiveV &naivev);
+  void structured_update(Block<MatrixXr,Dynamic,Dynamic> A, const MatrixXr &B, Index n1);
+  static RealScalar secularEq(RealScalar x, const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm, const ArrayRef& diagShifted, RealScalar shift);
+
+protected:
+  MatrixXr m_naiveU, m_naiveV; // real work matrices for the singular vectors: filled by divide(), consumed by copyUV()
+  MatrixXr m_computed;         // (m_diagSize+1) x m_diagSize working matrix; compute() reads the singular values from its diagonal
+  Index m_nRec;
+  ArrayXr m_workspace;         // scratch buffer, sized in allocate()
+  ArrayXi m_workspaceI;        // integer scratch buffer, sized in allocate()
+  int m_algoswap;              // problems smaller than this are solved with JacobiSVD (16 by default, see setSwitchSize())
+  bool m_isTranspose, m_compU, m_compV; // all set in allocate(); m_isTranspose is (cols > rows)
+  
+  using Base::m_singularValues;
+  using Base::m_diagSize;
+  using Base::m_computeFullU;
+  using Base::m_computeFullV;
+  using Base::m_computeThinU;
+  using Base::m_computeThinV;
+  using Base::m_matrixU;
+  using Base::m_matrixV;
+  using Base::m_isInitialized;
+  using Base::m_nonzeroSingularValues;
+
+public:  
+  int m_numIters; // initialized to 0 by every constructor
+}; //end class BDCSVD
+
+
+// Records the problem orientation, then allocates and zero-initializes the work matrices and scratch buffers for a rows x cols problem.
+template<typename MatrixType>
+void BDCSVD<MatrixType>::allocate(Index rows, Index cols, unsigned int computationOptions)
+{
+  m_isTranspose = (cols > rows);
+
+  if (Base::allocate(rows, cols, computationOptions))
+    return; // presumably nothing to (re)allocate in this case - confirm against SVDBase::allocate
+  
+  m_computed = MatrixXr::Zero(m_diagSize + 1, m_diagSize );
+  m_compU = computeV(); // U and V are exchanged on purpose, see the note at the top of copyUV()
+  m_compV = computeU();
+  if (m_isTranspose)
+    std::swap(m_compU, m_compV);
+  
+  if (m_compU) m_naiveU = MatrixXr::Zero(m_diagSize + 1, m_diagSize + 1 );
+  else         m_naiveU = MatrixXr::Zero(2, m_diagSize + 1 ); // without m_compU, divide() only reads and writes rows 0 and 1
+  
+  if (m_compV) m_naiveV = MatrixXr::Zero(m_diagSize, m_diagSize);
+  
+  m_workspace.resize((m_diagSize+1)*(m_diagSize+1)*3); // three (m_diagSize+1)^2 slots; structured_update() maps its packed operands onto them
+  m_workspaceI.resize(3*m_diagSize);
+}// end allocate
+
+template<typename MatrixType>
+BDCSVD<MatrixType>& BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsigned int computationOptions) 
+{ // Computes the SVD of `matrix` according to `computationOptions`, stores the results in the base-class members and returns *this.
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+  std::cout << "\n\n\n======================================================================================================================\n\n\n";
+#endif
+  allocate(matrix.rows(), matrix.cols(), computationOptions);
+  using std::abs;
+  
+  //**** step -1 - If the problem is too small, directly fall back to JacobiSVD and return
+  if(matrix.cols() < m_algoswap)
+  {
+    // FIXME this line involves temporaries
+    JacobiSVD<MatrixType> jsvd(matrix,computationOptions);
+    if(computeU()) m_matrixU = jsvd.matrixU();
+    if(computeV()) m_matrixV = jsvd.matrixV();
+    m_singularValues = jsvd.singularValues();
+    m_nonzeroSingularValues = jsvd.nonzeroSingularValues();
+    m_isInitialized = true;
+    return *this;
+  }
+  
+  //**** step 0 - Copy the input matrix and apply scaling to reduce over/under-flows
+  RealScalar scale = matrix.cwiseAbs().maxCoeff();
+  if(scale==RealScalar(0)) scale = RealScalar(1); // all-zero input: avoid dividing by zero below
+  MatrixX copy;
+  if (m_isTranspose) copy = matrix.adjoint()/scale; // work on the adjoint when cols > rows (see allocate())
+  else               copy = matrix/scale;
+  
+  //**** step 1 - Bidiagonalization
+  // FIXME this line involves temporaries
+  internal::UpperBidiagonalization<MatrixX> bid(copy);
+
+  //**** step 2 - Divide & Conquer
+  m_naiveU.setZero();
+  m_naiveV.setZero();
+  // FIXME this line involves a temporary matrix
+  m_computed.topRows(m_diagSize) = bid.bidiagonal().toDenseMatrix().transpose();
+  m_computed.template bottomRows<1>().setZero();
+  divide(0, m_diagSize - 1, 0, 0, 0);
+
+  //**** step 3 - Copy singular values (undoing the scaling of step 0) and vectors
+  for (Index i=0; i<m_diagSize; i++) // Index rather than int: same type as m_diagSize and as the other loops of this file
+  {
+    RealScalar a = abs(m_computed.coeff(i, i));
+    m_singularValues.coeffRef(i) = a * scale;
+    if (a == 0)
+    {
+      m_nonzeroSingularValues = i; // first exact zero found: entries 0..i-1 are the non-zero singular values
+      m_singularValues.tail(m_diagSize - i - 1).setZero();
+      break;
+    }
+    else if (i == m_diagSize - 1)
+    {
+      m_nonzeroSingularValues = i + 1; // no zero encountered: all m_diagSize values are non-zero
+      break;
+    }
+  }
+
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+//   std::cout << "m_naiveU\n" << m_naiveU << "\n\n";
+//   std::cout << "m_naiveV\n" << m_naiveV << "\n\n";
+#endif
+  if(m_isTranspose) copyUV(bid.householderV(), bid.householderU(), m_naiveV, m_naiveU);
+  else              copyUV(bid.householderU(), bid.householderV(), m_naiveU, m_naiveV);
+
+  m_isInitialized = true;
+  return *this;
+}// end compute
+
+
+template<typename MatrixType>
+template<typename HouseholderU, typename HouseholderV, typename NaiveU, typename NaiveV>
+void BDCSVD<MatrixType>::copyUV(const HouseholderU &householderU, const HouseholderV &householderV, const NaiveU &naiveU, const NaiveV &naiveV)
+{
+  // Builds m_matrixU / m_matrixV (when requested) from the naive factors. Note exchange of U and V: m_matrixU is set from naiveV and vice versa
+  if (computeU())
+  {
+    Index Ucols = m_computeThinU ? m_diagSize : householderU.cols(); // thin U keeps only m_diagSize columns
+    m_matrixU = MatrixX::Identity(householderU.cols(), Ucols);
+    m_matrixU.topLeftCorner(m_diagSize, m_diagSize) = naiveV.template cast<Scalar>().topLeftCorner(m_diagSize, m_diagSize);
+    householderU.applyThisOnTheLeft(m_matrixU); // FIXME this line involves a temporary buffer
+  }
+  if (computeV())
+  {
+    Index Vcols = m_computeThinV ? m_diagSize : householderV.cols(); // thin V keeps only m_diagSize columns
+    m_matrixV = MatrixX::Identity(householderV.cols(), Vcols);
+    m_matrixV.topLeftCorner(m_diagSize, m_diagSize) = naiveU.template cast<Scalar>().topLeftCorner(m_diagSize, m_diagSize);
+    householderV.applyThisOnTheLeft(m_matrixV); // FIXME this line involves a temporary buffer
+  }
+}
+
+/** \internal
+  * Performs A = A * B exploiting the special structure of the matrix A. Splitting A as:
+  *  A = [A1]
+  *      [A2]
+  * such that A1.rows()==n1, then we assume that at least half of the columns of A1 and A2 are zeros.
+  * We can thus pack them prior to the matrix product. However, this is only worth the effort if the matrix is large
+  * enough.
+  */
+template<typename MatrixType>
+void BDCSVD<MatrixType>::structured_update(Block<MatrixXr,Dynamic,Dynamic> A, const MatrixXr &B, Index n1)
+{
+  Index n = A.rows();
+  if(n>100)
+  {
+    // If the matrices are large enough, let's exploit the sparse structure of A by
+    // splitting it in half (wrt n1), and packing the non-zero columns.
+    Index n2 = n - n1;
+    Map<MatrixXr> A1(m_workspace.data()      , n1, n);
+    Map<MatrixXr> A2(m_workspace.data()+ n1*n, n2, n);
+    Map<MatrixXr> B1(m_workspace.data()+  n*n, n,  n);
+    Map<MatrixXr> B2(m_workspace.data()+2*n*n, n,  n);
+    Index k1=0, k2=0;
+    for(Index j=0; j<n; ++j)
+    {
+      if( (A.col(j).head(n1).array()!=0).any() )
+      {
+        A1.col(k1) = A.col(j).head(n1);
+        B1.row(k1) = B.row(j);
+        ++k1;
+      }
+      if( (A.col(j).tail(n2).array()!=0).any() )
+      {
+        A2.col(k2) = A.col(j).tail(n2);
+        B2.row(k2) = B.row(j);
+        ++k2;
+      }
+    }
+  
+    A.topRows(n1).noalias()    = A1.leftCols(k1) * B1.topRows(k1);
+    A.bottomRows(n2).noalias() = A2.leftCols(k2) * B2.topRows(k2);
+  }
+  else
+  {
+    Map<MatrixXr,Aligned> tmp(m_workspace.data(),n,n);
+    tmp.noalias() = A*B;
+    A = tmp;
+  }
+}
+
+// The divide algorithm is done "in place": we are always working on subsets of the same matrix. The divide method takes as arguments the
+// position of the submatrix we are currently working on.
+
+//@param firstCol : The Index of the first column of the submatrix of m_computed and for m_naiveU;
+//@param lastCol : The Index of the last column of the submatrix of m_computed and for m_naiveU; 
+// lastCol + 1 - firstCol is the size of the submatrix.
+//@param firstRowW : The Index of the first row of the matrix W that we are to change. (see the reference paper section 1 for more information on W)
+//@param firstColW : Same as firstRowW with the column.
+//@param shift : Each time one takes the left submatrix, one must add 1 to the shift. Why? Because! We actually want the last column of the U submatrix 
+// to become the first column (*coeff) and to shift all the other columns to the right. There are more details on the reference paper.
+template<typename MatrixType>
+void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift)
+{
+  // requires rows = cols + 1;
+  using std::pow;
+  using std::sqrt;
+  using std::abs;
+  const Index n = lastCol - firstCol + 1;
+  const Index k = n/2;
+  RealScalar alphaK;
+  RealScalar betaK; 
+  RealScalar r0; 
+  RealScalar lambda, phi, c0, s0;
+  VectorType l, f;
+  // We use the other algorithm which is more efficient for small 
+  // matrices.
+  if (n < m_algoswap)
+  {
+    // FIXME this line involves temporaries
+    JacobiSVD<MatrixXr> b(m_computed.block(firstCol, firstCol, n + 1, n), ComputeFullU | (m_compV ? ComputeFullV : 0));
+    if (m_compU)
+      m_naiveU.block(firstCol, firstCol, n + 1, n + 1).real() = b.matrixU();
+    else 
+    {
+      m_naiveU.row(0).segment(firstCol, n + 1).real() = b.matrixU().row(0);
+      m_naiveU.row(1).segment(firstCol, n + 1).real() = b.matrixU().row(n);
+    }
+    if (m_compV) m_naiveV.block(firstRowW, firstColW, n, n).real() = b.matrixV();
+    m_computed.block(firstCol + shift, firstCol + shift, n + 1, n).setZero();
+    m_computed.diagonal().segment(firstCol + shift, n) = b.singularValues().head(n);
+    return;
+  }
+  // We use the divide and conquer algorithm
+  alphaK =  m_computed(firstCol + k, firstCol + k);
+  betaK = m_computed(firstCol + k + 1, firstCol + k);
+  // The divide must be done in that order in order to have good results. Divide change the data inside the submatrices
+  // and the divide of the right submatrice reads one column of the left submatrice. That's why we need to treat the 
+  // right submatrix before the left one. 
+  divide(k + 1 + firstCol, lastCol, k + 1 + firstRowW, k + 1 + firstColW, shift);
+  divide(firstCol, k - 1 + firstCol, firstRowW, firstColW + 1, shift + 1);
+
+  if (m_compU)
+  {
+    lambda = m_naiveU(firstCol + k, firstCol + k);
+    phi = m_naiveU(firstCol + k + 1, lastCol + 1);
+  } 
+  else 
+  {
+    lambda = m_naiveU(1, firstCol + k);
+    phi = m_naiveU(0, lastCol + 1);
+  }
+  r0 = sqrt((abs(alphaK * lambda) * abs(alphaK * lambda)) + abs(betaK * phi) * abs(betaK * phi));
+  if (m_compU)
+  {
+    l = m_naiveU.row(firstCol + k).segment(firstCol, k);
+    f = m_naiveU.row(firstCol + k + 1).segment(firstCol + k + 1, n - k - 1);
+  } 
+  else 
+  {
+    l = m_naiveU.row(1).segment(firstCol, k);
+    f = m_naiveU.row(0).segment(firstCol + k + 1, n - k - 1);
+  }
+  if (m_compV) m_naiveV(firstRowW+k, firstColW) = 1;
+  if (r0 == 0)
+  {
+    c0 = 1;
+    s0 = 0;
+  }
+  else
+  {
+    c0 = alphaK * lambda / r0;
+    s0 = betaK * phi / r0;
+  }
+  
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  assert(m_naiveU.allFinite());
+  assert(m_naiveV.allFinite());
+  assert(m_computed.allFinite());
+#endif
+  
+  if (m_compU)
+  {
+    MatrixXr q1 (m_naiveU.col(firstCol + k).segment(firstCol, k + 1));     
+    // we shift Q1 to the right
+    for (Index i = firstCol + k - 1; i >= firstCol; i--) 
+      m_naiveU.col(i + 1).segment(firstCol, k + 1) = m_naiveU.col(i).segment(firstCol, k + 1);
+    // we shift q1 at the left with a factor c0
+    m_naiveU.col(firstCol).segment( firstCol, k + 1) = (q1 * c0);
+    // last column = q1 * - s0
+    m_naiveU.col(lastCol + 1).segment(firstCol, k + 1) = (q1 * ( - s0));
+    // first column = q2 * s0
+    m_naiveU.col(firstCol).segment(firstCol + k + 1, n - k) = m_naiveU.col(lastCol + 1).segment(firstCol + k + 1, n - k) * s0; 
+    // q2 *= c0
+    m_naiveU.col(lastCol + 1).segment(firstCol + k + 1, n - k) *= c0;
+  } 
+  else 
+  {
+    RealScalar q1 = m_naiveU(0, firstCol + k);
+    // we shift Q1 to the right
+    for (Index i = firstCol + k - 1; i >= firstCol; i--) 
+      m_naiveU(0, i + 1) = m_naiveU(0, i);
+    // we shift q1 at the left with a factor c0
+    m_naiveU(0, firstCol) = (q1 * c0);
+    // last column = q1 * - s0
+    m_naiveU(0, lastCol + 1) = (q1 * ( - s0));
+    // first column = q2 * s0
+    m_naiveU(1, firstCol) = m_naiveU(1, lastCol + 1) *s0; 
+    // q2 *= c0
+    m_naiveU(1, lastCol + 1) *= c0;
+    m_naiveU.row(1).segment(firstCol + 1, k).setZero();
+    m_naiveU.row(0).segment(firstCol + k + 1, n - k - 1).setZero();
+  }
+  
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  assert(m_naiveU.allFinite());
+  assert(m_naiveV.allFinite());
+  assert(m_computed.allFinite());
+#endif
+  
+  m_computed(firstCol + shift, firstCol + shift) = r0;
+  m_computed.col(firstCol + shift).segment(firstCol + shift + 1, k) = alphaK * l.transpose().real();
+  m_computed.col(firstCol + shift).segment(firstCol + shift + k + 1, n - k - 1) = betaK * f.transpose().real();
+
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+  ArrayXr tmp1 = (m_computed.block(firstCol+shift, firstCol+shift, n, n)).jacobiSvd().singularValues();
+#endif
+  // Second part: try to deflate singular values in combined matrix
+  deflation(firstCol, lastCol, k, firstRowW, firstColW, shift);
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+  ArrayXr tmp2 = (m_computed.block(firstCol+shift, firstCol+shift, n, n)).jacobiSvd().singularValues();
+  std::cout << "\n\nj1 = " << tmp1.transpose().format(bdcsvdfmt) << "\n";
+  std::cout << "j2 = " << tmp2.transpose().format(bdcsvdfmt) << "\n\n";
+  std::cout << "err:      " << ((tmp1-tmp2).abs()>1e-12*tmp2.abs()).transpose() << "\n";
+  static int count = 0;
+  std::cout << "# " << ++count << "\n\n";
+  assert((tmp1-tmp2).matrix().norm() < 1e-14*tmp2.matrix().norm());
+//   assert(count<681);
+//   assert(((tmp1-tmp2).abs()<1e-13*tmp2.abs()).all());
+#endif
+  
+  // Third part: compute SVD of combined matrix
+  MatrixXr UofSVD, VofSVD;
+  VectorType singVals;
+  computeSVDofM(firstCol + shift, n, UofSVD, singVals, VofSVD);
+  
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  assert(UofSVD.allFinite());
+  assert(VofSVD.allFinite());
+#endif
+  
+  if (m_compU)
+    structured_update(m_naiveU.block(firstCol, firstCol, n + 1, n + 1), UofSVD, (n+2)/2);
+  else
+  {
+    Map<Matrix<RealScalar,2,Dynamic>,Aligned> tmp(m_workspace.data(),2,n+1);
+    tmp.noalias() = m_naiveU.middleCols(firstCol, n+1) * UofSVD;
+    m_naiveU.middleCols(firstCol, n + 1) = tmp;
+  }
+  
+  if (m_compV)  structured_update(m_naiveV.block(firstRowW, firstColW, n, n), VofSVD, (n+1)/2);
+  
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  assert(m_naiveU.allFinite());
+  assert(m_naiveV.allFinite());
+  assert(m_computed.allFinite());
+#endif
+  
+  m_computed.block(firstCol + shift, firstCol + shift, n, n).setZero();
+  m_computed.block(firstCol + shift, firstCol + shift, n, n).diagonal() = singVals;
+}// end divide
+
+// Compute SVD of m_computed.block(firstCol, firstCol, n + 1, n); this block only has non-zeros in
+// the first column and on the diagonal and has undergone deflation, so diagonal is in increasing
+// order except for possibly the (0,0) entry. The computed SVD is stored U, singVals and V, except
+// that if m_compV is false, then V is not computed. Singular values are sorted in decreasing order.
+//
+// TODO Opportunities for optimization: better root finding algo, better stopping criterion, better
+// handling of round-off errors, be consistent in ordering
+// For instance, to solve the secular equation using FMM, see http://www.stat.uchicago.edu/~lekheng/courses/302/classics/greengard-rokhlin.pdf
+template <typename MatrixType>
+void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V)
+{
+  ArrayRef col0 = m_computed.col(firstCol).segment(firstCol, n);
+  m_workspace.head(n) =  m_computed.block(firstCol, firstCol, n, n).diagonal();
+  ArrayRef diag = m_workspace.head(n);
+  diag(0) = 0;
+
+  // Allocate space for singular values and vectors
+  singVals.resize(n);
+  U.resize(n+1, n+1);
+  if (m_compV) V.resize(n, n);
+
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+  if (col0.hasNaN() || diag.hasNaN())
+    std::cout << "\n\nHAS NAN\n\n";
+#endif
+  
+  // Many singular values might have been deflated, the zero ones have been moved to the end,
+  // but others are interleaved and we must ignore them at this stage.
+  // To this end, let's compute a permutation skipping them:
+  Index actual_n = n;
+  while(actual_n>1 && diag(actual_n-1)==0) --actual_n;
+  Index m = 0; // size of the deflated problem
+  for(Index k=0;k<actual_n;++k)
+    if(col0(k)!=0)
+      m_workspaceI(m++) = k;
+  Map<ArrayXi> perm(m_workspaceI.data(),m);
+  
+  Map<ArrayXr> shifts(m_workspace.data()+1*n, n);
+  Map<ArrayXr> mus(m_workspace.data()+2*n, n);
+  Map<ArrayXr> zhat(m_workspace.data()+3*n, n);
+
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+  std::cout << "computeSVDofM using:\n";
+  std::cout << "  z: " << col0.transpose() << "\n";
+  std::cout << "  d: " << diag.transpose() << "\n";
+#endif
+  
+  // Compute singVals, shifts, and mus
+  computeSingVals(col0, diag, perm, singVals, shifts, mus);
+  
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+  std::cout << "  j:        " << (m_computed.block(firstCol, firstCol, n, n)).jacobiSvd().singularValues().transpose().reverse() << "\n\n";
+  std::cout << "  sing-val: " << singVals.transpose() << "\n";
+  std::cout << "  mu:       " << mus.transpose() << "\n";
+  std::cout << "  shift:    " << shifts.transpose() << "\n";
+  
+  {
+    Index actual_n = n;
+    while(actual_n>1 && col0(actual_n-1)==0) --actual_n;
+    std::cout << "\n\n    mus:    " << mus.head(actual_n).transpose() << "\n\n";
+    std::cout << "    check1 (expect0) : " << ((singVals.array()-(shifts+mus)) / singVals.array()).head(actual_n).transpose() << "\n\n";
+    std::cout << "    check2 (>0)      : " << ((singVals.array()-diag) / singVals.array()).head(actual_n).transpose() << "\n\n";
+    std::cout << "    check3 (>0)      : " << ((diag.segment(1,actual_n-1)-singVals.head(actual_n-1).array()) / singVals.head(actual_n-1).array()).transpose() << "\n\n\n";
+    std::cout << "    check4 (>0)      : " << ((singVals.segment(1,actual_n-1)-singVals.head(actual_n-1))).transpose() << "\n\n\n";
+  }
+#endif
+  
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  assert(singVals.allFinite());
+  assert(mus.allFinite());
+  assert(shifts.allFinite());
+#endif
+  
+  // Compute zhat
+  perturbCol0(col0, diag, perm, singVals, shifts, mus, zhat);
+#ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
+  std::cout << "  zhat: " << zhat.transpose() << "\n";
+#endif
+  
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  assert(zhat.allFinite());
+#endif
+  
+  computeSingVecs(zhat, diag, perm, singVals, shifts, mus, U, V);
+  
+#ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
+  std::cout << "U^T U: " << (U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() << "\n";
+  std::cout << "V^T V: " << (V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() << "\n";
+#endif
+  
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  assert(U.allFinite());
+  assert(V.allFinite());
+  assert((U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() < 1e-14 * n);
+  assert((V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() < 1e-14 * n);
+  assert(m_naiveU.allFinite());
+  assert(m_naiveV.allFinite());
+  assert(m_computed.allFinite());
+#endif
+  
+  // Because of deflation, the singular values might not be completely sorted.
+  // Fortunately, reordering them is a O(n) problem
+  for(Index i=0; i<actual_n-1; ++i)
+  {
+    if(singVals(i)>singVals(i+1))
+    {
+      using std::swap;
+      swap(singVals(i),singVals(i+1));
+      U.col(i).swap(U.col(i+1));
+      if(m_compV) V.col(i).swap(V.col(i+1));
+    }
+  }
+  
+  // Reverse the order so that the singular values end up sorted in decreasing order.
+  // Because of deflation, the zero singular values are already at the end.
+  singVals.head(actual_n).reverseInPlace();
+  U.leftCols(actual_n).rowwise().reverseInPlace();
+  if (m_compV) V.leftCols(actual_n).rowwise().reverseInPlace();
+  
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+  JacobiSVD<MatrixXr> jsvd(m_computed.block(firstCol, firstCol, n, n) );
+  std::cout << "  * j:        " << jsvd.singularValues().transpose() << "\n\n";
+  std::cout << "  * sing-val: " << singVals.transpose() << "\n";
+//   std::cout << "  * err:      " << ((jsvd.singularValues()-singVals)>1e-13*singVals.norm()).transpose() << "\n";
+#endif
+}
+
+template <typename MatrixType>
+typename BDCSVD<MatrixType>::RealScalar BDCSVD<MatrixType>::secularEq(RealScalar mu, const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm, const ArrayRef& diagShifted, RealScalar shift)
+{
+  // Returns 1 + sum_j |col0(j)|^2 / ((diagShifted(j) - mu) * (diag(j) + shift + mu)), with j running over perm.
+  RealScalar sum = 1;
+  for(Index pos=0, count=perm.size(); pos<count; ++pos)
+  {
+    const Index j = perm(pos);
+    sum += numext::abs2(col0(j)) / ((diagShifted(j) - mu) * (diag(j) + shift + mu));
+  }
+  return sum;
+}
+
+template <typename MatrixType>
+void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm,
+                                         VectorType& singVals, ArrayRef shifts, ArrayRef mus)
+{
+  using std::abs;
+  using std::swap;
+  // For each k in [0, n) this fills singVals(k), shifts(k) and mus(k), with singVals(k) = shifts(k) + mus(k).
+  Index n = col0.size();
+  Index actual_n = n;
+  while(actual_n>1 && col0(actual_n-1)==0) --actual_n; // ignore trailing entries whose col0 coefficient is exactly zero
+
+  for (Index k = 0; k < n; ++k)
+  {
+    if (col0(k) == 0 || actual_n==1)
+    {
+      // if col0(k) == 0, then entry is deflated, so singular value is on diagonal
+      // if actual_n==1, then the deflated problem is already diagonalized
+      singVals(k) = k==0 ? col0(0) : diag(k);
+      mus(k) = 0;
+      shifts(k) = k==0 ? col0(0) : diag(k);
+      continue;
+    } 
+
+    // otherwise, use secular equation to find singular value
+    RealScalar left = diag(k);
+    RealScalar right; // was: = (k != actual_n-1) ? diag(k+1) : (diag(actual_n-1) + col0.matrix().norm());
+    if(k==actual_n-1)
+      right = (diag(actual_n-1) + col0.matrix().norm());
+    else
+    {
+      // Skip deflated singular values
+      Index l = k+1;
+      while(col0(l)==0) { ++l; eigen_internal_assert(l<actual_n); }
+      right = diag(l);
+    }
+
+    // first decide whether it's closer to the left end or the right end
+    RealScalar mid = left + (right-left) / 2;
+    RealScalar fMid = secularEq(mid, col0, diag, perm, diag, 0);
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+    std::cout << right-left << "\n";
+    std::cout << "fMid = " << fMid << " " << secularEq(mid-left, col0, diag, perm, diag-left, left) << " " << secularEq(mid-right, col0, diag, perm, diag-right, right)   << "\n";
+    std::cout << "     = " << secularEq(0.1*(left+right), col0, diag, perm, diag, 0)
+              << " "       << secularEq(0.2*(left+right), col0, diag, perm, diag, 0)
+              << " "       << secularEq(0.3*(left+right), col0, diag, perm, diag, 0)
+              << " "       << secularEq(0.4*(left+right), col0, diag, perm, diag, 0)
+              << " "       << secularEq(0.49*(left+right), col0, diag, perm, diag, 0)
+              << " "       << secularEq(0.5*(left+right), col0, diag, perm, diag, 0)
+              << " "       << secularEq(0.51*(left+right), col0, diag, perm, diag, 0)
+              << " "       << secularEq(0.6*(left+right), col0, diag, perm, diag, 0)
+              << " "       << secularEq(0.7*(left+right), col0, diag, perm, diag, 0)
+              << " "       << secularEq(0.8*(left+right), col0, diag, perm, diag, 0)
+              << " "       << secularEq(0.9*(left+right), col0, diag, perm, diag, 0) << "\n";
+#endif
+    RealScalar shift = (k == actual_n-1 || fMid > 0) ? left : right; // left for the last active index or when fMid > 0, right otherwise
+    
+    // measure everything relative to shift
+    Map<ArrayXr> diagShifted(m_workspace.data()+4*n, n); // view on m_workspace at offset 4*n, overwritten for every k
+    diagShifted = diag - shift;
+    
+    // initial guess
+    RealScalar muPrev, muCur;
+    if (shift == left)
+    {
+      muPrev = (right - left) * 0.1;
+      if (k == actual_n-1) muCur = right - left;
+      else                 muCur = (right - left) * 0.5; 
+    }
+    else
+    {
+      muPrev = -(right - left) * 0.1;
+      muCur = -(right - left) * 0.5;
+    }
+
+    RealScalar fPrev = secularEq(muPrev, col0, diag, perm, diagShifted, shift);
+    RealScalar fCur = secularEq(muCur, col0, diag, perm, diagShifted, shift);
+    if (abs(fPrev) < abs(fCur)) // make (muCur, fCur) the sample with the smaller |f|
+    {
+      swap(fPrev, fCur);
+      swap(muPrev, muCur);
+    }
+
+    // rational interpolation: fit a function of the form a / mu + b through the two previous
+    // iterates and use its zero to compute the next iterate
+    bool useBisection = fPrev*fCur>0; // both samples have the same sign: skip the interpolation loop and bisect
+    while (fCur!=0 && abs(muCur - muPrev) > 8 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(abs(muCur), abs(muPrev)) && abs(fCur - fPrev)>NumTraits<RealScalar>::epsilon() && !useBisection)
+    {
+      ++m_numIters;
+
+      // Find a and b such that the function f(mu) = a / mu + b matches the current and previous samples.
+      RealScalar a = (fCur - fPrev) / (1/muCur - 1/muPrev);
+      RealScalar b = fCur - a / muCur;
+      // And find mu such that f(mu)==0:
+      RealScalar muZero = -a/b;
+      RealScalar fZero = secularEq(muZero, col0, diag, perm, diagShifted, shift);
+      
+      muPrev = muCur;
+      fPrev = fCur;
+      muCur = muZero;
+      fCur = fZero;
+      
+      
+      if (shift == left  && (muCur < 0 || muCur > right - left)) useBisection = true; // iterate left [0, right-left]
+      if (shift == right && (muCur < -(right - left) || muCur > 0)) useBisection = true; // iterate left [-(right-left), 0]
+      if (abs(fCur)>abs(fPrev)) useBisection = true; // the residual grew instead of shrinking
+    }
+
+    // fall back on bisection method if rational interpolation did not work
+    if (useBisection)
+    {
+#ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
+      std::cout << "useBisection for k = " << k << ", actual_n = " << actual_n << "\n";
+#endif
+      RealScalar leftShifted, rightShifted;
+      if (shift == left)
+      {
+        leftShifted = RealScalar(1)/NumTraits<RealScalar>::highest(); // tiny positive value rather than exactly 0
+        // I don't understand why the case k==0 would be special there:
+        // if (k == 0) rightShifted = right - left; else 
+        rightShifted = (k==actual_n-1) ? right : ((right - left) * 0.6); // theoretically we can take 0.5, but let's be safe
+      }
+      else
+      {
+        leftShifted = -(right - left) * 0.6;
+        rightShifted = -RealScalar(1)/NumTraits<RealScalar>::highest(); // tiny negative value rather than exactly 0
+      }
+      
+      RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift);
+
+#if defined EIGEN_INTERNAL_DEBUGGING || defined EIGEN_BDCSVD_DEBUG_VERBOSE
+      RealScalar fRight = secularEq(rightShifted, col0, diag, perm, diagShifted, shift);
+#endif
+
+#ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
+      if(!(fLeft * fRight<0))
+        std::cout << k << " : " <<  fLeft << " * " << fRight << " == " << fLeft * fRight << "  ;  " << left << " - " << right << " -> " <<  leftShifted << " " << rightShifted << "   shift=" << shift << "\n";
+#endif
+      eigen_internal_assert(fLeft * fRight < 0);
+      
+      while (rightShifted - leftShifted > 2 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(abs(leftShifted), abs(rightShifted)))
+      {
+        RealScalar midShifted = (leftShifted + rightShifted) / 2;
+        fMid = secularEq(midShifted, col0, diag, perm, diagShifted, shift);
+        if (fLeft * fMid < 0)
+        {
+          rightShifted = midShifted;
+        }
+        else
+        {
+          leftShifted = midShifted;
+          fLeft = fMid;
+        }
+      }
+
+      muCur = (leftShifted + rightShifted) / 2; // midpoint of the final bracket
+    }
+      
+    singVals[k] = shift + muCur;
+    shifts[k] = shift;
+    mus[k] = muCur;
+
+    // perturb singular value slightly if it equals diagonal entry to avoid division by zero later
+    // (deflation is supposed to avoid this from happening)
+    // - this does not seem to be necessary anymore -
+//     if (singVals[k] == left) singVals[k] *= 1 + NumTraits<RealScalar>::epsilon();
+//     if (singVals[k] == right) singVals[k] *= 1 - NumTraits<RealScalar>::epsilon();
+  }
+}
+
+
+// Computes zhat, a perturbation of col0 for which singular vectors can be computed stably (see Section 3.1).
+template <typename MatrixType>
+void BDCSVD<MatrixType>::perturbCol0
+   (const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm, const VectorType& singVals,
+    const ArrayRef& shifts, const ArrayRef& mus, ArrayRef zhat)
+{
+  using std::sqrt;
+  Index n = col0.size();
+  Index m = perm.size();
+  if(m==0) // perm is empty: every entry of zhat is set to zero
+  {
+    zhat.setZero();
+    return;
+  }
+  Index last = perm(m-1);
+  // The products below run over perm only, which permits to skip deflated entries while computing zhat
+  for (Index k = 0; k < n; ++k)
+  {
+    if (col0(k) == 0) // deflated
+      zhat(k) = 0;
+    else
+    {
+      // see equation (3.6)
+      RealScalar dk = diag(k);
+      RealScalar prod = (singVals(last) + dk) * (mus(last) + (shifts(last) - dk));
+
+      for(Index l = 0; l<m; ++l)
+      {
+        Index i = perm(l);
+        if(i!=k)
+        {
+          Index j = i<k ? i : perm(l-1); // for i>k the singular value of the previous entry of perm is used
+          prod *= ((singVals(j)+dk) / ((diag(i)+dk))) * ((mus(j)+(shifts(j)-dk)) / ((diag(i)-dk)));
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+          if(i!=k && std::abs(((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) - 1) > 0.9 )
+            std::cout << "     " << ((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) << " == (" << (singVals(j)+dk) << " * " << (mus(j)+(shifts(j)-dk))
+                       << ") / (" << (diag(i)+dk) << " * " << (diag(i)-dk) << ")\n";
+#endif
+        }
+      }
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+      std::cout << "zhat(" << k << ") =  sqrt( " << prod << ")  ;  " << (singVals(last) + dk) << " * " << mus(last) + shifts(last) << " - " << dk << "\n";
+#endif
+      RealScalar tmp = sqrt(prod);
+      zhat(k) = col0(k) > 0 ? tmp : -tmp; // zhat(k) gets the sign of col0(k)
+    }
+  }
+}
+
+// Builds the columns of U (and of V when m_compV is set) from zhat, diag, shifts, mus and singVals.
+template <typename MatrixType>
+void BDCSVD<MatrixType>::computeSingVecs
+   (const ArrayRef& zhat, const ArrayRef& diag, const IndicesRef &perm, const VectorType& singVals,
+    const ArrayRef& shifts, const ArrayRef& mus, MatrixXr& U, MatrixXr& V)
+{
+  const Index n = zhat.size();
+  const Index m = perm.size();
+
+  for (Index col = 0; col < n; ++col)
+  {
+    // A zero zhat entry gets unit vectors for its column.
+    if (zhat(col) == 0)
+    {
+      U.col(col) = VectorType::Unit(n+1, col);
+      if (m_compV) V.col(col) = VectorType::Unit(n, col);
+      continue;
+    }
+
+    U.col(col).setZero();
+    for(Index p=0;p<m;++p)
+    {
+      const Index row = perm(p);
+      U(row,col) = zhat(row)/(((diag(row) - shifts(col)) - mus(col)) )/( (diag(row) + singVals[col]));
+    }
+    U(n,col) = 0;
+    U.col(col).normalize();
+
+    if (!m_compV) continue;
+    // V skips perm(0); its first row is set to -1 before normalization.
+    V.col(col).setZero();
+    for(Index p=1;p<m;++p)
+    {
+      const Index row = perm(p);
+      V(row,col) = diag(row) * zhat(row) / (((diag(row) - shifts(col)) - mus(col)) )/( (diag(row) + singVals[col]));
+    }
+    V(0,col) = -1;
+    V.col(col).normalize();
+  }
+  // The extra last column of U is the unit vector with a one in row n.
+  U.col(n) = VectorType::Unit(n+1, n);
+}
+
+
+// page 12_13
+// i >= 1, di almost null and zi non null.
+// We use a rotation to zero out zi applied to the left of M
+template <typename MatrixType>
+void BDCSVD<MatrixType>::deflation43(Index firstCol, Index shift, Index i, Index size)
+{
+  using std::sqrt;
+  // (c, s) are the first-column entries of rows start and start+i of m_computed; r is their 2-norm.
+  // The pair is replaced by (r, 0), the diagonal entry of row start+i is zeroed, and m_naiveU is rotated.
+  Index start = firstCol + shift;
+  RealScalar c = m_computed(start, start);
+  RealScalar s = m_computed(start+i, start);
+  RealScalar r = sqrt(numext::abs2(c) + numext::abs2(s));
+  if (r == 0) // both entries are zero: c/r and s/r are undefined, so only clear the diagonal entry
+  {
+    m_computed(start+i, start+i) = 0;
+    return;
+  }
+  m_computed(start,start) = r;  
+  m_computed(start+i, start) = 0;
+  m_computed(start+i, start+i) = 0;
+  
+  JacobiRotation<RealScalar> J(c/r,-s/r);
+  if (m_compU)  m_naiveU.middleRows(firstCol, size+1).applyOnTheRight(firstCol, firstCol+i, J);
+  else          m_naiveU.applyOnTheRight(firstCol, firstCol+i, J);
+}// end deflation 43
+
+
+// page 13
+// i,j >= 1, i!=j and |di - dj| < epsilon * norm2(M)
+// We apply two rotations to have zj = 0;
+// TODO deflation44 is still broken and not properly tested
+template <typename MatrixType>
+void BDCSVD<MatrixType>::deflation44(Index firstColu , Index firstColm, Index firstRowW, Index firstColW, Index i, Index j, Index size)
+{
+  using std::sqrt;
+  // (c, s) are the first-column entries of rows i and j (relative to firstColm); r is their 2-norm.
+  // If r == 0, diagonal entry i takes the value of diagonal entry j and nothing else is done.
+  // Otherwise zi becomes r, zj becomes 0, dj takes the value of di, and U (plus V if m_compV) is rotated.
+  RealScalar c = m_computed(firstColm+i, firstColm);
+  RealScalar s = m_computed(firstColm+j, firstColm);
+  RealScalar r = sqrt(numext::abs2(c) + numext::abs2(s));
+#ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
+  std::cout << "deflation 4.4: " << i << "," << j << " -> " << c << " " << s << " " << r << " ; "
+    << m_computed(firstColm + i-1, firstColm)  << " "
+    << m_computed(firstColm + i, firstColm)  << " "
+    << m_computed(firstColm + i+1, firstColm) << " "
+    << m_computed(firstColm + i+2, firstColm) << "\n";
+  std::cout << m_computed(firstColm + i-1, firstColm + i-1)  << " "
+    << m_computed(firstColm + i, firstColm+i)  << " "
+    << m_computed(firstColm + i+1, firstColm+i+1) << " "
+    << m_computed(firstColm + i+2, firstColm+i+2) << "\n";
+#endif
+  if (r==0)
+  {
+    m_computed(firstColm + i, firstColm + i) = m_computed(firstColm + j, firstColm + j);
+    return;
+  }
+  c/=r;
+  s/=r;
+  m_computed(firstColm + i, firstColm) = r;  
+  m_computed(firstColm + j, firstColm + j) = m_computed(firstColm + i, firstColm + i);
+  m_computed(firstColm + j, firstColm) = 0;
+
+  JacobiRotation<RealScalar> J(c,-s);
+  if (m_compU)  m_naiveU.middleRows(firstColu, size+1).applyOnTheRight(firstColu + i, firstColu + j, J);
+  else          m_naiveU.applyOnTheRight(firstColu+i, firstColu+j, J);
+  if (m_compV)  m_naiveV.middleRows(firstRowW, size).applyOnTheRight(firstColW + i, firstColW + j, J);
+}// end deflation 44
+
+
+// acts on block from (firstCol+shift, firstCol+shift) to (lastCol+shift, lastCol+shift) [inclusive]
+template <typename MatrixType>
+void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index firstRowW, Index firstColW, Index shift)
+{
+  // Applies deflation conditions 4.1-4.3, sorts the diagonal entries, then applies condition 4.4.
+  using std::abs;
+  const Index length = lastCol + 1 - firstCol;
+  
+  Block<MatrixXr,Dynamic,1> col0(m_computed, firstCol+shift, firstCol+shift, length, 1);
+  Diagonal<MatrixXr> fulldiag(m_computed);
+  VectorBlock<Diagonal<MatrixXr>,Dynamic> diag(fulldiag, firstCol+shift, length);
+  
+  RealScalar maxDiag = diag.tail((std::max)(Index(1),length-1)).cwiseAbs().maxCoeff();
+  RealScalar epsilon_strict = NumTraits<RealScalar>::epsilon() * maxDiag;
+  RealScalar epsilon_coarse = 8 * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(col0.cwiseAbs().maxCoeff(), maxDiag);
+  
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  assert(m_naiveU.allFinite());
+  assert(m_naiveV.allFinite());
+  assert(m_computed.allFinite());
+#endif
+
+#ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE  
+  std::cout << "\ndeflate:" << diag.head(k+1).transpose() << "  |  " << diag.segment(k+1,length-k-1).transpose() << "\n";
+#endif
+  
+  //condition 4.1
+  if (diag(0) < epsilon_coarse)
+  { 
+#ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
+    std::cout << "deflation 4.1, because " << diag(0) << " < " << epsilon_coarse << "\n";
+#endif
+    diag(0) = epsilon_coarse;
+  }
+
+  //condition 4.2
+  for (Index i=1;i<length;++i)
+    if (abs(col0(i)) < epsilon_strict)
+    {
+#ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
+      std::cout << "deflation 4.2, set z(" << i << ") to zero because " << abs(col0(i)) << " < " << epsilon_strict << "  (diag(" << i << ")=" << diag(i) << ")\n";
+#endif
+      col0(i) = 0;
+    }
+
+  //condition 4.3
+  for (Index i=1;i<length; i++)
+    if (diag(i) < epsilon_coarse)
+    {
+#ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
+      std::cout << "deflation 4.3, cancel z(" << i << ")=" << col0(i) << " because diag(" << i << ")=" << diag(i) << " < " << epsilon_coarse << "\n";
+#endif
+      deflation43(firstCol, shift, i, length);
+    }
+
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  assert(m_naiveU.allFinite());
+  assert(m_naiveV.allFinite());
+  assert(m_computed.allFinite());
+#endif
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+  std::cout << "to be sorted: " << diag.transpose() << "\n\n";
+#endif
+  {
+    // Check for total deflation
+    // If we have a total deflation, then we have to consider col0(0)==diag(0) as a singular value during sorting
+    bool total_deflation = (col0.tail(length-1).array()==RealScalar(0)).all();
+    
+    // Sort the diagonal entries: diag(1..k) and diag(k+1..length-1) are already sorted, so let's do a sorted merge.
+    // First, compute the respective permutation.
+    Index *permutation = m_workspaceI.data();
+    {
+      permutation[0] = 0;
+      Index p = 1;
+      
+      // Move deflated diagonal entries at the end.
+      for(Index i=1; i<length; ++i)
+        if(diag(i)==0)
+          permutation[p++] = i;
+        
+      Index i=1, j=k+1;
+      for( ; p < length; ++p)
+      {
+             if (i > k)             permutation[p] = j++;
+        else if (j >= length)       permutation[p] = i++;
+        else if (diag(i) < diag(j)) permutation[p] = j++;
+        else                        permutation[p] = i++;
+      }
+    }
+    
+    // If we have a total deflation, then we have to insert diag(0) at the right place
+    if(total_deflation)
+    {
+      for(Index i=1; i<length; ++i)
+      {
+        Index pi = permutation[i];
+        if(diag(pi)==0 || diag(0)<diag(pi))
+          permutation[i-1] = permutation[i];
+        else
+        {
+          permutation[i-1] = 0;
+          break;
+        }
+      }
+    }
+    
+    // Current index of each col, and current column of each index
+    Index *realInd = m_workspaceI.data()+length;
+    Index *realCol = m_workspaceI.data()+2*length;
+    
+    for(Index pos = 0; pos< length; pos++) // Index (not int) so the counter has the same type as length
+    {
+      realCol[pos] = pos;
+      realInd[pos] = pos;
+    }
+    
+    for(Index i = total_deflation?0:1; i < length; i++)
+    {
+      const Index pi = permutation[length - (total_deflation ? i+1 : i)];
+      const Index J = realCol[pi];
+      
+      using std::swap;
+      // swap diagonal and first column entries:
+      swap(diag(i), diag(J));
+      if(i!=0 && J!=0) swap(col0(i), col0(J));
+
+      // change columns
+      if (m_compU) m_naiveU.col(firstCol+i).segment(firstCol, length + 1).swap(m_naiveU.col(firstCol+J).segment(firstCol, length + 1));
+      else         m_naiveU.col(firstCol+i).segment(0, 2)                .swap(m_naiveU.col(firstCol+J).segment(0, 2));
+      if (m_compV) m_naiveV.col(firstColW + i).segment(firstRowW, length).swap(m_naiveV.col(firstColW + J).segment(firstRowW, length));
+
+      //update real pos
+      const Index realI = realInd[i];
+      realCol[realI] = J;
+      realCol[pi] = i;
+      realInd[J] = realI;
+      realInd[i] = pi;
+    }
+  }
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+  std::cout << "sorted: " << diag.transpose().format(bdcsvdfmt) << "\n";
+  std::cout << "      : " << col0.transpose() << "\n\n";
+#endif
+    
+  //condition 4.4
+  {
+    Index i = length-1;
+    while(i>0 && (diag(i)==0 || col0(i)==0)) --i;
+    for(; i>1;--i)
+       if( (diag(i) - diag(i-1)) < NumTraits<RealScalar>::epsilon()*maxDiag )
+      {
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+        std::cout << "deflation 4.4 with i = " << i << " because " << (diag(i) - diag(i-1)) << " < " << NumTraits<RealScalar>::epsilon()*maxDiag << "\n";
+#endif
+        eigen_internal_assert(abs(diag(i) - diag(i-1))<epsilon_coarse && " diagonal entries are not properly sorted");
+        deflation44(firstCol, firstCol + shift, firstRowW, firstColW, i-1, i, length);
+      }
+  }
+  
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  for(Index j=2;j<length;++j)
+    assert(diag(j-1)<=diag(j) || diag(j)==0);
+#endif
+  
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  assert(m_naiveU.allFinite());
+  assert(m_naiveV.allFinite());
+  assert(m_computed.allFinite());
+#endif
+}//end deflation
+
+#ifndef __CUDACC__
+/** \svd_module
+  *
+  * \return the singular value decomposition of \c *this computed by the Divide & Conquer algorithm
+  * \param computationOptions flags forwarded unchanged to the BDCSVD constructor
+  * \sa class BDCSVD
+  */
+template<typename Derived>
+BDCSVD<typename MatrixBase<Derived>::PlainObject>
+MatrixBase<Derived>::bdcSvd(unsigned int computationOptions) const
+{
+  return BDCSVD<PlainObject>(*this, computationOptions);
+}
+#endif
+
+} // end namespace Eigen
+
+#endif
diff --git a/nuparu/include/Eigen/src/SVD/JacobiSVD.h b/nuparu/include/Eigen/src/SVD/JacobiSVD.h
index 4786768f..59c965e1 100644
--- a/nuparu/include/Eigen/src/SVD/JacobiSVD.h
+++ b/nuparu/include/Eigen/src/SVD/JacobiSVD.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2009-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2013-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -51,7 +52,6 @@ template<typename MatrixType, int QRPreconditioner, int Case>
 class qr_preconditioner_impl<MatrixType, QRPreconditioner, Case, false>
 {
 public:
-  typedef typename MatrixType::Index Index;
   void allocate(const JacobiSVD<MatrixType, QRPreconditioner>&) {}
   bool run(JacobiSVD<MatrixType, QRPreconditioner>&, const MatrixType&)
   {
@@ -65,7 +65,6 @@ template<typename MatrixType>
 class qr_preconditioner_impl<MatrixType, FullPivHouseholderQRPreconditioner, PreconditionIfMoreRowsThanCols, true>
 {
 public:
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   enum
   {
@@ -106,7 +105,6 @@ template<typename MatrixType>
 class qr_preconditioner_impl<MatrixType, FullPivHouseholderQRPreconditioner, PreconditionIfMoreColsThanRows, true>
 {
 public:
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   enum
   {
@@ -156,8 +154,6 @@ template<typename MatrixType>
 class qr_preconditioner_impl<MatrixType, ColPivHouseholderQRPreconditioner, PreconditionIfMoreRowsThanCols, true>
 {
 public:
-  typedef typename MatrixType::Index Index;
-
   void allocate(const JacobiSVD<MatrixType, ColPivHouseholderQRPreconditioner>& svd)
   {
     if (svd.rows() != m_qr.rows() || svd.cols() != m_qr.cols())
@@ -197,7 +193,6 @@ template<typename MatrixType>
 class qr_preconditioner_impl<MatrixType, ColPivHouseholderQRPreconditioner, PreconditionIfMoreColsThanRows, true>
 {
 public:
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   enum
   {
@@ -256,8 +251,6 @@ template<typename MatrixType>
 class qr_preconditioner_impl<MatrixType, HouseholderQRPreconditioner, PreconditionIfMoreRowsThanCols, true>
 {
 public:
-  typedef typename MatrixType::Index Index;
-
   void allocate(const JacobiSVD<MatrixType, HouseholderQRPreconditioner>& svd)
   {
     if (svd.rows() != m_qr.rows() || svd.cols() != m_qr.cols())
@@ -296,7 +289,6 @@ template<typename MatrixType>
 class qr_preconditioner_impl<MatrixType, HouseholderQRPreconditioner, PreconditionIfMoreColsThanRows, true>
 {
 public:
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   enum
   {
@@ -358,7 +350,6 @@ template<typename MatrixType, int QRPreconditioner>
 struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, false>
 {
   typedef JacobiSVD<MatrixType, QRPreconditioner> SVD;
-  typedef typename SVD::Index Index;
   static void run(typename SVD::WorkMatrixType&, SVD&, Index, Index) {}
 };
 
@@ -368,21 +359,25 @@ struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, true>
   typedef JacobiSVD<MatrixType, QRPreconditioner> SVD;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
-  typedef typename SVD::Index Index;
   static void run(typename SVD::WorkMatrixType& work_matrix, SVD& svd, Index p, Index q)
   {
     using std::sqrt;
     Scalar z;
     JacobiRotation<Scalar> rot;
     RealScalar n = sqrt(numext::abs2(work_matrix.coeff(p,p)) + numext::abs2(work_matrix.coeff(q,p)));
+    
     if(n==0)
     {
       z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q);
       work_matrix.row(p) *= z;
       if(svd.computeU()) svd.m_matrixU.col(p) *= conj(z);
-      z = abs(work_matrix.coeff(q,q)) / work_matrix.coeff(q,q);
-      work_matrix.row(q) *= z;
-      if(svd.computeU()) svd.m_matrixU.col(q) *= conj(z);
+      if(work_matrix.coeff(q,q)!=Scalar(0))
+      {
+        z = abs(work_matrix.coeff(q,q)) / work_matrix.coeff(q,q);
+        work_matrix.row(q) *= z;
+        if(svd.computeU()) svd.m_matrixU.col(q) *= conj(z);
+      }
+      // otherwise the second row is already zero, so we have nothing to do.
     }
     else
     {
@@ -392,7 +387,7 @@ struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, true>
       if(svd.computeU()) svd.m_matrixU.applyOnTheRight(p,q,rot.adjoint());
       if(work_matrix.coeff(p,q) != Scalar(0))
       {
-        Scalar z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q);
+        z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q);
         work_matrix.col(q) *= z;
         if(svd.computeV()) svd.m_matrixV.col(q) *= z;
       }
@@ -408,32 +403,43 @@ struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, true>
 
 template<typename MatrixType, typename RealScalar, typename Index>
 void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q,
-                            JacobiRotation<RealScalar> *j_left,
-                            JacobiRotation<RealScalar> *j_right)
+                         JacobiRotation<RealScalar> *j_left,
+                         JacobiRotation<RealScalar> *j_right)
 {
   using std::sqrt;
+  using std::abs;
   Matrix<RealScalar,2,2> m;
   m << numext::real(matrix.coeff(p,p)), numext::real(matrix.coeff(p,q)),
        numext::real(matrix.coeff(q,p)), numext::real(matrix.coeff(q,q));
   JacobiRotation<RealScalar> rot1;
   RealScalar t = m.coeff(0,0) + m.coeff(1,1);
   RealScalar d = m.coeff(1,0) - m.coeff(0,1);
-  if(t == RealScalar(0))
+  
+  if(d == RealScalar(0))
   {
-    rot1.c() = RealScalar(0);
-    rot1.s() = d > RealScalar(0) ? RealScalar(1) : RealScalar(-1);
+    rot1.s() = RealScalar(0);
+    rot1.c() = RealScalar(1);
   }
   else
   {
-    RealScalar u = d / t;
-    rot1.c() = RealScalar(1) / sqrt(RealScalar(1) + numext::abs2(u));
-    rot1.s() = rot1.c() * u;
+    // If d!=0, then t/d cannot overflow because the magnitude of the
+    // entries forming d are not too small compared to the ones forming t.
+    RealScalar u = t / d;
+    RealScalar tmp = sqrt(RealScalar(1) + numext::abs2(u));
+    rot1.s() = RealScalar(1) / tmp;
+    rot1.c() = u / tmp;
   }
   m.applyOnTheLeft(0,1,rot1);
   j_right->makeJacobi(m,0,1);
-  *j_left  = rot1 * j_right->transpose();
+  *j_left = rot1 * j_right->transpose();
 }
 
+template<typename _MatrixType, int QRPreconditioner> 
+struct traits<JacobiSVD<_MatrixType,QRPreconditioner> >
+{
+  typedef _MatrixType MatrixType;
+};
+
 } // end namespace internal
 
 /** \ingroup SVD_Module
@@ -490,13 +496,14 @@ void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q,
   * \sa MatrixBase::jacobiSvd()
   */
 template<typename _MatrixType, int QRPreconditioner> class JacobiSVD
+ : public SVDBase<JacobiSVD<_MatrixType,QRPreconditioner> >
 {
+    typedef SVDBase<JacobiSVD> Base;
   public:
 
     typedef _MatrixType MatrixType;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
     enum {
       RowsAtCompileTime = MatrixType::RowsAtCompileTime,
       ColsAtCompileTime = MatrixType::ColsAtCompileTime,
@@ -507,13 +514,10 @@ template<typename _MatrixType, int QRPreconditioner> class JacobiSVD
       MatrixOptions = MatrixType::Options
     };
 
-    typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime,
-                   MatrixOptions, MaxRowsAtCompileTime, MaxRowsAtCompileTime>
-            MatrixUType;
-    typedef Matrix<Scalar, ColsAtCompileTime, ColsAtCompileTime,
-                   MatrixOptions, MaxColsAtCompileTime, MaxColsAtCompileTime>
-            MatrixVType;
-    typedef typename internal::plain_diag_type<MatrixType, RealScalar>::type SingularValuesType;
+    typedef typename Base::MatrixUType MatrixUType;
+    typedef typename Base::MatrixVType MatrixVType;
+    typedef typename Base::SingularValuesType SingularValuesType;
+    
     typedef typename internal::plain_row_type<MatrixType>::type RowType;
     typedef typename internal::plain_col_type<MatrixType>::type ColType;
     typedef Matrix<Scalar, DiagSizeAtCompileTime, DiagSizeAtCompileTime,
@@ -526,10 +530,6 @@ template<typename _MatrixType, int QRPreconditioner> class JacobiSVD
       * perform decompositions via JacobiSVD::compute(const MatrixType&).
       */
     JacobiSVD()
-      : m_isInitialized(false),
-        m_isAllocated(false),
-        m_computationOptions(0),
-        m_rows(-1), m_cols(-1)
     {}
 
 
@@ -540,10 +540,6 @@ template<typename _MatrixType, int QRPreconditioner> class JacobiSVD
       * \sa JacobiSVD()
       */
     JacobiSVD(Index rows, Index cols, unsigned int computationOptions = 0)
-      : m_isInitialized(false),
-        m_isAllocated(false),
-        m_computationOptions(0),
-        m_rows(-1), m_cols(-1)
     {
       allocate(rows, cols, computationOptions);
     }
@@ -558,11 +554,7 @@ template<typename _MatrixType, int QRPreconditioner> class JacobiSVD
      * Thin unitaries are only available if your matrix type has a Dynamic number of columns (for example MatrixXf). They also are not
      * available with the (non-default) FullPivHouseholderQR preconditioner.
      */
-    JacobiSVD(const MatrixType& matrix, unsigned int computationOptions = 0)
-      : m_isInitialized(false),
-        m_isAllocated(false),
-        m_computationOptions(0),
-        m_rows(-1), m_cols(-1)
+    explicit JacobiSVD(const MatrixType& matrix, unsigned int computationOptions = 0)
     {
       compute(matrix, computationOptions);
     }
@@ -590,95 +582,33 @@ template<typename _MatrixType, int QRPreconditioner> class JacobiSVD
       return compute(matrix, m_computationOptions);
     }
 
-    /** \returns the \a U matrix.
-     *
-     * For the SVD decomposition of a n-by-p matrix, letting \a m be the minimum of \a n and \a p,
-     * the U matrix is n-by-n if you asked for #ComputeFullU, and is n-by-m if you asked for #ComputeThinU.
-     *
-     * The \a m first columns of \a U are the left singular vectors of the matrix being decomposed.
-     *
-     * This method asserts that you asked for \a U to be computed.
-     */
-    const MatrixUType& matrixU() const
-    {
-      eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
-      eigen_assert(computeU() && "This JacobiSVD decomposition didn't compute U. Did you ask for it?");
-      return m_matrixU;
-    }
-
-    /** \returns the \a V matrix.
-     *
-     * For the SVD decomposition of a n-by-p matrix, letting \a m be the minimum of \a n and \a p,
-     * the V matrix is p-by-p if you asked for #ComputeFullV, and is p-by-m if you asked for ComputeThinV.
-     *
-     * The \a m first columns of \a V are the right singular vectors of the matrix being decomposed.
-     *
-     * This method asserts that you asked for \a V to be computed.
-     */
-    const MatrixVType& matrixV() const
-    {
-      eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
-      eigen_assert(computeV() && "This JacobiSVD decomposition didn't compute V. Did you ask for it?");
-      return m_matrixV;
-    }
-
-    /** \returns the vector of singular values.
-     *
-     * For the SVD decomposition of a n-by-p matrix, letting \a m be the minimum of \a n and \a p, the
-     * returned vector has size \a m.  Singular values are always sorted in decreasing order.
-     */
-    const SingularValuesType& singularValues() const
-    {
-      eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
-      return m_singularValues;
-    }
-
-    /** \returns true if \a U (full or thin) is asked for in this SVD decomposition */
-    inline bool computeU() const { return m_computeFullU || m_computeThinU; }
-    /** \returns true if \a V (full or thin) is asked for in this SVD decomposition */
-    inline bool computeV() const { return m_computeFullV || m_computeThinV; }
-
-    /** \returns a (least squares) solution of \f$ A x = b \f$ using the current SVD decomposition of A.
-      *
-      * \param b the right-hand-side of the equation to solve.
-      *
-      * \note Solving requires both U and V to be computed. Thin U and V are enough, there is no need for full U or V.
-      *
-      * \note SVD solving is implicitly least-squares. Thus, this method serves both purposes of exact solving and least-squares solving.
-      * In other words, the returned solution is guaranteed to minimize the Euclidean norm \f$ \Vert A x - b \Vert \f$.
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<JacobiSVD, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
-      eigen_assert(computeU() && computeV() && "JacobiSVD::solve() requires both unitaries U and V to be computed (thin unitaries suffice).");
-      return internal::solve_retval<JacobiSVD, Rhs>(*this, b.derived());
-    }
-
-    /** \returns the number of singular values that are not exactly 0 */
-    Index nonzeroSingularValues() const
-    {
-      eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
-      return m_nonzeroSingularValues;
-    }
-
-    inline Index rows() const { return m_rows; }
-    inline Index cols() const { return m_cols; }
+    using Base::computeU;
+    using Base::computeV;
+    using Base::rows;
+    using Base::cols;
+    using Base::rank;
 
   private:
     void allocate(Index rows, Index cols, unsigned int computationOptions);
 
   protected:
-    MatrixUType m_matrixU;
-    MatrixVType m_matrixV;
-    SingularValuesType m_singularValues;
+    using Base::m_matrixU;
+    using Base::m_matrixV;
+    using Base::m_singularValues;
+    using Base::m_isInitialized;
+    using Base::m_isAllocated;
+    using Base::m_usePrescribedThreshold;
+    using Base::m_computeFullU;
+    using Base::m_computeThinU;
+    using Base::m_computeFullV;
+    using Base::m_computeThinV;
+    using Base::m_computationOptions;
+    using Base::m_nonzeroSingularValues;
+    using Base::m_rows;
+    using Base::m_cols;
+    using Base::m_diagSize;
+    using Base::m_prescribedThreshold;
     WorkMatrixType m_workMatrix;
-    bool m_isInitialized, m_isAllocated;
-    bool m_computeFullU, m_computeThinU;
-    bool m_computeFullV, m_computeThinV;
-    unsigned int m_computationOptions;
-    Index m_nonzeroSingularValues, m_rows, m_cols, m_diagSize;
 
     template<typename __MatrixType, int _QRPreconditioner, bool _IsComplex>
     friend struct internal::svd_precondition_2x2_block_to_be_real;
@@ -687,6 +617,7 @@ template<typename _MatrixType, int QRPreconditioner> class JacobiSVD
 
     internal::qr_preconditioner_impl<MatrixType, QRPreconditioner, internal::PreconditionIfMoreColsThanRows> m_qr_precond_morecols;
     internal::qr_preconditioner_impl<MatrixType, QRPreconditioner, internal::PreconditionIfMoreRowsThanCols> m_qr_precond_morerows;
+    MatrixType m_scaledMatrix;
 };
 
 template<typename MatrixType, int QRPreconditioner>
@@ -733,8 +664,9 @@ void JacobiSVD<MatrixType, QRPreconditioner>::allocate(Index rows, Index cols, u
                             : 0);
   m_workMatrix.resize(m_diagSize, m_diagSize);
   
-  if(m_cols>m_rows) m_qr_precond_morecols.allocate(*this);
-  if(m_rows>m_cols) m_qr_precond_morerows.allocate(*this);
+  if(m_cols>m_rows)   m_qr_precond_morecols.allocate(*this);
+  if(m_rows>m_cols)   m_qr_precond_morerows.allocate(*this);
+  if(m_rows!=m_cols)  m_scaledMatrix.resize(rows,cols);
 }
 
 template<typename MatrixType, int QRPreconditioner>
@@ -749,13 +681,25 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
   const RealScalar precision = RealScalar(2) * NumTraits<Scalar>::epsilon();
 
   // limit for very small denormal numbers to be considered zero in order to avoid infinite loops (see bug 286)
+  // FIXME What about considering any denormal numbers as zero, using:
+  // const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
   const RealScalar considerAsZero = RealScalar(2) * std::numeric_limits<RealScalar>::denorm_min();
 
+  // Scaling factor to reduce over/under-flows
+  RealScalar scale = matrix.cwiseAbs().maxCoeff();
+  if(scale==RealScalar(0)) scale = RealScalar(1);
+  
   /*** step 1. The R-SVD step: we use a QR decomposition to reduce to the case of a square matrix */
 
-  if(!m_qr_precond_morecols.run(*this, matrix) && !m_qr_precond_morerows.run(*this, matrix))
+  if(m_rows!=m_cols)
   {
-    m_workMatrix = matrix.block(0,0,m_diagSize,m_diagSize);
+    m_scaledMatrix = matrix / scale;
+    m_qr_precond_morecols.run(*this, m_scaledMatrix);
+    m_qr_precond_morerows.run(*this, m_scaledMatrix);
+  }
+  else
+  {
+    m_workMatrix = matrix.block(0,0,m_diagSize,m_diagSize) / scale;
     if(m_computeFullU) m_matrixU.setIdentity(m_rows,m_rows);
     if(m_computeThinU) m_matrixU.setIdentity(m_rows,m_diagSize);
     if(m_computeFullV) m_matrixV.setIdentity(m_cols,m_cols);
@@ -778,10 +722,11 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
         // if this 2x2 sub-matrix is not diagonal already...
         // notice that this comparison will evaluate to false if any NaN is involved, ensuring that NaN's don't
         // keep us iterating forever. Similarly, small denormal numbers are considered zero.
-        using std::max;
-        RealScalar threshold = (max)(considerAsZero, precision * (max)(abs(m_workMatrix.coeff(p,p)),
-                                                                       abs(m_workMatrix.coeff(q,q))));
-        if((max)(abs(m_workMatrix.coeff(p,q)),abs(m_workMatrix.coeff(q,p))) > threshold)
+        RealScalar threshold = numext::maxi<RealScalar>(considerAsZero,
+                   precision * numext::maxi<RealScalar>(abs(m_workMatrix.coeff(p,p)),
+                                                        abs(m_workMatrix.coeff(q,q))));
+        // We compare both values to threshold instead of calling max to be robust to NaN (See bug 791)
+        if(abs(m_workMatrix.coeff(p,q))>threshold || abs(m_workMatrix.coeff(q,p)) > threshold)
         {
           finished = false;
 
@@ -809,6 +754,8 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
     m_singularValues.coeffRef(i) = a;
     if(computeU() && (a!=RealScalar(0))) m_matrixU.col(i) *= m_workMatrix.coeff(i,i)/a;
   }
+  
+  m_singularValues *= scale;
 
   /*** step 4. Sort singular values in descending order and compute the number of nonzero singular values ***/
 
@@ -835,31 +782,7 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
   return *this;
 }
 
-namespace internal {
-template<typename _MatrixType, int QRPreconditioner, typename Rhs>
-struct solve_retval<JacobiSVD<_MatrixType, QRPreconditioner>, Rhs>
-  : solve_retval_base<JacobiSVD<_MatrixType, QRPreconditioner>, Rhs>
-{
-  typedef JacobiSVD<_MatrixType, QRPreconditioner> JacobiSVDType;
-  EIGEN_MAKE_SOLVE_HELPERS(JacobiSVDType,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    eigen_assert(rhs().rows() == dec().rows());
-
-    // A = U S V^*
-    // So A^{-1} = V S^{-1} U^*
-
-    Matrix<Scalar, Dynamic, Rhs::ColsAtCompileTime, 0, _MatrixType::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime> tmp;
-    Index nonzeroSingVals = dec().nonzeroSingularValues();
-    
-    tmp.noalias() = dec().matrixU().leftCols(nonzeroSingVals).adjoint() * rhs();
-    tmp = dec().singularValues().head(nonzeroSingVals).asDiagonal().inverse() * tmp;
-    dst = dec().matrixV().leftCols(nonzeroSingVals) * tmp;
-  }
-};
-} // end namespace internal
-
+#ifndef __CUDACC__
 /** \svd_module
   *
   * \return the singular value decomposition of \c *this computed by two-sided
@@ -873,6 +796,7 @@ MatrixBase<Derived>::jacobiSvd(unsigned int computationOptions) const
 {
   return JacobiSVD<PlainObject>(*this, computationOptions);
 }
+#endif // __CUDACC__
 
 } // end namespace Eigen
 
diff --git a/nuparu/include/Eigen/src/SVD/JacobiSVD_MKL.h b/nuparu/include/Eigen/src/SVD/JacobiSVD_MKL.h
index decda754..14e461c4 100644
--- a/nuparu/include/Eigen/src/SVD/JacobiSVD_MKL.h
+++ b/nuparu/include/Eigen/src/SVD/JacobiSVD_MKL.h
@@ -45,8 +45,8 @@ JacobiSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, ColPiv
 JacobiSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, ColPivHouseholderQRPreconditioner>::compute(const Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>& matrix, unsigned int computationOptions) \
 { \
   typedef Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> MatrixType; \
-  typedef MatrixType::Scalar Scalar; \
-  typedef MatrixType::RealScalar RealScalar; \
+  /*typedef MatrixType::Scalar Scalar;*/ \
+  /*typedef MatrixType::RealScalar RealScalar;*/ \
   allocate(matrix.rows(), matrix.cols(), computationOptions); \
 \
   /*const RealScalar precision = RealScalar(2) * NumTraits<Scalar>::epsilon();*/ \
diff --git a/nuparu/include/Eigen/src/SVD/SVDBase.h b/nuparu/include/Eigen/src/SVD/SVDBase.h
new file mode 100644
index 00000000..ad191085
--- /dev/null
+++ b/nuparu/include/Eigen/src/SVD/SVDBase.h
@@ -0,0 +1,314 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// Copyright (C) 2013 Gauthier Brun <brun.gauthier@gmail.com>
+// Copyright (C) 2013 Nicolas Carre <nicolas.carre@ensimag.fr>
+// Copyright (C) 2013 Jean Ceccato <jean.ceccato@ensimag.fr>
+// Copyright (C) 2013 Pierre Zoppitelli <pierre.zoppitelli@ensimag.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SVDBASE_H
+#define EIGEN_SVDBASE_H
+
+namespace Eigen {
+/** \ingroup SVD_Module
+ *
+ *
+ * \class SVDBase
+ *
+ * \brief Base class of SVD algorithms
+ *
+ * \tparam Derived the type of the actual SVD decomposition
+ *
+ * SVD decomposition consists in decomposing any n-by-p matrix \a A as a product
+ *   \f[ A = U S V^* \f]
+ * where \a U is a n-by-n unitary, \a V is a p-by-p unitary, and \a S is a n-by-p real positive matrix which is zero outside of its main diagonal;
+ * the diagonal entries of S are known as the \em singular \em values of \a A and the columns of \a U and \a V are known as the left
+ * and right \em singular \em vectors of \a A respectively.
+ *
+ * Singular values are always sorted in decreasing order.
+ *
+ * 
+ * You can ask for only \em thin \a U or \a V to be computed, meaning the following. In case of a rectangular n-by-p matrix, letting \a m be the
+ * smaller value among \a n and \a p, there are only \a m singular vectors; the remaining columns of \a U and \a V do not correspond to actual
+ * singular vectors. Asking for \em thin \a U or \a V means asking for only their \a m first columns to be formed. So \a U is then a n-by-m matrix,
+ * and \a V is then a p-by-m matrix. Notice that thin \a U and \a V are all you need for (least squares) solving.
+ *  
+ * If the input matrix has inf or nan coefficients, the result of the computation is undefined, but the computation is guaranteed to
+ * terminate in finite (and reasonable) time.
+ * \sa MatrixBase::genericSvd()
+ */
+template<typename Derived>
+class SVDBase
+{
+
+public:
+  typedef typename internal::traits<Derived>::MatrixType MatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
+  enum {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+    DiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_DYNAMIC(RowsAtCompileTime,ColsAtCompileTime),
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
+    MaxDiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(MaxRowsAtCompileTime,MaxColsAtCompileTime),
+    MatrixOptions = MatrixType::Options
+  };
+
+  typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime, MatrixOptions, MaxRowsAtCompileTime, MaxRowsAtCompileTime> MatrixUType;
+  typedef Matrix<Scalar, ColsAtCompileTime, ColsAtCompileTime, MatrixOptions, MaxColsAtCompileTime, MaxColsAtCompileTime> MatrixVType;
+  typedef typename internal::plain_diag_type<MatrixType, RealScalar>::type SingularValuesType;
+  
+  Derived& derived() { return *static_cast<Derived*>(this); }
+  const Derived& derived() const { return *static_cast<const Derived*>(this); }
+
+  /** \returns the \a U matrix.
+   *
+   * For the SVD decomposition of a n-by-p matrix, letting \a m be the minimum of \a n and \a p,
+   * the U matrix is n-by-n if you asked for #ComputeFullU, and is n-by-m if you asked for #ComputeThinU.
+   *
+   * The \a m first columns of \a U are the left singular vectors of the matrix being decomposed.
+   *
+   * This method asserts that you asked for \a U to be computed.
+   */
+  const MatrixUType& matrixU() const
+  {
+    eigen_assert(m_isInitialized && "SVD is not initialized.");
+    eigen_assert(computeU() && "This SVD decomposition didn't compute U. Did you ask for it?");
+    return m_matrixU;
+  }
+
+  /** \returns the \a V matrix.
+   *
+   * For the SVD decomposition of a n-by-p matrix, letting \a m be the minimum of \a n and \a p,
+   * the V matrix is p-by-p if you asked for #ComputeFullV, and is p-by-m if you asked for ComputeThinV.
+   *
+   * The \a m first columns of \a V are the right singular vectors of the matrix being decomposed.
+   *
+   * This method asserts that you asked for \a V to be computed.
+   */
+  const MatrixVType& matrixV() const
+  {
+    eigen_assert(m_isInitialized && "SVD is not initialized.");
+    eigen_assert(computeV() && "This SVD decomposition didn't compute V. Did you ask for it?");
+    return m_matrixV;
+  }
+
+  /** \returns the vector of singular values.
+   *
+   * For the SVD decomposition of a n-by-p matrix, letting \a m be the minimum of \a n and \a p, the
+   * returned vector has size \a m.  Singular values are always sorted in decreasing order.
+   */
+  const SingularValuesType& singularValues() const
+  {
+    eigen_assert(m_isInitialized && "SVD is not initialized.");
+    return m_singularValues;
+  }
+
+  /** \returns the number of singular values that are not exactly 0 */
+  Index nonzeroSingularValues() const
+  {
+    eigen_assert(m_isInitialized && "SVD is not initialized.");
+    return m_nonzeroSingularValues;
+  }
+  
+  /** \returns the rank of the matrix of which \c *this is the SVD.
+    *
+    * \note This method has to determine which singular values should be considered nonzero.
+    *       For that, it uses the threshold value that you can control by calling
+    *       setThreshold(const RealScalar&).
+    */
+  inline Index rank() const
+  {
+    using std::abs;
+    using std::max;
+    eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
+    if(m_singularValues.size()==0) return 0;
+    RealScalar premultiplied_threshold = (max)(m_singularValues.coeff(0) * threshold(), (std::numeric_limits<RealScalar>::min)());
+    Index i = m_nonzeroSingularValues-1;
+    while(i>=0 && m_singularValues.coeff(i) < premultiplied_threshold) --i;
+    return i+1;
+  }
+  
+  /** Allows to prescribe a threshold to be used by certain methods, such as rank() and solve(),
+    * which need to determine when singular values are to be considered nonzero.
+    * This is not used for the SVD decomposition itself.
+    *
+    * When it needs to get the threshold value, Eigen calls threshold().
+    * The default is \c max(1,diagSize)*NumTraits<Scalar>::epsilon(), where diagSize is min(rows,cols).
+    *
+    * \param threshold The new value to use as the threshold.
+    *
+    * A singular value will be considered nonzero if it satisfies
+    *  \f$ \vert singular value \vert \geqslant threshold \times \vert max singular value \vert \f$.
+    *
+    * If you want to come back to the default behavior, call setThreshold(Default_t)
+    */
+  Derived& setThreshold(const RealScalar& threshold)
+  {
+    m_usePrescribedThreshold = true;
+    m_prescribedThreshold = threshold;
+    return derived();
+  }
+
+  /** Allows to come back to the default behavior, letting Eigen use its default formula for
+    * determining the threshold.
+    *
+    * You should pass the special object Eigen::Default as parameter here.
+    * \code svd.setThreshold(Eigen::Default); \endcode
+    *
+    * See the documentation of setThreshold(const RealScalar&).
+    */
+  Derived& setThreshold(Default_t)
+  {
+    m_usePrescribedThreshold = false;
+    return derived();
+  }
+
+  /** Returns the threshold that will be used by certain methods such as rank().
+    *
+    * See the documentation of setThreshold(const RealScalar&).
+    */
+  RealScalar threshold() const
+  {
+    eigen_assert(m_isInitialized || m_usePrescribedThreshold);
+    return m_usePrescribedThreshold ? m_prescribedThreshold
+                                    : (std::max<Index>)(1,m_diagSize)*NumTraits<Scalar>::epsilon();
+  }
+
+  /** \returns true if \a U (full or thin) is asked for in this SVD decomposition */
+  inline bool computeU() const { return m_computeFullU || m_computeThinU; }
+  /** \returns true if \a V (full or thin) is asked for in this SVD decomposition */
+  inline bool computeV() const { return m_computeFullV || m_computeThinV; }
+
+  inline Index rows() const { return m_rows; }
+  inline Index cols() const { return m_cols; }
+  
+  /** \returns a (least squares) solution of \f$ A x = b \f$ using the current SVD decomposition of A.
+    *
+    * \param b the right-hand-side of the equation to solve.
+    *
+    * \note Solving requires both U and V to be computed. Thin U and V are enough, there is no need for full U or V.
+    *
+    * \note SVD solving is implicitly least-squares. Thus, this method serves both purposes of exact solving and least-squares solving.
+    * In other words, the returned solution is guaranteed to minimize the Euclidean norm \f$ \Vert A x - b \Vert \f$.
+    */
+  template<typename Rhs>
+  inline const Solve<Derived, Rhs>
+  solve(const MatrixBase<Rhs>& b) const
+  {
+    eigen_assert(m_isInitialized && "SVD is not initialized.");
+    eigen_assert(computeU() && computeV() && "SVD::solve() requires both unitaries U and V to be computed (thin unitaries suffice).");
+    return Solve<Derived, Rhs>(derived(), b.derived());
+  }
+  
+  #ifndef EIGEN_PARSED_BY_DOXYGEN
+  template<typename RhsType, typename DstType>
+  EIGEN_DEVICE_FUNC
+  void _solve_impl(const RhsType &rhs, DstType &dst) const;
+  #endif
+
+protected:
+  
+  static void check_template_parameters()
+  {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+  }
+  
+  // return true if already allocated
+  bool allocate(Index rows, Index cols, unsigned int computationOptions) ;
+
+  MatrixUType m_matrixU;
+  MatrixVType m_matrixV;
+  SingularValuesType m_singularValues;
+  bool m_isInitialized, m_isAllocated, m_usePrescribedThreshold;
+  bool m_computeFullU, m_computeThinU;
+  bool m_computeFullV, m_computeThinV;
+  unsigned int m_computationOptions;
+  Index m_nonzeroSingularValues, m_rows, m_cols, m_diagSize;
+  RealScalar m_prescribedThreshold;
+
+  /** \brief Default Constructor.
+   *
+   * Default constructor of SVDBase
+   */
+  SVDBase()
+    : m_isInitialized(false),
+      m_isAllocated(false),
+      m_usePrescribedThreshold(false),
+      m_computationOptions(0),
+      m_rows(-1), m_cols(-1), m_diagSize(0)
+  {
+    check_template_parameters();
+  }
+
+
+};
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename Derived>
+template<typename RhsType, typename DstType>
+void SVDBase<Derived>::_solve_impl(const RhsType &rhs, DstType &dst) const
+{
+  eigen_assert(rhs.rows() == rows());
+
+  // A = U S V^*
+  // So A^{-1} = V S^{-1} U^*
+
+  Matrix<Scalar, Dynamic, RhsType::ColsAtCompileTime, 0, MatrixType::MaxRowsAtCompileTime, RhsType::MaxColsAtCompileTime> tmp;
+  Index l_rank = rank();
+  tmp.noalias() =  m_matrixU.leftCols(l_rank).adjoint() * rhs;
+  tmp = m_singularValues.head(l_rank).asDiagonal().inverse() * tmp;
+  dst = m_matrixV.leftCols(l_rank) * tmp;
+}
+#endif
+
+template<typename MatrixType>
+bool SVDBase<MatrixType>::allocate(Index rows, Index cols, unsigned int computationOptions)
+{
+  eigen_assert(rows >= 0 && cols >= 0);
+
+  if (m_isAllocated &&
+      rows == m_rows &&
+      cols == m_cols &&
+      computationOptions == m_computationOptions)
+  {
+    return true;
+  }
+
+  m_rows = rows;
+  m_cols = cols;
+  m_isInitialized = false;
+  m_isAllocated = true;
+  m_computationOptions = computationOptions;
+  m_computeFullU = (computationOptions & ComputeFullU) != 0;
+  m_computeThinU = (computationOptions & ComputeThinU) != 0;
+  m_computeFullV = (computationOptions & ComputeFullV) != 0;
+  m_computeThinV = (computationOptions & ComputeThinV) != 0;
+  eigen_assert(!(m_computeFullU && m_computeThinU) && "SVDBase: you can't ask for both full and thin U");
+  eigen_assert(!(m_computeFullV && m_computeThinV) && "SVDBase: you can't ask for both full and thin V");
+  eigen_assert(EIGEN_IMPLIES(m_computeThinU || m_computeThinV, MatrixType::ColsAtCompileTime==Dynamic) &&
+	       "SVDBase: thin U and V are only available when your matrix has a dynamic number of columns.");
+
+  m_diagSize = (std::min)(m_rows, m_cols);
+  m_singularValues.resize(m_diagSize);
+  if(RowsAtCompileTime==Dynamic)
+    m_matrixU.resize(m_rows, m_computeFullU ? m_rows : m_computeThinU ? m_diagSize : 0);
+  if(ColsAtCompileTime==Dynamic)
+    m_matrixV.resize(m_cols, m_computeFullV ? m_cols : m_computeThinV ? m_diagSize : 0);
+
+  return false;
+}
+
+}// end namespace
+
+#endif // EIGEN_SVDBASE_H
diff --git a/nuparu/include/Eigen/src/SVD/UpperBidiagonalization.h b/nuparu/include/Eigen/src/SVD/UpperBidiagonalization.h
index 587de37a..0b146089 100644
--- a/nuparu/include/Eigen/src/SVD/UpperBidiagonalization.h
+++ b/nuparu/include/Eigen/src/SVD/UpperBidiagonalization.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2013-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -28,15 +29,15 @@ template<typename _MatrixType> class UpperBidiagonalization
     };
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
     typedef Matrix<Scalar, 1, ColsAtCompileTime> RowVectorType;
     typedef Matrix<Scalar, RowsAtCompileTime, 1> ColVectorType;
-    typedef BandMatrix<RealScalar, ColsAtCompileTime, ColsAtCompileTime, 1, 0> BidiagonalType;
+    typedef BandMatrix<RealScalar, ColsAtCompileTime, ColsAtCompileTime, 1, 0, RowMajor> BidiagonalType;
     typedef Matrix<Scalar, ColsAtCompileTime, 1> DiagVectorType;
     typedef Matrix<Scalar, ColsAtCompileTimeMinusOne, 1> SuperDiagVectorType;
     typedef HouseholderSequence<
               const MatrixType,
-              CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const Diagonal<const MatrixType,0> >
+              const typename internal::remove_all<typename Diagonal<const MatrixType,0>::ConjugateReturnType>::type
             > HouseholderUSequenceType;
     typedef HouseholderSequence<
               const typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type,
@@ -52,7 +53,7 @@ template<typename _MatrixType> class UpperBidiagonalization
     */
     UpperBidiagonalization() : m_householder(), m_bidiagonal(), m_isInitialized(false) {}
 
-    UpperBidiagonalization(const MatrixType& matrix)
+    explicit UpperBidiagonalization(const MatrixType& matrix)
       : m_householder(matrix.rows(), matrix.cols()),
         m_bidiagonal(matrix.cols(), matrix.cols()),
         m_isInitialized(false)
@@ -61,6 +62,7 @@ template<typename _MatrixType> class UpperBidiagonalization
     }
     
     UpperBidiagonalization& compute(const MatrixType& matrix);
+    UpperBidiagonalization& computeUnblocked(const MatrixType& matrix);
     
     const MatrixType& householder() const { return m_householder; }
     const BidiagonalType& bidiagonal() const { return m_bidiagonal; }
@@ -85,45 +87,307 @@ template<typename _MatrixType> class UpperBidiagonalization
     bool m_isInitialized;
 };
 
-template<typename _MatrixType>
-UpperBidiagonalization<_MatrixType>& UpperBidiagonalization<_MatrixType>::compute(const _MatrixType& matrix)
+// Standard upper bidiagonalization without fancy optimizations
+// This version should be faster for small matrix size
+template<typename MatrixType>
+void upperbidiagonalization_inplace_unblocked(MatrixType& mat,
+                                              typename MatrixType::RealScalar *diagonal,
+                                              typename MatrixType::RealScalar *upper_diagonal,
+                                              typename MatrixType::Scalar* tempData = 0)
 {
-  Index rows = matrix.rows();
-  Index cols = matrix.cols();
-  
-  eigen_assert(rows >= cols && "UpperBidiagonalization is only for matrices satisfying rows>=cols.");
-  
-  m_householder = matrix;
+  typedef typename MatrixType::Scalar Scalar;
 
-  ColVectorType temp(rows);
+  Index rows = mat.rows();
+  Index cols = mat.cols();
+
+  typedef Matrix<Scalar,Dynamic,1,ColMajor,MatrixType::MaxRowsAtCompileTime,1> TempType;
+  TempType tempVector;
+  if(tempData==0)
+  {
+    tempVector.resize(rows);
+    tempData = tempVector.data();
+  }
 
   for (Index k = 0; /* breaks at k==cols-1 below */ ; ++k)
   {
     Index remainingRows = rows - k;
     Index remainingCols = cols - k - 1;
 
-    // construct left householder transform in-place in m_householder
-    m_householder.col(k).tail(remainingRows)
-                 .makeHouseholderInPlace(m_householder.coeffRef(k,k),
-                                         m_bidiagonal.template diagonal<0>().coeffRef(k));
-    // apply householder transform to remaining part of m_householder on the left
-    m_householder.bottomRightCorner(remainingRows, remainingCols)
-                 .applyHouseholderOnTheLeft(m_householder.col(k).tail(remainingRows-1),
-                                            m_householder.coeff(k,k),
-                                            temp.data());
+    // construct left householder transform in-place in A
+    mat.col(k).tail(remainingRows)
+       .makeHouseholderInPlace(mat.coeffRef(k,k), diagonal[k]);
+    // apply householder transform to remaining part of A on the left
+    mat.bottomRightCorner(remainingRows, remainingCols)
+       .applyHouseholderOnTheLeft(mat.col(k).tail(remainingRows-1), mat.coeff(k,k), tempData);
 
     if(k == cols-1) break;
+
+    // construct right householder transform in-place in mat
+    mat.row(k).tail(remainingCols)
+       .makeHouseholderInPlace(mat.coeffRef(k,k+1), upper_diagonal[k]);
+    // apply householder transform to remaining part of mat on the right
+    mat.bottomRightCorner(remainingRows-1, remainingCols)
+       .applyHouseholderOnTheRight(mat.row(k).tail(remainingCols-1).transpose(), mat.coeff(k,k+1), tempData);
+  }
+}
+
+/** \internal
+  * Helper routine for the block reduction to upper bidiagonal form.
+  *
+  * Let's partition the matrix A:
+  * 
+  *      | A00 A01 |
+  *  A = |         |
+  *      | A10 A11 |
+  *
+  * This function reduces to bidiagonal form the left \c rows x \a blockSize vertical panel [A00/A10]
+  * and the \a blockSize x \c cols horizontal panel [A00 A01] of the matrix \a A. The bottom-right block A11
+  * is updated using matrix-matrix products:
+  *   A11 -= V * Y^T + X * U^T
+  * where V and U contain the left and right Householder vectors. V and U are stored in A10 and A01
+  * respectively, and the update matrices X and Y are computed during the reduction.
+  * 
+  */
+template<typename MatrixType>
+void upperbidiagonalization_blocked_helper(MatrixType& A,
+                                           typename MatrixType::RealScalar *diagonal,
+                                           typename MatrixType::RealScalar *upper_diagonal,
+                                           Index bs,
+                                           Ref<Matrix<typename MatrixType::Scalar, Dynamic, Dynamic,
+                                                      traits<MatrixType>::Flags & RowMajorBit> > X,
+                                           Ref<Matrix<typename MatrixType::Scalar, Dynamic, Dynamic,
+                                                      traits<MatrixType>::Flags & RowMajorBit> > Y)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  enum { StorageOrder = traits<MatrixType>::Flags & RowMajorBit };
+  typedef InnerStride<int(StorageOrder) == int(ColMajor) ? 1 : Dynamic> ColInnerStride;
+  typedef InnerStride<int(StorageOrder) == int(ColMajor) ? Dynamic : 1> RowInnerStride;
+  typedef Ref<Matrix<Scalar, Dynamic, 1>, 0, ColInnerStride>    SubColumnType;
+  typedef Ref<Matrix<Scalar, 1, Dynamic>, 0, RowInnerStride>    SubRowType;
+  typedef Ref<Matrix<Scalar, Dynamic, Dynamic, StorageOrder > > SubMatType;
+  
+  Index brows = A.rows();
+  Index bcols = A.cols();
+
+  Scalar tau_u, tau_u_prev(0), tau_v;
+
+  for(Index k = 0; k < bs; ++k)
+  {
+    Index remainingRows = brows - k;
+    Index remainingCols = bcols - k - 1;
+
+    SubMatType X_k1( X.block(k,0, remainingRows,k) );
+    SubMatType V_k1( A.block(k,0, remainingRows,k) );
+
+    // 1 - update the k-th column of A
+    SubColumnType v_k = A.col(k).tail(remainingRows);
+          v_k -= V_k1 * Y.row(k).head(k).adjoint();
+    if(k) v_k -= X_k1 * A.col(k).head(k);
+    
+    // 2 - construct left Householder transform in-place
+    v_k.makeHouseholderInPlace(tau_v, diagonal[k]);
+       
+    if(k+1<bcols)
+    {
+      SubMatType Y_k  ( Y.block(k+1,0, remainingCols, k+1) );
+      SubMatType U_k1 ( A.block(0,k+1, k,remainingCols) );
+      
+      // this eases the application of Householder transformations
+      // A(k,k) will store tau_v later
+      A(k,k) = Scalar(1);
+
+      // 3 - Compute y_k^T = tau_v * ( A^T*v_k - Y_k-1*V_k-1^T*v_k - U_k-1*X_k-1^T*v_k )
+      {
+        SubColumnType y_k( Y.col(k).tail(remainingCols) );
+        
+        // let's use the beginning of column k of Y as a temporary vector
+        SubColumnType tmp( Y.col(k).head(k) );
+        y_k.noalias()  = A.block(k,k+1, remainingRows,remainingCols).adjoint() * v_k; // bottleneck
+        tmp.noalias()  = V_k1.adjoint()  * v_k;
+        y_k.noalias() -= Y_k.leftCols(k) * tmp;
+        tmp.noalias()  = X_k1.adjoint()  * v_k;
+        y_k.noalias() -= U_k1.adjoint()  * tmp;
+        y_k *= numext::conj(tau_v);
+      }
+
+      // 4 - update k-th row of A (it will become u_k)
+      SubRowType u_k( A.row(k).tail(remainingCols) );
+      u_k = u_k.conjugate();
+      {
+        u_k -= Y_k * A.row(k).head(k+1).adjoint();
+        if(k) u_k -= U_k1.adjoint() * X.row(k).head(k).adjoint();
+      }
+
+      // 5 - construct right Householder transform in-place
+      u_k.makeHouseholderInPlace(tau_u, upper_diagonal[k]);
+
+      // this eases the application of Householder transformations
+      // A(k,k+1) will store tau_u later
+      A(k,k+1) = Scalar(1);
+
+      // 6 - Compute x_k = tau_u * ( A*u_k - X_k-1*U_k-1^T*u_k - V_k*Y_k^T*u_k )
+      {
+        SubColumnType x_k ( X.col(k).tail(remainingRows-1) );
+        
+        // let's use the beginning of column k of X as temporary vectors
+        // note that tmp0 and tmp1 overlap
+        SubColumnType tmp0 ( X.col(k).head(k) ),
+                      tmp1 ( X.col(k).head(k+1) );
+                    
+        x_k.noalias()   = A.block(k+1,k+1, remainingRows-1,remainingCols) * u_k.transpose(); // bottleneck
+        tmp0.noalias()  = U_k1 * u_k.transpose();
+        x_k.noalias()  -= X_k1.bottomRows(remainingRows-1) * tmp0;
+        tmp1.noalias()  = Y_k.adjoint() * u_k.transpose();
+        x_k.noalias()  -= A.block(k+1,0, remainingRows-1,k+1) * tmp1;
+        x_k *= numext::conj(tau_u);
+        tau_u = numext::conj(tau_u);
+        u_k = u_k.conjugate();
+      }
+
+      if(k>0) A.coeffRef(k-1,k) = tau_u_prev;
+      tau_u_prev = tau_u;
+    }
+    else
+      A.coeffRef(k-1,k) = tau_u_prev;
+
+    A.coeffRef(k,k) = tau_v;
+  }
+  
+  if(bs<bcols)
+    A.coeffRef(bs-1,bs) = tau_u_prev;
+
+  // update A11 (the trailing bottom-right block)
+  if(bcols>bs && brows>bs)
+  {
+    SubMatType A11( A.bottomRightCorner(brows-bs,bcols-bs) );
+    SubMatType A10( A.block(bs,0, brows-bs,bs) );
+    SubMatType A01( A.block(0,bs, bs,bcols-bs) );
+    Scalar tmp = A01(bs-1,0);
+    A01(bs-1,0) = 1;
+    A11.noalias() -= A10 * Y.topLeftCorner(bcols,bs).bottomRows(bcols-bs).adjoint();
+    A11.noalias() -= X.topLeftCorner(brows,bs).bottomRows(brows-bs) * A01;
+    A01(bs-1,0) = tmp;
+  }
+}
+
+/** \internal
+  *
+  * Implementation of a block-bidiagonal reduction.
+  * It is based on the following paper:
+  *   The Design of a Parallel Dense Linear Algebra Software Library: Reduction to Hessenberg, Tridiagonal, and Bidiagonal Form.
+  *   by Jaeyoung Choi, Jack J. Dongarra, David W. Walker. (1995)
+  *   section 3.3
+  */
+template<typename MatrixType, typename BidiagType>
+void upperbidiagonalization_inplace_blocked(MatrixType& A, BidiagType& bidiagonal,
+                                            Index maxBlockSize=32,
+                                            typename MatrixType::Scalar* /*tempData*/ = 0)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef Block<MatrixType,Dynamic,Dynamic> BlockType;
+
+  Index rows = A.rows();
+  Index cols = A.cols();
+  Index size = (std::min)(rows, cols);
+
+  // X and Y are work space
+  enum { StorageOrder = traits<MatrixType>::Flags & RowMajorBit };
+  Matrix<Scalar,
+         MatrixType::RowsAtCompileTime,
+         Dynamic,
+         StorageOrder,
+         MatrixType::MaxRowsAtCompileTime> X(rows,maxBlockSize);
+  Matrix<Scalar,
+         MatrixType::ColsAtCompileTime,
+         Dynamic,
+         StorageOrder,
+         MatrixType::MaxColsAtCompileTime> Y(cols,maxBlockSize);
+  Index blockSize = (std::min)(maxBlockSize,size);
+
+  Index k = 0;
+  for(k = 0; k < size; k += blockSize)
+  {
+    Index bs = (std::min)(size-k,blockSize);  // actual size of the block
+    Index brows = rows - k;                   // rows of the block
+    Index bcols = cols - k;                   // columns of the block
+
+    // partition the matrix A:
+    // 
+    //      | A00 A01 A02 |
+    //      |             |
+    // A  = | A10 A11 A12 |
+    //      |             |
+    //      | A20 A21 A22 |
+    //
+    // where A11 is a bs x bs diagonal block,
+    // and let:
+    //      | A11 A12 |
+    //  B = |         |
+    //      | A21 A22 |
+
+    BlockType B = A.block(k,k,brows,bcols);
     
-    // construct right householder transform in-place in m_householder
-    m_householder.row(k).tail(remainingCols)
-                 .makeHouseholderInPlace(m_householder.coeffRef(k,k+1),
-                                         m_bidiagonal.template diagonal<1>().coeffRef(k));
-    // apply householder transform to remaining part of m_householder on the left
-    m_householder.bottomRightCorner(remainingRows-1, remainingCols)
-                 .applyHouseholderOnTheRight(m_householder.row(k).tail(remainingCols-1).transpose(),
-                                             m_householder.coeff(k,k+1),
-                                             temp.data());
+    // This stage performs the bidiagonalization of A11, A21, A12, and updating of A22.
+    // Finally, the algorithm continues on the updated A22.
+    //
+    // However, if B is too small, or A22 empty, then let's use an unblocked strategy
+    if(k+bs==cols || bcols<48) // somewhat arbitrary threshold
+    {
+      upperbidiagonalization_inplace_unblocked(B,
+                                               &(bidiagonal.template diagonal<0>().coeffRef(k)),
+                                               &(bidiagonal.template diagonal<1>().coeffRef(k)),
+                                               X.data()
+                                              );
+      break; // We're done
+    }
+    else
+    {
+      upperbidiagonalization_blocked_helper<BlockType>( B,
+                                                        &(bidiagonal.template diagonal<0>().coeffRef(k)),
+                                                        &(bidiagonal.template diagonal<1>().coeffRef(k)),
+                                                        bs,
+                                                        X.topLeftCorner(brows,bs),
+                                                        Y.topLeftCorner(bcols,bs)
+                                                      );
+    }
   }
+}
+
+template<typename _MatrixType>
+UpperBidiagonalization<_MatrixType>& UpperBidiagonalization<_MatrixType>::computeUnblocked(const _MatrixType& matrix)
+{
+  Index rows = matrix.rows();
+  Index cols = matrix.cols();
+  EIGEN_ONLY_USED_FOR_DEBUG(cols);
+
+  eigen_assert(rows >= cols && "UpperBidiagonalization is only for Arices satisfying rows>=cols.");
+
+  m_householder = matrix;
+
+  ColVectorType temp(rows);
+
+  upperbidiagonalization_inplace_unblocked(m_householder,
+                                           &(m_bidiagonal.template diagonal<0>().coeffRef(0)),
+                                           &(m_bidiagonal.template diagonal<1>().coeffRef(0)),
+                                           temp.data());
+
+  m_isInitialized = true;
+  return *this;
+}
+
+template<typename _MatrixType>
+UpperBidiagonalization<_MatrixType>& UpperBidiagonalization<_MatrixType>::compute(const _MatrixType& matrix)
+{
+  Index rows = matrix.rows();
+  Index cols = matrix.cols();
+  EIGEN_ONLY_USED_FOR_DEBUG(rows);
+  EIGEN_ONLY_USED_FOR_DEBUG(cols);
+
+  eigen_assert(rows >= cols && "UpperBidiagonalization is only for Arices satisfying rows>=cols.");
+
+  m_householder = matrix;
+  upperbidiagonalization_inplace_blocked(m_householder, m_bidiagonal);
+            
   m_isInitialized = true;
   return *this;
 }
diff --git a/nuparu/include/Eigen/src/SparseCholesky/SimplicialCholesky.h b/nuparu/include/Eigen/src/SparseCholesky/SimplicialCholesky.h
index f41d7e01..1343eb15 100644
--- a/nuparu/include/Eigen/src/SparseCholesky/SimplicialCholesky.h
+++ b/nuparu/include/Eigen/src/SparseCholesky/SimplicialCholesky.h
@@ -17,6 +17,27 @@ enum SimplicialCholeskyMode {
   SimplicialCholeskyLDLT
 };
 
+namespace internal {
+  template<typename CholMatrixType, typename InputMatrixType>
+  struct simplicial_cholesky_grab_input {
+    typedef CholMatrixType const * ConstCholMatrixPtr;
+    static void run(const InputMatrixType& input, ConstCholMatrixPtr &pmat, CholMatrixType &tmp)
+    {
+      tmp = input;
+      pmat = &tmp;
+    }
+  };
+  
+  template<typename MatrixType>
+  struct simplicial_cholesky_grab_input<MatrixType,MatrixType> {
+    typedef MatrixType const * ConstMatrixPtr;
+    static void run(const MatrixType& input, ConstMatrixPtr &pmat, MatrixType &/*tmp*/)
+    {
+      pmat = &input;
+    }
+  };
+} // end namespace internal
+
 /** \ingroup SparseCholesky_Module
   * \brief A direct sparse Cholesky factorizations
   *
@@ -33,26 +54,39 @@ enum SimplicialCholeskyMode {
   *
   */
 template<typename Derived>
-class SimplicialCholeskyBase : internal::noncopyable
+class SimplicialCholeskyBase : public SparseSolverBase<Derived>
 {
+    typedef SparseSolverBase<Derived> Base;
+    using Base::m_isInitialized;
+    
   public:
     typedef typename internal::traits<Derived>::MatrixType MatrixType;
+    typedef typename internal::traits<Derived>::OrderingType OrderingType;
     enum { UpLo = internal::traits<Derived>::UpLo };
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
-    typedef SparseMatrix<Scalar,ColMajor,Index> CholMatrixType;
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef SparseMatrix<Scalar,ColMajor,StorageIndex> CholMatrixType;
+    typedef CholMatrixType const * ConstCholMatrixPtr;
     typedef Matrix<Scalar,Dynamic,1> VectorType;
+    typedef Matrix<StorageIndex,Dynamic,1> VectorI;
+
+    enum {
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
 
   public:
+    
+    using Base::derived;
 
     /** Default constructor */
     SimplicialCholeskyBase()
-      : m_info(Success), m_isInitialized(false), m_shiftOffset(0), m_shiftScale(1)
+      : m_info(Success), m_shiftOffset(0), m_shiftScale(1)
     {}
 
-    SimplicialCholeskyBase(const MatrixType& matrix)
-      : m_info(Success), m_isInitialized(false), m_shiftOffset(0), m_shiftScale(1)
+    explicit SimplicialCholeskyBase(const MatrixType& matrix)
+      : m_info(Success), m_shiftOffset(0), m_shiftScale(1)
     {
       derived().compute(matrix);
     }
@@ -78,42 +112,14 @@ class SimplicialCholeskyBase : internal::noncopyable
       return m_info;
     }
     
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<SimplicialCholeskyBase, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "Simplicial LLT or LDLT is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "SimplicialCholeskyBase::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<SimplicialCholeskyBase, Rhs>(*this, b.derived());
-    }
-    
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::sparse_solve_retval<SimplicialCholeskyBase, Rhs>
-    solve(const SparseMatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "Simplicial LLT or LDLT is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "SimplicialCholesky::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::sparse_solve_retval<SimplicialCholeskyBase, Rhs>(*this, b.derived());
-    }
-    
     /** \returns the permutation P
       * \sa permutationPinv() */
-    const PermutationMatrix<Dynamic,Dynamic,Index>& permutationP() const
+    const PermutationMatrix<Dynamic,Dynamic,StorageIndex>& permutationP() const
     { return m_P; }
     
     /** \returns the inverse P^-1 of the permutation P
       * \sa permutationP() */
-    const PermutationMatrix<Dynamic,Dynamic,Index>& permutationPinv() const
+    const PermutationMatrix<Dynamic,Dynamic,StorageIndex>& permutationPinv() const
     { return m_Pinv; }
 
     /** Sets the shift parameters that will be used to adjust the diagonal coefficients during the numerical factorization.
@@ -149,7 +155,7 @@ class SimplicialCholeskyBase : internal::noncopyable
 
     /** \internal */
     template<typename Rhs,typename Dest>
-    void _solve(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
+    void _solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
     {
       eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or symbolic()/numeric()");
       eigen_assert(m_matrix.rows()==b.rows());
@@ -174,6 +180,12 @@ class SimplicialCholeskyBase : internal::noncopyable
       if(m_P.size()>0)
         dest = m_Pinv * dest;
     }
+    
+    template<typename Rhs,typename Dest>
+    void _solve_impl(const SparseMatrixBase<Rhs> &b, SparseMatrixBase<Dest> &dest) const
+    {
+      internal::solve_sparse_through_dense_panels(derived(), b, dest);
+    }
 
 #endif // EIGEN_PARSED_BY_DOXYGEN
 
@@ -185,20 +197,33 @@ class SimplicialCholeskyBase : internal::noncopyable
     {
       eigen_assert(matrix.rows()==matrix.cols());
       Index size = matrix.cols();
-      CholMatrixType ap(size,size);
-      ordering(matrix, ap);
-      analyzePattern_preordered(ap, DoLDLT);
-      factorize_preordered<DoLDLT>(ap);
+      CholMatrixType tmp(size,size);
+      ConstCholMatrixPtr pmat;
+      ordering(matrix, pmat, tmp);
+      analyzePattern_preordered(*pmat, DoLDLT);
+      factorize_preordered<DoLDLT>(*pmat);
     }
     
     template<bool DoLDLT>
     void factorize(const MatrixType& a)
     {
       eigen_assert(a.rows()==a.cols());
-      int size = a.cols();
-      CholMatrixType ap(size,size);
-      ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P);
-      factorize_preordered<DoLDLT>(ap);
+      Index size = a.cols();
+      CholMatrixType tmp(size,size);
+      ConstCholMatrixPtr pmat;
+      
+      if(m_P.size()==0 && (UpLo&Upper)==Upper)
+      {
+        // If there is no ordering, try to directly use the input matrix without any copy
+        internal::simplicial_cholesky_grab_input<CholMatrixType,MatrixType>::run(a, pmat, tmp);
+      }
+      else
+      {
+        tmp.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P);
+        pmat = &tmp;
+      }
+      
+      factorize_preordered<DoLDLT>(*pmat);
     }
 
     template<bool DoLDLT>
@@ -207,14 +232,15 @@ class SimplicialCholeskyBase : internal::noncopyable
     void analyzePattern(const MatrixType& a, bool doLDLT)
     {
       eigen_assert(a.rows()==a.cols());
-      int size = a.cols();
-      CholMatrixType ap(size,size);
-      ordering(a, ap);
-      analyzePattern_preordered(ap,doLDLT);
+      Index size = a.cols();
+      CholMatrixType tmp(size,size);
+      ConstCholMatrixPtr pmat;
+      ordering(a, pmat, tmp);
+      analyzePattern_preordered(*pmat,doLDLT);
     }
     void analyzePattern_preordered(const CholMatrixType& a, bool doLDLT);
     
-    void ordering(const MatrixType& a, CholMatrixType& ap);
+    void ordering(const MatrixType& a, ConstCholMatrixPtr &pmat, CholMatrixType& ap);
 
     /** keeps off-diagonal entries; drops diagonal entries */
     struct keep_diag {
@@ -225,56 +251,58 @@ class SimplicialCholeskyBase : internal::noncopyable
     };
 
     mutable ComputationInfo m_info;
-    bool m_isInitialized;
     bool m_factorizationIsOk;
     bool m_analysisIsOk;
     
     CholMatrixType m_matrix;
     VectorType m_diag;                                // the diagonal coefficients (LDLT mode)
-    VectorXi m_parent;                                // elimination tree
-    VectorXi m_nonZerosPerCol;
-    PermutationMatrix<Dynamic,Dynamic,Index> m_P;     // the permutation
-    PermutationMatrix<Dynamic,Dynamic,Index> m_Pinv;  // the inverse permutation
+    VectorI m_parent;                                 // elimination tree
+    VectorI m_nonZerosPerCol;
+    PermutationMatrix<Dynamic,Dynamic,StorageIndex> m_P;     // the permutation
+    PermutationMatrix<Dynamic,Dynamic,StorageIndex> m_Pinv;  // the inverse permutation
 
     RealScalar m_shiftOffset;
     RealScalar m_shiftScale;
 };
 
-template<typename _MatrixType, int _UpLo = Lower> class SimplicialLLT;
-template<typename _MatrixType, int _UpLo = Lower> class SimplicialLDLT;
-template<typename _MatrixType, int _UpLo = Lower> class SimplicialCholesky;
+template<typename _MatrixType, int _UpLo = Lower, typename _Ordering = AMDOrdering<typename _MatrixType::StorageIndex> > class SimplicialLLT;
+template<typename _MatrixType, int _UpLo = Lower, typename _Ordering = AMDOrdering<typename _MatrixType::StorageIndex> > class SimplicialLDLT;
+template<typename _MatrixType, int _UpLo = Lower, typename _Ordering = AMDOrdering<typename _MatrixType::StorageIndex> > class SimplicialCholesky;
 
 namespace internal {
 
-template<typename _MatrixType, int _UpLo> struct traits<SimplicialLLT<_MatrixType,_UpLo> >
+template<typename _MatrixType, int _UpLo, typename _Ordering> struct traits<SimplicialLLT<_MatrixType,_UpLo,_Ordering> >
 {
   typedef _MatrixType MatrixType;
+  typedef _Ordering OrderingType;
   enum { UpLo = _UpLo };
   typedef typename MatrixType::Scalar                         Scalar;
-  typedef typename MatrixType::Index                          Index;
-  typedef SparseMatrix<Scalar, ColMajor, Index>               CholMatrixType;
-  typedef SparseTriangularView<CholMatrixType, Eigen::Lower>  MatrixL;
-  typedef SparseTriangularView<typename CholMatrixType::AdjointReturnType, Eigen::Upper>   MatrixU;
-  static inline MatrixL getL(const MatrixType& m) { return m; }
-  static inline MatrixU getU(const MatrixType& m) { return m.adjoint(); }
+  typedef typename MatrixType::StorageIndex                   StorageIndex;
+  typedef SparseMatrix<Scalar, ColMajor, StorageIndex>        CholMatrixType;
+  typedef TriangularView<const CholMatrixType, Eigen::Lower>  MatrixL;
+  typedef TriangularView<const typename CholMatrixType::AdjointReturnType, Eigen::Upper>   MatrixU;
+  static inline MatrixL getL(const MatrixType& m) { return MatrixL(m); }
+  static inline MatrixU getU(const MatrixType& m) { return MatrixU(m.adjoint()); }
 };
 
-template<typename _MatrixType,int _UpLo> struct traits<SimplicialLDLT<_MatrixType,_UpLo> >
+template<typename _MatrixType,int _UpLo, typename _Ordering> struct traits<SimplicialLDLT<_MatrixType,_UpLo,_Ordering> >
 {
   typedef _MatrixType MatrixType;
+  typedef _Ordering OrderingType;
   enum { UpLo = _UpLo };
   typedef typename MatrixType::Scalar                             Scalar;
-  typedef typename MatrixType::Index                              Index;
-  typedef SparseMatrix<Scalar, ColMajor, Index>                   CholMatrixType;
-  typedef SparseTriangularView<CholMatrixType, Eigen::UnitLower>  MatrixL;
-  typedef SparseTriangularView<typename CholMatrixType::AdjointReturnType, Eigen::UnitUpper> MatrixU;
-  static inline MatrixL getL(const MatrixType& m) { return m; }
-  static inline MatrixU getU(const MatrixType& m) { return m.adjoint(); }
+  typedef typename MatrixType::StorageIndex                       StorageIndex;
+  typedef SparseMatrix<Scalar, ColMajor, StorageIndex>            CholMatrixType;
+  typedef TriangularView<const CholMatrixType, Eigen::UnitLower>  MatrixL;
+  typedef TriangularView<const typename CholMatrixType::AdjointReturnType, Eigen::UnitUpper> MatrixU;
+  static inline MatrixL getL(const MatrixType& m) { return MatrixL(m); }
+  static inline MatrixU getU(const MatrixType& m) { return MatrixU(m.adjoint()); }
 };
 
-template<typename _MatrixType, int _UpLo> struct traits<SimplicialCholesky<_MatrixType,_UpLo> >
+template<typename _MatrixType, int _UpLo, typename _Ordering> struct traits<SimplicialCholesky<_MatrixType,_UpLo,_Ordering> >
 {
   typedef _MatrixType MatrixType;
+  typedef _Ordering OrderingType;
   enum { UpLo = _UpLo };
 };
 
@@ -294,11 +322,14 @@ template<typename _MatrixType, int _UpLo> struct traits<SimplicialCholesky<_Matr
   * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
   *               or Upper. Default is Lower.
+  * \tparam _Ordering The ordering method to use, either AMDOrdering<> or NaturalOrdering<>. Default is AMDOrdering<>
+  *
+  * \implsparsesolverconcept
   *
-  * \sa class SimplicialLDLT
+  * \sa class SimplicialLDLT, class AMDOrdering, class NaturalOrdering
   */
-template<typename _MatrixType, int _UpLo>
-    class SimplicialLLT : public SimplicialCholeskyBase<SimplicialLLT<_MatrixType,_UpLo> >
+template<typename _MatrixType, int _UpLo, typename _Ordering>
+    class SimplicialLLT : public SimplicialCholeskyBase<SimplicialLLT<_MatrixType,_UpLo,_Ordering> >
 {
 public:
     typedef _MatrixType MatrixType;
@@ -306,7 +337,7 @@ template<typename _MatrixType, int _UpLo>
     typedef SimplicialCholeskyBase<SimplicialLLT> Base;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef typename MatrixType::StorageIndex StorageIndex;
     typedef SparseMatrix<Scalar,ColMajor,Index> CholMatrixType;
     typedef Matrix<Scalar,Dynamic,1> VectorType;
     typedef internal::traits<SimplicialLLT> Traits;
@@ -316,7 +347,7 @@ template<typename _MatrixType, int _UpLo>
     /** Default constructor */
     SimplicialLLT() : Base() {}
     /** Constructs and performs the LLT factorization of \a matrix */
-    SimplicialLLT(const MatrixType& matrix)
+    explicit SimplicialLLT(const MatrixType& matrix)
         : Base(matrix) {}
 
     /** \returns an expression of the factor L */
@@ -382,11 +413,14 @@ template<typename _MatrixType, int _UpLo>
   * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
   *               or Upper. Default is Lower.
+  * \tparam _Ordering The ordering method to use, either AMDOrdering<> or NaturalOrdering<>. Default is AMDOrdering<>
   *
-  * \sa class SimplicialLLT
+  * \implsparsesolverconcept
+  *
+  * \sa class SimplicialLLT, class AMDOrdering, class NaturalOrdering
   */
-template<typename _MatrixType, int _UpLo>
-    class SimplicialLDLT : public SimplicialCholeskyBase<SimplicialLDLT<_MatrixType,_UpLo> >
+template<typename _MatrixType, int _UpLo, typename _Ordering>
+    class SimplicialLDLT : public SimplicialCholeskyBase<SimplicialLDLT<_MatrixType,_UpLo,_Ordering> >
 {
 public:
     typedef _MatrixType MatrixType;
@@ -394,8 +428,8 @@ template<typename _MatrixType, int _UpLo>
     typedef SimplicialCholeskyBase<SimplicialLDLT> Base;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
-    typedef SparseMatrix<Scalar,ColMajor,Index> CholMatrixType;
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef SparseMatrix<Scalar,ColMajor,StorageIndex> CholMatrixType;
     typedef Matrix<Scalar,Dynamic,1> VectorType;
     typedef internal::traits<SimplicialLDLT> Traits;
     typedef typename Traits::MatrixL  MatrixL;
@@ -405,7 +439,7 @@ template<typename _MatrixType, int _UpLo>
     SimplicialLDLT() : Base() {}
 
     /** Constructs and performs the LLT factorization of \a matrix */
-    SimplicialLDLT(const MatrixType& matrix)
+    explicit SimplicialLDLT(const MatrixType& matrix)
         : Base(matrix) {}
 
     /** \returns a vector expression of the diagonal D */
@@ -467,8 +501,8 @@ template<typename _MatrixType, int _UpLo>
   *
   * \sa class SimplicialLDLT, class SimplicialLLT
   */
-template<typename _MatrixType, int _UpLo>
-    class SimplicialCholesky : public SimplicialCholeskyBase<SimplicialCholesky<_MatrixType,_UpLo> >
+template<typename _MatrixType, int _UpLo, typename _Ordering>
+    class SimplicialCholesky : public SimplicialCholeskyBase<SimplicialCholesky<_MatrixType,_UpLo,_Ordering> >
 {
 public:
     typedef _MatrixType MatrixType;
@@ -476,8 +510,8 @@ template<typename _MatrixType, int _UpLo>
     typedef SimplicialCholeskyBase<SimplicialCholesky> Base;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
-    typedef SparseMatrix<Scalar,ColMajor,Index> CholMatrixType;
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef SparseMatrix<Scalar,ColMajor,StorageIndex> CholMatrixType;
     typedef Matrix<Scalar,Dynamic,1> VectorType;
     typedef internal::traits<SimplicialCholesky> Traits;
     typedef internal::traits<SimplicialLDLT<MatrixType,UpLo> > LDLTTraits;
@@ -485,7 +519,7 @@ template<typename _MatrixType, int _UpLo>
   public:
     SimplicialCholesky() : Base(), m_LDLT(true) {}
 
-    SimplicialCholesky(const MatrixType& matrix)
+    explicit SimplicialCholesky(const MatrixType& matrix)
       : Base(), m_LDLT(true)
     {
       compute(matrix);
@@ -554,7 +588,7 @@ template<typename _MatrixType, int _UpLo>
 
     /** \internal */
     template<typename Rhs,typename Dest>
-    void _solve(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
+    void _solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
     {
       eigen_assert(Base::m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or symbolic()/numeric()");
       eigen_assert(Base::m_matrix.rows()==b.rows());
@@ -590,6 +624,13 @@ template<typename _MatrixType, int _UpLo>
         dest = Base::m_Pinv * dest;
     }
     
+    /** \internal */
+    template<typename Rhs,typename Dest>
+    void _solve_impl(const SparseMatrixBase<Rhs> &b, SparseMatrixBase<Dest> &dest) const
+    {
+      internal::solve_sparse_through_dense_panels(*this, b, dest);
+    }
+    
     Scalar determinant() const
     {
       if(m_LDLT)
@@ -608,60 +649,43 @@ template<typename _MatrixType, int _UpLo>
 };
 
 template<typename Derived>
-void SimplicialCholeskyBase<Derived>::ordering(const MatrixType& a, CholMatrixType& ap)
+void SimplicialCholeskyBase<Derived>::ordering(const MatrixType& a, ConstCholMatrixPtr &pmat, CholMatrixType& ap)
 {
   eigen_assert(a.rows()==a.cols());
   const Index size = a.rows();
-  // TODO allows to configure the permutation
-  // Note that amd compute the inverse permutation
+  pmat = &ap;
+  // Note that ordering methods compute the inverse permutation
+  if(!internal::is_same<OrderingType,NaturalOrdering<Index> >::value)
   {
-    CholMatrixType C;
-    C = a.template selfadjointView<UpLo>();
-    // remove diagonal entries:
-    // seems not to be needed
-    // C.prune(keep_diag());
-    internal::minimum_degree_ordering(C, m_Pinv);
-  }
+    {
+      CholMatrixType C;
+      C = a.template selfadjointView<UpLo>();
+      
+      OrderingType ordering;
+      ordering(C,m_Pinv);
+    }
 
-  if(m_Pinv.size()>0)
-    m_P = m_Pinv.inverse();
+    if(m_Pinv.size()>0) m_P = m_Pinv.inverse();
+    else                m_P.resize(0);
+    
+    ap.resize(size,size);
+    ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P);
+  }
   else
+  {
+    m_Pinv.resize(0);
     m_P.resize(0);
-
-  ap.resize(size,size);
-  ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P);
+    if(int(UpLo)==int(Lower) || MatrixType::IsRowMajor)
+    {
+      // we have to transpose the lower part to the upper one
+      ap.resize(size,size);
+      ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>();
+    }
+    else
+      internal::simplicial_cholesky_grab_input<CholMatrixType,MatrixType>::run(a, pmat, ap);
+  }  
 }
 
-namespace internal {
-  
-template<typename Derived, typename Rhs>
-struct solve_retval<SimplicialCholeskyBase<Derived>, Rhs>
-  : solve_retval_base<SimplicialCholeskyBase<Derived>, Rhs>
-{
-  typedef SimplicialCholeskyBase<Derived> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec().derived()._solve(rhs(),dst);
-  }
-};
-
-template<typename Derived, typename Rhs>
-struct sparse_solve_retval<SimplicialCholeskyBase<Derived>, Rhs>
-  : sparse_solve_retval_base<SimplicialCholeskyBase<Derived>, Rhs>
-{
-  typedef SimplicialCholeskyBase<Derived> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    this->defaultEvalTo(dst);
-  }
-};
-
-} // end namespace internal
-
 } // end namespace Eigen
 
 #endif // EIGEN_SIMPLICIAL_CHOLESKY_H
diff --git a/nuparu/include/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h b/nuparu/include/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
index 7aaf702b..31e06995 100644
--- a/nuparu/include/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
+++ b/nuparu/include/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
@@ -50,14 +50,14 @@ namespace Eigen {
 template<typename Derived>
 void SimplicialCholeskyBase<Derived>::analyzePattern_preordered(const CholMatrixType& ap, bool doLDLT)
 {
-  const Index size = ap.rows();
+  const StorageIndex size = StorageIndex(ap.rows());
   m_matrix.resize(size, size);
   m_parent.resize(size);
   m_nonZerosPerCol.resize(size);
 
-  ei_declare_aligned_stack_constructed_variable(Index, tags, size, 0);
+  ei_declare_aligned_stack_constructed_variable(StorageIndex, tags, size, 0);
 
-  for(Index k = 0; k < size; ++k)
+  for(StorageIndex k = 0; k < size; ++k)
   {
     /* L(k,:) pattern: all nodes reachable in etree from nz in A(0:k-1,k) */
     m_parent[k] = -1;             /* parent of k is not yet known */
@@ -65,7 +65,7 @@ void SimplicialCholeskyBase<Derived>::analyzePattern_preordered(const CholMatrix
     m_nonZerosPerCol[k] = 0;      /* count of nonzeros in column k of L */
     for(typename CholMatrixType::InnerIterator it(ap,k); it; ++it)
     {
-      Index i = it.index();
+      StorageIndex i = it.index();
       if(i < k)
       {
         /* follow path from i to root of etree, stop at flagged node */
@@ -82,9 +82,9 @@ void SimplicialCholeskyBase<Derived>::analyzePattern_preordered(const CholMatrix
   }
 
   /* construct Lp index array from m_nonZerosPerCol column counts */
-  Index* Lp = m_matrix.outerIndexPtr();
+  StorageIndex* Lp = m_matrix.outerIndexPtr();
   Lp[0] = 0;
-  for(Index k = 0; k < size; ++k)
+  for(StorageIndex k = 0; k < size; ++k)
     Lp[k+1] = Lp[k] + m_nonZerosPerCol[k] + (doLDLT ? 0 : 1);
 
   m_matrix.resizeNonZeros(Lp[size]);
@@ -104,31 +104,31 @@ void SimplicialCholeskyBase<Derived>::factorize_preordered(const CholMatrixType&
 
   eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
   eigen_assert(ap.rows()==ap.cols());
-  const Index size = ap.rows();
-  eigen_assert(m_parent.size()==size);
-  eigen_assert(m_nonZerosPerCol.size()==size);
+  eigen_assert(m_parent.size()==ap.rows());
+  eigen_assert(m_nonZerosPerCol.size()==ap.rows());
 
-  const Index* Lp = m_matrix.outerIndexPtr();
-  Index* Li = m_matrix.innerIndexPtr();
+  const StorageIndex size = StorageIndex(ap.rows());
+  const StorageIndex* Lp = m_matrix.outerIndexPtr();
+  StorageIndex* Li = m_matrix.innerIndexPtr();
   Scalar* Lx = m_matrix.valuePtr();
 
   ei_declare_aligned_stack_constructed_variable(Scalar, y, size, 0);
-  ei_declare_aligned_stack_constructed_variable(Index,  pattern, size, 0);
-  ei_declare_aligned_stack_constructed_variable(Index,  tags, size, 0);
+  ei_declare_aligned_stack_constructed_variable(StorageIndex,  pattern, size, 0);
+  ei_declare_aligned_stack_constructed_variable(StorageIndex,  tags, size, 0);
 
   bool ok = true;
   m_diag.resize(DoLDLT ? size : 0);
 
-  for(Index k = 0; k < size; ++k)
+  for(StorageIndex k = 0; k < size; ++k)
   {
     // compute nonzero pattern of kth row of L, in topological order
     y[k] = 0.0;                     // Y(0:k) is now all zero
-    Index top = size;               // stack for pattern is empty
+    StorageIndex top = size;               // stack for pattern is empty
     tags[k] = k;                    // mark node k as visited
     m_nonZerosPerCol[k] = 0;        // count of nonzeros in column k of L
-    for(typename MatrixType::InnerIterator it(ap,k); it; ++it)
+    for(typename CholMatrixType::InnerIterator it(ap,k); it; ++it)
     {
-      Index i = it.index();
+      StorageIndex i = it.index();
       if(i <= k)
       {
         y[i] += numext::conj(it.value());            /* scatter A(i,k) into Y (sum duplicates) */
diff --git a/nuparu/include/Eigen/src/SparseCore/AmbiVector.h b/nuparu/include/Eigen/src/SparseCore/AmbiVector.h
index 17fff96a..1233e164 100644
--- a/nuparu/include/Eigen/src/SparseCore/AmbiVector.h
+++ b/nuparu/include/Eigen/src/SparseCore/AmbiVector.h
@@ -19,15 +19,15 @@ namespace internal {
   *
   * See BasicSparseLLT and SparseProduct for usage examples.
   */
-template<typename _Scalar, typename _Index>
+template<typename _Scalar, typename _StorageIndex>
 class AmbiVector
 {
   public:
     typedef _Scalar Scalar;
-    typedef _Index Index;
+    typedef _StorageIndex StorageIndex;
     typedef typename NumTraits<Scalar>::Real RealScalar;
 
-    AmbiVector(Index size)
+    explicit AmbiVector(Index size)
       : m_buffer(0), m_zero(0), m_size(0), m_allocatedSize(0), m_allocatedElements(0), m_mode(-1)
     {
       resize(size);
@@ -39,7 +39,7 @@ class AmbiVector
     Index nonZeros() const;
 
     /** Specifies a sub-vector to work on */
-    void setBounds(Index start, Index end) { m_start = start; m_end = end; }
+    void setBounds(Index start, Index end) { m_start = convert_index(start); m_end = convert_index(end); }
 
     void setZero();
 
@@ -55,12 +55,16 @@ class AmbiVector
     {
       if (m_allocatedSize < size)
         reallocate(size);
-      m_size = size;
+      m_size = convert_index(size);
     }
 
-    Index size() const { return m_size; }
+    StorageIndex size() const { return m_size; }
 
   protected:
+    StorageIndex convert_index(Index idx)
+    {
+      return internal::convert_index<StorageIndex>(idx);
+    }
 
     void reallocate(Index size)
     {
@@ -69,16 +73,16 @@ class AmbiVector
       delete[] m_buffer;
       if (size<1000)
       {
-        Index allocSize = (size * sizeof(ListEl))/sizeof(Scalar);
-        m_allocatedElements = (allocSize*sizeof(Scalar))/sizeof(ListEl);
+        Index allocSize = (size * sizeof(ListEl) + sizeof(Scalar) - 1)/sizeof(Scalar);
+        m_allocatedElements = convert_index((allocSize*sizeof(Scalar))/sizeof(ListEl));
         m_buffer = new Scalar[allocSize];
       }
       else
       {
-        m_allocatedElements = (size*sizeof(Scalar))/sizeof(ListEl);
+        m_allocatedElements = convert_index((size*sizeof(Scalar))/sizeof(ListEl));
         m_buffer = new Scalar[size];
       }
-      m_size = size;
+      m_size = convert_index(size);
       m_start = 0;
       m_end = m_size;
     }
@@ -86,9 +90,9 @@ class AmbiVector
     void reallocateSparse()
     {
       Index copyElements = m_allocatedElements;
-      m_allocatedElements = (std::min)(Index(m_allocatedElements*1.5),m_size);
+      m_allocatedElements = (std::min)(StorageIndex(m_allocatedElements*1.5),m_size);
       Index allocSize = m_allocatedElements * sizeof(ListEl);
-      allocSize = allocSize/sizeof(Scalar) + (allocSize%sizeof(Scalar)>0?1:0);
+      allocSize = (allocSize + sizeof(Scalar) - 1)/sizeof(Scalar);
       Scalar* newBuffer = new Scalar[allocSize];
       memcpy(newBuffer,  m_buffer,  copyElements * sizeof(ListEl));
       delete[] m_buffer;
@@ -99,30 +103,30 @@ class AmbiVector
     // element type of the linked list
     struct ListEl
     {
-      Index next;
-      Index index;
+      StorageIndex next;
+      StorageIndex index;
       Scalar value;
     };
 
     // used to store data in both mode
     Scalar* m_buffer;
     Scalar m_zero;
-    Index m_size;
-    Index m_start;
-    Index m_end;
-    Index m_allocatedSize;
-    Index m_allocatedElements;
-    Index m_mode;
+    StorageIndex m_size;
+    StorageIndex m_start;
+    StorageIndex m_end;
+    StorageIndex m_allocatedSize;
+    StorageIndex m_allocatedElements;
+    StorageIndex m_mode;
 
     // linked list mode
-    Index m_llStart;
-    Index m_llCurrent;
-    Index m_llSize;
+    StorageIndex m_llStart;
+    StorageIndex m_llCurrent;
+    StorageIndex m_llSize;
 };
 
 /** \returns the number of non zeros in the current sub vector */
-template<typename _Scalar,typename _Index>
-_Index AmbiVector<_Scalar,_Index>::nonZeros() const
+template<typename _Scalar,typename _StorageIndex>
+Index AmbiVector<_Scalar,_StorageIndex>::nonZeros() const
 {
   if (m_mode==IsSparse)
     return m_llSize;
@@ -130,8 +134,8 @@ _Index AmbiVector<_Scalar,_Index>::nonZeros() const
     return m_end - m_start;
 }
 
-template<typename _Scalar,typename _Index>
-void AmbiVector<_Scalar,_Index>::init(double estimatedDensity)
+template<typename _Scalar,typename _StorageIndex>
+void AmbiVector<_Scalar,_StorageIndex>::init(double estimatedDensity)
 {
   if (estimatedDensity>0.1)
     init(IsDense);
@@ -139,8 +143,8 @@ void AmbiVector<_Scalar,_Index>::init(double estimatedDensity)
     init(IsSparse);
 }
 
-template<typename _Scalar,typename _Index>
-void AmbiVector<_Scalar,_Index>::init(int mode)
+template<typename _Scalar,typename _StorageIndex>
+void AmbiVector<_Scalar,_StorageIndex>::init(int mode)
 {
   m_mode = mode;
   if (m_mode==IsSparse)
@@ -155,15 +159,15 @@ void AmbiVector<_Scalar,_Index>::init(int mode)
   *
   * Don't worry, this function is extremely cheap.
   */
-template<typename _Scalar,typename _Index>
-void AmbiVector<_Scalar,_Index>::restart()
+template<typename _Scalar,typename _StorageIndex>
+void AmbiVector<_Scalar,_StorageIndex>::restart()
 {
   m_llCurrent = m_llStart;
 }
 
 /** Set all coefficients of current subvector to zero */
-template<typename _Scalar,typename _Index>
-void AmbiVector<_Scalar,_Index>::setZero()
+template<typename _Scalar,typename _StorageIndex>
+void AmbiVector<_Scalar,_StorageIndex>::setZero()
 {
   if (m_mode==IsDense)
   {
@@ -178,8 +182,8 @@ void AmbiVector<_Scalar,_Index>::setZero()
   }
 }
 
-template<typename _Scalar,typename _Index>
-_Scalar& AmbiVector<_Scalar,_Index>::coeffRef(_Index i)
+template<typename _Scalar,typename _StorageIndex>
+_Scalar& AmbiVector<_Scalar,_StorageIndex>::coeffRef(Index i)
 {
   if (m_mode==IsDense)
     return m_buffer[i];
@@ -195,7 +199,7 @@ _Scalar& AmbiVector<_Scalar,_Index>::coeffRef(_Index i)
       m_llCurrent = 0;
       ++m_llSize;
       llElements[0].value = Scalar(0);
-      llElements[0].index = i;
+      llElements[0].index = convert_index(i);
       llElements[0].next = -1;
       return llElements[0].value;
     }
@@ -204,7 +208,7 @@ _Scalar& AmbiVector<_Scalar,_Index>::coeffRef(_Index i)
       // this is going to be the new first element of the list
       ListEl& el = llElements[m_llSize];
       el.value = Scalar(0);
-      el.index = i;
+      el.index = convert_index(i);
       el.next = m_llStart;
       m_llStart = m_llSize;
       ++m_llSize;
@@ -213,7 +217,7 @@ _Scalar& AmbiVector<_Scalar,_Index>::coeffRef(_Index i)
     }
     else
     {
-      Index nextel = llElements[m_llCurrent].next;
+      StorageIndex nextel = llElements[m_llCurrent].next;
       eigen_assert(i>=llElements[m_llCurrent].index && "you must call restart() before inserting an element with lower or equal index");
       while (nextel >= 0 && llElements[nextel].index<=i)
       {
@@ -237,7 +241,7 @@ _Scalar& AmbiVector<_Scalar,_Index>::coeffRef(_Index i)
         // let's insert a new coefficient
         ListEl& el = llElements[m_llSize];
         el.value = Scalar(0);
-        el.index = i;
+        el.index = convert_index(i);
         el.next = llElements[m_llCurrent].next;
         llElements[m_llCurrent].next = m_llSize;
         ++m_llSize;
@@ -247,8 +251,8 @@ _Scalar& AmbiVector<_Scalar,_Index>::coeffRef(_Index i)
   }
 }
 
-template<typename _Scalar,typename _Index>
-_Scalar& AmbiVector<_Scalar,_Index>::coeff(_Index i)
+template<typename _Scalar,typename _StorageIndex>
+_Scalar& AmbiVector<_Scalar,_StorageIndex>::coeff(Index i)
 {
   if (m_mode==IsDense)
     return m_buffer[i];
@@ -275,8 +279,8 @@ _Scalar& AmbiVector<_Scalar,_Index>::coeff(_Index i)
 }
 
 /** Iterator over the nonzero coefficients */
-template<typename _Scalar,typename _Index>
-class AmbiVector<_Scalar,_Index>::Iterator
+template<typename _Scalar,typename _StorageIndex>
+class AmbiVector<_Scalar,_StorageIndex>::Iterator
 {
   public:
     typedef _Scalar Scalar;
@@ -288,7 +292,7 @@ class AmbiVector<_Scalar,_Index>::Iterator
       * In practice, all coefficients having a magnitude smaller than \a epsilon
       * are skipped.
       */
-    Iterator(const AmbiVector& vec, const RealScalar& epsilon = 0)
+    explicit Iterator(const AmbiVector& vec, const RealScalar& epsilon = 0)
       : m_vector(vec)
     {
       using std::abs;
@@ -320,7 +324,7 @@ class AmbiVector<_Scalar,_Index>::Iterator
       }
     }
 
-    Index index() const { return m_cachedIndex; }
+    StorageIndex index() const { return m_cachedIndex; }
     Scalar value() const { return m_cachedValue; }
 
     operator bool() const { return m_cachedIndex>=0; }
@@ -359,9 +363,9 @@ class AmbiVector<_Scalar,_Index>::Iterator
 
   protected:
     const AmbiVector& m_vector; // the target vector
-    Index m_currentEl;            // the current element in sparse/linked-list mode
+    StorageIndex m_currentEl;            // the current element in sparse/linked-list mode
     RealScalar m_epsilon;       // epsilon used to prune zero coefficients
-    Index m_cachedIndex;          // current coordinate
+    StorageIndex m_cachedIndex;          // current coordinate
     Scalar m_cachedValue;       // current value
     bool m_isDense;             // mode of the vector
 };
diff --git a/nuparu/include/Eigen/src/SparseCore/CompressedStorage.h b/nuparu/include/Eigen/src/SparseCore/CompressedStorage.h
index 3321fab4..2199848e 100644
--- a/nuparu/include/Eigen/src/SparseCore/CompressedStorage.h
+++ b/nuparu/include/Eigen/src/SparseCore/CompressedStorage.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -18,13 +18,13 @@ namespace internal {
   * Stores a sparse set of values as a list of values and a list of indices.
   *
   */
-template<typename _Scalar,typename _Index>
+template<typename _Scalar,typename _StorageIndex>
 class CompressedStorage
 {
   public:
 
     typedef _Scalar Scalar;
-    typedef _Index Index;
+    typedef _StorageIndex StorageIndex;
 
   protected:
 
@@ -36,7 +36,7 @@ class CompressedStorage
       : m_values(0), m_indices(0), m_size(0), m_allocatedSize(0)
     {}
 
-    CompressedStorage(size_t size)
+    explicit CompressedStorage(Index size)
       : m_values(0), m_indices(0), m_size(0), m_allocatedSize(0)
     {
       resize(size);
@@ -51,8 +51,11 @@ class CompressedStorage
     CompressedStorage& operator=(const CompressedStorage& other)
     {
       resize(other.size());
-      memcpy(m_values, other.m_values, m_size * sizeof(Scalar));
-      memcpy(m_indices, other.m_indices, m_size * sizeof(Index));
+      if(other.size()>0)
+      {
+        internal::smart_copy(other.m_values,  other.m_values  + m_size, m_values);
+        internal::smart_copy(other.m_indices, other.m_indices + m_size, m_indices);
+      }
       return *this;
     }
 
@@ -70,9 +73,9 @@ class CompressedStorage
       delete[] m_indices;
     }
 
-    void reserve(size_t size)
+    void reserve(Index size)
     {
-      size_t newAllocatedSize = m_size + size;
+      Index newAllocatedSize = m_size + size;
       if (newAllocatedSize > m_allocatedSize)
         reallocate(newAllocatedSize);
     }
@@ -83,39 +86,35 @@ class CompressedStorage
         reallocate(m_size);
     }
 
-    void resize(size_t size, float reserveSizeFactor = 0)
+    void resize(Index size, double reserveSizeFactor = 0)
     {
       if (m_allocatedSize<size)
-        reallocate(size + size_t(reserveSizeFactor*size));
+      {
+        Index realloc_size = (std::min<Index>)(NumTraits<StorageIndex>::highest(),  size + Index(reserveSizeFactor*double(size)));
+        if(realloc_size<size)
+          internal::throw_std_bad_alloc();
+        reallocate(realloc_size);
+      }
       m_size = size;
     }
 
     void append(const Scalar& v, Index i)
     {
-      Index id = static_cast<Index>(m_size);
+      Index id = m_size;
       resize(m_size+1, 1);
       m_values[id] = v;
-      m_indices[id] = i;
+      m_indices[id] = internal::convert_index<StorageIndex>(i);
     }
 
-    inline size_t size() const { return m_size; }
-    inline size_t allocatedSize() const { return m_allocatedSize; }
+    inline Index size() const { return m_size; }
+    inline Index allocatedSize() const { return m_allocatedSize; }
     inline void clear() { m_size = 0; }
 
-    inline Scalar& value(size_t i) { return m_values[i]; }
-    inline const Scalar& value(size_t i) const { return m_values[i]; }
+    inline Scalar& value(Index i) { return m_values[i]; }
+    inline const Scalar& value(Index i) const { return m_values[i]; }
 
-    inline Index& index(size_t i) { return m_indices[i]; }
-    inline const Index& index(size_t i) const { return m_indices[i]; }
-
-    static CompressedStorage Map(Index* indices, Scalar* values, size_t size)
-    {
-      CompressedStorage res;
-      res.m_indices = indices;
-      res.m_values = values;
-      res.m_allocatedSize = res.m_size = size;
-      return res;
-    }
+    inline StorageIndex& index(Index i) { return m_indices[i]; }
+    inline const StorageIndex& index(Index i) const { return m_indices[i]; }
 
     /** \returns the largest \c k such that for all \c j in [0,k) index[\c j]\<\a key */
     inline Index searchLowerIndex(Index key) const
@@ -124,17 +123,17 @@ class CompressedStorage
     }
 
     /** \returns the largest \c k in [start,end) such that for all \c j in [start,k) index[\c j]\<\a key */
-    inline Index searchLowerIndex(size_t start, size_t end, Index key) const
+    inline Index searchLowerIndex(Index start, Index end, Index key) const
     {
       while(end>start)
       {
-        size_t mid = (end+start)>>1;
+        Index mid = (end+start)>>1;
         if (m_indices[mid]<key)
           start = mid+1;
         else
           end = mid;
       }
-      return static_cast<Index>(start);
+      return start;
     }
 
     /** \returns the stored value at index \a key
@@ -147,20 +146,20 @@ class CompressedStorage
         return m_values[m_size-1];
       // ^^  optimization: let's first check if it is the last coefficient
       // (very common in high level algorithms)
-      const size_t id = searchLowerIndex(0,m_size-1,key);
+      const Index id = searchLowerIndex(0,m_size-1,key);
       return ((id<m_size) && (m_indices[id]==key)) ? m_values[id] : defaultValue;
     }
 
     /** Like at(), but the search is performed in the range [start,end) */
-    inline Scalar atInRange(size_t start, size_t end, Index key, const Scalar& defaultValue = Scalar(0)) const
+    inline Scalar atInRange(Index start, Index end, Index key, const Scalar &defaultValue = Scalar(0)) const
     {
       if (start>=end)
-        return Scalar(0);
+        return defaultValue;
       else if (end>start && key==m_indices[end-1])
         return m_values[end-1];
       // ^^  optimization: let's first check if it is the last coefficient
       // (very common in high level algorithms)
-      const size_t id = searchLowerIndex(start,end-1,key);
+      const Index id = searchLowerIndex(start,end-1,key);
       return ((id<end) && (m_indices[id]==key)) ? m_values[id] : defaultValue;
     }
 
@@ -169,16 +168,35 @@ class CompressedStorage
       * such that the keys are sorted. */
     inline Scalar& atWithInsertion(Index key, const Scalar& defaultValue = Scalar(0))
     {
-      size_t id = searchLowerIndex(0,m_size,key);
+      Index id = searchLowerIndex(0,m_size,key);
       if (id>=m_size || m_indices[id]!=key)
       {
-        resize(m_size+1,1);
-        for (size_t j=m_size-1; j>id; --j)
+        if (m_allocatedSize<m_size+1)
+        {
+          m_allocatedSize = 2*(m_size+1);
+          internal::scoped_array<Scalar> newValues(m_allocatedSize);
+          internal::scoped_array<StorageIndex> newIndices(m_allocatedSize);
+
+          // copy first chunk
+          internal::smart_copy(m_values,  m_values +id, newValues.ptr());
+          internal::smart_copy(m_indices, m_indices+id, newIndices.ptr());
+
+          // copy the rest
+          if(m_size>id)
+          {
+            internal::smart_copy(m_values +id,  m_values +m_size, newValues.ptr() +id+1);
+            internal::smart_copy(m_indices+id,  m_indices+m_size, newIndices.ptr()+id+1);
+          }
+          std::swap(m_values,newValues.ptr());
+          std::swap(m_indices,newIndices.ptr());
+        }
+        else if(m_size>id)
         {
-          m_indices[j] = m_indices[j-1];
-          m_values[j] = m_values[j-1];
+          internal::smart_memmove(m_values +id, m_values +m_size, m_values +id+1);
+          internal::smart_memmove(m_indices+id, m_indices+m_size, m_indices+id+1);
         }
-        m_indices[id] = key;
+        m_size++;
+        m_indices[id] = internal::convert_index<StorageIndex>(key);
         m_values[id] = defaultValue;
       }
       return m_values[id];
@@ -186,9 +204,9 @@ class CompressedStorage
 
     void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision())
     {
-      size_t k = 0;
-      size_t n = size();
-      for (size_t i=0; i<n; ++i)
+      Index k = 0;
+      Index n = size();
+      for (Index i=0; i<n; ++i)
       {
         if (!internal::isMuchSmallerThan(value(i), reference, epsilon))
         {
@@ -202,27 +220,29 @@ class CompressedStorage
 
   protected:
 
-    inline void reallocate(size_t size)
-    {
-      Scalar* newValues  = new Scalar[size];
-      Index* newIndices = new Index[size];
-      size_t copySize = (std::min)(size, m_size);
-      // copy
-      internal::smart_copy(m_values, m_values+copySize, newValues);
-      internal::smart_copy(m_indices, m_indices+copySize, newIndices);
-      // delete old stuff
-      delete[] m_values;
-      delete[] m_indices;
-      m_values = newValues;
-      m_indices = newIndices;
+    inline void reallocate(Index size)
+    {
+      #ifdef EIGEN_SPARSE_COMPRESSED_STORAGE_REALLOCATE_PLUGIN
+        EIGEN_SPARSE_COMPRESSED_STORAGE_REALLOCATE_PLUGIN
+      #endif
+      eigen_internal_assert(size!=m_allocatedSize);
+      internal::scoped_array<Scalar> newValues(size);
+      internal::scoped_array<StorageIndex> newIndices(size);
+      Index copySize = (std::min)(size, m_size);
+      if (copySize>0) {
+        internal::smart_copy(m_values, m_values+copySize, newValues.ptr());
+        internal::smart_copy(m_indices, m_indices+copySize, newIndices.ptr());
+      }
+      std::swap(m_values,newValues.ptr());
+      std::swap(m_indices,newIndices.ptr());
       m_allocatedSize = size;
     }
 
   protected:
     Scalar* m_values;
-    Index* m_indices;
-    size_t m_size;
-    size_t m_allocatedSize;
+    StorageIndex* m_indices;
+    Index m_size;
+    Index m_allocatedSize;
 
 };
 
diff --git a/nuparu/include/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h b/nuparu/include/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
index 4b13f08d..0f683584 100644
--- a/nuparu/include/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
+++ b/nuparu/include/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -15,27 +15,31 @@ namespace Eigen {
 namespace internal {
 
 template<typename Lhs, typename Rhs, typename ResultType>
-static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res, bool sortedInsertion = false)
 {
   typedef typename remove_all<Lhs>::type::Scalar Scalar;
-  typedef typename remove_all<Lhs>::type::Index Index;
 
   // make sure to call innerSize/outerSize since we fake the storage order.
   Index rows = lhs.innerSize();
   Index cols = rhs.outerSize();
   eigen_assert(lhs.outerSize() == rhs.innerSize());
-
-  std::vector<bool> mask(rows,false);
-  Matrix<Scalar,Dynamic,1> values(rows);
-  Matrix<Index,Dynamic,1>  indices(rows);
-
+  
+  ei_declare_aligned_stack_constructed_variable(bool,   mask,     rows, 0);
+  ei_declare_aligned_stack_constructed_variable(Scalar, values,   rows, 0);
+  ei_declare_aligned_stack_constructed_variable(Index,  indices,  rows, 0);
+  
+  std::memset(mask,0,sizeof(bool)*rows);
+
+  evaluator<Lhs> lhsEval(lhs);
+  evaluator<Rhs> rhsEval(rhs);
+  
   // estimate the number of non zero entries
   // given a rhs column containing Y non zeros, we assume that the respective Y columns
   // of the lhs differs in average of one non zeros, thus the number of non zeros for
   // the product of a rhs column with the lhs is X+Y where X is the average number of non zero
   // per column of the lhs.
   // Therefore, we have nnz(lhs*rhs) = nnz(lhs) + nnz(rhs)
-  Index estimated_nnz_prod = lhs.nonZeros() + rhs.nonZeros();
+  Index estimated_nnz_prod = lhsEval.nonZerosEstimate() + rhsEval.nonZerosEstimate();
 
   res.setZero();
   res.reserve(Index(estimated_nnz_prod));
@@ -45,11 +49,11 @@ static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& r
 
     res.startVec(j);
     Index nnz = 0;
-    for (typename Rhs::InnerIterator rhsIt(rhs, j); rhsIt; ++rhsIt)
+    for (typename evaluator<Rhs>::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt)
     {
       Scalar y = rhsIt.value();
       Index k = rhsIt.index();
-      for (typename Lhs::InnerIterator lhsIt(lhs, k); lhsIt; ++lhsIt)
+      for (typename evaluator<Lhs>::InnerIterator lhsIt(lhsEval, k); lhsIt; ++lhsIt)
       {
         Index i = lhsIt.index();
         Scalar x = lhsIt.value();
@@ -64,53 +68,51 @@ static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& r
           values[i] += x * y;
       }
     }
-
-    // unordered insertion
-    for(int k=0; k<nnz; ++k)
-    {
-      int i = indices[k];
-      res.insertBackByOuterInnerUnordered(j,i) = values[i];
-      mask[i] = false;
-    }
-
-#if 0
-    // alternative ordered insertion code:
-
-    int t200 = rows/(log2(200)*1.39);
-    int t = (rows*100)/139;
-
-    // FIXME reserve nnz non zeros
-    // FIXME implement fast sort algorithms for very small nnz
-    // if the result is sparse enough => use a quick sort
-    // otherwise => loop through the entire vector
-    // In order to avoid to perform an expensive log2 when the
-    // result is clearly very sparse we use a linear bound up to 200.
-    //if((nnz<200 && nnz<t200) || nnz * log2(nnz) < t)
-    //res.startVec(j);
-    if(true)
+    if(!sortedInsertion)
     {
-      if(nnz>1) std::sort(indices.data(),indices.data()+nnz);
-      for(int k=0; k<nnz; ++k)
+      // unordered insertion
+      for(Index k=0; k<nnz; ++k)
       {
-        int i = indices[k];
-        res.insertBackByOuterInner(j,i) = values[i];
+        Index i = indices[k];
+        res.insertBackByOuterInnerUnordered(j,i) = values[i];
         mask[i] = false;
       }
     }
     else
     {
-      // dense path
-      for(int i=0; i<rows; ++i)
+      // alternative ordered insertion code:
+      const Index t200 = rows/11; // 11 == (log2(200)*1.39)
+      const Index t = (rows*100)/139;
+
+      // FIXME reserve nnz non zeros
+      // FIXME implement faster sorting algorithms for very small nnz
+      // if the result is sparse enough => use a quick sort
+      // otherwise => loop through the entire vector
+      // To avoid performing an expensive log2 when the
+      // result is clearly very sparse, we use a linear bound up to 200.
+      if((nnz<200 && nnz<t200) || nnz * numext::log2(int(nnz)) < t)
       {
-        if(mask[i])
+        if(nnz>1) std::sort(indices,indices+nnz);
+        for(Index k=0; k<nnz; ++k)
         {
-          mask[i] = false;
+          Index i = indices[k];
           res.insertBackByOuterInner(j,i) = values[i];
+          mask[i] = false;
+        }
+      }
+      else
+      {
+        // dense path
+        for(Index i=0; i<rows; ++i)
+        {
+          if(mask[i])
+          {
+            mask[i] = false;
+            res.insertBackByOuterInner(j,i) = values[i];
+          }
         }
       }
     }
-#endif
-
   }
   res.finalize();
 }
@@ -134,13 +136,28 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,C
 
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,RowMajor> RowMajorMatrix;
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor> ColMajorMatrix;
-    ColMajorMatrix resCol(lhs.rows(),rhs.cols());
-    internal::conservative_sparse_sparse_product_impl<Lhs,Rhs,ColMajorMatrix>(lhs, rhs, resCol);
-    // sort the non zeros:
-    RowMajorMatrix resRow(resCol);
-    res = resRow;
+    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorMatrix;
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrixAux;
+    typedef typename sparse_eval<ColMajorMatrixAux,ResultType::RowsAtCompileTime,ResultType::ColsAtCompileTime,ColMajorMatrixAux::Flags>::type ColMajorMatrix;
+    
+    // If the result is tall and thin (in the extreme case a column vector)
+    // then it is faster to sort the coefficients inplace instead of transposing twice.
+    // FIXME, the following heuristic is probably not very good.
+    if(lhs.rows()>=rhs.cols())
+    {
+      ColMajorMatrix resCol(lhs.rows(),rhs.cols());
+      // perform sorted insertion
+      internal::conservative_sparse_sparse_product_impl<Lhs,Rhs,ColMajorMatrix>(lhs, rhs, resCol, true);
+      res = resCol.markAsRValue();
+    }
+    else
+    {
+      ColMajorMatrixAux resCol(lhs.rows(),rhs.cols());
+      // resort to transpose to sort the entries
+      internal::conservative_sparse_sparse_product_impl<Lhs,Rhs,ColMajorMatrixAux>(lhs, rhs, resCol, false);
+      RowMajorMatrix resRow(resCol);
+      res = resRow.markAsRValue();
+    }
   }
 };
 
@@ -149,7 +166,7 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,C
 {
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-     typedef SparseMatrix<typename ResultType::Scalar,RowMajor> RowMajorMatrix;
+     typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorMatrix;
      RowMajorMatrix rhsRow = rhs;
      RowMajorMatrix resRow(lhs.rows(), rhs.cols());
      internal::conservative_sparse_sparse_product_impl<RowMajorMatrix,Lhs,RowMajorMatrix>(rhsRow, lhs, resRow);
@@ -162,7 +179,7 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,R
 {
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,RowMajor> RowMajorMatrix;
+    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorMatrix;
     RowMajorMatrix lhsRow = lhs;
     RowMajorMatrix resRow(lhs.rows(), rhs.cols());
     internal::conservative_sparse_sparse_product_impl<Rhs,RowMajorMatrix,RowMajorMatrix>(rhs, lhsRow, resRow);
@@ -175,7 +192,7 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,R
 {
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,RowMajor> RowMajorMatrix;
+    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorMatrix;
     RowMajorMatrix resRow(lhs.rows(), rhs.cols());
     internal::conservative_sparse_sparse_product_impl<Rhs,Lhs,RowMajorMatrix>(rhs, lhs, resRow);
     res = resRow;
@@ -190,7 +207,7 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,C
 
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor> ColMajorMatrix;
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix;
     ColMajorMatrix resCol(lhs.rows(), rhs.cols());
     internal::conservative_sparse_sparse_product_impl<Lhs,Rhs,ColMajorMatrix>(lhs, rhs, resCol);
     res = resCol;
@@ -202,7 +219,7 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,C
 {
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor> ColMajorMatrix;
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix;
     ColMajorMatrix lhsCol = lhs;
     ColMajorMatrix resCol(lhs.rows(), rhs.cols());
     internal::conservative_sparse_sparse_product_impl<ColMajorMatrix,Rhs,ColMajorMatrix>(lhsCol, rhs, resCol);
@@ -215,7 +232,7 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,R
 {
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor> ColMajorMatrix;
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix;
     ColMajorMatrix rhsCol = rhs;
     ColMajorMatrix resCol(lhs.rows(), rhs.cols());
     internal::conservative_sparse_sparse_product_impl<Lhs,ColMajorMatrix,ColMajorMatrix>(lhs, rhsCol, resCol);
@@ -228,8 +245,8 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,R
 {
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,RowMajor> RowMajorMatrix;
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor> ColMajorMatrix;
+    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::StorageIndex> RowMajorMatrix;
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix;
     RowMajorMatrix resRow(lhs.rows(),rhs.cols());
     internal::conservative_sparse_sparse_product_impl<Rhs,Lhs,RowMajorMatrix>(rhs, lhs, resRow);
     // sort the non zeros:
@@ -238,6 +255,89 @@ struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,R
   }
 };
 
+} // end namespace internal
+
+
+namespace internal {
+
+template<typename Lhs, typename Rhs, typename ResultType>
+static void sparse_sparse_to_dense_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+{ // Adds the products of matching stored entries of lhs and rhs into res; res is not cleared here.
+  typedef typename remove_all<Lhs>::type::Scalar Scalar;
+  Index cols = rhs.outerSize(); // number of outer vectors of rhs to traverse
+  eigen_assert(lhs.outerSize() == rhs.innerSize()); // rhs inner indices are used as lhs outer indices below
+
+  evaluator<Lhs> lhsEval(lhs);
+  evaluator<Rhs> rhsEval(rhs);
+
+  for (Index j=0; j<cols; ++j)
+  {
+    for (typename evaluator<Rhs>::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt)
+    {
+      Scalar y = rhsIt.value();
+      Index k = rhsIt.index(); // inner index of the rhs entry; selects the outer vector of lhs to walk
+      for (typename evaluator<Lhs>::InnerIterator lhsIt(lhsEval, k); lhsIt; ++lhsIt)
+      {
+        Index i = lhsIt.index();
+        Scalar x = lhsIt.value();
+        res.coeffRef(i,j) += x * y; // accumulation: the caller decides the initial content of res
+      }
+    }
+  }
+}
+
+
+} // end namespace internal
+
+namespace internal {
+
+template<typename Lhs, typename Rhs, typename ResultType,
+  int LhsStorageOrder = (traits<Lhs>::Flags&RowMajorBit) ? RowMajor : ColMajor, // deduced from the RowMajorBit flag of Lhs
+  int RhsStorageOrder = (traits<Rhs>::Flags&RowMajorBit) ? RowMajor : ColMajor> // deduced from the RowMajorBit flag of Rhs
+struct sparse_sparse_to_dense_product_selector; // primary template: declaration only, specialized per storage-order pair
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_to_dense_product_selector<Lhs,Rhs,ResultType,ColMajor,ColMajor>
+{ // Both operands are column-major: no conversion is performed.
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+  {
+    internal::sparse_sparse_to_dense_product_impl<Lhs,Rhs,ResultType>(lhs, rhs, res); // forward the operands unchanged
+  }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_to_dense_product_selector<Lhs,Rhs,ResultType,RowMajor,ColMajor>
+{ // Row-major lhs, column-major rhs: lhs is first copied into a column-major matrix.
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+  {
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix;
+    ColMajorMatrix lhsCol(lhs); // column-major copy of lhs (scalar and index types taken from ResultType)
+    internal::sparse_sparse_to_dense_product_impl<ColMajorMatrix,Rhs,ResultType>(lhsCol, rhs, res);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_to_dense_product_selector<Lhs,Rhs,ResultType,ColMajor,RowMajor>
+{ // Column-major lhs, row-major rhs: rhs is first copied into a column-major matrix.
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+  {
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> ColMajorMatrix;
+    ColMajorMatrix rhsCol(rhs); // column-major copy of rhs (scalar and index types taken from ResultType)
+    internal::sparse_sparse_to_dense_product_impl<Lhs,ColMajorMatrix,ResultType>(lhs, rhsCol, res);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_to_dense_product_selector<Lhs,Rhs,ResultType,RowMajor,RowMajor>
+{ // Both operands are row-major: no copy is made; operands are swapped and res is written through a Transpose wrapper.
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
+  {
+    Transpose<ResultType> trRes(res); // transposed wrapper around res; writes through it land in res
+    internal::sparse_sparse_to_dense_product_impl<Rhs,Lhs,Transpose<ResultType> >(rhs, lhs, trRes); // note the (rhs, lhs) order
+  }
+};
+
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/SparseCore/MappedSparseMatrix.h b/nuparu/include/Eigen/src/SparseCore/MappedSparseMatrix.h
index 93cd4832..67718c85 100644
--- a/nuparu/include/Eigen/src/SparseCore/MappedSparseMatrix.h
+++ b/nuparu/include/Eigen/src/SparseCore/MappedSparseMatrix.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,9 +10,10 @@
 #ifndef EIGEN_MAPPED_SPARSEMATRIX_H
 #define EIGEN_MAPPED_SPARSEMATRIX_H
 
-namespace Eigen { 
+namespace Eigen {
 
-/** \class MappedSparseMatrix
+/** \deprecated Use Map<SparseMatrix<> >
+  * \class MappedSparseMatrix
   *
   * \brief Sparse matrix
   *
@@ -22,158 +23,45 @@ namespace Eigen {
   *
   */
 namespace internal {
-template<typename _Scalar, int _Flags, typename _Index>
-struct traits<MappedSparseMatrix<_Scalar, _Flags, _Index> > : traits<SparseMatrix<_Scalar, _Flags, _Index> >
+template<typename _Scalar, int _Flags, typename _StorageIndex>
+struct traits<MappedSparseMatrix<_Scalar, _Flags, _StorageIndex> > : traits<SparseMatrix<_Scalar, _Flags, _StorageIndex> >
 {};
-}
+} // end namespace internal
 
-template<typename _Scalar, int _Flags, typename _Index>
+template<typename _Scalar, int _Flags, typename _StorageIndex>
 class MappedSparseMatrix
-  : public SparseMatrixBase<MappedSparseMatrix<_Scalar, _Flags, _Index> >
+  : public Map<SparseMatrix<_Scalar, _Flags, _StorageIndex> >
 {
-  public:
-    EIGEN_SPARSE_PUBLIC_INTERFACE(MappedSparseMatrix)
-    enum { IsRowMajor = Base::IsRowMajor };
-
-  protected:
-
-    Index   m_outerSize;
-    Index   m_innerSize;
-    Index   m_nnz;
-    Index*  m_outerIndex;
-    Index*  m_innerIndices;
-    Scalar* m_values;
+    typedef Map<SparseMatrix<_Scalar, _Flags, _StorageIndex> > Base;
 
   public:
+    
+    typedef typename Base::StorageIndex StorageIndex;
+    typedef typename Base::Scalar Scalar;
 
-    inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; }
-    inline Index cols() const { return IsRowMajor ? m_innerSize : m_outerSize; }
-    inline Index innerSize() const { return m_innerSize; }
-    inline Index outerSize() const { return m_outerSize; }
-
-    //----------------------------------------
-    // direct access interface
-    inline const Scalar* valuePtr() const { return m_values; }
-    inline Scalar* valuePtr() { return m_values; }
-
-    inline const Index* innerIndexPtr() const { return m_innerIndices; }
-    inline Index* innerIndexPtr() { return m_innerIndices; }
-
-    inline const Index* outerIndexPtr() const { return m_outerIndex; }
-    inline Index* outerIndexPtr() { return m_outerIndex; }
-    //----------------------------------------
-
-    inline Scalar coeff(Index row, Index col) const
-    {
-      const Index outer = IsRowMajor ? row : col;
-      const Index inner = IsRowMajor ? col : row;
-
-      Index start = m_outerIndex[outer];
-      Index end = m_outerIndex[outer+1];
-      if (start==end)
-        return Scalar(0);
-      else if (end>0 && inner==m_innerIndices[end-1])
-        return m_values[end-1];
-      // ^^  optimization: let's first check if it is the last coefficient
-      // (very common in high level algorithms)
-
-      const Index* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end-1],inner);
-      const Index id = r-&m_innerIndices[0];
-      return ((*r==inner) && (id<end)) ? m_values[id] : Scalar(0);
-    }
-
-    inline Scalar& coeffRef(Index row, Index col)
-    {
-      const Index outer = IsRowMajor ? row : col;
-      const Index inner = IsRowMajor ? col : row;
-
-      Index start = m_outerIndex[outer];
-      Index end = m_outerIndex[outer+1];
-      eigen_assert(end>=start && "you probably called coeffRef on a non finalized matrix");
-      eigen_assert(end>start && "coeffRef cannot be called on a zero coefficient");
-      Index* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end],inner);
-      const Index id = r-&m_innerIndices[0];
-      eigen_assert((*r==inner) && (id<end) && "coeffRef cannot be called on a zero coefficient");
-      return m_values[id];
-    }
-
-    class InnerIterator;
-    class ReverseInnerIterator;
-
-    /** \returns the number of non zero coefficients */
-    inline Index nonZeros() const  { return m_nnz; }
-
-    inline MappedSparseMatrix(Index rows, Index cols, Index nnz, Index* outerIndexPtr, Index* innerIndexPtr, Scalar* valuePtr)
-      : m_outerSize(IsRowMajor?rows:cols), m_innerSize(IsRowMajor?cols:rows), m_nnz(nnz), m_outerIndex(outerIndexPtr),
-        m_innerIndices(innerIndexPtr), m_values(valuePtr)
+    inline MappedSparseMatrix(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, StorageIndex* innerIndexPtr, Scalar* valuePtr, StorageIndex* innerNonZeroPtr = 0)
+      : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZeroPtr)
     {}
 
     /** Empty destructor */
     inline ~MappedSparseMatrix() {}
 };
 
-template<typename Scalar, int _Flags, typename _Index>
-class MappedSparseMatrix<Scalar,_Flags,_Index>::InnerIterator
-{
-  public:
-    InnerIterator(const MappedSparseMatrix& mat, Index outer)
-      : m_matrix(mat),
-        m_outer(outer),
-        m_id(mat.outerIndexPtr()[outer]),
-        m_start(m_id),
-        m_end(mat.outerIndexPtr()[outer+1])
-    {}
-
-    inline InnerIterator& operator++() { m_id++; return *this; }
-
-    inline Scalar value() const { return m_matrix.valuePtr()[m_id]; }
-    inline Scalar& valueRef() { return const_cast<Scalar&>(m_matrix.valuePtr()[m_id]); }
-
-    inline Index index() const { return m_matrix.innerIndexPtr()[m_id]; }
-    inline Index row() const { return IsRowMajor ? m_outer : index(); }
-    inline Index col() const { return IsRowMajor ? index() : m_outer; }
-
-    inline operator bool() const { return (m_id < m_end) && (m_id>=m_start); }
-
-  protected:
-    const MappedSparseMatrix& m_matrix;
-    const Index m_outer;
-    Index m_id;
-    const Index m_start;
-    const Index m_end;
-};
+namespace internal {
 
-template<typename Scalar, int _Flags, typename _Index>
-class MappedSparseMatrix<Scalar,_Flags,_Index>::ReverseInnerIterator
+template<typename _Scalar, int _Options, typename _StorageIndex>
+struct evaluator<MappedSparseMatrix<_Scalar,_Options,_StorageIndex> >
+  : evaluator<SparseCompressedBase<MappedSparseMatrix<_Scalar,_Options,_StorageIndex> > >
 {
-  public:
-    ReverseInnerIterator(const MappedSparseMatrix& mat, Index outer)
-      : m_matrix(mat),
-        m_outer(outer),
-        m_id(mat.outerIndexPtr()[outer+1]),
-        m_start(mat.outerIndexPtr()[outer]),
-        m_end(m_id)
-    {}
-
-    inline ReverseInnerIterator& operator--() { m_id--; return *this; }
-
-    inline Scalar value() const { return m_matrix.valuePtr()[m_id-1]; }
-    inline Scalar& valueRef() { return const_cast<Scalar&>(m_matrix.valuePtr()[m_id-1]); }
-
-    inline Index index() const { return m_matrix.innerIndexPtr()[m_id-1]; }
-    inline Index row() const { return IsRowMajor ? m_outer : index(); }
-    inline Index col() const { return IsRowMajor ? index() : m_outer; }
-
-    inline operator bool() const { return (m_id <= m_end) && (m_id>m_start); }
-
-  protected:
-    const MappedSparseMatrix& m_matrix;
-    const Index m_outer;
-    Index m_id;
-    const Index m_start;
-    const Index m_end;
+  typedef MappedSparseMatrix<_Scalar,_Options,_StorageIndex> XprType;
+  typedef evaluator<SparseCompressedBase<XprType> > Base;
+  
+  evaluator() : Base() {}
+  explicit evaluator(const XprType &mat) : Base(mat) {}
 };
 
+}
+
 } // end namespace Eigen
 
 #endif // EIGEN_MAPPED_SPARSEMATRIX_H
diff --git a/nuparu/include/Eigen/src/SparseCore/SparseAssign.h b/nuparu/include/Eigen/src/SparseCore/SparseAssign.h
new file mode 100644
index 00000000..4a8dd12e
--- /dev/null
+++ b/nuparu/include/Eigen/src/SparseCore/SparseAssign.h
@@ -0,0 +1,205 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSEASSIGN_H
+#define EIGEN_SPARSEASSIGN_H
+
+namespace Eigen { 
+
+template<typename Derived>    
+template<typename OtherDerived>
+Derived& SparseMatrixBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
+{ // Assignment from an EigenBase expression; returns the derived object.
+  internal::call_assignment_no_alias(derived(), other.derived()); // uses the no-alias assignment entry point
+  return derived();
+}
+
+template<typename Derived>
+template<typename OtherDerived>
+Derived& SparseMatrixBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
+{ // Assignment from a ReturnByValue object; returns the derived object.
+  // TODO use the evaluator mechanism
+  other.evalTo(derived()); // the source evaluates itself directly into this object
+  return derived();
+}
+
+template<typename Derived>
+template<typename OtherDerived>
+inline Derived& SparseMatrixBase<Derived>::operator=(const SparseMatrixBase<OtherDerived>& other)
+{ // Assignment from another sparse expression; returns the derived object.
+  // by default sparse evaluations do not alias, so we can safely bypass the generic call_assignment routine
+  internal::Assignment<Derived,OtherDerived,internal::assign_op<Scalar> >
+          ::run(derived(), other.derived(), internal::assign_op<Scalar>()); // invoke the Assignment specialization directly
+  return derived();
+}
+
+template<typename Derived>
+inline Derived& SparseMatrixBase<Derived>::operator=(const Derived& other)
+{ // Assignment from an object of the same derived type; returns the derived object.
+  internal::call_assignment_no_alias(derived(), other.derived()); // uses the no-alias assignment entry point
+  return derived();
+}
+
+namespace internal {
+
+template<>
+struct storage_kind_to_evaluator_kind<Sparse> {
+  typedef IteratorBased Kind; // Sparse storage maps to the IteratorBased evaluator kind
+};
+
+template<>
+struct storage_kind_to_shape<Sparse> {
+  typedef SparseShape Shape; // Sparse storage maps to SparseShape
+};
+
+struct Sparse2Sparse {}; // tag type selecting the sparse-source, sparse-destination Assignment specialization
+struct Sparse2Dense  {}; // tag type selecting the sparse-source, dense-destination Assignment specialization
+
+template<> struct AssignmentKind<SparseShape, SparseShape>           { typedef Sparse2Sparse Kind; };
+template<> struct AssignmentKind<SparseShape, SparseTriangularShape> { typedef Sparse2Sparse Kind; };
+template<> struct AssignmentKind<DenseShape,  SparseShape>           { typedef Sparse2Dense  Kind; };
+template<> struct AssignmentKind<DenseShape,  SparseTriangularShape> { typedef Sparse2Dense  Kind; };
+
+
+template<typename DstXprType, typename SrcXprType>
+void assign_sparse_to_sparse(DstXprType &dst, const SrcXprType &src)
+{ // Copies src into dst by visiting the stored entries of src, one outer vector at a time.
+  typedef typename DstXprType::Scalar Scalar;
+  typedef internal::evaluator<DstXprType> DstEvaluatorType;
+  typedef internal::evaluator<SrcXprType> SrcEvaluatorType;
+
+  SrcEvaluatorType srcEvaluator(src);
+
+  const bool transpose = (DstEvaluatorType::Flags & RowMajorBit) != (SrcEvaluatorType::Flags & RowMajorBit); // true when dst and src storage orders differ
+  const Index outerEvaluationSize = (SrcEvaluatorType::Flags&RowMajorBit) ? src.rows() : src.cols(); // outer size of src: rows if row-major, else cols
+  if ((!transpose) && src.isRValue())
+  {
+    // same storage order and src is flagged as an rvalue: fill dst directly, without a temporary
+    dst.resize(src.rows(), src.cols());
+    dst.setZero();
+    dst.reserve((std::max)(src.rows(),src.cols())*2); // initial capacity guess: 2 * max(rows, cols)
+    for (Index j=0; j<outerEvaluationSize; ++j)
+    {
+      dst.startVec(j);
+      for (typename SrcEvaluatorType::InnerIterator it(srcEvaluator, j); it; ++it)
+      {
+        Scalar v = it.value();
+        dst.insertBackByOuterInner(j,it.index()) = v; // entries are appended in the iteration order of src
+      }
+    }
+    dst.finalize();
+  }
+  else
+  {
+    // otherwise build the result in a temporary of the destination type, then assign it to dst
+    eigen_assert(( ((internal::traits<DstXprType>::SupportedAccessPatterns & OuterRandomAccessPattern)==OuterRandomAccessPattern) ||
+              (!((DstEvaluatorType::Flags & RowMajorBit) != (SrcEvaluatorType::Flags & RowMajorBit)))) &&
+              "the transpose operation is supposed to be handled in SparseMatrix::operator=");
+
+    enum { Flip = (DstEvaluatorType::Flags & RowMajorBit) != (SrcEvaluatorType::Flags & RowMajorBit) }; // same condition as 'transpose', as a compile-time constant
+
+    
+    DstXprType temp(src.rows(), src.cols());
+
+    temp.reserve((std::max)(src.rows(),src.cols())*2); // initial capacity guess: 2 * max(rows, cols)
+    for (Index j=0; j<outerEvaluationSize; ++j)
+    {
+      temp.startVec(j);
+      for (typename SrcEvaluatorType::InnerIterator it(srcEvaluator, j); it; ++it)
+      {
+        Scalar v = it.value();
+        temp.insertBackByOuterInner(Flip?it.index():j,Flip?j:it.index()) = v; // outer/inner arguments are swapped when Flip is set
+      }
+    }
+    temp.finalize();
+
+    dst = temp.markAsRValue(); // temp is marked as an rvalue before being assigned to dst
+  }
+}
+
+// Generic Sparse to Sparse assignment: forwards to assign_sparse_to_sparse()
+template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
+struct Assignment<DstXprType, SrcXprType, Functor, Sparse2Sparse, Scalar>
+{
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/) // only assign_op is accepted; the functor object is unused
+  {
+    assign_sparse_to_sparse(dst.derived(), src.derived());
+  }
+};
+
+// Generic Sparse to Dense assignment: applies func to dst for every stored entry of src
+template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
+struct Assignment<DstXprType, SrcXprType, Functor, Sparse2Dense, Scalar>
+{
+  static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
+  {
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); // dst is not resized here; sizes must already match
+
+    if(internal::is_same<Functor,internal::assign_op<Scalar> >::value) // plain assignment: the loop below only touches stored entries of src
+      dst.setZero();
+    
+    internal::evaluator<SrcXprType> srcEval(src);
+    internal::evaluator<DstXprType> dstEval(dst);
+    const Index outerEvaluationSize = (internal::evaluator<SrcXprType>::Flags&RowMajorBit) ? src.rows() : src.cols(); // outer size of src: rows if row-major, else cols
+    for (Index j=0; j<outerEvaluationSize; ++j)
+      for (typename internal::evaluator<SrcXprType>::InnerIterator i(srcEval,j); i; ++i)
+        func.assignCoeff(dstEval.coeffRef(i.row(),i.col()), i.value()); // combine the src entry into the matching dst coefficient
+  }
+};
+
+// Specialization for "dst = dec.solve(rhs)"
+// NOTE we need to specialize it for Sparse2Sparse to avoid ambiguous specialization error
+template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
+struct Assignment<DstXprType, Solve<DecType,RhsType>, internal::assign_op<Scalar>, Sparse2Sparse, Scalar>
+{
+  typedef Solve<DecType,RhsType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &) // the functor argument is unused
+  {
+    src.dec()._solve_impl(src.rhs(), dst); // the decomposition object writes its result directly into dst
+  }
+};
+
+struct Diagonal2Sparse {}; // tag type selecting the diagonal-source, sparse-destination Assignment specialization
+
+template<> struct AssignmentKind<SparseShape,DiagonalShape> { typedef Diagonal2Sparse Kind; };
+
+template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
+struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Sparse, Scalar>
+{
+  typedef typename DstXprType::StorageIndex StorageIndex;
+  typedef Array<StorageIndex,Dynamic,1> ArrayXI;
+  typedef Array<Scalar,Dynamic,1> ArrayXS;
+  template<int Options>
+  static void run(SparseMatrix<Scalar,Options,StorageIndex> &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/)
+  { // SparseMatrix destination: the index and value arrays of dst are rewritten directly
+    Index size = src.diagonal().size();
+    dst.makeCompressed(); // NOTE(review): dst is never resized in this overload; presumably it must already be size x size - confirm
+    dst.resizeNonZeros(size); // one stored entry per diagonal coefficient
+    Map<ArrayXI>(dst.innerIndexPtr(), size).setLinSpaced(0,StorageIndex(size)-1); // inner indices 0,1,...,size-1
+    Map<ArrayXI>(dst.outerIndexPtr(), size+1).setLinSpaced(0,StorageIndex(size)); // outer starts 0,1,...,size
+    Map<ArrayXS>(dst.valuePtr(), size) = src.diagonal(); // stored values are the diagonal coefficients of src
+  }
+  
+  template<typename DstDerived>
+  static void run(SparseMatrixBase<DstDerived> &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/)
+  { // any other sparse destination: only the diagonal of dst is written
+    dst.diagonal() = src.diagonal();
+  }
+  
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar> &/*func*/)
+  { dst.diagonal() += src.diagonal(); }
+  
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar> &/*func*/)
+  { dst.diagonal() -= src.diagonal(); }
+};
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSEASSIGN_H
diff --git a/nuparu/include/Eigen/src/SparseCore/SparseBlock.h b/nuparu/include/Eigen/src/SparseCore/SparseBlock.h
index 0b3e193d..b574d13c 100644
--- a/nuparu/include/Eigen/src/SparseCore/SparseBlock.h
+++ b/nuparu/include/Eigen/src/SparseCore/SparseBlock.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -12,6 +12,7 @@
 
 namespace Eigen { 
 
+// Subset of columns or rows
 template<typename XprType, int BlockRows, int BlockCols>
 class BlockImpl<XprType,BlockRows,BlockCols,true,Sparse>
   : public SparseMatrixBase<Block<XprType,BlockRows,BlockCols,true> >
@@ -22,100 +23,89 @@ class BlockImpl<XprType,BlockRows,BlockCols,true,Sparse>
     enum { IsRowMajor = internal::traits<BlockType>::IsRowMajor };
 protected:
     enum { OuterSize = IsRowMajor ? BlockRows : BlockCols };
+    typedef SparseMatrixBase<BlockType> Base;
+    using Base::convert_index;
 public:
     EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType)
-    
-    class InnerIterator: public XprType::InnerIterator
-    {
-        typedef typename BlockImpl::Index Index;
-      public:
-        inline InnerIterator(const BlockType& xpr, Index outer)
-          : XprType::InnerIterator(xpr.m_matrix, xpr.m_outerStart + outer), m_outer(outer)
-        {}
-        inline Index row() const { return IsRowMajor ? m_outer : this->index(); }
-        inline Index col() const { return IsRowMajor ? this->index() : m_outer; }
-      protected:
-        Index m_outer;
-    };
-    class ReverseInnerIterator: public XprType::ReverseInnerIterator
-    {
-        typedef typename BlockImpl::Index Index;
-      public:
-        inline ReverseInnerIterator(const BlockType& xpr, Index outer)
-          : XprType::ReverseInnerIterator(xpr.m_matrix, xpr.m_outerStart + outer), m_outer(outer)
-        {}
-        inline Index row() const { return IsRowMajor ? m_outer : this->index(); }
-        inline Index col() const { return IsRowMajor ? this->index() : m_outer; }
-      protected:
-        Index m_outer;
-    };
 
-    inline BlockImpl(const XprType& xpr, int i)
-      : m_matrix(xpr), m_outerStart(i), m_outerSize(OuterSize)
+    inline BlockImpl(const XprType& xpr, Index i)
+      : m_matrix(xpr), m_outerStart(convert_index(i)), m_outerSize(OuterSize)
     {}
 
-    inline BlockImpl(const XprType& xpr, int startRow, int startCol, int blockRows, int blockCols)
-      : m_matrix(xpr), m_outerStart(IsRowMajor ? startRow : startCol), m_outerSize(IsRowMajor ? blockRows : blockCols)
+    inline BlockImpl(const XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
+      : m_matrix(xpr), m_outerStart(convert_index(IsRowMajor ? startRow : startCol)), m_outerSize(convert_index(IsRowMajor ? blockRows : blockCols))
     {}
 
     EIGEN_STRONG_INLINE Index rows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); }
     EIGEN_STRONG_INLINE Index cols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); }
+    
+    Index nonZeros() const
+    {
+      typedef internal::evaluator<XprType> EvaluatorType;
+      EvaluatorType matEval(m_matrix);
+      Index nnz = 0;
+      Index end = m_outerStart + m_outerSize.value();
+      for(Index j=m_outerStart; j<end; ++j)
+        for(typename EvaluatorType::InnerIterator it(matEval, j); it; ++it)
+          ++nnz;
+      return nnz;
+    }
+    
+    inline const Scalar coeff(Index row, Index col) const
+    {
+      return m_matrix.coeff(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 :  m_outerStart));
+    }
+    
+    inline const Scalar coeff(Index index) const
+    {
+      return m_matrix.coeff(IsRowMajor ? m_outerStart : index, IsRowMajor ? index :  m_outerStart);
+    }
+    
+    inline const _MatrixTypeNested& nestedExpression() const { return m_matrix; }
+    Index startRow() const { return IsRowMajor ? m_outerStart : 0; }
+    Index startCol() const { return IsRowMajor ? 0 : m_outerStart; }
+    Index blockRows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); }
+    Index blockCols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); }
 
   protected:
 
     typename XprType::Nested m_matrix;
     Index m_outerStart;
     const internal::variable_if_dynamic<Index, OuterSize> m_outerSize;
+  
+  public:
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
 };
 
 
 /***************************************************************************
-* specialisation for SparseMatrix
+* specialization for SparseMatrix
 ***************************************************************************/
 
-template<typename _Scalar, int _Options, typename _Index, int BlockRows, int BlockCols>
-class BlockImpl<SparseMatrix<_Scalar, _Options, _Index>,BlockRows,BlockCols,true,Sparse>
-  : public SparseMatrixBase<Block<SparseMatrix<_Scalar, _Options, _Index>,BlockRows,BlockCols,true> >
+namespace internal {
+
+template<typename SparseMatrixType, int BlockRows, int BlockCols>
+class sparse_matrix_block_impl
+  : public SparseCompressedBase<Block<SparseMatrixType,BlockRows,BlockCols,true> >
 {
-    typedef SparseMatrix<_Scalar, _Options, _Index> SparseMatrixType;
     typedef typename internal::remove_all<typename SparseMatrixType::Nested>::type _MatrixTypeNested;
     typedef Block<SparseMatrixType, BlockRows, BlockCols, true> BlockType;
+    typedef SparseCompressedBase<Block<SparseMatrixType,BlockRows,BlockCols,true> > Base;
+    using Base::convert_index;
 public:
     enum { IsRowMajor = internal::traits<BlockType>::IsRowMajor };
     EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType)
 protected:
+    typedef typename Base::IndexVector IndexVector;
     enum { OuterSize = IsRowMajor ? BlockRows : BlockCols };
 public:
-    
-    class InnerIterator: public SparseMatrixType::InnerIterator
-    {
-      public:
-        inline InnerIterator(const BlockType& xpr, Index outer)
-          : SparseMatrixType::InnerIterator(xpr.m_matrix, xpr.m_outerStart + outer), m_outer(outer)
-        {}
-        inline Index row() const { return IsRowMajor ? m_outer : this->index(); }
-        inline Index col() const { return IsRowMajor ? this->index() : m_outer; }
-      protected:
-        Index m_outer;
-    };
-    class ReverseInnerIterator: public SparseMatrixType::ReverseInnerIterator
-    {
-      public:
-        inline ReverseInnerIterator(const BlockType& xpr, Index outer)
-          : SparseMatrixType::ReverseInnerIterator(xpr.m_matrix, xpr.m_outerStart + outer), m_outer(outer)
-        {}
-        inline Index row() const { return IsRowMajor ? m_outer : this->index(); }
-        inline Index col() const { return IsRowMajor ? this->index() : m_outer; }
-      protected:
-        Index m_outer;
-    };
 
-    inline BlockImpl(const SparseMatrixType& xpr, int i)
-      : m_matrix(xpr), m_outerStart(i), m_outerSize(OuterSize)
+    inline sparse_matrix_block_impl(const SparseMatrixType& xpr, Index i)
+      : m_matrix(xpr), m_outerStart(convert_index(i)), m_outerSize(OuterSize)
     {}
 
-    inline BlockImpl(const SparseMatrixType& xpr, int startRow, int startCol, int blockRows, int blockCols)
-      : m_matrix(xpr), m_outerStart(IsRowMajor ? startRow : startCol), m_outerSize(IsRowMajor ? blockRows : blockCols)
+    inline sparse_matrix_block_impl(const SparseMatrixType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
+      : m_matrix(xpr), m_outerStart(convert_index(IsRowMajor ? startRow : startCol)), m_outerSize(convert_index(IsRowMajor ? blockRows : blockCols))
     {}
 
     template<typename OtherDerived>
@@ -123,16 +113,17 @@ class BlockImpl<SparseMatrix<_Scalar, _Options, _Index>,BlockRows,BlockCols,true
     {
       typedef typename internal::remove_all<typename SparseMatrixType::Nested>::type _NestedMatrixType;
       _NestedMatrixType& matrix = const_cast<_NestedMatrixType&>(m_matrix);;
-      // This assignement is slow if this vector set is not empty
+      // This assignment is slow if this vector set is not empty
       // and/or it is not at the end of the nonzeros of the underlying matrix.
 
       // 1 - eval to a temporary to avoid transposition and/or aliasing issues
-      SparseMatrix<Scalar, IsRowMajor ? RowMajor : ColMajor, Index> tmp(other);
+      Ref<const SparseMatrix<Scalar, IsRowMajor ? RowMajor : ColMajor, StorageIndex> > tmp(other.derived());
+      eigen_internal_assert(tmp.outerSize()==m_outerSize.value());
 
       // 2 - let's check whether there is enough allocated memory
       Index nnz           = tmp.nonZeros();
       Index start         = m_outerStart==0 ? 0 : matrix.outerIndexPtr()[m_outerStart]; // starting position of the current block
-      Index end           = m_matrix.outerIndexPtr()[m_outerStart+m_outerSize.value()]; // ending posiiton of the current block
+      Index end           = m_matrix.outerIndexPtr()[m_outerStart+m_outerSize.value()]; // ending position of the current block
       Index block_size    = end - start;                                                // available room in the current block
       Index tail_size     = m_matrix.outerIndexPtr()[m_matrix.outerSize()] - end;
       
@@ -140,52 +131,71 @@ class BlockImpl<SparseMatrix<_Scalar, _Options, _Index>,BlockRows,BlockCols,true
                           ? Index(matrix.data().allocatedSize()) + block_size
                           : block_size;
 
+      bool update_trailing_pointers = false;
       if(nnz>free_size) 
       {
         // realloc manually to reduce copies
         typename SparseMatrixType::Storage newdata(m_matrix.data().allocatedSize() - block_size + nnz);
 
-        std::memcpy(&newdata.value(0), &m_matrix.data().value(0), start*sizeof(Scalar));
-        std::memcpy(&newdata.index(0), &m_matrix.data().index(0), start*sizeof(Index));
+        internal::smart_copy(&m_matrix.data().value(0),  &m_matrix.data().value(0) + start, &newdata.value(0));
+        internal::smart_copy(&m_matrix.data().index(0),  &m_matrix.data().index(0) + start, &newdata.index(0));
 
-        std::memcpy(&newdata.value(start), &tmp.data().value(0), nnz*sizeof(Scalar));
-        std::memcpy(&newdata.index(start), &tmp.data().index(0), nnz*sizeof(Index));
+        internal::smart_copy(tmp.valuePtr(), tmp.valuePtr() + nnz, &newdata.value(start));
+        internal::smart_copy(tmp.innerIndexPtr(), tmp.innerIndexPtr() + nnz, &newdata.index(start));
 
-        std::memcpy(&newdata.value(start+nnz), &matrix.data().value(end), tail_size*sizeof(Scalar));
-        std::memcpy(&newdata.index(start+nnz), &matrix.data().index(end), tail_size*sizeof(Index));
+        internal::smart_copy(&matrix.data().value(end),  &matrix.data().value(end) + tail_size, &newdata.value(start+nnz));
+        internal::smart_copy(&matrix.data().index(end),  &matrix.data().index(end) + tail_size, &newdata.index(start+nnz));
         
         newdata.resize(m_matrix.outerIndexPtr()[m_matrix.outerSize()] - block_size + nnz);
 
         matrix.data().swap(newdata);
+
+        update_trailing_pointers = true;
       }
       else
       {
-        // no need to realloc, simply copy the tail at its respective position and insert tmp
-        matrix.data().resize(start + nnz + tail_size);
+        if(m_matrix.isCompressed())
+        {
+          // no need to realloc, simply copy the tail at its respective position and insert tmp
+          matrix.data().resize(start + nnz + tail_size);
 
-        std::memmove(&matrix.data().value(start+nnz), &matrix.data().value(end), tail_size*sizeof(Scalar));
-        std::memmove(&matrix.data().index(start+nnz), &matrix.data().index(end), tail_size*sizeof(Index));
+          internal::smart_memmove(&matrix.data().value(end),  &matrix.data().value(end) + tail_size, &matrix.data().value(start + nnz));
+          internal::smart_memmove(&matrix.data().index(end),  &matrix.data().index(end) + tail_size, &matrix.data().index(start + nnz));
 
-        std::memcpy(&matrix.data().value(start), &tmp.data().value(0), nnz*sizeof(Scalar));
-        std::memcpy(&matrix.data().index(start), &tmp.data().index(0), nnz*sizeof(Index));
+          update_trailing_pointers = true;
+        }
+
+        internal::smart_copy(tmp.valuePtr(),  tmp.valuePtr() + nnz, &matrix.data().value(start));
+        internal::smart_copy(tmp.innerIndexPtr(),  tmp.innerIndexPtr() + nnz, &matrix.data().index(start));
       }
-      
-      // update innerNonZeros
-      if(!m_matrix.isCompressed())
-        for(Index j=0; j<m_outerSize.value(); ++j)
-          matrix.innerNonZeroPtr()[m_outerStart+j] = tmp.innerVector(j).nonZeros();
-
-      // update outer index pointers
-      Index p = start;
-      for(Index k=0; k<m_outerSize.value(); ++k)
+
+      // update outer index pointers and innerNonZeros
+      if(IsVectorAtCompileTime)
+      {
+        if(!m_matrix.isCompressed())
+          matrix.innerNonZeroPtr()[m_outerStart] = StorageIndex(nnz);
+        matrix.outerIndexPtr()[m_outerStart] = StorageIndex(start);
+      }
+      else
       {
-        matrix.outerIndexPtr()[m_outerStart+k] = p;
-        p += tmp.innerVector(k).nonZeros();
+        StorageIndex p = StorageIndex(start);
+        for(Index k=0; k<m_outerSize.value(); ++k)
+        {
+          Index nnz_k = tmp.innerVector(k).nonZeros();
+          if(!m_matrix.isCompressed())
+            matrix.innerNonZeroPtr()[m_outerStart+k] = StorageIndex(nnz_k);
+          matrix.outerIndexPtr()[m_outerStart+k] = p;
+          p += nnz_k;
+        }
       }
-      std::ptrdiff_t offset = nnz - block_size;
-      for(Index k = m_outerStart + m_outerSize.value(); k<=matrix.outerSize(); ++k)
+
+      if(update_trailing_pointers)
       {
-        matrix.outerIndexPtr()[k] += offset;
+        StorageIndex offset = internal::convert_index<StorageIndex>(nnz - block_size);
+        for(Index k = m_outerStart + m_outerSize.value(); k<=matrix.outerSize(); ++k)
+        {
+          matrix.outerIndexPtr()[k] += offset;
+        }
       }
 
       return derived();
@@ -197,35 +207,46 @@ class BlockImpl<SparseMatrix<_Scalar, _Options, _Index>,BlockRows,BlockCols,true
     }
 
     inline const Scalar* valuePtr() const
-    { return m_matrix.valuePtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
+    { return m_matrix.valuePtr(); }
     inline Scalar* valuePtr()
-    { return m_matrix.const_cast_derived().valuePtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
+    { return m_matrix.const_cast_derived().valuePtr(); }
 
-    inline const Index* innerIndexPtr() const
-    { return m_matrix.innerIndexPtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
-    inline Index* innerIndexPtr()
-    { return m_matrix.const_cast_derived().innerIndexPtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
+    inline const StorageIndex* innerIndexPtr() const
+    { return m_matrix.innerIndexPtr(); }
+    inline StorageIndex* innerIndexPtr()
+    { return m_matrix.const_cast_derived().innerIndexPtr(); }
 
-    inline const Index* outerIndexPtr() const
+    inline const StorageIndex* outerIndexPtr() const
     { return m_matrix.outerIndexPtr() + m_outerStart; }
-    inline Index* outerIndexPtr()
+    inline StorageIndex* outerIndexPtr()
     { return m_matrix.const_cast_derived().outerIndexPtr() + m_outerStart; }
-
-    Index nonZeros() const
+    
+    inline const StorageIndex* innerNonZeroPtr() const // null when isCompressed(); otherwise the matrix's innerNonZero array advanced by m_outerStart
+    { return isCompressed() ? 0 : (m_matrix.innerNonZeroPtr()+m_outerStart); }
+    inline StorageIndex* innerNonZeroPtr() // writable overload: same result, reached through const_cast_derived()
+    { return isCompressed() ? 0 : (m_matrix.const_cast_derived().innerNonZeroPtr()+m_outerStart); }
+    
+    bool isCompressed() const { return m_matrix.innerNonZeroPtr()==0; } // true exactly when the underlying matrix exposes no innerNonZero array
+    
+    inline Scalar& coeffRef(Index row, Index col)
     {
-      if(m_matrix.isCompressed())
-        return  std::size_t(m_matrix.outerIndexPtr()[m_outerStart+m_outerSize.value()])
-              - std::size_t(m_matrix.outerIndexPtr()[m_outerStart]);
-      else if(m_outerSize.value()==0)
-        return 0;
-      else
-        return Map<const Matrix<Index,OuterSize,1> >(m_matrix.innerNonZeroPtr()+m_outerStart, m_outerSize.value()).sum();
+      return m_matrix.const_cast_derived().coeffRef(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 :  m_outerStart));
+    }
+    
+    inline const Scalar coeff(Index row, Index col) const
+    {
+      return m_matrix.coeff(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 :  m_outerStart));
+    }
+    
+    inline const Scalar coeff(Index index) const
+    {
+      return m_matrix.coeff(IsRowMajor ? m_outerStart : index, IsRowMajor ? index :  m_outerStart);
     }
 
     const Scalar& lastCoeff() const
     {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(BlockImpl);
-      eigen_assert(nonZeros()>0);
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(sparse_matrix_block_impl);
+      eigen_assert(Base::nonZeros()>0);
       if(m_matrix.isCompressed())
         return m_matrix.valuePtr()[m_matrix.outerIndexPtr()[m_outerStart+1]-1];
       else
@@ -234,6 +255,12 @@ class BlockImpl<SparseMatrix<_Scalar, _Options, _Index>,BlockRows,BlockCols,true
 
     EIGEN_STRONG_INLINE Index rows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); }
     EIGEN_STRONG_INLINE Index cols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); }
+    
+    inline const _MatrixTypeNested& nestedExpression() const { return m_matrix; } // the matrix this block refers to
+    Index startRow() const { return IsRowMajor ? m_outerStart : 0; } // the block offset applies to rows only when IsRowMajor
+    Index startCol() const { return IsRowMajor ? 0 : m_outerStart; } // ... and to columns otherwise
+    Index blockRows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); } // same expression as rows()
+    Index blockCols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); } // same expression as cols()
 
   protected:
 
@@ -243,6 +270,49 @@ class BlockImpl<SparseMatrix<_Scalar, _Options, _Index>,BlockRows,BlockCols,true
 
 };
 
+} // namespace internal
+
+template<typename _Scalar, int _Options, typename _StorageIndex, int BlockRows, int BlockCols>
+class BlockImpl<SparseMatrix<_Scalar, _Options, _StorageIndex>,BlockRows,BlockCols,true,Sparse>
+  : public internal::sparse_matrix_block_impl<SparseMatrix<_Scalar, _Options, _StorageIndex>,BlockRows,BlockCols>
+{ // Specialization for InnerPanel==true blocks of a non-const SparseMatrix; it only adds typedefs and forwarding constructors to the base class.
+public:
+  typedef _StorageIndex StorageIndex;
+  typedef SparseMatrix<_Scalar, _Options, _StorageIndex> SparseMatrixType;
+  typedef internal::sparse_matrix_block_impl<SparseMatrixType,BlockRows,BlockCols> Base;
+  inline BlockImpl(SparseMatrixType& xpr, Index i) // single-index constructor, forwarded unchanged to Base
+    : Base(xpr, i)
+  {}
+  // Constructor taking start row/column and block extents, forwarded unchanged to Base.
+  inline BlockImpl(SparseMatrixType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
+    : Base(xpr, startRow, startCol, blockRows, blockCols)
+  {}
+  // Make the base class assignment operators visible in this class.
+  using Base::operator=;
+};
+
+template<typename _Scalar, int _Options, typename _StorageIndex, int BlockRows, int BlockCols>
+class BlockImpl<const SparseMatrix<_Scalar, _Options, _StorageIndex>,BlockRows,BlockCols,true,Sparse>
+  : public internal::sparse_matrix_block_impl<const SparseMatrix<_Scalar, _Options, _StorageIndex>,BlockRows,BlockCols>
+{ // Specialization for InnerPanel==true blocks of a const SparseMatrix; it only adds typedefs and forwarding constructors to the base class.
+public:
+  typedef _StorageIndex StorageIndex;
+  typedef const SparseMatrix<_Scalar, _Options, _StorageIndex> SparseMatrixType;
+  typedef internal::sparse_matrix_block_impl<SparseMatrixType,BlockRows,BlockCols> Base;
+  inline BlockImpl(SparseMatrixType& xpr, Index i) // single-index constructor, forwarded unchanged to Base
+    : Base(xpr, i)
+  {}
+  // Constructor taking start row/column and block extents, forwarded unchanged to Base.
+  inline BlockImpl(SparseMatrixType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
+    : Base(xpr, startRow, startCol, blockRows, blockCols)
+  {}
+  // Make the base class assignment operators visible in this class.
+  using Base::operator=;
+private: // declared here without a body: presumably meant to reject construction from a generic SparseMatrixBase expression (confirm against upstream)
+  template<typename Derived> BlockImpl(const SparseMatrixBase<Derived>& xpr, Index i);
+  template<typename Derived> BlockImpl(const SparseMatrixBase<Derived>& xpr);
+};
+
 //----------
 
 /** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
@@ -263,7 +333,8 @@ const typename SparseMatrixBase<Derived>::ConstInnerVectorReturnType SparseMatri
   * is col-major (resp. row-major).
   */
 template<typename Derived>
-Block<Derived,Dynamic,Dynamic,true> SparseMatrixBase<Derived>::innerVectors(Index outerStart, Index outerSize)
+typename SparseMatrixBase<Derived>::InnerVectorsReturnType
+SparseMatrixBase<Derived>::innerVectors(Index outerStart, Index outerSize)
 {
   return Block<Derived,Dynamic,Dynamic,true>(derived(),
                                              IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
@@ -275,7 +346,8 @@ Block<Derived,Dynamic,Dynamic,true> SparseMatrixBase<Derived>::innerVectors(Inde
   * is col-major (resp. row-major). Read-only.
   */
 template<typename Derived>
-const Block<const Derived,Dynamic,Dynamic,true> SparseMatrixBase<Derived>::innerVectors(Index outerStart, Index outerSize) const
+const typename SparseMatrixBase<Derived>::ConstInnerVectorsReturnType
+SparseMatrixBase<Derived>::innerVectors(Index outerStart, Index outerSize) const
 {
   return Block<const Derived,Dynamic,Dynamic,true>(derived(),
                                                   IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
@@ -290,50 +362,53 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
 class BlockImpl<XprType,BlockRows,BlockCols,InnerPanel,Sparse>
   : public SparseMatrixBase<Block<XprType,BlockRows,BlockCols,InnerPanel> >, internal::no_assignment_operator
 {
-  typedef typename internal::remove_all<typename XprType::Nested>::type _MatrixTypeNested;
-  typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
+    typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
+    typedef SparseMatrixBase<BlockType> Base;
+    using Base::convert_index;
 public:
     enum { IsRowMajor = internal::traits<BlockType>::IsRowMajor };
     EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType)
+    
+    typedef typename internal::remove_all<typename XprType::Nested>::type _MatrixTypeNested;
 
     /** Column or Row constructor
       */
-    inline BlockImpl(const XprType& xpr, int i)
+    inline BlockImpl(const XprType& xpr, Index i)
       : m_matrix(xpr),
-        m_startRow( (BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? i : 0),
-        m_startCol( (BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) ? i : 0),
-        m_blockRows(xpr.rows()),
-        m_blockCols(xpr.cols())
+        m_startRow( (BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? convert_index(i) : 0),
+        m_startCol( (BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) ? convert_index(i) : 0),
+        m_blockRows(BlockRows==1 ? 1 : xpr.rows()),
+        m_blockCols(BlockCols==1 ? 1 : xpr.cols())
     {}
 
     /** Dynamic-size constructor
       */
-    inline BlockImpl(const XprType& xpr, int startRow, int startCol, int blockRows, int blockCols)
-      : m_matrix(xpr), m_startRow(startRow), m_startCol(startCol), m_blockRows(blockRows), m_blockCols(blockCols)
+    inline BlockImpl(const XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
+      : m_matrix(xpr), m_startRow(convert_index(startRow)), m_startCol(convert_index(startCol)), m_blockRows(convert_index(blockRows)), m_blockCols(convert_index(blockCols))
     {}
 
-    inline int rows() const { return m_blockRows.value(); }
-    inline int cols() const { return m_blockCols.value(); }
+    inline Index rows() const { return m_blockRows.value(); }
+    inline Index cols() const { return m_blockCols.value(); }
 
-    inline Scalar& coeffRef(int row, int col)
+    inline Scalar& coeffRef(Index row, Index col)
     {
       return m_matrix.const_cast_derived()
                .coeffRef(row + m_startRow.value(), col + m_startCol.value());
     }
 
-    inline const Scalar coeff(int row, int col) const
+    inline const Scalar coeff(Index row, Index col) const
     {
       return m_matrix.coeff(row + m_startRow.value(), col + m_startCol.value());
     }
 
-    inline Scalar& coeffRef(int index)
+    inline Scalar& coeffRef(Index index)
     {
       return m_matrix.const_cast_derived()
              .coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
                        m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
     }
 
-    inline const Scalar coeff(int index) const
+    inline const Scalar coeff(Index index) const
     {
       return m_matrix
              .coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
@@ -341,56 +416,19 @@ class BlockImpl<XprType,BlockRows,BlockCols,InnerPanel,Sparse>
     }
     
     inline const _MatrixTypeNested& nestedExpression() const { return m_matrix; }
+    Index startRow() const { return m_startRow.value(); }
+    Index startCol() const { return m_startCol.value(); }
+    Index blockRows() const { return m_blockRows.value(); }
+    Index blockCols() const { return m_blockCols.value(); }
     
-    class InnerIterator : public _MatrixTypeNested::InnerIterator
-    {
-      typedef typename _MatrixTypeNested::InnerIterator Base;
-      const BlockType& m_block;
-      Index m_end;
-    public:
-
-      EIGEN_STRONG_INLINE InnerIterator(const BlockType& block, Index outer)
-        : Base(block.derived().nestedExpression(), outer + (IsRowMajor ? block.m_startRow.value() : block.m_startCol.value())),
-          m_block(block),
-          m_end(IsRowMajor ? block.m_startCol.value()+block.m_blockCols.value() : block.m_startRow.value()+block.m_blockRows.value())
-      {
-        while( (Base::operator bool()) && (Base::index() < (IsRowMajor ? m_block.m_startCol.value() : m_block.m_startRow.value())) )
-          Base::operator++();
-      }
-
-      inline Index index()  const { return Base::index() - (IsRowMajor ? m_block.m_startCol.value() : m_block.m_startRow.value()); }
-      inline Index outer()  const { return Base::outer() - (IsRowMajor ? m_block.m_startRow.value() : m_block.m_startCol.value()); }
-      inline Index row()    const { return Base::row()   - m_block.m_startRow.value(); }
-      inline Index col()    const { return Base::col()   - m_block.m_startCol.value(); }
-      
-      inline operator bool() const { return Base::operator bool() && Base::index() < m_end; }
-    };
-    class ReverseInnerIterator : public _MatrixTypeNested::ReverseInnerIterator
-    {
-      typedef typename _MatrixTypeNested::ReverseInnerIterator Base;
-      const BlockType& m_block;
-      Index m_begin;
-    public:
-
-      EIGEN_STRONG_INLINE ReverseInnerIterator(const BlockType& block, Index outer)
-        : Base(block.derived().nestedExpression(), outer + (IsRowMajor ? block.m_startRow.value() : block.m_startCol.value())),
-          m_block(block),
-          m_begin(IsRowMajor ? block.m_startCol.value() : block.m_startRow.value())
-      {
-        while( (Base::operator bool()) && (Base::index() >= (IsRowMajor ? m_block.m_startCol.value()+block.m_blockCols.value() : m_block.m_startRow.value()+block.m_blockRows.value())) )
-          Base::operator--();
-      }
-
-      inline Index index()  const { return Base::index() - (IsRowMajor ? m_block.m_startCol.value() : m_block.m_startRow.value()); }
-      inline Index outer()  const { return Base::outer() - (IsRowMajor ? m_block.m_startRow.value() : m_block.m_startCol.value()); }
-      inline Index row()    const { return Base::row()   - m_block.m_startRow.value(); }
-      inline Index col()    const { return Base::col()   - m_block.m_startCol.value(); }
-      
-      inline operator bool() const { return Base::operator bool() && Base::index() >= m_begin; }
-    };
   protected:
-    friend class InnerIterator;
+//     friend class internal::GenericSparseBlockInnerIteratorImpl<XprType,BlockRows,BlockCols,InnerPanel>;
     friend class ReverseInnerIterator;
+    friend struct internal::unary_evaluator<Block<XprType,BlockRows,BlockCols,InnerPanel>, internal::IteratorBased, Scalar >;
+    
+    Index nonZeros() const { return Dynamic; }
+    
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
 
     typename XprType::Nested m_matrix;
     const internal::variable_if_dynamic<Index, XprType::RowsAtCompileTime == 1 ? 0 : Dynamic> m_startRow;
@@ -400,6 +438,149 @@ class BlockImpl<XprType,BlockRows,BlockCols,InnerPanel,Sparse>
 
 };
 
+namespace internal {
+
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
+struct unary_evaluator<Block<ArgType,BlockRows,BlockCols,InnerPanel>, IteratorBased >
+ : public evaluator_base<Block<ArgType,BlockRows,BlockCols,InnerPanel> >
+{ // Evaluator for a Block of an iterator-based expression; it wraps the nested expression's evaluator (m_argImpl).
+    class InnerVectorInnerIterator;
+    class OuterVectorInnerIterator;
+  public:
+    typedef Block<ArgType,BlockRows,BlockCols,InnerPanel> XprType;
+    typedef typename XprType::StorageIndex StorageIndex;
+    typedef typename XprType::Scalar Scalar;
+    
+    class ReverseInnerIterator;
+    
+    enum {
+      IsRowMajor = XprType::IsRowMajor,
+      // OuterVector: the block is a single column of a row-major argument, or a single row of a column-major one.
+      OuterVector =  (BlockCols==1 && ArgType::IsRowMajor)
+                    | // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&".
+                      // revert to || as soon as not needed anymore. 
+                     (BlockRows==1 && !ArgType::IsRowMajor),
+      
+      CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+      Flags = XprType::Flags
+    };
+    // The iterator implementation is selected at compile time from OuterVector.
+    typedef typename internal::conditional<OuterVector,OuterVectorInnerIterator,InnerVectorInnerIterator>::type InnerIterator;
+    // Stores a reference to op (m_block), so op must outlive this evaluator.
+    explicit unary_evaluator(const XprType& op)
+      : m_argImpl(op.nestedExpression()), m_block(op)
+    {}
+    // Returns the block's own count when non-negative; else the nested estimate scaled by block size / nested size (0 if the nested expression is empty).
+    inline Index nonZerosEstimate() const {
+      const Index nnz = m_block.nonZeros();
+      if(nnz>=0) return nnz; // the block reported a usable count
+      const Index nestedSize = m_block.nestedExpression().size(); // guard: dividing by a zero size is undefined behavior
+      return nestedSize==0 ? 0 : m_argImpl.nonZerosEstimate() * m_block.size() / nestedSize;
+    }
+
+  protected:
+    typedef typename evaluator<ArgType>::InnerIterator EvalIterator;
+    
+    evaluator<ArgType> m_argImpl;
+    const XprType &m_block;
+};
+
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
+class unary_evaluator<Block<ArgType,BlockRows,BlockCols,InnerPanel>, IteratorBased>::InnerVectorInnerIterator
+ : public EvalIterator
+{ // Walks one inner vector of the nested expression and restricts it to the block's inner index range.
+  const XprType& m_block;
+  Index m_end; // one past the last nested inner index that belongs to the block
+public:
+  // Starts on nested outer vector (outer + block offset), then skips entries located before the block's first inner index.
+  EIGEN_STRONG_INLINE InnerVectorInnerIterator(const unary_evaluator& aEval, Index outer)
+    : EvalIterator(aEval.m_argImpl, outer + (IsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol())),
+      m_block(aEval.m_block),
+      m_end(IsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows())
+  {
+    while( (EvalIterator::operator bool()) && (EvalIterator::index() < (IsRowMajor ? m_block.startCol() : m_block.startRow())) )
+      EvalIterator::operator++();
+  }
+  // Position accessors subtract the block's start offsets from the nested iterator's positions.
+  inline StorageIndex index() const { return EvalIterator::index() - convert_index<StorageIndex>(IsRowMajor ? m_block.startCol() : m_block.startRow()); }
+  inline Index outer()  const { return EvalIterator::outer() - (IsRowMajor ? m_block.startRow() : m_block.startCol()); }
+  inline Index row()    const { return EvalIterator::row()   - m_block.startRow(); }
+  inline Index col()    const { return EvalIterator::col()   - m_block.startCol(); }
+  // True while the nested iterator is valid and its index is still below m_end.
+  inline operator bool() const { return EvalIterator::operator bool() && EvalIterator::index() < m_end; }
+};
+
+template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
+class unary_evaluator<Block<ArgType,BlockRows,BlockCols,InnerPanel>, IteratorBased>::OuterVectorInnerIterator
+{ // Visits the nested outer vectors in [start, m_end) one by one and looks up the fixed inner index m_innerIndex in each of them.
+  const unary_evaluator& m_eval;
+  Index m_outerPos;   // nested outer vector currently visited
+  Index m_innerIndex; // inner index searched for in every visited outer vector
+  Scalar m_value;     // copy of the coefficient found at the current position
+  Index m_end;        // one past the last nested outer vector to visit
+public:
+  // The outer argument is only checked by the assertion below (it must be 0); it does not affect where iteration starts.
+  EIGEN_STRONG_INLINE OuterVectorInnerIterator(const unary_evaluator& aEval, Index outer)
+    : m_eval(aEval),
+      m_outerPos( (IsRowMajor ? aEval.m_block.startCol() : aEval.m_block.startRow()) - 1), // -1 so that operator++ finds the first non-zero entry
+      m_innerIndex(IsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol()),
+      m_value(0),
+      m_end(IsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows())
+  {
+    EIGEN_UNUSED_VARIABLE(outer);
+    eigen_assert(outer==0);
+    // move from the "-1" start position to the first stored entry, if there is one
+    ++(*this);
+  }
+  // index() is relative to the block start; outer() is always 0.
+  inline StorageIndex index() const { return convert_index<StorageIndex>(m_outerPos - (IsRowMajor ? m_eval.m_block.startCol() : m_eval.m_block.startRow())); }
+  inline Index outer()  const { return 0; }
+  inline Index row()    const { return IsRowMajor ? 0 : index(); }
+  inline Index col()    const { return IsRowMajor ? index() : 0; }
+  // Returns the cached coefficient by value.
+  inline Scalar value() const { return m_value; }
+  // Each step builds a fresh nested iterator for the next outer vector and scans it from its beginning.
+  inline OuterVectorInnerIterator& operator++()
+  {
+    // search next non-zero entry
+    while(++m_outerPos<m_end)
+    {
+      EvalIterator it(m_eval.m_argImpl, m_outerPos);
+      // search for the key m_innerIndex in the current outer-vector
+      while(it && it.index() < m_innerIndex) ++it;
+      if(it && it.index()==m_innerIndex)
+      {
+        m_value = it.value();
+        break;
+      }
+    }
+    return *this;
+  }
+  // False once m_outerPos has reached m_end, i.e. no further stored entry was found.
+  inline operator bool() const { return m_outerPos < m_end; }
+};
+
+template<typename _Scalar, int _Options, typename _StorageIndex, int BlockRows, int BlockCols>
+struct unary_evaluator<Block<SparseMatrix<_Scalar, _Options, _StorageIndex>,BlockRows,BlockCols,true>, IteratorBased>
+  : evaluator<SparseCompressedBase<Block<SparseMatrix<_Scalar, _Options, _StorageIndex>,BlockRows,BlockCols,true> > >
+{ // InnerPanel==true blocks of a SparseMatrix derive from the SparseCompressedBase evaluator and add nothing of their own.
+  typedef Block<SparseMatrix<_Scalar, _Options, _StorageIndex>,BlockRows,BlockCols,true> XprType;
+  typedef evaluator<SparseCompressedBase<XprType> > Base;
+  explicit unary_evaluator(const XprType &xpr) : Base(xpr) {} // passes the block expression straight to the base evaluator
+};
+
+template<typename _Scalar, int _Options, typename _StorageIndex, int BlockRows, int BlockCols>
+struct unary_evaluator<Block<const SparseMatrix<_Scalar, _Options, _StorageIndex>,BlockRows,BlockCols,true>, IteratorBased>
+  : evaluator<SparseCompressedBase<Block<const SparseMatrix<_Scalar, _Options, _StorageIndex>,BlockRows,BlockCols,true> > >
+{ // InnerPanel==true blocks of a const SparseMatrix derive from the SparseCompressedBase evaluator and add nothing of their own.
+  typedef Block<const SparseMatrix<_Scalar, _Options, _StorageIndex>,BlockRows,BlockCols,true> XprType;
+  typedef evaluator<SparseCompressedBase<XprType> > Base;
+  explicit unary_evaluator(const XprType &xpr) : Base(xpr) {} // passes the block expression straight to the base evaluator
+};
+
+} // end namespace internal
+
+
 } // end namespace Eigen
 
 #endif // EIGEN_SPARSE_BLOCK_H
diff --git a/nuparu/include/Eigen/src/SparseCore/SparseColEtree.h b/nuparu/include/Eigen/src/SparseCore/SparseColEtree.h
index f89ca381..ebe02d1a 100644
--- a/nuparu/include/Eigen/src/SparseCore/SparseColEtree.h
+++ b/nuparu/include/Eigen/src/SparseCore/SparseColEtree.h
@@ -58,29 +58,29 @@ Index etree_find (Index i, IndexVector& pp)
   * \param perm The permutation to apply to the column of \b mat
   */
 template <typename MatrixType, typename IndexVector>
-int coletree(const MatrixType& mat, IndexVector& parent, IndexVector& firstRowElt, typename MatrixType::Index *perm=0)
+int coletree(const MatrixType& mat, IndexVector& parent, IndexVector& firstRowElt, typename MatrixType::StorageIndex *perm=0)
 {
-  typedef typename MatrixType::Index Index;
-  Index nc = mat.cols(); // Number of columns 
-  Index m = mat.rows();
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  StorageIndex nc = convert_index<StorageIndex>(mat.cols()); // Number of columns
+  StorageIndex m = convert_index<StorageIndex>(mat.rows());
+  StorageIndex diagSize = (std::min)(nc,m);
   IndexVector root(nc); // root of subtree of etree 
   root.setZero();
   IndexVector pp(nc); // disjoint sets 
   pp.setZero(); // Initialize disjoint sets 
   parent.resize(mat.cols());
   //Compute first nonzero column in each row 
-  Index row,col; 
   firstRowElt.resize(m);
   firstRowElt.setConstant(nc);
-  firstRowElt.segment(0, nc).setLinSpaced(nc, 0, nc-1);
+  firstRowElt.segment(0, diagSize).setLinSpaced(diagSize, 0, diagSize-1);
   bool found_diag;
-  for (col = 0; col < nc; col++)
+  for (StorageIndex col = 0; col < nc; col++)
   {
-    Index pcol = col;
+    StorageIndex pcol = col;
     if(perm) pcol  = perm[col];
     for (typename MatrixType::InnerIterator it(mat, pcol); it; ++it)
     { 
-      row = it.row();
+      Index row = it.row();
       firstRowElt(row) = (std::min)(firstRowElt(row), col);
     }
   }
@@ -88,24 +88,25 @@ int coletree(const MatrixType& mat, IndexVector& parent, IndexVector& firstRowEl
           except use (firstRowElt[r],c) in place of an edge (r,c) of A.
     Thus each row clique in A'*A is replaced by a star
     centered at its first vertex, which has the same fill. */
-  Index rset, cset, rroot; 
-  for (col = 0; col < nc; col++) 
+  StorageIndex rset, cset, rroot;
+  for (StorageIndex col = 0; col < nc; col++) 
   {
-    found_diag = false;
+    found_diag = col>=m;
     pp(col) = col; 
     cset = col; 
     root(cset) = col; 
     parent(col) = nc; 
     /* The diagonal element is treated here even if it does not exist in the matrix
      * hence the loop is executed once more */ 
-    Index pcol = col;
+    StorageIndex pcol = col;
     if(perm) pcol  = perm[col];
     for (typename MatrixType::InnerIterator it(mat, pcol); it||!found_diag; ++it)
     { //  A sequence of interleaved find and union is performed 
       Index i = col;
       if(it) i = it.index();
       if (i == col) found_diag = true;
-      row = firstRowElt(i);
+      
+      StorageIndex row = firstRowElt(i);
       if (row >= col) continue; 
       rset = internal::etree_find(row, pp); // Find the name of the set containing row
       rroot = root(rset);
@@ -125,10 +126,11 @@ int coletree(const MatrixType& mat, IndexVector& parent, IndexVector& firstRowEl
   * Depth-first search from vertex n.  No recursion.
   * This routine was contributed by Cédric Doucet, CEDRAT Group, Meylan, France.
 */
-template <typename Index, typename IndexVector>
-void nr_etdfs (Index n, IndexVector& parent, IndexVector& first_kid, IndexVector& next_kid, IndexVector& post, Index postnum)
+template <typename IndexVector>
+void nr_etdfs (typename IndexVector::Scalar n, IndexVector& parent, IndexVector& first_kid, IndexVector& next_kid, IndexVector& post, typename IndexVector::Scalar postnum)
 {
-  Index current = n, first, next;
+  typedef typename IndexVector::Scalar StorageIndex;
+  StorageIndex current = n, first, next;
   while (postnum != n) 
   {
     // No kid for the current node
@@ -172,22 +174,22 @@ void nr_etdfs (Index n, IndexVector& parent, IndexVector& first_kid, IndexVector
   * \param parent Input tree
   * \param post postordered tree
   */
-template <typename Index, typename IndexVector>
-void treePostorder(Index n, IndexVector& parent, IndexVector& post)
+template <typename IndexVector>
+void treePostorder(typename IndexVector::Scalar n, IndexVector& parent, IndexVector& post)
 {
+  typedef typename IndexVector::Scalar StorageIndex;
   IndexVector first_kid, next_kid; // Linked list of children 
-  Index postnum; 
+  StorageIndex postnum; 
   // Allocate storage for working arrays and results 
   first_kid.resize(n+1); 
   next_kid.setZero(n+1);
   post.setZero(n+1);
   
   // Set up structure describing children
-  Index v, dad; 
   first_kid.setConstant(-1); 
-  for (v = n-1; v >= 0; v--) 
+  for (StorageIndex v = n-1; v >= 0; v--) 
   {
-    dad = parent(v);
+    StorageIndex dad = parent(v);
     next_kid(v) = first_kid(dad); 
     first_kid(dad) = v; 
   }
diff --git a/nuparu/include/Eigen/src/SparseCore/SparseCompressedBase.h b/nuparu/include/Eigen/src/SparseCore/SparseCompressedBase.h
new file mode 100644
index 00000000..c223e4f4
--- /dev/null
+++ b/nuparu/include/Eigen/src/SparseCore/SparseCompressedBase.h
@@ -0,0 +1,277 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_COMPRESSED_BASE_H
+#define EIGEN_SPARSE_COMPRESSED_BASE_H
+
+namespace Eigen { 
+
+template<typename Derived> class SparseCompressedBase;
+  
+namespace internal {
+
+template<typename Derived>
+struct traits<SparseCompressedBase<Derived> > : traits<Derived>
+{}; // no members of its own: every trait is inherited from traits<Derived>
+
+} // end namespace internal
+
+template<typename Derived>
+class SparseCompressedBase
+  : public SparseMatrixBase<Derived> // adds raw-array accessors, nonZeros(), isCompressed() and the inner iterators
+{
+  public:
+    typedef SparseMatrixBase<Derived> Base;
+    EIGEN_SPARSE_PUBLIC_INTERFACE(SparseCompressedBase)
+    using Base::operator=;
+    using Base::IsRowMajor;
+    
+    class InnerIterator; // walks the stored entries of one inner vector forward (operator++)
+    class ReverseInnerIterator; // walks the same entries backward (operator--)
+    
+  protected: // innerNonZeros(): vector view of the innerNonZeroPtr() array, of size 0 when compressed and outerSize() otherwise
+    typedef typename Base::IndexVector IndexVector;
+    Eigen::Map<IndexVector> innerNonZeros() { return Eigen::Map<IndexVector>(innerNonZeroPtr(), isCompressed()?0:derived().outerSize()); }
+    const  Eigen::Map<const IndexVector> innerNonZeros() const { return Eigen::Map<const IndexVector>(innerNonZeroPtr(), isCompressed()?0:derived().outerSize()); }
+        
+  public:
+    
+    /** \returns the number of non zero coefficients */
+    inline Index nonZeros() const
+    {
+      if(Derived::IsVectorAtCompileTime && outerIndexPtr()==0)
+        return derived().nonZeros(); // vector without outer index array: the derived type answers
+      else if(isCompressed())
+        return outerIndexPtr()[derived().outerSize()]-outerIndexPtr()[0]; // compressed: last minus first outer index
+      else if(derived().outerSize()==0)
+        return 0;
+      else
+        return innerNonZeros().sum(); // uncompressed: add up the per-inner-vector counts
+    }
+    
+    /** \returns a const pointer to the array of values.
+      * This function is aimed at interoperability with other libraries.
+      * \sa innerIndexPtr(), outerIndexPtr() */
+    inline const Scalar* valuePtr() const { return derived().valuePtr(); }
+    /** \returns a non-const pointer to the array of values.
+      * This function is aimed at interoperability with other libraries.
+      * \sa innerIndexPtr(), outerIndexPtr() */
+    inline Scalar* valuePtr() { return derived().valuePtr(); }
+
+    /** \returns a const pointer to the array of inner indices.
+      * This function is aimed at interoperability with other libraries.
+      * \sa valuePtr(), outerIndexPtr() */
+    inline const StorageIndex* innerIndexPtr() const { return derived().innerIndexPtr(); }
+    /** \returns a non-const pointer to the array of inner indices.
+      * This function is aimed at interoperability with other libraries.
+      * \sa valuePtr(), outerIndexPtr() */
+    inline StorageIndex* innerIndexPtr() { return derived().innerIndexPtr(); }
+
+    /** \returns a const pointer to the array of the starting positions of the inner vectors.
+      * This function is aimed at interoperability with other libraries.
+      * \warning it returns the null pointer 0 for SparseVector
+      * \sa valuePtr(), innerIndexPtr() */
+    inline const StorageIndex* outerIndexPtr() const { return derived().outerIndexPtr(); }
+    /** \returns a non-const pointer to the array of the starting positions of the inner vectors.
+      * This function is aimed at interoperability with other libraries.
+      * \warning it returns the null pointer 0 for SparseVector
+      * \sa valuePtr(), innerIndexPtr() */
+    inline StorageIndex* outerIndexPtr() { return derived().outerIndexPtr(); }
+
+    /** \returns a const pointer to the array of the number of non zeros of the inner vectors.
+      * This function is aimed at interoperability with other libraries.
+      * \warning it returns the null pointer 0 in compressed mode */
+    inline const StorageIndex* innerNonZeroPtr() const { return derived().innerNonZeroPtr(); }
+    /** \returns a non-const pointer to the array of the number of non zeros of the inner vectors.
+      * This function is aimed at interoperability with other libraries.
+      * \warning it returns the null pointer 0 in compressed mode */
+    inline StorageIndex* innerNonZeroPtr() { return derived().innerNonZeroPtr(); }
+    
+    /** \returns whether \c *this is in compressed form. */
+    inline bool isCompressed() const { return innerNonZeroPtr()==0; }
+
+  protected:
+    /** Default constructor. Do nothing. */
+    SparseCompressedBase() {}
+  private: // the converting constructor below is private and has no definition in this file
+    template<typename OtherDerived> explicit SparseCompressedBase(const SparseCompressedBase<OtherDerived>&);
+};
+
+template<typename Derived>
+class SparseCompressedBase<Derived>::InnerIterator // forward iterator over the stored entries of one inner vector
+{
+  public:
+    InnerIterator(const SparseCompressedBase& mat, Index outer) // iterates over inner vector 'outer' of 'mat'
+      : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer)
+    {
+      if(Derived::IsVectorAtCompileTime && mat.outerIndexPtr()==0)
+      {
+        m_id = 0; // vector without outer index array: entries span [0, nonZeros())
+        m_end = mat.nonZeros();
+      }
+      else
+      {
+        m_id = mat.outerIndexPtr()[outer];
+        if(mat.isCompressed())
+          m_end = mat.outerIndexPtr()[outer+1]; // compressed: the next inner vector starts where this one ends
+        else
+          m_end = m_id + mat.innerNonZeroPtr()[outer]; // uncompressed: explicit per-vector entry count
+      }
+    }
+
+    explicit InnerIterator(const SparseCompressedBase& mat) // vector-only: walks all the stored entries of 'mat'
+      : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(0), m_id(0), m_end(mat.nonZeros())
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+    }
+
+    explicit InnerIterator(const internal::CompressedStorage<Scalar,StorageIndex>& data) // vector-only: walks a raw CompressedStorage
+      : m_values(&data.value(0)), m_indices(&data.index(0)), m_outer(0), m_id(0), m_end(data.size())
+    {
+      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+    }
+
+    inline InnerIterator& operator++() { m_id++; return *this; }
+
+    inline const Scalar& value() const { return m_values[m_id]; }
+    inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id]); } // writable access: casts away the constness of m_values
+
+    inline StorageIndex index() const { return m_indices[m_id]; } // inner index of the current entry
+    inline Index outer() const { return m_outer.value(); }
+    inline Index row() const { return IsRowMajor ? m_outer.value() : index(); }
+    inline Index col() const { return IsRowMajor ? index() : m_outer.value(); }
+
+    inline operator bool() const { return (m_id < m_end); } // true while the current position is before m_end
+
+  protected:
+    const Scalar* m_values;
+    const StorageIndex* m_indices;
+    const internal::variable_if_dynamic<Index,Derived::IsVectorAtCompileTime?0:Dynamic> m_outer;
+    Index m_id; // position of the current entry in m_values / m_indices
+    Index m_end; // one past the position of the last entry to visit
+  private:
+    // If you get here, then you're not using the right InnerIterator type, e.g.:
+    //   SparseMatrix<double,RowMajor> A;
+    //   SparseMatrix<double>::InnerIterator it(A,0);
+    template<typename T> InnerIterator(const SparseMatrixBase<T>&, Index outer);
+};
+
+// Backward iterator over the stored entries of one inner vector: it starts just past the last
+// entry; operator--() moves towards the first one and value()/index() read the entry at m_id-1.
+template<typename Derived>
+class SparseCompressedBase<Derived>::ReverseInnerIterator
+{
+  public:
+    ReverseInnerIterator(const SparseCompressedBase& mat, Index outer)
+      : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer)
+    {
+      if(Derived::IsVectorAtCompileTime && mat.outerIndexPtr()==0)
+      {
+        m_start = 0; // vector without outer index array: entries span [0, nonZeros())
+        m_id = mat.nonZeros();
+      }
+      else
+      {
+        m_start = mat.outerIndexPtr()[outer];
+        if(mat.isCompressed())
+          m_id = mat.outerIndexPtr()[outer+1];
+        else
+          m_id = m_start + mat.innerNonZeroPtr()[outer]; // uncompressed: explicit entry count
+      }
+    }
+
+    // Vector-only: iterates over all the stored entries of \a mat.
+    explicit ReverseInnerIterator(const SparseCompressedBase& mat)
+      : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(0), m_start(0), m_id(mat.nonZeros())
+    { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); }
+
+    // Vector-only: iterates over the entries held by a raw CompressedStorage.
+    explicit ReverseInnerIterator(const internal::CompressedStorage<Scalar,StorageIndex>& data)
+      : m_values(&data.value(0)), m_indices(&data.index(0)), m_outer(0), m_start(0), m_id(data.size())
+    { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); }
+
+    inline ReverseInnerIterator& operator--() { --m_id; return *this; }
+
+    inline const Scalar& value() const { return m_values[m_id-1]; }
+    inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id-1]); }
+
+    inline StorageIndex index() const { return m_indices[m_id-1]; }
+    inline Index outer() const { return m_outer.value(); }
+    inline Index row() const { return IsRowMajor ? m_outer.value() : index(); }
+    inline Index col() const { return IsRowMajor ? index() : m_outer.value(); }
+
+    inline operator bool() const { return (m_id > m_start); } // true while entries remain before m_id
+
+  protected:
+    const Scalar* m_values;
+    const StorageIndex* m_indices;
+    const internal::variable_if_dynamic<Index,Derived::IsVectorAtCompileTime?0:Dynamic> m_outer;
+    Index m_start; // plain, non-const Index: it is assigned in the constructor body above
+    Index m_id;    // declared after m_start to match the constructor initializer lists
+};
+
+namespace internal {
+
+template<typename Derived>
+struct evaluator<SparseCompressedBase<Derived> >
+  : evaluator_base<Derived> // thin evaluator: stores the address of the matrix and forwards to it
+{
+  typedef typename Derived::Scalar Scalar;
+  typedef typename Derived::InnerIterator InnerIterator;
+  typedef typename Derived::ReverseInnerIterator ReverseInnerIterator;
+  
+  enum {
+    CoeffReadCost = NumTraits<Scalar>::ReadCost,
+    Flags = Derived::Flags
+  };
+  
+  evaluator() : m_matrix(0) // leaves m_matrix null; the members below dereference it
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+  explicit evaluator(const Derived &mat) : m_matrix(&mat)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+  
+  inline Index nonZerosEstimate() const { // the estimate here is simply the count reported by the matrix
+    return m_matrix->nonZeros();
+  }
+  
+  operator Derived&() { return m_matrix->const_cast_derived(); }
+  operator const Derived&() const { return *m_matrix; }
+  
+  typedef typename DenseCoeffsBase<Derived,ReadOnlyAccessors>::CoeffReturnType CoeffReturnType;
+  Scalar coeff(Index row, Index col) const
+  { return m_matrix->coeff(row,col); }
+  
+  Scalar& coeffRef(Index row, Index col) // writable reference to an entry that must already be stored
+  {
+    eigen_internal_assert(row>=0 && row<m_matrix->rows() && col>=0 && col<m_matrix->cols());
+      
+    const Index outer = Derived::IsRowMajor ? row : col;
+    const Index inner = Derived::IsRowMajor ? col : row;
+
+    Index start = m_matrix->outerIndexPtr()[outer]; // [start,end) = stored entries of inner vector 'outer'
+    Index end = m_matrix->isCompressed() ? m_matrix->outerIndexPtr()[outer+1] : m_matrix->outerIndexPtr()[outer] + m_matrix->innerNonZeroPtr()[outer];
+    eigen_assert(end>start && "you are using a non finalized sparse matrix or written coefficient does not exist");
+    const Index p =   std::lower_bound(m_matrix->innerIndexPtr()+start, m_matrix->innerIndexPtr()+end,inner) // NOTE(review): binary search, relies on sorted inner indices - confirm
+                    - m_matrix->innerIndexPtr(); // p is an offset from the start of the index array, so it also indexes valuePtr()
+    eigen_assert((p<end) && (m_matrix->innerIndexPtr()[p]==inner) && "written coefficient does not exist");
+    return m_matrix->const_cast_derived().valuePtr()[p];
+  }
+
+  const Derived *m_matrix; // not owned: only the address given to the constructor is kept
+};
+
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSE_COMPRESSED_BASE_H
diff --git a/nuparu/include/Eigen/src/SparseCore/SparseCwiseBinaryOp.h b/nuparu/include/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
index ec86ca93..d9420ac6 100644
--- a/nuparu/include/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
+++ b/nuparu/include/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -29,87 +29,61 @@ namespace Eigen {
 //  4 - dense op dense     product      dense
 //                         generic      dense
 
-namespace internal {
-
-template<> struct promote_storage_type<Dense,Sparse>
-{ typedef Sparse ret; };
-
-template<> struct promote_storage_type<Sparse,Dense>
-{ typedef Sparse ret; };
-
-template<typename BinaryOp, typename Lhs, typename Rhs, typename Derived,
-  typename _LhsStorageMode = typename traits<Lhs>::StorageKind,
-  typename _RhsStorageMode = typename traits<Rhs>::StorageKind>
-class sparse_cwise_binary_op_inner_iterator_selector;
-
-} // end namespace internal
-
 template<typename BinaryOp, typename Lhs, typename Rhs>
 class CwiseBinaryOpImpl<BinaryOp, Lhs, Rhs, Sparse>
   : public SparseMatrixBase<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
 {
   public:
-    class InnerIterator;
-    class ReverseInnerIterator;
     typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> Derived;
+    typedef SparseMatrixBase<Derived> Base;
     EIGEN_SPARSE_PUBLIC_INTERFACE(Derived)
     CwiseBinaryOpImpl()
     {
-      typedef typename internal::traits<Lhs>::StorageKind LhsStorageKind;
-      typedef typename internal::traits<Rhs>::StorageKind RhsStorageKind;
       EIGEN_STATIC_ASSERT((
-                (!internal::is_same<LhsStorageKind,RhsStorageKind>::value)
+                (!internal::is_same<typename internal::traits<Lhs>::StorageKind,
+                                    typename internal::traits<Rhs>::StorageKind>::value)
             ||  ((Lhs::Flags&RowMajorBit) == (Rhs::Flags&RowMajorBit))),
             THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH);
     }
 };
 
-template<typename BinaryOp, typename Lhs, typename Rhs>
-class CwiseBinaryOpImpl<BinaryOp,Lhs,Rhs,Sparse>::InnerIterator
-  : public internal::sparse_cwise_binary_op_inner_iterator_selector<BinaryOp,Lhs,Rhs,typename CwiseBinaryOpImpl<BinaryOp,Lhs,Rhs,Sparse>::InnerIterator>
-{
-  public:
-    typedef typename Lhs::Index Index;
-    typedef internal::sparse_cwise_binary_op_inner_iterator_selector<
-      BinaryOp,Lhs,Rhs, InnerIterator> Base;
-
-    EIGEN_STRONG_INLINE InnerIterator(const CwiseBinaryOpImpl& binOp, Index outer)
-      : Base(binOp.derived(),outer)
-    {}
-};
-
-/***************************************************************************
-* Implementation of inner-iterators
-***************************************************************************/
+namespace internal {
 
-// template<typename T> struct internal::func_is_conjunction { enum { ret = false }; };
-// template<typename T> struct internal::func_is_conjunction<internal::scalar_product_op<T> > { enum { ret = true }; };
+template<typename BinaryOp, typename Lhs, typename Rhs, typename Derived,
+  typename _LhsStorageMode = typename traits<Lhs>::StorageKind,
+  typename _RhsStorageMode = typename traits<Rhs>::StorageKind>
+class sparse_cwise_binary_op_inner_iterator_selector;
 
-// TODO generalize the internal::scalar_product_op specialization to all conjunctions if any !
+} // end namespace internal
 
 namespace internal {
 
-// sparse - sparse  (generic)
-template<typename BinaryOp, typename Lhs, typename Rhs, typename Derived>
-class sparse_cwise_binary_op_inner_iterator_selector<BinaryOp, Lhs, Rhs, Derived, Sparse, Sparse>
+  
+// Generic "sparse OP sparse"
+template<typename BinaryOp, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IteratorBased, IteratorBased>
+  : evaluator_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
 {
-    typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> CwiseBinaryXpr;
-    typedef typename traits<CwiseBinaryXpr>::Scalar Scalar;
-    typedef typename traits<CwiseBinaryXpr>::_LhsNested _LhsNested;
-    typedef typename traits<CwiseBinaryXpr>::_RhsNested _RhsNested;
-    typedef typename _LhsNested::InnerIterator LhsIterator;
-    typedef typename _RhsNested::InnerIterator RhsIterator;
-    typedef typename Lhs::Index Index;
-
+protected:
+  typedef typename evaluator<Lhs>::InnerIterator  LhsIterator;
+  typedef typename evaluator<Rhs>::InnerIterator  RhsIterator;
+  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
+  typedef typename traits<XprType>::Scalar Scalar;
+  typedef typename XprType::StorageIndex StorageIndex;
+public:
+
+  class ReverseInnerIterator;
+  class InnerIterator
+  {
   public:
-
-    EIGEN_STRONG_INLINE sparse_cwise_binary_op_inner_iterator_selector(const CwiseBinaryXpr& xpr, Index outer)
-      : m_lhsIter(xpr.lhs(),outer), m_rhsIter(xpr.rhs(),outer), m_functor(xpr.functor())
+    
+    EIGEN_STRONG_INLINE InnerIterator(const binary_evaluator& aEval, Index outer)
+      : m_lhsIter(aEval.m_lhsImpl,outer), m_rhsIter(aEval.m_rhsImpl,outer), m_functor(aEval.m_functor)
     {
       this->operator++();
     }
 
-    EIGEN_STRONG_INLINE Derived& operator++()
+    EIGEN_STRONG_INLINE InnerIterator& operator++()
     {
       if (m_lhsIter && m_rhsIter && (m_lhsIter.index() == m_rhsIter.index()))
       {
@@ -135,12 +109,12 @@ class sparse_cwise_binary_op_inner_iterator_selector<BinaryOp, Lhs, Rhs, Derived
         m_value = 0; // this is to avoid a compilation warning
         m_id = -1;
       }
-      return *static_cast<Derived*>(this);
+      return *this;
     }
 
     EIGEN_STRONG_INLINE Scalar value() const { return m_value; }
 
-    EIGEN_STRONG_INLINE Index index() const { return m_id; }
+    EIGEN_STRONG_INLINE StorageIndex index() const { return m_id; }
     EIGEN_STRONG_INLINE Index row() const { return Lhs::IsRowMajor ? m_lhsIter.row() : index(); }
     EIGEN_STRONG_INLINE Index col() const { return Lhs::IsRowMajor ? index() : m_lhsIter.col(); }
 
@@ -151,25 +125,55 @@ class sparse_cwise_binary_op_inner_iterator_selector<BinaryOp, Lhs, Rhs, Derived
     RhsIterator m_rhsIter;
     const BinaryOp& m_functor;
     Scalar m_value;
-    Index m_id;
+    StorageIndex m_id;
+  };
+  
+  
+  enum {
+    CoeffReadCost = evaluator<Lhs>::CoeffReadCost + evaluator<Rhs>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
+    Flags = XprType::Flags
+  };
+  
+  explicit binary_evaluator(const XprType& xpr)
+    : m_functor(xpr.functor()),
+      m_lhsImpl(xpr.lhs()), 
+      m_rhsImpl(xpr.rhs())  
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+  
+  inline Index nonZerosEstimate() const {
+    return m_lhsImpl.nonZerosEstimate() + m_rhsImpl.nonZerosEstimate();
+  }
+
+protected:
+  const BinaryOp m_functor;
+  evaluator<Lhs> m_lhsImpl;
+  evaluator<Rhs> m_rhsImpl;
 };
 
-// sparse - sparse  (product)
-template<typename T, typename Lhs, typename Rhs, typename Derived>
-class sparse_cwise_binary_op_inner_iterator_selector<scalar_product_op<T>, Lhs, Rhs, Derived, Sparse, Sparse>
+// "sparse .* sparse"
+template<typename T, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_product_op<T>, Lhs, Rhs>, IteratorBased, IteratorBased>
+  : evaluator_base<CwiseBinaryOp<scalar_product_op<T>, Lhs, Rhs> >
 {
-    typedef scalar_product_op<T> BinaryFunc;
-    typedef CwiseBinaryOp<BinaryFunc, Lhs, Rhs> CwiseBinaryXpr;
-    typedef typename CwiseBinaryXpr::Scalar Scalar;
-    typedef typename traits<CwiseBinaryXpr>::_LhsNested _LhsNested;
-    typedef typename _LhsNested::InnerIterator LhsIterator;
-    typedef typename traits<CwiseBinaryXpr>::_RhsNested _RhsNested;
-    typedef typename _RhsNested::InnerIterator RhsIterator;
-    typedef typename Lhs::Index Index;
+protected:
+  typedef scalar_product_op<T> BinaryOp;
+  typedef typename evaluator<Lhs>::InnerIterator  LhsIterator;
+  typedef typename evaluator<Rhs>::InnerIterator  RhsIterator;
+  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
+  typedef typename XprType::StorageIndex StorageIndex;
+  typedef typename traits<XprType>::Scalar Scalar;
+public:
+
+  class ReverseInnerIterator;
+  class InnerIterator
+  {
   public:
-
-    EIGEN_STRONG_INLINE sparse_cwise_binary_op_inner_iterator_selector(const CwiseBinaryXpr& xpr, Index outer)
-      : m_lhsIter(xpr.lhs(),outer), m_rhsIter(xpr.rhs(),outer), m_functor(xpr.functor())
+    
+    EIGEN_STRONG_INLINE InnerIterator(const binary_evaluator& aEval, Index outer)
+      : m_lhsIter(aEval.m_lhsImpl,outer), m_rhsIter(aEval.m_rhsImpl,outer), m_functor(aEval.m_functor)
     {
       while (m_lhsIter && m_rhsIter && (m_lhsIter.index() != m_rhsIter.index()))
       {
@@ -180,7 +184,7 @@ class sparse_cwise_binary_op_inner_iterator_selector<scalar_product_op<T>, Lhs,
       }
     }
 
-    EIGEN_STRONG_INLINE Derived& operator++()
+    EIGEN_STRONG_INLINE InnerIterator& operator++()
     {
       ++m_lhsIter;
       ++m_rhsIter;
@@ -191,12 +195,12 @@ class sparse_cwise_binary_op_inner_iterator_selector<scalar_product_op<T>, Lhs,
         else
           ++m_rhsIter;
       }
-      return *static_cast<Derived*>(this);
+      return *this;
     }
-
+    
     EIGEN_STRONG_INLINE Scalar value() const { return m_functor(m_lhsIter.value(), m_rhsIter.value()); }
 
-    EIGEN_STRONG_INLINE Index index() const { return m_lhsIter.index(); }
+    EIGEN_STRONG_INLINE StorageIndex index() const { return m_lhsIter.index(); }
     EIGEN_STRONG_INLINE Index row() const { return m_lhsIter.row(); }
     EIGEN_STRONG_INLINE Index col() const { return m_lhsIter.col(); }
 
@@ -205,91 +209,180 @@ class sparse_cwise_binary_op_inner_iterator_selector<scalar_product_op<T>, Lhs,
   protected:
     LhsIterator m_lhsIter;
     RhsIterator m_rhsIter;
-    const BinaryFunc& m_functor;
+    const BinaryOp& m_functor;
+  };
+  
+  
+  enum {
+    CoeffReadCost = evaluator<Lhs>::CoeffReadCost + evaluator<Rhs>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
+    Flags = XprType::Flags
+  };
+  
+  explicit binary_evaluator(const XprType& xpr)
+    : m_functor(xpr.functor()),
+      m_lhsImpl(xpr.lhs()), 
+      m_rhsImpl(xpr.rhs())  
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+  
+  inline Index nonZerosEstimate() const {
+    return (std::min)(m_lhsImpl.nonZerosEstimate(), m_rhsImpl.nonZerosEstimate());
+  }
+
+protected:
+  const BinaryOp m_functor;
+  evaluator<Lhs> m_lhsImpl;
+  evaluator<Rhs> m_rhsImpl;
 };
 
-// sparse - dense  (product)
-template<typename T, typename Lhs, typename Rhs, typename Derived>
-class sparse_cwise_binary_op_inner_iterator_selector<scalar_product_op<T>, Lhs, Rhs, Derived, Sparse, Dense>
+// "dense .* sparse"
+template<typename T, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_product_op<T>, Lhs, Rhs>, IndexBased, IteratorBased>
+  : evaluator_base<CwiseBinaryOp<scalar_product_op<T>, Lhs, Rhs> >
 {
-    typedef scalar_product_op<T> BinaryFunc;
-    typedef CwiseBinaryOp<BinaryFunc, Lhs, Rhs> CwiseBinaryXpr;
-    typedef typename CwiseBinaryXpr::Scalar Scalar;
-    typedef typename traits<CwiseBinaryXpr>::_LhsNested _LhsNested;
-    typedef typename traits<CwiseBinaryXpr>::RhsNested RhsNested;
-    typedef typename _LhsNested::InnerIterator LhsIterator;
-    typedef typename Lhs::Index Index;
-    enum { IsRowMajor = (int(Lhs::Flags)&RowMajorBit)==RowMajorBit };
-  public:
+protected:
+  typedef scalar_product_op<T> BinaryOp;
+  typedef evaluator<Lhs>  LhsEvaluator;
+  typedef typename evaluator<Rhs>::InnerIterator  RhsIterator;
+  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
+  typedef typename XprType::StorageIndex StorageIndex;
+  typedef typename traits<XprType>::Scalar Scalar;
+public:
+
+  class ReverseInnerIterator;
+  class InnerIterator
+  {
+    enum { IsRowMajor = (int(Rhs::Flags)&RowMajorBit)==RowMajorBit };
 
-    EIGEN_STRONG_INLINE sparse_cwise_binary_op_inner_iterator_selector(const CwiseBinaryXpr& xpr, Index outer)
-      : m_rhs(xpr.rhs()), m_lhsIter(xpr.lhs(),outer), m_functor(xpr.functor()), m_outer(outer)
+  public:
+    
+    EIGEN_STRONG_INLINE InnerIterator(const binary_evaluator& aEval, Index outer)
+      : m_lhsEval(aEval.m_lhsImpl), m_rhsIter(aEval.m_rhsImpl,outer), m_functor(aEval.m_functor), m_outer(outer)
     {}
 
-    EIGEN_STRONG_INLINE Derived& operator++()
+    EIGEN_STRONG_INLINE InnerIterator& operator++()
     {
-      ++m_lhsIter;
-      return *static_cast<Derived*>(this);
+      ++m_rhsIter;
+      return *this;
     }
 
     EIGEN_STRONG_INLINE Scalar value() const
-    { return m_functor(m_lhsIter.value(),
-                       m_rhs.coeff(IsRowMajor?m_outer:m_lhsIter.index(),IsRowMajor?m_lhsIter.index():m_outer)); }
+    { return m_functor(m_lhsEval.coeff(IsRowMajor?m_outer:m_rhsIter.index(),IsRowMajor?m_rhsIter.index():m_outer), m_rhsIter.value()); }
 
-    EIGEN_STRONG_INLINE Index index() const { return m_lhsIter.index(); }
-    EIGEN_STRONG_INLINE Index row() const { return m_lhsIter.row(); }
-    EIGEN_STRONG_INLINE Index col() const { return m_lhsIter.col(); }
-
-    EIGEN_STRONG_INLINE operator bool() const { return m_lhsIter; }
+    EIGEN_STRONG_INLINE StorageIndex index() const { return m_rhsIter.index(); }
+    EIGEN_STRONG_INLINE Index row() const { return m_rhsIter.row(); }
+    EIGEN_STRONG_INLINE Index col() const { return m_rhsIter.col(); }
 
+    EIGEN_STRONG_INLINE operator bool() const { return m_rhsIter; }
+    
   protected:
-    RhsNested m_rhs;
-    LhsIterator m_lhsIter;
-    const BinaryFunc m_functor;
+    const LhsEvaluator &m_lhsEval;
+    RhsIterator m_rhsIter;
+    const BinaryOp& m_functor;
     const Index m_outer;
+  };
+  
+  
+  enum {
+    CoeffReadCost = evaluator<Lhs>::CoeffReadCost + evaluator<Rhs>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
+    Flags = XprType::Flags
+  };
+  
+  explicit binary_evaluator(const XprType& xpr)
+    : m_functor(xpr.functor()),
+      m_lhsImpl(xpr.lhs()), 
+      m_rhsImpl(xpr.rhs())  
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+  
+  inline Index nonZerosEstimate() const {
+    return m_rhsImpl.nonZerosEstimate();
+  }
+
+protected:
+  const BinaryOp m_functor;
+  evaluator<Lhs> m_lhsImpl;
+  evaluator<Rhs> m_rhsImpl;
 };
 
-// sparse - dense  (product)
-template<typename T, typename Lhs, typename Rhs, typename Derived>
-class sparse_cwise_binary_op_inner_iterator_selector<scalar_product_op<T>, Lhs, Rhs, Derived, Dense, Sparse>
+// "sparse .* dense"
+template<typename T, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_product_op<T>, Lhs, Rhs>, IteratorBased, IndexBased>
+  : evaluator_base<CwiseBinaryOp<scalar_product_op<T>, Lhs, Rhs> >
 {
-    typedef scalar_product_op<T> BinaryFunc;
-    typedef CwiseBinaryOp<BinaryFunc, Lhs, Rhs> CwiseBinaryXpr;
-    typedef typename CwiseBinaryXpr::Scalar Scalar;
-    typedef typename traits<CwiseBinaryXpr>::_RhsNested _RhsNested;
-    typedef typename _RhsNested::InnerIterator RhsIterator;
-    typedef typename Lhs::Index Index;
+protected:
+  typedef scalar_product_op<T> BinaryOp;
+  typedef typename evaluator<Lhs>::InnerIterator  LhsIterator;
+  typedef evaluator<Rhs>  RhsEvaluator;
+  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
+  typedef typename XprType::StorageIndex StorageIndex;
+  typedef typename traits<XprType>::Scalar Scalar;
+public:
+
+  class ReverseInnerIterator;
+  class InnerIterator
+  {
+    enum { IsRowMajor = (int(Lhs::Flags)&RowMajorBit)==RowMajorBit };
 
-    enum { IsRowMajor = (int(Rhs::Flags)&RowMajorBit)==RowMajorBit };
   public:
-
-    EIGEN_STRONG_INLINE sparse_cwise_binary_op_inner_iterator_selector(const CwiseBinaryXpr& xpr, Index outer)
-      : m_xpr(xpr), m_rhsIter(xpr.rhs(),outer), m_functor(xpr.functor()), m_outer(outer)
+    
+    EIGEN_STRONG_INLINE InnerIterator(const binary_evaluator& aEval, Index outer)
+      : m_lhsIter(aEval.m_lhsImpl,outer), m_rhsEval(aEval.m_rhsImpl), m_functor(aEval.m_functor), m_outer(outer)
     {}
 
-    EIGEN_STRONG_INLINE Derived& operator++()
+    EIGEN_STRONG_INLINE InnerIterator& operator++()
     {
-      ++m_rhsIter;
-      return *static_cast<Derived*>(this);
+      ++m_lhsIter;
+      return *this;
     }
 
     EIGEN_STRONG_INLINE Scalar value() const
-    { return m_functor(m_xpr.lhs().coeff(IsRowMajor?m_outer:m_rhsIter.index(),IsRowMajor?m_rhsIter.index():m_outer), m_rhsIter.value()); }
-
-    EIGEN_STRONG_INLINE Index index() const { return m_rhsIter.index(); }
-    EIGEN_STRONG_INLINE Index row() const { return m_rhsIter.row(); }
-    EIGEN_STRONG_INLINE Index col() const { return m_rhsIter.col(); }
+    { return m_functor(m_lhsIter.value(),
+                       m_rhsEval.coeff(IsRowMajor?m_outer:m_lhsIter.index(),IsRowMajor?m_lhsIter.index():m_outer)); }
 
-    EIGEN_STRONG_INLINE operator bool() const { return m_rhsIter; }
+    EIGEN_STRONG_INLINE StorageIndex index() const { return m_lhsIter.index(); }
+    EIGEN_STRONG_INLINE Index row() const { return m_lhsIter.row(); }
+    EIGEN_STRONG_INLINE Index col() const { return m_lhsIter.col(); }
 
+    EIGEN_STRONG_INLINE operator bool() const { return m_lhsIter; }
+    
   protected:
-    const CwiseBinaryXpr& m_xpr;
-    RhsIterator m_rhsIter;
-    const BinaryFunc& m_functor;
+    LhsIterator m_lhsIter;
+    const evaluator<Rhs> &m_rhsEval;
+    const BinaryOp& m_functor;
     const Index m_outer;
+  };
+  
+  
+  enum {
+    CoeffReadCost = evaluator<Lhs>::CoeffReadCost + evaluator<Rhs>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
+    Flags = XprType::Flags
+  };
+  
+  explicit binary_evaluator(const XprType& xpr)
+    : m_functor(xpr.functor()),
+      m_lhsImpl(xpr.lhs()), 
+      m_rhsImpl(xpr.rhs())  
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+  
+  inline Index nonZerosEstimate() const {
+    return m_lhsImpl.nonZerosEstimate();
+  }
+
+protected:
+  const BinaryOp m_functor;
+  evaluator<Lhs> m_lhsImpl;
+  evaluator<Rhs> m_rhsImpl;
 };
 
-} // end namespace internal
+}
 
 /***************************************************************************
 * Implementation of SparseMatrixBase and SparseCwise functions/operators
@@ -313,10 +406,26 @@ SparseMatrixBase<Derived>::operator+=(const SparseMatrixBase<OtherDerived>& othe
 
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_STRONG_INLINE const EIGEN_SPARSE_CWISE_PRODUCT_RETURN_TYPE
+Derived& SparseMatrixBase<Derived>::operator+=(const DiagonalBase<OtherDerived>& other)
+{
+  call_assignment_no_alias(derived(), other.derived(), internal::add_assign_op<Scalar>());
+  return derived();
+}
+
+template<typename Derived>
+template<typename OtherDerived>
+Derived& SparseMatrixBase<Derived>::operator-=(const DiagonalBase<OtherDerived>& other)
+{
+  call_assignment_no_alias(derived(), other.derived(), internal::sub_assign_op<Scalar>());
+  return derived();
+}
+    
+template<typename Derived>
+template<typename OtherDerived>
+EIGEN_STRONG_INLINE const typename SparseMatrixBase<Derived>::template CwiseProductDenseReturnType<OtherDerived>::Type
 SparseMatrixBase<Derived>::cwiseProduct(const MatrixBase<OtherDerived> &other) const
 {
-  return EIGEN_SPARSE_CWISE_PRODUCT_RETURN_TYPE(derived(), other.derived());
+  return typename CwiseProductDenseReturnType<OtherDerived>::Type(derived(), other.derived());
 }
 
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/SparseCore/SparseCwiseUnaryOp.h b/nuparu/include/Eigen/src/SparseCore/SparseCwiseUnaryOp.h
index 5a50c780..fe4a9712 100644
--- a/nuparu/include/Eigen/src/SparseCore/SparseCwiseUnaryOp.h
+++ b/nuparu/include/Eigen/src/SparseCore/SparseCwiseUnaryOp.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -12,131 +12,166 @@
 
 namespace Eigen { 
 
-template<typename UnaryOp, typename MatrixType>
-class CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>
-  : public SparseMatrixBase<CwiseUnaryOp<UnaryOp, MatrixType> >
+namespace internal {
+  
+template<typename UnaryOp, typename ArgType>
+struct unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>
+  : public evaluator_base<CwiseUnaryOp<UnaryOp,ArgType> >
 {
   public:
+    typedef CwiseUnaryOp<UnaryOp, ArgType> XprType;
 
     class InnerIterator;
-    class ReverseInnerIterator;
-
-    typedef CwiseUnaryOp<UnaryOp, MatrixType> Derived;
-    EIGEN_SPARSE_PUBLIC_INTERFACE(Derived)
+//     class ReverseInnerIterator;
+    
+    enum {
+      CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<UnaryOp>::Cost,
+      Flags = XprType::Flags
+    };
+    
+    explicit unary_evaluator(const XprType& op) : m_functor(op.functor()), m_argImpl(op.nestedExpression())
+    {
+      EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<UnaryOp>::Cost);
+      EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+    }
+    
+    inline Index nonZerosEstimate() const {
+      return m_argImpl.nonZerosEstimate();
+    }
 
   protected:
-    typedef typename internal::traits<Derived>::_XprTypeNested _MatrixTypeNested;
-    typedef typename _MatrixTypeNested::InnerIterator MatrixTypeIterator;
-    typedef typename _MatrixTypeNested::ReverseInnerIterator MatrixTypeReverseIterator;
+    typedef typename evaluator<ArgType>::InnerIterator        EvalIterator;
+//     typedef typename evaluator<ArgType>::ReverseInnerIterator EvalReverseIterator;
+    
+    const UnaryOp m_functor;
+    evaluator<ArgType> m_argImpl;
 };
 
-template<typename UnaryOp, typename MatrixType>
-class CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>::InnerIterator
-    : public CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>::MatrixTypeIterator
+template<typename UnaryOp, typename ArgType>
+class unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>::InnerIterator
+    : public unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>::EvalIterator
 {
-    typedef typename CwiseUnaryOpImpl::Scalar Scalar;
-    typedef typename CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>::MatrixTypeIterator Base;
+    typedef typename XprType::Scalar Scalar;
+    typedef typename unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>::EvalIterator Base;
   public:
 
-    EIGEN_STRONG_INLINE InnerIterator(const CwiseUnaryOpImpl& unaryOp, typename CwiseUnaryOpImpl::Index outer)
-      : Base(unaryOp.derived().nestedExpression(),outer), m_functor(unaryOp.derived().functor())
+    EIGEN_STRONG_INLINE InnerIterator(const unary_evaluator& unaryOp, Index outer)
+      : Base(unaryOp.m_argImpl,outer), m_functor(unaryOp.m_functor)
     {}
 
     EIGEN_STRONG_INLINE InnerIterator& operator++()
     { Base::operator++(); return *this; }
 
-    EIGEN_STRONG_INLINE typename CwiseUnaryOpImpl::Scalar value() const { return m_functor(Base::value()); }
+    EIGEN_STRONG_INLINE Scalar value() const { return m_functor(Base::value()); }
 
   protected:
     const UnaryOp m_functor;
   private:
-    typename CwiseUnaryOpImpl::Scalar& valueRef();
+    Scalar& valueRef();
 };
 
-template<typename UnaryOp, typename MatrixType>
-class CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>::ReverseInnerIterator
-    : public CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>::MatrixTypeReverseIterator
-{
-    typedef typename CwiseUnaryOpImpl::Scalar Scalar;
-    typedef typename CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>::MatrixTypeReverseIterator Base;
-  public:
-
-    EIGEN_STRONG_INLINE ReverseInnerIterator(const CwiseUnaryOpImpl& unaryOp, typename CwiseUnaryOpImpl::Index outer)
-      : Base(unaryOp.derived().nestedExpression(),outer), m_functor(unaryOp.derived().functor())
-    {}
-
-    EIGEN_STRONG_INLINE ReverseInnerIterator& operator--()
-    { Base::operator--(); return *this; }
-
-    EIGEN_STRONG_INLINE typename CwiseUnaryOpImpl::Scalar value() const { return m_functor(Base::value()); }
-
-  protected:
-    const UnaryOp m_functor;
-  private:
-    typename CwiseUnaryOpImpl::Scalar& valueRef();
-};
-
-template<typename ViewOp, typename MatrixType>
-class CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>
-  : public SparseMatrixBase<CwiseUnaryView<ViewOp, MatrixType> >
+// template<typename UnaryOp, typename ArgType>
+// class unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>::ReverseInnerIterator
+//     : public unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>::EvalReverseIterator
+// {
+//     typedef typename XprType::Scalar Scalar;
+//     typedef typename unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>::EvalReverseIterator Base;
+//   public:
+// 
+//     EIGEN_STRONG_INLINE ReverseInnerIterator(const XprType& unaryOp, typename XprType::Index outer)
+//       : Base(unaryOp.derived().nestedExpression(),outer), m_functor(unaryOp.derived().functor())
+//     {}
+// 
+//     EIGEN_STRONG_INLINE ReverseInnerIterator& operator--()
+//     { Base::operator--(); return *this; }
+// 
+//     EIGEN_STRONG_INLINE Scalar value() const { return m_functor(Base::value()); }
+// 
+//   protected:
+//     const UnaryOp m_functor;
+//   private:
+//     Scalar& valueRef();
+// };
+
+
+
+
+
+template<typename ViewOp, typename ArgType>
+struct unary_evaluator<CwiseUnaryView<ViewOp,ArgType>, IteratorBased>
+  : public evaluator_base<CwiseUnaryView<ViewOp,ArgType> >
 {
   public:
+    typedef CwiseUnaryView<ViewOp, ArgType> XprType;
 
     class InnerIterator;
     class ReverseInnerIterator;
-
-    typedef CwiseUnaryView<ViewOp, MatrixType> Derived;
-    EIGEN_SPARSE_PUBLIC_INTERFACE(Derived)
+    
+    enum {
+      CoeffReadCost = evaluator<ArgType>::CoeffReadCost + functor_traits<ViewOp>::Cost,
+      Flags = XprType::Flags
+    };
+    
+    explicit unary_evaluator(const XprType& op) : m_functor(op.functor()), m_argImpl(op.nestedExpression())
+    {
+      EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<ViewOp>::Cost);
+      EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+    }
 
   protected:
-    typedef typename internal::traits<Derived>::_MatrixTypeNested _MatrixTypeNested;
-    typedef typename _MatrixTypeNested::InnerIterator MatrixTypeIterator;
-    typedef typename _MatrixTypeNested::ReverseInnerIterator MatrixTypeReverseIterator;
+    typedef typename evaluator<ArgType>::InnerIterator        EvalIterator;
+//     typedef typename evaluator<ArgType>::ReverseInnerIterator EvalReverseIterator;
+    
+    const ViewOp m_functor;
+    evaluator<ArgType> m_argImpl;
 };
 
-template<typename ViewOp, typename MatrixType>
-class CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>::InnerIterator
-    : public CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>::MatrixTypeIterator
+template<typename ViewOp, typename ArgType>
+class unary_evaluator<CwiseUnaryView<ViewOp,ArgType>, IteratorBased>::InnerIterator
+    : public unary_evaluator<CwiseUnaryView<ViewOp,ArgType>, IteratorBased>::EvalIterator
 {
-    typedef typename CwiseUnaryViewImpl::Scalar Scalar;
-    typedef typename CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>::MatrixTypeIterator Base;
+    typedef typename XprType::Scalar Scalar;
+    typedef typename unary_evaluator<CwiseUnaryView<ViewOp,ArgType>, IteratorBased>::EvalIterator Base;
   public:
 
-    EIGEN_STRONG_INLINE InnerIterator(const CwiseUnaryViewImpl& unaryOp, typename CwiseUnaryViewImpl::Index outer)
-      : Base(unaryOp.derived().nestedExpression(),outer), m_functor(unaryOp.derived().functor())
+    EIGEN_STRONG_INLINE InnerIterator(const unary_evaluator& unaryOp, Index outer)
+      : Base(unaryOp.m_argImpl,outer), m_functor(unaryOp.m_functor)
     {}
 
     EIGEN_STRONG_INLINE InnerIterator& operator++()
     { Base::operator++(); return *this; }
 
-    EIGEN_STRONG_INLINE typename CwiseUnaryViewImpl::Scalar value() const { return m_functor(Base::value()); }
-    EIGEN_STRONG_INLINE typename CwiseUnaryViewImpl::Scalar& valueRef() { return m_functor(Base::valueRef()); }
+    EIGEN_STRONG_INLINE Scalar value() const { return m_functor(Base::value()); }
+    EIGEN_STRONG_INLINE Scalar& valueRef() { return m_functor(Base::valueRef()); }
 
   protected:
     const ViewOp m_functor;
 };
 
-template<typename ViewOp, typename MatrixType>
-class CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>::ReverseInnerIterator
-    : public CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>::MatrixTypeReverseIterator
-{
-    typedef typename CwiseUnaryViewImpl::Scalar Scalar;
-    typedef typename CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>::MatrixTypeReverseIterator Base;
-  public:
-
-    EIGEN_STRONG_INLINE ReverseInnerIterator(const CwiseUnaryViewImpl& unaryOp, typename CwiseUnaryViewImpl::Index outer)
-      : Base(unaryOp.derived().nestedExpression(),outer), m_functor(unaryOp.derived().functor())
-    {}
-
-    EIGEN_STRONG_INLINE ReverseInnerIterator& operator--()
-    { Base::operator--(); return *this; }
-
-    EIGEN_STRONG_INLINE typename CwiseUnaryViewImpl::Scalar value() const { return m_functor(Base::value()); }
-    EIGEN_STRONG_INLINE typename CwiseUnaryViewImpl::Scalar& valueRef() { return m_functor(Base::valueRef()); }
-
-  protected:
-    const ViewOp m_functor;
-};
+// template<typename ViewOp, typename ArgType>
+// class unary_evaluator<CwiseUnaryView<ViewOp,ArgType>, IteratorBased>::ReverseInnerIterator
+//     : public unary_evaluator<CwiseUnaryView<ViewOp,ArgType>, IteratorBased>::EvalReverseIterator
+// {
+//     typedef typename XprType::Scalar Scalar;
+//     typedef typename unary_evaluator<CwiseUnaryView<ViewOp,ArgType>, IteratorBased>::EvalReverseIterator Base;
+//   public:
+// 
+//     EIGEN_STRONG_INLINE ReverseInnerIterator(const XprType& unaryOp, typename XprType::Index outer)
+//       : Base(unaryOp.derived().nestedExpression(),outer), m_functor(unaryOp.derived().functor())
+//     {}
+// 
+//     EIGEN_STRONG_INLINE ReverseInnerIterator& operator--()
+//     { Base::operator--(); return *this; }
+// 
+//     EIGEN_STRONG_INLINE Scalar value() const { return m_functor(Base::value()); }
+//     EIGEN_STRONG_INLINE Scalar& valueRef() { return m_functor(Base::valueRef()); }
+// 
+//   protected:
+//     const ViewOp m_functor;
+// };
+
+
+} // end namespace internal
 
 template<typename Derived>
 EIGEN_STRONG_INLINE Derived&
diff --git a/nuparu/include/Eigen/src/SparseCore/SparseDenseProduct.h b/nuparu/include/Eigen/src/SparseCore/SparseDenseProduct.h
index 30975c29..87c946b9 100644
--- a/nuparu/include/Eigen/src/SparseCore/SparseDenseProduct.h
+++ b/nuparu/include/Eigen/src/SparseCore/SparseDenseProduct.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -12,177 +12,91 @@
 
 namespace Eigen { 
 
-template<typename Lhs, typename Rhs, int InnerSize> struct SparseDenseProductReturnType
-{
-  typedef SparseTimeDenseProduct<Lhs,Rhs> Type;
-};
-
-template<typename Lhs, typename Rhs> struct SparseDenseProductReturnType<Lhs,Rhs,1>
-{
-  typedef SparseDenseOuterProduct<Lhs,Rhs,false> Type;
-};
-
-template<typename Lhs, typename Rhs, int InnerSize> struct DenseSparseProductReturnType
-{
-  typedef DenseTimeSparseProduct<Lhs,Rhs> Type;
-};
-
-template<typename Lhs, typename Rhs> struct DenseSparseProductReturnType<Lhs,Rhs,1>
-{
-  typedef SparseDenseOuterProduct<Rhs,Lhs,true> Type;
-};
-
 namespace internal {
 
-template<typename Lhs, typename Rhs, bool Tr>
-struct traits<SparseDenseOuterProduct<Lhs,Rhs,Tr> >
-{
-  typedef Sparse StorageKind;
-  typedef typename scalar_product_traits<typename traits<Lhs>::Scalar,
-                                         typename traits<Rhs>::Scalar>::ReturnType Scalar;
-  typedef typename Lhs::Index Index;
-  typedef typename Lhs::Nested LhsNested;
-  typedef typename Rhs::Nested RhsNested;
-  typedef typename remove_all<LhsNested>::type _LhsNested;
-  typedef typename remove_all<RhsNested>::type _RhsNested;
-
-  enum {
-    LhsCoeffReadCost = traits<_LhsNested>::CoeffReadCost,
-    RhsCoeffReadCost = traits<_RhsNested>::CoeffReadCost,
-
-    RowsAtCompileTime    = Tr ? int(traits<Rhs>::RowsAtCompileTime)     : int(traits<Lhs>::RowsAtCompileTime),
-    ColsAtCompileTime    = Tr ? int(traits<Lhs>::ColsAtCompileTime)     : int(traits<Rhs>::ColsAtCompileTime),
-    MaxRowsAtCompileTime = Tr ? int(traits<Rhs>::MaxRowsAtCompileTime)  : int(traits<Lhs>::MaxRowsAtCompileTime),
-    MaxColsAtCompileTime = Tr ? int(traits<Lhs>::MaxColsAtCompileTime)  : int(traits<Rhs>::MaxColsAtCompileTime),
-
-    Flags = Tr ? RowMajorBit : 0,
-
-    CoeffReadCost = LhsCoeffReadCost + RhsCoeffReadCost + NumTraits<Scalar>::MulCost
-  };
-};
-
-} // end namespace internal
-
-template<typename Lhs, typename Rhs, bool Tr>
-class SparseDenseOuterProduct
- : public SparseMatrixBase<SparseDenseOuterProduct<Lhs,Rhs,Tr> >
-{
-  public:
-
-    typedef SparseMatrixBase<SparseDenseOuterProduct> Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(SparseDenseOuterProduct)
-    typedef internal::traits<SparseDenseOuterProduct> Traits;
-
-  private:
-
-    typedef typename Traits::LhsNested LhsNested;
-    typedef typename Traits::RhsNested RhsNested;
-    typedef typename Traits::_LhsNested _LhsNested;
-    typedef typename Traits::_RhsNested _RhsNested;
-
-  public:
-
-    class InnerIterator;
-
-    EIGEN_STRONG_INLINE SparseDenseOuterProduct(const Lhs& lhs, const Rhs& rhs)
-      : m_lhs(lhs), m_rhs(rhs)
-    {
-      EIGEN_STATIC_ASSERT(!Tr,YOU_MADE_A_PROGRAMMING_MISTAKE);
-    }
-
-    EIGEN_STRONG_INLINE SparseDenseOuterProduct(const Rhs& rhs, const Lhs& lhs)
-      : m_lhs(lhs), m_rhs(rhs)
-    {
-      EIGEN_STATIC_ASSERT(Tr,YOU_MADE_A_PROGRAMMING_MISTAKE);
-    }
-
-    EIGEN_STRONG_INLINE Index rows() const { return Tr ? m_rhs.rows() : m_lhs.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return Tr ? m_lhs.cols() : m_rhs.cols(); }
-
-    EIGEN_STRONG_INLINE const _LhsNested& lhs() const { return m_lhs; }
-    EIGEN_STRONG_INLINE const _RhsNested& rhs() const { return m_rhs; }
-
-  protected:
-    LhsNested m_lhs;
-    RhsNested m_rhs;
-};
-
-template<typename Lhs, typename Rhs, bool Transpose>
-class SparseDenseOuterProduct<Lhs,Rhs,Transpose>::InnerIterator : public _LhsNested::InnerIterator
-{
-    typedef typename _LhsNested::InnerIterator Base;
-    typedef typename SparseDenseOuterProduct::Index Index;
-  public:
-    EIGEN_STRONG_INLINE InnerIterator(const SparseDenseOuterProduct& prod, Index outer)
-      : Base(prod.lhs(), 0), m_outer(outer), m_factor(prod.rhs().coeff(outer))
-    {
-    }
-
-    inline Index outer() const { return m_outer; }
-    inline Index row() const { return Transpose ? Base::row() : m_outer; }
-    inline Index col() const { return Transpose ? m_outer : Base::row(); }
-
-    inline Scalar value() const { return Base::value() * m_factor; }
-
-  protected:
-    int m_outer;
-    Scalar m_factor;
-};
-
-namespace internal {
-template<typename Lhs, typename Rhs>
-struct traits<SparseTimeDenseProduct<Lhs,Rhs> >
- : traits<ProductBase<SparseTimeDenseProduct<Lhs,Rhs>, Lhs, Rhs> >
-{
-  typedef Dense StorageKind;
-  typedef MatrixXpr XprKind;
-};
+template <> struct product_promote_storage_type<Sparse,Dense, OuterProduct> { typedef Sparse ret; };
+template <> struct product_promote_storage_type<Dense,Sparse, OuterProduct> { typedef Sparse ret; };
 
 template<typename SparseLhsType, typename DenseRhsType, typename DenseResType,
+         typename AlphaType,
          int LhsStorageOrder = ((SparseLhsType::Flags&RowMajorBit)==RowMajorBit) ? RowMajor : ColMajor,
          bool ColPerCol = ((DenseRhsType::Flags&RowMajorBit)==0) || DenseRhsType::ColsAtCompileTime==1>
 struct sparse_time_dense_product_impl;
 
 template<typename SparseLhsType, typename DenseRhsType, typename DenseResType>
-struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, RowMajor, true>
+struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, typename DenseResType::Scalar, RowMajor, true>
 {
   typedef typename internal::remove_all<SparseLhsType>::type Lhs;
   typedef typename internal::remove_all<DenseRhsType>::type Rhs;
   typedef typename internal::remove_all<DenseResType>::type Res;
-  typedef typename Lhs::Index Index;
-  typedef typename Lhs::InnerIterator LhsInnerIterator;
+  typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
+  typedef evaluator<Lhs> LhsEval;
   static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
   {
+    LhsEval lhsEval(lhs);
+    
+    Index n = lhs.outerSize();
+#ifdef EIGEN_HAS_OPENMP
+    Eigen::initParallel();
+    Index threads = Eigen::nbThreads();
+#endif
+    
     for(Index c=0; c<rhs.cols(); ++c)
     {
-      int n = lhs.outerSize();
-      for(Index j=0; j<n; ++j)
+#ifdef EIGEN_HAS_OPENMP
+      // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
+      // It basically represents the minimal amount of work to be done to be worth it.
+      if(threads>1 && lhsEval.nonZerosEstimate() > 20000)
       {
-        typename Res::Scalar tmp(0);
-        for(LhsInnerIterator it(lhs,j); it ;++it)
-          tmp += it.value() * rhs.coeff(it.index(),c);
-        res.coeffRef(j,c) = alpha * tmp;
+        #pragma omp parallel for schedule(static) num_threads(threads)
+        for(Index i=0; i<n; ++i)
+          processRow(lhsEval,rhs,res,alpha,i,c);
+      }
+      else
+#endif
+      {
+        for(Index i=0; i<n; ++i)
+          processRow(lhsEval,rhs,res,alpha,i,c);
       }
     }
   }
+  
+  static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha, Index i, Index col)
+  {
+    typename Res::Scalar tmp(0);
+    for(LhsInnerIterator it(lhsEval,i); it ;++it)
+      tmp += it.value() * rhs.coeff(it.index(),col);
+    res.coeffRef(i,col) += alpha * tmp;
+  }
+  
 };
 
-template<typename SparseLhsType, typename DenseRhsType, typename DenseResType>
-struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, ColMajor, true>
+// FIXME: what is the purpose of the following specialization? Is it for the BlockedSparse format?
+template<typename T1, typename T2/*, int _Options, typename _StrideType*/>
+struct scalar_product_traits<T1, Ref<T2/*, _Options, _StrideType*/> >
+{
+  enum {
+    Defined = 1
+  };
+  typedef typename CwiseUnaryOp<scalar_multiple2_op<T1, typename T2::Scalar>, T2>::PlainObject ReturnType;
+};
+template<typename SparseLhsType, typename DenseRhsType, typename DenseResType, typename AlphaType>
+struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, AlphaType, ColMajor, true>
 {
   typedef typename internal::remove_all<SparseLhsType>::type Lhs;
   typedef typename internal::remove_all<DenseRhsType>::type Rhs;
   typedef typename internal::remove_all<DenseResType>::type Res;
-  typedef typename Lhs::InnerIterator LhsInnerIterator;
-  typedef typename Lhs::Index Index;
-  static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
+  typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
+  static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha)
   {
+    evaluator<Lhs> lhsEval(lhs);
     for(Index c=0; c<rhs.cols(); ++c)
     {
       for(Index j=0; j<lhs.outerSize(); ++j)
       {
-        typename Res::Scalar rhs_j = alpha * rhs.coeff(j,c);
-        for(LhsInnerIterator it(lhs,j); it ;++it)
+//        typename Res::Scalar rhs_j = alpha * rhs.coeff(j,c);
+        typename internal::scalar_product_traits<AlphaType, typename Rhs::Scalar>::ReturnType rhs_j(alpha * rhs.coeff(j,c));
+        for(LhsInnerIterator it(lhsEval,j); it ;++it)
           res.coeffRef(it.index(),c) += it.value() * rhs_j;
       }
     }
@@ -190,38 +104,38 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, C
 };
 
 template<typename SparseLhsType, typename DenseRhsType, typename DenseResType>
-struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, RowMajor, false>
+struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, typename DenseResType::Scalar, RowMajor, false>
 {
   typedef typename internal::remove_all<SparseLhsType>::type Lhs;
   typedef typename internal::remove_all<DenseRhsType>::type Rhs;
   typedef typename internal::remove_all<DenseResType>::type Res;
-  typedef typename Lhs::InnerIterator LhsInnerIterator;
-  typedef typename Lhs::Index Index;
+  typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
   static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
   {
+    evaluator<Lhs> lhsEval(lhs);
     for(Index j=0; j<lhs.outerSize(); ++j)
     {
       typename Res::RowXpr res_j(res.row(j));
-      for(LhsInnerIterator it(lhs,j); it ;++it)
+      for(LhsInnerIterator it(lhsEval,j); it ;++it)
         res_j += (alpha*it.value()) * rhs.row(it.index());
     }
   }
 };
 
 template<typename SparseLhsType, typename DenseRhsType, typename DenseResType>
-struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, ColMajor, false>
+struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, typename DenseResType::Scalar, ColMajor, false>
 {
   typedef typename internal::remove_all<SparseLhsType>::type Lhs;
   typedef typename internal::remove_all<DenseRhsType>::type Rhs;
   typedef typename internal::remove_all<DenseResType>::type Res;
-  typedef typename Lhs::InnerIterator LhsInnerIterator;
-  typedef typename Lhs::Index Index;
+  typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
   static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
   {
+    evaluator<Lhs> lhsEval(lhs);
     for(Index j=0; j<lhs.outerSize(); ++j)
     {
       typename Rhs::ConstRowXpr rhs_j(rhs.row(j));
-      for(LhsInnerIterator it(lhs,j); it ;++it)
+      for(LhsInnerIterator it(lhsEval,j); it ;++it)
         res.row(it.index()) += (alpha*it.value()) * rhs_j;
     }
   }
@@ -230,71 +144,174 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, C
 template<typename SparseLhsType, typename DenseRhsType, typename DenseResType,typename AlphaType>
 inline void sparse_time_dense_product(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha)
 {
-  sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType>::run(lhs, rhs, res, alpha);
+  sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, AlphaType>::run(lhs, rhs, res, alpha);
 }
 
 } // end namespace internal
 
-template<typename Lhs, typename Rhs>
-class SparseTimeDenseProduct
-  : public ProductBase<SparseTimeDenseProduct<Lhs,Rhs>, Lhs, Rhs>
-{
-  public:
-    EIGEN_PRODUCT_PUBLIC_INTERFACE(SparseTimeDenseProduct)
-
-    SparseTimeDenseProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
-    {}
-
-    template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
-    {
-      internal::sparse_time_dense_product(m_lhs, m_rhs, dest, alpha);
-    }
+namespace internal {
 
-  private:
-    SparseTimeDenseProduct& operator=(const SparseTimeDenseProduct&);
+template<typename Lhs, typename Rhs, int ProductType>
+struct generic_product_impl<Lhs, Rhs, SparseShape, DenseShape, ProductType>
+ : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,SparseShape,DenseShape,ProductType> >
+{
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  
+  template<typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  {
+    typedef typename nested_eval<Lhs,((Rhs::Flags&RowMajorBit)==0) ? 1 : Rhs::ColsAtCompileTime>::type LhsNested;
+    typedef typename nested_eval<Rhs,((Lhs::Flags&RowMajorBit)==0) ? 1 : Dynamic>::type RhsNested;
+    LhsNested lhsNested(lhs);
+    RhsNested rhsNested(rhs);
+    internal::sparse_time_dense_product(lhsNested, rhsNested, dst, alpha);
+  }
 };
 
+template<typename Lhs, typename Rhs, int ProductType>
+struct generic_product_impl<Lhs, Rhs, SparseTriangularShape, DenseShape, ProductType>
+  : generic_product_impl<Lhs, Rhs, SparseShape, DenseShape, ProductType>
+{};
 
-// dense = dense * sparse
-namespace internal {
-template<typename Lhs, typename Rhs>
-struct traits<DenseTimeSparseProduct<Lhs,Rhs> >
- : traits<ProductBase<DenseTimeSparseProduct<Lhs,Rhs>, Lhs, Rhs> >
+template<typename Lhs, typename Rhs, int ProductType>
+struct generic_product_impl<Lhs, Rhs, DenseShape, SparseShape, ProductType>
+  : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,SparseShape,ProductType> >
 {
-  typedef Dense StorageKind;
+  typedef typename Product<Lhs,Rhs>::Scalar Scalar;
+  
+  template<typename Dst>
+  static void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  {
+    typedef typename nested_eval<Lhs,((Rhs::Flags&RowMajorBit)==0) ? Dynamic : 1>::type LhsNested;
+    typedef typename nested_eval<Rhs,((Lhs::Flags&RowMajorBit)==RowMajorBit) ? 1 : Lhs::RowsAtCompileTime>::type RhsNested;
+    LhsNested lhsNested(lhs);
+    RhsNested rhsNested(rhs);
+    
+    // transpose everything
+    Transpose<Dst> dstT(dst);
+    internal::sparse_time_dense_product(rhsNested.transpose(), lhsNested.transpose(), dstT, alpha);
+  }
 };
-} // end namespace internal
 
-template<typename Lhs, typename Rhs>
-class DenseTimeSparseProduct
-  : public ProductBase<DenseTimeSparseProduct<Lhs,Rhs>, Lhs, Rhs>
+template<typename Lhs, typename Rhs, int ProductType>
+struct generic_product_impl<Lhs, Rhs, DenseShape, SparseTriangularShape, ProductType>
+  : generic_product_impl<Lhs, Rhs, DenseShape, SparseShape, ProductType>
+{};
+
+template<typename LhsT, typename RhsT, bool NeedToTranspose>
+struct sparse_dense_outer_product_evaluator
 {
+protected:
+  typedef typename conditional<NeedToTranspose,RhsT,LhsT>::type Lhs1;
+  typedef typename conditional<NeedToTranspose,LhsT,RhsT>::type ActualRhs;
+  typedef Product<LhsT,RhsT,DefaultProduct> ProdXprType;
+  
+  // if the actual left-hand side is a dense vector,
+  // then build a sparse-view so that we can seamlessly iterate over it.
+  typedef typename conditional<is_same<typename internal::traits<Lhs1>::StorageKind,Sparse>::value,
+            Lhs1, SparseView<Lhs1> >::type ActualLhs;
+  typedef typename conditional<is_same<typename internal::traits<Lhs1>::StorageKind,Sparse>::value,
+            Lhs1 const&, SparseView<Lhs1> >::type LhsArg;
+            
+  typedef evaluator<ActualLhs> LhsEval;
+  typedef evaluator<ActualRhs> RhsEval;
+  typedef typename evaluator<ActualLhs>::InnerIterator LhsIterator;
+  typedef typename ProdXprType::Scalar Scalar;
+  
+public:
+  enum {
+    Flags = NeedToTranspose ? RowMajorBit : 0,
+    CoeffReadCost = HugeCost
+  };
+  
+  class InnerIterator : public LhsIterator
+  {
   public:
-    EIGEN_PRODUCT_PUBLIC_INTERFACE(DenseTimeSparseProduct)
-
-    DenseTimeSparseProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
+    InnerIterator(const sparse_dense_outer_product_evaluator &xprEval, Index outer)
+      : LhsIterator(xprEval.m_lhsXprImpl, 0),
+        m_outer(outer),
+        m_empty(false),
+        m_factor(get(xprEval.m_rhsXprImpl, outer, typename internal::traits<ActualRhs>::StorageKind() ))
     {}
-
-    template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
+    
+    EIGEN_STRONG_INLINE Index outer() const { return m_outer; }
+    EIGEN_STRONG_INLINE Index row()   const { return NeedToTranspose ? m_outer : LhsIterator::index(); }
+    EIGEN_STRONG_INLINE Index col()   const { return NeedToTranspose ? LhsIterator::index() : m_outer; }
+
+    EIGEN_STRONG_INLINE Scalar value() const { return LhsIterator::value() * m_factor; }
+    EIGEN_STRONG_INLINE operator bool() const { return LhsIterator::operator bool() && (!m_empty); }
+    
+  protected:
+    Scalar get(const RhsEval &rhs, Index outer, Dense = Dense()) const
     {
-      Transpose<const _LhsNested> lhs_t(m_lhs);
-      Transpose<const _RhsNested> rhs_t(m_rhs);
-      Transpose<Dest> dest_t(dest);
-      internal::sparse_time_dense_product(rhs_t, lhs_t, dest_t, alpha);
+      return rhs.coeff(outer);
     }
+    
+    Scalar get(const RhsEval &rhs, Index outer, Sparse = Sparse())
+    {
+      typename RhsEval::InnerIterator it(rhs, outer);
+      if (it && it.index()==0 && it.value()!=Scalar(0))
+        return it.value();
+      m_empty = true;
+      return Scalar(0);
+    }
+    
+    Index m_outer;
+    bool m_empty;
+    Scalar m_factor;
+  };
+  
+  sparse_dense_outer_product_evaluator(const Lhs1 &lhs, const ActualRhs &rhs)
+     : m_lhs(lhs), m_lhsXprImpl(m_lhs), m_rhsXprImpl(rhs)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+  
+  // transpose case
+  sparse_dense_outer_product_evaluator(const ActualRhs &rhs, const Lhs1 &lhs)
+     : m_lhs(lhs), m_lhsXprImpl(m_lhs), m_rhsXprImpl(rhs)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+    
+protected:
+  const LhsArg m_lhs;
+  evaluator<ActualLhs> m_lhsXprImpl;
+  evaluator<ActualRhs> m_rhsXprImpl;
+};
 
-  private:
-    DenseTimeSparseProduct& operator=(const DenseTimeSparseProduct&);
+// sparse * dense outer product
+template<typename Lhs, typename Rhs>
+struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, OuterProduct, SparseShape, DenseShape>
+  : sparse_dense_outer_product_evaluator<Lhs,Rhs, Lhs::IsRowMajor>
+{
+  typedef sparse_dense_outer_product_evaluator<Lhs,Rhs, Lhs::IsRowMajor> Base;
+  
+  typedef Product<Lhs, Rhs> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+
+  explicit product_evaluator(const XprType& xpr)
+    : Base(xpr.lhs(), xpr.rhs())
+  {}
+  
 };
 
-// sparse * dense
-template<typename Derived>
-template<typename OtherDerived>
-inline const typename SparseDenseProductReturnType<Derived,OtherDerived>::Type
-SparseMatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
+template<typename Lhs, typename Rhs>
+struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, OuterProduct, DenseShape, SparseShape>
+  : sparse_dense_outer_product_evaluator<Lhs,Rhs, Rhs::IsRowMajor>
 {
-  return typename SparseDenseProductReturnType<Derived,OtherDerived>::Type(derived(), other.derived());
-}
+  typedef sparse_dense_outer_product_evaluator<Lhs,Rhs, Rhs::IsRowMajor> Base;
+  
+  typedef Product<Lhs, Rhs> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+
+  explicit product_evaluator(const XprType& xpr)
+    : Base(xpr.lhs(), xpr.rhs())
+  {}
+  
+};
+
+} // end namespace internal
 
 } // end namespace Eigen
 
diff --git a/nuparu/include/Eigen/src/SparseCore/SparseDiagonalProduct.h b/nuparu/include/Eigen/src/SparseCore/SparseDiagonalProduct.h
index 1bb590e6..e4af49e0 100644
--- a/nuparu/include/Eigen/src/SparseCore/SparseDiagonalProduct.h
+++ b/nuparu/include/Eigen/src/SparseCore/SparseDiagonalProduct.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -26,171 +26,109 @@ namespace Eigen {
 
 namespace internal {
 
-template<typename Lhs, typename Rhs>
-struct traits<SparseDiagonalProduct<Lhs, Rhs> >
-{
-  typedef typename remove_all<Lhs>::type _Lhs;
-  typedef typename remove_all<Rhs>::type _Rhs;
-  typedef typename _Lhs::Scalar Scalar;
-  typedef typename promote_index_type<typename traits<Lhs>::Index,
-                                         typename traits<Rhs>::Index>::type Index;
-  typedef Sparse StorageKind;
-  typedef MatrixXpr XprKind;
-  enum {
-    RowsAtCompileTime = _Lhs::RowsAtCompileTime,
-    ColsAtCompileTime = _Rhs::ColsAtCompileTime,
-
-    MaxRowsAtCompileTime = _Lhs::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = _Rhs::MaxColsAtCompileTime,
-
-    SparseFlags = is_diagonal<_Lhs>::ret ? int(_Rhs::Flags) : int(_Lhs::Flags),
-    Flags = (SparseFlags&RowMajorBit),
-    CoeffReadCost = Dynamic
-  };
+enum {
+  SDP_AsScalarProduct,
+  SDP_AsCwiseProduct
 };
+  
+template<typename SparseXprType, typename DiagonalCoeffType, int SDP_Tag>
+struct sparse_diagonal_product_evaluator;
 
-enum {SDP_IsDiagonal, SDP_IsSparseRowMajor, SDP_IsSparseColMajor};
-template<typename Lhs, typename Rhs, typename SparseDiagonalProductType, int RhsMode, int LhsMode>
-class sparse_diagonal_product_inner_iterator_selector;
-
-} // end namespace internal
-
-template<typename Lhs, typename Rhs>
-class SparseDiagonalProduct
-  : public SparseMatrixBase<SparseDiagonalProduct<Lhs,Rhs> >,
-    internal::no_assignment_operator
+template<typename Lhs, typename Rhs, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, ProductTag, DiagonalShape, SparseShape>
+  : public sparse_diagonal_product_evaluator<Rhs, typename Lhs::DiagonalVectorType, Rhs::Flags&RowMajorBit?SDP_AsScalarProduct:SDP_AsCwiseProduct>
 {
-    typedef typename Lhs::Nested LhsNested;
-    typedef typename Rhs::Nested RhsNested;
-
-    typedef typename internal::remove_all<LhsNested>::type _LhsNested;
-    typedef typename internal::remove_all<RhsNested>::type _RhsNested;
-
-    enum {
-      LhsMode = internal::is_diagonal<_LhsNested>::ret ? internal::SDP_IsDiagonal
-              : (_LhsNested::Flags&RowMajorBit) ? internal::SDP_IsSparseRowMajor : internal::SDP_IsSparseColMajor,
-      RhsMode = internal::is_diagonal<_RhsNested>::ret ? internal::SDP_IsDiagonal
-              : (_RhsNested::Flags&RowMajorBit) ? internal::SDP_IsSparseRowMajor : internal::SDP_IsSparseColMajor
-    };
-
-  public:
-
-    EIGEN_SPARSE_PUBLIC_INTERFACE(SparseDiagonalProduct)
-
-    typedef internal::sparse_diagonal_product_inner_iterator_selector
-                      <_LhsNested,_RhsNested,SparseDiagonalProduct,LhsMode,RhsMode> InnerIterator;
-    
-    // We do not want ReverseInnerIterator for diagonal-sparse products,
-    // but this dummy declaration is needed to make diag * sparse * diag compile.
-    class ReverseInnerIterator;
-
-    EIGEN_STRONG_INLINE SparseDiagonalProduct(const Lhs& lhs, const Rhs& rhs)
-      : m_lhs(lhs), m_rhs(rhs)
-    {
-      eigen_assert(lhs.cols() == rhs.rows() && "invalid sparse matrix * diagonal matrix product");
-    }
-
-    EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
-
-    EIGEN_STRONG_INLINE const _LhsNested& lhs() const { return m_lhs; }
-    EIGEN_STRONG_INLINE const _RhsNested& rhs() const { return m_rhs; }
-
-  protected:
-    LhsNested m_lhs;
-    RhsNested m_rhs;
+  typedef Product<Lhs, Rhs, DefaultProduct> XprType;
+  enum { CoeffReadCost = HugeCost, Flags = Rhs::Flags&RowMajorBit, Alignment = 0 }; // FIXME CoeffReadCost & Flags
+  
+  typedef sparse_diagonal_product_evaluator<Rhs, typename Lhs::DiagonalVectorType, Rhs::Flags&RowMajorBit?SDP_AsScalarProduct:SDP_AsCwiseProduct> Base;
+  explicit product_evaluator(const XprType& xpr) : Base(xpr.rhs(), xpr.lhs().diagonal()) {}
 };
 
-namespace internal {
-
-template<typename Lhs, typename Rhs, typename SparseDiagonalProductType>
-class sparse_diagonal_product_inner_iterator_selector
-<Lhs,Rhs,SparseDiagonalProductType,SDP_IsDiagonal,SDP_IsSparseRowMajor>
-  : public CwiseUnaryOp<scalar_multiple_op<typename Lhs::Scalar>,const Rhs>::InnerIterator
+template<typename Lhs, typename Rhs, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, ProductTag, SparseShape, DiagonalShape>
+  : public sparse_diagonal_product_evaluator<Lhs, Transpose<const typename Rhs::DiagonalVectorType>, Lhs::Flags&RowMajorBit?SDP_AsCwiseProduct:SDP_AsScalarProduct>
 {
-    typedef typename CwiseUnaryOp<scalar_multiple_op<typename Lhs::Scalar>,const Rhs>::InnerIterator Base;
-    typedef typename Lhs::Index Index;
-  public:
-    inline sparse_diagonal_product_inner_iterator_selector(
-              const SparseDiagonalProductType& expr, Index outer)
-      : Base(expr.rhs()*(expr.lhs().diagonal().coeff(outer)), outer)
-    {}
+  typedef Product<Lhs, Rhs, DefaultProduct> XprType;
+  enum { CoeffReadCost = HugeCost, Flags = Lhs::Flags&RowMajorBit, Alignment = 0 }; // FIXME CoeffReadCost & Flags
+  
+  typedef sparse_diagonal_product_evaluator<Lhs, Transpose<const typename Rhs::DiagonalVectorType>, Lhs::Flags&RowMajorBit?SDP_AsCwiseProduct:SDP_AsScalarProduct> Base;
+  explicit product_evaluator(const XprType& xpr) : Base(xpr.lhs(), xpr.rhs().diagonal().transpose()) {}
 };
 
-template<typename Lhs, typename Rhs, typename SparseDiagonalProductType>
-class sparse_diagonal_product_inner_iterator_selector
-<Lhs,Rhs,SparseDiagonalProductType,SDP_IsDiagonal,SDP_IsSparseColMajor>
-  : public CwiseBinaryOp<
-      scalar_product_op<typename Lhs::Scalar>,
-      const typename Rhs::ConstInnerVectorReturnType,
-      const typename Lhs::DiagonalVectorType>::InnerIterator
+template<typename SparseXprType, typename DiagonalCoeffType>
+struct sparse_diagonal_product_evaluator<SparseXprType, DiagonalCoeffType, SDP_AsScalarProduct>
 {
-    typedef typename CwiseBinaryOp<
-      scalar_product_op<typename Lhs::Scalar>,
-      const typename Rhs::ConstInnerVectorReturnType,
-      const typename Lhs::DiagonalVectorType>::InnerIterator Base;
-    typedef typename Lhs::Index Index;
-    Index m_outer;
+protected:
+  typedef typename evaluator<SparseXprType>::InnerIterator SparseXprInnerIterator;
+  typedef typename SparseXprType::Scalar Scalar;
+  
+public:
+  class InnerIterator : public SparseXprInnerIterator
+  {
   public:
-    inline sparse_diagonal_product_inner_iterator_selector(
-              const SparseDiagonalProductType& expr, Index outer)
-      : Base(expr.rhs().innerVector(outer) .cwiseProduct(expr.lhs().diagonal()), 0), m_outer(outer)
+    InnerIterator(const sparse_diagonal_product_evaluator &xprEval, Index outer)
+      : SparseXprInnerIterator(xprEval.m_sparseXprImpl, outer),
+        m_coeff(xprEval.m_diagCoeffImpl.coeff(outer))
     {}
     
-    inline Index outer() const { return m_outer; }
-    inline Index col() const { return m_outer; }
+    EIGEN_STRONG_INLINE Scalar value() const { return m_coeff * SparseXprInnerIterator::value(); }
+  protected:
+    typename DiagonalCoeffType::Scalar m_coeff;
+  };
+  
+  sparse_diagonal_product_evaluator(const SparseXprType &sparseXpr, const DiagonalCoeffType &diagCoeff)
+    : m_sparseXprImpl(sparseXpr), m_diagCoeffImpl(diagCoeff)
+  {}
+    
+protected:
+  evaluator<SparseXprType> m_sparseXprImpl;
+  evaluator<DiagonalCoeffType> m_diagCoeffImpl;
 };
 
-template<typename Lhs, typename Rhs, typename SparseDiagonalProductType>
-class sparse_diagonal_product_inner_iterator_selector
-<Lhs,Rhs,SparseDiagonalProductType,SDP_IsSparseColMajor,SDP_IsDiagonal>
-  : public CwiseUnaryOp<scalar_multiple_op<typename Rhs::Scalar>,const Lhs>::InnerIterator
-{
-    typedef typename CwiseUnaryOp<scalar_multiple_op<typename Rhs::Scalar>,const Lhs>::InnerIterator Base;
-    typedef typename Lhs::Index Index;
-  public:
-    inline sparse_diagonal_product_inner_iterator_selector(
-              const SparseDiagonalProductType& expr, Index outer)
-      : Base(expr.lhs()*expr.rhs().diagonal().coeff(outer), outer)
-    {}
-};
 
-template<typename Lhs, typename Rhs, typename SparseDiagonalProductType>
-class sparse_diagonal_product_inner_iterator_selector
-<Lhs,Rhs,SparseDiagonalProductType,SDP_IsSparseRowMajor,SDP_IsDiagonal>
-  : public CwiseBinaryOp<
-      scalar_product_op<typename Rhs::Scalar>,
-      const typename Lhs::ConstInnerVectorReturnType,
-      const Transpose<const typename Rhs::DiagonalVectorType> >::InnerIterator
+template<typename SparseXprType, typename DiagCoeffType>
+struct sparse_diagonal_product_evaluator<SparseXprType, DiagCoeffType, SDP_AsCwiseProduct>
 {
-    typedef typename CwiseBinaryOp<
-      scalar_product_op<typename Rhs::Scalar>,
-      const typename Lhs::ConstInnerVectorReturnType,
-      const Transpose<const typename Rhs::DiagonalVectorType> >::InnerIterator Base;
-    typedef typename Lhs::Index Index;
-    Index m_outer;
+  typedef typename SparseXprType::Scalar Scalar;
+  typedef typename SparseXprType::StorageIndex StorageIndex;
+  
+  typedef typename nested_eval<DiagCoeffType,SparseXprType::IsRowMajor ? SparseXprType::RowsAtCompileTime
+                                                                       : SparseXprType::ColsAtCompileTime>::type DiagCoeffNested;
+  
+  class InnerIterator
+  {
+    typedef typename evaluator<SparseXprType>::InnerIterator SparseXprIter;
   public:
-    inline sparse_diagonal_product_inner_iterator_selector(
-              const SparseDiagonalProductType& expr, Index outer)
-      : Base(expr.lhs().innerVector(outer) .cwiseProduct(expr.rhs().diagonal().transpose()), 0), m_outer(outer)
+    InnerIterator(const sparse_diagonal_product_evaluator &xprEval, Index outer)
+      : m_sparseIter(xprEval.m_sparseXprEval, outer), m_diagCoeffNested(xprEval.m_diagCoeffNested)
     {}
     
-    inline Index outer() const { return m_outer; }
-    inline Index row() const { return m_outer; }
+    inline Scalar value() const { return m_sparseIter.value() * m_diagCoeffNested.coeff(index()); }
+    inline StorageIndex index() const  { return m_sparseIter.index(); }
+    inline Index outer() const  { return m_sparseIter.outer(); }
+    inline Index col() const    { return SparseXprType::IsRowMajor ? m_sparseIter.index() : m_sparseIter.outer(); }
+    inline Index row() const    { return SparseXprType::IsRowMajor ? m_sparseIter.outer() : m_sparseIter.index(); }
+    
+    EIGEN_STRONG_INLINE InnerIterator& operator++() { ++m_sparseIter; return *this; }
+    inline operator bool() const  { return m_sparseIter; }
+    
+  protected:
+    SparseXprIter m_sparseIter;
+    DiagCoeffNested m_diagCoeffNested;
+  };
+  
+  sparse_diagonal_product_evaluator(const SparseXprType &sparseXpr, const DiagCoeffType &diagCoeff)
+    : m_sparseXprEval(sparseXpr), m_diagCoeffNested(diagCoeff)
+  {}
+    
+protected:
+  evaluator<SparseXprType> m_sparseXprEval;
+  DiagCoeffNested m_diagCoeffNested;
 };
 
 } // end namespace internal
 
-// SparseMatrixBase functions
-
-template<typename Derived>
-template<typename OtherDerived>
-const SparseDiagonalProduct<Derived,OtherDerived>
-SparseMatrixBase<Derived>::operator*(const DiagonalBase<OtherDerived> &other) const
-{
-  return SparseDiagonalProduct<Derived,OtherDerived>(this->derived(), other.derived());
-}
-
 } // end namespace Eigen
 
 #endif // EIGEN_SPARSE_DIAGONAL_PRODUCT_H
diff --git a/nuparu/include/Eigen/src/SparseCore/SparseDot.h b/nuparu/include/Eigen/src/SparseCore/SparseDot.h
index db39c9ae..38bc4aa9 100644
--- a/nuparu/include/Eigen/src/SparseCore/SparseDot.h
+++ b/nuparu/include/Eigen/src/SparseCore/SparseDot.h
@@ -26,7 +26,8 @@ SparseMatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
   eigen_assert(size() == other.size());
   eigen_assert(other.size()>0 && "you are using a non initialized vector");
 
-  typename Derived::InnerIterator i(derived(),0);
+  internal::evaluator<Derived> thisEval(derived());
+  typename internal::evaluator<Derived>::InnerIterator i(thisEval, 0);
   Scalar res(0);
   while (i)
   {
@@ -49,16 +50,12 @@ SparseMatrixBase<Derived>::dot(const SparseMatrixBase<OtherDerived>& other) cons
 
   eigen_assert(size() == other.size());
 
-  typedef typename Derived::Nested  Nested;
-  typedef typename OtherDerived::Nested  OtherNested;
-  typedef typename internal::remove_all<Nested>::type  NestedCleaned;
-  typedef typename internal::remove_all<OtherNested>::type  OtherNestedCleaned;
+  internal::evaluator<Derived> thisEval(derived());
+  typename internal::evaluator<Derived>::InnerIterator i(thisEval, 0);
+  
+  internal::evaluator<OtherDerived>  otherEval(other.derived());
+  typename internal::evaluator<OtherDerived>::InnerIterator j(otherEval, 0);
 
-  Nested nthis(derived());
-  OtherNested nother(other.derived());
-
-  typename NestedCleaned::InnerIterator i(nthis,0);
-  typename OtherNestedCleaned::InnerIterator j(nother,0);
   Scalar res(0);
   while (i && j)
   {
diff --git a/nuparu/include/Eigen/src/SparseCore/SparseFuzzy.h b/nuparu/include/Eigen/src/SparseCore/SparseFuzzy.h
index 45f36e9e..7d47eb94 100644
--- a/nuparu/include/Eigen/src/SparseCore/SparseFuzzy.h
+++ b/nuparu/include/Eigen/src/SparseCore/SparseFuzzy.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,17 +10,20 @@
 #ifndef EIGEN_SPARSE_FUZZY_H
 #define EIGEN_SPARSE_FUZZY_H
 
-// template<typename Derived>
-// template<typename OtherDerived>
-// bool SparseMatrixBase<Derived>::isApprox(
-//   const OtherDerived& other,
-//   typename NumTraits<Scalar>::Real prec
-// ) const
-// {
-//   const typename internal::nested<Derived,2>::type nested(derived());
-//   const typename internal::nested<OtherDerived,2>::type otherNested(other.derived());
-//   return    (nested - otherNested).cwise().abs2().sum()
-//          <= prec * prec * (std::min)(nested.cwise().abs2().sum(), otherNested.cwise().abs2().sum());
-// }
+namespace Eigen {
+// Fuzzy equality of two sparse expressions: true when ||a-b||^2 <= prec^2 * min(||a||^2, ||b||^2).
+template<typename Derived>
+template<typename OtherDerived>
+bool SparseMatrixBase<Derived>::isApprox(const SparseMatrixBase<OtherDerived>& other, const RealScalar &prec) const
+{
+  const typename internal::nested_eval<Derived,2,PlainObject>::type actualA(derived()); // NOTE(review): nested_eval presumably chooses between referencing and evaluating *this - confirm
+  typename internal::conditional<bool(IsRowMajor)==bool(OtherDerived::IsRowMajor), // do both operands share the same storage order?
+    const typename internal::nested_eval<OtherDerived,2,PlainObject>::type, // yes: nest 'other' through nested_eval
+    const PlainObject>::type actualB(other.derived()); // no: construct a PlainObject (this expression's plain type) from 'other'
+  // Both sides of the comparison are squared: squaredNorm() on the left, prec*prec on the right.
+  return (actualA - actualB).squaredNorm() <= prec * prec * numext::mini(actualA.squaredNorm(), actualB.squaredNorm());
+}
+
+} // end namespace Eigen
 
 #endif // EIGEN_SPARSE_FUZZY_H
diff --git a/nuparu/include/Eigen/src/SparseCore/SparseMap.h b/nuparu/include/Eigen/src/SparseCore/SparseMap.h
new file mode 100644
index 00000000..36c09ab0
--- /dev/null
+++ b/nuparu/include/Eigen/src/SparseCore/SparseMap.h
@@ -0,0 +1,254 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_MAP_H
+#define EIGEN_SPARSE_MAP_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct traits<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+  : public traits<SparseMatrix<MatScalar,MatOptions,MatIndex> > // traits of a mutable sparse Map: inherited from the mapped SparseMatrix
+{
+  typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType; // the sparse matrix type being mapped
+  typedef traits<PlainObjectType> TraitsBase;
+  enum {
+    Flags = TraitsBase::Flags & (~NestByRefBit) // same flags as the mapped matrix, with NestByRefBit cleared
+  };
+};
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct traits<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+  : public traits<SparseMatrix<MatScalar,MatOptions,MatIndex> > // traits of a read-only sparse Map: inherited from the mapped SparseMatrix
+{
+  typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType; // the (non-const) sparse matrix type being mapped
+  typedef traits<PlainObjectType> TraitsBase;
+  enum {
+    Flags = TraitsBase::Flags & (~ (NestByRefBit | LvalueBit)) // additionally clears LvalueBit, unlike the non-const Map traits
+  };
+};
+
+} // end namespace internal
+  
+template<typename Derived,
+         int Level = internal::accessors_level<Derived>::has_write_access ? WriteAccessors : ReadOnlyAccessors // default chosen from Derived's write access
+> class SparseMapBase; // primary template, declaration only; specialized per access level
+
+template<typename Derived>
+class SparseMapBase<Derived,ReadOnlyAccessors>
+  : public SparseCompressedBase<Derived> // read-only view over outer-index / inner-index / value arrays passed in as raw pointers
+{
+  public:
+    typedef SparseCompressedBase<Derived> Base;
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::StorageIndex StorageIndex;
+    enum { IsRowMajor = Base::IsRowMajor };
+    using Base::operator=;
+  protected:
+    // Pointer types are non-const only when Derived is an lvalue expression (is_lvalue<Derived>).
+    typedef typename internal::conditional<
+                         bool(internal::is_lvalue<Derived>::value),
+                         Scalar *, const Scalar *>::type ScalarPointer;
+    typedef typename internal::conditional<
+                         bool(internal::is_lvalue<Derived>::value),
+                         StorageIndex *, const StorageIndex *>::type IndexPointer;
+
+    Index   m_outerSize;                  // rows if row-major, cols otherwise (see constructors)
+    Index   m_innerSize;                  // cols if row-major, rows otherwise (see constructors)
+    Array<StorageIndex,2,1>  m_zero_nnz;  // holds {0, nnz}; doubles as the outer-index array in the vector constructor
+    IndexPointer  m_outerIndex;           // start position of each inner vector
+    IndexPointer  m_innerIndices;         // inner index of each stored coefficient
+    ScalarPointer m_values;               // value of each stored coefficient
+    IndexPointer  m_innerNonZeros;        // per-inner-vector counts; null means compressed (see isCompressed())
+
+  public:
+    // Size accessors; rows()/cols() map the inner/outer sizes according to IsRowMajor.
+    inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; }
+    inline Index cols() const { return IsRowMajor ? m_innerSize : m_outerSize; }
+    inline Index innerSize() const { return m_innerSize; }
+    inline Index outerSize() const { return m_outerSize; }
+    inline Index nonZeros() const { return m_zero_nnz[1]; } // the nnz value given at construction
+    
+    bool isCompressed() const { return m_innerNonZeros==0; } // compressed mode == no inner-non-zeros array
+
+    //----------------------------------------
+    // direct access interface (const pointers to the mapped arrays)
+    inline const Scalar* valuePtr() const { return m_values; }
+    inline const StorageIndex* innerIndexPtr() const { return m_innerIndices; }
+    inline const StorageIndex* outerIndexPtr() const { return m_outerIndex; }
+    inline const StorageIndex* innerNonZeroPtr() const { return m_innerNonZeros; }
+    //----------------------------------------
+    // Returns the coefficient at (row, col), or Scalar(0) when it is not stored.
+    inline Scalar coeff(Index row, Index col) const
+    {
+      const Index outer = IsRowMajor ? row : col;
+      const Index inner = IsRowMajor ? col : row;
+
+      Index start = m_outerIndex[outer]; // [start, end) is the range of entries of inner vector 'outer'
+      Index end = isCompressed() ? m_outerIndex[outer+1] : start + m_innerNonZeros[outer];
+      if (start==end)
+        return Scalar(0); // empty inner vector
+      else if (end>0 && inner==m_innerIndices[end-1])
+        return m_values[end-1];
+      // ^^  optimization: first check whether it is the last coefficient
+      // (very common in high level algorithms)
+      // The last entry was tested above, so the binary search only covers [start, end-1).
+      const StorageIndex* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end-1],inner);
+      const Index id = r-&m_innerIndices[0]; // position of the candidate in the index/value arrays
+      return ((*r==inner) && (id<end)) ? m_values[id] : Scalar(0); // r is at most &m_innerIndices[end-1] here
+    }
+    // Matrix constructor: stores the given pointers; innerNonZerosPtr defaults to 0 (compressed, see isCompressed()).
+    inline SparseMapBase(Index rows, Index cols, Index nnz, IndexPointer outerIndexPtr, IndexPointer innerIndexPtr,
+                              ScalarPointer valuePtr, IndexPointer innerNonZerosPtr = 0)
+      : m_outerSize(IsRowMajor?rows:cols), m_innerSize(IsRowMajor?cols:rows), m_zero_nnz(0,internal::convert_index<StorageIndex>(nnz)), m_outerIndex(outerIndexPtr),
+        m_innerIndices(innerIndexPtr), m_values(valuePtr), m_innerNonZeros(innerNonZerosPtr)
+    {}
+
+    // for vectors: a single inner vector whose outer-index array {0, nnz} is the m_zero_nnz member itself
+    inline SparseMapBase(Index size, Index nnz, IndexPointer innerIndexPtr, ScalarPointer valuePtr)
+      : m_outerSize(1), m_innerSize(size), m_zero_nnz(0,internal::convert_index<StorageIndex>(nnz)), m_outerIndex(m_zero_nnz.data()), // NOTE(review): m_outerIndex points into this object; a member-wise copy would still point at the source object - confirm copies are safe
+        m_innerIndices(innerIndexPtr), m_values(valuePtr), m_innerNonZeros(0)
+    {}
+
+    /** Empty destructor: nothing is released here */
+    inline ~SparseMapBase() {}
+
+  protected:
+    inline SparseMapBase() {} // leaves all members uninitialized
+};
+
+template<typename Derived>
+class SparseMapBase<Derived,WriteAccessors>
+  : public SparseMapBase<Derived,ReadOnlyAccessors> // adds non-const pointer accessors and coeffRef() on top of the read-only view
+{
+    typedef MapBase<Derived, ReadOnlyAccessors> ReadOnlyMapBase;
+    
+  public:
+    typedef SparseMapBase<Derived, ReadOnlyAccessors> Base;
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::StorageIndex StorageIndex;
+    enum { IsRowMajor = Base::IsRowMajor };
+    
+    using Base::operator=;
+
+  public:
+    
+    //----------------------------------------
+    // direct access interface (the const overloads of the base stay visible through the using-declarations)
+    using Base::valuePtr;
+    using Base::innerIndexPtr;
+    using Base::outerIndexPtr;
+    using Base::innerNonZeroPtr;
+    inline Scalar* valuePtr()       { return Base::m_values; }
+    inline StorageIndex* innerIndexPtr()   { return Base::m_innerIndices; }
+    inline StorageIndex* outerIndexPtr()   { return Base::m_outerIndex; }
+    inline StorageIndex* innerNonZeroPtr() { return Base::m_innerNonZeros; }
+    //----------------------------------------
+    // Returns a writable reference to the coefficient stored at (row, col); it must already be stored (asserted below), nothing is inserted.
+    inline Scalar& coeffRef(Index row, Index col)
+    {
+      const Index outer = IsRowMajor ? row : col;
+      const Index inner = IsRowMajor ? col : row;
+
+      Index start = Base::m_outerIndex[outer]; // [start, end) is the range of entries of inner vector 'outer'
+      Index end = Base::isCompressed() ? Base::m_outerIndex[outer+1] : start + Base::m_innerNonZeros[outer];
+      eigen_assert(end>=start && "you probably called coeffRef on a non finalized matrix");
+      eigen_assert(end>start && "coeffRef cannot be called on a zero coefficient");
+      StorageIndex* r = std::lower_bound(&Base::m_innerIndices[start],&Base::m_innerIndices[end],inner); // the inner indices are StorageIndex, so the iterator must be StorageIndex* (Index* only compiles when both types coincide)
+      const Index id = r - &Base::m_innerIndices[0]; // position of the match in the index/value arrays
+      eigen_assert((id<end) && (*r==inner) && "coeffRef cannot be called on a zero coefficient"); // range check first: r may equal the end of the searched range
+      return const_cast<Scalar*>(Base::m_values)[id];
+    }
+    // Matrix constructor: forwards everything to the read-only base.
+    inline SparseMapBase(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, StorageIndex* innerIndexPtr,
+                              Scalar* valuePtr, StorageIndex* innerNonZerosPtr = 0)
+      : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr)
+    {}
+
+    // for vectors: forwards to the vector constructor of the read-only base
+    inline SparseMapBase(Index size, Index nnz, StorageIndex* innerIndexPtr, Scalar* valuePtr)
+      : Base(size, nnz, innerIndexPtr, valuePtr)
+    {}
+
+    /** Empty destructor: nothing is released here */
+    inline ~SparseMapBase() {}
+
+  protected:
+    inline SparseMapBase() {}
+};
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+class Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType>
+  : public SparseMapBase<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > // Map specialization for a non-const SparseMatrix
+{
+  public:
+    typedef SparseMapBase<Map> Base;
+    EIGEN_SPARSE_PUBLIC_INTERFACE(Map)
+    enum { IsRowMajor = Base::IsRowMajor };
+
+  public:
+    // Builds a rows x cols view with nnz stored entries over the given arrays; all arguments are forwarded to the base.
+    inline Map(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr,
+               StorageIndex* innerIndexPtr, Scalar* valuePtr, StorageIndex* innerNonZerosPtr = 0)
+      : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr)
+    {}
+
+    /** Empty destructor: the arrays passed to the constructor are not released here */
+    inline ~Map() {}
+};
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+class Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType>
+  : public SparseMapBase<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > // Map specialization for a const SparseMatrix
+{
+  public:
+    typedef SparseMapBase<Map> Base;
+    EIGEN_SPARSE_PUBLIC_INTERFACE(Map)
+    enum { IsRowMajor = Base::IsRowMajor };
+
+  public:
+    // Builds a rows x cols view with nnz stored entries over the given const arrays; all arguments are forwarded to the base.
+    inline Map(Index rows, Index cols, Index nnz, const StorageIndex* outerIndexPtr,
+               const StorageIndex* innerIndexPtr, const Scalar* valuePtr, const StorageIndex* innerNonZerosPtr = 0)
+      : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr)
+    {}
+
+    /** Empty destructor: the arrays passed to the constructor are not released here */
+    inline ~Map() {}
+};
+
+namespace internal {
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct evaluator<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+  : evaluator<SparseCompressedBase<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > // evaluator of a mutable sparse Map: inherits the SparseCompressedBase evaluator
+{
+  typedef evaluator<SparseCompressedBase<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base;
+  typedef Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType;  
+  evaluator() : Base() {} // default construction, delegated to the base evaluator
+  explicit evaluator(const XprType &mat) : Base(mat) {} // binds the evaluator to 'mat' through the base evaluator
+};
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct evaluator<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+  : evaluator<SparseCompressedBase<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > // evaluator of a read-only sparse Map: inherits the SparseCompressedBase evaluator
+{
+  typedef evaluator<SparseCompressedBase<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base;
+  typedef Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType;  
+  evaluator() : Base() {} // default construction, delegated to the base evaluator
+  explicit evaluator(const XprType &mat) : Base(mat) {} // binds the evaluator to 'mat' through the base evaluator
+};
+
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSE_MAP_H
diff --git a/nuparu/include/Eigen/src/SparseCore/SparseMatrix.h b/nuparu/include/Eigen/src/SparseCore/SparseMatrix.h
index adceafe1..91bada40 100644
--- a/nuparu/include/Eigen/src/SparseCore/SparseMatrix.h
+++ b/nuparu/include/Eigen/src/SparseCore/SparseMatrix.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -43,7 +43,7 @@ template<typename _Scalar, int _Options, typename _Index>
 struct traits<SparseMatrix<_Scalar, _Options, _Index> >
 {
   typedef _Scalar Scalar;
-  typedef _Index Index;
+  typedef _Index StorageIndex;
   typedef Sparse StorageKind;
   typedef MatrixXpr XprKind;
   enum {
@@ -51,22 +51,21 @@ struct traits<SparseMatrix<_Scalar, _Options, _Index> >
     ColsAtCompileTime = Dynamic,
     MaxRowsAtCompileTime = Dynamic,
     MaxColsAtCompileTime = Dynamic,
-    Flags = _Options | NestByRefBit | LvalueBit,
-    CoeffReadCost = NumTraits<Scalar>::ReadCost,
+    Flags = _Options | NestByRefBit | LvalueBit | CompressedAccessBit,
     SupportedAccessPatterns = InnerRandomAccessPattern
   };
 };
 
 template<typename _Scalar, int _Options, typename _Index, int DiagIndex>
-struct traits<Diagonal<const SparseMatrix<_Scalar, _Options, _Index>, DiagIndex> >
+struct traits<Diagonal<SparseMatrix<_Scalar, _Options, _Index>, DiagIndex> >
 {
   typedef SparseMatrix<_Scalar, _Options, _Index> MatrixType;
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
   typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
 
   typedef _Scalar Scalar;
   typedef Dense StorageKind;
-  typedef _Index Index;
+  typedef _Index StorageIndex;
   typedef MatrixXpr XprKind;
 
   enum {
@@ -74,8 +73,16 @@ struct traits<Diagonal<const SparseMatrix<_Scalar, _Options, _Index>, DiagIndex>
     ColsAtCompileTime = 1,
     MaxRowsAtCompileTime = Dynamic,
     MaxColsAtCompileTime = 1,
-    Flags = 0,
-    CoeffReadCost = _MatrixTypeNested::CoeffReadCost*10
+    Flags = LvalueBit
+  };
+};
+
+template<typename _Scalar, int _Options, typename _Index, int DiagIndex>
+struct traits<Diagonal<const SparseMatrix<_Scalar, _Options, _Index>, DiagIndex> >
+ : public traits<Diagonal<SparseMatrix<_Scalar, _Options, _Index>, DiagIndex> >
+{
+  enum {
+    Flags = 0
   };
 };
 
@@ -83,38 +90,43 @@ struct traits<Diagonal<const SparseMatrix<_Scalar, _Options, _Index>, DiagIndex>
 
 template<typename _Scalar, int _Options, typename _Index>
 class SparseMatrix
-  : public SparseMatrixBase<SparseMatrix<_Scalar, _Options, _Index> >
+  : public SparseCompressedBase<SparseMatrix<_Scalar, _Options, _Index> >
 {
+    typedef SparseCompressedBase<SparseMatrix> Base;
+    using Base::convert_index;
   public:
+    using Base::isCompressed;
+    using Base::nonZeros;
     EIGEN_SPARSE_PUBLIC_INTERFACE(SparseMatrix)
-    EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, +=)
-    EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, -=)
+    using Base::operator+=;
+    using Base::operator-=;
 
     typedef MappedSparseMatrix<Scalar,Flags> Map;
+    typedef Diagonal<SparseMatrix> DiagonalReturnType;
+    typedef Diagonal<const SparseMatrix> ConstDiagonalReturnType;
+    typedef typename Base::InnerIterator InnerIterator;
+    typedef typename Base::ReverseInnerIterator ReverseInnerIterator;
+    
+
     using Base::IsRowMajor;
-    typedef internal::CompressedStorage<Scalar,Index> Storage;
+    typedef internal::CompressedStorage<Scalar,StorageIndex> Storage;
     enum {
       Options = _Options
     };
 
+    typedef typename Base::IndexVector IndexVector;
+    typedef typename Base::ScalarVector ScalarVector;
   protected:
-
     typedef SparseMatrix<Scalar,(Flags&~RowMajorBit)|(IsRowMajor?RowMajorBit:0)> TransposedSparseMatrix;
 
     Index m_outerSize;
     Index m_innerSize;
-    Index* m_outerIndex;
-    Index* m_innerNonZeros;     // optional, if null then the data is compressed
+    StorageIndex* m_outerIndex;
+    StorageIndex* m_innerNonZeros;     // optional, if null then the data is compressed
     Storage m_data;
-    
-    Eigen::Map<Matrix<Index,Dynamic,1> > innerNonZeros() { return Eigen::Map<Matrix<Index,Dynamic,1> >(m_innerNonZeros, m_innerNonZeros?m_outerSize:0); }
-    const  Eigen::Map<const Matrix<Index,Dynamic,1> > innerNonZeros() const { return Eigen::Map<const Matrix<Index,Dynamic,1> >(m_innerNonZeros, m_innerNonZeros?m_outerSize:0); }
 
   public:
     
-    /** \returns whether \c *this is in compressed form. */
-    inline bool isCompressed() const { return m_innerNonZeros==0; }
-
     /** \returns the number of rows of the matrix */
     inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; }
     /** \returns the number of columns of the matrix */
@@ -137,29 +149,29 @@ class SparseMatrix
     /** \returns a const pointer to the array of inner indices.
       * This function is aimed at interoperability with other libraries.
       * \sa valuePtr(), outerIndexPtr() */
-    inline const Index* innerIndexPtr() const { return &m_data.index(0); }
+    inline const StorageIndex* innerIndexPtr() const { return &m_data.index(0); }
     /** \returns a non-const pointer to the array of inner indices.
       * This function is aimed at interoperability with other libraries.
       * \sa valuePtr(), outerIndexPtr() */
-    inline Index* innerIndexPtr() { return &m_data.index(0); }
+    inline StorageIndex* innerIndexPtr() { return &m_data.index(0); }
 
     /** \returns a const pointer to the array of the starting positions of the inner vectors.
       * This function is aimed at interoperability with other libraries.
       * \sa valuePtr(), innerIndexPtr() */
-    inline const Index* outerIndexPtr() const { return m_outerIndex; }
+    inline const StorageIndex* outerIndexPtr() const { return m_outerIndex; }
     /** \returns a non-const pointer to the array of the starting positions of the inner vectors.
       * This function is aimed at interoperability with other libraries.
       * \sa valuePtr(), innerIndexPtr() */
-    inline Index* outerIndexPtr() { return m_outerIndex; }
+    inline StorageIndex* outerIndexPtr() { return m_outerIndex; }
 
     /** \returns a const pointer to the array of the number of non zeros of the inner vectors.
       * This function is aimed at interoperability with other libraries.
       * \warning it returns the null pointer 0 in compressed mode */
-    inline const Index* innerNonZeroPtr() const { return m_innerNonZeros; }
+    inline const StorageIndex* innerNonZeroPtr() const { return m_innerNonZeros; }
     /** \returns a non-const pointer to the array of the number of non zeros of the inner vectors.
       * This function is aimed at interoperability with other libraries.
       * \warning it returns the null pointer 0 in compressed mode */
-    inline Index* innerNonZeroPtr() { return m_innerNonZeros; }
+    inline StorageIndex* innerNonZeroPtr() { return m_innerNonZeros; }
 
     /** \internal */
     inline Storage& data() { return m_data; }
@@ -175,7 +187,7 @@ class SparseMatrix
       const Index outer = IsRowMajor ? row : col;
       const Index inner = IsRowMajor ? col : row;
       Index end = m_innerNonZeros ? m_outerIndex[outer] + m_innerNonZeros[outer] : m_outerIndex[outer+1];
-      return m_data.atInRange(m_outerIndex[outer], end, inner);
+      return m_data.atInRange(m_outerIndex[outer], end, StorageIndex(inner));
     }
 
     /** \returns a non-const reference to the value of the matrix at position \a i, \a j
@@ -198,7 +210,7 @@ class SparseMatrix
       eigen_assert(end>=start && "you probably called coeffRef on a non finalized matrix");
       if(end<=start)
         return insert(row,col);
-      const Index p = m_data.searchLowerIndex(start,end-1,inner);
+      const Index p = m_data.searchLowerIndex(start,end-1,StorageIndex(inner));
       if((p<end) && (m_data.index(p)==inner))
         return m_data.value(p);
       else
@@ -209,45 +221,34 @@ class SparseMatrix
       * The non zero coefficient must \b not already exist.
       *
       * If the matrix \c *this is in compressed mode, then \c *this is turned into uncompressed
-      * mode while reserving room for 2 non zeros per inner vector. It is strongly recommended to first
-      * call reserve(const SizesType &) to reserve a more appropriate number of elements per
-      * inner vector that better match your scenario.
+      * mode while reserving room for 2 x this->innerSize() non zeros if reserve(Index) has not been called earlier.
+      * In this case, the insertion procedure is optimized for a \e sequential insertion mode where elements are assumed to be
+      * inserted by increasing outer-indices.
+      * 
+      * If that's not the case, then it is strongly recommended to either use a triplet-list to assemble the matrix, or to first
+      * call reserve(const SizesType &) to reserve the appropriate number of non-zero elements per inner vector.
       *
-      * This function performs a sorted insertion in O(1) if the elements of each inner vector are
-      * inserted in increasing inner index order, and in O(nnz_j) for a random insertion.
+      * Assuming memory has been appropriately reserved, this function performs a sorted insertion in O(1)
+      * if the elements of each inner vector are inserted in increasing inner index order, and in O(nnz_j) for a random insertion.
       *
       */
-    Scalar& insert(Index row, Index col)
-    {
-      eigen_assert(row>=0 && row<rows() && col>=0 && col<cols());
-      
-      if(isCompressed())
-      {
-        reserve(VectorXi::Constant(outerSize(), 2));
-      }
-      return insertUncompressed(row,col);
-    }
+    Scalar& insert(Index row, Index col);
 
   public:
 
-    class InnerIterator;
-    class ReverseInnerIterator;
-
-    /** Removes all non zeros but keep allocated memory */
+    /** Removes all non zeros but keep allocated memory
+      *
+      * This function does not free the currently allocated memory. To release as much memory as possible,
+      * call \code mat.data().squeeze(); \endcode after calling this function.
+      * 
+      * \sa resize(Index,Index), data()
+      */
     inline void setZero()
     {
       m_data.clear();
-      memset(m_outerIndex, 0, (m_outerSize+1)*sizeof(Index));
-      if(m_innerNonZeros)
-        memset(m_innerNonZeros, 0, (m_outerSize)*sizeof(Index));
-    }
-
-    /** \returns the number of non zero coefficients */
-    inline Index nonZeros() const
-    {
+      memset(m_outerIndex, 0, (m_outerSize+1)*sizeof(StorageIndex));
       if(m_innerNonZeros)
-        return innerNonZeros().sum();
-      return static_cast<Index>(m_data.size());
+        memset(m_innerNonZeros, 0, (m_outerSize)*sizeof(StorageIndex));
     }
 
     /** Preallocates \a reserveSize non zeros.
@@ -262,22 +263,25 @@ class SparseMatrix
     #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** Preallocates \a reserveSize[\c j] non zeros for each column (resp. row) \c j.
       *
-      * This function turns the matrix in non-compressed mode */
+      * This function turns the matrix into non-compressed mode.
+      * 
+      * The type \c SizesType must expose the following interface:
+        \code
+        typedef value_type;
+        const value_type& operator[](i) const;
+        \endcode
+      * for \c i in the [0,this->outerSize()[ range.
+      * Typical choices include std::vector<int>, Eigen::VectorXi, Eigen::VectorXi::Constant, etc.
+      */
     template<class SizesType>
     inline void reserve(const SizesType& reserveSizes);
     #else
     template<class SizesType>
-    inline void reserve(const SizesType& reserveSizes, const typename SizesType::value_type& enableif = typename SizesType::value_type())
-    {
-      EIGEN_UNUSED_VARIABLE(enableif);
-      reserveInnerVectors(reserveSizes);
-    }
-    template<class SizesType>
-    inline void reserve(const SizesType& reserveSizes, const typename SizesType::Scalar& enableif =
-    #if (!defined(_MSC_VER)) || (_MSC_VER>=1500) // MSVC 2005 fails to compile with this typename
+    inline void reserve(const SizesType& reserveSizes, const typename SizesType::value_type& enableif =
+    #if (!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1500) // MSVC 2005 fails to compile with this typename
         typename
     #endif
-        SizesType::Scalar())
+        SizesType::value_type())
     {
       EIGEN_UNUSED_VARIABLE(enableif);
       reserveInnerVectors(reserveSizes);
@@ -289,15 +293,15 @@ class SparseMatrix
     {
       if(isCompressed())
       {
-        std::size_t totalReserveSize = 0;
+        Index totalReserveSize = 0;
         // turn the matrix into non-compressed mode
-        m_innerNonZeros = static_cast<Index*>(std::malloc(m_outerSize * sizeof(Index)));
+        m_innerNonZeros = static_cast<StorageIndex*>(std::malloc(m_outerSize * sizeof(StorageIndex)));
         if (!m_innerNonZeros) internal::throw_std_bad_alloc();
         
         // temporarily use m_innerSizes to hold the new starting points.
-        Index* newOuterIndex = m_innerNonZeros;
+        StorageIndex* newOuterIndex = m_innerNonZeros;
         
-        Index count = 0;
+        StorageIndex count = 0;
         for(Index j=0; j<m_outerSize; ++j)
         {
           newOuterIndex[j] = count;
@@ -305,10 +309,10 @@ class SparseMatrix
           totalReserveSize += reserveSizes[j];
         }
         m_data.reserve(totalReserveSize);
-        Index previousOuterIndex = m_outerIndex[m_outerSize];
+        StorageIndex previousOuterIndex = m_outerIndex[m_outerSize];
         for(Index j=m_outerSize-1; j>=0; --j)
         {
-          Index innerNNZ = previousOuterIndex - m_outerIndex[j];
+          StorageIndex innerNNZ = previousOuterIndex - m_outerIndex[j];
           for(Index i=innerNNZ-1; i>=0; --i)
           {
             m_data.index(newOuterIndex[j]+i) = m_data.index(m_outerIndex[j]+i);
@@ -324,15 +328,15 @@ class SparseMatrix
       }
       else
       {
-        Index* newOuterIndex = static_cast<Index*>(std::malloc((m_outerSize+1)*sizeof(Index)));
+        StorageIndex* newOuterIndex = static_cast<StorageIndex*>(std::malloc((m_outerSize+1)*sizeof(StorageIndex)));
         if (!newOuterIndex) internal::throw_std_bad_alloc();
         
-        Index count = 0;
+        StorageIndex count = 0;
         for(Index j=0; j<m_outerSize; ++j)
         {
           newOuterIndex[j] = count;
-          Index alreadyReserved = (m_outerIndex[j+1]-m_outerIndex[j]) - m_innerNonZeros[j];
-          Index toReserve = std::max<Index>(reserveSizes[j], alreadyReserved);
+          StorageIndex alreadyReserved = (m_outerIndex[j+1]-m_outerIndex[j]) - m_innerNonZeros[j];
+          StorageIndex toReserve = std::max<StorageIndex>(reserveSizes[j], alreadyReserved);
           count += toReserve + m_innerNonZeros[j];
         }
         newOuterIndex[m_outerSize] = count;
@@ -343,7 +347,7 @@ class SparseMatrix
           Index offset = newOuterIndex[j] - m_outerIndex[j];
           if(offset>0)
           {
-            Index innerNNZ = m_innerNonZeros[j];
+            StorageIndex innerNNZ = m_innerNonZeros[j];
             for(Index i=innerNNZ-1; i>=0; --i)
             {
               m_data.index(newOuterIndex[j]+i) = m_data.index(m_outerIndex[j]+i);
@@ -380,11 +384,11 @@ class SparseMatrix
       * \sa insertBack, startVec */
     inline Scalar& insertBackByOuterInner(Index outer, Index inner)
     {
-      eigen_assert(size_t(m_outerIndex[outer+1]) == m_data.size() && "Invalid ordered insertion (invalid outer index)");
+      eigen_assert(Index(m_outerIndex[outer+1]) == m_data.size() && "Invalid ordered insertion (invalid outer index)");
       eigen_assert( (m_outerIndex[outer+1]-m_outerIndex[outer]==0 || m_data.index(m_data.size()-1)<inner) && "Invalid ordered insertion (invalid inner index)");
       Index p = m_outerIndex[outer+1];
       ++m_outerIndex[outer+1];
-      m_data.append(0, inner);
+      m_data.append(Scalar(0), inner);
       return m_data.value(p);
     }
 
@@ -394,7 +398,7 @@ class SparseMatrix
     {
       Index p = m_outerIndex[outer+1];
       ++m_outerIndex[outer+1];
-      m_data.append(0, inner);
+      m_data.append(Scalar(0), inner);
       return m_data.value(p);
     }
 
@@ -402,7 +406,7 @@ class SparseMatrix
       * \sa insertBack, insertBackByOuterInner */
     inline void startVec(Index outer)
     {
-      eigen_assert(m_outerIndex[outer]==int(m_data.size()) && "You must call startVec for each inner vector sequentially");
+      eigen_assert(m_outerIndex[outer]==Index(m_data.size()) && "You must call startVec for each inner vector sequentially");
       eigen_assert(m_outerIndex[outer+1]==0 && "You must call startVec for each inner vector sequentially");
       m_outerIndex[outer+1] = m_outerIndex[outer];
     }
@@ -414,7 +418,7 @@ class SparseMatrix
     {
       if(isCompressed())
       {
-        Index size = static_cast<Index>(m_data.size());
+        StorageIndex size = internal::convert_index<StorageIndex>(m_data.size());
         Index i = m_outerSize;
         // find the last filled column
         while (i>=0 && m_outerIndex[i]==0)
@@ -433,7 +437,13 @@ class SparseMatrix
     template<typename InputIterators>
     void setFromTriplets(const InputIterators& begin, const InputIterators& end);
 
-    void sumupDuplicates();
+    template<typename InputIterators,typename DupFunctor>
+    void setFromTriplets(const InputIterators& begin, const InputIterators& end, DupFunctor dup_func);
+
+    void sumupDuplicates() { collapseDuplicates(internal::scalar_sum_op<Scalar>()); }
+
+    template<typename DupFunctor>
+    void collapseDuplicates(DupFunctor dup_func = DupFunctor());
 
     //---
     
@@ -451,6 +461,8 @@ class SparseMatrix
       if(isCompressed())
         return;
       
+      eigen_internal_assert(m_outerIndex!=0 && m_outerSize>0);
+      
       Index oldStart = m_outerIndex[1];
       m_outerIndex[1] = m_innerNonZeros[0];
       for(Index j=1; j<m_outerSize; ++j)
@@ -479,8 +491,8 @@ class SparseMatrix
     {
       if(m_innerNonZeros != 0)
         return; 
-      m_innerNonZeros = static_cast<Index*>(std::malloc(m_outerSize * sizeof(Index)));
-      for (int i = 0; i < m_outerSize; i++)
+      m_innerNonZeros = static_cast<StorageIndex*>(std::malloc(m_outerSize * sizeof(StorageIndex)));
+      for (Index i = 0; i < m_outerSize; i++)
       {
         m_innerNonZeros[i] = m_outerIndex[i+1] - m_outerIndex[i]; 
       }
@@ -503,10 +515,9 @@ class SparseMatrix
     void prune(const KeepFunc& keep = KeepFunc())
     {
       // TODO optimize the uncompressed mode to avoid moving and allocating the data twice
-      // TODO also implement a unit test
       makeCompressed();
 
-      Index k = 0;
+      StorageIndex k = 0;
       for(Index j=0; j<m_outerSize; ++j)
       {
         Index previousStart = m_outerIndex[j];
@@ -527,7 +538,7 @@ class SparseMatrix
     }
 
     /** Resizes the matrix to a \a rows x \a cols matrix leaving old values untouched.
-      * \sa resizeNonZeros(Index), reserve(), setZero()
+      * \sa reserve(), setZero()
       */
     void conservativeResize(Index rows, Index cols) 
     {
@@ -539,13 +550,13 @@ class SparseMatrix
 
       Index innerChange = IsRowMajor ? cols - this->cols() : rows - this->rows();
       Index outerChange = IsRowMajor ? rows - this->rows() : cols - this->cols();
-      Index newInnerSize = IsRowMajor ? cols : rows;
+      StorageIndex newInnerSize = convert_index(IsRowMajor ? cols : rows);
 
       // Deals with inner non zeros
       if (m_innerNonZeros)
       {
         // Resize m_innerNonZeros
-        Index *newInnerNonZeros = static_cast<Index*>(std::realloc(m_innerNonZeros, (m_outerSize + outerChange) * sizeof(Index)));
+        StorageIndex *newInnerNonZeros = static_cast<StorageIndex*>(std::realloc(m_innerNonZeros, (m_outerSize + outerChange) * sizeof(StorageIndex)));
         if (!newInnerNonZeros) internal::throw_std_bad_alloc();
         m_innerNonZeros = newInnerNonZeros;
         
@@ -555,7 +566,7 @@ class SparseMatrix
       else if (innerChange < 0) 
       {
         // Inner size decreased: allocate a new m_innerNonZeros
-        m_innerNonZeros = static_cast<Index*>(std::malloc((m_outerSize+outerChange+1) * sizeof(Index)));
+        m_innerNonZeros = static_cast<StorageIndex*>(std::malloc((m_outerSize+outerChange+1) * sizeof(StorageIndex)));
         if (!m_innerNonZeros) internal::throw_std_bad_alloc();
         for(Index i = 0; i < m_outerSize; i++)
           m_innerNonZeros[i] = m_outerIndex[i+1] - m_outerIndex[i];
@@ -566,8 +577,8 @@ class SparseMatrix
       {
         for(Index i = 0; i < m_outerSize + (std::min)(outerChange, Index(0)); i++)
         {
-          Index &n = m_innerNonZeros[i];
-          Index start = m_outerIndex[i];
+          StorageIndex &n = m_innerNonZeros[i];
+          StorageIndex start = m_outerIndex[i];
           while (n > 0 && m_data.index(start+n-1) >= newInnerSize) --n; 
         }
       }
@@ -578,12 +589,12 @@ class SparseMatrix
       if (outerChange == 0)
         return;
           
-      Index *newOuterIndex = static_cast<Index*>(std::realloc(m_outerIndex, (m_outerSize + outerChange + 1) * sizeof(Index)));
+      StorageIndex *newOuterIndex = static_cast<StorageIndex*>(std::realloc(m_outerIndex, (m_outerSize + outerChange + 1) * sizeof(StorageIndex)));
       if (!newOuterIndex) internal::throw_std_bad_alloc();
       m_outerIndex = newOuterIndex;
       if (outerChange > 0)
       {
-        Index last = m_outerSize == 0 ? 0 : m_outerIndex[m_outerSize];
+        StorageIndex last = m_outerSize == 0 ? 0 : m_outerIndex[m_outerSize];
         for(Index i=m_outerSize; i<m_outerSize+outerChange+1; i++)          
           m_outerIndex[i] = last; 
       }
@@ -591,7 +602,11 @@ class SparseMatrix
     }
     
     /** Resizes the matrix to a \a rows x \a cols matrix and initializes it to zero.
-      * \sa resizeNonZeros(Index), reserve(), setZero()
+      * 
+      * This function does not free the currently allocated memory. To release as much memory as possible,
+      * call \code mat.data().squeeze(); \endcode after resizing it.
+      * 
+      * \sa reserve(), setZero()
       */
     void resize(Index rows, Index cols)
     {
@@ -601,7 +616,7 @@ class SparseMatrix
       if (m_outerSize != outerSize || m_outerSize==0)
       {
         std::free(m_outerIndex);
-        m_outerIndex = static_cast<Index*>(std::malloc((outerSize + 1) * sizeof(Index)));
+        m_outerIndex = static_cast<StorageIndex*>(std::malloc((outerSize + 1) * sizeof(StorageIndex)));
         if (!m_outerIndex) internal::throw_std_bad_alloc();
         
         m_outerSize = outerSize;
@@ -611,19 +626,24 @@ class SparseMatrix
         std::free(m_innerNonZeros);
         m_innerNonZeros = 0;
       }
-      memset(m_outerIndex, 0, (m_outerSize+1)*sizeof(Index));
+      memset(m_outerIndex, 0, (m_outerSize+1)*sizeof(StorageIndex));
     }
 
     /** \internal
       * Resize the nonzero vector to \a size */
     void resizeNonZeros(Index size)
     {
-      // TODO remove this function
       m_data.resize(size);
     }
 
-    /** \returns a const expression of the diagonal coefficients */
-    const Diagonal<const SparseMatrix> diagonal() const { return *this; }
+    /** \returns a const expression of the diagonal coefficients. */
+    const ConstDiagonalReturnType diagonal() const { return ConstDiagonalReturnType(*this); }
+    
+    /** \returns a read-write expression of the diagonal coefficients.
+      * \warning If the diagonal entries are written, then all diagonal
+      * entries \b must already exist, otherwise an assertion will be raised.
+      */
+    DiagonalReturnType diagonal() { return DiagonalReturnType(*this); }
 
     /** Default constructor yielding an empty \c 0 \c x \c 0 matrix */
     inline SparseMatrix()
@@ -649,7 +669,16 @@ class SparseMatrix
       EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
         YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
       check_template_parameters();
-      *this = other.derived();
+      const bool needToTranspose = (Flags & RowMajorBit) != (internal::evaluator<OtherDerived>::Flags & RowMajorBit);
+      if (needToTranspose)
+        *this = other.derived();
+      else
+      {
+        #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+          EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+        #endif
+        internal::call_assignment_no_alias(*this, other.derived());
+      }
     }
     
     /** Constructs a sparse matrix from the sparse selfadjoint view \a other */
@@ -658,7 +687,7 @@ class SparseMatrix
       : m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0)
     {
       check_template_parameters();
-      *this = other;
+      Base::operator=(other);
     }
 
     /** Copy constructor (it performs a deep copy) */
@@ -678,6 +707,15 @@ class SparseMatrix
       initAssignment(other);
       other.evalTo(*this);
     }
+    
+    /** \brief Constructs a sparse matrix from the diagonal expression \a other */
+    template<typename OtherDerived>
+    explicit SparseMatrix(const DiagonalBase<OtherDerived>& other)
+      : Base(), m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0)
+    {
+      check_template_parameters();
+      *this = other.derived();
+    }
 
     /** Swaps the content of two sparse matrices of the same type.
       * This is a fast operation that simply swaps the underlying pointers and parameters. */
@@ -691,14 +729,17 @@ class SparseMatrix
       m_data.swap(other.m_data);
     }
 
-    /** Sets *this to the identity matrix */
+    /** Sets *this to the identity matrix.
+      * This function also turns the matrix into compressed mode, and drops any reserved memory. */
     inline void setIdentity()
     {
       eigen_assert(rows() == cols() && "ONLY FOR SQUARED MATRICES");
       this->m_data.resize(rows());
-      Eigen::Map<Matrix<Index, Dynamic, 1> >(&this->m_data.index(0), rows()).setLinSpaced(0, rows()-1);
-      Eigen::Map<Matrix<Scalar, Dynamic, 1> >(&this->m_data.value(0), rows()).setOnes();
-      Eigen::Map<Matrix<Index, Dynamic, 1> >(this->m_outerIndex, rows()+1).setLinSpaced(0, rows());
+      Eigen::Map<IndexVector>(&this->m_data.index(0), rows()).setLinSpaced(0, StorageIndex(rows()-1));
+      Eigen::Map<ScalarVector>(&this->m_data.value(0), rows()).setOnes();
+      Eigen::Map<IndexVector>(this->m_outerIndex, rows()+1).setLinSpaced(0, StorageIndex(rows()));
+      std::free(m_innerNonZeros);
+      m_innerNonZeros = 0;
     }
     inline SparseMatrix& operator=(const SparseMatrix& other)
     {
@@ -708,10 +749,13 @@ class SparseMatrix
       }
       else if(this!=&other)
       {
+        #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+          EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+        #endif
         initAssignment(other);
         if(other.isCompressed())
         {
-          memcpy(m_outerIndex, other.m_outerIndex, (m_outerSize+1)*sizeof(Index));
+          internal::smart_copy(other.m_outerIndex, other.m_outerIndex + m_outerSize + 1, m_outerIndex);
           m_data = other.m_data;
         }
         else
@@ -722,22 +766,11 @@ class SparseMatrix
       return *this;
     }
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    template<typename Lhs, typename Rhs>
-    inline SparseMatrix& operator=(const SparseSparseProduct<Lhs,Rhs>& product)
-    { return Base::operator=(product); }
-    
-    template<typename OtherDerived>
-    inline SparseMatrix& operator=(const ReturnByValue<OtherDerived>& other)
-    {
-      initAssignment(other);
-      return Base::operator=(other.derived());
-    }
-    
+#ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename OtherDerived>
     inline SparseMatrix& operator=(const EigenBase<OtherDerived>& other)
     { return Base::operator=(other.derived()); }
-    #endif
+#endif // EIGEN_PARSED_BY_DOXYGEN
 
     template<typename OtherDerived>
     EIGEN_DONT_INLINE SparseMatrix& operator=(const SparseMatrixBase<OtherDerived>& other);
@@ -752,8 +785,8 @@ class SparseMatrix
         else
           for (Index i=0; i<m.outerSize(); ++i)
           {
-            int p = m.m_outerIndex[i];
-            int pe = m.m_outerIndex[i]+m.m_innerNonZeros[i];
+            Index p = m.m_outerIndex[i];
+            Index pe = m.m_outerIndex[i]+m.m_innerNonZeros[i];
             Index k=p;
             for (; k<pe; ++k)
               s << "(" << m.m_data.value(k) << "," << m.m_data.index(k) << ") ";
@@ -786,10 +819,8 @@ class SparseMatrix
       std::free(m_innerNonZeros);
     }
 
-#ifndef EIGEN_PARSED_BY_DOXYGEN
     /** Overloaded for performance */
     Scalar sum() const;
-#endif
     
 #   ifdef EIGEN_SPARSEMATRIX_PLUGIN
 #     include EIGEN_SPARSEMATRIX_PLUGIN
@@ -816,15 +847,15 @@ class SparseMatrix
       * A vector object that is equal to 0 everywhere but v at the position i */
     class SingletonVector
     {
-        Index m_index;
-        Index m_value;
+        StorageIndex m_index;
+        StorageIndex m_value;
       public:
-        typedef Index value_type;
+        typedef StorageIndex value_type;
         SingletonVector(Index i, Index v)
-          : m_index(i), m_value(v)
+          : m_index(convert_index(i)), m_value(convert_index(v))
         {}
 
-        Index operator[](Index i) const { return i==m_index ? m_value : 0; }
+        StorageIndex operator[](Index i) const { return i==m_index ? m_value : 0; }
     };
 
     /** \internal
@@ -843,14 +874,14 @@ class SparseMatrix
       eigen_assert(m_innerNonZeros[outer]<=(m_outerIndex[outer+1] - m_outerIndex[outer]));
 
       Index p = m_outerIndex[outer] + m_innerNonZeros[outer]++;
-      m_data.index(p) = inner;
+      m_data.index(p) = convert_index(inner);
       return (m_data.value(p) = 0);
     }
 
 private:
   static void check_template_parameters()
   {
-    EIGEN_STATIC_ASSERT(NumTraits<Index>::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE);
+    EIGEN_STATIC_ASSERT(NumTraits<StorageIndex>::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE);
     EIGEN_STATIC_ASSERT((Options&(ColMajor|RowMajor))==Options,INVALID_MATRIX_TEMPLATE_PARAMETERS);
   }
 
@@ -865,86 +896,20 @@ class SparseMatrix
   };
 };
 
-template<typename Scalar, int _Options, typename _Index>
-class SparseMatrix<Scalar,_Options,_Index>::InnerIterator
-{
-  public:
-    InnerIterator(const SparseMatrix& mat, Index outer)
-      : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_id(mat.m_outerIndex[outer])
-    {
-      if(mat.isCompressed())
-        m_end = mat.m_outerIndex[outer+1];
-      else
-        m_end = m_id + mat.m_innerNonZeros[outer];
-    }
-
-    inline InnerIterator& operator++() { m_id++; return *this; }
-
-    inline const Scalar& value() const { return m_values[m_id]; }
-    inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id]); }
-
-    inline Index index() const { return m_indices[m_id]; }
-    inline Index outer() const { return m_outer; }
-    inline Index row() const { return IsRowMajor ? m_outer : index(); }
-    inline Index col() const { return IsRowMajor ? index() : m_outer; }
-
-    inline operator bool() const { return (m_id < m_end); }
-
-  protected:
-    const Scalar* m_values;
-    const Index* m_indices;
-    const Index m_outer;
-    Index m_id;
-    Index m_end;
-};
-
-template<typename Scalar, int _Options, typename _Index>
-class SparseMatrix<Scalar,_Options,_Index>::ReverseInnerIterator
-{
-  public:
-    ReverseInnerIterator(const SparseMatrix& mat, Index outer)
-      : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_start(mat.m_outerIndex[outer])
-    {
-      if(mat.isCompressed())
-        m_id = mat.m_outerIndex[outer+1];
-      else
-        m_id = m_start + mat.m_innerNonZeros[outer];
-    }
-
-    inline ReverseInnerIterator& operator--() { --m_id; return *this; }
-
-    inline const Scalar& value() const { return m_values[m_id-1]; }
-    inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id-1]); }
-
-    inline Index index() const { return m_indices[m_id-1]; }
-    inline Index outer() const { return m_outer; }
-    inline Index row() const { return IsRowMajor ? m_outer : index(); }
-    inline Index col() const { return IsRowMajor ? index() : m_outer; }
-
-    inline operator bool() const { return (m_id > m_start); }
-
-  protected:
-    const Scalar* m_values;
-    const Index* m_indices;
-    const Index m_outer;
-    Index m_id;
-    const Index m_start;
-};
-
 namespace internal {
 
-template<typename InputIterator, typename SparseMatrixType>
-void set_from_triplets(const InputIterator& begin, const InputIterator& end, SparseMatrixType& mat, int Options = 0)
+template<typename InputIterator, typename SparseMatrixType, typename DupFunctor>
+void set_from_triplets(const InputIterator& begin, const InputIterator& end, SparseMatrixType& mat, DupFunctor dup_func)
 {
-  EIGEN_UNUSED_VARIABLE(Options);
   enum { IsRowMajor = SparseMatrixType::IsRowMajor };
   typedef typename SparseMatrixType::Scalar Scalar;
-  SparseMatrix<Scalar,IsRowMajor?ColMajor:RowMajor> trMat(mat.rows(),mat.cols());
+  typedef typename SparseMatrixType::StorageIndex StorageIndex;
+  SparseMatrix<Scalar,IsRowMajor?ColMajor:RowMajor,StorageIndex> trMat(mat.rows(),mat.cols());
 
-  if(begin<end)
+  if(begin!=end)
   {
     // pass 1: count the nnz per inner-vector
-    VectorXi wi(trMat.outerSize());
+    typename SparseMatrixType::IndexVector wi(trMat.outerSize());
     wi.setZero();
     for(InputIterator it(begin); it!=end; ++it)
     {
@@ -958,7 +923,7 @@ void set_from_triplets(const InputIterator& begin, const InputIterator& end, Spa
       trMat.insertBackUncompressed(it->row(),it->col()) = it->value();
 
     // pass 3:
-    trMat.sumupDuplicates();
+    trMat.collapseDuplicates(dup_func);
   }
 
   // pass 4: transposed copy -> implicit sorting
@@ -1009,22 +974,39 @@ template<typename Scalar, int _Options, typename _Index>
 template<typename InputIterators>
 void SparseMatrix<Scalar,_Options,_Index>::setFromTriplets(const InputIterators& begin, const InputIterators& end)
 {
-  internal::set_from_triplets(begin, end, *this);
+  internal::set_from_triplets<InputIterators, SparseMatrix<Scalar,_Options,_Index> >(begin, end, *this, internal::scalar_sum_op<Scalar>());
+}
+
+/** The same as setFromTriplets but when duplicates are met the functor \a dup_func is applied:
+  * \code
+  * value = dup_func(OldValue, NewValue)
+  * \endcode 
+  * Here is a C++11 example keeping the latest entry only:
+  * \code
+  * mat.setFromTriplets(triplets.begin(), triplets.end(), [] (const Scalar&,const Scalar &b) { return b; });
+  * \endcode
+  */
+template<typename Scalar, int _Options, typename _Index>
+template<typename InputIterators,typename DupFunctor>
+void SparseMatrix<Scalar,_Options,_Index>::setFromTriplets(const InputIterators& begin, const InputIterators& end, DupFunctor dup_func)
+{
+  internal::set_from_triplets<InputIterators, SparseMatrix<Scalar,_Options,_Index>, DupFunctor>(begin, end, *this, dup_func);
 }
 
 /** \internal */
 template<typename Scalar, int _Options, typename _Index>
-void SparseMatrix<Scalar,_Options,_Index>::sumupDuplicates()
+template<typename DupFunctor>
+void SparseMatrix<Scalar,_Options,_Index>::collapseDuplicates(DupFunctor dup_func)
 {
   eigen_assert(!isCompressed());
   // TODO, in practice we should be able to use m_innerNonZeros for that task
-  VectorXi wi(innerSize());
+  IndexVector wi(innerSize());
   wi.fill(-1);
-  Index count = 0;
+  StorageIndex count = 0;
   // for each inner-vector, wi[inner_index] will hold the position of first element into the index/value buffers
-  for(int j=0; j<outerSize(); ++j)
+  for(Index j=0; j<outerSize(); ++j)
   {
-    Index start   = count;
+    StorageIndex start   = count;
     Index oldEnd  = m_outerIndex[j]+m_innerNonZeros[j];
     for(Index k=m_outerIndex[j]; k<oldEnd; ++k)
     {
@@ -1032,7 +1014,7 @@ void SparseMatrix<Scalar,_Options,_Index>::sumupDuplicates()
       if(wi(i)>=start)
       {
         // we already meet this entry => accumulate it
-        m_data.value(wi(i)) += m_data.value(k);
+        m_data.value(wi(i)) = dup_func(m_data.value(wi(i)), m_data.value(k));
       }
       else
       {
@@ -1058,30 +1040,39 @@ EIGEN_DONT_INLINE SparseMatrix<Scalar,_Options,_Index>& SparseMatrix<Scalar,_Opt
 {
   EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
         YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-  
-  const bool needToTranspose = (Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit);
+
+  #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+    EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+  #endif
+      
+  const bool needToTranspose = (Flags & RowMajorBit) != (internal::evaluator<OtherDerived>::Flags & RowMajorBit);
   if (needToTranspose)
   {
+    #ifdef EIGEN_SPARSE_TRANSPOSED_COPY_PLUGIN
+      EIGEN_SPARSE_TRANSPOSED_COPY_PLUGIN
+    #endif
     // two passes algorithm:
     //  1 - compute the number of coeffs per dest inner vector
     //  2 - do the actual copy/eval
     // Since each coeff of the rhs has to be evaluated twice, let's evaluate it if needed
-    typedef typename internal::nested<OtherDerived,2>::type OtherCopy;
+    typedef typename internal::nested_eval<OtherDerived,2,typename internal::plain_matrix_type<OtherDerived>::type >::type OtherCopy;
     typedef typename internal::remove_all<OtherCopy>::type _OtherCopy;
+    typedef internal::evaluator<_OtherCopy> OtherCopyEval;
     OtherCopy otherCopy(other.derived());
+    OtherCopyEval otherCopyEval(otherCopy);
 
     SparseMatrix dest(other.rows(),other.cols());
-    Eigen::Map<Matrix<Index, Dynamic, 1> > (dest.m_outerIndex,dest.outerSize()).setZero();
+    Eigen::Map<IndexVector> (dest.m_outerIndex,dest.outerSize()).setZero();
 
     // pass 1
     // FIXME the above copy could be merged with that pass
     for (Index j=0; j<otherCopy.outerSize(); ++j)
-      for (typename _OtherCopy::InnerIterator it(otherCopy, j); it; ++it)
+      for (typename OtherCopyEval::InnerIterator it(otherCopyEval, j); it; ++it)
         ++dest.m_outerIndex[it.index()];
 
     // prefix sum
-    Index count = 0;
-    VectorXi positions(dest.outerSize());
+    StorageIndex count = 0;
+    IndexVector positions(dest.outerSize());
     for (Index j=0; j<dest.outerSize(); ++j)
     {
       Index tmp = dest.m_outerIndex[j];
@@ -1093,9 +1084,9 @@ EIGEN_DONT_INLINE SparseMatrix<Scalar,_Options,_Index>& SparseMatrix<Scalar,_Opt
     // alloc
     dest.m_data.resize(count);
     // pass 2
-    for (Index j=0; j<otherCopy.outerSize(); ++j)
+    for (StorageIndex j=0; j<otherCopy.outerSize(); ++j)
     {
-      for (typename _OtherCopy::InnerIterator it(otherCopy, j); it; ++it)
+      for (typename OtherCopyEval::InnerIterator it(otherCopyEval, j); it; ++it)
       {
         Index pos = positions[it.index()]++;
         dest.m_data.index(pos) = j;
@@ -1108,26 +1099,148 @@ EIGEN_DONT_INLINE SparseMatrix<Scalar,_Options,_Index>& SparseMatrix<Scalar,_Opt
   else
   {
     if(other.isRValue())
+    {
       initAssignment(other.derived());
+    }
     // there is no special optimization
     return Base::operator=(other.derived());
   }
 }
 
+template<typename _Scalar, int _Options, typename _Index>
+typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& SparseMatrix<_Scalar,_Options,_Index>::insert(Index row, Index col)
+{
+  eigen_assert(row>=0 && row<rows() && col>=0 && col<cols());
+  // Adds a coefficient at (row,col) and returns a reference to its value; the two fast paths below store a zero there.
+  const Index outer = IsRowMajor ? row : col;
+  const Index inner = IsRowMajor ? col : row;
+  // The code below relies on the per-inner-vector counters m_innerNonZeros, so a compressed matrix gets them allocated first.
+  if(isCompressed())
+  {
+    if(nonZeros()==0)
+    {
+      // reserve space if not already done
+      if(m_data.allocatedSize()==0)
+        m_data.reserve(2*m_innerSize);
+      
+      // turn the matrix into non-compressed mode
+      m_innerNonZeros = static_cast<StorageIndex*>(std::malloc(m_outerSize * sizeof(StorageIndex)));
+      if(!m_innerNonZeros) internal::throw_std_bad_alloc();
+      
+      memset(m_innerNonZeros, 0, (m_outerSize)*sizeof(StorageIndex));
+      
+      // pack all inner-vectors to the end of the pre-allocated space
+      // and allocate the entire free-space to the first inner-vector
+      StorageIndex end = convert_index(m_data.allocatedSize());
+      for(Index j=1; j<=m_outerSize; ++j)
+        m_outerIndex[j] = end;
+    }
+    else
+    {
+      // turn the matrix into non-compressed mode
+      m_innerNonZeros = static_cast<StorageIndex*>(std::malloc(m_outerSize * sizeof(StorageIndex)));
+      if(!m_innerNonZeros) internal::throw_std_bad_alloc();
+      for(Index j=0; j<m_outerSize; ++j)
+        m_innerNonZeros[j] = m_outerIndex[j+1]-m_outerIndex[j];
+    }
+  }
+  
+  // check whether we can do a fast "push back" insertion
+  Index data_end = m_data.allocatedSize();
+  // An outer index equal to data_end denotes an inner vector that is packed at the end of the allocated buffer.
+  // First case: we are filling a new inner vector which is packed at the end.
+  // We assume that all remaining inner-vectors are also empty and packed to the end.
+  if(m_outerIndex[outer]==data_end)
+  {
+    eigen_internal_assert(m_innerNonZeros[outer]==0);
+    
+    // pack previous empty inner-vectors to end of the used-space
+    // and allocate the entire free-space to the current inner-vector.
+    StorageIndex p = convert_index(m_data.size());
+    Index j = outer;
+    while(j>=0 && m_innerNonZeros[j]==0)
+      m_outerIndex[j--] = p;
+    
+    // push back the new element
+    ++m_innerNonZeros[outer];
+    m_data.append(Scalar(0), inner);
+    
+    // check for reallocation
+    if(data_end != m_data.allocatedSize())
+    {
+      // m_data has been reallocated
+      //  -> move remaining inner-vectors back to the end of the free-space
+      //     so that the entire free-space is allocated to the current inner-vector.
+      eigen_internal_assert(data_end < m_data.allocatedSize());
+      StorageIndex new_end = convert_index(m_data.allocatedSize());
+      for(Index k=outer+1; k<=m_outerSize; ++k)
+        if(m_outerIndex[k]==data_end)
+          m_outerIndex[k] = new_end;
+    }
+    return m_data.value(p);
+  }
+  
+  // Second case: the next inner-vector is packed to the end
+  // and the end of the current inner-vector matches the end of the used-space.
+  if(m_outerIndex[outer+1]==data_end && m_outerIndex[outer]+m_innerNonZeros[outer]==m_data.size())
+  {
+    eigen_internal_assert(outer+1==m_outerSize || m_innerNonZeros[outer+1]==0);
+    
+    // add space for the new element
+    ++m_innerNonZeros[outer];
+    m_data.resize(m_data.size()+1);
+    
+    // check for reallocation
+    if(data_end != m_data.allocatedSize())
+    {
+      // m_data has been reallocated
+      //  -> move remaining inner-vectors back to the end of the free-space
+      //     so that the entire free-space is allocated to the current inner-vector.
+      eigen_internal_assert(data_end < m_data.allocatedSize());
+      StorageIndex new_end = convert_index(m_data.allocatedSize());
+      for(Index k=outer+1; k<=m_outerSize; ++k)
+        if(m_outerIndex[k]==data_end)
+          m_outerIndex[k] = new_end;
+    }
+    
+    // and insert it at the right position (sorted insertion):
+    // shift the entries with a larger inner index one slot towards the end
+    Index startId = m_outerIndex[outer];
+    Index p = m_outerIndex[outer]+m_innerNonZeros[outer]-1;
+    while ( (p > startId) && (m_data.index(p-1) > inner) )
+    {
+      m_data.index(p) = m_data.index(p-1);
+      m_data.value(p) = m_data.value(p-1);
+      --p;
+    }
+    
+    m_data.index(p) = convert_index(inner);
+    return (m_data.value(p) = 0);
+  }
+  // General case: neither fast "push back" path applied, so delegate to insertUncompressed().
+  if(m_data.size() != m_data.allocatedSize())
+  {
+    // make sure the matrix is compatible with random un-compressed insertion:
+    m_data.resize(m_data.allocatedSize());
+    this->reserveInnerVectors(Array<StorageIndex,Dynamic,1>::Constant(m_outerSize, 2));
+  }
+  
+  return insertUncompressed(row,col);
+}
+    
 template<typename _Scalar, int _Options, typename _Index>
 EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& SparseMatrix<_Scalar,_Options,_Index>::insertUncompressed(Index row, Index col)
 {
   eigen_assert(!isCompressed());
 
   const Index outer = IsRowMajor ? row : col;
-  const Index inner = IsRowMajor ? col : row;
+  const StorageIndex inner = convert_index(IsRowMajor ? col : row);
 
   Index room = m_outerIndex[outer+1] - m_outerIndex[outer];
-  Index innerNNZ = m_innerNonZeros[outer];
+  StorageIndex innerNNZ = m_innerNonZeros[outer];
   if(innerNNZ>=room)
   {
     // this inner vector is full, we need to reallocate the whole buffer :(
-    reserve(SingletonVector(outer,std::max<Index>(2,innerNNZ)));
+    reserve(SingletonVector(outer,std::max<StorageIndex>(2,innerNNZ)));
   }
 
   Index startId = m_outerIndex[outer];
@@ -1138,7 +1251,7 @@ EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& Sparse
     m_data.value(p) = m_data.value(p-1);
     --p;
   }
-  eigen_assert((p<=startId || m_data.index(p-1)!=inner) && "you cannot insert an element that already exist, you must call coeffRef to this end");
+  eigen_assert((p<=startId || m_data.index(p-1)!=inner) && "you cannot insert an element that already exists, you must call coeffRef to this end");
 
   m_innerNonZeros[outer]++;
 
@@ -1160,7 +1273,7 @@ EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& Sparse
     // we start a new inner vector
     while (previousOuter>=0 && m_outerIndex[previousOuter]==0)
     {
-      m_outerIndex[previousOuter] = static_cast<Index>(m_data.size());
+      m_outerIndex[previousOuter] = convert_index(m_data.size());
       --previousOuter;
     }
     m_outerIndex[outer+1] = m_outerIndex[outer];
@@ -1177,7 +1290,7 @@ EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& Sparse
   size_t p = m_outerIndex[outer+1];
   ++m_outerIndex[outer+1];
 
-  float reallocRatio = 1;
+  double reallocRatio = 1;
   if (m_data.allocatedSize()<=m_data.size())
   {
     // if there is no preallocated memory, let's reserve a minimum of 32 elements
@@ -1189,13 +1302,13 @@ EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& Sparse
     {
       // we need to reallocate the data, to reduce multiple reallocations
       // we use a smart resize algorithm based on the current filling ratio
-      // in addition, we use float to avoid integers overflows
-      float nnzEstimate = float(m_outerIndex[outer])*float(m_outerSize)/float(outer+1);
-      reallocRatio = (nnzEstimate-float(m_data.size()))/float(m_data.size());
+      // in addition, we use double to avoid integer overflows
+      double nnzEstimate = double(m_outerIndex[outer])*double(m_outerSize)/double(outer+1);
+      reallocRatio = (nnzEstimate-double(m_data.size()))/double(m_data.size());
       // furthermore we bound the realloc ratio to:
       //   1) reduce multiple minor realloc when the matrix is almost filled
       //   2) avoid to allocate too much memory when the matrix is almost empty
-      reallocRatio = (std::min)((std::max)(reallocRatio,1.5f),8.f);
+      reallocRatio = (std::min)((std::max)(reallocRatio,1.5),8.);
     }
   }
   m_data.resize(m_data.size()+1,reallocRatio);
@@ -1253,6 +1366,20 @@ EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& Sparse
   return (m_data.value(p) = 0);
 }
 
+namespace internal {
+// Evaluator specialization for SparseMatrix: it adds no members and forwards construction to the evaluator<SparseCompressedBase<...> > it inherits from.
+template<typename _Scalar, int _Options, typename _Index>
+struct evaluator<SparseMatrix<_Scalar,_Options,_Index> >
+  : evaluator<SparseCompressedBase<SparseMatrix<_Scalar,_Options,_Index> > >
+{
+  typedef evaluator<SparseCompressedBase<SparseMatrix<_Scalar,_Options,_Index> > > Base;
+  typedef SparseMatrix<_Scalar,_Options,_Index> SparseMatrixType;
+  evaluator() : Base() {}
+  explicit evaluator(const SparseMatrixType &mat) : Base(mat) {}
+};
+
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_SPARSEMATRIX_H
diff --git a/nuparu/include/Eigen/src/SparseCore/SparseMatrixBase.h b/nuparu/include/Eigen/src/SparseCore/SparseMatrixBase.h
index 706f699b..648ae1f8 100644
--- a/nuparu/include/Eigen/src/SparseCore/SparseMatrixBase.h
+++ b/nuparu/include/Eigen/src/SparseCore/SparseMatrixBase.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -23,27 +23,38 @@ namespace Eigen {
   * This class can be extended with the help of the plugin mechanism described on the page
   * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_SPARSEMATRIXBASE_PLUGIN.
   */
-template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
+template<typename Derived> class SparseMatrixBase
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  : public internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
+                                            typename NumTraits<typename internal::traits<Derived>::Scalar>::Real,
+                                            EigenBase<Derived> >
+#else
+  : public EigenBase<Derived>
+#endif // not EIGEN_PARSED_BY_DOXYGEN
 {
   public:
 
     typedef typename internal::traits<Derived>::Scalar Scalar;
+    
+    /** The numeric type of the expression's coefficients, e.g. float, double, int or std::complex<float>, etc.
+      *
+      * It is an alias for the Scalar type */
+    typedef Scalar value_type;
+    
     typedef typename internal::packet_traits<Scalar>::type PacketScalar;
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
+    typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
     typedef typename internal::add_const_on_value_type_if_arithmetic<
                          typename internal::packet_traits<Scalar>::type
                      >::type PacketReturnType;
 
     typedef SparseMatrixBase StorageBaseType;
-    typedef EigenBase<Derived> Base;
+
+    typedef Matrix<StorageIndex,Dynamic,1> IndexVector;
+    typedef Matrix<Scalar,Dynamic,1> ScalarVector;
     
     template<typename OtherDerived>
-    Derived& operator=(const EigenBase<OtherDerived> &other)
-    {
-      other.derived().evalTo(derived());
-      return derived();
-    }
+    Derived& operator=(const EigenBase<OtherDerived> &other);
 
     enum {
 
@@ -83,11 +94,6 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
           * constructed from this one. See the \ref flags "list of flags".
           */
 
-      CoeffReadCost = internal::traits<Derived>::CoeffReadCost,
-        /**< This is a rough measure of how expensive it is to read one coefficient from
-          * this expression.
-          */
-
       IsRowMajor = Flags&RowMajorBit ? 1 : 0,
       
       InnerSizeAtCompileTime = int(IsVectorAtCompileTime) ? int(SizeAtCompileTime)
@@ -103,10 +109,11 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
                         CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, Eigen::Transpose<const Derived> >,
                         Transpose<const Derived>
                      >::type AdjointReturnType;
+    typedef Transpose<Derived> TransposeReturnType;
+    typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
 
-
-    typedef SparseMatrix<Scalar, Flags&RowMajorBit ? RowMajor : ColMajor, Index> PlainObject;
-
+    // FIXME storage order does not match evaluator storage order
+    typedef SparseMatrix<Scalar, Flags&RowMajorBit ? RowMajor : ColMajor, StorageIndex> PlainObject;
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     /** This is the "real scalar" type; if the \a Scalar type is already real numbers
@@ -124,6 +131,8 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
     /** \internal Represents a matrix with all coefficients equal to one another*/
     typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,Matrix<Scalar,Dynamic,Dynamic> > ConstantReturnType;
 
+    /** type of the equivalent dense matrix */
+    typedef Matrix<Scalar,RowsAtCompileTime,ColsAtCompileTime> DenseMatrixType;
     /** type of the equivalent square matrix */
     typedef Matrix<Scalar,EIGEN_SIZE_MAX(RowsAtCompileTime,ColsAtCompileTime),
                           EIGEN_SIZE_MAX(RowsAtCompileTime,ColsAtCompileTime)> SquareMatrixType;
@@ -132,6 +141,10 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
     inline Derived& derived() { return *static_cast<Derived*>(this); }
     inline Derived& const_cast_derived() const
     { return *static_cast<Derived*>(const_cast<SparseMatrixBase*>(this)); }
+
+    typedef internal::special_scalar_op_base<Derived, Scalar, RealScalar, EigenBase<Derived> > Base;
+    using Base::operator*;
+    using Base::operator/;
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::SparseMatrixBase
@@ -153,9 +166,6 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
     /** \returns the number of coefficients, which is \a rows()*cols().
       * \sa rows(), cols(). */
     inline Index size() const { return rows() * cols(); }
-    /** \returns the number of nonzero coefficients which is in practice the number
-      * of stored coefficients. */
-    inline Index nonZeros() const { return derived().nonZeros(); }
     /** \returns true if either the number of rows or the number of columns is equal to 1.
       * In other words, this function returns
       * \code rows()==1 || cols()==1 \endcode
@@ -175,93 +185,23 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
 
     
     template<typename OtherDerived>
-    Derived& operator=(const ReturnByValue<OtherDerived>& other)
-    {
-      other.evalTo(derived());
-      return derived();
-    }
-
+    Derived& operator=(const ReturnByValue<OtherDerived>& other);
 
     template<typename OtherDerived>
-    inline Derived& operator=(const SparseMatrixBase<OtherDerived>& other)
-    {
-      return assign(other.derived());
-    }
+    inline Derived& operator=(const SparseMatrixBase<OtherDerived>& other);
 
-    inline Derived& operator=(const Derived& other)
-    {
-//       if (other.isRValue())
-//         derived().swap(other.const_cast_derived());
-//       else
-      return assign(other.derived());
-    }
+    inline Derived& operator=(const Derived& other);
 
   protected:
 
     template<typename OtherDerived>
-    inline Derived& assign(const OtherDerived& other)
-    {
-      const bool transpose = (Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit);
-      const Index outerSize = (int(OtherDerived::Flags) & RowMajorBit) ? other.rows() : other.cols();
-      if ((!transpose) && other.isRValue())
-      {
-        // eval without temporary
-        derived().resize(other.rows(), other.cols());
-        derived().setZero();
-        derived().reserve((std::max)(this->rows(),this->cols())*2);
-        for (Index j=0; j<outerSize; ++j)
-        {
-          derived().startVec(j);
-          for (typename OtherDerived::InnerIterator it(other, j); it; ++it)
-          {
-            Scalar v = it.value();
-            derived().insertBackByOuterInner(j,it.index()) = v;
-          }
-        }
-        derived().finalize();
-      }
-      else
-      {
-        assignGeneric(other);
-      }
-      return derived();
-    }
+    inline Derived& assign(const OtherDerived& other);
 
     template<typename OtherDerived>
-    inline void assignGeneric(const OtherDerived& other)
-    {
-      //const bool transpose = (Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit);
-      eigen_assert(( ((internal::traits<Derived>::SupportedAccessPatterns&OuterRandomAccessPattern)==OuterRandomAccessPattern) ||
-                  (!((Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit)))) &&
-                  "the transpose operation is supposed to be handled in SparseMatrix::operator=");
-
-      enum { Flip = (Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit) };
-
-      const Index outerSize = other.outerSize();
-      //typedef typename internal::conditional<transpose, LinkedVectorMatrix<Scalar,Flags&RowMajorBit>, Derived>::type TempType;
-      // thanks to shallow copies, we always eval to a tempary
-      Derived temp(other.rows(), other.cols());
-
-      temp.reserve((std::max)(this->rows(),this->cols())*2);
-      for (Index j=0; j<outerSize; ++j)
-      {
-        temp.startVec(j);
-        for (typename OtherDerived::InnerIterator it(other.derived(), j); it; ++it)
-        {
-          Scalar v = it.value();
-          temp.insertBackByOuterInner(Flip?it.index():j,Flip?j:it.index()) = v;
-        }
-      }
-      temp.finalize();
-
-      derived() = temp.markAsRValue();
-    }
+    inline void assignGeneric(const OtherDerived& other);
 
   public:
 
-    template<typename Lhs, typename Rhs>
-    inline Derived& operator=(const SparseSparseProduct<Lhs,Rhs>& product);
-
     friend std::ostream & operator << (std::ostream & s, const SparseMatrixBase& m)
     {
       typedef typename Derived::Nested Nested;
@@ -302,8 +242,8 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
         }
         else
         {
-          SparseMatrix<Scalar, RowMajorBit> trans = m;
-          s << static_cast<const SparseMatrixBase<SparseMatrix<Scalar, RowMajorBit> >&>(trans);
+          SparseMatrix<Scalar, RowMajorBit, StorageIndex> trans = m;
+          s << static_cast<const SparseMatrixBase<SparseMatrix<Scalar, RowMajorBit, StorageIndex> >&>(trans);
         }
       }
       return s;
@@ -313,55 +253,60 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
     Derived& operator+=(const SparseMatrixBase<OtherDerived>& other);
     template<typename OtherDerived>
     Derived& operator-=(const SparseMatrixBase<OtherDerived>& other);
+    
+    template<typename OtherDerived>
+    Derived& operator+=(const DiagonalBase<OtherDerived>& other);
+    template<typename OtherDerived>
+    Derived& operator-=(const DiagonalBase<OtherDerived>& other);
 
     Derived& operator*=(const Scalar& other);
     Derived& operator/=(const Scalar& other);
 
-    #define EIGEN_SPARSE_CWISE_PRODUCT_RETURN_TYPE \
-      CwiseBinaryOp< \
-        internal::scalar_product_op< \
-          typename internal::scalar_product_traits< \
-            typename internal::traits<Derived>::Scalar, \
-            typename internal::traits<OtherDerived>::Scalar \
-          >::ReturnType \
-        >, \
-        const Derived, \
-        const OtherDerived \
-      >
+    template<typename OtherDerived> struct CwiseProductDenseReturnType {
+      typedef CwiseBinaryOp<internal::scalar_product_op<typename internal::scalar_product_traits<
+                                                          typename internal::traits<Derived>::Scalar,
+                                                          typename internal::traits<OtherDerived>::Scalar
+                                                        >::ReturnType>,
+                            const Derived,
+                            const OtherDerived
+                          > Type;
+    };
 
     template<typename OtherDerived>
-    EIGEN_STRONG_INLINE const EIGEN_SPARSE_CWISE_PRODUCT_RETURN_TYPE
+    EIGEN_STRONG_INLINE const typename CwiseProductDenseReturnType<OtherDerived>::Type
     cwiseProduct(const MatrixBase<OtherDerived> &other) const;
 
-    // sparse * sparse
-    template<typename OtherDerived>
-    const typename SparseSparseProductReturnType<Derived,OtherDerived>::Type
-    operator*(const SparseMatrixBase<OtherDerived> &other) const;
-
     // sparse * diagonal
     template<typename OtherDerived>
-    const SparseDiagonalProduct<Derived,OtherDerived>
-    operator*(const DiagonalBase<OtherDerived> &other) const;
+    const Product<Derived,OtherDerived>
+    operator*(const DiagonalBase<OtherDerived> &other) const
+    { return Product<Derived,OtherDerived>(derived(), other.derived()); }
 
     // diagonal * sparse
     template<typename OtherDerived> friend
-    const SparseDiagonalProduct<OtherDerived,Derived>
+    const Product<OtherDerived,Derived>
     operator*(const DiagonalBase<OtherDerived> &lhs, const SparseMatrixBase& rhs)
-    { return SparseDiagonalProduct<OtherDerived,Derived>(lhs.derived(), rhs.derived()); }
-
-    /** dense * sparse (return a dense object unless it is an outer product) */
-    template<typename OtherDerived> friend
-    const typename DenseSparseProductReturnType<OtherDerived,Derived>::Type
-    operator*(const MatrixBase<OtherDerived>& lhs, const Derived& rhs)
-    { return typename DenseSparseProductReturnType<OtherDerived,Derived>::Type(lhs.derived(),rhs); }
-
-    /** sparse * dense (returns a dense object unless it is an outer product) */
+    { return Product<OtherDerived,Derived>(lhs.derived(), rhs.derived()); }
+    
+    // sparse * sparse
     template<typename OtherDerived>
-    const typename SparseDenseProductReturnType<Derived,OtherDerived>::Type
-    operator*(const MatrixBase<OtherDerived> &other) const;
+    const Product<Derived,OtherDerived,AliasFreeProduct>
+    operator*(const SparseMatrixBase<OtherDerived> &other) const;
+    
+    // sparse * dense
+    template<typename OtherDerived>
+    const Product<Derived,OtherDerived>
+    operator*(const MatrixBase<OtherDerived> &other) const
+    { return Product<Derived,OtherDerived>(derived(), other.derived()); }
+    
+    // dense * sparse
+    template<typename OtherDerived> friend
+    const Product<OtherDerived,Derived>
+    operator*(const MatrixBase<OtherDerived> &lhs, const SparseMatrixBase& rhs)
+    { return Product<OtherDerived,Derived>(lhs.derived(), rhs.derived()); }
     
      /** \returns an expression of P H P^-1 where H is the matrix represented by \c *this */
-    SparseSymmetricPermutationProduct<Derived,Upper|Lower> twistedBy(const PermutationMatrix<Dynamic,Dynamic,Index>& perm) const
+    SparseSymmetricPermutationProduct<Derived,Upper|Lower> twistedBy(const PermutationMatrix<Dynamic,Dynamic,StorageIndex>& perm) const
     {
       return SparseSymmetricPermutationProduct<Derived,Upper|Lower>(derived(), perm);
     }
@@ -369,22 +314,16 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
     template<typename OtherDerived>
     Derived& operator*=(const SparseMatrixBase<OtherDerived>& other);
 
-    #ifdef EIGEN2_SUPPORT
-    // deprecated
-    template<typename OtherDerived>
-    typename internal::plain_matrix_type_column_major<OtherDerived>::type
-    solveTriangular(const MatrixBase<OtherDerived>& other) const;
-
-    // deprecated
-    template<typename OtherDerived>
-    void solveTriangularInPlace(MatrixBase<OtherDerived>& other) const;
-    #endif // EIGEN2_SUPPORT
-
     template<int Mode>
-    inline const SparseTriangularView<Derived, Mode> triangularView() const;
+    inline const TriangularView<const Derived, Mode> triangularView() const;
+    
+    template<unsigned int UpLo> struct SelfAdjointViewReturnType { typedef SparseSelfAdjointView<Derived, UpLo> Type; };
+    template<unsigned int UpLo> struct ConstSelfAdjointViewReturnType { typedef const SparseSelfAdjointView<const Derived, UpLo> Type; };
 
-    template<unsigned int UpLo> inline const SparseSelfAdjointView<Derived, UpLo> selfadjointView() const;
-    template<unsigned int UpLo> inline SparseSelfAdjointView<Derived, UpLo> selfadjointView();
+    template<unsigned int UpLo> inline 
+    typename ConstSelfAdjointViewReturnType<UpLo>::Type selfadjointView() const;
+    template<unsigned int UpLo> inline
+    typename SelfAdjointViewReturnType<UpLo>::Type selfadjointView();
 
     template<typename OtherDerived> Scalar dot(const MatrixBase<OtherDerived>& other) const;
     template<typename OtherDerived> Scalar dot(const SparseMatrixBase<OtherDerived>& other) const;
@@ -392,9 +331,9 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
     RealScalar norm()  const;
     RealScalar blueNorm() const;
 
-    Transpose<Derived> transpose() { return derived(); }
-    const Transpose<const Derived> transpose() const { return derived(); }
-    const AdjointReturnType adjoint() const { return transpose(); }
+    TransposeReturnType transpose() { return TransposeReturnType(derived()); }
+    const ConstTransposeReturnType transpose() const { return ConstTransposeReturnType(derived()); }
+    const AdjointReturnType adjoint() const { return AdjointReturnType(transpose()); }
 
     // inner-vector
     typedef Block<Derived,IsRowMajor?1:Dynamic,IsRowMajor?Dynamic:1,true>       InnerVectorReturnType;
@@ -403,28 +342,19 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
     const ConstInnerVectorReturnType innerVector(Index outer) const;
 
     // set of inner-vectors
-    Block<Derived,Dynamic,Dynamic,true> innerVectors(Index outerStart, Index outerSize);
-    const Block<const Derived,Dynamic,Dynamic,true> innerVectors(Index outerStart, Index outerSize) const;
-
-    /** \internal use operator= */
-    template<typename DenseDerived>
-    void evalTo(MatrixBase<DenseDerived>& dst) const
-    {
-      dst.setZero();
-      for (Index j=0; j<outerSize(); ++j)
-        for (typename Derived::InnerIterator i(derived(),j); i; ++i)
-          dst.coeffRef(i.row(),i.col()) = i.value();
-    }
+    typedef Block<Derived,Dynamic,Dynamic,true> InnerVectorsReturnType;
+    typedef Block<const Derived,Dynamic,Dynamic,true> ConstInnerVectorsReturnType;
+    InnerVectorsReturnType innerVectors(Index outerStart, Index outerSize);
+    const ConstInnerVectorsReturnType innerVectors(Index outerStart, Index outerSize) const;
 
-    Matrix<Scalar,RowsAtCompileTime,ColsAtCompileTime> toDense() const
+    DenseMatrixType toDense() const
     {
-      return derived();
+      return DenseMatrixType(derived());
     }
 
     template<typename OtherDerived>
     bool isApprox(const SparseMatrixBase<OtherDerived>& other,
-                  const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const
-    { return toDense().isApprox(other.toDense(),prec); }
+                  const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
 
     template<typename OtherDerived>
     bool isApprox(const MatrixBase<OtherDerived>& other,
@@ -440,10 +370,19 @@ template<typename Derived> class SparseMatrixBase : public EigenBase<Derived>
     { return typename internal::eval<Derived>::type(derived()); }
 
     Scalar sum() const;
+    
+    inline const SparseView<Derived>
+    pruned(const Scalar& reference = Scalar(0), const RealScalar& epsilon = NumTraits<Scalar>::dummy_precision()) const;
 
   protected:
 
     bool m_isRValue;
+
+    static inline StorageIndex convert_index(const Index idx) {
+      return internal::convert_index<StorageIndex>(idx);
+    }
+  private:
+    template<typename Dest> void evalTo(Dest &) const;
 };
 
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/SparseCore/SparsePermutation.h b/nuparu/include/Eigen/src/SparseCore/SparsePermutation.h
index b897b759..ef38357a 100644
--- a/nuparu/include/Eigen/src/SparseCore/SparsePermutation.h
+++ b/nuparu/include/Eigen/src/SparseCore/SparsePermutation.h
@@ -16,131 +16,161 @@ namespace Eigen {
 
 namespace internal {
 
-template<typename PermutationType, typename MatrixType, int Side, bool Transposed>
-struct traits<permut_sparsematrix_product_retval<PermutationType, MatrixType, Side, Transposed> >
+template<typename ExpressionType, int Side, bool Transposed>
+struct permutation_matrix_product<ExpressionType, Side, Transposed, SparseShape> // Applies a permutation to a sparse expression; Side and Transposed select which index gets permuted (see run()).
 {
-  typedef typename remove_all<typename MatrixType::Nested>::type MatrixTypeNestedCleaned;
-  typedef typename MatrixTypeNestedCleaned::Scalar Scalar;
-  typedef typename MatrixTypeNestedCleaned::Index Index;
-  enum {
-    SrcStorageOrder = MatrixTypeNestedCleaned::Flags&RowMajorBit ? RowMajor : ColMajor,
-    MoveOuter = SrcStorageOrder==RowMajor ? Side==OnTheLeft : Side==OnTheRight
-  };
+    typedef typename nested_eval<ExpressionType, 1>::type MatrixType; // type through which the expression is bound inside run()
+    typedef typename remove_all<MatrixType>::type MatrixTypeCleaned;
 
-  typedef typename internal::conditional<MoveOuter,
-        SparseMatrix<Scalar,SrcStorageOrder,Index>,
-        SparseMatrix<Scalar,int(SrcStorageOrder)==RowMajor?ColMajor:RowMajor,Index> >::type ReturnType;
-};
-
-template<typename PermutationType, typename MatrixType, int Side, bool Transposed>
-struct permut_sparsematrix_product_retval
- : public ReturnByValue<permut_sparsematrix_product_retval<PermutationType, MatrixType, Side, Transposed> >
-{
-    typedef typename remove_all<typename MatrixType::Nested>::type MatrixTypeNestedCleaned;
-    typedef typename MatrixTypeNestedCleaned::Scalar Scalar;
-    typedef typename MatrixTypeNestedCleaned::Index Index;
+    typedef typename MatrixTypeCleaned::Scalar Scalar;
+    typedef typename MatrixTypeCleaned::StorageIndex StorageIndex;
 
     enum {
-      SrcStorageOrder = MatrixTypeNestedCleaned::Flags&RowMajorBit ? RowMajor : ColMajor,
+      SrcStorageOrder = MatrixTypeCleaned::Flags&RowMajorBit ? RowMajor : ColMajor,
       MoveOuter = SrcStorageOrder==RowMajor ? Side==OnTheLeft : Side==OnTheRight
     };
+    // ReturnType keeps the source storage order when MoveOuter is set, and uses the opposite order otherwise.
+    typedef typename internal::conditional<MoveOuter,
+        SparseMatrix<Scalar,SrcStorageOrder,StorageIndex>,
+        SparseMatrix<Scalar,int(SrcStorageOrder)==RowMajor?ColMajor:RowMajor,StorageIndex> >::type ReturnType;
 
-    permut_sparsematrix_product_retval(const PermutationType& perm, const MatrixType& matrix)
-      : m_permutation(perm), m_matrix(matrix)
-    {}
-
-    inline int rows() const { return m_matrix.rows(); }
-    inline int cols() const { return m_matrix.cols(); }
-
-    template<typename Dest> inline void evalTo(Dest& dst) const
+    template<typename Dest,typename PermutationType>
+    static inline void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr) // Writes the permuted copy of xpr into dst.
     {
+      MatrixType mat(xpr); // bind the expression through its nested_eval type
       if(MoveOuter)
       {
-        SparseMatrix<Scalar,SrcStorageOrder,Index> tmp(m_matrix.rows(), m_matrix.cols());
-        VectorXi sizes(m_matrix.outerSize());
-        for(Index j=0; j<m_matrix.outerSize(); ++j)
+        SparseMatrix<Scalar,SrcStorageOrder,StorageIndex> tmp(mat.rows(), mat.cols());
+        Matrix<StorageIndex,Dynamic,1> sizes(mat.outerSize());
+        for(Index j=0; j<mat.outerSize(); ++j) // pass 1: record, per destination inner vector, the nonzero count of its source inner vector
         {
-          Index jp = m_permutation.indices().coeff(j);
-          sizes[((Side==OnTheLeft) ^ Transposed) ? jp : j] = m_matrix.innerVector(((Side==OnTheRight) ^ Transposed) ? jp : j).size();
+          Index jp = perm.indices().coeff(j);
+          sizes[((Side==OnTheLeft) ^ Transposed) ? jp : j] = StorageIndex(mat.innerVector(((Side==OnTheRight) ^ Transposed) ? jp : j).nonZeros());
         }
         tmp.reserve(sizes);
-        for(Index j=0; j<m_matrix.outerSize(); ++j)
+        for(Index j=0; j<mat.outerSize(); ++j) // pass 2: copy every source inner vector (jsrc) into its destination slot (jdst)
         {
-          Index jp = m_permutation.indices().coeff(j);
+          Index jp = perm.indices().coeff(j);
           Index jsrc = ((Side==OnTheRight) ^ Transposed) ? jp : j;
           Index jdst = ((Side==OnTheLeft) ^ Transposed) ? jp : j;
-          for(typename MatrixTypeNestedCleaned::InnerIterator it(m_matrix,jsrc); it; ++it)
+          for(typename MatrixTypeCleaned::InnerIterator it(mat,jsrc); it; ++it)
             tmp.insertByOuterInner(jdst,it.index()) = it.value();
         }
         dst = tmp;
       }
       else
       {
-        SparseMatrix<Scalar,int(SrcStorageOrder)==RowMajor?ColMajor:RowMajor,Index> tmp(m_matrix.rows(), m_matrix.cols());
-        VectorXi sizes(tmp.outerSize());
+        SparseMatrix<Scalar,int(SrcStorageOrder)==RowMajor?ColMajor:RowMajor,StorageIndex> tmp(mat.rows(), mat.cols()); // temporary uses the opposite storage order of the source
+        Matrix<StorageIndex,Dynamic,1> sizes(tmp.outerSize());
         sizes.setZero();
-        PermutationMatrix<Dynamic,Dynamic,Index> perm;
+        PermutationMatrix<Dynamic,Dynamic,StorageIndex> perm_cpy; // holds either perm or perm.transpose() so both cases share the loops below
         if((Side==OnTheLeft) ^ Transposed)
-          perm = m_permutation;
+          perm_cpy = perm;
         else
-          perm = m_permutation.transpose();
+          perm_cpy = perm.transpose();
 
-        for(Index j=0; j<m_matrix.outerSize(); ++j)
-          for(typename MatrixTypeNestedCleaned::InnerIterator it(m_matrix,j); it; ++it)
-            sizes[perm.indices().coeff(it.index())]++;
+        for(Index j=0; j<mat.outerSize(); ++j) // pass 1: count entries per destination outer index (the permuted source inner index)
+          for(typename MatrixTypeCleaned::InnerIterator it(mat,j); it; ++it)
+            sizes[perm_cpy.indices().coeff(it.index())]++;
         tmp.reserve(sizes);
-        for(Index j=0; j<m_matrix.outerSize(); ++j)
-          for(typename MatrixTypeNestedCleaned::InnerIterator it(m_matrix,j); it; ++it)
-            tmp.insertByOuterInner(perm.indices().coeff(it.index()),j) = it.value();
+        for(Index j=0; j<mat.outerSize(); ++j) // pass 2: scatter each entry; the permuted inner index becomes tmp's outer index
+          for(typename MatrixTypeCleaned::InnerIterator it(mat,j); it; ++it)
+            tmp.insertByOuterInner(perm_cpy.indices().coeff(it.index()),j) = it.value();
         dst = tmp;
       }
     }
-
-  protected:
-    const PermutationType& m_permutation;
-    typename MatrixType::Nested m_matrix;
 };
 
 }
 
+namespace internal {
+
+template <int ProductTag> struct product_promote_storage_type<Sparse,             PermutationStorage, ProductTag> { typedef Sparse ret; };
+template <int ProductTag> struct product_promote_storage_type<PermutationStorage, Sparse,             ProductTag> { typedef Sparse ret; };
+
+// TODO: the following two specializations are only needed to define the right temporary type through
+// typename permutation_matrix_product<Xpr,Side,false,SparseShape>::ReturnType,
+// whereas it should be correctly handled by traits<Product<> >::PlainObject.
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, AliasFreeProduct>, ProductTag, PermutationShape, SparseShape> // permutation * sparse: evaluates the product into m_result and exposes it through the base evaluator
+  : public evaluator<typename permutation_matrix_product<Rhs,OnTheLeft,false,SparseShape>::ReturnType>
+{
+  typedef Product<Lhs, Rhs, AliasFreeProduct> XprType;
+  typedef typename permutation_matrix_product<Rhs,OnTheLeft,false,SparseShape>::ReturnType PlainObject; // type of the temporary holding the result
+  typedef evaluator<PlainObject> Base;
+
+  enum {
+    Flags = Base::Flags | EvalBeforeNestingBit // flags of the temporary's evaluator plus EvalBeforeNestingBit
+  };
+
+  explicit product_evaluator(const XprType& xpr)
+    : m_result(xpr.rows(), xpr.cols())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result); // placement-new the Base sub-object so that it evaluates m_result
+    generic_product_impl<Lhs, Rhs, PermutationShape, SparseShape, ProductTag>::evalTo(m_result, xpr.lhs(), xpr.rhs()); // compute the product into the temporary
+  }
 
+protected:
+  PlainObject m_result; // storage for the evaluated product
+};
+
+template<typename Lhs, typename Rhs, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, AliasFreeProduct>, ProductTag, SparseShape, PermutationShape > // sparse * permutation: evaluates the product into m_result and exposes it through the base evaluator
+  : public evaluator<typename permutation_matrix_product<Lhs,OnTheRight,false,SparseShape>::ReturnType>
+{
+  typedef Product<Lhs, Rhs, AliasFreeProduct> XprType;
+  typedef typename permutation_matrix_product<Lhs,OnTheRight,false,SparseShape>::ReturnType PlainObject; // type of the temporary holding the result
+  typedef evaluator<PlainObject> Base;
+
+  enum {
+    Flags = Base::Flags | EvalBeforeNestingBit // flags of the temporary's evaluator plus EvalBeforeNestingBit
+  };
+
+  explicit product_evaluator(const XprType& xpr)
+    : m_result(xpr.rows(), xpr.cols())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result); // placement-new the Base sub-object so that it evaluates m_result
+    generic_product_impl<Lhs, Rhs, SparseShape, PermutationShape, ProductTag>::evalTo(m_result, xpr.lhs(), xpr.rhs()); // compute the product into the temporary
+  }
+
+protected:
+  PlainObject m_result; // storage for the evaluated product
+};
+
+} // end namespace internal
 
 /** \returns the matrix with the permutation applied to the columns
   */
 template<typename SparseDerived, typename PermDerived>
-inline const internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheRight, false>
+inline const Product<SparseDerived, PermDerived, AliasFreeProduct> // sparse * permutation, returned as a Product expression object
 operator*(const SparseMatrixBase<SparseDerived>& matrix, const PermutationBase<PermDerived>& perm)
-{
-  return internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheRight, false>(perm, matrix.derived());
-}
+{ return Product<SparseDerived, PermDerived, AliasFreeProduct>(matrix.derived(), perm.derived()); }
 
 /** \returns the matrix with the permutation applied to the rows
   */
 template<typename SparseDerived, typename PermDerived>
-inline const internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheLeft, false>
+inline const Product<PermDerived, SparseDerived, AliasFreeProduct> // permutation * sparse, returned as a Product expression object
 operator*( const PermutationBase<PermDerived>& perm, const SparseMatrixBase<SparseDerived>& matrix)
-{
-  return internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheLeft, false>(perm, matrix.derived());
-}
-
+{ return  Product<PermDerived, SparseDerived, AliasFreeProduct>(perm.derived(), matrix.derived()); }
 
 
 /** \returns the matrix with the inverse permutation applied to the columns.
   */
-template<typename SparseDerived, typename PermDerived>
-inline const internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheRight, true>
-operator*(const SparseMatrixBase<SparseDerived>& matrix, const Transpose<PermutationBase<PermDerived> >& tperm)
+template<typename SparseDerived, typename PermutationType>
+inline const Product<SparseDerived, Inverse<PermutationType>, AliasFreeProduct> // sparse * inverse-permutation, returned as a Product expression object
+operator*(const SparseMatrixBase<SparseDerived>& matrix, const InverseImpl<PermutationType, PermutationStorage>& tperm) // tperm is accepted as InverseImpl and forwarded through derived()
 {
-  return internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheRight, true>(tperm.nestedPermutation(), matrix.derived());
+  return Product<SparseDerived, Inverse<PermutationType>, AliasFreeProduct>(matrix.derived(), tperm.derived());
 }
 
 /** \returns the matrix with the inverse permutation applied to the rows.
   */
-template<typename SparseDerived, typename PermDerived>
-inline const internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheLeft, true>
-operator*(const Transpose<PermutationBase<PermDerived> >& tperm, const SparseMatrixBase<SparseDerived>& matrix)
+template<typename SparseDerived, typename PermutationType>
+inline const Product<Inverse<PermutationType>, SparseDerived, AliasFreeProduct> // inverse-permutation * sparse, returned as a Product expression object
+operator*(const InverseImpl<PermutationType,PermutationStorage>& tperm, const SparseMatrixBase<SparseDerived>& matrix) // tperm is accepted as InverseImpl and forwarded through derived()
 {
-  return internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheLeft, true>(tperm.nestedPermutation(), matrix.derived());
+  return Product<Inverse<PermutationType>, SparseDerived, AliasFreeProduct>(tperm.derived(), matrix.derived());
 }
 
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/SparseCore/SparseProduct.h b/nuparu/include/Eigen/src/SparseCore/SparseProduct.h
index 70b6480e..cbd0db71 100644
--- a/nuparu/include/Eigen/src/SparseCore/SparseProduct.h
+++ b/nuparu/include/Eigen/src/SparseCore/SparseProduct.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -12,157 +12,6 @@
 
 namespace Eigen { 
 
-template<typename Lhs, typename Rhs>
-struct SparseSparseProductReturnType
-{
-  typedef typename internal::traits<Lhs>::Scalar Scalar;
-  enum {
-    LhsRowMajor = internal::traits<Lhs>::Flags & RowMajorBit,
-    RhsRowMajor = internal::traits<Rhs>::Flags & RowMajorBit,
-    TransposeRhs = (!LhsRowMajor) && RhsRowMajor,
-    TransposeLhs = LhsRowMajor && (!RhsRowMajor)
-  };
-
-  typedef typename internal::conditional<TransposeLhs,
-    SparseMatrix<Scalar,0>,
-    typename internal::nested<Lhs,Rhs::RowsAtCompileTime>::type>::type LhsNested;
-
-  typedef typename internal::conditional<TransposeRhs,
-    SparseMatrix<Scalar,0>,
-    typename internal::nested<Rhs,Lhs::RowsAtCompileTime>::type>::type RhsNested;
-
-  typedef SparseSparseProduct<LhsNested, RhsNested> Type;
-};
-
-namespace internal {
-template<typename LhsNested, typename RhsNested>
-struct traits<SparseSparseProduct<LhsNested, RhsNested> >
-{
-  typedef MatrixXpr XprKind;
-  // clean the nested types:
-  typedef typename remove_all<LhsNested>::type _LhsNested;
-  typedef typename remove_all<RhsNested>::type _RhsNested;
-  typedef typename _LhsNested::Scalar Scalar;
-  typedef typename promote_index_type<typename traits<_LhsNested>::Index,
-                                         typename traits<_RhsNested>::Index>::type Index;
-
-  enum {
-    LhsCoeffReadCost = _LhsNested::CoeffReadCost,
-    RhsCoeffReadCost = _RhsNested::CoeffReadCost,
-    LhsFlags = _LhsNested::Flags,
-    RhsFlags = _RhsNested::Flags,
-
-    RowsAtCompileTime    = _LhsNested::RowsAtCompileTime,
-    ColsAtCompileTime    = _RhsNested::ColsAtCompileTime,
-    MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime,
-
-    InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime),
-
-    EvalToRowMajor = (RhsFlags & LhsFlags & RowMajorBit),
-
-    RemovedBits = ~(EvalToRowMajor ? 0 : RowMajorBit),
-
-    Flags = (int(LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
-          | EvalBeforeAssigningBit
-          | EvalBeforeNestingBit,
-
-    CoeffReadCost = Dynamic
-  };
-
-  typedef Sparse StorageKind;
-};
-
-} // end namespace internal
-
-template<typename LhsNested, typename RhsNested>
-class SparseSparseProduct : internal::no_assignment_operator,
-  public SparseMatrixBase<SparseSparseProduct<LhsNested, RhsNested> >
-{
-  public:
-
-    typedef SparseMatrixBase<SparseSparseProduct> Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(SparseSparseProduct)
-
-  private:
-
-    typedef typename internal::traits<SparseSparseProduct>::_LhsNested _LhsNested;
-    typedef typename internal::traits<SparseSparseProduct>::_RhsNested _RhsNested;
-
-  public:
-
-    template<typename Lhs, typename Rhs>
-    EIGEN_STRONG_INLINE SparseSparseProduct(const Lhs& lhs, const Rhs& rhs)
-      : m_lhs(lhs), m_rhs(rhs), m_tolerance(0), m_conservative(true)
-    {
-      init();
-    }
-
-    template<typename Lhs, typename Rhs>
-    EIGEN_STRONG_INLINE SparseSparseProduct(const Lhs& lhs, const Rhs& rhs, const RealScalar& tolerance)
-      : m_lhs(lhs), m_rhs(rhs), m_tolerance(tolerance), m_conservative(false)
-    {
-      init();
-    }
-
-    SparseSparseProduct pruned(const Scalar& reference = 0, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision()) const
-    {
-      using std::abs;
-      return SparseSparseProduct(m_lhs,m_rhs,abs(reference)*epsilon);
-    }
-
-    template<typename Dest>
-    void evalTo(Dest& result) const
-    {
-      if(m_conservative)
-        internal::conservative_sparse_sparse_product_selector<_LhsNested, _RhsNested, Dest>::run(lhs(),rhs(),result);
-      else
-        internal::sparse_sparse_product_with_pruning_selector<_LhsNested, _RhsNested, Dest>::run(lhs(),rhs(),result,m_tolerance);
-    }
-
-    EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
-
-    EIGEN_STRONG_INLINE const _LhsNested& lhs() const { return m_lhs; }
-    EIGEN_STRONG_INLINE const _RhsNested& rhs() const { return m_rhs; }
-
-  protected:
-    void init()
-    {
-      eigen_assert(m_lhs.cols() == m_rhs.rows());
-
-      enum {
-        ProductIsValid = _LhsNested::ColsAtCompileTime==Dynamic
-                      || _RhsNested::RowsAtCompileTime==Dynamic
-                      || int(_LhsNested::ColsAtCompileTime)==int(_RhsNested::RowsAtCompileTime),
-        AreVectors = _LhsNested::IsVectorAtCompileTime && _RhsNested::IsVectorAtCompileTime,
-        SameSizes = EIGEN_PREDICATE_SAME_MATRIX_SIZE(_LhsNested,_RhsNested)
-      };
-      // note to the lost user:
-      //    * for a dot product use: v1.dot(v2)
-      //    * for a coeff-wise product use: v1.cwise()*v2
-      EIGEN_STATIC_ASSERT(ProductIsValid || !(AreVectors && SameSizes),
-        INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS)
-      EIGEN_STATIC_ASSERT(ProductIsValid || !(SameSizes && !AreVectors),
-        INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION)
-      EIGEN_STATIC_ASSERT(ProductIsValid || SameSizes, INVALID_MATRIX_PRODUCT)
-    }
-
-    LhsNested m_lhs;
-    RhsNested m_rhs;
-    RealScalar m_tolerance;
-    bool m_conservative;
-};
-
-// sparse = sparse * sparse
-template<typename Derived>
-template<typename Lhs, typename Rhs>
-inline Derived& SparseMatrixBase<Derived>::operator=(const SparseSparseProduct<Lhs,Rhs>& product)
-{
-  product.evalTo(derived());
-  return derived();
-}
-
 /** \returns an expression of the product of two sparse matrices.
   * By default a conservative product preserving the symbolic non zeros is performed.
   * The automatic pruning of the small values can be achieved by calling the pruned() function
@@ -176,12 +25,140 @@ inline Derived& SparseMatrixBase<Derived>::operator=(const SparseSparseProduct<L
   * */
 template<typename Derived>
 template<typename OtherDerived>
-inline const typename SparseSparseProductReturnType<Derived,OtherDerived>::Type
+inline const Product<Derived,OtherDerived,AliasFreeProduct> // sparse * sparse now yields a generic Product expression
 SparseMatrixBase<Derived>::operator*(const SparseMatrixBase<OtherDerived> &other) const
 {
-  return typename SparseSparseProductReturnType<Derived,OtherDerived>::Type(derived(), other.derived());
+  return Product<Derived,OtherDerived,AliasFreeProduct>(derived(), other.derived()); // wraps both operands in a Product expression object
 }
 
+namespace internal {
+
+// sparse * sparse: product kernels, dispatched on the shape of the destination (sparse or dense)
+template<typename Lhs, typename Rhs, int ProductType>
+struct generic_product_impl<Lhs, Rhs, SparseShape, SparseShape, ProductType>
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) // dst = lhs * rhs; forwards to the protected overload picked by a Shape tag
+  {
+    evalTo(dst, lhs, rhs, typename evaluator_traits<Dest>::Shape());
+  }
+
+  // dense += sparse * sparse (the defaulted enable_if pointer argument restricts this overload to Dest with DenseShape)
+  template<typename Dest,typename ActualLhs>
+  static void addTo(Dest& dst, const ActualLhs& lhs, const Rhs& rhs, int* = typename enable_if<is_same<typename evaluator_traits<Dest>::Shape,DenseShape>::value,int*>::type(0) )
+  {
+    typedef typename nested_eval<ActualLhs,Dynamic>::type LhsNested;
+    typedef typename nested_eval<Rhs,Dynamic>::type RhsNested;
+    LhsNested lhsNested(lhs); // bind both operands through their nested_eval types before running the kernel
+    RhsNested rhsNested(rhs);
+    internal::sparse_sparse_to_dense_product_selector<typename remove_all<LhsNested>::type,
+                                                      typename remove_all<RhsNested>::type, Dest>::run(lhsNested,rhsNested,dst);
+  }
+
+  // dense -= sparse * sparse (implemented as dst += (-lhs) * rhs; same DenseShape restriction as addTo)
+  template<typename Dest>
+  static void subTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, int* = typename enable_if<is_same<typename evaluator_traits<Dest>::Shape,DenseShape>::value,int*>::type(0) )
+  {
+    addTo(dst, -lhs, rhs);
+  }
+
+protected:
+
+  // sparse = sparse * sparse (runs conservative_sparse_sparse_product_selector on the nested operands)
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, SparseShape)
+  {
+    typedef typename nested_eval<Lhs,Dynamic>::type LhsNested;
+    typedef typename nested_eval<Rhs,Dynamic>::type RhsNested;
+    LhsNested lhsNested(lhs);
+    RhsNested rhsNested(rhs);
+    internal::conservative_sparse_sparse_product_selector<typename remove_all<LhsNested>::type,
+                                                          typename remove_all<RhsNested>::type, Dest>::run(lhsNested,rhsNested,dst);
+  }
+
+  // dense = sparse * sparse (zero the destination, then accumulate with addTo)
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, DenseShape)
+  {
+    dst.setZero();
+    addTo(dst, lhs, rhs);
+  }
+};
+
+// sparse * sparse-triangular: inherits the sparse * sparse implementation unchanged
+template<typename Lhs, typename Rhs, int ProductType>
+struct generic_product_impl<Lhs, Rhs, SparseShape, SparseTriangularShape, ProductType>
+ : public generic_product_impl<Lhs, Rhs, SparseShape, SparseShape, ProductType>
+{};
+
+// sparse-triangular * sparse: inherits the sparse * sparse implementation unchanged
+template<typename Lhs, typename Rhs, int ProductType>
+struct generic_product_impl<Lhs, Rhs, SparseTriangularShape, SparseShape, ProductType>
+ : public generic_product_impl<Lhs, Rhs, SparseShape, SparseShape, ProductType>
+{};
+
+// dense = sparse-product (can be sparse*sparse, sparse*perm, etc.)
+template< typename DstXprType, typename Lhs, typename Rhs>
+struct Assignment<DstXprType, Product<Lhs,Rhs,AliasFreeProduct>, internal::assign_op<typename DstXprType::Scalar>, Sparse2Dense>
+{
+  typedef Product<Lhs,Rhs,AliasFreeProduct> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &) // the functor argument is unnamed and unused
+  {
+    generic_product_impl<Lhs, Rhs>::evalTo(dst,src.lhs(),src.rhs()); // remaining template arguments of generic_product_impl come from its defaults
+  }
+};
+
+// dense += sparse-product (can be sparse*sparse, sparse*perm, etc.)
+template< typename DstXprType, typename Lhs, typename Rhs>
+struct Assignment<DstXprType, Product<Lhs,Rhs,AliasFreeProduct>, internal::add_assign_op<typename DstXprType::Scalar>, Sparse2Dense>
+{
+  typedef Product<Lhs,Rhs,AliasFreeProduct> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar> &) // the functor argument is unnamed and unused
+  {
+    generic_product_impl<Lhs, Rhs>::addTo(dst,src.lhs(),src.rhs()); // remaining template arguments of generic_product_impl come from its defaults
+  }
+};
+
+// dense -= sparse-product (can be sparse*sparse, sparse*perm, etc.)
+template< typename DstXprType, typename Lhs, typename Rhs>
+struct Assignment<DstXprType, Product<Lhs,Rhs,AliasFreeProduct>, internal::sub_assign_op<typename DstXprType::Scalar>, Sparse2Dense>
+{
+  typedef Product<Lhs,Rhs,AliasFreeProduct> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar> &) // the functor argument is unnamed and unused
+  {
+    generic_product_impl<Lhs, Rhs>::subTo(dst,src.lhs(),src.rhs()); // remaining template arguments of generic_product_impl come from its defaults
+  }
+};
+
+template<typename Lhs, typename Rhs, int Options>
+struct evaluator<SparseView<Product<Lhs, Rhs, Options> > > // evaluator of a SparseView wrapped around a product: computes the product with pruning into m_result
+ : public evaluator<typename Product<Lhs, Rhs, DefaultProduct>::PlainObject>
+{
+  typedef SparseView<Product<Lhs, Rhs, Options> > XprType;
+  typedef typename XprType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
+  
+  explicit evaluator(const XprType& xpr)
+    : m_result(xpr.rows(), xpr.cols())
+  {
+    using std::abs;
+    ::new (static_cast<Base*>(this)) Base(m_result); // placement-new the Base sub-object so that it evaluates m_result
+    typedef typename nested_eval<Lhs,Dynamic>::type LhsNested;
+    typedef typename nested_eval<Rhs,Dynamic>::type RhsNested;
+    LhsNested lhsNested(xpr.nestedExpression().lhs()); // operands are taken from the product nested inside the view
+    RhsNested rhsNested(xpr.nestedExpression().rhs());
+    
+    internal::sparse_sparse_product_with_pruning_selector<typename remove_all<LhsNested>::type,
+                                                          typename remove_all<RhsNested>::type, PlainObject>::run(lhsNested,rhsNested,m_result,
+                                                                                                                  abs(xpr.reference())*xpr.epsilon()); // pruning tolerance = |reference| * epsilon
+  }
+  
+protected:  
+  PlainObject m_result; // storage for the pruned product
+};
+
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_SPARSEPRODUCT_H
diff --git a/nuparu/include/Eigen/src/SparseCore/SparseRedux.h b/nuparu/include/Eigen/src/SparseCore/SparseRedux.h
index f3da93a7..50ebb2e5 100644
--- a/nuparu/include/Eigen/src/SparseCore/SparseRedux.h
+++ b/nuparu/include/Eigen/src/SparseCore/SparseRedux.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -18,8 +18,9 @@ SparseMatrixBase<Derived>::sum() const
 {
   eigen_assert(rows()>0 && cols()>0 && "you are using a non initialized matrix");
   Scalar res(0);
+  internal::evaluator<Derived> thisEval(derived());
   for (Index j=0; j<outerSize(); ++j)
-    for (typename Derived::InnerIterator iter(derived(),j); iter; ++iter)
+    for (typename internal::evaluator<Derived>::InnerIterator iter(thisEval,j); iter; ++iter)
       res += iter.value();
   return res;
 }
diff --git a/nuparu/include/Eigen/src/SparseCore/SparseRef.h b/nuparu/include/Eigen/src/SparseCore/SparseRef.h
new file mode 100644
index 00000000..19e06fc8
--- /dev/null
+++ b/nuparu/include/Eigen/src/SparseCore/SparseRef.h
@@ -0,0 +1,367 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_REF_H
+#define EIGEN_SPARSE_REF_H
+
+namespace Eigen {
+
+enum {
+  StandardCompressedFormat = 2
+};
+  
+namespace internal {
+
+template<typename Derived> class SparseRefBase;
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int _Options, typename _StrideType>
+struct traits<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> >
+  : public traits<SparseMatrix<MatScalar,MatOptions,MatIndex> >
+{
+  typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType;
+  enum {
+    Options = _Options,
+    Flags = traits<PlainObjectType>::Flags | CompressedAccessBit | NestByRefBit
+  };
+
+  template<typename Derived> struct match {
+    enum {
+      StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
+      MatchAtCompileTime = (Derived::Flags&CompressedAccessBit) && StorageOrderMatch
+    };
+    typedef typename internal::conditional<MatchAtCompileTime,internal::true_type,internal::false_type>::type type;
+  };
+  
+};
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int _Options, typename _StrideType>
+struct traits<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> >
+  : public traits<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> >
+{
+  enum {
+    Flags = (traits<SparseMatrix<MatScalar,MatOptions,MatIndex> >::Flags | CompressedAccessBit | NestByRefBit) & ~LvalueBit
+  };
+};
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int _Options, typename _StrideType>
+struct traits<Ref<SparseVector<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> >
+  : public traits<SparseVector<MatScalar,MatOptions,MatIndex> >
+{
+  typedef SparseVector<MatScalar,MatOptions,MatIndex> PlainObjectType;
+  enum {
+    Options = _Options,
+    Flags = traits<PlainObjectType>::Flags | CompressedAccessBit | NestByRefBit
+  };
+
+  template<typename Derived> struct match {
+    enum {
+      MatchAtCompileTime = (Derived::Flags&CompressedAccessBit) && Derived::IsVectorAtCompileTime
+    };
+    typedef typename internal::conditional<MatchAtCompileTime,internal::true_type,internal::false_type>::type type;
+  };
+
+};
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int _Options, typename _StrideType>
+struct traits<Ref<const SparseVector<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> >
+  : public traits<Ref<SparseVector<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> >
+{
+  enum {
+    Flags = (traits<SparseVector<MatScalar,MatOptions,MatIndex> >::Flags | CompressedAccessBit | NestByRefBit) & ~LvalueBit
+  };
+};
+
+template<typename Derived>
+struct traits<SparseRefBase<Derived> > : public traits<Derived> {};
+
+template<typename Derived> class SparseRefBase
+  : public SparseMapBase<Derived>
+{
+public:
+
+  typedef SparseMapBase<Derived> Base;
+  EIGEN_SPARSE_PUBLIC_INTERFACE(SparseRefBase)
+
+  SparseRefBase()
+    : Base(RowsAtCompileTime==Dynamic?0:RowsAtCompileTime,ColsAtCompileTime==Dynamic?0:ColsAtCompileTime, 0, 0, 0, 0, 0)
+  {}
+  
+protected:
+
+  template<typename Expression>
+  void construct(Expression& expr)
+  {
+    if(expr.outerIndexPtr()==0)
+      ::new (static_cast<Base*>(this)) Base(expr.size(), expr.nonZeros(), expr.innerIndexPtr(), expr.valuePtr());
+    else
+      ::new (static_cast<Base*>(this)) Base(expr.rows(), expr.cols(), expr.nonZeros(), expr.outerIndexPtr(), expr.innerIndexPtr(), expr.valuePtr(), expr.innerNonZeroPtr());
+  }
+};
+
+} // namespace internal
+
+
+/** 
+  * \ingroup Sparse_Module
+  *
+  * \brief A sparse matrix expression referencing an existing sparse expression
+  *
+  * \tparam PlainObjectType the equivalent sparse matrix type of the referenced data
+  * \tparam Options specifies whether a standard compressed format is required; \c Options is either \c #StandardCompressedFormat or \c 0.
+  *                The default is \c 0.
+  * \tparam StrideType Only used for dense Ref
+  *
+  * \sa class Ref
+  */
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+class Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType >
+  : public internal::SparseRefBase<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType > >
+{
+    typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType;
+    typedef internal::traits<Ref> Traits;
+    template<int OtherOptions>
+    inline Ref(const SparseMatrix<MatScalar,OtherOptions,MatIndex>& expr);
+    template<int OtherOptions>
+    inline Ref(const MappedSparseMatrix<MatScalar,OtherOptions,MatIndex>& expr);
+  public:
+
+    typedef internal::SparseRefBase<Ref> Base;
+    EIGEN_SPARSE_PUBLIC_INTERFACE(Ref)
+
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<int OtherOptions>
+    inline Ref(SparseMatrix<MatScalar,OtherOptions,MatIndex>& expr)
+    {
+      EIGEN_STATIC_ASSERT(bool(Traits::template match<SparseMatrix<MatScalar,OtherOptions,MatIndex> >::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+      eigen_assert( ((Options & int(StandardCompressedFormat))==0) || (expr.isCompressed()) );
+      Base::construct(expr.derived());
+    }
+    
+    template<int OtherOptions>
+    inline Ref(MappedSparseMatrix<MatScalar,OtherOptions,MatIndex>& expr)
+    {
+      EIGEN_STATIC_ASSERT(bool(Traits::template match<SparseMatrix<MatScalar,OtherOptions,MatIndex> >::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+      eigen_assert( ((Options & int(StandardCompressedFormat))==0) || (expr.isCompressed()) );
+      Base::construct(expr.derived());
+    }
+    
+    template<typename Derived>
+    inline Ref(const SparseCompressedBase<Derived>& expr)
+    #else
+    template<typename Derived>
+    inline Ref(SparseCompressedBase<Derived>& expr)
+    #endif
+    {
+      EIGEN_STATIC_ASSERT(bool(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
+      EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+      eigen_assert( ((Options & int(StandardCompressedFormat))==0) || (expr.isCompressed()) );
+      Base::construct(expr.const_cast_derived());
+    }
+};
+
+// this is the const ref version
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+class Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType>
+  : public internal::SparseRefBase<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+{
+    typedef SparseMatrix<MatScalar,MatOptions,MatIndex> TPlainObjectType;
+    typedef internal::traits<Ref> Traits;
+  public:
+
+    typedef internal::SparseRefBase<Ref> Base;
+    EIGEN_SPARSE_PUBLIC_INTERFACE(Ref)
+
+    template<typename Derived>
+    inline Ref(const SparseMatrixBase<Derived>& expr)
+    {
+      construct(expr.derived(), typename Traits::template match<Derived>::type());
+    }
+
+    inline Ref(const Ref& other) : Base(other) {
+      // the copy constructor deliberately does not copy m_object_bytes, to avoid an unnecessary malloc and copy
+    }
+
+    template<typename OtherRef>
+    inline Ref(const RefBase<OtherRef>& other) {
+      construct(other.derived(), typename Traits::template match<OtherRef>::type());
+    }
+
+  protected:
+
+    template<typename Expression>
+    void construct(const Expression& expr,internal::true_type)
+    {
+      if((Options & int(StandardCompressedFormat)) && (!expr.isCompressed()))
+      {
+        TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(m_object_bytes);
+        ::new (obj) TPlainObjectType(expr);
+        Base::construct(*obj);
+      }
+      else
+      {
+        Base::construct(expr);
+      }
+    }
+
+    template<typename Expression>
+    void construct(const Expression& expr, internal::false_type)
+    {
+      TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(m_object_bytes);
+      ::new (obj) TPlainObjectType(expr);
+      Base::construct(*obj);
+    }
+
+  protected:
+    char m_object_bytes[sizeof(TPlainObjectType)];
+};
+
+
+
+/**
+  * \ingroup Sparse_Module
+  *
+  * \brief A sparse vector expression referencing an existing sparse vector expression
+  *
+  * \tparam PlainObjectType the equivalent sparse vector type of the referenced data
+  * \tparam Options Not used for SparseVector.
+  * \tparam StrideType Only used for dense Ref
+  *
+  * \sa class Ref
+  */
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+class Ref<SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType >
+  : public internal::SparseRefBase<Ref<SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType > >
+{
+    typedef SparseVector<MatScalar,MatOptions,MatIndex> PlainObjectType;
+    typedef internal::traits<Ref> Traits;
+    template<int OtherOptions>
+    inline Ref(const SparseVector<MatScalar,OtherOptions,MatIndex>& expr);
+  public:
+
+    typedef internal::SparseRefBase<Ref> Base;
+    EIGEN_SPARSE_PUBLIC_INTERFACE(Ref)
+
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    template<int OtherOptions>
+    inline Ref(SparseVector<MatScalar,OtherOptions,MatIndex>& expr)
+    {
+      EIGEN_STATIC_ASSERT(bool(Traits::template match<SparseVector<MatScalar,OtherOptions,MatIndex> >::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+      Base::construct(expr.derived());
+    }
+
+    template<typename Derived>
+    inline Ref(const SparseCompressedBase<Derived>& expr)
+    #else
+    template<typename Derived>
+    inline Ref(SparseCompressedBase<Derived>& expr)
+    #endif
+    {
+      EIGEN_STATIC_ASSERT(bool(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
+      EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+      Base::construct(expr.const_cast_derived());
+    }
+};
+
+// this is the const ref version
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+class Ref<const SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType>
+  : public internal::SparseRefBase<Ref<const SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+{
+    typedef SparseVector<MatScalar,MatOptions,MatIndex> TPlainObjectType;
+    typedef internal::traits<Ref> Traits;
+  public:
+
+    typedef internal::SparseRefBase<Ref> Base;
+    EIGEN_SPARSE_PUBLIC_INTERFACE(Ref)
+
+    template<typename Derived>
+    inline Ref(const SparseMatrixBase<Derived>& expr)
+    {
+      construct(expr.derived(), typename Traits::template match<Derived>::type());
+    }
+
+    inline Ref(const Ref& other) : Base(other) {
+      // the copy constructor deliberately does not copy m_object_bytes, to avoid an unnecessary malloc and copy
+    }
+
+    template<typename OtherRef>
+    inline Ref(const RefBase<OtherRef>& other) {
+      construct(other.derived(), typename Traits::template match<OtherRef>::type());
+    }
+
+  protected:
+
+    template<typename Expression>
+    void construct(const Expression& expr,internal::true_type)
+    {
+      Base::construct(expr);
+    }
+
+    template<typename Expression>
+    void construct(const Expression& expr, internal::false_type)
+    {
+      TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(m_object_bytes);
+      ::new (obj) TPlainObjectType(expr);
+      Base::construct(*obj);
+    }
+
+  protected:
+    char m_object_bytes[sizeof(TPlainObjectType)];
+};
+
+namespace internal {
+
+// FIXME shall we introduce a general evaluator_ref that we can specialize once for any sparse object, and thus remove this copy-pasted code?
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct evaluator<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+  : evaluator<SparseCompressedBase<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > >
+{
+  typedef evaluator<SparseCompressedBase<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base;
+  typedef Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType;  
+  evaluator() : Base() {}
+  explicit evaluator(const XprType &mat) : Base(mat) {}
+};
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct evaluator<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+  : evaluator<SparseCompressedBase<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > >
+{
+  typedef evaluator<SparseCompressedBase<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base;
+  typedef Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType;  
+  evaluator() : Base() {}
+  explicit evaluator(const XprType &mat) : Base(mat) {}
+};
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct evaluator<Ref<SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+  : evaluator<SparseCompressedBase<Ref<SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType> > >
+{
+  typedef evaluator<SparseCompressedBase<Ref<SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base;
+  typedef Ref<SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType;
+  evaluator() : Base() {}
+  explicit evaluator(const XprType &mat) : Base(mat) {}
+};
+
+template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct evaluator<Ref<const SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
+  : evaluator<SparseCompressedBase<Ref<const SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType> > >
+{
+  typedef evaluator<SparseCompressedBase<Ref<const SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base;
+  typedef Ref<const SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType;
+  evaluator() : Base() {}
+  explicit evaluator(const XprType &mat) : Base(mat) {}
+};
+
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSE_REF_H
diff --git a/nuparu/include/Eigen/src/SparseCore/SparseSelfAdjointView.h b/nuparu/include/Eigen/src/SparseCore/SparseSelfAdjointView.h
index 0eda96bc..46c6ce1d 100644
--- a/nuparu/include/Eigen/src/SparseCore/SparseSelfAdjointView.h
+++ b/nuparu/include/Eigen/src/SparseCore/SparseSelfAdjointView.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -11,14 +11,14 @@
 #define EIGEN_SPARSE_SELFADJOINTVIEW_H
 
 namespace Eigen { 
-
+  
 /** \ingroup SparseCore_Module
   * \class SparseSelfAdjointView
   *
   * \brief Pseudo expression to manipulate a triangular sparse matrix as a selfadjoint matrix.
   *
   * \param MatrixType the type of the dense matrix storing the coefficients
-  * \param UpLo can be either \c #Lower or \c #Upper
+  * \param Mode can be either \c #Lower or \c #Upper
   *
   * This class is an expression of a sefladjoint matrix from a triangular part of a matrix
   * with given dense storage of the coefficients. It is the return type of MatrixBase::selfadjointView()
@@ -26,38 +26,39 @@ namespace Eigen {
   *
   * \sa SparseMatrixBase::selfadjointView()
   */
-template<typename Lhs, typename Rhs, int UpLo>
-class SparseSelfAdjointTimeDenseProduct;
-
-template<typename Lhs, typename Rhs, int UpLo>
-class DenseTimeSparseSelfAdjointProduct;
-
 namespace internal {
   
-template<typename MatrixType, unsigned int UpLo>
-struct traits<SparseSelfAdjointView<MatrixType,UpLo> > : traits<MatrixType> {
+template<typename MatrixType, unsigned int Mode>
+struct traits<SparseSelfAdjointView<MatrixType,Mode> > : traits<MatrixType> {
 };
 
-template<int SrcUpLo,int DstUpLo,typename MatrixType,int DestOrder>
-void permute_symm_to_symm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DestOrder,typename MatrixType::Index>& _dest, const typename MatrixType::Index* perm = 0);
+template<int SrcMode,int DstMode,typename MatrixType,int DestOrder>
+void permute_symm_to_symm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DestOrder,typename MatrixType::StorageIndex>& _dest, const typename MatrixType::StorageIndex* perm = 0);
 
-template<int UpLo,typename MatrixType,int DestOrder>
-void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DestOrder,typename MatrixType::Index>& _dest, const typename MatrixType::Index* perm = 0);
+template<int Mode,typename MatrixType,int DestOrder>
+void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DestOrder,typename MatrixType::StorageIndex>& _dest, const typename MatrixType::StorageIndex* perm = 0);
 
 }
 
-template<typename MatrixType, unsigned int UpLo> class SparseSelfAdjointView
-  : public EigenBase<SparseSelfAdjointView<MatrixType,UpLo> >
+template<typename MatrixType, unsigned int _Mode> class SparseSelfAdjointView
+  : public EigenBase<SparseSelfAdjointView<MatrixType,_Mode> >
 {
   public:
+    
+    enum {
+      Mode = _Mode,
+      RowsAtCompileTime = internal::traits<SparseSelfAdjointView>::RowsAtCompileTime,
+      ColsAtCompileTime = internal::traits<SparseSelfAdjointView>::ColsAtCompileTime
+    };
 
+    typedef EigenBase<SparseSelfAdjointView> Base;
     typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
-    typedef Matrix<Index,Dynamic,1> VectorI;
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef Matrix<StorageIndex,Dynamic,1> VectorI;
     typedef typename MatrixType::Nested MatrixTypeNested;
     typedef typename internal::remove_all<MatrixTypeNested>::type _MatrixTypeNested;
-
-    inline SparseSelfAdjointView(const MatrixType& matrix) : m_matrix(matrix)
+    
+    explicit inline SparseSelfAdjointView(const MatrixType& matrix) : m_matrix(matrix)
     {
       eigen_assert(rows()==cols() && "SelfAdjointView is only for squared matrices");
     }
@@ -75,10 +76,10 @@ template<typename MatrixType, unsigned int UpLo> class SparseSelfAdjointView
       * Indeed, the SparseSelfadjointView operand is first copied into a temporary SparseMatrix before computing the product.
       */
     template<typename OtherDerived>
-    SparseSparseProduct<typename OtherDerived::PlainObject, OtherDerived>
+    Product<SparseSelfAdjointView, OtherDerived>
     operator*(const SparseMatrixBase<OtherDerived>& rhs) const
     {
-      return SparseSparseProduct<typename OtherDerived::PlainObject, OtherDerived>(*this, rhs.derived());
+      return Product<SparseSelfAdjointView, OtherDerived>(*this, rhs.derived());
     }
 
     /** \returns an expression of the matrix product between a sparse matrix \a lhs and a sparse self-adjoint matrix \a rhs.
@@ -87,26 +88,26 @@ template<typename MatrixType, unsigned int UpLo> class SparseSelfAdjointView
       * Indeed, the SparseSelfadjointView operand is first copied into a temporary SparseMatrix before computing the product.
       */
     template<typename OtherDerived> friend
-    SparseSparseProduct<OtherDerived, typename OtherDerived::PlainObject >
+    Product<OtherDerived, SparseSelfAdjointView>
     operator*(const SparseMatrixBase<OtherDerived>& lhs, const SparseSelfAdjointView& rhs)
     {
-      return SparseSparseProduct<OtherDerived, typename OtherDerived::PlainObject>(lhs.derived(), rhs);
+      return Product<OtherDerived, SparseSelfAdjointView>(lhs.derived(), rhs);
     }
     
     /** Efficient sparse self-adjoint matrix times dense vector/matrix product */
     template<typename OtherDerived>
-    SparseSelfAdjointTimeDenseProduct<MatrixType,OtherDerived,UpLo>
+    Product<SparseSelfAdjointView,OtherDerived>
     operator*(const MatrixBase<OtherDerived>& rhs) const
     {
-      return SparseSelfAdjointTimeDenseProduct<MatrixType,OtherDerived,UpLo>(m_matrix, rhs.derived());
+      return Product<SparseSelfAdjointView,OtherDerived>(*this, rhs.derived());
     }
 
     /** Efficient dense vector/matrix times sparse self-adjoint matrix product */
     template<typename OtherDerived> friend
-    DenseTimeSparseSelfAdjointProduct<OtherDerived,MatrixType,UpLo>
+    Product<OtherDerived,SparseSelfAdjointView>
     operator*(const MatrixBase<OtherDerived>& lhs, const SparseSelfAdjointView& rhs)
     {
-      return DenseTimeSparseSelfAdjointProduct<OtherDerived,_MatrixTypeNested,UpLo>(lhs.derived(), rhs.m_matrix);
+      return Product<OtherDerived,SparseSelfAdjointView>(lhs.derived(), rhs);
     }
 
     /** Perform a symmetric rank K update of the selfadjoint matrix \c *this:
@@ -120,56 +121,48 @@ template<typename MatrixType, unsigned int UpLo> class SparseSelfAdjointView
     template<typename DerivedU>
     SparseSelfAdjointView& rankUpdate(const SparseMatrixBase<DerivedU>& u, const Scalar& alpha = Scalar(1));
     
-    /** \internal triggered by sparse_matrix = SparseSelfadjointView; */
-    template<typename DestScalar,int StorageOrder> void evalTo(SparseMatrix<DestScalar,StorageOrder,Index>& _dest) const
-    {
-      internal::permute_symm_to_fullsymm<UpLo>(m_matrix, _dest);
-    }
-    
-    template<typename DestScalar> void evalTo(DynamicSparseMatrix<DestScalar,ColMajor,Index>& _dest) const
-    {
-      // TODO directly evaluate into _dest;
-      SparseMatrix<DestScalar,ColMajor,Index> tmp(_dest.rows(),_dest.cols());
-      internal::permute_symm_to_fullsymm<UpLo>(m_matrix, tmp);
-      _dest = tmp;
-    }
-    
     /** \returns an expression of P H P^-1 */
-    SparseSymmetricPermutationProduct<_MatrixTypeNested,UpLo> twistedBy(const PermutationMatrix<Dynamic,Dynamic,Index>& perm) const
+    // TODO implement twists in a more evaluator friendly fashion
+    SparseSymmetricPermutationProduct<_MatrixTypeNested,Mode> twistedBy(const PermutationMatrix<Dynamic,Dynamic,StorageIndex>& perm) const
     {
-      return SparseSymmetricPermutationProduct<_MatrixTypeNested,UpLo>(m_matrix, perm);
+      return SparseSymmetricPermutationProduct<_MatrixTypeNested,Mode>(m_matrix, perm);
     }
-    
-    template<typename SrcMatrixType,int SrcUpLo>
-    SparseSelfAdjointView& operator=(const SparseSymmetricPermutationProduct<SrcMatrixType,SrcUpLo>& permutedMatrix)
+
+    template<typename SrcMatrixType,int SrcMode>
+    SparseSelfAdjointView& operator=(const SparseSymmetricPermutationProduct<SrcMatrixType,SrcMode>& permutedMatrix)
     {
-      permutedMatrix.evalTo(*this);
+      internal::call_assignment_no_alias_no_transpose(*this, permutedMatrix);
       return *this;
     }
 
-
     SparseSelfAdjointView& operator=(const SparseSelfAdjointView& src)
     {
-      PermutationMatrix<Dynamic> pnull;
+      PermutationMatrix<Dynamic,Dynamic,StorageIndex> pnull;
       return *this = src.twistedBy(pnull);
     }
 
-    template<typename SrcMatrixType,unsigned int SrcUpLo>
-    SparseSelfAdjointView& operator=(const SparseSelfAdjointView<SrcMatrixType,SrcUpLo>& src)
+    template<typename SrcMatrixType,unsigned int SrcMode>
+    SparseSelfAdjointView& operator=(const SparseSelfAdjointView<SrcMatrixType,SrcMode>& src)
     {
-      PermutationMatrix<Dynamic> pnull;
+      PermutationMatrix<Dynamic,Dynamic,StorageIndex> pnull;
       return *this = src.twistedBy(pnull);
     }
     
-
-    // const SparseLLT<PlainObject, UpLo> llt() const;
-    // const SparseLDLT<PlainObject, UpLo> ldlt() const;
-
+    void resize(Index rows, Index cols)
+    {
+      EIGEN_ONLY_USED_FOR_DEBUG(rows);
+      EIGEN_ONLY_USED_FOR_DEBUG(cols);
+      eigen_assert(rows == this->rows() && cols == this->cols()
+                && "SparseSelfadjointView::resize() does not actually allow to resize.");
+    }
+    
   protected:
 
     typename MatrixType::Nested m_matrix;
-    mutable VectorI m_countPerRow;
-    mutable VectorI m_countPerCol;
+    //mutable VectorI m_countPerRow;
+    //mutable VectorI m_countPerCol;
+  private:
+    template<typename Dest> void evalTo(Dest &) const;
 };
 
 /***************************************************************************
@@ -178,145 +171,224 @@ template<typename MatrixType, unsigned int UpLo> class SparseSelfAdjointView
 
 template<typename Derived>
 template<unsigned int UpLo>
-const SparseSelfAdjointView<Derived, UpLo> SparseMatrixBase<Derived>::selfadjointView() const
+typename SparseMatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type SparseMatrixBase<Derived>::selfadjointView() const
 {
-  return derived();
+  return SparseSelfAdjointView<const Derived, UpLo>(derived());
 }
 
 template<typename Derived>
 template<unsigned int UpLo>
-SparseSelfAdjointView<Derived, UpLo> SparseMatrixBase<Derived>::selfadjointView()
+typename SparseMatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type SparseMatrixBase<Derived>::selfadjointView()
 {
-  return derived();
+  return SparseSelfAdjointView<Derived, UpLo>(derived());
 }
 
 /***************************************************************************
 * Implementation of SparseSelfAdjointView methods
 ***************************************************************************/
 
-template<typename MatrixType, unsigned int UpLo>
+template<typename MatrixType, unsigned int Mode>
 template<typename DerivedU>
-SparseSelfAdjointView<MatrixType,UpLo>&
-SparseSelfAdjointView<MatrixType,UpLo>::rankUpdate(const SparseMatrixBase<DerivedU>& u, const Scalar& alpha)
+SparseSelfAdjointView<MatrixType,Mode>&
+SparseSelfAdjointView<MatrixType,Mode>::rankUpdate(const SparseMatrixBase<DerivedU>& u, const Scalar& alpha)
 {
-  SparseMatrix<Scalar,MatrixType::Flags&RowMajorBit?RowMajor:ColMajor> tmp = u * u.adjoint();
+  SparseMatrix<Scalar,(MatrixType::Flags&RowMajorBit)?RowMajor:ColMajor> tmp = u * u.adjoint();
   if(alpha==Scalar(0))
-    m_matrix.const_cast_derived() = tmp.template triangularView<UpLo>();
+    m_matrix.const_cast_derived() = tmp.template triangularView<Mode>();
   else
-    m_matrix.const_cast_derived() += alpha * tmp.template triangularView<UpLo>();
+    m_matrix.const_cast_derived() += alpha * tmp.template triangularView<Mode>();
 
   return *this;
 }
 
-/***************************************************************************
-* Implementation of sparse self-adjoint time dense matrix
-***************************************************************************/
-
 namespace internal {
-template<typename Lhs, typename Rhs, int UpLo>
-struct traits<SparseSelfAdjointTimeDenseProduct<Lhs,Rhs,UpLo> >
- : traits<ProductBase<SparseSelfAdjointTimeDenseProduct<Lhs,Rhs,UpLo>, Lhs, Rhs> >
+  
+// TODO currently a selfadjoint expression has the form SelfAdjointView<.,.>
+//      in the future selfadjoint-ness should be defined by the expression traits
+//      such that Transpose<SelfAdjointView<.,.> > is valid. (currently TriangularBase::transpose() is overloaded to make it work)
+template<typename MatrixType, unsigned int Mode>
+struct evaluator_traits<SparseSelfAdjointView<MatrixType,Mode> >
 {
-  typedef Dense StorageKind;
+  typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind;
+  typedef SparseSelfAdjointShape Shape;
+  
+  static const int AssumeAliasing = 0;
 };
-}
 
-template<typename Lhs, typename Rhs, int UpLo>
-class SparseSelfAdjointTimeDenseProduct
-  : public ProductBase<SparseSelfAdjointTimeDenseProduct<Lhs,Rhs,UpLo>, Lhs, Rhs>
+struct SparseSelfAdjoint2Sparse {};
+
+template<> struct AssignmentKind<SparseShape,SparseSelfAdjointShape> { typedef SparseSelfAdjoint2Sparse Kind; };
+template<> struct AssignmentKind<SparseSelfAdjointShape,SparseShape> { typedef Sparse2Sparse Kind; };
+
+template< typename DstXprType, typename SrcXprType, typename Functor, typename Scalar>
+struct Assignment<DstXprType, SrcXprType, Functor, SparseSelfAdjoint2Sparse, Scalar>
 {
-  public:
-    EIGEN_PRODUCT_PUBLIC_INTERFACE(SparseSelfAdjointTimeDenseProduct)
+  typedef typename DstXprType::StorageIndex StorageIndex;
+  template<typename DestScalar,int StorageOrder>
+  static void run(SparseMatrix<DestScalar,StorageOrder,StorageIndex> &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/)
+  {
+    internal::permute_symm_to_fullsymm<SrcXprType::Mode>(src.matrix(), dst);
+  }
+  
+  template<typename DestScalar>
+  static void run(DynamicSparseMatrix<DestScalar,ColMajor,StorageIndex>& dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar> &/*func*/)
+  {
+    // TODO directly evaluate into dst;
+    SparseMatrix<DestScalar,ColMajor,StorageIndex> tmp(dst.rows(),dst.cols());
+    internal::permute_symm_to_fullsymm<SrcXprType::Mode>(src.matrix(), tmp);
+    dst = tmp;
+  }
+};
 
-    SparseSelfAdjointTimeDenseProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
-    {}
+} // end namespace internal
 
-    template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
+/***************************************************************************
+* Implementation of sparse self-adjoint time dense matrix
+***************************************************************************/
+
+namespace internal {
+
+template<int Mode, typename SparseLhsType, typename DenseRhsType, typename DenseResType, typename AlphaType>
+inline void sparse_selfadjoint_time_dense_product(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha)
+{
+  EIGEN_ONLY_USED_FOR_DEBUG(alpha);
+  // TODO use alpha
+  eigen_assert(alpha==AlphaType(1) && "alpha != 1 is not implemented yet, sorry");
+  
+  typedef evaluator<SparseLhsType> LhsEval;
+  typedef typename evaluator<SparseLhsType>::InnerIterator LhsIterator;
+  typedef typename SparseLhsType::Scalar LhsScalar;
+  
+  enum {
+    LhsIsRowMajor = (LhsEval::Flags&RowMajorBit)==RowMajorBit,
+    ProcessFirstHalf =
+              ((Mode&(Upper|Lower))==(Upper|Lower))
+          || ( (Mode&Upper) && !LhsIsRowMajor)
+          || ( (Mode&Lower) && LhsIsRowMajor),
+    ProcessSecondHalf = !ProcessFirstHalf
+  };
+  
+  LhsEval lhsEval(lhs);
+  
+  for (Index j=0; j<lhs.outerSize(); ++j)
+  {
+    LhsIterator i(lhsEval,j);
+    if (ProcessSecondHalf)
     {
-      EIGEN_ONLY_USED_FOR_DEBUG(alpha);
-      // TODO use alpha
-      eigen_assert(alpha==Scalar(1) && "alpha != 1 is not implemented yet, sorry");
-      typedef typename internal::remove_all<Lhs>::type _Lhs;
-      typedef typename _Lhs::InnerIterator LhsInnerIterator;
-      enum {
-        LhsIsRowMajor = (_Lhs::Flags&RowMajorBit)==RowMajorBit,
-        ProcessFirstHalf =
-                 ((UpLo&(Upper|Lower))==(Upper|Lower))
-              || ( (UpLo&Upper) && !LhsIsRowMajor)
-              || ( (UpLo&Lower) && LhsIsRowMajor),
-        ProcessSecondHalf = !ProcessFirstHalf
-      };
-      for (Index j=0; j<m_lhs.outerSize(); ++j)
+      while (i && i.index()<j) ++i;
+      if(i && i.index()==j)
       {
-        LhsInnerIterator i(m_lhs,j);
-        if (ProcessSecondHalf)
-        {
-          while (i && i.index()<j) ++i;
-          if(i && i.index()==j)
-          {
-            dest.row(j) += i.value() * m_rhs.row(j);
-            ++i;
-          }
-        }
-        for(; (ProcessFirstHalf ? i && i.index() < j : i) ; ++i)
-        {
-          Index a = LhsIsRowMajor ? j : i.index();
-          Index b = LhsIsRowMajor ? i.index() : j;
-          typename Lhs::Scalar v = i.value();
-          dest.row(a) += (v) * m_rhs.row(b);
-          dest.row(b) += numext::conj(v) * m_rhs.row(a);
-        }
-        if (ProcessFirstHalf && i && (i.index()==j))
-          dest.row(j) += i.value() * m_rhs.row(j);
+        res.row(j) += i.value() * rhs.row(j);
+        ++i;
       }
     }
+    for(; (ProcessFirstHalf ? i && i.index() < j : i) ; ++i)
+    {
+      Index a = LhsIsRowMajor ? j : i.index();
+      Index b = LhsIsRowMajor ? i.index() : j;
+      LhsScalar v = i.value();
+      res.row(a) += (v) * rhs.row(b);
+      res.row(b) += numext::conj(v) * rhs.row(a);
+    }
+    if (ProcessFirstHalf && i && (i.index()==j))
+      res.row(j) += i.value() * rhs.row(j);
+  }
+}
 
-  private:
-    SparseSelfAdjointTimeDenseProduct& operator=(const SparseSelfAdjointTimeDenseProduct&);
+
+template<typename LhsView, typename Rhs, int ProductType> // product: sparse self-adjoint view (lhs) * dense (rhs)
+struct generic_product_impl<LhsView, Rhs, SparseSelfAdjointShape, DenseShape, ProductType>
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const LhsView& lhsView, const Rhs& rhs) // overwrites dst with the product
+  {
+    typedef typename LhsView::_MatrixTypeNested Lhs;
+    typedef typename nested_eval<Lhs,Dynamic>::type LhsNested;
+    typedef typename nested_eval<Rhs,Dynamic>::type RhsNested;
+    LhsNested lhsNested(lhsView.matrix()); // the matrix wrapped by the self-adjoint view
+    RhsNested rhsNested(rhs);
+    
+    dst.setZero(); // the kernel below only accumulates (+=) into its result argument
+    internal::sparse_selfadjoint_time_dense_product<LhsView::Mode>(lhsNested, rhsNested, dst, typename Dest::Scalar(1)); // alpha is passed as 1
+  }
 };
 
-namespace internal {
-template<typename Lhs, typename Rhs, int UpLo>
-struct traits<DenseTimeSparseSelfAdjointProduct<Lhs,Rhs,UpLo> >
- : traits<ProductBase<DenseTimeSparseSelfAdjointProduct<Lhs,Rhs,UpLo>, Lhs, Rhs> >
-{};
-}
+template<typename Lhs, typename RhsView, int ProductType> // product: dense (lhs) * sparse self-adjoint view (rhs)
+struct generic_product_impl<Lhs, RhsView, DenseShape, SparseSelfAdjointShape, ProductType>
+{
+  template<typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const RhsView& rhsView) // overwrites dst with the product
+  {
+    typedef typename RhsView::_MatrixTypeNested Rhs;
+    typedef typename nested_eval<Lhs,Dynamic>::type LhsNested;
+    typedef typename nested_eval<Rhs,Dynamic>::type RhsNested;
+    LhsNested lhsNested(lhs);
+    RhsNested rhsNested(rhsView.matrix()); // the matrix wrapped by the self-adjoint view
+    
+    dst.setZero(); // the kernel below only accumulates (+=) into its result argument
+    // transpose everything: compute dst^T = rhs^T * lhs^T so the (sparse self-adjoint * dense) kernel can be reused
+    Transpose<Dest> dstT(dst);
+    internal::sparse_selfadjoint_time_dense_product<RhsView::Mode>(rhsNested.transpose(), lhsNested.transpose(), dstT, typename Dest::Scalar(1));
+  }
+};
+
+// NOTE: these two overloads are needed to evaluate the sparse selfadjoint view into a full sparse matrix
+// TODO: maybe the copy could be handled by generic_product_impl so that these overloads would not be needed anymore
 
-template<typename Lhs, typename Rhs, int UpLo>
-class DenseTimeSparseSelfAdjointProduct
-  : public ProductBase<DenseTimeSparseSelfAdjointProduct<Lhs,Rhs,UpLo>, Lhs, Rhs>
+template<typename LhsView, typename Rhs, int ProductTag>
+struct product_evaluator<Product<LhsView, Rhs, DefaultProduct>, ProductTag, SparseSelfAdjointShape, SparseShape>
+  : public evaluator<typename Product<typename Rhs::PlainObject, Rhs, DefaultProduct>::PlainObject>
 {
-  public:
-    EIGEN_PRODUCT_PUBLIC_INTERFACE(DenseTimeSparseSelfAdjointProduct)
+  typedef Product<LhsView, Rhs, DefaultProduct> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
 
-    DenseTimeSparseSelfAdjointProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
-    {}
+  product_evaluator(const XprType& xpr)
+    : m_lhs(xpr.lhs()), m_result(xpr.rows(), xpr.cols())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+    generic_product_impl<typename Rhs::PlainObject, Rhs, SparseShape, SparseShape, ProductTag>::evalTo(m_result, m_lhs, xpr.rhs());
+  }
+  
+protected:
+  typename Rhs::PlainObject m_lhs;
+  PlainObject m_result;
+};
 
-    template<typename Dest> void scaleAndAddTo(Dest& /*dest*/, const Scalar& /*alpha*/) const
-    {
-      // TODO
-    }
+template<typename Lhs, typename RhsView, int ProductTag>
+struct product_evaluator<Product<Lhs, RhsView, DefaultProduct>, ProductTag, SparseShape, SparseSelfAdjointShape>
+  : public evaluator<typename Product<Lhs, typename Lhs::PlainObject, DefaultProduct>::PlainObject>
+{
+  typedef Product<Lhs, RhsView, DefaultProduct> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
 
-  private:
-    DenseTimeSparseSelfAdjointProduct& operator=(const DenseTimeSparseSelfAdjointProduct&);
+  product_evaluator(const XprType& xpr)
+    : m_rhs(xpr.rhs()), m_result(xpr.rows(), xpr.cols())
+  {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+    generic_product_impl<Lhs, typename Lhs::PlainObject, SparseShape, SparseShape, ProductTag>::evalTo(m_result, xpr.lhs(), m_rhs);
+  }
+  
+protected:
+  typename Lhs::PlainObject m_rhs;
+  PlainObject m_result;
 };
 
+} // namespace internal
+
 /***************************************************************************
 * Implementation of symmetric copies and permutations
 ***************************************************************************/
 namespace internal {
-  
-template<typename MatrixType, int UpLo>
-struct traits<SparseSymmetricPermutationProduct<MatrixType,UpLo> > : traits<MatrixType> {
-};
 
-template<int UpLo,typename MatrixType,int DestOrder>
-void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DestOrder,typename MatrixType::Index>& _dest, const typename MatrixType::Index* perm)
+template<int Mode,typename MatrixType,int DestOrder>
+void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DestOrder,typename MatrixType::StorageIndex>& _dest, const typename MatrixType::StorageIndex* perm)
 {
-  typedef typename MatrixType::Index Index;
+  typedef typename MatrixType::StorageIndex StorageIndex;
   typedef typename MatrixType::Scalar Scalar;
-  typedef SparseMatrix<Scalar,DestOrder,Index> Dest;
-  typedef Matrix<Index,Dynamic,1> VectorI;
+  typedef SparseMatrix<Scalar,DestOrder,StorageIndex> Dest;
+  typedef Matrix<StorageIndex,Dynamic,1> VectorI;
   
   Dest& dest(_dest.derived());
   enum {
@@ -337,11 +409,11 @@ void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename Matri
       Index r = it.row();
       Index c = it.col();
       Index ip = perm ? perm[i] : i;
-      if(UpLo==(Upper|Lower))
+      if(Mode==(Upper|Lower))
         count[StorageOrderMatch ? jp : ip]++;
       else if(r==c)
         count[ip]++;
-      else if(( UpLo==Lower && r>c) || ( UpLo==Upper && r<c))
+      else if(( Mode==Lower && r>c) || ( Mode==Upper && r<c))
       {
         count[ip]++;
         count[jp]++;
@@ -359,18 +431,18 @@ void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename Matri
     count[j] = dest.outerIndexPtr()[j];
   
   // copy data
-  for(Index j = 0; j<size; ++j)
+  for(StorageIndex j = 0; j<size; ++j)
   {
     for(typename MatrixType::InnerIterator it(mat,j); it; ++it)
     {
-      Index i = it.index();
+      StorageIndex i = internal::convert_index<StorageIndex>(it.index());
       Index r = it.row();
       Index c = it.col();
       
-      Index jp = perm ? perm[j] : j;
-      Index ip = perm ? perm[i] : i;
+      StorageIndex jp = perm ? perm[j] : j;
+      StorageIndex ip = perm ? perm[i] : i;
       
-      if(UpLo==(Upper|Lower))
+      if(Mode==(Upper|Lower))
       {
         Index k = count[StorageOrderMatch ? jp : ip]++;
         dest.innerIndexPtr()[k] = StorageOrderMatch ? ip : jp;
@@ -382,7 +454,7 @@ void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename Matri
         dest.innerIndexPtr()[k] = ip;
         dest.valuePtr()[k] = it.value();
       }
-      else if(( (UpLo&Lower)==Lower && r>c) || ( (UpLo&Upper)==Upper && r<c))
+      else if(( (Mode&Lower)==Lower && r>c) || ( (Mode&Upper)==Upper && r<c))
       {
         if(!StorageOrderMatch)
           std::swap(ip,jp);
@@ -397,35 +469,35 @@ void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename Matri
   }
 }
 
-template<int _SrcUpLo,int _DstUpLo,typename MatrixType,int DstOrder>
-void permute_symm_to_symm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DstOrder,typename MatrixType::Index>& _dest, const typename MatrixType::Index* perm)
+template<int _SrcMode,int _DstMode,typename MatrixType,int DstOrder>
+void permute_symm_to_symm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DstOrder,typename MatrixType::StorageIndex>& _dest, const typename MatrixType::StorageIndex* perm)
 {
-  typedef typename MatrixType::Index Index;
+  typedef typename MatrixType::StorageIndex StorageIndex;
   typedef typename MatrixType::Scalar Scalar;
-  SparseMatrix<Scalar,DstOrder,Index>& dest(_dest.derived());
-  typedef Matrix<Index,Dynamic,1> VectorI;
+  SparseMatrix<Scalar,DstOrder,StorageIndex>& dest(_dest.derived());
+  typedef Matrix<StorageIndex,Dynamic,1> VectorI;
   enum {
     SrcOrder = MatrixType::IsRowMajor ? RowMajor : ColMajor,
     StorageOrderMatch = int(SrcOrder) == int(DstOrder),
-    DstUpLo = DstOrder==RowMajor ? (_DstUpLo==Upper ? Lower : Upper) : _DstUpLo,
-    SrcUpLo = SrcOrder==RowMajor ? (_SrcUpLo==Upper ? Lower : Upper) : _SrcUpLo
+    DstMode = DstOrder==RowMajor ? (_DstMode==Upper ? Lower : Upper) : _DstMode,
+    SrcMode = SrcOrder==RowMajor ? (_SrcMode==Upper ? Lower : Upper) : _SrcMode
   };
   
   Index size = mat.rows();
   VectorI count(size);
   count.setZero();
   dest.resize(size,size);
-  for(Index j = 0; j<size; ++j)
+  for(StorageIndex j = 0; j<size; ++j)
   {
-    Index jp = perm ? perm[j] : j;
+    StorageIndex jp = perm ? perm[j] : j;
     for(typename MatrixType::InnerIterator it(mat,j); it; ++it)
     {
-      Index i = it.index();
-      if((int(SrcUpLo)==int(Lower) && i<j) || (int(SrcUpLo)==int(Upper) && i>j))
+      StorageIndex i = it.index();
+      if((int(SrcMode)==int(Lower) && i<j) || (int(SrcMode)==int(Upper) && i>j))
         continue;
                   
-      Index ip = perm ? perm[i] : i;
-      count[int(DstUpLo)==int(Lower) ? (std::min)(ip,jp) : (std::max)(ip,jp)]++;
+      StorageIndex ip = perm ? perm[i] : i;
+      count[int(DstMode)==int(Lower) ? (std::min)(ip,jp) : (std::max)(ip,jp)]++;
     }
   }
   dest.outerIndexPtr()[0] = 0;
@@ -435,23 +507,23 @@ void permute_symm_to_symm(const MatrixType& mat, SparseMatrix<typename MatrixTyp
   for(Index j=0; j<size; ++j)
     count[j] = dest.outerIndexPtr()[j];
   
-  for(Index j = 0; j<size; ++j)
+  for(StorageIndex j = 0; j<size; ++j)
   {
     
     for(typename MatrixType::InnerIterator it(mat,j); it; ++it)
     {
-      Index i = it.index();
-      if((int(SrcUpLo)==int(Lower) && i<j) || (int(SrcUpLo)==int(Upper) && i>j))
+      StorageIndex i = it.index();
+      if((int(SrcMode)==int(Lower) && i<j) || (int(SrcMode)==int(Upper) && i>j))
         continue;
                   
-      Index jp = perm ? perm[j] : j;
-      Index ip = perm? perm[i] : i;
+      StorageIndex jp = perm ? perm[j] : j;
+      StorageIndex ip = perm? perm[i] : i;
       
-      Index k = count[int(DstUpLo)==int(Lower) ? (std::min)(ip,jp) : (std::max)(ip,jp)]++;
-      dest.innerIndexPtr()[k] = int(DstUpLo)==int(Lower) ? (std::max)(ip,jp) : (std::min)(ip,jp);
+      Index k = count[int(DstMode)==int(Lower) ? (std::min)(ip,jp) : (std::max)(ip,jp)]++;
+      dest.innerIndexPtr()[k] = int(DstMode)==int(Lower) ? (std::max)(ip,jp) : (std::min)(ip,jp);
       
       if(!StorageOrderMatch) std::swap(ip,jp);
-      if( ((int(DstUpLo)==int(Lower) && ip<jp) || (int(DstUpLo)==int(Upper) && ip>jp)))
+      if( ((int(DstMode)==int(Lower) && ip<jp) || (int(DstMode)==int(Upper) && ip>jp)))
         dest.valuePtr()[k] = numext::conj(it.value());
       else
         dest.valuePtr()[k] = it.value();
@@ -461,19 +533,33 @@ void permute_symm_to_symm(const MatrixType& mat, SparseMatrix<typename MatrixTyp
 
 }
 
-template<typename MatrixType,int UpLo>
+// TODO implement twists in a more evaluator friendly fashion
+
+namespace internal {
+
+template<typename MatrixType, int Mode> // traits of the permutation product are inherited unchanged from MatrixType
+struct traits<SparseSymmetricPermutationProduct<MatrixType,Mode> > : traits<MatrixType> {
+};
+
+}
+
+template<typename MatrixType,int Mode>
 class SparseSymmetricPermutationProduct
-  : public EigenBase<SparseSymmetricPermutationProduct<MatrixType,UpLo> >
+  : public EigenBase<SparseSymmetricPermutationProduct<MatrixType,Mode> >
 {
   public:
     typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    enum {
+      RowsAtCompileTime = internal::traits<SparseSymmetricPermutationProduct>::RowsAtCompileTime,
+      ColsAtCompileTime = internal::traits<SparseSymmetricPermutationProduct>::ColsAtCompileTime
+    };
   protected:
-    typedef PermutationMatrix<Dynamic,Dynamic,Index> Perm;
+    typedef PermutationMatrix<Dynamic,Dynamic,StorageIndex> Perm;
   public:
-    typedef Matrix<Index,Dynamic,1> VectorI;
+    typedef Matrix<StorageIndex,Dynamic,1> VectorI;
     typedef typename MatrixType::Nested MatrixTypeNested;
-    typedef typename internal::remove_all<MatrixTypeNested>::type _MatrixTypeNested;
+    typedef typename internal::remove_all<MatrixTypeNested>::type NestedExpression;
     
     SparseSymmetricPermutationProduct(const MatrixType& mat, const Perm& perm)
       : m_matrix(mat), m_perm(perm)
@@ -481,20 +567,9 @@ class SparseSymmetricPermutationProduct
     
     inline Index rows() const { return m_matrix.rows(); }
     inline Index cols() const { return m_matrix.cols(); }
-    
-    template<typename DestScalar, int Options, typename DstIndex>
-    void evalTo(SparseMatrix<DestScalar,Options,DstIndex>& _dest) const
-    {
-//       internal::permute_symm_to_fullsymm<UpLo>(m_matrix,_dest,m_perm.indices().data());
-      SparseMatrix<DestScalar,(Options&RowMajor)==RowMajor ? ColMajor : RowMajor, DstIndex> tmp;
-      internal::permute_symm_to_fullsymm<UpLo>(m_matrix,tmp,m_perm.indices().data());
-      _dest = tmp;
-    }
-    
-    template<typename DestType,unsigned int DestUpLo> void evalTo(SparseSelfAdjointView<DestType,DestUpLo>& dest) const
-    {
-      internal::permute_symm_to_symm<UpLo,DestUpLo>(m_matrix,dest.matrix(),m_perm.indices().data());
-    }
+        
+    const NestedExpression& matrix() const { return m_matrix; }
+    const Perm& perm() const { return m_perm; }
     
   protected:
     MatrixTypeNested m_matrix;
@@ -502,6 +577,31 @@ class SparseSymmetricPermutationProduct
 
 };
 
+namespace internal {
+  
+template<typename DstXprType, typename MatrixType, int Mode, typename Scalar> // assignment of a SparseSymmetricPermutationProduct to a sparse destination
+struct Assignment<DstXprType, SparseSymmetricPermutationProduct<MatrixType,Mode>, internal::assign_op<Scalar>, Sparse2Sparse>
+{
+  typedef SparseSymmetricPermutationProduct<MatrixType,Mode> SrcXprType;
+  typedef typename DstXprType::StorageIndex DstIndex;
+  template<int Options>
+  static void run(SparseMatrix<Scalar,Options,DstIndex> &dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  {
+    // Permute into a temporary whose storage order is the opposite of dst's, then assign that temporary to dst.
+    SparseMatrix<Scalar,(Options&RowMajor)==RowMajor ? ColMajor : RowMajor, DstIndex> tmp;
+    internal::permute_symm_to_fullsymm<Mode>(src.matrix(),tmp,src.perm().indices().data());
+    dst = tmp;
+  }
+  
+  template<typename DestType,unsigned int DestMode>
+  static void run(SparseSelfAdjointView<DestType,DestMode>& dst, const SrcXprType &src, const internal::assign_op<Scalar> &)
+  {
+    internal::permute_symm_to_symm<Mode,DestMode>(src.matrix(),dst.matrix(),src.perm().indices().data()); // writes directly into the matrix wrapped by dst
+  }
+};
+
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_SPARSE_SELFADJOINTVIEW_H
diff --git a/nuparu/include/Eigen/src/SparseCore/SparseSolverBase.h b/nuparu/include/Eigen/src/SparseCore/SparseSolverBase.h
new file mode 100644
index 00000000..1cb7080c
--- /dev/null
+++ b/nuparu/include/Eigen/src/SparseCore/SparseSolverBase.h
@@ -0,0 +1,110 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSESOLVERBASE_H
+#define EIGEN_SPARSESOLVERBASE_H
+
+namespace Eigen { 
+
+namespace internal {
+
+  /** \internal
+  * Helper functions to solve with a sparse right-hand-side and result.
+  * The rhs is decomposed into small vertical panels which are solved through dense temporaries.
+  */
+template<typename Decomposition, typename Rhs, typename Dest>
+void solve_sparse_through_dense_panels(const Decomposition &dec, const Rhs& rhs, Dest &dest)
+{
+  EIGEN_STATIC_ASSERT((Dest::Flags&RowMajorBit)==0,THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
+  typedef typename Dest::Scalar DestScalar;
+  // we process the sparse rhs per block of NbColsAtOnce columns temporarily stored into a dense matrix.
+  static const Index NbColsAtOnce = 4;
+  Index rhsCols = rhs.cols();
+  Index size = rhs.rows();
+  // the temporary matrices do not need more columns than NbColsAtOnce:
+  Index tmpCols = (std::min)(rhsCols, NbColsAtOnce); 
+  Eigen::Matrix<DestScalar,Dynamic,Dynamic> tmp(size,tmpCols);  // dense copy of the current rhs panel
+  Eigen::Matrix<DestScalar,Dynamic,Dynamic> tmpX(size,tmpCols); // dense solution for the current panel
+  for(Index k=0; k<rhsCols; k+=NbColsAtOnce) // k is the first column of the current panel
+  {
+    Index actualCols = std::min<Index>(rhsCols-k, NbColsAtOnce); // the last panel may hold fewer than NbColsAtOnce columns
+    tmp.leftCols(actualCols) = rhs.middleCols(k,actualCols);
+    tmpX.leftCols(actualCols) = dec.solve(tmp.leftCols(actualCols));
+    dest.middleCols(k,actualCols) = tmpX.leftCols(actualCols).sparseView(); // written back to dest through sparseView()
+  }
+}
+
+} // end namespace internal
+
+/** \class SparseSolverBase
+  * \ingroup SparseCore_Module
+  * \brief A base class for sparse solvers
+  *
+  * \tparam Derived the actual type of the solver.
+  *
+  */
+template<typename Derived>
+class SparseSolverBase : internal::noncopyable
+{
+  public:
+
+    /** Default constructor: the solver starts in the non-initialized state */
+    SparseSolverBase()
+      : m_isInitialized(false)
+    {}
+
+    ~SparseSolverBase()
+    {}
+
+    Derived& derived() { return *static_cast<Derived*>(this); } // CRTP downcast to the concrete solver
+    const Derived& derived() const { return *static_cast<const Derived*>(this); }
+    
+    /** \returns an expression of the solution x of \f$ A x = b \f$ using the current decomposition of A.
+      * This overload takes a right-hand side \a b deriving from MatrixBase.
+      * \sa compute()
+      */
+    template<typename Rhs>
+    inline const Solve<Derived, Rhs>
+    solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "Solver is not initialized.");
+      eigen_assert(derived().rows()==b.rows() && "solve(): invalid number of rows of the right hand side matrix b");
+      return Solve<Derived, Rhs>(derived(), b.derived());
+    }
+    
+    /** \returns an expression of the solution x of \f$ A x = b \f$ using the current decomposition of A.
+      * This overload takes a right-hand side \a b deriving from SparseMatrixBase.
+      * \sa compute()
+      */
+    template<typename Rhs>
+    inline const Solve<Derived, Rhs>
+    solve(const SparseMatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "Solver is not initialized.");
+      eigen_assert(derived().rows()==b.rows() && "solve(): invalid number of rows of the right hand side matrix b");
+      return Solve<Derived, Rhs>(derived(), b.derived());
+    }
+    
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** \internal default implementation of solving with a sparse rhs: forwards to internal::solve_sparse_through_dense_panels */
+    template<typename Rhs,typename Dest>
+    void _solve_impl(const SparseMatrixBase<Rhs> &b, SparseMatrixBase<Dest> &dest) const
+    {
+      internal::solve_sparse_through_dense_panels(derived(), b.derived(), dest.derived());
+    }
+    #endif // EIGEN_PARSED_BY_DOXYGEN
+
+  protected:
+    
+    mutable bool m_isInitialized; // false after construction; asserted by both solve() overloads
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSESOLVERBASE_H
diff --git a/nuparu/include/Eigen/src/SparseCore/SparseSparseProductWithPruning.h b/nuparu/include/Eigen/src/SparseCore/SparseSparseProductWithPruning.h
index 70857c7b..20078f72 100644
--- a/nuparu/include/Eigen/src/SparseCore/SparseSparseProductWithPruning.h
+++ b/nuparu/include/Eigen/src/SparseCore/SparseSparseProductWithPruning.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -22,30 +22,33 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r
   // return sparse_sparse_product_with_pruning_impl2(lhs,rhs,res);
 
   typedef typename remove_all<Lhs>::type::Scalar Scalar;
-  typedef typename remove_all<Lhs>::type::Index Index;
+  typedef typename remove_all<Lhs>::type::StorageIndex StorageIndex;
 
   // make sure to call innerSize/outerSize since we fake the storage order.
   Index rows = lhs.innerSize();
   Index cols = rhs.outerSize();
-  //int size = lhs.outerSize();
+  //Index size = lhs.outerSize();
   eigen_assert(lhs.outerSize() == rhs.innerSize());
 
   // allocate a temporary buffer
-  AmbiVector<Scalar,Index> tempVector(rows);
+  AmbiVector<Scalar,StorageIndex> tempVector(rows);
 
+  // mimics a resizeByInnerOuter:
+  if(ResultType::IsRowMajor)
+    res.resize(cols, rows);
+  else
+    res.resize(rows, cols);
+  
+  evaluator<Lhs> lhsEval(lhs);
+  evaluator<Rhs> rhsEval(rhs);
+  
   // estimate the number of non zero entries
   // given a rhs column containing Y non zeros, we assume that the respective Y columns
   // of the lhs differs in average of one non zeros, thus the number of non zeros for
   // the product of a rhs column with the lhs is X+Y where X is the average number of non zero
   // per column of the lhs.
   // Therefore, we have nnz(lhs*rhs) = nnz(lhs) + nnz(rhs)
-  Index estimated_nnz_prod = lhs.nonZeros() + rhs.nonZeros();
-
-  // mimics a resizeByInnerOuter:
-  if(ResultType::IsRowMajor)
-    res.resize(cols, rows);
-  else
-    res.resize(rows, cols);
+  Index estimated_nnz_prod = lhsEval.nonZerosEstimate() + rhsEval.nonZerosEstimate();
 
   res.reserve(estimated_nnz_prod);
   double ratioColRes = double(estimated_nnz_prod)/double(lhs.rows()*rhs.cols());
@@ -56,18 +59,18 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r
     // let's do a more accurate determination of the nnz ratio for the current column j of res
     tempVector.init(ratioColRes);
     tempVector.setZero();
-    for (typename Rhs::InnerIterator rhsIt(rhs, j); rhsIt; ++rhsIt)
+    for (typename evaluator<Rhs>::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt)
     {
       // FIXME should be written like this: tmp += rhsIt.value() * lhs.col(rhsIt.index())
       tempVector.restart();
       Scalar x = rhsIt.value();
-      for (typename Lhs::InnerIterator lhsIt(lhs, rhsIt.index()); lhsIt; ++lhsIt)
+      for (typename evaluator<Lhs>::InnerIterator lhsIt(lhsEval, rhsIt.index()); lhsIt; ++lhsIt)
       {
         tempVector.coeffRef(lhsIt.index()) += lhsIt.value() * x;
       }
     }
     res.startVec(j);
-    for (typename AmbiVector<Scalar,Index>::Iterator it(tempVector,tolerance); it; ++it)
+    for (typename AmbiVector<Scalar,StorageIndex>::Iterator it(tempVector,tolerance); it; ++it)
       res.insertBackByOuterInner(j,it.index()) = it.value();
   }
   res.finalize();
@@ -100,7 +103,7 @@ struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,ColMajor,C
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
   {
     // we need a col-major matrix to hold the result
-    typedef SparseMatrix<typename ResultType::Scalar> SparseTemporaryType;
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> SparseTemporaryType;
     SparseTemporaryType _res(res.rows(), res.cols());
     internal::sparse_sparse_product_with_pruning_impl<Lhs,Rhs,SparseTemporaryType>(lhs, rhs, _res, tolerance);
     res = _res;
@@ -126,10 +129,11 @@ struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,RowMajor,R
   typedef typename ResultType::RealScalar RealScalar;
   static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
   {
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor> ColMajorMatrix;
-    ColMajorMatrix colLhs(lhs);
-    ColMajorMatrix colRhs(rhs);
-    internal::sparse_sparse_product_with_pruning_impl<ColMajorMatrix,ColMajorMatrix,ResultType>(colLhs, colRhs, res, tolerance);
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixLhs;
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixRhs;
+    ColMajorMatrixLhs colLhs(lhs);
+    ColMajorMatrixRhs colRhs(rhs);
+    internal::sparse_sparse_product_with_pruning_impl<ColMajorMatrixLhs,ColMajorMatrixRhs,ResultType>(colLhs, colRhs, res, tolerance);
 
     // let's transpose the product to get a column x column product
 //     typedef SparseMatrix<typename ResultType::Scalar> SparseTemporaryType;
@@ -139,8 +143,53 @@ struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,RowMajor,R
   }
 };
 
-// NOTE the 2 others cases (col row *) must never occur since they are caught
-// by ProductReturnType which transforms it to (col col *) by evaluating rhs.
+template<typename Lhs, typename Rhs, typename ResultType> // col-major lhs * row-major rhs -> row-major result
+struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,ColMajor,RowMajor,RowMajor>
+{
+  typedef typename ResultType::RealScalar RealScalar;
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
+  {
+    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename Lhs::StorageIndex> RowMajorMatrixLhs;
+    RowMajorMatrixLhs rowLhs(lhs); // row-major copy of lhs so that both operands share the result's storage order
+    sparse_sparse_product_with_pruning_selector<RowMajorMatrixLhs,Rhs,ResultType,RowMajor,RowMajor>::run(rowLhs,rhs,res,tolerance); // must call the static run(); the selector itself is not constructible from these arguments
+  }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType> // row-major lhs * col-major rhs -> row-major result
+struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,RowMajor,ColMajor,RowMajor>
+{
+  typedef typename ResultType::RealScalar RealScalar;
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
+  {
+    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename Lhs::StorageIndex> RowMajorMatrixRhs;
+    RowMajorMatrixRhs rowRhs(rhs); // row-major copy of rhs so that both operands share the result's storage order
+    sparse_sparse_product_with_pruning_selector<Lhs,RowMajorMatrixRhs,ResultType,RowMajor,RowMajor,RowMajor>::run(lhs,rowRhs,res,tolerance); // must call the static run(); the selector itself is not constructible from these arguments
+  }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType> // col-major lhs * row-major rhs -> col-major result
+struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,ColMajor,RowMajor,ColMajor>
+{
+  typedef typename ResultType::RealScalar RealScalar;
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
+  {
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixRhs;
+    ColMajorMatrixRhs colRhs(rhs); // col-major copy of rhs; lhs is passed to the kernel unchanged
+    internal::sparse_sparse_product_with_pruning_impl<Lhs,ColMajorMatrixRhs,ResultType>(lhs, colRhs, res, tolerance);
+  }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType> // row-major lhs * col-major rhs -> col-major result
+struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,RowMajor,ColMajor,ColMajor>
+{
+  typedef typename ResultType::RealScalar RealScalar;
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
+  {
+    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::StorageIndex> ColMajorMatrixLhs;
+    ColMajorMatrixLhs colLhs(lhs); // col-major copy of lhs; rhs is passed to the kernel unchanged
+    internal::sparse_sparse_product_with_pruning_impl<ColMajorMatrixLhs,Rhs,ResultType>(colLhs, rhs, res, tolerance);
+  }
+};
 
 } // end namespace internal
 
diff --git a/nuparu/include/Eigen/src/SparseCore/SparseTranspose.h b/nuparu/include/Eigen/src/SparseCore/SparseTranspose.h
index 7c300ee8..b6f180a4 100644
--- a/nuparu/include/Eigen/src/SparseCore/SparseTranspose.h
+++ b/nuparu/include/Eigen/src/SparseCore/SparseTranspose.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -12,52 +12,93 @@
 
 namespace Eigen { 
 
-template<typename MatrixType> class TransposeImpl<MatrixType,Sparse>
-  : public SparseMatrixBase<Transpose<MatrixType> >
-{
-    typedef typename internal::remove_all<typename MatrixType::Nested>::type _MatrixTypeNested;
+namespace internal {
+  template<typename MatrixType,int CompressedAccess=int(MatrixType::Flags&CompressedAccessBit)>
+  class SparseTransposeImpl
+    : public SparseMatrixBase<Transpose<MatrixType> >
+  {};
+  
+  template<typename MatrixType>
+  class SparseTransposeImpl<MatrixType,CompressedAccessBit>
+    : public SparseCompressedBase<Transpose<MatrixType> >
+  {
+    typedef SparseCompressedBase<Transpose<MatrixType> > Base;
   public:
-
-    EIGEN_SPARSE_PUBLIC_INTERFACE(Transpose<MatrixType> )
-
-    class InnerIterator;
-    class ReverseInnerIterator;
+    using Base::derived;
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::StorageIndex StorageIndex;
 
     inline Index nonZeros() const { return derived().nestedExpression().nonZeros(); }
-};
+    
+    inline const Scalar* valuePtr() const { return derived().nestedExpression().valuePtr(); }
+    inline const StorageIndex* innerIndexPtr() const { return derived().nestedExpression().innerIndexPtr(); }
+    inline const StorageIndex* outerIndexPtr() const { return derived().nestedExpression().outerIndexPtr(); }
+    inline const StorageIndex* innerNonZeroPtr() const { return derived().nestedExpression().innerNonZeroPtr(); }
 
-// NOTE: VC10 trigger an ICE if don't put typename TransposeImpl<MatrixType,Sparse>:: in front of Index,
-// a typedef typename TransposeImpl<MatrixType,Sparse>::Index Index;
-// does not fix the issue.
-// An alternative is to define the nested class in the parent class itself.
-template<typename MatrixType> class TransposeImpl<MatrixType,Sparse>::InnerIterator
-  : public _MatrixTypeNested::InnerIterator
+    inline Scalar* valuePtr() { return derived().nestedExpression().valuePtr(); }
+    inline StorageIndex* innerIndexPtr() { return derived().nestedExpression().innerIndexPtr(); }
+    inline StorageIndex* outerIndexPtr() { return derived().nestedExpression().outerIndexPtr(); }
+    inline StorageIndex* innerNonZeroPtr() { return derived().nestedExpression().innerNonZeroPtr(); }
+  };
+}
+  
+template<typename MatrixType> class TransposeImpl<MatrixType,Sparse>
+  : public internal::SparseTransposeImpl<MatrixType>
 {
-    typedef typename _MatrixTypeNested::InnerIterator Base;
-    typedef typename TransposeImpl::Index Index;
-  public:
-
-    EIGEN_STRONG_INLINE InnerIterator(const TransposeImpl& trans, typename TransposeImpl<MatrixType,Sparse>::Index outer)
-      : Base(trans.derived().nestedExpression(), outer)
-    {}
-    Index row() const { return Base::col(); }
-    Index col() const { return Base::row(); }
+  protected:
+    typedef internal::SparseTransposeImpl<MatrixType> Base;
 };
 
-template<typename MatrixType> class TransposeImpl<MatrixType,Sparse>::ReverseInnerIterator
-  : public _MatrixTypeNested::ReverseInnerIterator
+namespace internal {
+  
+template<typename ArgType>
+struct unary_evaluator<Transpose<ArgType>, IteratorBased>
+  : public evaluator_base<Transpose<ArgType> >
 {
-    typedef typename _MatrixTypeNested::ReverseInnerIterator Base;
-    typedef typename TransposeImpl::Index Index;
+    typedef typename evaluator<ArgType>::InnerIterator        EvalIterator;
+    typedef typename evaluator<ArgType>::ReverseInnerIterator EvalReverseIterator;
   public:
+    typedef Transpose<ArgType> XprType;
+    
+    inline Index nonZerosEstimate() const {
+      return m_argImpl.nonZerosEstimate();
+    }
 
-    EIGEN_STRONG_INLINE ReverseInnerIterator(const TransposeImpl& xpr, typename TransposeImpl<MatrixType,Sparse>::Index outer)
-      : Base(xpr.derived().nestedExpression(), outer)
-    {}
-    Index row() const { return Base::col(); }
-    Index col() const { return Base::row(); }
+    class InnerIterator : public EvalIterator
+    {
+    public:
+      EIGEN_STRONG_INLINE InnerIterator(const unary_evaluator& unaryOp, Index outer)
+        : EvalIterator(unaryOp.m_argImpl,outer)
+      {}
+      
+      Index row() const { return EvalIterator::col(); }
+      Index col() const { return EvalIterator::row(); }
+    };
+    
+    class ReverseInnerIterator : public EvalReverseIterator
+    {
+    public:
+      EIGEN_STRONG_INLINE ReverseInnerIterator(const unary_evaluator& unaryOp, Index outer)
+        : EvalReverseIterator(unaryOp.m_argImpl,outer)
+      {}
+      
+      Index row() const { return EvalReverseIterator::col(); }
+      Index col() const { return EvalReverseIterator::row(); }
+    };
+    
+    enum {
+      CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+      Flags = XprType::Flags
+    };
+    
+    explicit unary_evaluator(const XprType& op) :m_argImpl(op.nestedExpression()) {}
+
+  protected:
+    evaluator<ArgType> m_argImpl;
 };
 
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif // EIGEN_SPARSETRANSPOSE_H
diff --git a/nuparu/include/Eigen/src/SparseCore/SparseTriangularView.h b/nuparu/include/Eigen/src/SparseCore/SparseTriangularView.h
index 333127b7..7c718e4e 100644
--- a/nuparu/include/Eigen/src/SparseCore/SparseTriangularView.h
+++ b/nuparu/include/Eigen/src/SparseCore/SparseTriangularView.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -11,19 +11,19 @@
 #ifndef EIGEN_SPARSE_TRIANGULARVIEW_H
 #define EIGEN_SPARSE_TRIANGULARVIEW_H
 
-namespace Eigen { 
+namespace Eigen {
 
-namespace internal {
-  
-template<typename MatrixType, int Mode>
-struct traits<SparseTriangularView<MatrixType,Mode> >
-: public traits<MatrixType>
-{};
-
-} // namespace internal
-
-template<typename MatrixType, int Mode> class SparseTriangularView
-  : public SparseMatrixBase<SparseTriangularView<MatrixType,Mode> >
+/** \ingroup SparseCore_Module
+  *
+  * \brief Base class for a triangular part in a \b sparse matrix
+  *
+  * This class is an abstract base class of class TriangularView, and objects of type TriangularViewImpl cannot be instantiated.
+  * It extends class TriangularView with additional methods which are available for sparse expressions only.
+  *
+  * \sa class TriangularView, SparseMatrixBase::triangularView()
+  */
+template<typename MatrixType, unsigned int Mode> class TriangularViewImpl<MatrixType,Mode,Sparse>
+  : public SparseMatrixBase<TriangularView<MatrixType,Mode> >
 {
     enum { SkipFirst = ((Mode&Lower) && !(MatrixType::Flags&RowMajorBit))
                     || ((Mode&Upper) &&  (MatrixType::Flags&RowMajorBit)),
@@ -31,46 +31,46 @@ template<typename MatrixType, int Mode> class SparseTriangularView
            SkipDiag = (Mode&ZeroDiag) ? 1 : 0,
            HasUnitDiag = (Mode&UnitDiag) ? 1 : 0
     };
+    
+    typedef TriangularView<MatrixType,Mode> TriangularViewType;
+    
+  protected:
+    // dummy solve function to make TriangularView happy.
+    void solve() const;
 
+    typedef SparseMatrixBase<TriangularViewType> Base;
   public:
     
-    EIGEN_SPARSE_PUBLIC_INTERFACE(SparseTriangularView)
-
+    EIGEN_SPARSE_PUBLIC_INTERFACE(TriangularViewType)
+    
     class InnerIterator;
     class ReverseInnerIterator;
 
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
-
     typedef typename MatrixType::Nested MatrixTypeNested;
     typedef typename internal::remove_reference<MatrixTypeNested>::type MatrixTypeNestedNonRef;
     typedef typename internal::remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
 
-    inline SparseTriangularView(const MatrixType& matrix) : m_matrix(matrix) {}
-
-    /** \internal */
-    inline const MatrixTypeNestedCleaned& nestedExpression() const { return m_matrix; }
-
-    template<typename OtherDerived>
-    typename internal::plain_matrix_type_column_major<OtherDerived>::type
-    solve(const MatrixBase<OtherDerived>& other) const;
+    template<typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE void _solve_impl(const RhsType &rhs, DstType &dst) const {
+      if(!(internal::is_same<RhsType,DstType>::value && internal::extract_data(dst) == internal::extract_data(rhs)))
+        dst = rhs;
+      this->solveInPlace(dst);
+    }
 
     template<typename OtherDerived> void solveInPlace(MatrixBase<OtherDerived>& other) const;
     template<typename OtherDerived> void solveInPlace(SparseMatrixBase<OtherDerived>& other) const;
-
-  protected:
-    MatrixTypeNested m_matrix;
+  
 };
 
-template<typename MatrixType, int Mode>
-class SparseTriangularView<MatrixType,Mode>::InnerIterator : public MatrixTypeNestedCleaned::InnerIterator
+template<typename MatrixType, unsigned int Mode>
+class TriangularViewImpl<MatrixType,Mode,Sparse>::InnerIterator : public MatrixTypeNestedCleaned::InnerIterator
 {
     typedef typename MatrixTypeNestedCleaned::InnerIterator Base;
-    typedef typename SparseTriangularView::Index Index;
   public:
 
-    EIGEN_STRONG_INLINE InnerIterator(const SparseTriangularView& view, Index outer)
-      : Base(view.nestedExpression(), outer), m_returnOne(false)
+    EIGEN_STRONG_INLINE InnerIterator(const TriangularViewImpl& view, Index outer)
+      : Base(view.derived().nestedExpression(), outer), m_returnOne(false)
     {
       if(SkipFirst)
       {
@@ -106,7 +106,7 @@ class SparseTriangularView<MatrixType,Mode>::InnerIterator : public MatrixTypeNe
 
     inline Index row() const { return (MatrixType::Flags&RowMajorBit ? Base::outer() : this->index()); }
     inline Index col() const { return (MatrixType::Flags&RowMajorBit ? this->index() : Base::outer()); }
-    inline Index index() const
+    inline StorageIndex index() const
     {
       if(HasUnitDiag && m_returnOne)  return Base::outer();
       else                            return Base::index();
@@ -132,15 +132,14 @@ class SparseTriangularView<MatrixType,Mode>::InnerIterator : public MatrixTypeNe
     bool m_returnOne;
 };
 
-template<typename MatrixType, int Mode>
-class SparseTriangularView<MatrixType,Mode>::ReverseInnerIterator : public MatrixTypeNestedCleaned::ReverseInnerIterator
+template<typename MatrixType, unsigned int Mode>
+class TriangularViewImpl<MatrixType,Mode,Sparse>::ReverseInnerIterator : public MatrixTypeNestedCleaned::ReverseInnerIterator
 {
     typedef typename MatrixTypeNestedCleaned::ReverseInnerIterator Base;
-    typedef typename SparseTriangularView::Index Index;
   public:
 
-    EIGEN_STRONG_INLINE ReverseInnerIterator(const SparseTriangularView& view, Index outer)
-      : Base(view.nestedExpression(), outer)
+    EIGEN_STRONG_INLINE ReverseInnerIterator(const TriangularViewType& view, Index outer)
+      : Base(view.derived().nestedExpression(), outer)
     {
       eigen_assert((!HasUnitDiag) && "ReverseInnerIterator does not support yet triangular views with a unit diagonal");
       if(SkipLast) {
@@ -166,12 +165,123 @@ class SparseTriangularView<MatrixType,Mode>::ReverseInnerIterator : public Matri
     }
 };
 
+namespace internal {
+// Evaluator of a TriangularView over an iterator-based argument: it holds evaluator<ArgType> and filters that evaluator's inner iterator according to Mode.
+template<typename ArgType, unsigned int Mode>
+struct unary_evaluator<TriangularView<ArgType,Mode>, IteratorBased>
+ : evaluator_base<TriangularView<ArgType,Mode> >
+{
+  typedef TriangularView<ArgType,Mode> XprType;
+  
+protected:
+  
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::StorageIndex StorageIndex;
+  typedef typename evaluator<ArgType>::InnerIterator EvalIterator;
+  // Compile-time switches derived from Mode and from the storage order of ArgType:
+  enum { SkipFirst = ((Mode&Lower) && !(ArgType::Flags&RowMajorBit))
+                    || ((Mode&Upper) &&  (ArgType::Flags&RowMajorBit)), // entries whose inner index is below the outer index are skipped
+         SkipLast = !SkipFirst, // otherwise iteration stops once the inner index passes the outer index
+         SkipDiag = (Mode&ZeroDiag) ? 1 : 0, // the diagonal entry is left out of the iteration
+         HasUnitDiag = (Mode&UnitDiag) ? 1 : 0 // the diagonal is reported as Scalar(1) by the iterator
+  };
+  
+public:
+  
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost, // forwarded from the nested evaluator
+    Flags = XprType::Flags
+  };
+  // Builds the nested evaluator from the expression wrapped by the view.
+  explicit unary_evaluator(const XprType &xpr) : m_argImpl(xpr.nestedExpression()) {}
+  // Forwards the estimate of the nested evaluator; entries removed by the triangular filter are not subtracted.
+  inline Index nonZerosEstimate() const {
+    return m_argImpl.nonZerosEstimate();
+  }
+  // Iterator over one inner vector of the nested evaluator, restricted to the triangular part selected by Mode.
+  class InnerIterator : public EvalIterator
+  {
+      typedef EvalIterator Base;
+    public:
+      // Positions the iterator on the first entry of inner vector `outer` that belongs to the view.
+      EIGEN_STRONG_INLINE InnerIterator(const unary_evaluator& xprEval, Index outer)
+        : Base(xprEval.m_argImpl,outer), m_returnOne(false)
+      {
+        if(SkipFirst)
+        {
+          while((*this) && ((HasUnitDiag||SkipDiag)  ? this->index()<=outer : this->index()<outer)) // skip entries before the diagonal, and the stored diagonal too when it is dropped or replaced
+            Base::operator++();
+          if(HasUnitDiag)
+            m_returnOne = true; // the implicit unit diagonal becomes the first entry reported
+        }
+        else if(HasUnitDiag && ((!Base::operator bool()) || Base::index()>=Base::outer())) // SkipLast case: no stored entry lies before the diagonal in this inner vector
+        {
+          if((!SkipFirst) && Base::operator bool())
+            Base::operator++(); // NOTE(review): presumably steps over the stored diagonal entry; confirm behavior when index()>outer
+          m_returnOne = true; // FIXME check innerSize()>outer();
+        }
+      }
+      // Advances to the next entry of the view; a pending unit diagonal is consumed before the base iterator moves.
+      EIGEN_STRONG_INLINE InnerIterator& operator++()
+      {
+        if(HasUnitDiag && m_returnOne)
+          m_returnOne = false;
+        else
+        {
+          Base::operator++();
+          if(HasUnitDiag && (!SkipFirst) && ((!Base::operator bool()) || Base::index()>=Base::outer())) // SkipLast case: the diagonal has been reached or the base iterator ran out
+          {
+            if((!SkipFirst) && Base::operator bool())
+              Base::operator++();
+            m_returnOne = true; // FIXME check innerSize()>outer();
+          }
+        }
+        return *this;
+      }
+      // True while an entry of the view is available, including a pending unit diagonal.
+      EIGEN_STRONG_INLINE operator bool() const
+      {
+        if(HasUnitDiag && m_returnOne)
+          return true;
+        if(SkipFirst) return  Base::operator bool();
+        else
+        {
+          if (SkipDiag) return (Base::operator bool() && this->index() < this->outer());
+          else return (Base::operator bool() && this->index() <= this->outer());
+        }
+      }
+
+//       inline Index row() const { return (ArgType::Flags&RowMajorBit ? Base::outer() : this->index()); }
+//       inline Index col() const { return (ArgType::Flags&RowMajorBit ? this->index() : Base::outer()); }
+      inline StorageIndex index() const
+      {
+        if(HasUnitDiag && m_returnOne)  return internal::convert_index<StorageIndex>(Base::outer()); // the unit diagonal sits at inner index == outer
+        else                            return Base::index();
+      }
+      inline Scalar value() const
+      {
+        if(HasUnitDiag && m_returnOne)  return Scalar(1);
+        else                            return Base::value();
+      }
+
+    protected:
+      bool m_returnOne; // true while the implicit unit diagonal is the current entry
+    private:
+      Scalar& valueRef(); // declared private with no definition in this block; NOTE(review): presumably blocks write access through the view, confirm
+  };
+  
+protected:
+  evaluator<ArgType> m_argImpl; // evaluator of the nested expression
+};
+
+} // end namespace internal
+
 template<typename Derived>
 template<int Mode>
-inline const SparseTriangularView<Derived, Mode>
+inline const TriangularView<const Derived, Mode>
 SparseMatrixBase<Derived>::triangularView() const
 {
-  return derived();
+  return TriangularView<const Derived, Mode>(derived()); // wraps derived() (as const Derived) in a TriangularView with the requested Mode
 }
 
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/SparseCore/SparseUtil.h b/nuparu/include/Eigen/src/SparseCore/SparseUtil.h
index 064a4070..74df0d49 100644
--- a/nuparu/include/Eigen/src/SparseCore/SparseUtil.h
+++ b/nuparu/include/Eigen/src/SparseCore/SparseUtil.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -37,43 +37,23 @@ EIGEN_STRONG_INLINE Derived& operator Op(const Other& scalar) \
 }
 
 #define EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATORS(Derived) \
-EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, =) \
-EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, +=) \
-EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, -=) \
-EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, *=) \
-EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, /=)
-
-#define _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived, BaseClass) \
-  typedef BaseClass Base; \
-  typedef typename Eigen::internal::traits<Derived >::Scalar Scalar; \
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; \
-  typedef typename Eigen::internal::nested<Derived >::type Nested; \
-  typedef typename Eigen::internal::traits<Derived >::StorageKind StorageKind; \
-  typedef typename Eigen::internal::traits<Derived >::Index Index; \
-  enum { RowsAtCompileTime = Eigen::internal::traits<Derived >::RowsAtCompileTime, \
-        ColsAtCompileTime = Eigen::internal::traits<Derived >::ColsAtCompileTime, \
-        Flags = Eigen::internal::traits<Derived >::Flags, \
-        CoeffReadCost = Eigen::internal::traits<Derived >::CoeffReadCost, \
-        SizeAtCompileTime = Base::SizeAtCompileTime, \
-        IsVectorAtCompileTime = Base::IsVectorAtCompileTime }; \
-  using Base::derived; \
-  using Base::const_cast_derived;
+EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, =)
+
 
 #define EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) \
-  _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived, Eigen::SparseMatrixBase<Derived >)
+  EIGEN_GENERIC_PUBLIC_INTERFACE(Derived)
 
+  
 const int CoherentAccessPattern     = 0x1;
 const int InnerRandomAccessPattern  = 0x2 | CoherentAccessPattern;
 const int OuterRandomAccessPattern  = 0x4 | CoherentAccessPattern;
 const int RandomAccessPattern       = 0x8 | OuterRandomAccessPattern | InnerRandomAccessPattern;
 
-template<typename Derived> class SparseMatrixBase;
-template<typename _Scalar, int _Flags = 0, typename _Index = int>  class SparseMatrix;
-template<typename _Scalar, int _Flags = 0, typename _Index = int>  class DynamicSparseMatrix;
-template<typename _Scalar, int _Flags = 0, typename _Index = int>  class SparseVector;
-template<typename _Scalar, int _Flags = 0, typename _Index = int>  class MappedSparseMatrix;
+template<typename _Scalar, int _Flags = 0, typename _StorageIndex = int>  class SparseMatrix;
+template<typename _Scalar, int _Flags = 0, typename _StorageIndex = int>  class DynamicSparseMatrix;
+template<typename _Scalar, int _Flags = 0, typename _StorageIndex = int>  class SparseVector;
+template<typename _Scalar, int _Flags = 0, typename _StorageIndex = int>  class MappedSparseMatrix;
 
-template<typename MatrixType, int Mode>           class SparseTriangularView;
 template<typename MatrixType, unsigned int UpLo>  class SparseSelfAdjointView;
 template<typename Lhs, typename Rhs>              class SparseDiagonalProduct;
 template<typename MatrixType> class SparseView;
@@ -84,41 +64,45 @@ template<typename Lhs, typename Rhs>        class DenseTimeSparseProduct;
 template<typename Lhs, typename Rhs, bool Transpose> class SparseDenseOuterProduct;
 
 template<typename Lhs, typename Rhs> struct SparseSparseProductReturnType;
-template<typename Lhs, typename Rhs, int InnerSize = internal::traits<Lhs>::ColsAtCompileTime> struct DenseSparseProductReturnType;
-template<typename Lhs, typename Rhs, int InnerSize = internal::traits<Lhs>::ColsAtCompileTime> struct SparseDenseProductReturnType;
+template<typename Lhs, typename Rhs,
+         int InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(internal::traits<Lhs>::ColsAtCompileTime,internal::traits<Rhs>::RowsAtCompileTime)> struct DenseSparseProductReturnType;
+         
+template<typename Lhs, typename Rhs,
+         int InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(internal::traits<Lhs>::ColsAtCompileTime,internal::traits<Rhs>::RowsAtCompileTime)> struct SparseDenseProductReturnType;
 template<typename MatrixType,int UpLo> class SparseSymmetricPermutationProduct;
 
 namespace internal {
 
-template<typename T,int Rows,int Cols> struct sparse_eval;
+template<typename T,int Rows,int Cols,int Flags> struct sparse_eval;
 
 template<typename T> struct eval<T,Sparse>
-  : public sparse_eval<T, traits<T>::RowsAtCompileTime,traits<T>::ColsAtCompileTime>
+  : sparse_eval<T, traits<T>::RowsAtCompileTime,traits<T>::ColsAtCompileTime,traits<T>::Flags>
 {};
 
-template<typename T,int Cols> struct sparse_eval<T,1,Cols> {
+template<typename T,int Cols,int Flags> struct sparse_eval<T,1,Cols,Flags> { // one row at compile time: evaluates to a row-major SparseVector
     typedef typename traits<T>::Scalar _Scalar;
-    typedef typename traits<T>::Index _Index;
+    typedef typename traits<T>::StorageIndex _StorageIndex;
   public:
-    typedef SparseVector<_Scalar, RowMajor, _Index> type;
+    typedef SparseVector<_Scalar, RowMajor, _StorageIndex> type;
 };
 
-template<typename T,int Rows> struct sparse_eval<T,Rows,1> {
+template<typename T,int Rows,int Flags> struct sparse_eval<T,Rows,1,Flags> { // one column at compile time: evaluates to a column-major SparseVector
     typedef typename traits<T>::Scalar _Scalar;
-    typedef typename traits<T>::Index _Index;
+    typedef typename traits<T>::StorageIndex _StorageIndex;
   public:
-    typedef SparseVector<_Scalar, ColMajor, _Index> type;
+    typedef SparseVector<_Scalar, ColMajor, _StorageIndex> type;
 };
 
-template<typename T,int Rows,int Cols> struct sparse_eval {
+// TODO this seems almost identical to plain_matrix_type<T, Sparse>
+template<typename T,int Rows,int Cols,int Flags> struct sparse_eval { // general case: evaluates to a SparseMatrix
     typedef typename traits<T>::Scalar _Scalar;
-    typedef typename traits<T>::Index _Index;
-    enum { _Options = ((traits<T>::Flags&RowMajorBit)==RowMajorBit) ? RowMajor : ColMajor };
+    typedef typename traits<T>::StorageIndex _StorageIndex;
+    enum { _Options = ((Flags&RowMajorBit)==RowMajorBit) ? RowMajor : ColMajor }; // storage order is read from the Flags template argument
   public:
-    typedef SparseMatrix<_Scalar, _Options, _Index> type;
+    typedef SparseMatrix<_Scalar, _Options, _StorageIndex> type;
 };
 
-template<typename T> struct sparse_eval<T,1,1> {
+template<typename T,int Flags> struct sparse_eval<T,1,1,Flags> {
     typedef typename traits<T>::Scalar _Scalar;
   public:
     typedef Matrix<_Scalar, 1, 1> type;
@@ -127,12 +111,35 @@ template<typename T> struct sparse_eval<T,1,1> {
 template<typename T> struct plain_matrix_type<T,Sparse>
 {
   typedef typename traits<T>::Scalar _Scalar;
-  typedef typename traits<T>::Index _Index;
-  enum { _Options = ((traits<T>::Flags&RowMajorBit)==RowMajorBit) ? RowMajor : ColMajor };
+  typedef typename traits<T>::StorageIndex _StorageIndex;
+  enum { _Options = ((evaluator<T>::Flags&RowMajorBit)==RowMajorBit) ? RowMajor : ColMajor }; // storage order is read from evaluator<T>::Flags
   public:
-    typedef SparseMatrix<_Scalar, _Options, _Index> type;
+    typedef SparseMatrix<_Scalar, _Options, _StorageIndex> type; // SparseMatrix with T's scalar and storage index types
+};
+
+template<typename T>
+struct plain_object_eval<T,Sparse> // inherits sparse_eval, with the Flags argument taken from evaluator<T>
+  : sparse_eval<T, traits<T>::RowsAtCompileTime,traits<T>::ColsAtCompileTime, evaluator<T>::Flags>
+{};
+
+template<typename Decomposition, typename RhsType>
+struct solve_traits<Decomposition,RhsType,Sparse> // PlainObject is the sparse_eval type computed from the right-hand side only
+{
+  typedef typename sparse_eval<RhsType, RhsType::RowsAtCompileTime, RhsType::ColsAtCompileTime,traits<RhsType>::Flags>::type PlainObject;
 };
 
+template<typename Derived>
+struct generic_xpr_base<Derived, MatrixXpr, Sparse> // for MatrixXpr expressions with Sparse storage, type is SparseMatrixBase<Derived>
+{
+  typedef SparseMatrixBase<Derived> type;
+};
+
+struct SparseTriangularShape  { static std::string debugName() { return "SparseTriangularShape"; } };
+struct SparseSelfAdjointShape { static std::string debugName() { return "SparseSelfAdjointShape"; } };
+// glue_shapes maps SparseShape paired with a self-adjoint or triangular shape to the corresponding tag declared above.
+template<> struct glue_shapes<SparseShape,SelfAdjointShape> { typedef SparseSelfAdjointShape type;  };
+template<> struct glue_shapes<SparseShape,TriangularShape > { typedef SparseTriangularShape  type;  };
+
 } // end namespace internal
 
 /** \ingroup SparseCore_Module
@@ -143,26 +150,26 @@ template<typename T> struct plain_matrix_type<T,Sparse>
   *
   * \sa SparseMatrix::setFromTriplets()
   */
-template<typename Scalar, typename Index=unsigned int>
+template<typename Scalar, typename StorageIndex=typename SparseMatrix<Scalar>::StorageIndex > // index type defaults to the storage index of SparseMatrix<Scalar>
 class Triplet
 {
 public:
   Triplet() : m_row(0), m_col(0), m_value(0) {}
 
-  Triplet(const Index& i, const Index& j, const Scalar& v = Scalar(0))
+  Triplet(const StorageIndex& i, const StorageIndex& j, const Scalar& v = Scalar(0)) // (row, column, value); the value defaults to Scalar(0)
     : m_row(i), m_col(j), m_value(v)
   {}
 
   /** \returns the row index of the element */
-  const Index& row() const { return m_row; }
+  const StorageIndex& row() const { return m_row; }
 
   /** \returns the column index of the element */
-  const Index& col() const { return m_col; }
+  const StorageIndex& col() const { return m_col; }
 
   /** \returns the value of the element */
   const Scalar& value() const { return m_value; }
 protected:
-  Index m_row, m_col;
+  StorageIndex m_row, m_col; // row and column of the element
   Scalar m_value;
 };
 
diff --git a/nuparu/include/Eigen/src/SparseCore/SparseVector.h b/nuparu/include/Eigen/src/SparseCore/SparseVector.h
index 7e15c814..7ec73a36 100644
--- a/nuparu/include/Eigen/src/SparseCore/SparseVector.h
+++ b/nuparu/include/Eigen/src/SparseCore/SparseVector.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -26,11 +26,11 @@ namespace Eigen {
   */
 
 namespace internal {
-template<typename _Scalar, int _Options, typename _Index>
-struct traits<SparseVector<_Scalar, _Options, _Index> >
+template<typename _Scalar, int _Options, typename _StorageIndex>
+struct traits<SparseVector<_Scalar, _Options, _StorageIndex> >
 {
   typedef _Scalar Scalar;
-  typedef _Index Index;
+  typedef _StorageIndex StorageIndex;
   typedef Sparse StorageKind;
   typedef MatrixXpr XprKind;
   enum {
@@ -40,8 +40,7 @@ struct traits<SparseVector<_Scalar, _Options, _Index> >
     ColsAtCompileTime = IsColVector ? 1 : Dynamic,
     MaxRowsAtCompileTime = RowsAtCompileTime,
     MaxColsAtCompileTime = ColsAtCompileTime,
-    Flags = _Options | NestByRefBit | LvalueBit | (IsColVector ? 0 : RowMajorBit),
-    CoeffReadCost = NumTraits<Scalar>::ReadCost,
+    Flags = _Options | NestByRefBit | LvalueBit | (IsColVector ? 0 : RowMajorBit) | CompressedAccessBit,
     SupportedAccessPatterns = InnerRandomAccessPattern
   };
 };
@@ -61,18 +60,18 @@ struct sparse_vector_assign_selector;
 
 }
 
-template<typename _Scalar, int _Options, typename _Index>
+template<typename _Scalar, int _Options, typename _StorageIndex>
 class SparseVector
-  : public SparseMatrixBase<SparseVector<_Scalar, _Options, _Index> >
+  : public SparseCompressedBase<SparseVector<_Scalar, _Options, _StorageIndex> >
 {
-    typedef SparseMatrixBase<SparseVector> SparseBase;
-    
+    typedef SparseCompressedBase<SparseVector> Base;
+    using Base::convert_index;
   public:
     EIGEN_SPARSE_PUBLIC_INTERFACE(SparseVector)
     EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseVector, +=)
     EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseVector, -=)
     
-    typedef internal::CompressedStorage<Scalar,Index> Storage;
+    typedef internal::CompressedStorage<Scalar,StorageIndex> Storage;
     enum { IsColVector = internal::traits<SparseVector>::IsColVector };
     
     enum {
@@ -87,8 +86,13 @@ class SparseVector
     EIGEN_STRONG_INLINE const Scalar* valuePtr() const { return &m_data.value(0); }
     EIGEN_STRONG_INLINE Scalar* valuePtr() { return &m_data.value(0); }
 
-    EIGEN_STRONG_INLINE const Index* innerIndexPtr() const { return &m_data.index(0); }
-    EIGEN_STRONG_INLINE Index* innerIndexPtr() { return &m_data.index(0); }
+    EIGEN_STRONG_INLINE const StorageIndex* innerIndexPtr() const { return &m_data.index(0); }
+    EIGEN_STRONG_INLINE StorageIndex* innerIndexPtr() { return &m_data.index(0); }
+
+    inline const StorageIndex* outerIndexPtr() const { return 0; }
+    inline StorageIndex* outerIndexPtr() { return 0; }
+    inline const StorageIndex* innerNonZeroPtr() const { return 0; }
+    inline StorageIndex* innerNonZeroPtr() { return 0; }
     
     /** \internal */
     inline Storage& data() { return m_data; }
@@ -103,13 +107,13 @@ class SparseVector
     inline Scalar coeff(Index i) const
     {
       eigen_assert(i>=0 && i<m_size);
-      return m_data.at(i);
+      return m_data.at(StorageIndex(i));
     }
 
     inline Scalar& coeffRef(Index row, Index col)
     {
       eigen_assert(IsColVector ? (col==0 && row>=0 && row<m_size) : (row==0 && col>=0 && col<m_size));
-      return coeff(IsColVector ? row : col);
+      return coeffRef(IsColVector ? row : col);
     }
 
     /** \returns a reference to the coefficient value at given index \a i
@@ -121,18 +125,18 @@ class SparseVector
     inline Scalar& coeffRef(Index i)
     {
       eigen_assert(i>=0 && i<m_size);
-      return m_data.atWithInsertion(i);
+      return m_data.atWithInsertion(StorageIndex(i));
     }
 
   public:
 
-    class InnerIterator;
-    class ReverseInnerIterator;
+    typedef typename Base::InnerIterator InnerIterator;
+    typedef typename Base::ReverseInnerIterator ReverseInnerIterator;
 
     inline void setZero() { m_data.clear(); }
 
     /** \returns the number of non zero coefficients */
-    inline Index nonZeros() const  { return static_cast<Index>(m_data.size()); }
+    inline Index nonZeros() const  { return m_data.size(); }
 
     inline void startVec(Index outer)
     {
@@ -151,6 +155,18 @@ class SparseVector
       m_data.append(0, i);
       return m_data.value(m_data.size()-1);
     }
+    
+    Scalar& insertBackByOuterInnerUnordered(Index outer, Index inner)
+    {
+      EIGEN_UNUSED_VARIABLE(outer);
+      eigen_assert(outer==0);
+      return insertBackUnordered(inner);
+    }
+    inline Scalar& insertBackUnordered(Index i)
+    {
+      m_data.append(0, i);
+      return m_data.value(m_data.size()-1);
+    }
 
     inline Scalar& insert(Index row, Index col)
     {
@@ -158,6 +174,7 @@ class SparseVector
       
       Index inner = IsColVector ? row : col;
       Index outer = IsColVector ? col : row;
+      EIGEN_ONLY_USED_FOR_DEBUG(outer);
       eigen_assert(outer==0);
       return insert(inner);
     }
@@ -176,7 +193,7 @@ class SparseVector
         m_data.value(p+1) = m_data.value(p);
         --p;
       }
-      m_data.index(p+1) = i;
+      m_data.index(p+1) = convert_index(i);
       m_data.value(p+1) = 0;
       return m_data.value(p+1);
     }
@@ -195,7 +212,7 @@ class SparseVector
 
     void resize(Index rows, Index cols)
     {
-      eigen_assert(rows==1 || cols==1);
+      eigen_assert((IsColVector ? cols : rows)==1 && "Outer dimension must equal 1");
       resize(IsColVector ? rows : cols);
     }
 
@@ -209,7 +226,7 @@ class SparseVector
 
     inline SparseVector() : m_size(0) { check_template_parameters(); resize(0); }
 
-    inline SparseVector(Index size) : m_size(0) { check_template_parameters(); resize(size); }
+    explicit inline SparseVector(Index size) : m_size(0) { check_template_parameters(); resize(size); }
 
     inline SparseVector(Index rows, Index cols) : m_size(0) { check_template_parameters(); resize(rows,cols); }
 
@@ -217,12 +234,15 @@ class SparseVector
     inline SparseVector(const SparseMatrixBase<OtherDerived>& other)
       : m_size(0)
     {
+      #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+        EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+      #endif
       check_template_parameters();
       *this = other.derived();
     }
 
     inline SparseVector(const SparseVector& other)
-      : SparseBase(other), m_size(0)
+      : Base(other), m_size(0)
     {
       check_template_parameters();
       *this = other.derived();
@@ -336,7 +356,7 @@ class SparseVector
   
     static void check_template_parameters()
     {
-      EIGEN_STATIC_ASSERT(NumTraits<Index>::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE);
+      EIGEN_STATIC_ASSERT(NumTraits<StorageIndex>::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE);
       EIGEN_STATIC_ASSERT((_Options&(ColMajor|RowMajor))==Options,INVALID_MATRIX_TEMPLATE_PARAMETERS);
     }
     
@@ -344,77 +364,43 @@ class SparseVector
     Index m_size;
 };
 
-template<typename Scalar, int _Options, typename _Index>
-class SparseVector<Scalar,_Options,_Index>::InnerIterator
-{
-  public:
-    InnerIterator(const SparseVector& vec, Index outer=0)
-      : m_data(vec.m_data), m_id(0), m_end(static_cast<Index>(m_data.size()))
-    {
-      EIGEN_UNUSED_VARIABLE(outer);
-      eigen_assert(outer==0);
-    }
-
-    InnerIterator(const internal::CompressedStorage<Scalar,Index>& data)
-      : m_data(data), m_id(0), m_end(static_cast<Index>(m_data.size()))
-    {}
-
-    inline InnerIterator& operator++() { m_id++; return *this; }
-
-    inline Scalar value() const { return m_data.value(m_id); }
-    inline Scalar& valueRef() { return const_cast<Scalar&>(m_data.value(m_id)); }
-
-    inline Index index() const { return m_data.index(m_id); }
-    inline Index row() const { return IsColVector ? index() : 0; }
-    inline Index col() const { return IsColVector ? 0 : index(); }
-
-    inline operator bool() const { return (m_id < m_end); }
-
-  protected:
-    const internal::CompressedStorage<Scalar,Index>& m_data;
-    Index m_id;
-    const Index m_end;
-};
+namespace internal {
 
-template<typename Scalar, int _Options, typename _Index>
-class SparseVector<Scalar,_Options,_Index>::ReverseInnerIterator
+template<typename _Scalar, int _Options, typename _Index> // _Index is passed as the third SparseVector argument, i.e. its storage index type
+struct evaluator<SparseVector<_Scalar,_Options,_Index> >
+  : evaluator_base<SparseVector<_Scalar,_Options,_Index> > // evaluator of a plain SparseVector: references the vector and reuses its iterator types
 {
-  public:
-    ReverseInnerIterator(const SparseVector& vec, Index outer=0)
-      : m_data(vec.m_data), m_id(static_cast<Index>(m_data.size())), m_start(0)
-    {
-      EIGEN_UNUSED_VARIABLE(outer);
-      eigen_assert(outer==0);
-    }
-
-    ReverseInnerIterator(const internal::CompressedStorage<Scalar,Index>& data)
-      : m_data(data), m_id(static_cast<Index>(m_data.size())), m_start(0)
-    {}
-
-    inline ReverseInnerIterator& operator--() { m_id--; return *this; }
-
-    inline Scalar value() const { return m_data.value(m_id-1); }
-    inline Scalar& valueRef() { return const_cast<Scalar&>(m_data.value(m_id-1)); }
-
-    inline Index index() const { return m_data.index(m_id-1); }
-    inline Index row() const { return IsColVector ? index() : 0; }
-    inline Index col() const { return IsColVector ? 0 : index(); }
-
-    inline operator bool() const { return (m_id > m_start); }
-
-  protected:
-    const internal::CompressedStorage<Scalar,Index>& m_data;
-    Index m_id;
-    const Index m_start;
+  typedef SparseVector<_Scalar,_Options,_Index> SparseVectorType;
+  typedef typename SparseVectorType::InnerIterator InnerIterator;
+  typedef typename SparseVectorType::ReverseInnerIterator ReverseInnerIterator;
+  
+  enum {
+    CoeffReadCost = NumTraits<_Scalar>::ReadCost,
+    Flags = SparseVectorType::Flags
+  };
+  // Binds the evaluator to the vector by reference; no copy is made.
+  explicit evaluator(const SparseVectorType &mat) : m_matrix(mat)
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+  // Returns the vector's nonZeros().
+  inline Index nonZerosEstimate() const {
+    return m_matrix.nonZeros();
+  }
+  // Conversions back to the evaluated vector; the non-const one goes through const_cast_derived().
+  operator SparseVectorType&() { return m_matrix.const_cast_derived(); }
+  operator const SparseVectorType&() const { return m_matrix; }
+  
+  const SparseVectorType &m_matrix; // the evaluated vector, held by reference
 };
 
-namespace internal {
-
 template< typename Dest, typename Src>
 struct sparse_vector_assign_selector<Dest,Src,SVA_Inner> {
   static void run(Dest& dst, const Src& src) {
     eigen_internal_assert(src.innerSize()==src.size());
-    for(typename Src::InnerIterator it(src, 0); it; ++it)
+    typedef internal::evaluator<Src> SrcEvaluatorType;
+    SrcEvaluatorType srcEval(src); // iteration goes through the source's evaluator, not the expression itself
+    for(typename SrcEvaluatorType::InnerIterator it(srcEval, 0); it; ++it) // walk inner vector 0 and insert each visited entry into dst
       dst.insert(it.index()) = it.value();
   }
 };
@@ -423,9 +409,11 @@ template< typename Dest, typename Src>
 struct sparse_vector_assign_selector<Dest,Src,SVA_Outer> {
   static void run(Dest& dst, const Src& src) {
     eigen_internal_assert(src.outerSize()==src.size());
-    for(typename Dest::Index i=0; i<src.size(); ++i)
+    typedef internal::evaluator<Src> SrcEvaluatorType;
+    SrcEvaluatorType srcEval(src);
+    for(Index i=0; i<src.size(); ++i)
     {
-      typename Src::InnerIterator it(src, i);
+      typename SrcEvaluatorType::InnerIterator it(srcEval, i);
       if(it)
         dst.insert(i) = it.value();
     }
diff --git a/nuparu/include/Eigen/src/SparseCore/SparseView.h b/nuparu/include/Eigen/src/SparseCore/SparseView.h
index fd845046..c945c4da 100644
--- a/nuparu/include/Eigen/src/SparseCore/SparseView.h
+++ b/nuparu/include/Eigen/src/SparseCore/SparseView.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2010 Daniel Lowengrub <lowdanie@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -18,7 +18,7 @@ namespace internal {
 template<typename MatrixType>
 struct traits<SparseView<MatrixType> > : traits<MatrixType>
 {
-  typedef typename MatrixType::Index Index;
+  typedef typename MatrixType::StorageIndex StorageIndex;
   typedef Sparse StorageKind;
   enum {
     Flags = int(traits<MatrixType>::Flags) & (RowMajorBit)
@@ -32,66 +32,189 @@ class SparseView : public SparseMatrixBase<SparseView<MatrixType> >
 {
   typedef typename MatrixType::Nested MatrixTypeNested;
   typedef typename internal::remove_all<MatrixTypeNested>::type _MatrixTypeNested;
+  typedef SparseMatrixBase<SparseView > Base;
 public:
   EIGEN_SPARSE_PUBLIC_INTERFACE(SparseView)
+  typedef typename internal::remove_all<MatrixType>::type NestedExpression;
 
-  SparseView(const MatrixType& mat, const Scalar& m_reference = Scalar(0),
-             typename NumTraits<Scalar>::Real m_epsilon = NumTraits<Scalar>::dummy_precision()) : 
-    m_matrix(mat), m_reference(m_reference), m_epsilon(m_epsilon) {}
-
-  class InnerIterator;
+  explicit SparseView(const MatrixType& mat, const Scalar& reference = Scalar(0),
+                      RealScalar epsilon = NumTraits<Scalar>::dummy_precision())
+    : m_matrix(mat), m_reference(reference), m_epsilon(epsilon) {}
 
   inline Index rows() const { return m_matrix.rows(); }
   inline Index cols() const { return m_matrix.cols(); }
 
   inline Index innerSize() const { return m_matrix.innerSize(); }
   inline Index outerSize() const { return m_matrix.outerSize(); }
-
+  
+  /** \returns the nested expression */
+  const typename internal::remove_all<MatrixTypeNested>::type&
+  nestedExpression() const { return m_matrix; }
+  
+  Scalar reference() const { return m_reference; }
+  RealScalar epsilon() const { return m_epsilon; }
+  
 protected:
   MatrixTypeNested m_matrix;
   Scalar m_reference;
-  typename NumTraits<Scalar>::Real m_epsilon;
+  RealScalar m_epsilon;
 };
 
-template<typename MatrixType>
-class SparseView<MatrixType>::InnerIterator : public _MatrixTypeNested::InnerIterator
-{
-  typedef typename SparseView::Index Index;
-public:
-  typedef typename _MatrixTypeNested::InnerIterator IterBase;
-  InnerIterator(const SparseView& view, Index outer) :
-  IterBase(view.m_matrix, outer), m_view(view)
-  {
-    incrementToNonZero();
-  }
-
-  EIGEN_STRONG_INLINE InnerIterator& operator++()
-  {
-    IterBase::operator++();
-    incrementToNonZero();
-    return *this;
-  }
-
-  using IterBase::value;
+namespace internal {
 
-protected:
-  const SparseView& m_view;
+// TODO find a way to unify the two following variants
+// This is tricky: implementing an inner iterator on top of an IndexBased evaluator is
+// not easy because the evaluators do not expose the sizes of the underlying expression.
+  
+template<typename ArgType>
+struct unary_evaluator<SparseView<ArgType>, IteratorBased>
+  : public evaluator_base<SparseView<ArgType> >
+{
+    typedef typename evaluator<ArgType>::InnerIterator EvalIterator;
+  public:
+    typedef SparseView<ArgType> XprType;
+    
+    class InnerIterator : public EvalIterator
+    {
+        typedef typename XprType::Scalar Scalar;
+      public:
+
+        EIGEN_STRONG_INLINE InnerIterator(const unary_evaluator& sve, Index outer)
+          : EvalIterator(sve.m_argImpl,outer), m_view(sve.m_view)
+        {
+          incrementToNonZero();
+        }
+
+        EIGEN_STRONG_INLINE InnerIterator& operator++()
+        {
+          EvalIterator::operator++();
+          incrementToNonZero();
+          return *this;
+        }
+
+        using EvalIterator::value;
+
+      protected:
+        const XprType &m_view;
+
+      private:
+        void incrementToNonZero()
+        {
+          while((bool(*this)) && internal::isMuchSmallerThan(value(), m_view.reference(), m_view.epsilon()))
+          {
+            EvalIterator::operator++();
+          }
+        }
+    };
+    
+    enum {
+      CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+      Flags = XprType::Flags
+    };
+    
+    explicit unary_evaluator(const XprType& xpr) : m_argImpl(xpr.nestedExpression()), m_view(xpr) {}
+
+  protected:
+    evaluator<ArgType> m_argImpl;
+    const XprType &m_view;
+};
 
-private:
-  void incrementToNonZero()
-  {
-    while((bool(*this)) && internal::isMuchSmallerThan(value(), m_view.m_reference, m_view.m_epsilon))
+template<typename ArgType>
+struct unary_evaluator<SparseView<ArgType>, IndexBased>
+  : public evaluator_base<SparseView<ArgType> >
+{
+  public:
+    typedef SparseView<ArgType> XprType;
+  protected:
+    enum { IsRowMajor = (XprType::Flags&RowMajorBit)==RowMajorBit };
+    typedef typename XprType::Scalar Scalar;
+    typedef typename XprType::StorageIndex StorageIndex;
+  public:
+    
+    class InnerIterator
     {
-      IterBase::operator++();
-    }
-  }
+      public:
+
+        EIGEN_STRONG_INLINE InnerIterator(const unary_evaluator& sve, Index outer)
+          : m_sve(sve), m_inner(0), m_outer(outer), m_end(sve.m_view.innerSize())
+        {
+          incrementToNonZero();
+        }
+
+        EIGEN_STRONG_INLINE InnerIterator& operator++()
+        {
+          m_inner++;
+          incrementToNonZero();
+          return *this;
+        }
+
+        EIGEN_STRONG_INLINE Scalar value() const
+        {
+          return (IsRowMajor) ? m_sve.m_argImpl.coeff(m_outer, m_inner)
+                              : m_sve.m_argImpl.coeff(m_inner, m_outer);
+        }
+
+        EIGEN_STRONG_INLINE StorageIndex index() const { return m_inner; }
+        inline Index row() const { return IsRowMajor ? m_outer : index(); }
+        inline Index col() const { return IsRowMajor ? index() : m_outer; }
+
+        EIGEN_STRONG_INLINE operator bool() const { return m_inner < m_end && m_inner>=0; }
+
+      protected:
+        const unary_evaluator &m_sve;
+        Index m_inner;
+        const Index m_outer;
+        const Index m_end;
+
+      private:
+        void incrementToNonZero()
+        {
+          while((bool(*this)) && internal::isMuchSmallerThan(value(), m_sve.m_view.reference(), m_sve.m_view.epsilon()))
+          {
+            m_inner++;
+          }
+        }
+    };
+    
+    enum {
+      CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+      Flags = XprType::Flags
+    };
+    
+    explicit unary_evaluator(const XprType& xpr) : m_argImpl(xpr.nestedExpression()), m_view(xpr) {}
+
+  protected:
+    evaluator<ArgType> m_argImpl;
+    const XprType &m_view;
 };
 
+} // end namespace internal
+
+template<typename Derived>
+const SparseView<Derived> MatrixBase<Derived>::sparseView(const Scalar& reference,
+                                                          const typename NumTraits<Scalar>::Real& epsilon) const
+{
+  return SparseView<Derived>(derived(), reference, epsilon);
+}
+
+/** \returns an expression of \c *this in which values smaller than
+  * \a reference * \a epsilon are removed.
+  *
+  * This method is typically used in conjunction with the product of two sparse matrices
+  * to automatically prune the smallest values as follows:
+  * \code
+  * C = (A*B).pruned();             // suppress numerical zeros (exact)
+  * C = (A*B).pruned(ref);
+  * C = (A*B).pruned(ref,epsilon);
+  * \endcode
+  * where \c ref is a meaningful non zero reference value.
+  * */
 template<typename Derived>
-const SparseView<Derived> MatrixBase<Derived>::sparseView(const Scalar& m_reference,
-                                                          const typename NumTraits<Scalar>::Real& m_epsilon) const
+const SparseView<Derived>
+SparseMatrixBase<Derived>::pruned(const Scalar& reference,
+                                  const RealScalar& epsilon) const
 {
-  return SparseView<Derived>(derived(), m_reference, m_epsilon);
+  return SparseView<Derived>(derived(), reference, epsilon);
 }
 
 } // end namespace Eigen
diff --git a/nuparu/include/Eigen/src/SparseCore/TriangularSolver.h b/nuparu/include/Eigen/src/SparseCore/TriangularSolver.h
index cb8ad82b..19f8f670 100644
--- a/nuparu/include/Eigen/src/SparseCore/TriangularSolver.h
+++ b/nuparu/include/Eigen/src/SparseCore/TriangularSolver.h
@@ -28,16 +28,19 @@ template<typename Lhs, typename Rhs, int Mode>
 struct sparse_solve_triangular_selector<Lhs,Rhs,Mode,Lower,RowMajor>
 {
   typedef typename Rhs::Scalar Scalar;
+  typedef evaluator<Lhs> LhsEval;
+  typedef typename evaluator<Lhs>::InnerIterator LhsIterator;
   static void run(const Lhs& lhs, Rhs& other)
   {
-    for(int col=0 ; col<other.cols() ; ++col)
+    LhsEval lhsEval(lhs);
+    for(Index col=0 ; col<other.cols() ; ++col)
     {
-      for(int i=0; i<lhs.rows(); ++i)
+      for(Index i=0; i<lhs.rows(); ++i)
       {
         Scalar tmp = other.coeff(i,col);
         Scalar lastVal(0);
-        int lastIndex = 0;
-        for(typename Lhs::InnerIterator it(lhs, i); it; ++it)
+        Index lastIndex = 0;
+        for(LhsIterator it(lhsEval, i); it; ++it)
         {
           lastVal = it.value();
           lastIndex = it.index();
@@ -62,15 +65,18 @@ template<typename Lhs, typename Rhs, int Mode>
 struct sparse_solve_triangular_selector<Lhs,Rhs,Mode,Upper,RowMajor>
 {
   typedef typename Rhs::Scalar Scalar;
+  typedef evaluator<Lhs> LhsEval;
+  typedef typename evaluator<Lhs>::InnerIterator LhsIterator;
   static void run(const Lhs& lhs, Rhs& other)
   {
-    for(int col=0 ; col<other.cols() ; ++col)
+    LhsEval lhsEval(lhs);
+    for(Index col=0 ; col<other.cols() ; ++col)
     {
-      for(int i=lhs.rows()-1 ; i>=0 ; --i)
+      for(Index i=lhs.rows()-1 ; i>=0 ; --i)
       {
         Scalar tmp = other.coeff(i,col);
-        Scalar l_ii = 0;
-        typename Lhs::InnerIterator it(lhs, i);
+        Scalar l_ii(0);
+        LhsIterator it(lhsEval, i);
         while(it && it.index()<i)
           ++it;
         if(!(Mode & UnitDiag))
@@ -86,10 +92,8 @@ struct sparse_solve_triangular_selector<Lhs,Rhs,Mode,Upper,RowMajor>
           tmp -= it.value() * other.coeff(it.index(),col);
         }
 
-        if (Mode & UnitDiag)
-          other.coeffRef(i,col) = tmp;
-        else
-          other.coeffRef(i,col) = tmp/l_ii;
+        if (Mode & UnitDiag)  other.coeffRef(i,col) = tmp;
+        else                  other.coeffRef(i,col) = tmp/l_ii;
       }
     }
   }
@@ -100,16 +104,19 @@ template<typename Lhs, typename Rhs, int Mode>
 struct sparse_solve_triangular_selector<Lhs,Rhs,Mode,Lower,ColMajor>
 {
   typedef typename Rhs::Scalar Scalar;
+  typedef evaluator<Lhs> LhsEval;
+  typedef typename evaluator<Lhs>::InnerIterator LhsIterator;
   static void run(const Lhs& lhs, Rhs& other)
   {
-    for(int col=0 ; col<other.cols() ; ++col)
+    LhsEval lhsEval(lhs);
+    for(Index col=0 ; col<other.cols() ; ++col)
     {
-      for(int i=0; i<lhs.cols(); ++i)
+      for(Index i=0; i<lhs.cols(); ++i)
       {
         Scalar& tmp = other.coeffRef(i,col);
         if (tmp!=Scalar(0)) // optimization when other is actually sparse
         {
-          typename Lhs::InnerIterator it(lhs, i);
+          LhsIterator it(lhsEval, i);
           while(it && it.index()<i)
             ++it;
           if(!(Mode & UnitDiag))
@@ -132,11 +139,14 @@ template<typename Lhs, typename Rhs, int Mode>
 struct sparse_solve_triangular_selector<Lhs,Rhs,Mode,Upper,ColMajor>
 {
   typedef typename Rhs::Scalar Scalar;
+  typedef evaluator<Lhs> LhsEval;
+  typedef typename evaluator<Lhs>::InnerIterator LhsIterator;
   static void run(const Lhs& lhs, Rhs& other)
   {
-    for(int col=0 ; col<other.cols() ; ++col)
+    LhsEval lhsEval(lhs);
+    for(Index col=0 ; col<other.cols() ; ++col)
     {
-      for(int i=lhs.cols()-1; i>=0; --i)
+      for(Index i=lhs.cols()-1; i>=0; --i)
       {
         Scalar& tmp = other.coeffRef(i,col);
         if (tmp!=Scalar(0)) // optimization when other is actually sparse
@@ -144,13 +154,13 @@ struct sparse_solve_triangular_selector<Lhs,Rhs,Mode,Upper,ColMajor>
           if(!(Mode & UnitDiag))
           {
             // TODO replace this by a binary search. make sure the binary search is safe for partially sorted elements
-            typename Lhs::ReverseInnerIterator it(lhs, i);
+            LhsIterator it(lhsEval, i);
             while(it && it.index()!=i)
-              --it;
+              ++it;
             eigen_assert(it && it.index()==i);
             other.coeffRef(i,col) /= it.value();
           }
-          typename Lhs::InnerIterator it(lhs, i);
+          LhsIterator it(lhsEval, i);
           for(; it && it.index()<i; ++it)
             other.coeffRef(it.index(), col) -= tmp * it.value();
         }
@@ -161,11 +171,11 @@ struct sparse_solve_triangular_selector<Lhs,Rhs,Mode,Upper,ColMajor>
 
 } // end namespace internal
 
-template<typename ExpressionType,int Mode>
+template<typename ExpressionType,unsigned int Mode>
 template<typename OtherDerived>
-void SparseTriangularView<ExpressionType,Mode>::solveInPlace(MatrixBase<OtherDerived>& other) const
+void TriangularViewImpl<ExpressionType,Mode,Sparse>::solveInPlace(MatrixBase<OtherDerived>& other) const
 {
-  eigen_assert(m_matrix.cols() == m_matrix.rows() && m_matrix.cols() == other.rows());
+  eigen_assert(derived().cols() == derived().rows() && derived().cols() == other.rows());
   eigen_assert((!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower)));
 
   enum { copy = internal::traits<OtherDerived>::Flags & RowMajorBit };
@@ -174,22 +184,12 @@ void SparseTriangularView<ExpressionType,Mode>::solveInPlace(MatrixBase<OtherDer
     typename internal::plain_matrix_type_column_major<OtherDerived>::type, OtherDerived&>::type OtherCopy;
   OtherCopy otherCopy(other.derived());
 
-  internal::sparse_solve_triangular_selector<ExpressionType, typename internal::remove_reference<OtherCopy>::type, Mode>::run(m_matrix, otherCopy);
+  internal::sparse_solve_triangular_selector<ExpressionType, typename internal::remove_reference<OtherCopy>::type, Mode>::run(derived().nestedExpression(), otherCopy);
 
   if (copy)
     other = otherCopy;
 }
 
-template<typename ExpressionType,int Mode>
-template<typename OtherDerived>
-typename internal::plain_matrix_type_column_major<OtherDerived>::type
-SparseTriangularView<ExpressionType,Mode>::solve(const MatrixBase<OtherDerived>& other) const
-{
-  typename internal::plain_matrix_type_column_major<OtherDerived>::type res(other);
-  solveInPlace(res);
-  return res;
-}
-
 // pure sparse path
 
 namespace internal {
@@ -208,18 +208,18 @@ template<typename Lhs, typename Rhs, int Mode, int UpLo>
 struct sparse_solve_triangular_sparse_selector<Lhs,Rhs,Mode,UpLo,ColMajor>
 {
   typedef typename Rhs::Scalar Scalar;
-  typedef typename promote_index_type<typename traits<Lhs>::Index,
-                                         typename traits<Rhs>::Index>::type Index;
+  typedef typename promote_index_type<typename traits<Lhs>::StorageIndex,
+                                      typename traits<Rhs>::StorageIndex>::type StorageIndex;
   static void run(const Lhs& lhs, Rhs& other)
   {
     const bool IsLower = (UpLo==Lower);
-    AmbiVector<Scalar,Index> tempVector(other.rows()*2);
+    AmbiVector<Scalar,StorageIndex> tempVector(other.rows()*2);
     tempVector.setBounds(0,other.rows());
 
     Rhs res(other.rows(), other.cols());
     res.reserve(other.nonZeros());
 
-    for(int col=0 ; col<other.cols() ; ++col)
+    for(Index col=0 ; col<other.cols() ; ++col)
     {
       // FIXME estimate number of non zeros
       tempVector.init(.99/*float(other.col(col).nonZeros())/float(other.rows())*/);
@@ -230,7 +230,7 @@ struct sparse_solve_triangular_sparse_selector<Lhs,Rhs,Mode,UpLo,ColMajor>
         tempVector.coeffRef(rhsIt.index()) = rhsIt.value();
       }
 
-      for(int i=IsLower?0:lhs.cols()-1;
+      for(Index i=IsLower?0:lhs.cols()-1;
           IsLower?i<lhs.cols():i>=0;
           i+=IsLower?1:-1)
       {
@@ -267,9 +267,9 @@ struct sparse_solve_triangular_sparse_selector<Lhs,Rhs,Mode,UpLo,ColMajor>
       }
 
 
-      int count = 0;
+      Index count = 0;
       // FIXME compute a reference value to filter zeros
-      for (typename AmbiVector<Scalar,Index>::Iterator it(tempVector/*,1e-12*/); it; ++it)
+      for (typename AmbiVector<Scalar,StorageIndex>::Iterator it(tempVector/*,1e-12*/); it; ++it)
       {
         ++ count;
 //         std::cerr << "fill " << it.index() << ", " << col << "\n";
@@ -286,11 +286,11 @@ struct sparse_solve_triangular_sparse_selector<Lhs,Rhs,Mode,UpLo,ColMajor>
 
 } // end namespace internal
 
-template<typename ExpressionType,int Mode>
+template<typename ExpressionType,unsigned int Mode>
 template<typename OtherDerived>
-void SparseTriangularView<ExpressionType,Mode>::solveInPlace(SparseMatrixBase<OtherDerived>& other) const
+void TriangularViewImpl<ExpressionType,Mode,Sparse>::solveInPlace(SparseMatrixBase<OtherDerived>& other) const
 {
-  eigen_assert(m_matrix.cols() == m_matrix.rows() && m_matrix.cols() == other.rows());
+  eigen_assert(derived().cols() == derived().rows() && derived().cols() == other.rows());
   eigen_assert( (!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower)));
 
 //   enum { copy = internal::traits<OtherDerived>::Flags & RowMajorBit };
@@ -299,36 +299,12 @@ void SparseTriangularView<ExpressionType,Mode>::solveInPlace(SparseMatrixBase<Ot
 //     typename internal::plain_matrix_type_column_major<OtherDerived>::type, OtherDerived&>::type OtherCopy;
 //   OtherCopy otherCopy(other.derived());
 
-  internal::sparse_solve_triangular_sparse_selector<ExpressionType, OtherDerived, Mode>::run(m_matrix, other.derived());
+  internal::sparse_solve_triangular_sparse_selector<ExpressionType, OtherDerived, Mode>::run(derived().nestedExpression(), other.derived());
 
 //   if (copy)
 //     other = otherCopy;
 }
 
-#ifdef EIGEN2_SUPPORT
-
-// deprecated stuff:
-
-/** \deprecated */
-template<typename Derived>
-template<typename OtherDerived>
-void SparseMatrixBase<Derived>::solveTriangularInPlace(MatrixBase<OtherDerived>& other) const
-{
-  this->template triangular<Flags&(Upper|Lower)>().solveInPlace(other);
-}
-
-/** \deprecated */
-template<typename Derived>
-template<typename OtherDerived>
-typename internal::plain_matrix_type_column_major<OtherDerived>::type
-SparseMatrixBase<Derived>::solveTriangular(const MatrixBase<OtherDerived>& other) const
-{
-  typename internal::plain_matrix_type_column_major<OtherDerived>::type res(other);
-  derived().solveTriangularInPlace(res);
-  return res;
-}
-#endif // EIGEN2_SUPPORT
-
 } // end namespace Eigen
 
 #endif // EIGEN_SPARSETRIANGULARSOLVER_H
diff --git a/nuparu/include/Eigen/src/SparseLU/SparseLU.h b/nuparu/include/Eigen/src/SparseLU/SparseLU.h
index dd9eab2c..d33d27f4 100644
--- a/nuparu/include/Eigen/src/SparseLU/SparseLU.h
+++ b/nuparu/include/Eigen/src/SparseLU/SparseLU.h
@@ -2,7 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
-// Copyright (C) 2012 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2012-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -14,7 +14,7 @@
 
 namespace Eigen {
 
-template <typename _MatrixType, typename _OrderingType = COLAMDOrdering<typename _MatrixType::Index> > class SparseLU;
+template <typename _MatrixType, typename _OrderingType = COLAMDOrdering<typename _MatrixType::StorageIndex> > class SparseLU;
 template <typename MappedSparseMatrixType> struct SparseLUMatrixLReturnType;
 template <typename MatrixLType, typename MatrixUType> struct SparseLUMatrixUReturnType;
 
@@ -64,33 +64,45 @@ template <typename MatrixLType, typename MatrixUType> struct SparseLUMatrixURetu
   * 
   * \tparam _MatrixType The type of the sparse matrix. It must be a column-major SparseMatrix<>
   * \tparam _OrderingType The ordering method to use, either AMD, COLAMD or METIS. Default is COLMAD
-  * 
+  *
+  * \implsparsesolverconcept
   * 
   * \sa \ref TutorialSparseDirectSolvers
   * \sa \ref OrderingMethods_Module
   */
 template <typename _MatrixType, typename _OrderingType>
-class SparseLU : public internal::SparseLUImpl<typename _MatrixType::Scalar, typename _MatrixType::Index>
+class SparseLU : public SparseSolverBase<SparseLU<_MatrixType,_OrderingType> >, public internal::SparseLUImpl<typename _MatrixType::Scalar, typename _MatrixType::StorageIndex>
 {
+  protected:
+    typedef SparseSolverBase<SparseLU<_MatrixType,_OrderingType> > APIBase;
+    using APIBase::m_isInitialized;
   public:
+    using APIBase::_solve_impl;
+    
     typedef _MatrixType MatrixType; 
     typedef _OrderingType OrderingType;
     typedef typename MatrixType::Scalar Scalar; 
     typedef typename MatrixType::RealScalar RealScalar; 
-    typedef typename MatrixType::Index Index; 
-    typedef SparseMatrix<Scalar,ColMajor,Index> NCMatrix;
-    typedef internal::MappedSuperNodalMatrix<Scalar, Index> SCMatrix; 
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef SparseMatrix<Scalar,ColMajor,StorageIndex> NCMatrix;
+    typedef internal::MappedSuperNodalMatrix<Scalar, StorageIndex> SCMatrix;
     typedef Matrix<Scalar,Dynamic,1> ScalarVector;
-    typedef Matrix<Index,Dynamic,1> IndexVector;
-    typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType;
-    typedef internal::SparseLUImpl<Scalar, Index> Base;
+    typedef Matrix<StorageIndex,Dynamic,1> IndexVector;
+    typedef PermutationMatrix<Dynamic, Dynamic, StorageIndex> PermutationType;
+    typedef internal::SparseLUImpl<Scalar, StorageIndex> Base;
+
+    enum {
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
     
   public:
-    SparseLU():m_isInitialized(true),m_lastError(""),m_Ustore(0,0,0,0,0,0),m_symmetricmode(false),m_diagpivotthresh(1.0),m_detPermR(1)
+    SparseLU():m_lastError(""),m_Ustore(0,0,0,0,0,0),m_symmetricmode(false),m_diagpivotthresh(1.0),m_detPermR(1)
     {
       initperfvalues(); 
     }
-    SparseLU(const MatrixType& matrix):m_isInitialized(true),m_lastError(""),m_Ustore(0,0,0,0,0,0),m_symmetricmode(false),m_diagpivotthresh(1.0),m_detPermR(1)
+    explicit SparseLU(const MatrixType& matrix)
+      : m_lastError(""),m_Ustore(0,0,0,0,0,0),m_symmetricmode(false),m_diagpivotthresh(1.0),m_detPermR(1)
     {
       initperfvalues(); 
       compute(matrix);
@@ -141,9 +153,9 @@ class SparseLU : public internal::SparseLUImpl<typename _MatrixType::Scalar, typ
       * y = b; matrixU().solveInPlace(y);
       * \endcode
       */
-    SparseLUMatrixUReturnType<SCMatrix,MappedSparseMatrix<Scalar,ColMajor,Index> > matrixU() const
+    SparseLUMatrixUReturnType<SCMatrix,MappedSparseMatrix<Scalar,ColMajor,StorageIndex> > matrixU() const
     {
-      return SparseLUMatrixUReturnType<SCMatrix, MappedSparseMatrix<Scalar,ColMajor,Index> >(m_Lstore, m_Ustore);
+      return SparseLUMatrixUReturnType<SCMatrix, MappedSparseMatrix<Scalar,ColMajor,StorageIndex> >(m_Lstore, m_Ustore);
     }
 
     /**
@@ -168,6 +180,7 @@ class SparseLU : public internal::SparseLUImpl<typename _MatrixType::Scalar, typ
       m_diagpivotthresh = thresh; 
     }
 
+#ifdef EIGEN_PARSED_BY_DOXYGEN
     /** \returns the solution X of \f$ A X = B \f$ using the current decomposition of A.
       *
       * \warning the destination matrix X in X = this->solve(B) must be colmun-major.
@@ -175,26 +188,8 @@ class SparseLU : public internal::SparseLUImpl<typename _MatrixType::Scalar, typ
       * \sa compute()
       */
     template<typename Rhs>
-    inline const internal::solve_retval<SparseLU, Rhs> solve(const MatrixBase<Rhs>& B) const 
-    {
-      eigen_assert(m_factorizationIsOk && "SparseLU is not initialized."); 
-      eigen_assert(rows()==B.rows()
-                    && "SparseLU::solve(): invalid number of rows of the right hand side matrix B");
-          return internal::solve_retval<SparseLU, Rhs>(*this, B.derived());
-    }
-
-    /** \returns the solution X of \f$ A X = B \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::sparse_solve_retval<SparseLU, Rhs> solve(const SparseMatrixBase<Rhs>& B) const 
-    {
-      eigen_assert(m_factorizationIsOk && "SparseLU is not initialized."); 
-      eigen_assert(rows()==B.rows()
-                    && "SparseLU::solve(): invalid number of rows of the right hand side matrix B");
-          return internal::sparse_solve_retval<SparseLU, Rhs>(*this, B.derived());
-    }
+    inline const Solve<SparseLU, Rhs> solve(const MatrixBase<Rhs>& B) const;
+#endif // EIGEN_PARSED_BY_DOXYGEN
     
     /** \brief Reports whether previous computation was successful.
       *
@@ -219,9 +214,9 @@ class SparseLU : public internal::SparseLUImpl<typename _MatrixType::Scalar, typ
     }
 
     template<typename Rhs, typename Dest>
-    bool _solve(const MatrixBase<Rhs> &B, MatrixBase<Dest> &_X) const
+    bool _solve_impl(const MatrixBase<Rhs> &B, MatrixBase<Dest> &X_base) const
     {
-      Dest& X(_X.derived());
+      Dest& X(X_base.derived());
       eigen_assert(m_factorizationIsOk && "The matrix should be factorized first");
       EIGEN_STATIC_ASSERT((Dest::Flags&RowMajorBit)==0,
                         THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
@@ -229,8 +224,10 @@ class SparseLU : public internal::SparseLUImpl<typename _MatrixType::Scalar, typ
       // Permute the right hand side to form X = Pr*B
       // on return, X is overwritten by the computed solution
       X.resize(B.rows(),B.cols());
+
+      // this ugly const_cast_derived() helps to detect aliasing when applying the permutations
       for(Index j = 0; j < B.cols(); ++j)
-        X.col(j) = rowsPermutation() * B.col(j);
+        X.col(j) = rowsPermutation() * B.const_cast_derived().col(j);
       
       //Forward substitution with L
       this->matrixL().solveInPlace(X);
@@ -253,70 +250,116 @@ class SparseLU : public internal::SparseLUImpl<typename _MatrixType::Scalar, typ
       *
       * \sa logAbsDeterminant(), signDeterminant()
       */
-     Scalar absDeterminant()
+    Scalar absDeterminant()
     {
+      using std::abs;
       eigen_assert(m_factorizationIsOk && "The matrix should be factorized first.");
       // Initialize with the determinant of the row matrix
       Scalar det = Scalar(1.);
-      //Note that the diagonal blocks of U are stored in supernodes,
+      // Note that the diagonal blocks of U are stored in supernodes,
       // which are available in the  L part :)
       for (Index j = 0; j < this->cols(); ++j)
+      {
+        for (typename SCMatrix::InnerIterator it(m_Lstore, j); it; ++it)
+        {
+          if(it.index() == j)
+          {
+            det *= abs(it.value());
+            break;
+          }
+        }
+      }
+      return det;
+    }
+
+    /** \returns the natural log of the absolute value of the determinant of the matrix
+      * of which *this is the LU decomposition
+      *
+      * \note This method is useful to work around the risk of overflow/underflow that's
+      * inherent to the determinant computation.
+      *
+      * \sa absDeterminant(), signDeterminant()
+      */
+    Scalar logAbsDeterminant() const
+    {
+      using std::log;
+      using std::abs;
+
+      eigen_assert(m_factorizationIsOk && "The matrix should be factorized first.");
+      Scalar det = Scalar(0.);
+      for (Index j = 0; j < this->cols(); ++j)
       {
         for (typename SCMatrix::InnerIterator it(m_Lstore, j); it; ++it)
         {
           if(it.row() < j) continue;
           if(it.row() == j)
           {
-            det *= (std::abs)(it.value());
+            det += log(abs(it.value()));
             break;
           }
         }
-       }
-       return det;
-     }
-
-     /** \returns the natural log of the absolute value of the determinant of the matrix
-       * of which **this is the QR decomposition
-       *
-       * \note This method is useful to work around the risk of overflow/underflow that's
-       * inherent to the determinant computation.
-       *
-       * \sa absDeterminant(), signDeterminant()
-       */
-     Scalar logAbsDeterminant() const
-     {
-       eigen_assert(m_factorizationIsOk && "The matrix should be factorized first.");
-       Scalar det = Scalar(0.);
-       for (Index j = 0; j < this->cols(); ++j)
-       {
-         for (typename SCMatrix::InnerIterator it(m_Lstore, j); it; ++it)
-         {
-           if(it.row() < j) continue;
-           if(it.row() == j)
-           {
-             det += (std::log)((std::abs)(it.value()));
-             break;
-           }
-         }
-       }
-       return det;
-     }
+      }
+      return det;
+    }
 
-     /** \returns A number representing the sign of the determinant
-       *
-       * \sa absDeterminant(), logAbsDeterminant()
-       */
-     Scalar signDeterminant()
-     {
-       eigen_assert(m_factorizationIsOk && "The matrix should be factorized first.");
-       return Scalar(m_detPermR);
-     }
+    /** \returns A number representing the sign of the determinant
+      *
+      * \sa absDeterminant(), logAbsDeterminant()
+      */
+    Scalar signDeterminant()
+    {
+      eigen_assert(m_factorizationIsOk && "The matrix should be factorized first.");
+      // Initialize with the determinant of the row matrix
+      Index det = 1;
+      // Note that the diagonal blocks of U are stored in supernodes,
+      // which are available in the  L part :)
+      for (Index j = 0; j < this->cols(); ++j)
+      {
+        for (typename SCMatrix::InnerIterator it(m_Lstore, j); it; ++it)
+        {
+          if(it.index() == j)
+          {
+            if(it.value()<0)
+              det = -det;
+            else if(it.value()==0)
+              return 0;
+            break;
+          }
+        }
+      }
+      return det * m_detPermR * m_detPermC;
+    }
+    
+    /** \returns The determinant of the matrix.
+      *
+      * \sa absDeterminant(), logAbsDeterminant()
+      */
+    Scalar determinant()
+    {
+      eigen_assert(m_factorizationIsOk && "The matrix should be factorized first.");
+      // Initialize with the determinant of the row matrix
+      Scalar det = Scalar(1.);
+      // Note that the diagonal blocks of U are stored in supernodes,
+      // which are available in the  L part :)
+      for (Index j = 0; j < this->cols(); ++j)
+      {
+        for (typename SCMatrix::InnerIterator it(m_Lstore, j); it; ++it)
+        {
+          if(it.index() == j)
+          {
+            det *= it.value();
+            break;
+          }
+        }
+      }
+      return (m_detPermR * m_detPermC) > 0 ? det : -det;
+    }
 
   protected:
     // Functions 
     void initperfvalues()
     {
-      m_perfv.panel_size = 1;
+      m_perfv.panel_size = 16;
       m_perfv.relax = 1; 
       m_perfv.maxsuper = 128; 
       m_perfv.rowblk = 16; 
@@ -326,13 +369,12 @@ class SparseLU : public internal::SparseLUImpl<typename _MatrixType::Scalar, typ
       
     // Variables 
     mutable ComputationInfo m_info;
-    bool m_isInitialized;
     bool m_factorizationIsOk;
     bool m_analysisIsOk;
     std::string m_lastError;
     NCMatrix m_mat; // The input (permuted ) matrix 
     SCMatrix m_Lstore; // The lower triangular matrix (supernodal)
-    MappedSparseMatrix<Scalar,ColMajor,Index> m_Ustore; // The upper triangular matrix
+    MappedSparseMatrix<Scalar,ColMajor,StorageIndex> m_Ustore; // The upper triangular matrix
     PermutationType m_perm_c; // Column permutation 
     PermutationType m_perm_r ; // Row permutation
     IndexVector m_etree; // Column elimination tree 
@@ -342,10 +384,10 @@ class SparseLU : public internal::SparseLUImpl<typename _MatrixType::Scalar, typ
     // SparseLU options 
     bool m_symmetricmode;
     // values for performance 
-    internal::perfvalues<Index> m_perfv; 
+    internal::perfvalues m_perfv;
     RealScalar m_diagpivotthresh; // Specifies the threshold used for a diagonal entry to be an acceptable pivot
-    Index m_nnzL, m_nnzU; // Nonzeros in L and U factors 
-    Index m_detPermR; // Determinant of the coefficient matrix
+    Index m_nnzL, m_nnzU; // Nonzeros in L and U factors
+    Index m_detPermR, m_detPermC; // Determinants of the permutation matrices
   private:
     // Disable copy constructor 
     SparseLU (const SparseLU& );
@@ -371,30 +413,32 @@ void SparseLU<MatrixType, OrderingType>::analyzePattern(const MatrixType& mat)
   
   //TODO  It is possible as in SuperLU to compute row and columns scaling vectors to equilibrate the matrix mat.
   
+  // Firstly, copy the whole input matrix. 
+  m_mat = mat;
+  
+  // Compute fill-in ordering
   OrderingType ord; 
-  ord(mat,m_perm_c);
+  ord(m_mat,m_perm_c);
   
   // Apply the permutation to the column of the input  matrix
-  //First copy the whole input matrix. 
-  m_mat = mat;
-  if (m_perm_c.size()) {
+  if (m_perm_c.size())
+  {
     m_mat.uncompress(); //NOTE: The effect of this command is only to create the InnerNonzeros pointers. FIXME : This vector is filled but not subsequently used.  
-    //Then, permute only the column pointers
-    const Index * outerIndexPtr;
-    if (mat.isCompressed()) outerIndexPtr = mat.outerIndexPtr();
-    else
-    {
-      Index *outerIndexPtr_t = new Index[mat.cols()+1];
-      for(Index i = 0; i <= mat.cols(); i++) outerIndexPtr_t[i] = m_mat.outerIndexPtr()[i];
-      outerIndexPtr = outerIndexPtr_t;
-    }
+    // Then, permute only the column pointers
+    ei_declare_aligned_stack_constructed_variable(StorageIndex,outerIndexPtr,mat.cols()+1,mat.isCompressed()?const_cast<StorageIndex*>(mat.outerIndexPtr()):0);
+    
+    // If the input matrix 'mat' is uncompressed, then the outer-indices do not match the ones of m_mat, and a copy is thus needed.
+    if(!mat.isCompressed()) 
+      IndexVector::Map(outerIndexPtr, mat.cols()+1) = IndexVector::Map(m_mat.outerIndexPtr(),mat.cols()+1);
+    
+    // Apply the permutation and compute the nnz per column.
     for (Index i = 0; i < mat.cols(); i++)
     {
       m_mat.outerIndexPtr()[m_perm_c.indices()(i)] = outerIndexPtr[i];
       m_mat.innerNonZeroPtr()[m_perm_c.indices()(i)] = outerIndexPtr[i+1] - outerIndexPtr[i];
     }
-    if(!mat.isCompressed()) delete[] outerIndexPtr;
   }
+  
   // Compute the column elimination tree of the permuted matrix 
   IndexVector firstRowElt;
   internal::coletree(m_mat, m_etree,firstRowElt); 
@@ -403,7 +447,7 @@ void SparseLU<MatrixType, OrderingType>::analyzePattern(const MatrixType& mat)
   if (!m_symmetricmode) {
     IndexVector post, iwork; 
     // Post order etree
-    internal::treePostorder(m_mat.cols(), m_etree, post); 
+    internal::treePostorder(StorageIndex(m_mat.cols()), m_etree, post); 
       
    
     // Renumber etree in postorder 
@@ -455,7 +499,9 @@ void SparseLU<MatrixType, OrderingType>::factorize(const MatrixType& matrix)
   eigen_assert(m_analysisIsOk && "analyzePattern() should be called first"); 
   eigen_assert((matrix.rows() == matrix.cols()) && "Only for squared matrices");
   
-  typedef typename IndexVector::Scalar Index; 
+  typedef typename IndexVector::Scalar StorageIndex; 
+  
+  m_isInitialized = true;
   
   
   // Apply the column permutation computed in analyzepattern()
@@ -465,11 +511,11 @@ void SparseLU<MatrixType, OrderingType>::factorize(const MatrixType& matrix)
   {
     m_mat.uncompress(); //NOTE: The effect of this command is only to create the InnerNonzeros pointers.
     //Then, permute only the column pointers
-    const Index * outerIndexPtr;
+    const StorageIndex * outerIndexPtr;
     if (matrix.isCompressed()) outerIndexPtr = matrix.outerIndexPtr();
     else
     {
-      Index* outerIndexPtr_t = new Index[matrix.cols()+1];
+      StorageIndex* outerIndexPtr_t = new StorageIndex[matrix.cols()+1];
       for(Index i = 0; i <= matrix.cols(); i++) outerIndexPtr_t[i] = m_mat.outerIndexPtr()[i];
       outerIndexPtr = outerIndexPtr_t;
     }
@@ -483,7 +529,7 @@ void SparseLU<MatrixType, OrderingType>::factorize(const MatrixType& matrix)
   else 
   { //FIXME This should not be needed if the empty permutation is handled transparently
     m_perm_c.resize(matrix.cols());
-    for(Index i = 0; i < matrix.cols(); ++i) m_perm_c.indices()(i) = i;
+    for(StorageIndex i = 0; i < matrix.cols(); ++i) m_perm_c.indices()(i) = i;
   }
   
   Index m = m_mat.rows();
@@ -621,7 +667,8 @@ void SparseLU<MatrixType, OrderingType>::factorize(const MatrixType& matrix)
       }
       
       // Update the determinant of the row permutation matrix
-      if (pivrow != jj) m_detPermR *= -1;
+      // FIXME: the following test is not correct: we should probably take iperm_c into account, and pivrow is not directly the row pivot.
+      if (pivrow != jj) m_detPermR = -m_detPermR;
 
       // Prune columns (0:jj-1) using column jj
       Base::pruneL(jj, m_perm_r.indices(), pivrow, nseg, segrep, repfnz_k, xprune, m_glu); 
@@ -636,15 +683,18 @@ void SparseLU<MatrixType, OrderingType>::factorize(const MatrixType& matrix)
     jcol += panel_size;  // Move to the next panel
   } // end for -- end elimination 
   
+  m_detPermR = m_perm_r.determinant();
+  m_detPermC = m_perm_c.determinant();
+  
   // Count the number of nonzeros in factors 
   Base::countnz(n, m_nnzL, m_nnzU, m_glu); 
   // Apply permutation  to the L subscripts 
-  Base::fixupL(n, m_perm_r.indices(), m_glu); 
+  Base::fixupL(n, m_perm_r.indices(), m_glu);
   
   // Create supernode matrix L 
   m_Lstore.setInfos(m, n, m_glu.lusup, m_glu.xlusup, m_glu.lsub, m_glu.xlsub, m_glu.supno, m_glu.xsup); 
   // Create the column major upper sparse matrix  U; 
-  new (&m_Ustore) MappedSparseMatrix<Scalar, ColMajor, Index> ( m, n, m_nnzU, m_glu.xusub.data(), m_glu.usub.data(), m_glu.ucol.data() ); 
+  new (&m_Ustore) MappedSparseMatrix<Scalar, ColMajor, StorageIndex> ( m, n, m_nnzU, m_glu.xusub.data(), m_glu.usub.data(), m_glu.ucol.data() );
   
   m_info = Success;
   m_factorizationIsOk = true;
@@ -653,9 +703,8 @@ void SparseLU<MatrixType, OrderingType>::factorize(const MatrixType& matrix)
 template<typename MappedSupernodalType>
 struct SparseLUMatrixLReturnType : internal::no_assignment_operator
 {
-  typedef typename MappedSupernodalType::Index Index;
   typedef typename MappedSupernodalType::Scalar Scalar;
-  SparseLUMatrixLReturnType(const MappedSupernodalType& mapL) : m_mapL(mapL)
+  explicit SparseLUMatrixLReturnType(const MappedSupernodalType& mapL) : m_mapL(mapL)
   { }
   Index rows() { return m_mapL.rows(); }
   Index cols() { return m_mapL.cols(); }
@@ -670,7 +719,6 @@ struct SparseLUMatrixLReturnType : internal::no_assignment_operator
 template<typename MatrixLType, typename MatrixUType>
 struct SparseLUMatrixUReturnType : internal::no_assignment_operator
 {
-  typedef typename MatrixLType::Index Index;
   typedef typename MatrixLType::Scalar Scalar;
   SparseLUMatrixUReturnType(const MatrixLType& mapL, const MatrixUType& mapU)
   : m_mapL(mapL),m_mapU(mapU)
@@ -681,7 +729,7 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator
   template<typename Dest>   void solveInPlace(MatrixBase<Dest> &X) const
   {
     Index nrhs = X.cols();
-    Index n = X.rows();
+    Index n    = X.rows();
     // Backward solve with U
     for (Index k = m_mapL.nsuper(); k >= 0; k--)
     {
@@ -699,8 +747,8 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator
       }
       else
       {
-        Map<const Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) );
-        Map< Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
+        Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) );
+        Map< Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
         U = A.template triangularView<Upper>().solve(U);
       }
 
@@ -722,35 +770,6 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator
   const MatrixUType& m_mapU;
 };
 
-namespace internal {
-  
-template<typename _MatrixType, typename Derived, typename Rhs>
-struct solve_retval<SparseLU<_MatrixType,Derived>, Rhs>
-  : solve_retval_base<SparseLU<_MatrixType,Derived>, Rhs>
-{
-  typedef SparseLU<_MatrixType,Derived> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-template<typename _MatrixType, typename Derived, typename Rhs>
-struct sparse_solve_retval<SparseLU<_MatrixType,Derived>, Rhs>
-  : sparse_solve_retval_base<SparseLU<_MatrixType,Derived>, Rhs>
-{
-  typedef SparseLU<_MatrixType,Derived> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    this->defaultEvalTo(dst);
-  }
-};
-} // end namespace internal
-
 } // End namespace Eigen 
 
 #endif
diff --git a/nuparu/include/Eigen/src/SparseLU/SparseLUImpl.h b/nuparu/include/Eigen/src/SparseLU/SparseLUImpl.h
index 14d70897..fc0cfc4d 100644
--- a/nuparu/include/Eigen/src/SparseLU/SparseLUImpl.h
+++ b/nuparu/include/Eigen/src/SparseLU/SparseLUImpl.h
@@ -16,17 +16,19 @@ namespace internal {
   * \class SparseLUImpl
   * Base class for sparseLU
   */
-template <typename Scalar, typename Index>
+template <typename Scalar, typename StorageIndex>
 class SparseLUImpl
 {
   public:
     typedef Matrix<Scalar,Dynamic,1> ScalarVector;
-    typedef Matrix<Index,Dynamic,1> IndexVector; 
+    typedef Matrix<StorageIndex,Dynamic,1> IndexVector; 
+    typedef Matrix<Scalar,Dynamic,Dynamic,ColMajor> ScalarMatrix;
+    typedef Map<ScalarMatrix, 0,  OuterStride<> > MappedMatrixBlock;
     typedef typename ScalarVector::RealScalar RealScalar; 
     typedef Ref<Matrix<Scalar,Dynamic,1> > BlockScalarVector;
-    typedef Ref<Matrix<Index,Dynamic,1> > BlockIndexVector;
+    typedef Ref<Matrix<StorageIndex,Dynamic,1> > BlockIndexVector;
     typedef LU_GlobalLU_t<IndexVector, ScalarVector> GlobalLU_t; 
-    typedef SparseMatrix<Scalar,ColMajor,Index> MatrixType; 
+    typedef SparseMatrix<Scalar,ColMajor,StorageIndex> MatrixType; 
     
   protected:
      template <typename VectorType>
@@ -40,7 +42,7 @@ class SparseLUImpl
      Index snode_bmod (const Index jcol, const Index fsupc, ScalarVector& dense, GlobalLU_t& glu);
      Index pivotL(const Index jcol, const RealScalar& diagpivotthresh, IndexVector& perm_r, IndexVector& iperm_c, Index& pivrow, GlobalLU_t& glu);
      template <typename Traits>
-     void dfs_kernel(const Index jj, IndexVector& perm_r,
+     void dfs_kernel(const StorageIndex jj, IndexVector& perm_r,
                     Index& nseg, IndexVector& panel_lsub, IndexVector& segrep,
                     Ref<IndexVector> repfnz_col, IndexVector& xprune, Ref<IndexVector> marker, IndexVector& parent,
                     IndexVector& xplore, GlobalLU_t& glu, Index& nextl_col, Index krow, Traits& traits);
diff --git a/nuparu/include/Eigen/src/SparseLU/SparseLU_Memory.h b/nuparu/include/Eigen/src/SparseLU/SparseLU_Memory.h
index a5158025..4dc42e87 100644
--- a/nuparu/include/Eigen/src/SparseLU/SparseLU_Memory.h
+++ b/nuparu/include/Eigen/src/SparseLU/SparseLU_Memory.h
@@ -36,13 +36,12 @@ namespace internal {
   
 enum { LUNoMarker = 3 };
 enum {emptyIdxLU = -1};
-template<typename Index>
 inline Index LUnumTempV(Index& m, Index& w, Index& t, Index& b)
 {
   return (std::max)(m, (t+b)*w);
 }
 
-template< typename Scalar, typename Index>
+template< typename Scalar>
 inline Index LUTempSpace(Index&m, Index& w)
 {
   return (2*w + 4 + LUNoMarker) * m * sizeof(Index) + (w + 1) * m * sizeof(Scalar);
@@ -59,9 +58,9 @@ inline Index LUTempSpace(Index&m, Index& w)
   * \param keep_prev  1: use length  and do not expand the vector; 0: compute new_len and expand
   * \param[in,out] num_expansions Number of times the memory has been expanded
   */
-template <typename Scalar, typename Index>
+template <typename Scalar, typename StorageIndex>
 template <typename VectorType>
-Index  SparseLUImpl<Scalar,Index>::expand(VectorType& vec, Index& length, Index nbElts, Index keep_prev, Index& num_expansions) 
+Index  SparseLUImpl<Scalar,StorageIndex>::expand(VectorType& vec, Index& length, Index nbElts, Index keep_prev, Index& num_expansions) 
 {
   
   float alpha = 1.5; // Ratio of the memory increase 
@@ -70,23 +69,30 @@ Index  SparseLUImpl<Scalar,Index>::expand(VectorType& vec, Index& length, Index
   if(num_expansions == 0 || keep_prev) 
     new_len = length ; // First time allocate requested
   else 
-    new_len = Index(alpha * length);
+    new_len = (std::max)(length+1,Index(alpha * length));
   
   VectorType old_vec; // Temporary vector to hold the previous values   
   if (nbElts > 0 )
     old_vec = vec.segment(0,nbElts); 
   
   //Allocate or expand the current vector
-  try 
+#ifdef EIGEN_EXCEPTIONS
+  try
+#endif
   {
     vec.resize(new_len); 
   }
+#ifdef EIGEN_EXCEPTIONS
   catch(std::bad_alloc& )
+#else
+  if(!vec.size())
+#endif
   {
-    if ( !num_expansions )
+    if (!num_expansions)
     {
       // First time to allocate from LUMemInit()
-      throw; // Pass the exception to LUMemInit() which has a try... catch block
+      // Let LUMemInit() deal with it.
+      return -1;
     }
     if (keep_prev)
     {
@@ -100,12 +106,18 @@ Index  SparseLUImpl<Scalar,Index>::expand(VectorType& vec, Index& length, Index
       do 
       {
         alpha = (alpha + 1)/2;
-        new_len = Index(alpha * length);
+        new_len = (std::max)(length+1,Index(alpha * length));
+#ifdef EIGEN_EXCEPTIONS
         try
+#endif
         {
           vec.resize(new_len); 
         }
+#ifdef EIGEN_EXCEPTIONS
         catch(std::bad_alloc& )
+#else
+        if (!vec.size())
+#endif
         {
           tries += 1; 
           if ( tries > 10) return new_len; 
@@ -135,14 +147,13 @@ Index  SparseLUImpl<Scalar,Index>::expand(VectorType& vec, Index& length, Index
  * \return an estimated size of the required memory if lwork = -1; otherwise, return the size of actually allocated memory when allocation failed, and 0 on success
  * \note Unlike SuperLU, this routine does not support successive factorization with the same pattern and the same row permutation
  */
-template <typename Scalar, typename Index>
-Index SparseLUImpl<Scalar,Index>::memInit(Index m, Index n, Index annz, Index lwork, Index fillratio, Index panel_size,  GlobalLU_t& glu)
+template <typename Scalar, typename StorageIndex>
+Index SparseLUImpl<Scalar,StorageIndex>::memInit(Index m, Index n, Index annz, Index lwork, Index fillratio, Index panel_size,  GlobalLU_t& glu)
 {
   Index& num_expansions = glu.num_expansions; //No memory expansions so far
-  num_expansions = 0; 
-  glu.nzumax = glu.nzlumax = (std::max)(fillratio * annz, m*n); // estimated number of nonzeros in U 
-  glu.nzlmax = (std::max)(Index(4), fillratio) * annz / 4; // estimated  nnz in L factor
-
+  num_expansions = 0;
+  glu.nzumax = glu.nzlumax = (std::min)(fillratio * (annz+1) / n, m) * n; // estimated number of nonzeros in U 
+  glu.nzlmax = (std::max)(Index(4), fillratio) * (annz+1) / 4; // estimated  nnz in L factor
   // Return the estimated size to the user if necessary
   Index tempSpace;
   tempSpace = (2*panel_size + 4 + LUNoMarker) * m * sizeof(Index) + (panel_size + 1) * m * sizeof(Scalar);
@@ -166,14 +177,10 @@ Index SparseLUImpl<Scalar,Index>::memInit(Index m, Index n, Index annz, Index lw
   // Reserve memory for L/U factors
   do 
   {
-    try
-    {
-      expand<ScalarVector>(glu.lusup, glu.nzlumax, 0, 0, num_expansions); 
-      expand<ScalarVector>(glu.ucol,glu.nzumax, 0, 0, num_expansions); 
-      expand<IndexVector>(glu.lsub,glu.nzlmax, 0, 0, num_expansions); 
-      expand<IndexVector>(glu.usub,glu.nzumax, 0, 1, num_expansions); 
-    }
-    catch(std::bad_alloc& )
+    if(     (expand<ScalarVector>(glu.lusup, glu.nzlumax, 0, 0, num_expansions)<0)
+        ||  (expand<ScalarVector>(glu.ucol,  glu.nzumax,  0, 0, num_expansions)<0)
+        ||  (expand<IndexVector> (glu.lsub,  glu.nzlmax,  0, 0, num_expansions)<0)
+        ||  (expand<IndexVector> (glu.usub,  glu.nzumax,  0, 1, num_expansions)<0) )
     {
       //Reduce the estimated size and retry
       glu.nzlumax /= 2;
@@ -181,10 +188,7 @@ Index SparseLUImpl<Scalar,Index>::memInit(Index m, Index n, Index annz, Index lw
       glu.nzlmax /= 2;
       if (glu.nzlumax < annz ) return glu.nzlumax; 
     }
-    
   } while (!glu.lusup.size() || !glu.ucol.size() || !glu.lsub.size() || !glu.usub.size());
-
-  
   
   ++num_expansions;
   return 0;
@@ -200,9 +204,9 @@ Index SparseLUImpl<Scalar,Index>::memInit(Index m, Index n, Index annz, Index lw
  * \param num_expansions Number of expansions 
  * \return 0 on success, > 0 size of the memory allocated so far
  */
-template <typename Scalar, typename Index>
+template <typename Scalar, typename StorageIndex>
 template <typename VectorType>
-Index SparseLUImpl<Scalar,Index>::memXpand(VectorType& vec, Index& maxlen, Index nbElts, MemType memtype, Index& num_expansions)
+Index SparseLUImpl<Scalar,StorageIndex>::memXpand(VectorType& vec, Index& maxlen, Index nbElts, MemType memtype, Index& num_expansions)
 {
   Index failed_size; 
   if (memtype == USUB)
diff --git a/nuparu/include/Eigen/src/SparseLU/SparseLU_Structs.h b/nuparu/include/Eigen/src/SparseLU/SparseLU_Structs.h
index 24d6bf17..cf5ec449 100644
--- a/nuparu/include/Eigen/src/SparseLU/SparseLU_Structs.h
+++ b/nuparu/include/Eigen/src/SparseLU/SparseLU_Structs.h
@@ -75,7 +75,7 @@ typedef enum {LUSUP, UCOL, LSUB, USUB, LLVL, ULVL} MemType;
 
 template <typename IndexVector, typename ScalarVector>
 struct LU_GlobalLU_t {
-  typedef typename IndexVector::Scalar Index; 
+  typedef typename IndexVector::Scalar StorageIndex; 
   IndexVector xsup; //First supernode column ... xsup(s) points to the beginning of the s-th supernode
   IndexVector supno; // Supernode number corresponding to this column (column to supernode mapping)
   ScalarVector  lusup; // nonzero values of L ordered by columns 
@@ -93,7 +93,6 @@ struct LU_GlobalLU_t {
 };
 
 // Values to set for performance
-template <typename Index>
 struct perfvalues {
   Index panel_size; // a panel consists of at most <panel_size> consecutive columns
   Index relax; // To control degree of relaxing supernodes. If the number of nodes (columns) 
diff --git a/nuparu/include/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h b/nuparu/include/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
index ad6f2183..f0856db8 100644
--- a/nuparu/include/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
+++ b/nuparu/include/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
@@ -29,20 +29,20 @@ namespace internal {
  * SuperInnerIterator to iterate through all supernodes 
  * Function for triangular solve
  */
-template <typename _Scalar, typename _Index>
+template <typename _Scalar, typename _StorageIndex>
 class MappedSuperNodalMatrix
 {
   public:
     typedef _Scalar Scalar; 
-    typedef _Index Index;
-    typedef Matrix<Index,Dynamic,1> IndexVector; 
+    typedef _StorageIndex StorageIndex;
+    typedef Matrix<StorageIndex,Dynamic,1> IndexVector;
     typedef Matrix<Scalar,Dynamic,1> ScalarVector;
   public:
     MappedSuperNodalMatrix()
     {
       
     }
-    MappedSuperNodalMatrix(Index m, Index n,  ScalarVector& nzval, IndexVector& nzval_colptr, IndexVector& rowind, 
+    MappedSuperNodalMatrix(Index m, Index n,  ScalarVector& nzval, IndexVector& nzval_colptr, IndexVector& rowind,
              IndexVector& rowind_colptr, IndexVector& col_to_sup, IndexVector& sup_to_col )
     {
       setInfos(m, n, nzval, nzval_colptr, rowind, rowind_colptr, col_to_sup, sup_to_col);
@@ -58,7 +58,7 @@ class MappedSuperNodalMatrix
      * FIXME This class will be modified such that it can be use in the course 
      * of the factorization.
      */
-    void setInfos(Index m, Index n, ScalarVector& nzval, IndexVector& nzval_colptr, IndexVector& rowind, 
+    void setInfos(Index m, Index n, ScalarVector& nzval, IndexVector& nzval_colptr, IndexVector& rowind,
              IndexVector& rowind_colptr, IndexVector& col_to_sup, IndexVector& sup_to_col )
     {
       m_row = m;
@@ -96,12 +96,12 @@ class MappedSuperNodalMatrix
     /**
      * Return the pointers to the beginning of each column in \ref valuePtr()
      */
-    Index* colIndexPtr()
+    StorageIndex* colIndexPtr()
     {
       return m_nzval_colptr; 
     }
     
-    const Index* colIndexPtr() const
+    const StorageIndex* colIndexPtr() const
     {
       return m_nzval_colptr; 
     }
@@ -109,9 +109,9 @@ class MappedSuperNodalMatrix
     /**
      * Return the array of compressed row indices of all supernodes
      */
-    Index* rowIndex()  { return m_rowind; }
+    StorageIndex* rowIndex()  { return m_rowind; }
     
-    const Index* rowIndex() const
+    const StorageIndex* rowIndex() const
     {
       return m_rowind; 
     }
@@ -119,9 +119,9 @@ class MappedSuperNodalMatrix
     /**
      * Return the location in \em rowvaluePtr() which starts each column
      */
-    Index* rowIndexPtr() { return m_rowind_colptr; }
+    StorageIndex* rowIndexPtr() { return m_rowind_colptr; }
     
-    const Index* rowIndexPtr() const 
+    const StorageIndex* rowIndexPtr() const
     {
       return m_rowind_colptr; 
     }
@@ -129,18 +129,18 @@ class MappedSuperNodalMatrix
     /** 
      * Return the array of column-to-supernode mapping 
      */
-    Index* colToSup()  { return m_col_to_sup; }
+    StorageIndex* colToSup()  { return m_col_to_sup; }
     
-    const Index* colToSup() const
+    const StorageIndex* colToSup() const
     {
       return m_col_to_sup;       
     }
     /**
      * Return the array of supernode-to-column mapping
      */
-    Index* supToCol() { return m_sup_to_col; }
+    StorageIndex* supToCol() { return m_sup_to_col; }
     
-    const Index* supToCol() const 
+    const StorageIndex* supToCol() const
     {
       return m_sup_to_col;
     }
@@ -148,7 +148,7 @@ class MappedSuperNodalMatrix
     /**
      * Return the number of supernodes
      */
-    Index nsuper() const 
+    Index nsuper() const
     {
       return m_nsuper; 
     }
@@ -162,14 +162,14 @@ class MappedSuperNodalMatrix
     
   protected:
     Index m_row; // Number of rows
-    Index m_col; // Number of columns 
-    Index m_nsuper; // Number of supernodes 
+    Index m_col; // Number of columns
+    Index m_nsuper; // Number of supernodes
     Scalar* m_nzval; //array of nonzero values packed by column
-    Index* m_nzval_colptr; //nzval_colptr[j] Stores the location in nzval[] which starts column j 
-    Index* m_rowind; // Array of compressed row indices of rectangular supernodes
-    Index* m_rowind_colptr; //rowind_colptr[j] stores the location in rowind[] which starts column j
-    Index* m_col_to_sup; // col_to_sup[j] is the supernode number to which column j belongs
-    Index* m_sup_to_col; //sup_to_col[s] points to the starting column of the s-th supernode
+    StorageIndex* m_nzval_colptr; //nzval_colptr[j] Stores the location in nzval[] which starts column j
+    StorageIndex* m_rowind; // Array of compressed row indices of rectangular supernodes
+    StorageIndex* m_rowind_colptr; //rowind_colptr[j] stores the location in rowind[] which starts column j
+    StorageIndex* m_col_to_sup; // col_to_sup[j] is the supernode number to which column j belongs
+    StorageIndex* m_sup_to_col; //sup_to_col[s] points to the starting column of the s-th supernode
     
   private :
 };
@@ -178,19 +178,19 @@ class MappedSuperNodalMatrix
   * \brief InnerIterator class to iterate over nonzero values of the current column in the supernodal matrix L
   * 
   */
-template<typename Scalar, typename Index>
-class MappedSuperNodalMatrix<Scalar,Index>::InnerIterator
+template<typename Scalar, typename StorageIndex>
+class MappedSuperNodalMatrix<Scalar,StorageIndex>::InnerIterator
 {
   public:
      InnerIterator(const MappedSuperNodalMatrix& mat, Index outer)
       : m_matrix(mat),
-        m_outer(outer), 
+        m_outer(outer),
         m_supno(mat.colToSup()[outer]),
         m_idval(mat.colIndexPtr()[outer]),
         m_startidval(m_idval),
         m_endidval(mat.colIndexPtr()[outer+1]),
-        m_idrow(mat.rowIndexPtr()[outer]),
-        m_endidrow(mat.rowIndexPtr()[outer+1])
+        m_idrow(mat.rowIndexPtr()[mat.supToCol()[mat.colToSup()[outer]]]),
+        m_endidrow(mat.rowIndexPtr()[mat.supToCol()[mat.colToSup()[outer]]+1])
     {}
     inline InnerIterator& operator++()
     { 
@@ -229,14 +229,17 @@ class MappedSuperNodalMatrix<Scalar,Index>::InnerIterator
  * \brief Solve with the supernode triangular matrix
  * 
  */
-template<typename Scalar, typename Index>
+template<typename Scalar, typename Index_>
 template<typename Dest>
-void MappedSuperNodalMatrix<Scalar,Index>::solveInPlace( MatrixBase<Dest>&X) const
+void MappedSuperNodalMatrix<Scalar,Index_>::solveInPlace( MatrixBase<Dest>&X) const
 {
-    Index n = X.rows(); 
-    Index nrhs = X.cols(); 
+    /* Explicit type conversion as the Index type of MatrixBase<Dest> may be wider than Index */
+//    eigen_assert(X.rows() <= NumTraits<Index>::highest());
+//    eigen_assert(X.cols() <= NumTraits<Index>::highest());
+    Index n    = int(X.rows());
+    Index nrhs = Index(X.cols());
     const Scalar * Lval = valuePtr();                 // Nonzero values 
-    Matrix<Scalar,Dynamic,Dynamic> work(n, nrhs);     // working vector
+    Matrix<Scalar,Dynamic,Dynamic, ColMajor> work(n, nrhs);     // working vector
     work.setZero();
     for (Index k = 0; k <= nsuper(); k ++)
     {
@@ -267,12 +270,12 @@ void MappedSuperNodalMatrix<Scalar,Index>::solveInPlace( MatrixBase<Dest>&X) con
         Index lda = colIndexPtr()[fsupc+1] - luptr;
         
         // Triangular solve 
-        Map<const Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > A( &(Lval[luptr]), nsupc, nsupc, OuterStride<>(lda) ); 
-        Map< Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); 
+        Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > A( &(Lval[luptr]), nsupc, nsupc, OuterStride<>(lda) );
+        Map< Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); 
         U = A.template triangularView<UnitLower>().solve(U); 
         
         // Matrix-vector product 
-        new (&A) Map<const Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > ( &(Lval[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) ); 
+        new (&A) Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > ( &(Lval[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) );
         work.block(0, 0, nrow, nrhs) = A * U; 
         
         //Begin Scatter 
diff --git a/nuparu/include/Eigen/src/SparseLU/SparseLU_Utils.h b/nuparu/include/Eigen/src/SparseLU/SparseLU_Utils.h
index 15352ac3..9e3dab44 100644
--- a/nuparu/include/Eigen/src/SparseLU/SparseLU_Utils.h
+++ b/nuparu/include/Eigen/src/SparseLU/SparseLU_Utils.h
@@ -17,8 +17,8 @@ namespace internal {
 /**
  * \brief Count Nonzero elements in the factors
  */
-template <typename Scalar, typename Index>
-void SparseLUImpl<Scalar,Index>::countnz(const Index n, Index& nnzL, Index& nnzU, GlobalLU_t& glu)
+template <typename Scalar, typename StorageIndex>
+void SparseLUImpl<Scalar,StorageIndex>::countnz(const Index n, Index& nnzL, Index& nnzU, GlobalLU_t& glu)
 {
  nnzL = 0; 
  nnzU = (glu.xusub)(n); 
@@ -48,12 +48,12 @@ void SparseLUImpl<Scalar,Index>::countnz(const Index n, Index& nnzL, Index& nnzU
  * and applies permutation to the remaining subscripts
  * 
  */
-template <typename Scalar, typename Index>
-void SparseLUImpl<Scalar,Index>::fixupL(const Index n, const IndexVector& perm_r, GlobalLU_t& glu)
+template <typename Scalar, typename StorageIndex>
+void SparseLUImpl<Scalar,StorageIndex>::fixupL(const Index n, const IndexVector& perm_r, GlobalLU_t& glu)
 {
   Index fsupc, i, j, k, jstart; 
   
-  Index nextl = 0; 
+  StorageIndex nextl = 0; 
   Index nsuper = (glu.supno)(n); 
   
   // For each supernode 
diff --git a/nuparu/include/Eigen/src/SparseLU/SparseLU_column_bmod.h b/nuparu/include/Eigen/src/SparseLU/SparseLU_column_bmod.h
index f24bd87d..b57f0680 100644
--- a/nuparu/include/Eigen/src/SparseLU/SparseLU_column_bmod.h
+++ b/nuparu/include/Eigen/src/SparseLU/SparseLU_column_bmod.h
@@ -49,8 +49,9 @@ namespace internal {
  *         > 0 - number of bytes allocated when run out of space
  * 
  */
-template <typename Scalar, typename Index>
-Index SparseLUImpl<Scalar,Index>::column_bmod(const Index jcol, const Index nseg, BlockScalarVector dense, ScalarVector& tempv, BlockIndexVector segrep, BlockIndexVector repfnz, Index fpanelc, GlobalLU_t& glu)
+template <typename Scalar, typename StorageIndex>
+Index SparseLUImpl<Scalar,StorageIndex>::column_bmod(const Index jcol, const Index nseg, BlockScalarVector dense, ScalarVector& tempv,
+                                                     BlockIndexVector segrep, BlockIndexVector repfnz, Index fpanelc, GlobalLU_t& glu)
 {
   Index  jsupno, k, ksub, krep, ksupno; 
   Index lptr, nrow, isub, irow, nextlu, new_next, ufirst; 
@@ -137,7 +138,7 @@ Index SparseLUImpl<Scalar,Index>::column_bmod(const Index jcol, const Index nseg
     glu.lusup.segment(nextlu,offset).setZero();
     nextlu += offset;
   }
-  glu.xlusup(jcol + 1) = nextlu;  // close L\U(*,jcol); 
+  glu.xlusup(jcol + 1) = StorageIndex(nextlu);  // close L\U(*,jcol); 
   
   /* For more updates within the panel (also within the current supernode),
    * should start from the first column of the panel, or the first column
@@ -162,11 +163,11 @@ Index SparseLUImpl<Scalar,Index>::column_bmod(const Index jcol, const Index nseg
     // points to the beginning of jcol in snode L\U(jsupno) 
     ufirst = glu.xlusup(jcol) + d_fsupc; 
     Index lda = glu.xlusup(jcol+1) - glu.xlusup(jcol);
-    Map<Matrix<Scalar,Dynamic,Dynamic>, 0,  OuterStride<> > A( &(glu.lusup.data()[luptr]), nsupc, nsupc, OuterStride<>(lda) ); 
+    MappedMatrixBlock A( &(glu.lusup.data()[luptr]), nsupc, nsupc, OuterStride<>(lda) );
     VectorBlock<ScalarVector> u(glu.lusup, ufirst, nsupc); 
     u = A.template triangularView<UnitLower>().solve(u); 
     
-    new (&A) Map<Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > ( &(glu.lusup.data()[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) ); 
+    new (&A) MappedMatrixBlock ( &(glu.lusup.data()[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) );
     VectorBlock<ScalarVector> l(glu.lusup, ufirst+nsupc, nrow); 
     l.noalias() -= A * u;
     
diff --git a/nuparu/include/Eigen/src/SparseLU/SparseLU_column_dfs.h b/nuparu/include/Eigen/src/SparseLU/SparseLU_column_dfs.h
index 4c04b0e4..c98b30e3 100644
--- a/nuparu/include/Eigen/src/SparseLU/SparseLU_column_dfs.h
+++ b/nuparu/include/Eigen/src/SparseLU/SparseLU_column_dfs.h
@@ -30,7 +30,7 @@
 #ifndef SPARSELU_COLUMN_DFS_H
 #define SPARSELU_COLUMN_DFS_H
 
-template <typename Scalar, typename Index> class SparseLUImpl;
+template <typename Scalar, typename StorageIndex> class SparseLUImpl;
 namespace Eigen {
 
 namespace internal {
@@ -39,8 +39,8 @@ template<typename IndexVector, typename ScalarVector>
 struct column_dfs_traits : no_assignment_operator
 {
   typedef typename ScalarVector::Scalar Scalar;
-  typedef typename IndexVector::Scalar Index;
-  column_dfs_traits(Index jcol, Index& jsuper, typename SparseLUImpl<Scalar, Index>::GlobalLU_t& glu, SparseLUImpl<Scalar, Index>& luImpl)
+  typedef typename IndexVector::Scalar StorageIndex;
+  column_dfs_traits(Index jcol, Index& jsuper, typename SparseLUImpl<Scalar, StorageIndex>::GlobalLU_t& glu, SparseLUImpl<Scalar, StorageIndex>& luImpl)
    : m_jcol(jcol), m_jsuper_ref(jsuper), m_glu(glu), m_luImpl(luImpl)
  {}
   bool update_segrep(Index /*krep*/, Index /*jj*/)
@@ -57,8 +57,8 @@ struct column_dfs_traits : no_assignment_operator
   
   Index m_jcol;
   Index& m_jsuper_ref;
-  typename SparseLUImpl<Scalar, Index>::GlobalLU_t& m_glu;
-  SparseLUImpl<Scalar, Index>& m_luImpl;
+  typename SparseLUImpl<Scalar, StorageIndex>::GlobalLU_t& m_glu;
+  SparseLUImpl<Scalar, StorageIndex>& m_luImpl;
 };
 
 
@@ -89,8 +89,10 @@ struct column_dfs_traits : no_assignment_operator
  *         > 0 number of bytes allocated when run out of space
  * 
  */
-template <typename Scalar, typename Index>
-Index SparseLUImpl<Scalar,Index>::column_dfs(const Index m, const Index jcol, IndexVector& perm_r, Index maxsuper, Index& nseg,  BlockIndexVector lsub_col, IndexVector& segrep, BlockIndexVector repfnz, IndexVector& xprune, IndexVector& marker, IndexVector& parent, IndexVector& xplore, GlobalLU_t& glu)
+template <typename Scalar, typename StorageIndex>
+Index SparseLUImpl<Scalar,StorageIndex>::column_dfs(const Index m, const Index jcol, IndexVector& perm_r, Index maxsuper, Index& nseg,
+                                                    BlockIndexVector lsub_col, IndexVector& segrep, BlockIndexVector repfnz, IndexVector& xprune,
+                                                    IndexVector& marker, IndexVector& parent, IndexVector& xplore, GlobalLU_t& glu)
 {
   
   Index jsuper = glu.supno(jcol); 
@@ -110,13 +112,13 @@ Index SparseLUImpl<Scalar,Index>::column_dfs(const Index m, const Index jcol, In
     // krow was visited before, go to the next nonz; 
     if (kmark == jcol) continue;
     
-    dfs_kernel(jcol, perm_r, nseg, glu.lsub, segrep, repfnz, xprune, marker2, parent,
+    dfs_kernel(StorageIndex(jcol), perm_r, nseg, glu.lsub, segrep, repfnz, xprune, marker2, parent,
                    xplore, glu, nextl, krow, traits);
   } // for each nonzero ... 
   
-  Index fsupc, jptr, jm1ptr, ito, ifrom, istop;
-  Index nsuper = glu.supno(jcol);
-  Index jcolp1 = jcol + 1;
+  Index fsupc;
+  StorageIndex nsuper = glu.supno(jcol);
+  StorageIndex jcolp1 = StorageIndex(jcol) + 1;
   Index jcolm1 = jcol - 1;
   
   // check to see if j belongs in the same supernode as j-1
@@ -127,8 +129,8 @@ Index SparseLUImpl<Scalar,Index>::column_dfs(const Index m, const Index jcol, In
   else 
   {
     fsupc = glu.xsup(nsuper); 
-    jptr = glu.xlsub(jcol); // Not yet compressed
-    jm1ptr = glu.xlsub(jcolm1); 
+    StorageIndex jptr = glu.xlsub(jcol); // Not yet compressed
+    StorageIndex jm1ptr = glu.xlsub(jcolm1); 
     
     // Use supernodes of type T2 : see SuperLU paper
     if ( (nextl-jptr != jptr-jm1ptr-1) ) jsuper = emptyIdxLU;
@@ -146,13 +148,13 @@ Index SparseLUImpl<Scalar,Index>::column_dfs(const Index m, const Index jcol, In
     { // starts a new supernode 
       if ( (fsupc < jcolm1-1) ) 
       { // >= 3 columns in nsuper
-        ito = glu.xlsub(fsupc+1);
+        StorageIndex ito = glu.xlsub(fsupc+1);
         glu.xlsub(jcolm1) = ito; 
-        istop = ito + jptr - jm1ptr; 
+        StorageIndex istop = ito + jptr - jm1ptr; 
         xprune(jcolm1) = istop; // intialize xprune(jcol-1)
         glu.xlsub(jcol) = istop; 
         
-        for (ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito)
+        for (StorageIndex ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito)
           glu.lsub(ito) = glu.lsub(ifrom); 
         nextl = ito;  // = istop + length(jcol)
       }
@@ -164,8 +166,8 @@ Index SparseLUImpl<Scalar,Index>::column_dfs(const Index m, const Index jcol, In
   // Tidy up the pointers before exit
   glu.xsup(nsuper+1) = jcolp1; 
   glu.supno(jcolp1) = nsuper; 
-  xprune(jcol) = nextl;  // Intialize upper bound for pruning
-  glu.xlsub(jcolp1) = nextl; 
+  xprune(jcol) = StorageIndex(nextl);  // Initialize upper bound for pruning
+  glu.xlsub(jcolp1) = StorageIndex(nextl); 
   
   return 0; 
 }
diff --git a/nuparu/include/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h b/nuparu/include/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h
index 170610d9..c32d8d8b 100644
--- a/nuparu/include/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h
+++ b/nuparu/include/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h
@@ -46,8 +46,9 @@ namespace internal {
  *         > 0 - number of bytes allocated when run out of space
  * 
  */
-template <typename Scalar, typename Index>
-Index SparseLUImpl<Scalar,Index>::copy_to_ucol(const Index jcol, const Index nseg, IndexVector& segrep, BlockIndexVector repfnz ,IndexVector& perm_r, BlockScalarVector dense, GlobalLU_t& glu)
+template <typename Scalar, typename StorageIndex>
+Index SparseLUImpl<Scalar,StorageIndex>::copy_to_ucol(const Index jcol, const Index nseg, IndexVector& segrep,
+                                                      BlockIndexVector repfnz ,IndexVector& perm_r, BlockScalarVector dense, GlobalLU_t& glu)
 {  
   Index ksub, krep, ksupno; 
     
@@ -55,7 +56,7 @@ Index SparseLUImpl<Scalar,Index>::copy_to_ucol(const Index jcol, const Index nse
   
   // For each nonzero supernode segment of U[*,j] in topological order 
   Index k = nseg - 1, i; 
-  Index nextu = glu.xusub(jcol); 
+  StorageIndex nextu = glu.xusub(jcol); 
   Index kfnz, isub, segsize; 
   Index new_next,irow; 
   Index fsupc, mem; 
diff --git a/nuparu/include/Eigen/src/SparseLU/SparseLU_gemm_kernel.h b/nuparu/include/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
index 9e4e3e72..ae3685ac 100644
--- a/nuparu/include/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
+++ b/nuparu/include/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
@@ -21,7 +21,7 @@ namespace internal {
   *  - lda and ldc must be multiples of the respective packet size
   *  - C must have the same alignment as A
   */
-template<typename Scalar,typename Index>
+template<typename Scalar>
 EIGEN_DONT_INLINE
 void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const Scalar* B, Index ldb, Scalar* C, Index ldc)
 {
@@ -39,9 +39,9 @@ void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const
   };
   Index d_end = (d/RK)*RK;    // number of columns of A (rows of B) suitable for full register blocking
   Index n_end = (n/RN)*RN;    // number of columns of B-C suitable for processing RN columns at once
-  Index i0 = internal::first_aligned(A,m);
+  Index i0 = internal::first_default_aligned(A,m);
   
-  eigen_internal_assert(((lda%PacketSize)==0) && ((ldc%PacketSize)==0) && (i0==internal::first_aligned(C,m)));
+  eigen_internal_assert(((lda%PacketSize)==0) && ((ldc%PacketSize)==0) && (i0==internal::first_default_aligned(C,m)));
   
   // handle the non aligned rows of A and C without any optimization:
   for(Index i=0; i<i0; ++i)
@@ -165,7 +165,7 @@ void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const
         Bc1 += RK;
       } // peeled loop on k
     } // peeled loop on the columns j
-    // process the last column (we now perform a matrux-vector product)
+    // process the last column (we now perform a matrix-vector product)
     if((n-n_end)>0)
     {
       const Scalar* Bc0 = B+(n-1)*ldb;
diff --git a/nuparu/include/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h b/nuparu/include/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h
index 7a4e4305..6f75d500 100644
--- a/nuparu/include/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h
+++ b/nuparu/include/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h
@@ -42,21 +42,20 @@ namespace internal {
  * \param descendants Number of descendants of each node in the etree
  * \param relax_end last column in a supernode
  */
-template <typename Scalar, typename Index>
-void SparseLUImpl<Scalar,Index>::heap_relax_snode (const Index n, IndexVector& et, const Index relax_columns, IndexVector& descendants, IndexVector& relax_end)
+template <typename Scalar, typename StorageIndex>
+void SparseLUImpl<Scalar,StorageIndex>::heap_relax_snode (const Index n, IndexVector& et, const Index relax_columns, IndexVector& descendants, IndexVector& relax_end)
 {
   
   // The etree may not be postordered, but its heap ordered  
   IndexVector post;
-  internal::treePostorder(n, et, post); // Post order etree
+  internal::treePostorder(StorageIndex(n), et, post); // Post order etree
   IndexVector inv_post(n+1); 
-  Index i;
-  for (i = 0; i < n+1; ++i) inv_post(post(i)) = i; // inv_post = post.inverse()???
+  for (StorageIndex i = 0; i < n+1; ++i) inv_post(post(i)) = i; // inv_post = post.inverse()???
   
   // Renumber etree in postorder 
   IndexVector iwork(n);
   IndexVector et_save(n+1);
-  for (i = 0; i < n; ++i)
+  for (Index i = 0; i < n; ++i)
   {
     iwork(post(i)) = post(et(i));
   }
@@ -75,10 +74,10 @@ void SparseLUImpl<Scalar,Index>::heap_relax_snode (const Index n, IndexVector& e
   }
   // Identify the relaxed supernodes by postorder traversal of the etree
   Index snode_start; // beginning of a snode 
-  Index k;
+  StorageIndex k;
   Index nsuper_et_post = 0; // Number of relaxed snodes in postordered etree 
   Index nsuper_et = 0; // Number of relaxed snodes in the original etree 
-  Index l; 
+  StorageIndex l; 
   for (j = 0; j < n; )
   {
     parent = et(j);
@@ -90,8 +89,8 @@ void SparseLUImpl<Scalar,Index>::heap_relax_snode (const Index n, IndexVector& e
     }
     // Found a supernode in postordered etree, j is the last column 
     ++nsuper_et_post;
-    k = n;
-    for (i = snode_start; i <= j; ++i)
+    k = StorageIndex(n);
+    for (Index i = snode_start; i <= j; ++i)
       k = (std::min)(k, inv_post(i));
     l = inv_post(j);
     if ( (l - k) == (j - snode_start) )  // Same number of columns in the snode
@@ -102,7 +101,7 @@ void SparseLUImpl<Scalar,Index>::heap_relax_snode (const Index n, IndexVector& e
     }
     else 
     {
-      for (i = snode_start; i <= j; ++i) 
+      for (Index i = snode_start; i <= j; ++i) 
       {
         l = inv_post(i);
         if (descendants(i) == 0) 
diff --git a/nuparu/include/Eigen/src/SparseLU/SparseLU_kernel_bmod.h b/nuparu/include/Eigen/src/SparseLU/SparseLU_kernel_bmod.h
index 0d0283b1..e71a13b8 100644
--- a/nuparu/include/Eigen/src/SparseLU/SparseLU_kernel_bmod.h
+++ b/nuparu/include/Eigen/src/SparseLU/SparseLU_kernel_bmod.h
@@ -30,14 +30,14 @@ namespace internal {
  */
 template <int SegSizeAtCompileTime> struct LU_kernel_bmod
 {
-  template <typename BlockScalarVector, typename ScalarVector, typename IndexVector, typename Index>
-  static EIGEN_DONT_INLINE void run(const int segsize, BlockScalarVector& dense, ScalarVector& tempv, ScalarVector& lusup, Index& luptr, const Index lda,
+  template <typename BlockScalarVector, typename ScalarVector, typename IndexVector>
+  static EIGEN_DONT_INLINE void run(const Index segsize, BlockScalarVector& dense, ScalarVector& tempv, ScalarVector& lusup, Index& luptr, const Index lda,
                                     const Index nrow, IndexVector& lsub, const Index lptr, const Index no_zeros);
 };
 
 template <int SegSizeAtCompileTime>
-template <typename BlockScalarVector, typename ScalarVector, typename IndexVector, typename Index>
-EIGEN_DONT_INLINE void LU_kernel_bmod<SegSizeAtCompileTime>::run(const int segsize, BlockScalarVector& dense, ScalarVector& tempv, ScalarVector& lusup, Index& luptr, const Index lda,
+template <typename BlockScalarVector, typename ScalarVector, typename IndexVector>
+EIGEN_DONT_INLINE void LU_kernel_bmod<SegSizeAtCompileTime>::run(const Index segsize, BlockScalarVector& dense, ScalarVector& tempv, ScalarVector& lusup, Index& luptr, const Index lda,
                                                                   const Index nrow, IndexVector& lsub, const Index lptr, const Index no_zeros)
 {
   typedef typename ScalarVector::Scalar Scalar;
@@ -45,7 +45,7 @@ EIGEN_DONT_INLINE void LU_kernel_bmod<SegSizeAtCompileTime>::run(const int segsi
   // The result of triangular solve is in tempv[*]; 
     // The result of matric-vector update is in dense[*]
   Index isub = lptr + no_zeros; 
-  int i;
+  Index i;
   Index irow;
   for (i = 0; i < ((SegSizeAtCompileTime==Dynamic)?segsize:SegSizeAtCompileTime); i++)
   {
@@ -56,7 +56,7 @@ EIGEN_DONT_INLINE void LU_kernel_bmod<SegSizeAtCompileTime>::run(const int segsi
   // Dense triangular solve -- start effective triangle
   luptr += lda * no_zeros + no_zeros; 
   // Form Eigen matrix and vector 
-  Map<Matrix<Scalar,SegSizeAtCompileTime,SegSizeAtCompileTime>, 0, OuterStride<> > A( &(lusup.data()[luptr]), segsize, segsize, OuterStride<>(lda) );
+  Map<Matrix<Scalar,SegSizeAtCompileTime,SegSizeAtCompileTime, ColMajor>, 0, OuterStride<> > A( &(lusup.data()[luptr]), segsize, segsize, OuterStride<>(lda) );
   Map<Matrix<Scalar,SegSizeAtCompileTime,1> > u(tempv.data(), segsize);
   
   u = A.template triangularView<UnitLower>().solve(u); 
@@ -65,9 +65,9 @@ EIGEN_DONT_INLINE void LU_kernel_bmod<SegSizeAtCompileTime>::run(const int segsi
   luptr += segsize;
   const Index PacketSize = internal::packet_traits<Scalar>::size;
   Index ldl = internal::first_multiple(nrow, PacketSize);
-  Map<Matrix<Scalar,Dynamic,SegSizeAtCompileTime>, 0, OuterStride<> > B( &(lusup.data()[luptr]), nrow, segsize, OuterStride<>(lda) );
-  Index aligned_offset = internal::first_aligned(tempv.data()+segsize, PacketSize);
-  Index aligned_with_B_offset = (PacketSize-internal::first_aligned(B.data(), PacketSize))%PacketSize;
+  Map<Matrix<Scalar,Dynamic,SegSizeAtCompileTime, ColMajor>, 0, OuterStride<> > B( &(lusup.data()[luptr]), nrow, segsize, OuterStride<>(lda) );
+  Index aligned_offset = internal::first_default_aligned(tempv.data()+segsize, PacketSize);
+  Index aligned_with_B_offset = (PacketSize-internal::first_default_aligned(B.data(), PacketSize))%PacketSize;
   Map<Matrix<Scalar,Dynamic,1>, 0, OuterStride<> > l(tempv.data()+segsize+aligned_offset+aligned_with_B_offset, nrow, OuterStride<>(ldl) );
   
   l.setZero();
@@ -91,21 +91,22 @@ EIGEN_DONT_INLINE void LU_kernel_bmod<SegSizeAtCompileTime>::run(const int segsi
 
 template <> struct LU_kernel_bmod<1>
 {
-  template <typename BlockScalarVector, typename ScalarVector, typename IndexVector, typename Index>
-  static EIGEN_DONT_INLINE void run(const int /*segsize*/, BlockScalarVector& dense, ScalarVector& /*tempv*/, ScalarVector& lusup, Index& luptr,
+  template <typename BlockScalarVector, typename ScalarVector, typename IndexVector>
+  static EIGEN_DONT_INLINE void run(const Index /*segsize*/, BlockScalarVector& dense, ScalarVector& /*tempv*/, ScalarVector& lusup, Index& luptr,
                                     const Index lda, const Index nrow, IndexVector& lsub, const Index lptr, const Index no_zeros);
 };
 
 
-template <typename BlockScalarVector, typename ScalarVector, typename IndexVector, typename Index>
-EIGEN_DONT_INLINE void LU_kernel_bmod<1>::run(const int /*segsize*/, BlockScalarVector& dense, ScalarVector& /*tempv*/, ScalarVector& lusup, Index& luptr,
+template <typename BlockScalarVector, typename ScalarVector, typename IndexVector>
+EIGEN_DONT_INLINE void LU_kernel_bmod<1>::run(const Index /*segsize*/, BlockScalarVector& dense, ScalarVector& /*tempv*/, ScalarVector& lusup, Index& luptr,
                                               const Index lda, const Index nrow, IndexVector& lsub, const Index lptr, const Index no_zeros)
 {
   typedef typename ScalarVector::Scalar Scalar;
+  typedef typename IndexVector::Scalar StorageIndex;
   Scalar f = dense(lsub(lptr + no_zeros));
   luptr += lda * no_zeros + no_zeros + 1;
   const Scalar* a(lusup.data() + luptr);
-  const /*typename IndexVector::Scalar*/Index*  irow(lsub.data()+lptr + no_zeros + 1);
+  const StorageIndex*  irow(lsub.data()+lptr + no_zeros + 1);
   Index i = 0;
   for (; i+1 < nrow; i+=2)
   {
diff --git a/nuparu/include/Eigen/src/SparseLU/SparseLU_panel_bmod.h b/nuparu/include/Eigen/src/SparseLU/SparseLU_panel_bmod.h
index da0e0fc3..822cf32c 100644
--- a/nuparu/include/Eigen/src/SparseLU/SparseLU_panel_bmod.h
+++ b/nuparu/include/Eigen/src/SparseLU/SparseLU_panel_bmod.h
@@ -52,8 +52,8 @@ namespace internal {
  * 
  * 
  */
-template <typename Scalar, typename Index>
-void SparseLUImpl<Scalar,Index>::panel_bmod(const Index m, const Index w, const Index jcol, 
+template <typename Scalar, typename StorageIndex>
+void SparseLUImpl<Scalar,StorageIndex>::panel_bmod(const Index m, const Index w, const Index jcol, 
                                             const Index nseg, ScalarVector& dense, ScalarVector& tempv,
                                             IndexVector& segrep, IndexVector& repfnz, GlobalLU_t& glu)
 {
@@ -102,7 +102,7 @@ void SparseLUImpl<Scalar,Index>::panel_bmod(const Index m, const Index w, const
     if(nsupc >= 2)
     { 
       Index ldu = internal::first_multiple<Index>(u_rows, PacketSize);
-      Map<Matrix<Scalar,Dynamic,Dynamic>, Aligned,  OuterStride<> > U(tempv.data(), u_rows, u_cols, OuterStride<>(ldu));
+      Map<ScalarMatrix, Aligned,  OuterStride<> > U(tempv.data(), u_rows, u_cols, OuterStride<>(ldu));
       
       // gather U
       Index u_col = 0;
@@ -136,17 +136,17 @@ void SparseLUImpl<Scalar,Index>::panel_bmod(const Index m, const Index w, const
       Index lda = glu.xlusup(fsupc+1) - glu.xlusup(fsupc);
       no_zeros = (krep - u_rows + 1) - fsupc;
       luptr += lda * no_zeros + no_zeros;
-      Map<Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > A(glu.lusup.data()+luptr, u_rows, u_rows, OuterStride<>(lda) );
+      MappedMatrixBlock A(glu.lusup.data()+luptr, u_rows, u_rows, OuterStride<>(lda) );
       U = A.template triangularView<UnitLower>().solve(U);
       
       // update
       luptr += u_rows;
-      Map<Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > B(glu.lusup.data()+luptr, nrow, u_rows, OuterStride<>(lda) );
+      MappedMatrixBlock B(glu.lusup.data()+luptr, nrow, u_rows, OuterStride<>(lda) );
       eigen_assert(tempv.size()>w*ldu + nrow*w + 1);
       
       Index ldl = internal::first_multiple<Index>(nrow, PacketSize);
-      Index offset = (PacketSize-internal::first_aligned(B.data(), PacketSize)) % PacketSize;
-      Map<Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> > L(tempv.data()+w*ldu+offset, nrow, u_cols, OuterStride<>(ldl));
+      Index offset = (PacketSize-internal::first_default_aligned(B.data(), PacketSize)) % PacketSize;
+      MappedMatrixBlock L(tempv.data()+w*ldu+offset, nrow, u_cols, OuterStride<>(ldl));
       
       L.setZero();
       internal::sparselu_gemm<Scalar>(L.rows(), L.cols(), B.cols(), B.data(), B.outerStride(), U.data(), U.outerStride(), L.data(), L.outerStride());
diff --git a/nuparu/include/Eigen/src/SparseLU/SparseLU_panel_dfs.h b/nuparu/include/Eigen/src/SparseLU/SparseLU_panel_dfs.h
index dc0054ef..155df733 100644
--- a/nuparu/include/Eigen/src/SparseLU/SparseLU_panel_dfs.h
+++ b/nuparu/include/Eigen/src/SparseLU/SparseLU_panel_dfs.h
@@ -37,11 +37,11 @@ namespace internal {
 template<typename IndexVector>
 struct panel_dfs_traits
 {
-  typedef typename IndexVector::Scalar Index;
-  panel_dfs_traits(Index jcol, Index* marker)
+  typedef typename IndexVector::Scalar StorageIndex;
+  panel_dfs_traits(Index jcol, StorageIndex* marker)
     : m_jcol(jcol), m_marker(marker)
   {}
-  bool update_segrep(Index krep, Index jj)
+  bool update_segrep(Index krep, StorageIndex jj)
   {
     if(m_marker[krep]<m_jcol)
     {
@@ -53,13 +53,13 @@ struct panel_dfs_traits
   void mem_expand(IndexVector& /*glu.lsub*/, Index /*nextl*/, Index /*chmark*/) {}
   enum { ExpandMem = false };
   Index m_jcol;
-  Index* m_marker;
+  StorageIndex* m_marker;
 };
 
 
-template <typename Scalar, typename Index>
+template <typename Scalar, typename StorageIndex>
 template <typename Traits>
-void SparseLUImpl<Scalar,Index>::dfs_kernel(const Index jj, IndexVector& perm_r,
+void SparseLUImpl<Scalar,StorageIndex>::dfs_kernel(const StorageIndex jj, IndexVector& perm_r,
                    Index& nseg, IndexVector& panel_lsub, IndexVector& segrep,
                    Ref<IndexVector> repfnz_col, IndexVector& xprune, Ref<IndexVector> marker, IndexVector& parent,
                    IndexVector& xplore, GlobalLU_t& glu,
@@ -67,14 +67,14 @@ void SparseLUImpl<Scalar,Index>::dfs_kernel(const Index jj, IndexVector& perm_r,
                   )
 {
   
-  Index kmark = marker(krow);
+  StorageIndex kmark = marker(krow);
       
   // For each unmarked krow of jj
   marker(krow) = jj; 
-  Index kperm = perm_r(krow); 
+  StorageIndex kperm = perm_r(krow); 
   if (kperm == emptyIdxLU ) {
     // krow is in L : place it in structure of L(*, jj)
-    panel_lsub(nextl_col++) = krow;  // krow is indexed into A
+    panel_lsub(nextl_col++) = StorageIndex(krow);  // krow is indexed into A
     
     traits.mem_expand(panel_lsub, nextl_col, kmark);
   }
@@ -83,9 +83,9 @@ void SparseLUImpl<Scalar,Index>::dfs_kernel(const Index jj, IndexVector& perm_r,
     // krow is in U : if its supernode-representative krep
     // has been explored, update repfnz(*)
     // krep = supernode representative of the current row
-    Index krep = glu.xsup(glu.supno(kperm)+1) - 1; 
+    StorageIndex krep = glu.xsup(glu.supno(kperm)+1) - 1; 
     // First nonzero element in the current column:
-    Index myfnz = repfnz_col(krep); 
+    StorageIndex myfnz = repfnz_col(krep); 
     
     if (myfnz != emptyIdxLU )
     {
@@ -96,26 +96,26 @@ void SparseLUImpl<Scalar,Index>::dfs_kernel(const Index jj, IndexVector& perm_r,
     else 
     {
       // Otherwise, perform dfs starting at krep
-      Index oldrep = emptyIdxLU; 
+      StorageIndex oldrep = emptyIdxLU; 
       parent(krep) = oldrep; 
       repfnz_col(krep) = kperm; 
-      Index xdfs =  glu.xlsub(krep); 
+      StorageIndex xdfs =  glu.xlsub(krep); 
       Index maxdfs = xprune(krep); 
       
-      Index kpar;
+      StorageIndex kpar;
       do 
       {
         // For each unmarked kchild of krep
         while (xdfs < maxdfs) 
         {
-          Index kchild = glu.lsub(xdfs); 
+          StorageIndex kchild = glu.lsub(xdfs); 
           xdfs++; 
-          Index chmark = marker(kchild); 
+          StorageIndex chmark = marker(kchild); 
           
           if (chmark != jj ) 
           {
             marker(kchild) = jj; 
-            Index chperm = perm_r(kchild); 
+            StorageIndex chperm = perm_r(kchild); 
             
             if (chperm == emptyIdxLU) 
             {
@@ -128,7 +128,7 @@ void SparseLUImpl<Scalar,Index>::dfs_kernel(const Index jj, IndexVector& perm_r,
               // case kchild is in U :
               // chrep = its supernode-rep. If its rep has been explored, 
               // update its repfnz(*)
-              Index chrep = glu.xsup(glu.supno(chperm)+1) - 1; 
+              StorageIndex chrep = glu.xsup(glu.supno(chperm)+1) - 1; 
               myfnz = repfnz_col(chrep); 
               
               if (myfnz != emptyIdxLU) 
@@ -215,8 +215,8 @@ void SparseLUImpl<Scalar,Index>::dfs_kernel(const Index jj, IndexVector& perm_r,
  * 
  */
 
-template <typename Scalar, typename Index>
-void SparseLUImpl<Scalar,Index>::panel_dfs(const Index m, const Index w, const Index jcol, MatrixType& A, IndexVector& perm_r, Index& nseg, ScalarVector& dense, IndexVector& panel_lsub, IndexVector& segrep, IndexVector& repfnz, IndexVector& xprune, IndexVector& marker, IndexVector& parent, IndexVector& xplore, GlobalLU_t& glu)
+template <typename Scalar, typename StorageIndex>
+void SparseLUImpl<Scalar,StorageIndex>::panel_dfs(const Index m, const Index w, const Index jcol, MatrixType& A, IndexVector& perm_r, Index& nseg, ScalarVector& dense, IndexVector& panel_lsub, IndexVector& segrep, IndexVector& repfnz, IndexVector& xprune, IndexVector& marker, IndexVector& parent, IndexVector& xplore, GlobalLU_t& glu)
 {
   Index nextl_col; // Next available position in panel_lsub[*,jj] 
   
@@ -227,7 +227,7 @@ void SparseLUImpl<Scalar,Index>::panel_dfs(const Index m, const Index w, const I
   panel_dfs_traits<IndexVector> traits(jcol, marker1.data());
   
   // For each column in the panel 
-  for (Index jj = jcol; jj < jcol + w; jj++) 
+  for (StorageIndex jj = StorageIndex(jcol); jj < jcol + w; jj++) 
   {
     nextl_col = (jj - jcol) * m; 
     
@@ -241,7 +241,7 @@ void SparseLUImpl<Scalar,Index>::panel_dfs(const Index m, const Index w, const I
       Index krow = it.row(); 
       dense_col(krow) = it.value();
       
-      Index kmark = marker(krow); 
+      StorageIndex kmark = marker(krow); 
       if (kmark == jj) 
         continue; // krow visited before, go to the next nonzero
       
diff --git a/nuparu/include/Eigen/src/SparseLU/SparseLU_pivotL.h b/nuparu/include/Eigen/src/SparseLU/SparseLU_pivotL.h
index ddcd4ec9..a86dac93 100644
--- a/nuparu/include/Eigen/src/SparseLU/SparseLU_pivotL.h
+++ b/nuparu/include/Eigen/src/SparseLU/SparseLU_pivotL.h
@@ -56,8 +56,8 @@ namespace internal {
  * \return 0 if success, i > 0 if U(i,i) is exactly zero 
  * 
  */
-template <typename Scalar, typename Index>
-Index SparseLUImpl<Scalar,Index>::pivotL(const Index jcol, const RealScalar& diagpivotthresh, IndexVector& perm_r, IndexVector& iperm_c, Index& pivrow, GlobalLU_t& glu)
+template <typename Scalar, typename StorageIndex>
+Index SparseLUImpl<Scalar,StorageIndex>::pivotL(const Index jcol, const RealScalar& diagpivotthresh, IndexVector& perm_r, IndexVector& iperm_c, Index& pivrow, GlobalLU_t& glu)
 {
   
   Index fsupc = (glu.xsup)((glu.supno)(jcol)); // First column in the supernode containing the column jcol
@@ -67,17 +67,18 @@ Index SparseLUImpl<Scalar,Index>::pivotL(const Index jcol, const RealScalar& dia
   Index lda = glu.xlusup(fsupc+1) - glu.xlusup(fsupc); // leading dimension
   Scalar* lu_sup_ptr = &(glu.lusup.data()[glu.xlusup(fsupc)]); // Start of the current supernode
   Scalar* lu_col_ptr = &(glu.lusup.data()[glu.xlusup(jcol)]); // Start of jcol in the supernode
-  Index* lsub_ptr = &(glu.lsub.data()[lptr]); // Start of row indices of the supernode
+  StorageIndex* lsub_ptr = &(glu.lsub.data()[lptr]); // Start of row indices of the supernode
   
   // Determine the largest abs numerical value for partial pivoting 
   Index diagind = iperm_c(jcol); // diagonal index 
-  RealScalar pivmax = 0.0; 
+  RealScalar pivmax(-1.0);
   Index pivptr = nsupc; 
   Index diag = emptyIdxLU; 
   RealScalar rtemp;
   Index isub, icol, itemp, k; 
   for (isub = nsupc; isub < nsupr; ++isub) {
-    rtemp = std::abs(lu_col_ptr[isub]);
+    using std::abs;
+    rtemp = abs(lu_col_ptr[isub]);
     if (rtemp > pivmax) {
       pivmax = rtemp; 
       pivptr = isub;
@@ -86,9 +87,10 @@ Index SparseLUImpl<Scalar,Index>::pivotL(const Index jcol, const RealScalar& dia
   }
   
   // Test for singularity
-  if ( pivmax == 0.0 ) {
-    pivrow = lsub_ptr[pivptr];
-    perm_r(pivrow) = jcol;
+  if ( pivmax <= RealScalar(0.0) ) {
+    // if pivmax == -1, the column is structurally empty, otherwise it is only numerically zero
+    pivrow = pivmax < RealScalar(0.0) ? diagind : lsub_ptr[pivptr];
+    perm_r(pivrow) = StorageIndex(jcol);
     return (jcol+1);
   }
   
@@ -101,14 +103,15 @@ Index SparseLUImpl<Scalar,Index>::pivotL(const Index jcol, const RealScalar& dia
     if (diag >= 0 ) 
     {
       // Diagonal element exists
-      rtemp = std::abs(lu_col_ptr[diag]);
-      if (rtemp != 0.0 && rtemp >= thresh) pivptr = diag;
+      using std::abs;
+      rtemp = abs(lu_col_ptr[diag]);
+      if (rtemp != RealScalar(0.0) && rtemp >= thresh) pivptr = diag;
     }
     pivrow = lsub_ptr[pivptr];
   }
   
   // Record pivot row
-  perm_r(pivrow) = jcol; 
+  perm_r(pivrow) = StorageIndex(jcol);
   // Interchange row subscripts
   if (pivptr != nsupc )
   {
diff --git a/nuparu/include/Eigen/src/SparseLU/SparseLU_pruneL.h b/nuparu/include/Eigen/src/SparseLU/SparseLU_pruneL.h
index 66460d16..ad32fed5 100644
--- a/nuparu/include/Eigen/src/SparseLU/SparseLU_pruneL.h
+++ b/nuparu/include/Eigen/src/SparseLU/SparseLU_pruneL.h
@@ -49,8 +49,9 @@ namespace internal {
  * \param glu Global LU data
  * 
  */
-template <typename Scalar, typename Index>
-void SparseLUImpl<Scalar,Index>::pruneL(const Index jcol, const IndexVector& perm_r, const Index pivrow, const Index nseg, const IndexVector& segrep, BlockIndexVector repfnz, IndexVector& xprune, GlobalLU_t& glu)
+template <typename Scalar, typename StorageIndex>
+void SparseLUImpl<Scalar,StorageIndex>::pruneL(const Index jcol, const IndexVector& perm_r, const Index pivrow, const Index nseg,
+                                               const IndexVector& segrep, BlockIndexVector repfnz, IndexVector& xprune, GlobalLU_t& glu)
 {
   // For each supernode-rep irep in U(*,j]
   Index jsupno = glu.supno(jcol); 
@@ -123,7 +124,7 @@ void SparseLUImpl<Scalar,Index>::pruneL(const Index jcol, const IndexVector& per
           }
         } // end while 
         
-        xprune(irep) = kmin;  //Pruning 
+        xprune(irep) = StorageIndex(kmin);  //Pruning 
       } // end if do_prune 
     } // end pruning 
   } // End for each U-segment
diff --git a/nuparu/include/Eigen/src/SparseLU/SparseLU_relax_snode.h b/nuparu/include/Eigen/src/SparseLU/SparseLU_relax_snode.h
index 58ec32e2..c408d01b 100644
--- a/nuparu/include/Eigen/src/SparseLU/SparseLU_relax_snode.h
+++ b/nuparu/include/Eigen/src/SparseLU/SparseLU_relax_snode.h
@@ -43,15 +43,15 @@ namespace internal {
  * \param descendants Number of descendants of each node in the etree
  * \param relax_end last column in a supernode
  */
-template <typename Scalar, typename Index>
-void SparseLUImpl<Scalar,Index>::relax_snode (const Index n, IndexVector& et, const Index relax_columns, IndexVector& descendants, IndexVector& relax_end)
+template <typename Scalar, typename StorageIndex>
+void SparseLUImpl<Scalar,StorageIndex>::relax_snode (const Index n, IndexVector& et, const Index relax_columns, IndexVector& descendants, IndexVector& relax_end)
 {
   
   // compute the number of descendants of each node in the etree
-  Index j, parent; 
+  Index parent; 
   relax_end.setConstant(emptyIdxLU);
   descendants.setZero();
-  for (j = 0; j < n; j++) 
+  for (Index j = 0; j < n; j++) 
   {
     parent = et(j);
     if (parent != n) // not the dummy root
@@ -59,7 +59,7 @@ void SparseLUImpl<Scalar,Index>::relax_snode (const Index n, IndexVector& et, co
   }
   // Identify the relaxed supernodes by postorder traversal of the etree
   Index snode_start; // beginning of a snode 
-  for (j = 0; j < n; )
+  for (Index j = 0; j < n; )
   {
     parent = et(j);
     snode_start = j; 
@@ -69,7 +69,7 @@ void SparseLUImpl<Scalar,Index>::relax_snode (const Index n, IndexVector& et, co
       parent = et(j);
     }
     // Found a supernode in postordered etree, j is the last column 
-    relax_end(snode_start) = j; // Record last column
+    relax_end(snode_start) = StorageIndex(j); // Record last column
     j++;
     // Search for a new leaf
     while (descendants(j) != 0 && j < n) j++;
diff --git a/nuparu/include/Eigen/src/SparseQR/SparseQR.h b/nuparu/include/Eigen/src/SparseQR/SparseQR.h
index 07c46e4b..4f26c19c 100644
--- a/nuparu/include/Eigen/src/SparseQR/SparseQR.h
+++ b/nuparu/include/Eigen/src/SparseQR/SparseQR.h
@@ -2,7 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2012-2013 Desire Nuentsa <desire.nuentsa_wakam@inria.fr>
-// Copyright (C) 2012-2013 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2012-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -21,8 +21,12 @@ namespace internal {
   template <typename SparseQRType> struct traits<SparseQRMatrixQReturnType<SparseQRType> >
   {
     typedef typename SparseQRType::MatrixType ReturnType;
-    typedef typename ReturnType::Index Index;
+    typedef typename ReturnType::StorageIndex StorageIndex;
     typedef typename ReturnType::StorageKind StorageKind;
+    enum {
+      RowsAtCompileTime = Dynamic,
+      ColsAtCompileTime = Dynamic
+    };
   };
   template <typename SparseQRType> struct traits<SparseQRMatrixQTransposeReturnType<SparseQRType> >
   {
@@ -58,29 +62,55 @@ namespace internal {
   * \tparam _OrderingType The fill-reducing ordering method. See the \link OrderingMethods_Module 
   *  OrderingMethods \endlink module for the list of built-in and external ordering methods.
   * 
+  * \implsparsesolverconcept
+  *
+  * \warning The input sparse matrix A must be in compressed mode (see SparseMatrix::makeCompressed()).
   * 
   */
 template<typename _MatrixType, typename _OrderingType>
-class SparseQR
+class SparseQR : public SparseSolverBase<SparseQR<_MatrixType,_OrderingType> >
 {
+  protected:
+    typedef SparseSolverBase<SparseQR<_MatrixType,_OrderingType> > Base;
+    using Base::m_isInitialized;
   public:
+    using Base::_solve_impl;
     typedef _MatrixType MatrixType;
     typedef _OrderingType OrderingType;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
-    typedef SparseMatrix<Scalar,ColMajor,Index> QRMatrixType;
-    typedef Matrix<Index, Dynamic, 1> IndexVector;
+    typedef typename MatrixType::StorageIndex StorageIndex;
+    typedef SparseMatrix<Scalar,ColMajor,StorageIndex> QRMatrixType;
+    typedef Matrix<StorageIndex, Dynamic, 1> IndexVector;
     typedef Matrix<Scalar, Dynamic, 1> ScalarVector;
-    typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType;
+    typedef PermutationMatrix<Dynamic, Dynamic, StorageIndex> PermutationType;
+
+    enum {
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
+    
   public:
-    SparseQR () : m_isInitialized(false), m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false)
+    SparseQR () :  m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false),m_isEtreeOk(false)
     { }
     
-    SparseQR(const MatrixType& mat) : m_isInitialized(false), m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false)
+    /** Construct a QR factorization of the matrix \a mat.
+      * 
+      * \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
+      * 
+      * \sa compute()
+      */
+    explicit SparseQR(const MatrixType& mat) : m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false),m_isEtreeOk(false)
     {
       compute(mat);
     }
+    
+    /** Computes the QR factorization of the sparse matrix \a mat.
+      * 
+      * \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
+      * 
+      * \sa analyzePattern(), factorize()
+      */
     void compute(const MatrixType& mat)
     {
       analyzePattern(mat);
@@ -105,7 +135,7 @@ class SparseQR
       *
       * \sa setPivotThreshold()
       */
-    Index rank() const 
+    Index rank() const
     {
       eigen_assert(m_isInitialized && "The factorization should be called first, use compute()");
       return m_nonzeropivots; 
@@ -148,7 +178,7 @@ class SparseQR
     
     /** \internal */
     template<typename Rhs, typename Dest>
-    bool _solve(const MatrixBase<Rhs> &B, MatrixBase<Dest> &dest) const
+    bool _solve_impl(const MatrixBase<Rhs> &B, MatrixBase<Dest> &dest) const
     {
       eigen_assert(m_isInitialized && "The factorization should be called first, use compute()");
       eigen_assert(this->rows() == B.rows() && "SparseQR::solve() : invalid number of rows in the right hand side matrix");
@@ -161,17 +191,17 @@ class SparseQR
       b = y;
       
       // Solve with the triangular matrix R
+      y.resize((std::max<Index>)(cols(),y.rows()),y.cols());
       y.topRows(rank) = this->matrixR().topLeftCorner(rank, rank).template triangularView<Upper>().solve(b.topRows(rank));
-      y.bottomRows(y.size()-rank).setZero();
-
+      y.bottomRows(y.rows()-rank).setZero();
+      
       // Apply the column permutation
-      if (m_perm_c.size())  dest.topRows(cols()) = colsPermutation() * y.topRows(cols());
+      if (m_perm_c.size())  dest = colsPermutation() * y.topRows(cols());
       else                  dest = y.topRows(cols());
       
       m_info = Success;
       return true;
     }
-    
 
     /** Sets the threshold that is used to determine linearly dependent columns during the factorization.
       *
@@ -189,23 +219,23 @@ class SparseQR
       * \sa compute()
       */
     template<typename Rhs>
-    inline const internal::solve_retval<SparseQR, Rhs> solve(const MatrixBase<Rhs>& B) const 
+    inline const Solve<SparseQR, Rhs> solve(const MatrixBase<Rhs>& B) const 
     {
       eigen_assert(m_isInitialized && "The factorization should be called first, use compute()");
       eigen_assert(this->rows() == B.rows() && "SparseQR::solve() : invalid number of rows in the right hand side matrix");
-      return internal::solve_retval<SparseQR, Rhs>(*this, B.derived());
+      return Solve<SparseQR, Rhs>(*this, B.derived());
     }
     template<typename Rhs>
-    inline const internal::sparse_solve_retval<SparseQR, Rhs> solve(const SparseMatrixBase<Rhs>& B) const
+    inline const Solve<SparseQR, Rhs> solve(const SparseMatrixBase<Rhs>& B) const
     {
           eigen_assert(m_isInitialized && "The factorization should be called first, use compute()");
           eigen_assert(this->rows() == B.rows() && "SparseQR::solve() : invalid number of rows in the right hand side matrix");
-          return internal::sparse_solve_retval<SparseQR, Rhs>(*this, B.derived());
+          return Solve<SparseQR, Rhs>(*this, B.derived());
     }
     
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was succesful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the QR factorization reports a numerical problem
       *          \c InvalidInput if the input matrix is invalid
       *
@@ -217,8 +247,9 @@ class SparseQR
       return m_info;
     }
 
-  protected:
-    inline void sort_matrix_Q()
+
+    /** \internal */
+    inline void _sort_matrix_Q()
     {
       if(this->m_isQSorted) return;
       // The matrix Q is sorted during the transposition
@@ -229,7 +260,6 @@ class SparseQR
 
     
   protected:
-    bool m_isInitialized;
     bool m_analysisIsok;
     bool m_factorizationIsok;
     mutable ComputationInfo m_info;
@@ -243,56 +273,63 @@ class SparseQR
     PermutationType m_outputPerm_c; // The final column permutation
     RealScalar m_threshold;         // Threshold to determine null Householder reflections
     bool m_useDefaultThreshold;     // Use default threshold
-    Index m_nonzeropivots;          // Number of non zero pivots found 
+    Index m_nonzeropivots;          // Number of non zero pivots found
     IndexVector m_etree;            // Column elimination tree
     IndexVector m_firstRowElt;      // First element in each row
-    bool m_isQSorted;                 // whether Q is sorted or not
+    bool m_isQSorted;               // whether Q is sorted or not
+    bool m_isEtreeOk;               // whether the elimination tree matches the initial input matrix
     
     template <typename, typename > friend struct SparseQR_QProduct;
-    template <typename > friend struct SparseQRMatrixQReturnType;
     
 };
 
 /** \brief Preprocessing step of a QR factorization 
+  * 
+  * \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
   * 
   * In this step, the fill-reducing permutation is computed and applied to the columns of A
-  * and the column elimination tree is computed as well. Only the sparcity pattern of \a mat is exploited.
+  * and the column elimination tree is computed as well. Only the sparsity pattern of \a mat is exploited.
   * 
   * \note In this step it is assumed that there is no empty row in the matrix \a mat.
   */
 template <typename MatrixType, typename OrderingType>
 void SparseQR<MatrixType,OrderingType>::analyzePattern(const MatrixType& mat)
 {
+  eigen_assert(mat.isCompressed() && "SparseQR requires a sparse matrix in compressed mode. Call .makeCompressed() before passing it to SparseQR");
+  // Copy to a column major matrix if the input is rowmajor
+  typename internal::conditional<MatrixType::IsRowMajor,QRMatrixType,const MatrixType&>::type matCpy(mat);
   // Compute the column fill reducing ordering
   OrderingType ord; 
-  ord(mat, m_perm_c); 
+  ord(matCpy, m_perm_c); 
   Index n = mat.cols();
   Index m = mat.rows();
+  Index diagSize = (std::min)(m,n);
   
   if (!m_perm_c.size())
   {
     m_perm_c.resize(n);
-    m_perm_c.indices().setLinSpaced(n, 0,n-1);
+    m_perm_c.indices().setLinSpaced(n, 0,StorageIndex(n-1));
   }
   
   // Compute the column elimination tree of the permuted matrix
   m_outputPerm_c = m_perm_c.inverse();
-  internal::coletree(mat, m_etree, m_firstRowElt, m_outputPerm_c.indices().data());
+  internal::coletree(matCpy, m_etree, m_firstRowElt, m_outputPerm_c.indices().data());
+  m_isEtreeOk = true;
   
-  m_R.resize(n, n);
-  m_Q.resize(m, n);
+  m_R.resize(m, n);
+  m_Q.resize(m, diagSize);
   
   // Allocate space for nonzero elements : rough estimation
   m_R.reserve(2*mat.nonZeros()); //FIXME Get a more accurate estimation through symbolic factorization with the etree
   m_Q.reserve(2*mat.nonZeros());
-  m_hcoeffs.resize(n);
+  m_hcoeffs.resize(diagSize);
   m_analysisIsok = true;
 }
 
 /** \brief Performs the numerical QR factorization of the input matrix
   * 
   * The function SparseQR::analyzePattern(const MatrixType&) must have been called beforehand with
-  * a matrix having the same sparcity pattern than \a mat.
+  * a matrix having the same sparsity pattern as \a mat.
   * 
   * \param mat The sparse column-major matrix
   */
@@ -300,67 +337,92 @@ template <typename MatrixType, typename OrderingType>
 void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
 {
   using std::abs;
-  using std::max;
   
   eigen_assert(m_analysisIsok && "analyzePattern() should be called before this step");
-  Index m = mat.rows();
-  Index n = mat.cols();
-  IndexVector mark(m); mark.setConstant(-1);  // Record the visited nodes
-  IndexVector Ridx(n), Qidx(m);               // Store temporarily the row indexes for the current column of R and Q
-  Index nzcolR, nzcolQ;                       // Number of nonzero for the current column of R and Q
-  ScalarVector tval(m);                       // The dense vector used to compute the current column
-  bool found_diag;
-    
+  StorageIndex m = StorageIndex(mat.rows());
+  StorageIndex n = StorageIndex(mat.cols());
+  StorageIndex diagSize = (std::min)(m,n);
+  IndexVector mark((std::max)(m,n)); mark.setConstant(-1);  // Record the visited nodes
+  IndexVector Ridx(n), Qidx(m);                             // Store temporarily the row indexes for the current column of R and Q
+  Index nzcolR, nzcolQ;                                     // Number of nonzero for the current column of R and Q
+  ScalarVector tval(m);                                     // The dense vector used to compute the current column
+  RealScalar pivotThreshold = m_threshold;
+  
+  m_R.setZero();
+  m_Q.setZero();
   m_pmat = mat;
+  if(!m_isEtreeOk)
+  {
+    m_outputPerm_c = m_perm_c.inverse();
+    internal::coletree(m_pmat, m_etree, m_firstRowElt, m_outputPerm_c.indices().data());
+    m_isEtreeOk = true;
+  }
+
   m_pmat.uncompress(); // To have the innerNonZeroPtr allocated
+  
   // Apply the fill-in reducing permutation lazily:
-  for (int i = 0; i < n; i++)
   {
-    Index p = m_perm_c.size() ? m_perm_c.indices()(i) : i;
-    m_pmat.outerIndexPtr()[p] = mat.outerIndexPtr()[i]; 
-    m_pmat.innerNonZeroPtr()[p] = mat.outerIndexPtr()[i+1] - mat.outerIndexPtr()[i]; 
+    // If the input is row major, copy the original column indices,
+    // otherwise directly use the input matrix
+    // 
+    IndexVector originalOuterIndicesCpy;
+    const StorageIndex *originalOuterIndices = mat.outerIndexPtr();
+    if(MatrixType::IsRowMajor)
+    {
+      originalOuterIndicesCpy = IndexVector::Map(m_pmat.outerIndexPtr(),n+1);
+      originalOuterIndices = originalOuterIndicesCpy.data();
+    }
+    
+    for (int i = 0; i < n; i++)
+    {
+      Index p = m_perm_c.size() ? m_perm_c.indices()(i) : i;
+      m_pmat.outerIndexPtr()[p] = originalOuterIndices[i]; 
+      m_pmat.innerNonZeroPtr()[p] = originalOuterIndices[i+1] - originalOuterIndices[i]; 
+    }
   }
   
-  /* Compute the default threshold, see : 
+  /* Compute the default threshold as in MATLAB, see:
    * Tim Davis, "Algorithm 915, SuiteSparseQR: Multifrontal Multithreaded Rank-Revealing
    * Sparse QR Factorization, ACM Trans. on Math. Soft. 38(1), 2011, Page 8:3 
    */
   if(m_useDefaultThreshold) 
   {
     RealScalar max2Norm = 0.0;
-    for (int j = 0; j < n; j++) max2Norm = (max)(max2Norm, m_pmat.col(j).norm());
-    m_threshold = 20 * (m + n) * max2Norm * NumTraits<RealScalar>::epsilon();
+    for (int j = 0; j < n; j++) max2Norm = numext::maxi(max2Norm, m_pmat.col(j).norm());
+    if(max2Norm==RealScalar(0))
+      max2Norm = RealScalar(1);
+    pivotThreshold = 20 * (m + n) * max2Norm * NumTraits<RealScalar>::epsilon();
   }
   
   // Initialize the numerical permutation
   m_pivotperm.setIdentity(n);
   
-  Index nonzeroCol = 0; // Record the number of valid pivots
-  
+  StorageIndex nonzeroCol = 0; // Record the number of valid pivots
+  m_Q.startVec(0);
+
   // Left looking rank-revealing QR factorization: compute a column of R and Q at a time
-  for (Index col = 0; col < n; ++col)
+  for (StorageIndex col = 0; col < n; ++col)
   {
     mark.setConstant(-1);
     m_R.startVec(col);
-    m_Q.startVec(col);
     mark(nonzeroCol) = col;
     Qidx(0) = nonzeroCol;
     nzcolR = 0; nzcolQ = 1;
-    found_diag = false;
+    bool found_diag = nonzeroCol>=m;
     tval.setZero(); 
     
     // Symbolic factorization: find the nonzero locations of the column k of the factors R and Q, i.e.,
     // all the nodes (with indexes lower than rank) reachable through the column elimination tree (etree) rooted at node k.
     // Note: if the diagonal entry does not exist, then its contribution must be explicitly added,
     // thus the trick with found_diag that permits to do one more iteration on the diagonal element if this one has not been found.
-    for (typename MatrixType::InnerIterator itp(m_pmat, col); itp || !found_diag; ++itp)
+    for (typename QRMatrixType::InnerIterator itp(m_pmat, col); itp || !found_diag; ++itp)
     {
-      Index curIdx = nonzeroCol ;
-      if(itp) curIdx = itp.row();
+      StorageIndex curIdx = nonzeroCol;
+      if(itp) curIdx = StorageIndex(itp.row());
       if(curIdx == nonzeroCol) found_diag = true;
       
       // Get the nonzeros indexes of the current column of R
-      Index st = m_firstRowElt(curIdx); // The traversal of the etree starts here 
+      StorageIndex st = m_firstRowElt(curIdx); // The traversal of the etree starts here
       if (st < 0 )
       {
         m_lastError = "Empty row found during numerical factorization";
@@ -397,7 +459,7 @@ void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
     // Browse all the indexes of R(:,col) in reverse order
     for (Index i = nzcolR-1; i >= 0; i--)
     {
-      Index curIdx = m_pivotperm.indices()(Ridx(i));
+      Index curIdx = Ridx(i);
       
       // Apply the curIdx-th householder vector to the current column (temporarily stored into tval)
       Scalar tdot(0);
@@ -417,7 +479,7 @@ void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
       {
         for (typename QRMatrixType::InnerIterator itq(m_Q, curIdx); itq; ++itq)
         {
-          Index iQ = itq.row();
+          StorageIndex iQ = StorageIndex(itq.row());
           if (mark(iQ) != col)
           {
             Qidx(nzcolQ++) = iQ;  // Add this row to the pattern of Q,
@@ -426,33 +488,36 @@ void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
         }
       }
     } // End update current column
-        
-    // Compute the Householder reflection that eliminate the current column
-    // FIXME this step should call the Householder module.
-    Scalar tau;
-    RealScalar beta;
-    Scalar c0 = nzcolQ ? tval(Qidx(0)) : Scalar(0);
     
-    // First, the squared norm of Q((col+1):m, col)
-    RealScalar sqrNorm = 0.;
-    for (Index itq = 1; itq < nzcolQ; ++itq) sqrNorm += numext::abs2(tval(Qidx(itq)));
+    Scalar tau = RealScalar(0);
+    RealScalar beta = 0;
     
-    if(sqrNorm == RealScalar(0) && numext::imag(c0) == RealScalar(0))
+    if(nonzeroCol < diagSize)
     {
-      tau = RealScalar(0);
-      beta = numext::real(c0);
-      tval(Qidx(0)) = 1;
-     }
-    else
-    {
-      beta = std::sqrt(numext::abs2(c0) + sqrNorm);
-      if(numext::real(c0) >= RealScalar(0))
-        beta = -beta;
-      tval(Qidx(0)) = 1;
-      for (Index itq = 1; itq < nzcolQ; ++itq)
-        tval(Qidx(itq)) /= (c0 - beta);
-      tau = numext::conj((beta-c0) / beta);
-        
+      // Compute the Householder reflection that eliminates the current column
+      // FIXME this step should call the Householder module.
+      Scalar c0 = nzcolQ ? tval(Qidx(0)) : Scalar(0);
+      
+      // First, the squared norm of Q((col+1):m, col)
+      RealScalar sqrNorm = 0.;
+      for (Index itq = 1; itq < nzcolQ; ++itq) sqrNorm += numext::abs2(tval(Qidx(itq)));
+      if(sqrNorm == RealScalar(0) && numext::imag(c0) == RealScalar(0))
+      {
+        beta = numext::real(c0);
+        tval(Qidx(0)) = 1;
+      }
+      else
+      {
+        using std::sqrt;
+        beta = sqrt(numext::abs2(c0) + sqrNorm);
+        if(numext::real(c0) >= RealScalar(0))
+          beta = -beta;
+        tval(Qidx(0)) = 1;
+        for (Index itq = 1; itq < nzcolQ; ++itq)
+          tval(Qidx(itq)) /= (c0 - beta);
+        tau = numext::conj((beta-c0) / beta);
+          
+      }
     }
 
     // Insert values in R
@@ -466,45 +531,49 @@ void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
       }
     }
 
-    if(abs(beta) >= m_threshold)
+    if(nonzeroCol < diagSize && abs(beta) >= pivotThreshold)
     {
       m_R.insertBackByOuterInner(col, nonzeroCol) = beta;
-      nonzeroCol++;
       // The householder coefficient
-      m_hcoeffs(col) = tau;
+      m_hcoeffs(nonzeroCol) = tau;
       // Record the householder reflections
       for (Index itq = 0; itq < nzcolQ; ++itq)
       {
         Index iQ = Qidx(itq);
-        m_Q.insertBackByOuterInnerUnordered(col,iQ) = tval(iQ);
+        m_Q.insertBackByOuterInnerUnordered(nonzeroCol,iQ) = tval(iQ);
         tval(iQ) = Scalar(0.);
-      }    
+      }
+      nonzeroCol++;
+      if(nonzeroCol<diagSize)
+        m_Q.startVec(nonzeroCol);
     }
     else
     {
       // Zero pivot found: move implicitly this column to the end
-      m_hcoeffs(col) = Scalar(0);
       for (Index j = nonzeroCol; j < n-1; j++) 
         std::swap(m_pivotperm.indices()(j), m_pivotperm.indices()[j+1]);
       
       // Recompute the column elimination tree
       internal::coletree(m_pmat, m_etree, m_firstRowElt, m_pivotperm.indices().data());
+      m_isEtreeOk = false;
     }
   }
   
+  m_hcoeffs.tail(diagSize-nonzeroCol).setZero();
+  
   // Finalize the column pointers of the sparse matrices R and Q
   m_Q.finalize();
   m_Q.makeCompressed();
   m_R.finalize();
   m_R.makeCompressed();
   m_isQSorted = false;
-  
+
   m_nonzeropivots = nonzeroCol;
   
   if(nonzeroCol<n)
   {
     // Permute the triangular factor to put the 'dead' columns to the end
-    MatrixType tempR(m_R);
+    QRMatrixType tempR(m_R);
     m_R = tempR * m_pivotperm;
     
     // Update the column permutation
@@ -516,40 +585,11 @@ void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
   m_info = Success;
 }
 
-namespace internal {
-  
-template<typename _MatrixType, typename OrderingType, typename Rhs>
-struct solve_retval<SparseQR<_MatrixType,OrderingType>, Rhs>
-  : solve_retval_base<SparseQR<_MatrixType,OrderingType>, Rhs>
-{
-  typedef SparseQR<_MatrixType,OrderingType> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-template<typename _MatrixType, typename OrderingType, typename Rhs>
-struct sparse_solve_retval<SparseQR<_MatrixType, OrderingType>, Rhs>
- : sparse_solve_retval_base<SparseQR<_MatrixType, OrderingType>, Rhs>
-{
-  typedef SparseQR<_MatrixType, OrderingType> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec, Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    this->defaultEvalTo(dst);
-  }
-};
-} // end namespace internal
-
 template <typename SparseQRType, typename Derived>
 struct SparseQR_QProduct : ReturnByValue<SparseQR_QProduct<SparseQRType, Derived> >
 {
   typedef typename SparseQRType::QRMatrixType MatrixType;
   typedef typename SparseQRType::Scalar Scalar;
-  typedef typename SparseQRType::Index Index;
   // Get the references 
   SparseQR_QProduct(const SparseQRType& qr, const Derived& other, bool transpose) : 
   m_qr(qr),m_other(other),m_transpose(transpose) {}
@@ -560,17 +600,20 @@ struct SparseQR_QProduct : ReturnByValue<SparseQR_QProduct<SparseQRType, Derived
   template<typename DesType>
   void evalTo(DesType& res) const
   {
+    Index m = m_qr.rows();
     Index n = m_qr.cols();
+    Index diagSize = (std::min)(m,n);
     res = m_other;
     if (m_transpose)
     {
       eigen_assert(m_qr.m_Q.rows() == m_other.rows() && "Non conforming object sizes");
       //Compute res = Q' * other column by column
       for(Index j = 0; j < res.cols(); j++){
-        for (Index k = 0; k < n; k++)
+        for (Index k = 0; k < diagSize; k++)
         {
           Scalar tau = Scalar(0);
           tau = m_qr.m_Q.col(k).dot(res.col(j));
+          if(tau==Scalar(0)) continue;
           tau = tau * m_qr.m_hcoeffs(k);
           res.col(j) -= tau * m_qr.m_Q.col(k);
         }
@@ -578,14 +621,15 @@ struct SparseQR_QProduct : ReturnByValue<SparseQR_QProduct<SparseQRType, Derived
     }
     else
     {
-      eigen_assert(m_qr.m_Q.cols() == m_other.rows() && "Non conforming object sizes");
-      // Compute res = Q' * other column by column
+      eigen_assert(m_qr.m_Q.rows() == m_other.rows() && "Non conforming object sizes");
+      // Compute res = Q * other column by column
       for(Index j = 0; j < res.cols(); j++)
       {
-        for (Index k = n-1; k >=0; k--)
+        for (Index k = diagSize-1; k >=0; k--)
         {
           Scalar tau = Scalar(0);
           tau = m_qr.m_Q.col(k).dot(res.col(j));
+          if(tau==Scalar(0)) continue;
           tau = tau * m_qr.m_hcoeffs(k);
           res.col(j) -= tau * m_qr.m_Q.col(k);
         }
@@ -601,10 +645,13 @@ struct SparseQR_QProduct : ReturnByValue<SparseQR_QProduct<SparseQRType, Derived
 template<typename SparseQRType>
 struct SparseQRMatrixQReturnType : public EigenBase<SparseQRMatrixQReturnType<SparseQRType> >
 {  
-  typedef typename SparseQRType::Index Index;
   typedef typename SparseQRType::Scalar Scalar;
   typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
-  SparseQRMatrixQReturnType(const SparseQRType& qr) : m_qr(qr) {}
+  enum {
+    RowsAtCompileTime = Dynamic,
+    ColsAtCompileTime = Dynamic
+  };
+  explicit SparseQRMatrixQReturnType(const SparseQRType& qr) : m_qr(qr) {}
   template<typename Derived>
   SparseQR_QProduct<SparseQRType, Derived> operator*(const MatrixBase<Derived>& other)
   {
@@ -615,32 +662,19 @@ struct SparseQRMatrixQReturnType : public EigenBase<SparseQRMatrixQReturnType<Sp
     return SparseQRMatrixQTransposeReturnType<SparseQRType>(m_qr);
   }
   inline Index rows() const { return m_qr.rows(); }
-  inline Index cols() const { return m_qr.cols(); }
+  inline Index cols() const { return (std::min)(m_qr.rows(),m_qr.cols()); }
   // To use for operations with the transpose of Q
   SparseQRMatrixQTransposeReturnType<SparseQRType> transpose() const
   {
     return SparseQRMatrixQTransposeReturnType<SparseQRType>(m_qr);
   }
-  template<typename Dest> void evalTo(MatrixBase<Dest>& dest) const
-  {
-    dest.derived() = m_qr.matrixQ() * Dest::Identity(m_qr.rows(), m_qr.rows());
-  }
-  template<typename Dest> void evalTo(SparseMatrixBase<Dest>& dest) const
-  {
-    Dest idMat(m_qr.rows(), m_qr.rows());
-    idMat.setIdentity();
-    // Sort the sparse householder reflectors if needed
-    const_cast<SparseQRType *>(&m_qr)->sort_matrix_Q();
-    dest.derived() = SparseQR_QProduct<SparseQRType, Dest>(m_qr, idMat, false);
-  }
-
   const SparseQRType& m_qr;
 };
 
 template<typename SparseQRType>
 struct SparseQRMatrixQTransposeReturnType
 {
-  SparseQRMatrixQTransposeReturnType(const SparseQRType& qr) : m_qr(qr) {}
+  explicit SparseQRMatrixQTransposeReturnType(const SparseQRType& qr) : m_qr(qr) {}
   template<typename Derived>
   SparseQR_QProduct<SparseQRType,Derived> operator*(const MatrixBase<Derived>& other)
   {
@@ -649,6 +683,47 @@ struct SparseQRMatrixQTransposeReturnType
   const SparseQRType& m_qr;
 };
 
+namespace internal {
+  
+template<typename SparseQRType>
+struct evaluator_traits<SparseQRMatrixQReturnType<SparseQRType> >
+{
+  typedef typename SparseQRType::MatrixType MatrixType;
+  typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind;
+  typedef SparseShape Shape;
+  static const int AssumeAliasing = 0;
+};
+
+template< typename DstXprType, typename SparseQRType>
+struct Assignment<DstXprType, SparseQRMatrixQReturnType<SparseQRType>, internal::assign_op<typename DstXprType::Scalar>, Sparse2Sparse>
+{
+  typedef SparseQRMatrixQReturnType<SparseQRType> SrcXprType;
+  typedef typename DstXprType::Scalar Scalar;
+  typedef typename DstXprType::StorageIndex StorageIndex;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &/*func*/)
+  {
+    typename DstXprType::PlainObject idMat(src.m_qr.rows(), src.m_qr.rows());
+    idMat.setIdentity();
+    // Sort the sparse householder reflectors if needed
+    const_cast<SparseQRType *>(&src.m_qr)->_sort_matrix_Q();
+    dst = SparseQR_QProduct<SparseQRType, DstXprType>(src.m_qr, idMat, false);
+  }
+};
+
+template< typename DstXprType, typename SparseQRType>
+struct Assignment<DstXprType, SparseQRMatrixQReturnType<SparseQRType>, internal::assign_op<typename DstXprType::Scalar>, Sparse2Dense>
+{
+  typedef SparseQRMatrixQReturnType<SparseQRType> SrcXprType;
+  typedef typename DstXprType::Scalar Scalar;
+  typedef typename DstXprType::StorageIndex StorageIndex;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar> &/*func*/)
+  {
+    dst = src.m_qr.matrixQ() * DstXprType::Identity(src.m_qr.rows(), src.m_qr.rows());
+  }
+};
+
+} // end namespace internal
+
 } // end namespace Eigen
 
 #endif
diff --git a/nuparu/include/Eigen/src/StlSupport/StdDeque.h b/nuparu/include/Eigen/src/StlSupport/StdDeque.h
index 4ee8e5c1..25930cb8 100644
--- a/nuparu/include/Eigen/src/StlSupport/StdDeque.h
+++ b/nuparu/include/Eigen/src/StlSupport/StdDeque.h
@@ -11,10 +11,10 @@
 #ifndef EIGEN_STDDEQUE_H
 #define EIGEN_STDDEQUE_H
 
-#include "Eigen/src/StlSupport/details.h"
+#include "details.h"
 
 // Define the explicit instantiation (e.g. necessary for the Intel compiler)
-#if defined(__INTEL_COMPILER) || defined(__GNUC__)
+#if EIGEN_COMP_GNUC || EIGEN_COMP_ICC
   #define EIGEN_EXPLICIT_STL_DEQUE_INSTANTIATION(...) template class std::deque<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> >;
 #else
   #define EIGEN_EXPLICIT_STL_DEQUE_INSTANTIATION(...)
@@ -53,7 +53,7 @@ namespace std \
 }
 
 // check whether we really need the std::deque specialization
-#if !(defined(_GLIBCXX_DEQUE) && (!EIGEN_GNUC_AT_LEAST(4,1))) /* Note that before gcc-4.1 we already have: std::deque::resize(size_type,const T&). */
+#if !EIGEN_HAS_CXX11_CONTAINERS && !(defined(_GLIBCXX_DEQUE) && (!EIGEN_GNUC_AT_LEAST(4,1))) /* Note that before gcc-4.1 we already have: std::deque::resize(size_type,const T&). */
 
 namespace std {
 
diff --git a/nuparu/include/Eigen/src/StlSupport/StdList.h b/nuparu/include/Eigen/src/StlSupport/StdList.h
index 627381ec..7412b50a 100644
--- a/nuparu/include/Eigen/src/StlSupport/StdList.h
+++ b/nuparu/include/Eigen/src/StlSupport/StdList.h
@@ -10,10 +10,10 @@
 #ifndef EIGEN_STDLIST_H
 #define EIGEN_STDLIST_H
 
-#include "Eigen/src/StlSupport/details.h"
+#include "details.h"
 
 // Define the explicit instantiation (e.g. necessary for the Intel compiler)
-#if defined(__INTEL_COMPILER) || defined(__GNUC__)
+#if EIGEN_COMP_GNUC || EIGEN_COMP_ICC
   #define EIGEN_EXPLICIT_STL_LIST_INSTANTIATION(...) template class std::list<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> >;
 #else
   #define EIGEN_EXPLICIT_STL_LIST_INSTANTIATION(...)
@@ -51,8 +51,8 @@ namespace std \
   }; \
 }
 
-// check whether we really need the std::vector specialization
-#if !(defined(_GLIBCXX_VECTOR) && (!EIGEN_GNUC_AT_LEAST(4,1))) /* Note that before gcc-4.1 we already have: std::list::resize(size_type,const T&). */
+// check whether we really need the std::list specialization
+#if !EIGEN_HAS_CXX11_CONTAINERS && !(defined(_GLIBCXX_LIST) && (!EIGEN_GNUC_AT_LEAST(4,1))) /* Note that before gcc-4.1 we already have: std::list::resize(size_type,const T&). */
 
 namespace std
 {
diff --git a/nuparu/include/Eigen/src/StlSupport/StdVector.h b/nuparu/include/Eigen/src/StlSupport/StdVector.h
index 40a9abef..ec22821d 100644
--- a/nuparu/include/Eigen/src/StlSupport/StdVector.h
+++ b/nuparu/include/Eigen/src/StlSupport/StdVector.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_STDVECTOR_H
 #define EIGEN_STDVECTOR_H
 
-#include "Eigen/src/StlSupport/details.h"
+#include "details.h"
 
 /**
  * This section contains a convenience MACRO which allows an easy specialization of
@@ -44,6 +44,9 @@ namespace std \
   }; \
 }
 
+// Don't specialize if containers are implemented according to C++11
+#if !EIGEN_HAS_CXX11_CONTAINERS
+
 namespace std {
 
 #define EIGEN_STD_VECTOR_SPECIALIZATION_BODY \
@@ -122,5 +125,7 @@ namespace std {
 #endif
   };
 }
+#endif // !EIGEN_HAS_CXX11_CONTAINERS
+
 
 #endif // EIGEN_STDVECTOR_H
diff --git a/nuparu/include/Eigen/src/StlSupport/details.h b/nuparu/include/Eigen/src/StlSupport/details.h
index d8debc7c..e42ec024 100644
--- a/nuparu/include/Eigen/src/StlSupport/details.h
+++ b/nuparu/include/Eigen/src/StlSupport/details.h
@@ -46,7 +46,7 @@ namespace Eigen {
     ~aligned_allocator_indirection() {}
   };
 
-#ifdef _MSC_VER
+#if EIGEN_COMP_MSVC
 
   // sometimes, MSVC detects, at compile time, that the argument x
   // in std::vector::resize(size_t s,T x) won't be aligned and generate an error
diff --git a/nuparu/include/Eigen/src/SuperLUSupport/SuperLUSupport.h b/nuparu/include/Eigen/src/SuperLUSupport/SuperLUSupport.h
index bcb35576..fd2b2658 100644
--- a/nuparu/include/Eigen/src/SuperLUSupport/SuperLUSupport.h
+++ b/nuparu/include/Eigen/src/SuperLUSupport/SuperLUSupport.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -14,12 +14,11 @@ namespace Eigen {
 
 #define DECL_GSSVX(PREFIX,FLOATTYPE,KEYTYPE)		\
     extern "C" {                                                                                          \
-      typedef struct { FLOATTYPE for_lu; FLOATTYPE total_needed; int expansions; } PREFIX##mem_usage_t;   \
       extern void PREFIX##gssvx(superlu_options_t *, SuperMatrix *, int *, int *, int *,                  \
                                 char *, FLOATTYPE *, FLOATTYPE *, SuperMatrix *, SuperMatrix *,           \
                                 void *, int, SuperMatrix *, SuperMatrix *,                                \
                                 FLOATTYPE *, FLOATTYPE *, FLOATTYPE *, FLOATTYPE *,                       \
-                                PREFIX##mem_usage_t *, SuperLUStat_t *, int *);                           \
+                                mem_usage_t *, SuperLUStat_t *, int *);                           \
     }                                                                                                     \
     inline float SuperLU_gssvx(superlu_options_t *options, SuperMatrix *A,                                \
          int *perm_c, int *perm_r, int *etree, char *equed,                                               \
@@ -29,7 +28,7 @@ namespace Eigen {
          FLOATTYPE *recip_pivot_growth,                                                                   \
          FLOATTYPE *rcond, FLOATTYPE *ferr, FLOATTYPE *berr,                                              \
          SuperLUStat_t *stats, int *info, KEYTYPE) {                                                      \
-    PREFIX##mem_usage_t mem_usage;                                                                        \
+    mem_usage_t mem_usage;                                                                        \
     PREFIX##gssvx(options, A, perm_c, perm_r, etree, equed, R, C, L,                                      \
          U, work, lwork, B, X, recip_pivot_growth, rcond,                                                 \
          ferr, berr, &mem_usage, stats, info);                                                            \
@@ -53,7 +52,7 @@ DECL_GSSVX(z,double,std::complex<double>)
       extern void PREFIX##gsisx(superlu_options_t *, SuperMatrix *, int *, int *, int *,        \
                          char *, FLOATTYPE *, FLOATTYPE *, SuperMatrix *, SuperMatrix *,        \
                          void *, int, SuperMatrix *, SuperMatrix *, FLOATTYPE *, FLOATTYPE *,   \
-                         PREFIX##mem_usage_t *, SuperLUStat_t *, int *);                        \
+                         mem_usage_t *, SuperLUStat_t *, int *);                        \
     }                                                                                           \
     inline float SuperLU_gsisx(superlu_options_t *options, SuperMatrix *A,                      \
          int *perm_c, int *perm_r, int *etree, char *equed,                                     \
@@ -63,7 +62,7 @@ DECL_GSSVX(z,double,std::complex<double>)
          FLOATTYPE *recip_pivot_growth,                                                         \
          FLOATTYPE *rcond,                                                                      \
          SuperLUStat_t *stats, int *info, KEYTYPE) {                                            \
-    PREFIX##mem_usage_t mem_usage;                                                              \
+    mem_usage_t mem_usage;                                                              \
     PREFIX##gsisx(options, A, perm_c, perm_r, etree, equed, R, C, L,                            \
          U, work, lwork, B, X, recip_pivot_growth, rcond,                                       \
          &mem_usage, stats, info);                                                              \
@@ -156,37 +155,38 @@ struct SluMatrix : SuperMatrix
     res.setScalarType<typename MatrixType::Scalar>();
     res.Mtype     = SLU_GE;
 
-    res.nrow      = mat.rows();
-    res.ncol      = mat.cols();
+    res.nrow      = internal::convert_index<int>(mat.rows());
+    res.ncol      = internal::convert_index<int>(mat.cols());
 
-    res.storage.lda       = MatrixType::IsVectorAtCompileTime ? mat.size() : mat.outerStride();
+    res.storage.lda       = internal::convert_index<int>(MatrixType::IsVectorAtCompileTime ? mat.size() : mat.outerStride());
     res.storage.values    = (void*)(mat.data());
     return res;
   }
 
   template<typename MatrixType>
-  static SluMatrix Map(SparseMatrixBase<MatrixType>& mat)
+  static SluMatrix Map(SparseMatrixBase<MatrixType>& a_mat)
   {
+    MatrixType &mat(a_mat.derived());
     SluMatrix res;
     if ((MatrixType::Flags&RowMajorBit)==RowMajorBit)
     {
       res.setStorageType(SLU_NR);
-      res.nrow      = mat.cols();
-      res.ncol      = mat.rows();
+      res.nrow      = internal::convert_index<int>(mat.cols());
+      res.ncol      = internal::convert_index<int>(mat.rows());
     }
     else
     {
       res.setStorageType(SLU_NC);
-      res.nrow      = mat.rows();
-      res.ncol      = mat.cols();
+      res.nrow      = internal::convert_index<int>(mat.rows());
+      res.ncol      = internal::convert_index<int>(mat.cols());
     }
 
     res.Mtype       = SLU_GE;
 
-    res.storage.nnz       = mat.nonZeros();
-    res.storage.values    = mat.derived().valuePtr();
-    res.storage.innerInd  = mat.derived().innerIndexPtr();
-    res.storage.outerInd  = mat.derived().outerIndexPtr();
+    res.storage.nnz       = internal::convert_index<int>(mat.nonZeros());
+    res.storage.values    = mat.valuePtr();
+    res.storage.innerInd  = mat.innerIndexPtr();
+    res.storage.outerInd  = mat.outerIndexPtr();
 
     res.setScalarType<typename MatrixType::Scalar>();
 
@@ -288,17 +288,26 @@ MappedSparseMatrix<Scalar,Flags,Index> map_superlu(SluMatrix& sluMat)
   * \brief The base class for the direct and incomplete LU factorization of SuperLU
   */
 template<typename _MatrixType, typename Derived>
-class SuperLUBase : internal::noncopyable
+class SuperLUBase : public SparseSolverBase<Derived>
 {
+  protected:
+    typedef SparseSolverBase<Derived> Base;
+    using Base::derived;
+    using Base::m_isInitialized;
   public:
     typedef _MatrixType MatrixType;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef typename MatrixType::StorageIndex StorageIndex;
     typedef Matrix<Scalar,Dynamic,1> Vector;
     typedef Matrix<int, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
     typedef Matrix<int, MatrixType::RowsAtCompileTime, 1> IntColVectorType;    
+    typedef Map<PermutationMatrix<Dynamic,Dynamic,int> > PermutationMap;
     typedef SparseMatrix<Scalar> LUMatrixType;
+    enum {
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
 
   public:
 
@@ -309,9 +318,6 @@ class SuperLUBase : internal::noncopyable
       clearFactors();
     }
     
-    Derived& derived() { return *static_cast<Derived*>(this); }
-    const Derived& derived() const { return *static_cast<const Derived*>(this); }
-    
     inline Index rows() const { return m_matrix.rows(); }
     inline Index cols() const { return m_matrix.cols(); }
     
@@ -335,33 +341,7 @@ class SuperLUBase : internal::noncopyable
       derived().analyzePattern(matrix);
       derived().factorize(matrix);
     }
-    
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<SuperLUBase, Rhs> solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "SuperLU is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "SuperLU::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<SuperLUBase, Rhs>(*this, b.derived());
-    }
-    
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::sparse_solve_retval<SuperLUBase, Rhs> solve(const SparseMatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "SuperLU is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "SuperLU::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::sparse_solve_retval<SuperLUBase, Rhs>(*this, b.derived());
-    }
-    
+
     /** Performs a symbolic decomposition on the sparcity of \a matrix.
       *
       * This function is particularly useful when solving for several problems having the same structure.
@@ -386,7 +366,7 @@ class SuperLUBase : internal::noncopyable
     {
       set_default_options(&this->m_sluOptions);
       
-      const int size = a.rows();
+      const Index size = a.rows();
       m_matrix = a;
 
       m_sluA = internal::asSluMatrix(m_matrix);
@@ -405,7 +385,7 @@ class SuperLUBase : internal::noncopyable
       m_sluB.storage.values = 0;
       m_sluB.nrow           = 0;
       m_sluB.ncol           = 0;
-      m_sluB.storage.lda    = size;
+      m_sluB.storage.lda    = internal::convert_index<int>(size);
       m_sluX                = m_sluB;
       
       m_extractedDataAreDirty = true;
@@ -453,7 +433,6 @@ class SuperLUBase : internal::noncopyable
     mutable char m_sluEqued;
 
     mutable ComputationInfo m_info;
-    bool m_isInitialized;
     int m_factorizationIsOk;
     int m_analysisIsOk;
     mutable bool m_extractedDataAreDirty;
@@ -473,6 +452,10 @@ class SuperLUBase : internal::noncopyable
   *
   * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   *
+  * \warning This class is only for the 4.x versions of SuperLU. The 3.x and 5.x versions are not supported.
+  *
+  * \implsparsesolverconcept
+  *
   * \sa \ref TutorialSparseDirectSolvers
   */
 template<typename _MatrixType>
@@ -483,18 +466,20 @@ class SuperLU : public SuperLUBase<_MatrixType,SuperLU<_MatrixType> >
     typedef _MatrixType MatrixType;
     typedef typename Base::Scalar Scalar;
     typedef typename Base::RealScalar RealScalar;
-    typedef typename Base::Index Index;
+    typedef typename Base::StorageIndex StorageIndex;
     typedef typename Base::IntRowVectorType IntRowVectorType;
-    typedef typename Base::IntColVectorType IntColVectorType;    
+    typedef typename Base::IntColVectorType IntColVectorType;   
+    typedef typename Base::PermutationMap PermutationMap;
     typedef typename Base::LUMatrixType LUMatrixType;
     typedef TriangularView<LUMatrixType, Lower|UnitDiag>  LMatrixType;
-    typedef TriangularView<LUMatrixType,  Upper>           UMatrixType;
+    typedef TriangularView<LUMatrixType,  Upper>          UMatrixType;
 
   public:
+    using Base::_solve_impl;
 
     SuperLU() : Base() { init(); }
 
-    SuperLU(const MatrixType& matrix) : Base()
+    explicit SuperLU(const MatrixType& matrix) : Base()
     {
       init();
       Base::compute(matrix);
@@ -525,11 +510,9 @@ class SuperLU : public SuperLUBase<_MatrixType,SuperLU<_MatrixType> >
       */
     void factorize(const MatrixType& matrix);
     
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
     /** \internal */
     template<typename Rhs,typename Dest>
-    void _solve(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const;
-    #endif // EIGEN_PARSED_BY_DOXYGEN
+    void _solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const;
     
     inline const LMatrixType& matrixL() const
     {
@@ -637,12 +620,12 @@ void SuperLU<MatrixType>::factorize(const MatrixType& a)
 
 template<typename MatrixType>
 template<typename Rhs,typename Dest>
-void SuperLU<MatrixType>::_solve(const MatrixBase<Rhs> &b, MatrixBase<Dest>& x) const
+void SuperLU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest>& x) const
 {
   eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or analyzePattern()/factorize()");
 
-  const int size = m_matrix.rows();
-  const int rhsCols = b.cols();
+  const Index size = m_matrix.rows();
+  const Index rhsCols = b.cols();
   eigen_assert(size==b.rows());
 
   m_sluOptions.Trans = NOTRANS;
@@ -652,8 +635,12 @@ void SuperLU<MatrixType>::_solve(const MatrixBase<Rhs> &b, MatrixBase<Dest>& x)
 
   m_sluFerr.resize(rhsCols);
   m_sluBerr.resize(rhsCols);
-  m_sluB = SluMatrix::Map(b.const_cast_derived());
-  m_sluX = SluMatrix::Map(x.derived());
+  
+  Ref<const Matrix<typename Rhs::Scalar,Dynamic,Dynamic,ColMajor> > b_ref(b);
+  Ref<const Matrix<typename Dest::Scalar,Dynamic,Dynamic,ColMajor> > x_ref(x);
+  
+  m_sluB = SluMatrix::Map(b_ref.const_cast_derived());
+  m_sluX = SluMatrix::Map(x_ref.const_cast_derived());
   
   typename Rhs::PlainObject b_cpy;
   if(m_sluEqued!='N')
@@ -676,6 +663,10 @@ void SuperLU<MatrixType>::_solve(const MatrixBase<Rhs> &b, MatrixBase<Dest>& x)
                 &m_sluFerr[0], &m_sluBerr[0],
                 &m_sluStat, &info, Scalar());
   StatFree(&m_sluStat);
+  
+  if(x.derived().data() != x_ref.data())
+    x = x_ref;
+  
   m_info = info==0 ? Success : NumericalIssue;
 }
 
@@ -699,7 +690,7 @@ void SuperLUBase<MatrixType,Derived>::extractData() const
     NCformat    *Ustore = static_cast<NCformat*>(m_sluU.Store);
     Scalar      *SNptr;
 
-    const int size = m_matrix.rows();
+    const Index size = m_matrix.rows();
     m_l.resize(size,size);
     m_l.resizeNonZeros(Lstore->nnz);
     m_u.resize(size,size);
@@ -791,6 +782,8 @@ typename SuperLU<MatrixType>::Scalar SuperLU<MatrixType>::determinant() const
         det *= m_u.valuePtr()[lastId];
     }
   }
+  if(PermutationMap(m_p.data(),m_p.size()).determinant()*PermutationMap(m_q.data(),m_q.size()).determinant()<0)
+    det = -det;
   if(m_sluEqued!='N')
     return det/m_sluRscale.prod()/m_sluCscale.prod();
   else
@@ -810,10 +803,12 @@ typename SuperLU<MatrixType>::Scalar SuperLU<MatrixType>::determinant() const
   * This class allows to solve for an approximate solution of A.X = B sparse linear problems via an incomplete LU factorization
   * using the SuperLU library. This class is aimed to be used as a preconditioner of the iterative linear solvers.
   *
-  * \warning This class requires SuperLU 4 or later.
+  * \warning This class is only for the 4.x versions of SuperLU. The 3.x and 5.x versions are not supported.
   *
   * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
   *
+  * \implsparsesolverconcept
+  *
   * \sa \ref TutorialSparseDirectSolvers, class ConjugateGradient, class BiCGSTAB
   */
 
@@ -825,9 +820,9 @@ class SuperILU : public SuperLUBase<_MatrixType,SuperILU<_MatrixType> >
     typedef _MatrixType MatrixType;
     typedef typename Base::Scalar Scalar;
     typedef typename Base::RealScalar RealScalar;
-    typedef typename Base::Index Index;
 
   public:
+    using Base::_solve_impl;
 
     SuperILU() : Base() { init(); }
 
@@ -863,7 +858,7 @@ class SuperILU : public SuperLUBase<_MatrixType,SuperILU<_MatrixType> >
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     /** \internal */
     template<typename Rhs,typename Dest>
-    void _solve(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const;
+    void _solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const;
     #endif // EIGEN_PARSED_BY_DOXYGEN
     
   protected:
@@ -948,7 +943,7 @@ void SuperILU<MatrixType>::factorize(const MatrixType& a)
 
 template<typename MatrixType>
 template<typename Rhs,typename Dest>
-void SuperILU<MatrixType>::_solve(const MatrixBase<Rhs> &b, MatrixBase<Dest>& x) const
+void SuperILU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest>& x) const
 {
   eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or analyzePattern()/factorize()");
 
@@ -962,8 +957,12 @@ void SuperILU<MatrixType>::_solve(const MatrixBase<Rhs> &b, MatrixBase<Dest>& x)
 
   m_sluFerr.resize(rhsCols);
   m_sluBerr.resize(rhsCols);
-  m_sluB = SluMatrix::Map(b.const_cast_derived());
-  m_sluX = SluMatrix::Map(x.derived());
+  
+  Ref<const Matrix<typename Rhs::Scalar,Dynamic,Dynamic,ColMajor> > b_ref(b);
+  Ref<const Matrix<typename Dest::Scalar,Dynamic,Dynamic,ColMajor> > x_ref(x);
+  
+  m_sluB = SluMatrix::Map(b_ref.const_cast_derived());
+  m_sluX = SluMatrix::Map(x_ref.const_cast_derived());
 
   typename Rhs::PlainObject b_cpy;
   if(m_sluEqued!='N')
@@ -986,41 +985,14 @@ void SuperILU<MatrixType>::_solve(const MatrixBase<Rhs> &b, MatrixBase<Dest>& x)
                 &recip_pivot_growth, &rcond,
                 &m_sluStat, &info, Scalar());
   StatFree(&m_sluStat);
+  
+  if(&x.coeffRef(0) != x_ref.data())
+    x = x_ref;
 
   m_info = info==0 ? Success : NumericalIssue;
 }
 #endif
 
-namespace internal {
-  
-template<typename _MatrixType, typename Derived, typename Rhs>
-struct solve_retval<SuperLUBase<_MatrixType,Derived>, Rhs>
-  : solve_retval_base<SuperLUBase<_MatrixType,Derived>, Rhs>
-{
-  typedef SuperLUBase<_MatrixType,Derived> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec().derived()._solve(rhs(),dst);
-  }
-};
-
-template<typename _MatrixType, typename Derived, typename Rhs>
-struct sparse_solve_retval<SuperLUBase<_MatrixType,Derived>, Rhs>
-  : sparse_solve_retval_base<SuperLUBase<_MatrixType,Derived>, Rhs>
-{
-  typedef SuperLUBase<_MatrixType,Derived> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    this->defaultEvalTo(dst);
-  }
-};
-
-} // end namespace internal
-
 } // end namespace Eigen
 
 #endif // EIGEN_SUPERLUSUPPORT_H
diff --git a/nuparu/include/Eigen/src/UmfPackSupport/UmfPackSupport.h b/nuparu/include/Eigen/src/UmfPackSupport/UmfPackSupport.h
index 3a48cecf..aaec8c6f 100644
--- a/nuparu/include/Eigen/src/UmfPackSupport/UmfPackSupport.h
+++ b/nuparu/include/Eigen/src/UmfPackSupport/UmfPackSupport.h
@@ -16,6 +16,13 @@ namespace Eigen {
 
 // generic double/complex<double> wrapper functions:
 
+
+inline void umfpack_defaults(double control[UMFPACK_CONTROL], double) 
+{ umfpack_di_defaults(control); }
+
+inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex<double>) 
+{ umfpack_zi_defaults(control); }
+
 inline void umfpack_free_numeric(void **Numeric, double)
 { umfpack_di_free_numeric(Numeric); *Numeric = 0; }
 
@@ -107,6 +114,7 @@ inline int umfpack_get_determinant(std::complex<double> *Mx, double *Ex, void *N
   return umfpack_zi_get_determinant(&mx_real,0,Ex,NumericHandle,User_Info);
 }
 
+
 /** \ingroup UmfPackSupport_Module
   * \brief A sparse LU factorization and solver based on UmfPack
   *
@@ -121,24 +129,41 @@ inline int umfpack_get_determinant(std::complex<double> *Mx, double *Ex, void *N
   * \sa \ref TutorialSparseDirectSolvers
   */
 template<typename _MatrixType>
-class UmfPackLU : internal::noncopyable
+class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
 {
+  protected:
+    typedef SparseSolverBase<UmfPackLU<_MatrixType> > Base;
+    using Base::m_isInitialized;
   public:
+    using Base::_solve_impl;
     typedef _MatrixType MatrixType;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
+    typedef typename MatrixType::StorageIndex StorageIndex;
     typedef Matrix<Scalar,Dynamic,1> Vector;
     typedef Matrix<int, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
     typedef Matrix<int, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
     typedef SparseMatrix<Scalar> LUMatrixType;
     typedef SparseMatrix<Scalar,ColMajor,int> UmfpackMatrixType;
+    typedef Ref<const UmfpackMatrixType, StandardCompressedFormat> UmfpackMatrixRef;
+    enum {
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+    };
 
   public:
 
-    UmfPackLU() { init(); }
+    typedef Array<double, UMFPACK_CONTROL, 1> UmfpackControl;
 
-    UmfPackLU(const MatrixType& matrix)
+    UmfPackLU()
+      : m_dummy(0,0), mp_matrix(m_dummy)
+    {
+      init();
+    }
+
+    template<typename InputMatrixType>
+    explicit UmfPackLU(const InputMatrixType& matrix)
+      : mp_matrix(matrix)
     {
       init();
       compute(matrix);
@@ -150,8 +175,8 @@ class UmfPackLU : internal::noncopyable
       if(m_numeric)  umfpack_free_numeric(&m_numeric,Scalar());
     }
 
-    inline Index rows() const { return m_copyMatrix.rows(); }
-    inline Index cols() const { return m_copyMatrix.cols(); }
+    inline Index rows() const { return mp_matrix.rows(); }
+    inline Index cols() const { return mp_matrix.cols(); }
 
     /** \brief Reports whether previous computation was successful.
       *
@@ -192,90 +217,87 @@ class UmfPackLU : internal::noncopyable
      *  Note that the matrix should be column-major, and in compressed format for best performance.
      *  \sa SparseMatrix::makeCompressed().
      */
-    void compute(const MatrixType& matrix)
+    template<typename InputMatrixType>
+    void compute(const InputMatrixType& matrix)
     {
-      analyzePattern(matrix);
-      factorize(matrix);
+      if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar());
+      if(m_numeric)  umfpack_free_numeric(&m_numeric,Scalar());
+      grab(matrix.derived());
+      analyzePattern_impl();
+      factorize_impl();
     }
 
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
+    /** Performs a symbolic decomposition on the sparcity of \a matrix.
+      *
+      * This function is particularly useful when solving for several problems having the same structure.
       *
-      * \sa compute()
+      * \sa factorize(), compute()
       */
-    template<typename Rhs>
-    inline const internal::solve_retval<UmfPackLU, Rhs> solve(const MatrixBase<Rhs>& b) const
+    template<typename InputMatrixType>
+    void analyzePattern(const InputMatrixType& matrix)
     {
-      eigen_assert(m_isInitialized && "UmfPackLU is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "UmfPackLU::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<UmfPackLU, Rhs>(*this, b.derived());
+      if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar());
+      if(m_numeric)  umfpack_free_numeric(&m_numeric,Scalar());
+      
+      grab(matrix.derived());
+
+      analyzePattern_impl();
     }
 
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
+    /** Provides the return status code returned by UmfPack during the numeric
+      * factorization.
       *
-      * \sa compute()
+      * \sa factorize(), compute()
       */
-    template<typename Rhs>
-    inline const internal::sparse_solve_retval<UmfPackLU, Rhs> solve(const SparseMatrixBase<Rhs>& b) const
+    inline int umfpackFactorizeReturncode() const
     {
-      eigen_assert(m_isInitialized && "UmfPackLU is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "UmfPackLU::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::sparse_solve_retval<UmfPackLU, Rhs>(*this, b.derived());
+      eigen_assert(m_numeric && "UmfPackLU: you must first call factorize()");
+      return m_fact_errorCode;
     }
 
-    /** Performs a symbolic decomposition on the sparcity of \a matrix.
+    /** Provides access to the control settings array used by UmfPack.
       *
-      * This function is particularly useful when solving for several problems having the same structure.
+      * If this array contains NaN's, the default values are used.
       *
-      * \sa factorize(), compute()
+      * See UMFPACK documentation for details.
       */
-    void analyzePattern(const MatrixType& matrix)
+    inline const UmfpackControl& umfpackControl() const
     {
-      if(m_symbolic)
-        umfpack_free_symbolic(&m_symbolic,Scalar());
-      if(m_numeric)
-        umfpack_free_numeric(&m_numeric,Scalar());
-      
-      grapInput(matrix);
-
-      int errorCode = 0;
-      errorCode = umfpack_symbolic(matrix.rows(), matrix.cols(), m_outerIndexPtr, m_innerIndexPtr, m_valuePtr,
-                                   &m_symbolic, 0, 0);
-
-      m_isInitialized = true;
-      m_info = errorCode ? InvalidInput : Success;
-      m_analysisIsOk = true;
-      m_factorizationIsOk = false;
+      return m_control;
     }
-
+    
+    /** Provides access to the control settings array used by UmfPack.
+      *
+      * If this array contains NaN's, the default values are used.
+      *
+      * See UMFPACK documentation for details.
+      */
+    inline UmfpackControl& umfpackControl()
+    {
+      return m_control;
+    }
+    
     /** Performs a numeric decomposition of \a matrix
       *
       * The given matrix must has the same sparcity than the matrix on which the pattern anylysis has been performed.
       *
       * \sa analyzePattern(), compute()
       */
-    void factorize(const MatrixType& matrix)
+    template<typename InputMatrixType>
+    void factorize(const InputMatrixType& matrix)
     {
       eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()");
       if(m_numeric)
         umfpack_free_numeric(&m_numeric,Scalar());
 
-      grapInput(matrix);
-
-      int errorCode;
-      errorCode = umfpack_numeric(m_outerIndexPtr, m_innerIndexPtr, m_valuePtr,
-                                  m_symbolic, &m_numeric, 0, 0);
-
-      m_info = errorCode ? NumericalIssue : Success;
-      m_factorizationIsOk = true;
+      grab(matrix.derived());
+      
+      factorize_impl();
     }
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
     /** \internal */
     template<typename BDerived,typename XDerived>
-    bool _solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const;
-    #endif
+    bool _solve_impl(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const;
 
     Scalar determinant() const;
 
@@ -283,52 +305,73 @@ class UmfPackLU : internal::noncopyable
 
   protected:
 
-
     void init()
     {
-      m_info = InvalidInput;
-      m_isInitialized = false;
-      m_numeric = 0;
-      m_symbolic = 0;
-      m_outerIndexPtr = 0;
-      m_innerIndexPtr = 0;
-      m_valuePtr      = 0;
+      m_info                  = InvalidInput;
+      m_isInitialized         = false;
+      m_numeric               = 0;
+      m_symbolic              = 0;
+      m_extractedDataAreDirty = true;
+      umfpack_defaults(m_control.data(), Scalar()); // seed m_control once; later umfpackControl() edits are kept
     }
     
-    void grapInput(const MatrixType& mat)
+    void analyzePattern_impl() // symbolic analysis of mp_matrix; result stored in m_symbolic
     {
-      m_copyMatrix.resize(mat.rows(), mat.cols());
-      if( ((MatrixType::Flags&RowMajorBit)==RowMajorBit) || sizeof(typename MatrixType::Index)!=sizeof(int) || !mat.isCompressed() )
-      {
-        // non supported input -> copy
-        m_copyMatrix = mat;
-        m_outerIndexPtr = m_copyMatrix.outerIndexPtr();
-        m_innerIndexPtr = m_copyMatrix.innerIndexPtr();
-        m_valuePtr      = m_copyMatrix.valuePtr();
-      }
-      else
+      // m_control is not reset here: it holds the defaults loaded by init() plus any changes made via umfpackControl().
+      int errorCode = umfpack_symbolic(internal::convert_index<int>(mp_matrix.rows()),
+                                       internal::convert_index<int>(mp_matrix.cols()),
+                                       mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(),
+                                       &m_symbolic, m_control.data(), 0);
+
+      m_isInitialized = true;
+      m_info = errorCode ? InvalidInput : Success; // any non-zero UMFPACK status is reported as InvalidInput
+      m_analysisIsOk = true;
+      m_factorizationIsOk = false;
+      m_extractedDataAreDirty = true;
+    }
+    
+    void factorize_impl()
+    {
+      m_fact_errorCode = umfpack_numeric(mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(),
+                                         m_symbolic, &m_numeric, m_control.data(), 0);
+
+      m_info = m_fact_errorCode == UMFPACK_OK ? Success : NumericalIssue;
+      m_factorizationIsOk = true;
+      m_extractedDataAreDirty = true;
+    }
+    
+    template<typename MatrixDerived>
+    void grab(const EigenBase<MatrixDerived> &A)
+    {
+      mp_matrix.~UmfpackMatrixRef();
+      ::new (&mp_matrix) UmfpackMatrixRef(A.derived());
+    }
+    
+    void grab(const UmfpackMatrixRef &A)
+    {
+      if(&(A.derived()) != &mp_matrix)
       {
-        m_outerIndexPtr = mat.outerIndexPtr();
-        m_innerIndexPtr = mat.innerIndexPtr();
-        m_valuePtr      = mat.valuePtr();
+        mp_matrix.~UmfpackMatrixRef();
+        ::new (&mp_matrix) UmfpackMatrixRef(A);
       }
     }
-
+  
     // cached data to reduce reallocation, etc.
     mutable LUMatrixType m_l;
+    int m_fact_errorCode;
+    UmfpackControl m_control;
+    
     mutable LUMatrixType m_u;
     mutable IntColVectorType m_p;
     mutable IntRowVectorType m_q;
 
-    UmfpackMatrixType m_copyMatrix;
-    const Scalar* m_valuePtr;
-    const int* m_outerIndexPtr;
-    const int* m_innerIndexPtr;
+    UmfpackMatrixType m_dummy;
+    UmfpackMatrixRef mp_matrix;
+  
     void* m_numeric;
     void* m_symbolic;
 
     mutable ComputationInfo m_info;
-    bool m_isInitialized;
     int m_factorizationIsOk;
     int m_analysisIsOk;
     mutable bool m_extractedDataAreDirty;
@@ -376,19 +419,30 @@ typename UmfPackLU<MatrixType>::Scalar UmfPackLU<MatrixType>::determinant() cons
 
 template<typename MatrixType>
 template<typename BDerived,typename XDerived>
-bool UmfPackLU<MatrixType>::_solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const
+bool UmfPackLU<MatrixType>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const
 {
-  const int rhsCols = b.cols();
+  Index rhsCols = b.cols();
   eigen_assert((BDerived::Flags&RowMajorBit)==0 && "UmfPackLU backend does not support non col-major rhs yet");
   eigen_assert((XDerived::Flags&RowMajorBit)==0 && "UmfPackLU backend does not support non col-major result yet");
   eigen_assert(b.derived().data() != x.derived().data() && " Umfpack does not support inplace solve");
   
   int errorCode;
+  Scalar* x_ptr = 0;
+  Matrix<Scalar,Dynamic,1> x_tmp;
+  if(x.innerStride()!=1)
+  {
+    x_tmp.resize(x.rows());
+    x_ptr = x_tmp.data();
+  }
   for (int j=0; j<rhsCols; ++j)
   {
+    if(x.innerStride()==1)
+      x_ptr = &x.col(j).coeffRef(0);
     errorCode = umfpack_solve(UMFPACK_A,
-        m_outerIndexPtr, m_innerIndexPtr, m_valuePtr,
-        &x.col(j).coeffRef(0), &b.const_cast_derived().col(j).coeffRef(0), m_numeric, 0, 0);
+        mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(),
+        x_ptr, &b.const_cast_derived().col(j).coeffRef(0), m_numeric, m_control.data(), 0);
+    if(x.innerStride()!=1)
+      x.col(j) = x_tmp;
     if (errorCode!=0)
       return false;
   }
@@ -396,37 +450,6 @@ bool UmfPackLU<MatrixType>::_solve(const MatrixBase<BDerived> &b, MatrixBase<XDe
   return true;
 }
 
-
-namespace internal {
-
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<UmfPackLU<_MatrixType>, Rhs>
-  : solve_retval_base<UmfPackLU<_MatrixType>, Rhs>
-{
-  typedef UmfPackLU<_MatrixType> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-template<typename _MatrixType, typename Rhs>
-struct sparse_solve_retval<UmfPackLU<_MatrixType>, Rhs>
-  : sparse_solve_retval_base<UmfPackLU<_MatrixType>, Rhs>
-{
-  typedef UmfPackLU<_MatrixType> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    this->defaultEvalTo(dst);
-  }
-};
-
-} // end namespace internal
-
 } // end namespace Eigen
 
 #endif // EIGEN_UMFPACKSUPPORT_H
diff --git a/nuparu/include/Eigen/src/misc/Image.h b/nuparu/include/Eigen/src/misc/Image.h
index 75c5f433..b8b8a045 100644
--- a/nuparu/include/Eigen/src/misc/Image.h
+++ b/nuparu/include/Eigen/src/misc/Image.h
@@ -38,7 +38,6 @@ template<typename _DecompositionType> struct image_retval_base
   typedef _DecompositionType DecompositionType;
   typedef typename DecompositionType::MatrixType MatrixType;
   typedef ReturnByValue<image_retval_base> Base;
-  typedef typename Base::Index Index;
 
   image_retval_base(const DecompositionType& dec, const MatrixType& originalMatrix)
     : m_dec(dec), m_rank(dec.rank()),
@@ -69,7 +68,6 @@ template<typename _DecompositionType> struct image_retval_base
   typedef typename DecompositionType::MatrixType MatrixType; \
   typedef typename MatrixType::Scalar Scalar; \
   typedef typename MatrixType::RealScalar RealScalar; \
-  typedef typename MatrixType::Index Index; \
   typedef Eigen::internal::image_retval_base<DecompositionType> Base; \
   using Base::dec; \
   using Base::originalMatrix; \
diff --git a/nuparu/include/Eigen/src/misc/Kernel.h b/nuparu/include/Eigen/src/misc/Kernel.h
index b9e1518f..bef5d6ff 100644
--- a/nuparu/include/Eigen/src/misc/Kernel.h
+++ b/nuparu/include/Eigen/src/misc/Kernel.h
@@ -39,9 +39,8 @@ template<typename _DecompositionType> struct kernel_retval_base
 {
   typedef _DecompositionType DecompositionType;
   typedef ReturnByValue<kernel_retval_base> Base;
-  typedef typename Base::Index Index;
 
-  kernel_retval_base(const DecompositionType& dec)
+  explicit kernel_retval_base(const DecompositionType& dec)
     : m_dec(dec),
       m_rank(dec.rank()),
       m_cols(m_rank==dec.cols() ? 1 : dec.cols() - m_rank)
@@ -68,7 +67,6 @@ template<typename _DecompositionType> struct kernel_retval_base
   typedef typename DecompositionType::MatrixType MatrixType; \
   typedef typename MatrixType::Scalar Scalar; \
   typedef typename MatrixType::RealScalar RealScalar; \
-  typedef typename MatrixType::Index Index; \
   typedef Eigen::internal::kernel_retval_base<DecompositionType> Base; \
   using Base::dec; \
   using Base::rank; \
diff --git a/nuparu/include/Eigen/src/misc/Solve.h b/nuparu/include/Eigen/src/misc/Solve.h
deleted file mode 100644
index 7f70d60a..00000000
--- a/nuparu/include/Eigen/src/misc/Solve.h
+++ /dev/null
@@ -1,76 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_MISC_SOLVE_H
-#define EIGEN_MISC_SOLVE_H
-
-namespace Eigen { 
-
-namespace internal {
-
-/** \class solve_retval_base
-  *
-  */
-template<typename DecompositionType, typename Rhs>
-struct traits<solve_retval_base<DecompositionType, Rhs> >
-{
-  typedef typename DecompositionType::MatrixType MatrixType;
-  typedef Matrix<typename Rhs::Scalar,
-                 MatrixType::ColsAtCompileTime,
-                 Rhs::ColsAtCompileTime,
-                 Rhs::PlainObject::Options,
-                 MatrixType::MaxColsAtCompileTime,
-                 Rhs::MaxColsAtCompileTime> ReturnType;
-};
-
-template<typename _DecompositionType, typename Rhs> struct solve_retval_base
- : public ReturnByValue<solve_retval_base<_DecompositionType, Rhs> >
-{
-  typedef typename remove_all<typename Rhs::Nested>::type RhsNestedCleaned;
-  typedef _DecompositionType DecompositionType;
-  typedef ReturnByValue<solve_retval_base> Base;
-  typedef typename Base::Index Index;
-
-  solve_retval_base(const DecompositionType& dec, const Rhs& rhs)
-    : m_dec(dec), m_rhs(rhs)
-  {}
-
-  inline Index rows() const { return m_dec.cols(); }
-  inline Index cols() const { return m_rhs.cols(); }
-  inline const DecompositionType& dec() const { return m_dec; }
-  inline const RhsNestedCleaned& rhs() const { return m_rhs; }
-
-  template<typename Dest> inline void evalTo(Dest& dst) const
-  {
-    static_cast<const solve_retval<DecompositionType,Rhs>*>(this)->evalTo(dst);
-  }
-
-  protected:
-    const DecompositionType& m_dec;
-    typename Rhs::Nested m_rhs;
-};
-
-} // end namespace internal
-
-#define EIGEN_MAKE_SOLVE_HELPERS(DecompositionType,Rhs) \
-  typedef typename DecompositionType::MatrixType MatrixType; \
-  typedef typename MatrixType::Scalar Scalar; \
-  typedef typename MatrixType::RealScalar RealScalar; \
-  typedef typename MatrixType::Index Index; \
-  typedef Eigen::internal::solve_retval_base<DecompositionType,Rhs> Base; \
-  using Base::dec; \
-  using Base::rhs; \
-  using Base::rows; \
-  using Base::cols; \
-  solve_retval(const DecompositionType& dec, const Rhs& rhs) \
-    : Base(dec, rhs) {}
-
-} // end namespace Eigen
-
-#endif // EIGEN_MISC_SOLVE_H
diff --git a/nuparu/include/Eigen/src/misc/SparseSolve.h b/nuparu/include/Eigen/src/misc/SparseSolve.h
deleted file mode 100644
index 244bb8ec..00000000
--- a/nuparu/include/Eigen/src/misc/SparseSolve.h
+++ /dev/null
@@ -1,128 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_SPARSE_SOLVE_H
-#define EIGEN_SPARSE_SOLVE_H
-
-namespace Eigen { 
-
-namespace internal {
-
-template<typename _DecompositionType, typename Rhs> struct sparse_solve_retval_base;
-template<typename _DecompositionType, typename Rhs> struct sparse_solve_retval;
-  
-template<typename DecompositionType, typename Rhs>
-struct traits<sparse_solve_retval_base<DecompositionType, Rhs> >
-{
-  typedef typename DecompositionType::MatrixType MatrixType;
-  typedef SparseMatrix<typename Rhs::Scalar, Rhs::Options, typename Rhs::Index> ReturnType;
-};
-
-template<typename _DecompositionType, typename Rhs> struct sparse_solve_retval_base
- : public ReturnByValue<sparse_solve_retval_base<_DecompositionType, Rhs> >
-{
-  typedef typename remove_all<typename Rhs::Nested>::type RhsNestedCleaned;
-  typedef _DecompositionType DecompositionType;
-  typedef ReturnByValue<sparse_solve_retval_base> Base;
-  typedef typename Base::Index Index;
-
-  sparse_solve_retval_base(const DecompositionType& dec, const Rhs& rhs)
-    : m_dec(dec), m_rhs(rhs)
-  {}
-
-  inline Index rows() const { return m_dec.cols(); }
-  inline Index cols() const { return m_rhs.cols(); }
-  inline const DecompositionType& dec() const { return m_dec; }
-  inline const RhsNestedCleaned& rhs() const { return m_rhs; }
-
-  template<typename Dest> inline void evalTo(Dest& dst) const
-  {
-    static_cast<const sparse_solve_retval<DecompositionType,Rhs>*>(this)->evalTo(dst);
-  }
-
-  protected:
-    template<typename DestScalar, int DestOptions, typename DestIndex>
-    inline void defaultEvalTo(SparseMatrix<DestScalar,DestOptions,DestIndex>& dst) const
-    {
-      // we process the sparse rhs per block of NbColsAtOnce columns temporarily stored into a dense matrix.
-      static const int NbColsAtOnce = 4;
-      int rhsCols = m_rhs.cols();
-      int size = m_rhs.rows();
-      Eigen::Matrix<DestScalar,Dynamic,Dynamic> tmp(size,rhsCols);
-      Eigen::Matrix<DestScalar,Dynamic,Dynamic> tmpX(size,rhsCols);
-      for(int k=0; k<rhsCols; k+=NbColsAtOnce)
-      {
-        int actualCols = std::min<int>(rhsCols-k, NbColsAtOnce);
-        tmp.leftCols(actualCols) = m_rhs.middleCols(k,actualCols);
-        tmpX.leftCols(actualCols) = m_dec.solve(tmp.leftCols(actualCols));
-        dst.middleCols(k,actualCols) = tmpX.leftCols(actualCols).sparseView();
-      }
-    }
-    const DecompositionType& m_dec;
-    typename Rhs::Nested m_rhs;
-};
-
-#define EIGEN_MAKE_SPARSE_SOLVE_HELPERS(DecompositionType,Rhs) \
-  typedef typename DecompositionType::MatrixType MatrixType; \
-  typedef typename MatrixType::Scalar Scalar; \
-  typedef typename MatrixType::RealScalar RealScalar; \
-  typedef typename MatrixType::Index Index; \
-  typedef Eigen::internal::sparse_solve_retval_base<DecompositionType,Rhs> Base; \
-  using Base::dec; \
-  using Base::rhs; \
-  using Base::rows; \
-  using Base::cols; \
-  sparse_solve_retval(const DecompositionType& dec, const Rhs& rhs) \
-    : Base(dec, rhs) {}
-
-
-
-template<typename DecompositionType, typename Rhs, typename Guess> struct solve_retval_with_guess;
-
-template<typename DecompositionType, typename Rhs, typename Guess>
-struct traits<solve_retval_with_guess<DecompositionType, Rhs, Guess> >
-{
-  typedef typename DecompositionType::MatrixType MatrixType;
-  typedef Matrix<typename Rhs::Scalar,
-                 MatrixType::ColsAtCompileTime,
-                 Rhs::ColsAtCompileTime,
-                 Rhs::PlainObject::Options,
-                 MatrixType::MaxColsAtCompileTime,
-                 Rhs::MaxColsAtCompileTime> ReturnType;
-};
-
-template<typename DecompositionType, typename Rhs, typename Guess> struct solve_retval_with_guess
- : public ReturnByValue<solve_retval_with_guess<DecompositionType, Rhs, Guess> >
-{
-  typedef typename DecompositionType::Index Index;
-
-  solve_retval_with_guess(const DecompositionType& dec, const Rhs& rhs, const Guess& guess)
-    : m_dec(dec), m_rhs(rhs), m_guess(guess)
-  {}
-
-  inline Index rows() const { return m_dec.cols(); }
-  inline Index cols() const { return m_rhs.cols(); }
-
-  template<typename Dest> inline void evalTo(Dest& dst) const
-  {
-    dst = m_guess;
-    m_dec._solveWithGuess(m_rhs,dst);
-  }
-
-  protected:
-    const DecompositionType& m_dec;
-    const typename Rhs::Nested m_rhs;
-    const typename Guess::Nested m_guess;
-};
-
-} // namepsace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_SPARSE_SOLVE_H
diff --git a/nuparu/include/Eigen/src/plugins/ArrayCwiseBinaryOps.h b/nuparu/include/Eigen/src/plugins/ArrayCwiseBinaryOps.h
index 5c8c476e..9422c40b 100644
--- a/nuparu/include/Eigen/src/plugins/ArrayCwiseBinaryOps.h
+++ b/nuparu/include/Eigen/src/plugins/ArrayCwiseBinaryOps.h
@@ -3,6 +3,7 @@
   * \sa MatrixBase::cwiseProduct
   */
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE const EIGEN_CWISE_PRODUCT_RETURN_TYPE(Derived,OtherDerived)
 operator*(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
@@ -14,6 +15,7 @@ operator*(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
   * \sa MatrixBase::cwiseQuotient
   */
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>
 operator/(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
@@ -33,6 +35,7 @@ EIGEN_MAKE_CWISE_BINARY_OP(min,internal::scalar_min_op)
   *
   * \sa max()
   */
+EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived,
                                         const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
 #ifdef EIGEN_PARSED_BY_DOXYGEN
@@ -58,6 +61,7 @@ EIGEN_MAKE_CWISE_BINARY_OP(max,internal::scalar_max_op)
   *
   * \sa min()
   */
+EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived,
                                         const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
 #ifdef EIGEN_PARSED_BY_DOXYGEN
@@ -70,6 +74,62 @@ max
   return (max)(Derived::PlainObject::Constant(rows(), cols(), other));
 }
 
+/** \returns an expression of the coefficient-wise power of \c *this to the given array of \a exponents.
+  *
+  * This function computes the coefficient-wise power.
+  *
+  * Example: \include Cwise_array_power_array.cpp
+  * Output: \verbinclude Cwise_array_power_array.out
+  */
+template<typename ExponentDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+const CwiseBinaryOp<internal::scalar_binary_pow_op<Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>
+pow(const ArrayBase<ExponentDerived>& exponents) const
+{
+  return CwiseBinaryOp<internal::scalar_binary_pow_op<Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>(
+    this->derived(),
+    exponents.derived()
+  );
+}
+
+// TODO code generating macros could be moved to Macros.h and could include generation of documentation
+#define EIGEN_MAKE_CWISE_COMP_OP(OP, COMPARATOR) \
+template<typename OtherDerived> \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_ ## COMPARATOR>, const Derived, const OtherDerived> \
+OP(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const \
+{ \
+  return CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_ ## COMPARATOR>, const Derived, const OtherDerived>(derived(), other.derived()); \
+}\
+typedef CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_ ## COMPARATOR>, const Derived, const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> > Cmp ## COMPARATOR ## ReturnType; \
+typedef CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_ ## COMPARATOR>, const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject>, const Derived > RCmp ## COMPARATOR ## ReturnType; \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Cmp ## COMPARATOR ## ReturnType \
+OP(const Scalar& s) const { \
+  return this->OP(Derived::PlainObject::Constant(rows(), cols(), s)); \
+} \
+EIGEN_DEVICE_FUNC friend EIGEN_STRONG_INLINE const RCmp ## COMPARATOR ## ReturnType \
+OP(const Scalar& s, const Derived& d) { \
+  return Derived::PlainObject::Constant(d.rows(), d.cols(), s).OP(d); \
+}
+
+#define EIGEN_MAKE_CWISE_COMP_R_OP(OP, R_OP, RCOMPARATOR) \
+template<typename OtherDerived> \
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_##RCOMPARATOR>, const OtherDerived, const Derived> \
+OP(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const \
+{ \
+  return CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_##RCOMPARATOR>, const OtherDerived, const Derived>(other.derived(), derived()); \
+} \
+EIGEN_DEVICE_FUNC \
+inline const RCmp ## RCOMPARATOR ## ReturnType \
+OP(const Scalar& s) const { \
+  return Derived::PlainObject::Constant(rows(), cols(), s).R_OP(*this); \
+} \
+friend inline const Cmp ## RCOMPARATOR ## ReturnType \
+OP(const Scalar& s, const Derived& d) { \
+  return d.R_OP(Derived::PlainObject::Constant(d.rows(), d.cols(), s)); \
+}
+
+
+
 /** \returns an expression of the coefficient-wise \< operator of *this and \a other
   *
   * Example: \include Cwise_less.cpp
@@ -77,7 +137,7 @@ max
   *
   * \sa all(), any(), operator>(), operator<=()
   */
-EIGEN_MAKE_CWISE_BINARY_OP(operator<,std::less)
+EIGEN_MAKE_CWISE_COMP_OP(operator<, LT)
 
 /** \returns an expression of the coefficient-wise \<= operator of *this and \a other
   *
@@ -86,7 +146,7 @@ EIGEN_MAKE_CWISE_BINARY_OP(operator<,std::less)
   *
   * \sa all(), any(), operator>=(), operator<()
   */
-EIGEN_MAKE_CWISE_BINARY_OP(operator<=,std::less_equal)
+EIGEN_MAKE_CWISE_COMP_OP(operator<=, LE)
 
 /** \returns an expression of the coefficient-wise \> operator of *this and \a other
   *
@@ -95,7 +155,7 @@ EIGEN_MAKE_CWISE_BINARY_OP(operator<=,std::less_equal)
   *
   * \sa all(), any(), operator>=(), operator<()
   */
-EIGEN_MAKE_CWISE_BINARY_OP(operator>,std::greater)
+EIGEN_MAKE_CWISE_COMP_R_OP(operator>, operator<, LT)
 
 /** \returns an expression of the coefficient-wise \>= operator of *this and \a other
   *
@@ -104,7 +164,7 @@ EIGEN_MAKE_CWISE_BINARY_OP(operator>,std::greater)
   *
   * \sa all(), any(), operator>(), operator<=()
   */
-EIGEN_MAKE_CWISE_BINARY_OP(operator>=,std::greater_equal)
+EIGEN_MAKE_CWISE_COMP_R_OP(operator>=, operator<=, LE)
 
 /** \returns an expression of the coefficient-wise == operator of *this and \a other
   *
@@ -118,7 +178,7 @@ EIGEN_MAKE_CWISE_BINARY_OP(operator>=,std::greater_equal)
   *
   * \sa all(), any(), isApprox(), isMuchSmallerThan()
   */
-EIGEN_MAKE_CWISE_BINARY_OP(operator==,std::equal_to)
+EIGEN_MAKE_CWISE_COMP_OP(operator==, EQ)
 
 /** \returns an expression of the coefficient-wise != operator of *this and \a other
   *
@@ -132,7 +192,11 @@ EIGEN_MAKE_CWISE_BINARY_OP(operator==,std::equal_to)
   *
   * \sa all(), any(), isApprox(), isMuchSmallerThan()
   */
-EIGEN_MAKE_CWISE_BINARY_OP(operator!=,std::not_equal_to)
+EIGEN_MAKE_CWISE_COMP_OP(operator!=, NEQ)
+
+
+#undef EIGEN_MAKE_CWISE_COMP_OP
+#undef EIGEN_MAKE_CWISE_COMP_R_OP
 
 // scalar addition
 
@@ -143,12 +207,14 @@ EIGEN_MAKE_CWISE_BINARY_OP(operator!=,std::not_equal_to)
   *
   * \sa operator+=(), operator-()
   */
+EIGEN_DEVICE_FUNC
 inline const CwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>
 operator+(const Scalar& scalar) const
 {
   return CwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>(derived(), internal::scalar_add_op<Scalar>(scalar));
 }
 
+EIGEN_DEVICE_FUNC
 friend inline const CwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>
 operator+(const Scalar& scalar,const EIGEN_CURRENT_STORAGE_BASE_CLASS<Derived>& other)
 {
@@ -162,16 +228,18 @@ operator+(const Scalar& scalar,const EIGEN_CURRENT_STORAGE_BASE_CLASS<Derived>&
   *
   * \sa operator+(), operator-=()
   */
-inline const CwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>
+EIGEN_DEVICE_FUNC
+inline const CwiseUnaryOp<internal::scalar_sub_op<Scalar>, const Derived>
 operator-(const Scalar& scalar) const
 {
-  return *this + (-scalar);
+  return CwiseUnaryOp<internal::scalar_sub_op<Scalar>, const Derived>(derived(), internal::scalar_sub_op<Scalar>(scalar));
 }
 
-friend inline const CwiseUnaryOp<internal::scalar_add_op<Scalar>, const CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const Derived> >
+EIGEN_DEVICE_FUNC
+friend inline const CwiseUnaryOp<internal::scalar_rsub_op<Scalar>, const Derived>
 operator-(const Scalar& scalar,const EIGEN_CURRENT_STORAGE_BASE_CLASS<Derived>& other)
 {
-  return (-other) + scalar;
+  return CwiseUnaryOp<internal::scalar_rsub_op<Scalar>, const Derived>(other.derived(), internal::scalar_rsub_op<Scalar>(scalar));
 }
 
 /** \returns an expression of the coefficient-wise && operator of *this and \a other
@@ -184,6 +252,7 @@ operator-(const Scalar& scalar,const EIGEN_CURRENT_STORAGE_BASE_CLASS<Derived>&
   * \sa operator||(), select()
   */
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
 inline const CwiseBinaryOp<internal::scalar_boolean_and_op, const Derived, const OtherDerived>
 operator&&(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
@@ -202,6 +271,7 @@ operator&&(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
   * \sa operator&&(), select()
   */
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
 inline const CwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived>
 operator||(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
@@ -209,3 +279,4 @@ operator||(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
                       THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL);
   return CwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived>(derived(),other.derived());
 }
+
diff --git a/nuparu/include/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/nuparu/include/Eigen/src/plugins/ArrayCwiseUnaryOps.h
index a5963679..01432e2f 100644
--- a/nuparu/include/Eigen/src/plugins/ArrayCwiseUnaryOps.h
+++ b/nuparu/include/Eigen/src/plugins/ArrayCwiseUnaryOps.h
@@ -1,5 +1,39 @@
 
 
+typedef CwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived> AbsReturnType;
+typedef CwiseUnaryOp<internal::scalar_arg_op<Scalar>, const Derived> ArgReturnType;
+typedef CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const Derived> Abs2ReturnType;
+typedef CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived> SqrtReturnType;
+typedef CwiseUnaryOp<internal::scalar_rsqrt_op<Scalar>, const Derived> RsqrtReturnType;
+typedef CwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived> SignReturnType;
+typedef CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived> InverseReturnType;
+typedef CwiseUnaryOp<internal::scalar_boolean_not_op<Scalar>, const Derived> BooleanNotReturnType;
+
+typedef CwiseUnaryOp<internal::scalar_exp_op<Scalar>, const Derived> ExpReturnType;
+typedef CwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived> LogReturnType;
+typedef CwiseUnaryOp<internal::scalar_log10_op<Scalar>, const Derived> Log10ReturnType;
+typedef CwiseUnaryOp<internal::scalar_cos_op<Scalar>, const Derived> CosReturnType;
+typedef CwiseUnaryOp<internal::scalar_sin_op<Scalar>, const Derived> SinReturnType;
+typedef CwiseUnaryOp<internal::scalar_tan_op<Scalar>, const Derived> TanReturnType;
+typedef CwiseUnaryOp<internal::scalar_acos_op<Scalar>, const Derived> AcosReturnType;
+typedef CwiseUnaryOp<internal::scalar_asin_op<Scalar>, const Derived> AsinReturnType;
+typedef CwiseUnaryOp<internal::scalar_atan_op<Scalar>, const Derived> AtanReturnType;
+typedef CwiseUnaryOp<internal::scalar_tanh_op<Scalar>, const Derived> TanhReturnType;
+typedef CwiseUnaryOp<internal::scalar_sinh_op<Scalar>, const Derived> SinhReturnType;
+typedef CwiseUnaryOp<internal::scalar_cosh_op<Scalar>, const Derived> CoshReturnType;
+typedef CwiseUnaryOp<internal::scalar_lgamma_op<Scalar>, const Derived> LgammaReturnType;
+typedef CwiseUnaryOp<internal::scalar_erf_op<Scalar>, const Derived> ErfReturnType;
+typedef CwiseUnaryOp<internal::scalar_erfc_op<Scalar>, const Derived> ErfcReturnType;
+typedef CwiseUnaryOp<internal::scalar_pow_op<Scalar>, const Derived> PowReturnType;
+typedef CwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived> SquareReturnType;
+typedef CwiseUnaryOp<internal::scalar_cube_op<Scalar>, const Derived> CubeReturnType;
+typedef CwiseUnaryOp<internal::scalar_round_op<Scalar>, const Derived> RoundReturnType;
+typedef CwiseUnaryOp<internal::scalar_floor_op<Scalar>, const Derived> FloorReturnType;
+typedef CwiseUnaryOp<internal::scalar_ceil_op<Scalar>, const Derived> CeilReturnType;
+typedef CwiseUnaryOp<internal::scalar_isnan_op<Scalar>, const Derived> IsNaNReturnType;
+typedef CwiseUnaryOp<internal::scalar_isinf_op<Scalar>, const Derived> IsInfReturnType;
+typedef CwiseUnaryOp<internal::scalar_isfinite_op<Scalar>, const Derived> IsFiniteReturnType;
+
 /** \returns an expression of the coefficient-wise absolute value of \c *this
   *
   * Example: \include Cwise_abs.cpp
@@ -7,10 +41,25 @@
   *
   * \sa abs2()
   */
-EIGEN_STRONG_INLINE const CwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived>
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const AbsReturnType
 abs() const
 {
-  return derived();
+  return AbsReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise phase angle of \c *this
+  *
+  * Example: \include Cwise_arg.cpp
+  * Output: \verbinclude Cwise_arg.out
+  *
+  * \sa abs()
+  */
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const ArgReturnType
+arg() const
+{
+  return ArgReturnType(derived());
 }
 
 /** \returns an expression of the coefficient-wise squared absolute value of \c *this
@@ -20,76 +69,173 @@ abs() const
   *
   * \sa abs(), square()
   */
-EIGEN_STRONG_INLINE const CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const Derived>
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const Abs2ReturnType
 abs2() const
 {
-  return derived();
+  return Abs2ReturnType(derived());
 }
 
 /** \returns an expression of the coefficient-wise exponential of *this.
+  *
+  * This function computes the coefficient-wise exponential. The function MatrixBase::exp() in the
+  * unsupported module MatrixFunctions computes the matrix exponential.
   *
   * Example: \include Cwise_exp.cpp
   * Output: \verbinclude Cwise_exp.out
   *
   * \sa pow(), log(), sin(), cos()
   */
-inline const CwiseUnaryOp<internal::scalar_exp_op<Scalar>, const Derived>
+EIGEN_DEVICE_FUNC
+inline const ExpReturnType
 exp() const
 {
-  return derived();
+  return ExpReturnType(derived());
 }
 
 /** \returns an expression of the coefficient-wise logarithm of *this.
+  *
+  * This function computes the coefficient-wise logarithm. The function MatrixBase::log() in the
+  * unsupported module MatrixFunctions computes the matrix logarithm.
   *
   * Example: \include Cwise_log.cpp
   * Output: \verbinclude Cwise_log.out
   *
   * \sa exp()
   */
-inline const CwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived>
+EIGEN_DEVICE_FUNC
+inline const LogReturnType
 log() const
 {
-  return derived();
+  return LogReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise base-10 logarithm of *this.
+  *
+  * This function computes the coefficient-wise base-10 logarithm.
+  *
+  * Example: \include Cwise_log10.cpp
+  * Output: \verbinclude Cwise_log10.out
+  *
+  * \sa log()
+  */
+EIGEN_DEVICE_FUNC
+inline const Log10ReturnType
+log10() const
+{
+  return Log10ReturnType(derived());
 }
 
 /** \returns an expression of the coefficient-wise square root of *this.
+  *
+  * This function computes the coefficient-wise square root. The function MatrixBase::sqrt() in the
+  * unsupported module MatrixFunctions computes the matrix square root.
   *
   * Example: \include Cwise_sqrt.cpp
   * Output: \verbinclude Cwise_sqrt.out
   *
   * \sa pow(), square()
   */
-inline const CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived>
+EIGEN_DEVICE_FUNC
+inline const SqrtReturnType
 sqrt() const
 {
-  return derived();
+  return SqrtReturnType(derived());
 }
 
+/** \returns an expression of the coefficient-wise inverse square root of *this.
+  *
+  * This function computes the coefficient-wise inverse square root.
+  *
+  * Example: \include Cwise_sqrt.cpp
+  * Output: \verbinclude Cwise_sqrt.out
+  *
+  * \sa pow(), square()
+  */
+EIGEN_DEVICE_FUNC
+inline const RsqrtReturnType
+rsqrt() const
+{
+  return RsqrtReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise signum of *this.
+  *
+  * This function computes the coefficient-wise signum.
+  *
+  * Example: \include Cwise_sign.cpp
+  * Output: \verbinclude Cwise_sign.out
+  *
+  * \sa pow(), square()
+  */
+EIGEN_DEVICE_FUNC
+inline const SignReturnType
+sign() const
+{
+  return SignReturnType(derived());
+}
+
+
 /** \returns an expression of the coefficient-wise cosine of *this.
+  *
+  * This function computes the coefficient-wise cosine. The function MatrixBase::cos() in the
+  * unsupported module MatrixFunctions computes the matrix cosine.
   *
   * Example: \include Cwise_cos.cpp
   * Output: \verbinclude Cwise_cos.out
   *
   * \sa sin(), acos()
   */
-inline const CwiseUnaryOp<internal::scalar_cos_op<Scalar>, const Derived>
+EIGEN_DEVICE_FUNC
+inline const CosReturnType
 cos() const
 {
-  return derived();
+  return CosReturnType(derived());
 }
 
 
 /** \returns an expression of the coefficient-wise sine of *this.
+  *
+  * This function computes the coefficient-wise sine. The function MatrixBase::sin() in the
+  * unsupported module MatrixFunctions computes the matrix sine.
   *
   * Example: \include Cwise_sin.cpp
   * Output: \verbinclude Cwise_sin.out
   *
   * \sa cos(), asin()
   */
-inline const CwiseUnaryOp<internal::scalar_sin_op<Scalar>, const Derived>
+EIGEN_DEVICE_FUNC
+inline const SinReturnType
 sin() const
 {
-  return derived();
+  return SinReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise tan of *this.
+  *
+  * Example: \include Cwise_tan.cpp
+  * Output: \verbinclude Cwise_tan.out
+  *
+  * \sa cos(), sin()
+  */
+EIGEN_DEVICE_FUNC
+inline const TanReturnType
+tan() const
+{
+  return TanReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise arc tan of *this.
+  *
+  * Example: \include Cwise_atan.cpp
+  * Output: \verbinclude Cwise_atan.out
+  *
+  * \sa tan(), asin(), acos()
+  */
+inline const AtanReturnType
+atan() const
+{
+  return AtanReturnType(derived());
 }
 
 /** \returns an expression of the coefficient-wise arc cosine of *this.
@@ -99,10 +245,11 @@ sin() const
   *
   * \sa cos(), asin()
   */
-inline const CwiseUnaryOp<internal::scalar_acos_op<Scalar>, const Derived>
+EIGEN_DEVICE_FUNC
+inline const AcosReturnType
 acos() const
 {
-  return derived();
+  return AcosReturnType(derived());
 }
 
 /** \returns an expression of the coefficient-wise arc sine of *this.
@@ -112,38 +259,108 @@ acos() const
   *
   * \sa sin(), acos()
   */
-inline const CwiseUnaryOp<internal::scalar_asin_op<Scalar>, const Derived>
+EIGEN_DEVICE_FUNC
+inline const AsinReturnType
 asin() const
 {
-  return derived();
+  return AsinReturnType(derived());
 }
 
-/** \returns an expression of the coefficient-wise tan of *this.
+/** \returns an expression of the coefficient-wise hyperbolic tan of *this.
   *
-  * Example: \include Cwise_tan.cpp
-  * Output: \verbinclude Cwise_tan.out
+  * Example: \include Cwise_tanh.cpp
+  * Output: \verbinclude Cwise_tanh.out
   *
-  * \sa cos(), sin()
+  * \sa tan(), sinh(), cosh()
   */
-inline const CwiseUnaryOp<internal::scalar_tan_op<Scalar>, Derived>
-tan() const
+inline const TanhReturnType
+tanh() const
 {
-  return derived();
+  return TanhReturnType(derived());
 }
 
+/** \returns an expression of the coefficient-wise hyperbolic sin of *this.
+  *
+  * Example: \include Cwise_sinh.cpp
+  * Output: \verbinclude Cwise_sinh.out
+  *
+  * \sa sin(), tanh(), cosh()
+  */
+inline const SinhReturnType
+sinh() const
+{
+  return SinhReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise hyperbolic cos of *this.
+  *
+  * Example: \include Cwise_cosh.cpp
+  * Output: \verbinclude Cwise_cosh.out
+  *
+  * \sa cos(), tanh(), sinh()
+  */
+inline const CoshReturnType
+cosh() const
+{
+  return CoshReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise ln(|gamma(*this)|).
+ *
+ * Example: \include Cwise_lgamma.cpp
+ * Output: \verbinclude Cwise_lgamma.out
+ *
+ * \sa cos(), sin(), tan()
+ */
+inline const LgammaReturnType
+lgamma() const
+{
+  return LgammaReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise Gauss error
+ * function of *this.
+ *
+ * Example: \include Cwise_erf.cpp
+ * Output: \verbinclude Cwise_erf.out
+ *
+ * \sa cos(), sin(), tan()
+ */
+inline const ErfReturnType
+erf() const
+{
+  return ErfReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise Complementary error
+ * function of *this.
+ *
+ * Example: \include Cwise_erfc.cpp
+ * Output: \verbinclude Cwise_erfc.out
+ *
+ * \sa cos(), sin(), tan()
+ */
+inline const ErfcReturnType
+erfc() const
+{
+  return ErfcReturnType(derived());
+}
 
 /** \returns an expression of the coefficient-wise power of *this to the given exponent.
+  *
+  * This function computes the coefficient-wise power. The function MatrixBase::pow() in the
+  * unsupported module MatrixFunctions computes the matrix power.
   *
   * Example: \include Cwise_pow.cpp
   * Output: \verbinclude Cwise_pow.out
   *
   * \sa exp(), log()
   */
-inline const CwiseUnaryOp<internal::scalar_pow_op<Scalar>, const Derived>
+EIGEN_DEVICE_FUNC
+inline const PowReturnType
 pow(const Scalar& exponent) const
 {
-  return CwiseUnaryOp<internal::scalar_pow_op<Scalar>, const Derived>
-          (derived(), internal::scalar_pow_op<Scalar>(exponent));
+  return PowReturnType(derived(), internal::scalar_pow_op<Scalar>(exponent));
 }
 
 
@@ -154,10 +371,11 @@ pow(const Scalar& exponent) const
   *
   * \sa operator/(), operator*()
   */
-inline const CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived>
+EIGEN_DEVICE_FUNC
+inline const InverseReturnType
 inverse() const
 {
-  return derived();
+  return InverseReturnType(derived());
 }
 
 /** \returns an expression of the coefficient-wise square of *this.
@@ -167,10 +385,11 @@ inverse() const
   *
   * \sa operator/(), operator*(), abs2()
   */
-inline const CwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived>
+EIGEN_DEVICE_FUNC
+inline const SquareReturnType
 square() const
 {
-  return derived();
+  return SquareReturnType(derived());
 }
 
 /** \returns an expression of the coefficient-wise cube of *this.
@@ -180,24 +399,105 @@ square() const
   *
   * \sa square(), pow()
   */
-inline const CwiseUnaryOp<internal::scalar_cube_op<Scalar>, const Derived>
+EIGEN_DEVICE_FUNC
+inline const CubeReturnType
 cube() const
 {
-  return derived();
+  return CubeReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise round of *this.
+  *
+  * Example: \include Cwise_round.cpp
+  * Output: \verbinclude Cwise_round.out
+  *
+  * \sa ceil(), floor()
+  */
+inline const RoundReturnType
+round() const
+{
+  return RoundReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise floor of *this.
+  *
+  * Example: \include Cwise_floor.cpp
+  * Output: \verbinclude Cwise_floor.out
+  *
+  * \sa ceil(), round()
+  */
+inline const FloorReturnType
+floor() const
+{
+  return FloorReturnType(derived());
 }
 
-#define EIGEN_MAKE_SCALAR_CWISE_UNARY_OP(METHOD_NAME,FUNCTOR) \
-  inline const CwiseUnaryOp<std::binder2nd<FUNCTOR<Scalar> >, const Derived> \
-  METHOD_NAME(const Scalar& s) const { \
-    return CwiseUnaryOp<std::binder2nd<FUNCTOR<Scalar> >, const Derived> \
-            (derived(), std::bind2nd(FUNCTOR<Scalar>(), s)); \
-  }
+/** \returns an expression of the coefficient-wise ceil of *this.
+  *
+  * Example: \include Cwise_ceil.cpp
+  * Output: \verbinclude Cwise_ceil.out
+  *
+  * \sa floor(), round()
+  */
+inline const CeilReturnType
+ceil() const
+{
+  return CeilReturnType(derived());
+}
 
-EIGEN_MAKE_SCALAR_CWISE_UNARY_OP(operator==,  std::equal_to)
-EIGEN_MAKE_SCALAR_CWISE_UNARY_OP(operator!=,  std::not_equal_to)
-EIGEN_MAKE_SCALAR_CWISE_UNARY_OP(operator<,   std::less)
-EIGEN_MAKE_SCALAR_CWISE_UNARY_OP(operator<=,  std::less_equal)
-EIGEN_MAKE_SCALAR_CWISE_UNARY_OP(operator>,   std::greater)
-EIGEN_MAKE_SCALAR_CWISE_UNARY_OP(operator>=,  std::greater_equal)
+/** \returns an expression of the coefficient-wise isnan of *this.
+  *
+  * Example: \include Cwise_isNaN.cpp
+  * Output: \verbinclude Cwise_isNaN.out
+  *
+  * \sa isFinite(), isInf()
+  */
+inline const IsNaNReturnType
+isNaN() const
+{
+  return IsNaNReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise isinf of *this.
+  *
+  * Example: \include Cwise_isInf.cpp
+  * Output: \verbinclude Cwise_isInf.out
+  *
+  * \sa isNaN(), isFinite()
+  */
+inline const IsInfReturnType
+isInf() const
+{
+  return IsInfReturnType(derived());
+}
 
+/** \returns an expression of the coefficient-wise isfinite of *this.
+  *
+  * Example: \include Cwise_isFinite.cpp
+  * Output: \verbinclude Cwise_isFinite.out
+  *
+  * \sa isNaN(), isInf()
+  */
+inline const IsFiniteReturnType
+isFinite() const
+{
+  return IsFiniteReturnType(derived());
+}
 
+/** \returns an expression of the coefficient-wise ! operator of *this
+  *
+  * \warning this operator is for expression of bool only.
+  *
+  * Example: \include Cwise_boolean_not.cpp
+  * Output: \verbinclude Cwise_boolean_not.out
+  *
+  * \sa operator!=()
+  */
+EIGEN_DEVICE_FUNC
+inline const BooleanNotReturnType
+operator!() const
+{
+  EIGEN_STATIC_ASSERT((internal::is_same<bool,Scalar>::value),
+                      THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL);
+  return BooleanNotReturnType(derived());
+}
diff --git a/nuparu/include/Eigen/src/plugins/BlockMethods.h b/nuparu/include/Eigen/src/plugins/BlockMethods.h
index 6911bede..9b7fdc4a 100644
--- a/nuparu/include/Eigen/src/plugins/BlockMethods.h
+++ b/nuparu/include/Eigen/src/plugins/BlockMethods.h
@@ -53,12 +53,14 @@ template<int Size> struct ConstFixedSegmentReturnType { typedef const VectorBloc
   *
   * \sa class Block, block(Index,Index)
   */
+EIGEN_DEVICE_FUNC
 inline Block<Derived> block(Index startRow, Index startCol, Index blockRows, Index blockCols)
 {
   return Block<Derived>(derived(), startRow, startCol, blockRows, blockCols);
 }
 
 /** This is the const version of block(Index,Index,Index,Index). */
+EIGEN_DEVICE_FUNC
 inline const Block<const Derived> block(Index startRow, Index startCol, Index blockRows, Index blockCols) const
 {
   return Block<const Derived>(derived(), startRow, startCol, blockRows, blockCols);
@@ -77,12 +79,14 @@ inline const Block<const Derived> block(Index startRow, Index startCol, Index bl
   *
   * \sa class Block, block(Index,Index,Index,Index)
   */
+EIGEN_DEVICE_FUNC
 inline Block<Derived> topRightCorner(Index cRows, Index cCols)
 {
   return Block<Derived>(derived(), 0, cols() - cCols, cRows, cCols);
 }
 
 /** This is the const version of topRightCorner(Index, Index).*/
+EIGEN_DEVICE_FUNC
 inline const Block<const Derived> topRightCorner(Index cRows, Index cCols) const
 {
   return Block<const Derived>(derived(), 0, cols() - cCols, cRows, cCols);
@@ -99,6 +103,7 @@ inline const Block<const Derived> topRightCorner(Index cRows, Index cCols) const
   * \sa class Block, block<int,int>(Index,Index)
   */
 template<int CRows, int CCols>
+EIGEN_DEVICE_FUNC
 inline Block<Derived, CRows, CCols> topRightCorner()
 {
   return Block<Derived, CRows, CCols>(derived(), 0, cols() - CCols);
@@ -106,6 +111,7 @@ inline Block<Derived, CRows, CCols> topRightCorner()
 
 /** This is the const version of topRightCorner<int, int>().*/
 template<int CRows, int CCols>
+EIGEN_DEVICE_FUNC
 inline const Block<const Derived, CRows, CCols> topRightCorner() const
 {
   return Block<const Derived, CRows, CCols>(derived(), 0, cols() - CCols);
@@ -113,13 +119,13 @@ inline const Block<const Derived, CRows, CCols> topRightCorner() const
 
 /** \returns an expression of a top-right corner of *this.
   *
-  * \tparam CRows number of rows in corner as specified at compile time
-  * \tparam CCols number of columns in corner as specified at compile time
-  * \param  cRows number of rows in corner as specified at run time
-  * \param  cCols number of columns in corner as specified at run time
+  * \tparam CRows number of rows in corner as specified at compile-time
+  * \tparam CCols number of columns in corner as specified at compile-time
+  * \param  cRows number of rows in corner as specified at run-time
+  * \param  cCols number of columns in corner as specified at run-time
   *
-  * This function is mainly useful for corners where the number of rows is specified at compile time
-  * and the number of columns is specified at run time, or vice versa. The compile-time and run-time
+  * This function is mainly useful for corners where the number of rows is specified at compile-time
+  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
   * information should not contradict. In other words, \a cRows should equal \a CRows unless
   * \a CRows is \a Dynamic, and the same for the number of columns.
   *
@@ -153,12 +159,14 @@ inline const Block<const Derived, CRows, CCols> topRightCorner(Index cRows, Inde
   *
   * \sa class Block, block(Index,Index,Index,Index)
   */
+EIGEN_DEVICE_FUNC
 inline Block<Derived> topLeftCorner(Index cRows, Index cCols)
 {
   return Block<Derived>(derived(), 0, 0, cRows, cCols);
 }
 
 /** This is the const version of topLeftCorner(Index, Index).*/
+EIGEN_DEVICE_FUNC
 inline const Block<const Derived> topLeftCorner(Index cRows, Index cCols) const
 {
   return Block<const Derived>(derived(), 0, 0, cRows, cCols);
@@ -174,6 +182,7 @@ inline const Block<const Derived> topLeftCorner(Index cRows, Index cCols) const
   * \sa class Block, block(Index,Index,Index,Index)
   */
 template<int CRows, int CCols>
+EIGEN_DEVICE_FUNC
 inline Block<Derived, CRows, CCols> topLeftCorner()
 {
   return Block<Derived, CRows, CCols>(derived(), 0, 0);
@@ -181,6 +190,7 @@ inline Block<Derived, CRows, CCols> topLeftCorner()
 
 /** This is the const version of topLeftCorner<int, int>().*/
 template<int CRows, int CCols>
+EIGEN_DEVICE_FUNC
 inline const Block<const Derived, CRows, CCols> topLeftCorner() const
 {
   return Block<const Derived, CRows, CCols>(derived(), 0, 0);
@@ -188,13 +198,13 @@ inline const Block<const Derived, CRows, CCols> topLeftCorner() const
 
 /** \returns an expression of a top-left corner of *this.
   *
-  * \tparam CRows number of rows in corner as specified at compile time
-  * \tparam CCols number of columns in corner as specified at compile time
-  * \param  cRows number of rows in corner as specified at run time
-  * \param  cCols number of columns in corner as specified at run time
+  * \tparam CRows number of rows in corner as specified at compile-time
+  * \tparam CCols number of columns in corner as specified at compile-time
+  * \param  cRows number of rows in corner as specified at run-time
+  * \param  cCols number of columns in corner as specified at run-time
   *
-  * This function is mainly useful for corners where the number of rows is specified at compile time
-  * and the number of columns is specified at run time, or vice versa. The compile-time and run-time
+  * This function is mainly useful for corners where the number of rows is specified at compile-time
+  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
   * information should not contradict. In other words, \a cRows should equal \a CRows unless
   * \a CRows is \a Dynamic, and the same for the number of columns.
   *
@@ -228,12 +238,14 @@ inline const Block<const Derived, CRows, CCols> topLeftCorner(Index cRows, Index
   *
   * \sa class Block, block(Index,Index,Index,Index)
   */
+EIGEN_DEVICE_FUNC
 inline Block<Derived> bottomRightCorner(Index cRows, Index cCols)
 {
   return Block<Derived>(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
 }
 
 /** This is the const version of bottomRightCorner(Index, Index).*/
+EIGEN_DEVICE_FUNC
 inline const Block<const Derived> bottomRightCorner(Index cRows, Index cCols) const
 {
   return Block<const Derived>(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
@@ -249,6 +261,7 @@ inline const Block<const Derived> bottomRightCorner(Index cRows, Index cCols) co
   * \sa class Block, block(Index,Index,Index,Index)
   */
 template<int CRows, int CCols>
+EIGEN_DEVICE_FUNC
 inline Block<Derived, CRows, CCols> bottomRightCorner()
 {
   return Block<Derived, CRows, CCols>(derived(), rows() - CRows, cols() - CCols);
@@ -256,6 +269,7 @@ inline Block<Derived, CRows, CCols> bottomRightCorner()
 
 /** This is the const version of bottomRightCorner<int, int>().*/
 template<int CRows, int CCols>
+EIGEN_DEVICE_FUNC
 inline const Block<const Derived, CRows, CCols> bottomRightCorner() const
 {
   return Block<const Derived, CRows, CCols>(derived(), rows() - CRows, cols() - CCols);
@@ -263,13 +277,13 @@ inline const Block<const Derived, CRows, CCols> bottomRightCorner() const
 
 /** \returns an expression of a bottom-right corner of *this.
   *
-  * \tparam CRows number of rows in corner as specified at compile time
-  * \tparam CCols number of columns in corner as specified at compile time
-  * \param  cRows number of rows in corner as specified at run time
-  * \param  cCols number of columns in corner as specified at run time
+  * \tparam CRows number of rows in corner as specified at compile-time
+  * \tparam CCols number of columns in corner as specified at compile-time
+  * \param  cRows number of rows in corner as specified at run-time
+  * \param  cCols number of columns in corner as specified at run-time
   *
-  * This function is mainly useful for corners where the number of rows is specified at compile time
-  * and the number of columns is specified at run time, or vice versa. The compile-time and run-time
+  * This function is mainly useful for corners where the number of rows is specified at compile-time
+  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
   * information should not contradict. In other words, \a cRows should equal \a CRows unless
   * \a CRows is \a Dynamic, and the same for the number of columns.
   *
@@ -303,12 +317,14 @@ inline const Block<const Derived, CRows, CCols> bottomRightCorner(Index cRows, I
   *
   * \sa class Block, block(Index,Index,Index,Index)
   */
+EIGEN_DEVICE_FUNC
 inline Block<Derived> bottomLeftCorner(Index cRows, Index cCols)
 {
   return Block<Derived>(derived(), rows() - cRows, 0, cRows, cCols);
 }
 
 /** This is the const version of bottomLeftCorner(Index, Index).*/
+EIGEN_DEVICE_FUNC
 inline const Block<const Derived> bottomLeftCorner(Index cRows, Index cCols) const
 {
   return Block<const Derived>(derived(), rows() - cRows, 0, cRows, cCols);
@@ -324,6 +340,7 @@ inline const Block<const Derived> bottomLeftCorner(Index cRows, Index cCols) con
   * \sa class Block, block(Index,Index,Index,Index)
   */
 template<int CRows, int CCols>
+EIGEN_DEVICE_FUNC
 inline Block<Derived, CRows, CCols> bottomLeftCorner()
 {
   return Block<Derived, CRows, CCols>(derived(), rows() - CRows, 0);
@@ -331,6 +348,7 @@ inline Block<Derived, CRows, CCols> bottomLeftCorner()
 
 /** This is the const version of bottomLeftCorner<int, int>().*/
 template<int CRows, int CCols>
+EIGEN_DEVICE_FUNC
 inline const Block<const Derived, CRows, CCols> bottomLeftCorner() const
 {
   return Block<const Derived, CRows, CCols>(derived(), rows() - CRows, 0);
@@ -338,13 +356,13 @@ inline const Block<const Derived, CRows, CCols> bottomLeftCorner() const
 
 /** \returns an expression of a bottom-left corner of *this.
   *
-  * \tparam CRows number of rows in corner as specified at compile time
-  * \tparam CCols number of columns in corner as specified at compile time
-  * \param  cRows number of rows in corner as specified at run time
-  * \param  cCols number of columns in corner as specified at run time
+  * \tparam CRows number of rows in corner as specified at compile-time
+  * \tparam CCols number of columns in corner as specified at compile-time
+  * \param  cRows number of rows in corner as specified at run-time
+  * \param  cCols number of columns in corner as specified at run-time
   *
-  * This function is mainly useful for corners where the number of rows is specified at compile time
-  * and the number of columns is specified at run time, or vice versa. The compile-time and run-time
+  * This function is mainly useful for corners where the number of rows is specified at compile-time
+  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
   * information should not contradict. In other words, \a cRows should equal \a CRows unless
   * \a CRows is \a Dynamic, and the same for the number of columns.
   *
@@ -377,12 +395,14 @@ inline const Block<const Derived, CRows, CCols> bottomLeftCorner(Index cRows, In
   *
   * \sa class Block, block(Index,Index,Index,Index)
   */
+EIGEN_DEVICE_FUNC
 inline RowsBlockXpr topRows(Index n)
 {
   return RowsBlockXpr(derived(), 0, 0, n, cols());
 }
 
 /** This is the const version of topRows(Index).*/
+EIGEN_DEVICE_FUNC
 inline ConstRowsBlockXpr topRows(Index n) const
 {
   return ConstRowsBlockXpr(derived(), 0, 0, n, cols());
@@ -390,7 +410,11 @@ inline ConstRowsBlockXpr topRows(Index n) const
 
 /** \returns a block consisting of the top rows of *this.
   *
-  * \tparam N the number of rows in the block
+  * \tparam N the number of rows in the block as specified at compile-time
+  * \param n the number of rows in the block as specified at run-time
+  *
+  * The compile-time and run-time information should not contradict. In other words,
+  * \a n should equal \a N unless \a N is \a Dynamic.
   *
   * Example: \include MatrixBase_template_int_topRows.cpp
   * Output: \verbinclude MatrixBase_template_int_topRows.out
@@ -398,16 +422,18 @@ inline ConstRowsBlockXpr topRows(Index n) const
   * \sa class Block, block(Index,Index,Index,Index)
   */
 template<int N>
-inline typename NRowsBlockXpr<N>::Type topRows()
+EIGEN_DEVICE_FUNC
+inline typename NRowsBlockXpr<N>::Type topRows(Index n = N)
 {
-  return typename NRowsBlockXpr<N>::Type(derived(), 0, 0, N, cols());
+  return typename NRowsBlockXpr<N>::Type(derived(), 0, 0, n, cols());
 }
 
 /** This is the const version of topRows<int>().*/
 template<int N>
-inline typename ConstNRowsBlockXpr<N>::Type topRows() const
+EIGEN_DEVICE_FUNC
+inline typename ConstNRowsBlockXpr<N>::Type topRows(Index n = N) const
 {
-  return typename ConstNRowsBlockXpr<N>::Type(derived(), 0, 0, N, cols());
+  return typename ConstNRowsBlockXpr<N>::Type(derived(), 0, 0, n, cols());
 }
 
 
@@ -421,12 +447,14 @@ inline typename ConstNRowsBlockXpr<N>::Type topRows() const
   *
   * \sa class Block, block(Index,Index,Index,Index)
   */
+EIGEN_DEVICE_FUNC
 inline RowsBlockXpr bottomRows(Index n)
 {
   return RowsBlockXpr(derived(), rows() - n, 0, n, cols());
 }
 
 /** This is the const version of bottomRows(Index).*/
+EIGEN_DEVICE_FUNC
 inline ConstRowsBlockXpr bottomRows(Index n) const
 {
   return ConstRowsBlockXpr(derived(), rows() - n, 0, n, cols());
@@ -434,7 +462,11 @@ inline ConstRowsBlockXpr bottomRows(Index n) const
 
 /** \returns a block consisting of the bottom rows of *this.
   *
-  * \tparam N the number of rows in the block
+  * \tparam N the number of rows in the block as specified at compile-time
+  * \param n the number of rows in the block as specified at run-time
+  *
+  * The compile-time and run-time information should not contradict. In other words,
+  * \a n should equal \a N unless \a N is \a Dynamic.
   *
   * Example: \include MatrixBase_template_int_bottomRows.cpp
   * Output: \verbinclude MatrixBase_template_int_bottomRows.out
@@ -442,16 +474,18 @@ inline ConstRowsBlockXpr bottomRows(Index n) const
   * \sa class Block, block(Index,Index,Index,Index)
   */
 template<int N>
-inline typename NRowsBlockXpr<N>::Type bottomRows()
+EIGEN_DEVICE_FUNC
+inline typename NRowsBlockXpr<N>::Type bottomRows(Index n = N)
 {
-  return typename NRowsBlockXpr<N>::Type(derived(), rows() - N, 0, N, cols());
+  return typename NRowsBlockXpr<N>::Type(derived(), rows() - n, 0, n, cols());
 }
 
 /** This is the const version of bottomRows<int>().*/
 template<int N>
-inline typename ConstNRowsBlockXpr<N>::Type bottomRows() const
+EIGEN_DEVICE_FUNC
+inline typename ConstNRowsBlockXpr<N>::Type bottomRows(Index n = N) const
 {
-  return typename ConstNRowsBlockXpr<N>::Type(derived(), rows() - N, 0, N, cols());
+  return typename ConstNRowsBlockXpr<N>::Type(derived(), rows() - n, 0, n, cols());
 }
 
 
@@ -459,28 +493,34 @@ inline typename ConstNRowsBlockXpr<N>::Type bottomRows() const
 /** \returns a block consisting of a range of rows of *this.
   *
   * \param startRow the index of the first row in the block
-  * \param numRows the number of rows in the block
+  * \param n the number of rows in the block
   *
   * Example: \include DenseBase_middleRows_int.cpp
   * Output: \verbinclude DenseBase_middleRows_int.out
   *
   * \sa class Block, block(Index,Index,Index,Index)
   */
-inline RowsBlockXpr middleRows(Index startRow, Index numRows)
+EIGEN_DEVICE_FUNC
+inline RowsBlockXpr middleRows(Index startRow, Index n)
 {
-  return RowsBlockXpr(derived(), startRow, 0, numRows, cols());
+  return RowsBlockXpr(derived(), startRow, 0, n, cols());
 }
 
 /** This is the const version of middleRows(Index,Index).*/
-inline ConstRowsBlockXpr middleRows(Index startRow, Index numRows) const
+EIGEN_DEVICE_FUNC
+inline ConstRowsBlockXpr middleRows(Index startRow, Index n) const
 {
-  return ConstRowsBlockXpr(derived(), startRow, 0, numRows, cols());
+  return ConstRowsBlockXpr(derived(), startRow, 0, n, cols());
 }
 
 /** \returns a block consisting of a range of rows of *this.
   *
-  * \tparam N the number of rows in the block
+  * \tparam N the number of rows in the block as specified at compile-time
   * \param startRow the index of the first row in the block
+  * \param n the number of rows in the block as specified at run-time
+  *
+  * The compile-time and run-time information should not contradict. In other words,
+  * \a n should equal \a N unless \a N is \a Dynamic.
   *
   * Example: \include DenseBase_template_int_middleRows.cpp
   * Output: \verbinclude DenseBase_template_int_middleRows.out
@@ -488,16 +528,18 @@ inline ConstRowsBlockXpr middleRows(Index startRow, Index numRows) const
   * \sa class Block, block(Index,Index,Index,Index)
   */
 template<int N>
-inline typename NRowsBlockXpr<N>::Type middleRows(Index startRow)
+EIGEN_DEVICE_FUNC
+inline typename NRowsBlockXpr<N>::Type middleRows(Index startRow, Index n = N)
 {
-  return typename NRowsBlockXpr<N>::Type(derived(), startRow, 0, N, cols());
+  return typename NRowsBlockXpr<N>::Type(derived(), startRow, 0, n, cols());
 }
 
 /** This is the const version of middleRows<int>().*/
 template<int N>
-inline typename ConstNRowsBlockXpr<N>::Type middleRows(Index startRow) const
+EIGEN_DEVICE_FUNC
+inline typename ConstNRowsBlockXpr<N>::Type middleRows(Index startRow, Index n = N) const
 {
-  return typename ConstNRowsBlockXpr<N>::Type(derived(), startRow, 0, N, cols());
+  return typename ConstNRowsBlockXpr<N>::Type(derived(), startRow, 0, n, cols());
 }
 
 
@@ -511,12 +553,14 @@ inline typename ConstNRowsBlockXpr<N>::Type middleRows(Index startRow) const
   *
   * \sa class Block, block(Index,Index,Index,Index)
   */
+EIGEN_DEVICE_FUNC
 inline ColsBlockXpr leftCols(Index n)
 {
   return ColsBlockXpr(derived(), 0, 0, rows(), n);
 }
 
 /** This is the const version of leftCols(Index).*/
+EIGEN_DEVICE_FUNC
 inline ConstColsBlockXpr leftCols(Index n) const
 {
   return ConstColsBlockXpr(derived(), 0, 0, rows(), n);
@@ -524,7 +568,11 @@ inline ConstColsBlockXpr leftCols(Index n) const
 
 /** \returns a block consisting of the left columns of *this.
   *
-  * \tparam N the number of columns in the block
+  * \tparam N the number of columns in the block as specified at compile-time
+  * \param n the number of columns in the block as specified at run-time
+  *
+  * The compile-time and run-time information should not contradict. In other words,
+  * \a n should equal \a N unless \a N is \a Dynamic.
   *
   * Example: \include MatrixBase_template_int_leftCols.cpp
   * Output: \verbinclude MatrixBase_template_int_leftCols.out
@@ -532,16 +580,18 @@ inline ConstColsBlockXpr leftCols(Index n) const
   * \sa class Block, block(Index,Index,Index,Index)
   */
 template<int N>
-inline typename NColsBlockXpr<N>::Type leftCols()
+EIGEN_DEVICE_FUNC
+inline typename NColsBlockXpr<N>::Type leftCols(Index n = N)
 {
-  return typename NColsBlockXpr<N>::Type(derived(), 0, 0, rows(), N);
+  return typename NColsBlockXpr<N>::Type(derived(), 0, 0, rows(), n);
 }
 
 /** This is the const version of leftCols<int>().*/
 template<int N>
-inline typename ConstNColsBlockXpr<N>::Type leftCols() const
+EIGEN_DEVICE_FUNC
+inline typename ConstNColsBlockXpr<N>::Type leftCols(Index n = N) const
 {
-  return typename ConstNColsBlockXpr<N>::Type(derived(), 0, 0, rows(), N);
+  return typename ConstNColsBlockXpr<N>::Type(derived(), 0, 0, rows(), n);
 }
 
 
@@ -555,12 +605,14 @@ inline typename ConstNColsBlockXpr<N>::Type leftCols() const
   *
   * \sa class Block, block(Index,Index,Index,Index)
   */
+EIGEN_DEVICE_FUNC
 inline ColsBlockXpr rightCols(Index n)
 {
   return ColsBlockXpr(derived(), 0, cols() - n, rows(), n);
 }
 
 /** This is the const version of rightCols(Index).*/
+EIGEN_DEVICE_FUNC
 inline ConstColsBlockXpr rightCols(Index n) const
 {
   return ConstColsBlockXpr(derived(), 0, cols() - n, rows(), n);
@@ -568,7 +620,11 @@ inline ConstColsBlockXpr rightCols(Index n) const
 
 /** \returns a block consisting of the right columns of *this.
   *
-  * \tparam N the number of columns in the block
+  * \tparam N the number of columns in the block as specified at compile-time
+  * \param n the number of columns in the block as specified at run-time
+  *
+  * The compile-time and run-time information should not contradict. In other words,
+  * \a n should equal \a N unless \a N is \a Dynamic.
   *
   * Example: \include MatrixBase_template_int_rightCols.cpp
   * Output: \verbinclude MatrixBase_template_int_rightCols.out
@@ -576,16 +632,18 @@ inline ConstColsBlockXpr rightCols(Index n) const
   * \sa class Block, block(Index,Index,Index,Index)
   */
 template<int N>
-inline typename NColsBlockXpr<N>::Type rightCols()
+EIGEN_DEVICE_FUNC
+inline typename NColsBlockXpr<N>::Type rightCols(Index n = N)
 {
-  return typename NColsBlockXpr<N>::Type(derived(), 0, cols() - N, rows(), N);
+  return typename NColsBlockXpr<N>::Type(derived(), 0, cols() - n, rows(), n);
 }
 
 /** This is the const version of rightCols<int>().*/
 template<int N>
-inline typename ConstNColsBlockXpr<N>::Type rightCols() const
+EIGEN_DEVICE_FUNC
+inline typename ConstNColsBlockXpr<N>::Type rightCols(Index n = N) const
 {
-  return typename ConstNColsBlockXpr<N>::Type(derived(), 0, cols() - N, rows(), N);
+  return typename ConstNColsBlockXpr<N>::Type(derived(), 0, cols() - n, rows(), n);
 }
 
 
@@ -600,12 +658,14 @@ inline typename ConstNColsBlockXpr<N>::Type rightCols() const
   *
   * \sa class Block, block(Index,Index,Index,Index)
   */
+EIGEN_DEVICE_FUNC
 inline ColsBlockXpr middleCols(Index startCol, Index numCols)
 {
   return ColsBlockXpr(derived(), 0, startCol, rows(), numCols);
 }
 
 /** This is the const version of middleCols(Index,Index).*/
+EIGEN_DEVICE_FUNC
 inline ConstColsBlockXpr middleCols(Index startCol, Index numCols) const
 {
   return ConstColsBlockXpr(derived(), 0, startCol, rows(), numCols);
@@ -613,8 +673,12 @@ inline ConstColsBlockXpr middleCols(Index startCol, Index numCols) const
 
 /** \returns a block consisting of a range of columns of *this.
   *
-  * \tparam N the number of columns in the block
+  * \tparam N the number of columns in the block as specified at compile-time
   * \param startCol the index of the first column in the block
+  * \param n the number of columns in the block as specified at run-time
+  *
+  * The compile-time and run-time information should not contradict. In other words,
+  * \a n should equal \a N unless \a N is \a Dynamic.
   *
   * Example: \include DenseBase_template_int_middleCols.cpp
   * Output: \verbinclude DenseBase_template_int_middleCols.out
@@ -622,16 +686,18 @@ inline ConstColsBlockXpr middleCols(Index startCol, Index numCols) const
   * \sa class Block, block(Index,Index,Index,Index)
   */
 template<int N>
-inline typename NColsBlockXpr<N>::Type middleCols(Index startCol)
+EIGEN_DEVICE_FUNC
+inline typename NColsBlockXpr<N>::Type middleCols(Index startCol, Index n = N)
 {
-  return typename NColsBlockXpr<N>::Type(derived(), 0, startCol, rows(), N);
+  return typename NColsBlockXpr<N>::Type(derived(), 0, startCol, rows(), n);
 }
 
 /** This is the const version of middleCols<int>().*/
 template<int N>
-inline typename ConstNColsBlockXpr<N>::Type middleCols(Index startCol) const
+EIGEN_DEVICE_FUNC
+inline typename ConstNColsBlockXpr<N>::Type middleCols(Index startCol, Index n = N) const
 {
-  return typename ConstNColsBlockXpr<N>::Type(derived(), 0, startCol, rows(), N);
+  return typename ConstNColsBlockXpr<N>::Type(derived(), 0, startCol, rows(), n);
 }
 
 
@@ -653,6 +719,7 @@ inline typename ConstNColsBlockXpr<N>::Type middleCols(Index startCol) const
   * \sa class Block, block(Index,Index,Index,Index)
   */
 template<int BlockRows, int BlockCols>
+EIGEN_DEVICE_FUNC
 inline Block<Derived, BlockRows, BlockCols> block(Index startRow, Index startCol)
 {
   return Block<Derived, BlockRows, BlockCols>(derived(), startRow, startCol);
@@ -660,6 +727,7 @@ inline Block<Derived, BlockRows, BlockCols> block(Index startRow, Index startCol
 
 /** This is the const version of block<>(Index, Index). */
 template<int BlockRows, int BlockCols>
+EIGEN_DEVICE_FUNC
 inline const Block<const Derived, BlockRows, BlockCols> block(Index startRow, Index startCol) const
 {
   return Block<const Derived, BlockRows, BlockCols>(derived(), startRow, startCol);
@@ -667,15 +735,15 @@ inline const Block<const Derived, BlockRows, BlockCols> block(Index startRow, In
 
 /** \returns an expression of a block in *this.
   *
-  * \tparam BlockRows number of rows in block as specified at compile time
-  * \tparam BlockCols number of columns in block as specified at compile time
+  * \tparam BlockRows number of rows in block as specified at compile-time
+  * \tparam BlockCols number of columns in block as specified at compile-time
   * \param  startRow  the first row in the block
   * \param  startCol  the first column in the block
-  * \param  blockRows number of rows in block as specified at run time
-  * \param  blockCols number of columns in block as specified at run time
+  * \param  blockRows number of rows in block as specified at run-time
+  * \param  blockCols number of columns in block as specified at run-time
   *
-  * This function is mainly useful for blocks where the number of rows is specified at compile time
-  * and the number of columns is specified at run time, or vice versa. The compile-time and run-time
+  * This function is mainly useful for blocks where the number of rows is specified at compile-time
+  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
   * information should not contradict. In other words, \a blockRows should equal \a BlockRows unless
   * \a BlockRows is \a Dynamic, and the same for the number of columns.
   *
@@ -705,12 +773,14 @@ inline const Block<const Derived, BlockRows, BlockCols> block(Index startRow, In
   * Output: \verbinclude MatrixBase_col.out
   *
   * \sa row(), class Block */
+EIGEN_DEVICE_FUNC
 inline ColXpr col(Index i)
 {
   return ColXpr(derived(), i);
 }
 
 /** This is the const version of col(). */
+EIGEN_DEVICE_FUNC
 inline ConstColXpr col(Index i) const
 {
   return ConstColXpr(derived(), i);
@@ -722,12 +792,14 @@ inline ConstColXpr col(Index i) const
   * Output: \verbinclude MatrixBase_row.out
   *
   * \sa col(), class Block */
+EIGEN_DEVICE_FUNC
 inline RowXpr row(Index i)
 {
   return RowXpr(derived(), i);
 }
 
 /** This is the const version of row(). */
+EIGEN_DEVICE_FUNC
 inline ConstRowXpr row(Index i) const
 {
   return ConstRowXpr(derived(), i);
@@ -738,7 +810,7 @@ inline ConstRowXpr row(Index i) const
   * \only_for_vectors
   *
   * \param start the first coefficient in the segment
-  * \param vecSize the number of coefficients in the segment
+  * \param n the number of coefficients in the segment
   *
   * Example: \include MatrixBase_segment_int_int.cpp
   * Output: \verbinclude MatrixBase_segment_int_int.out
@@ -749,25 +821,27 @@ inline ConstRowXpr row(Index i) const
   *
   * \sa class Block, segment(Index)
   */
-inline SegmentReturnType segment(Index start, Index vecSize)
+EIGEN_DEVICE_FUNC
+inline SegmentReturnType segment(Index start, Index n)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return SegmentReturnType(derived(), start, vecSize);
+  return SegmentReturnType(derived(), start, n);
 }
 
 
 /** This is the const version of segment(Index,Index).*/
-inline ConstSegmentReturnType segment(Index start, Index vecSize) const
+EIGEN_DEVICE_FUNC
+inline ConstSegmentReturnType segment(Index start, Index n) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return ConstSegmentReturnType(derived(), start, vecSize);
+  return ConstSegmentReturnType(derived(), start, n);
 }
 
 /** \returns a dynamic-size expression of the first coefficients of *this.
   *
   * \only_for_vectors
   *
-  * \param vecSize the number of coefficients in the block
+  * \param n the number of coefficients in the segment
   *
   * Example: \include MatrixBase_start_int.cpp
   * Output: \verbinclude MatrixBase_start_int.out
@@ -778,25 +852,26 @@ inline ConstSegmentReturnType segment(Index start, Index vecSize) const
   *
   * \sa class Block, block(Index,Index)
   */
-inline SegmentReturnType head(Index vecSize)
+EIGEN_DEVICE_FUNC
+inline SegmentReturnType head(Index n)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return SegmentReturnType(derived(), 0, vecSize);
+  return SegmentReturnType(derived(), 0, n);
 }
 
 /** This is the const version of head(Index).*/
-inline ConstSegmentReturnType
-  head(Index vecSize) const
+EIGEN_DEVICE_FUNC
+inline ConstSegmentReturnType head(Index n) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return ConstSegmentReturnType(derived(), 0, vecSize);
+  return ConstSegmentReturnType(derived(), 0, n);
 }
 
 /** \returns a dynamic-size expression of the last coefficients of *this.
   *
   * \only_for_vectors
   *
-  * \param vecSize the number of coefficients in the block
+  * \param n the number of coefficients in the segment
   *
   * Example: \include MatrixBase_end_int.cpp
   * Output: \verbinclude MatrixBase_end_int.out
@@ -807,95 +882,114 @@ inline ConstSegmentReturnType
   *
   * \sa class Block, block(Index,Index)
   */
-inline SegmentReturnType tail(Index vecSize)
+EIGEN_DEVICE_FUNC
+inline SegmentReturnType tail(Index n)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return SegmentReturnType(derived(), this->size() - vecSize, vecSize);
+  return SegmentReturnType(derived(), this->size() - n, n);
 }
 
 /** This is the const version of tail(Index).*/
-inline ConstSegmentReturnType tail(Index vecSize) const
+EIGEN_DEVICE_FUNC
+inline ConstSegmentReturnType tail(Index n) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return ConstSegmentReturnType(derived(), this->size() - vecSize, vecSize);
+  return ConstSegmentReturnType(derived(), this->size() - n, n);
 }
 
 /** \returns a fixed-size expression of a segment (i.e. a vector block) in \c *this
   *
   * \only_for_vectors
   *
-  * The template parameter \a Size is the number of coefficients in the block
+  * \tparam N the number of coefficients in the segment as specified at compile-time
+  * \param start the index of the first element in the segment
+  * \param n the number of coefficients in the segment as specified at run-time
   *
-  * \param start the index of the first element of the sub-vector
+  * The compile-time and run-time information should not contradict. In other words,
+  * \a n should equal \a N unless \a N is \a Dynamic.
   *
   * Example: \include MatrixBase_template_int_segment.cpp
   * Output: \verbinclude MatrixBase_template_int_segment.out
   *
   * \sa class Block
   */
-template<int Size>
-inline typename FixedSegmentReturnType<Size>::Type segment(Index start)
+template<int N>
+EIGEN_DEVICE_FUNC
+inline typename FixedSegmentReturnType<N>::Type segment(Index start, Index n = N)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return typename FixedSegmentReturnType<Size>::Type(derived(), start);
+  return typename FixedSegmentReturnType<N>::Type(derived(), start, n);
 }
 
 /** This is the const version of segment<int>(Index).*/
-template<int Size>
-inline typename ConstFixedSegmentReturnType<Size>::Type segment(Index start) const
+template<int N>
+EIGEN_DEVICE_FUNC
+inline typename ConstFixedSegmentReturnType<N>::Type segment(Index start, Index n = N) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return typename ConstFixedSegmentReturnType<Size>::Type(derived(), start);
+  return typename ConstFixedSegmentReturnType<N>::Type(derived(), start, n);
 }
 
 /** \returns a fixed-size expression of the first coefficients of *this.
   *
   * \only_for_vectors
   *
-  * The template parameter \a Size is the number of coefficients in the block
+  * \tparam N the number of coefficients in the segment as specified at compile-time
+  * \param  n the number of coefficients in the segment as specified at run-time
+  *
+  * The compile-time and run-time information should not contradict. In other words,
+  * \a n should equal \a N unless \a N is \a Dynamic.
   *
   * Example: \include MatrixBase_template_int_start.cpp
   * Output: \verbinclude MatrixBase_template_int_start.out
   *
   * \sa class Block
   */
-template<int Size>
-inline typename FixedSegmentReturnType<Size>::Type head()
+template<int N>
+EIGEN_DEVICE_FUNC
+inline typename FixedSegmentReturnType<N>::Type head(Index n = N)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return typename FixedSegmentReturnType<Size>::Type(derived(), 0);
+  return typename FixedSegmentReturnType<N>::Type(derived(), 0, n);
 }
 
 /** This is the const version of head<int>().*/
-template<int Size>
-inline typename ConstFixedSegmentReturnType<Size>::Type head() const
+template<int N>
+EIGEN_DEVICE_FUNC
+inline typename ConstFixedSegmentReturnType<N>::Type head(Index n = N) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return typename ConstFixedSegmentReturnType<Size>::Type(derived(), 0);
+  return typename ConstFixedSegmentReturnType<N>::Type(derived(), 0, n);
 }
 
 /** \returns a fixed-size expression of the last coefficients of *this.
   *
   * \only_for_vectors
   *
-  * The template parameter \a Size is the number of coefficients in the block
+  * \tparam N the number of coefficients in the segment as specified at compile-time
+  * \param  n the number of coefficients in the segment as specified at run-time
+  *
+  * The compile-time and run-time information should not contradict. In other words,
+  * \a n should equal \a N unless \a N is \a Dynamic.
   *
   * Example: \include MatrixBase_template_int_end.cpp
   * Output: \verbinclude MatrixBase_template_int_end.out
   *
   * \sa class Block
   */
-template<int Size>
-inline typename FixedSegmentReturnType<Size>::Type tail()
+template<int N>
+EIGEN_DEVICE_FUNC
+inline typename FixedSegmentReturnType<N>::Type tail(Index n = N)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return typename FixedSegmentReturnType<Size>::Type(derived(), size() - Size);
+  return typename FixedSegmentReturnType<N>::Type(derived(), size() - n, n); // forward n as the length, as head<N>() does, so N == Dynamic is usable
 }
 
 /** This is the const version of tail<int>.*/
-template<int Size>
-inline typename ConstFixedSegmentReturnType<Size>::Type tail() const
+template<int N>
+EIGEN_DEVICE_FUNC
+inline typename ConstFixedSegmentReturnType<N>::Type tail(Index n = N) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return typename ConstFixedSegmentReturnType<Size>::Type(derived(), size() - Size);
+  return typename ConstFixedSegmentReturnType<N>::Type(derived(), size() - n, n); // forward n as the length, as the const head<N>() does, so N == Dynamic is usable
 }
diff --git a/nuparu/include/Eigen/src/plugins/CommonCwiseBinaryOps.h b/nuparu/include/Eigen/src/plugins/CommonCwiseBinaryOps.h
index 688d2244..a8fa287c 100644
--- a/nuparu/include/Eigen/src/plugins/CommonCwiseBinaryOps.h
+++ b/nuparu/include/Eigen/src/plugins/CommonCwiseBinaryOps.h
@@ -38,6 +38,7 @@ EIGEN_MAKE_CWISE_BINARY_OP(operator+,internal::scalar_sum_op)
   * \sa class CwiseBinaryOp, operator+(), operator-(), cwiseProduct()
   */
 template<typename CustomBinaryOp, typename OtherDerived>
+EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE const CwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>
 binaryExpr(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other, const CustomBinaryOp& func = CustomBinaryOp()) const
 {
diff --git a/nuparu/include/Eigen/src/plugins/CommonCwiseUnaryOps.h b/nuparu/include/Eigen/src/plugins/CommonCwiseUnaryOps.h
index 08e931aa..050bce03 100644
--- a/nuparu/include/Eigen/src/plugins/CommonCwiseUnaryOps.h
+++ b/nuparu/include/Eigen/src/plugins/CommonCwiseUnaryOps.h
@@ -14,6 +14,8 @@
 
 /** \internal Represents a scalar multiple of an expression */
 typedef CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Derived> ScalarMultipleReturnType;
+typedef CwiseUnaryOp<internal::scalar_multiple2_op<Scalar,std::complex<Scalar> >, const Derived> ScalarComplexMultipleReturnType;
+
 /** \internal Represents a quotient of an expression by a scalar*/
 typedef CwiseUnaryOp<internal::scalar_quotient1_op<Scalar>, const Derived> ScalarQuotient1ReturnType;
 /** \internal the return type of conjugate() */
@@ -36,20 +38,24 @@ typedef CwiseUnaryOp<internal::scalar_imag_op<Scalar>, const Derived> ImagReturn
 /** \internal the return type of imag() */
 typedef CwiseUnaryView<internal::scalar_imag_ref_op<Scalar>, Derived> NonConstImagReturnType;
 
+typedef CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const Derived> NegativeReturnType;
+//typedef CwiseUnaryOp<internal::scalar_quotient1_op<Scalar>, const Derived>
+
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
 /** \returns an expression of the opposite of \c *this
   */
-inline const CwiseUnaryOp<internal::scalar_opposite_op<typename internal::traits<Derived>::Scalar>, const Derived>
-operator-() const { return derived(); }
+EIGEN_DEVICE_FUNC
+inline const NegativeReturnType
+operator-() const { return NegativeReturnType(derived()); }
 
 
 /** \returns an expression of \c *this scaled by the scalar factor \a scalar */
+EIGEN_DEVICE_FUNC
 inline const ScalarMultipleReturnType
 operator*(const Scalar& scalar) const
 {
-  return CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Derived>
-    (derived(), internal::scalar_multiple_op<Scalar>(scalar));
+  return ScalarMultipleReturnType(derived(), internal::scalar_multiple_op<Scalar>(scalar));
 }
 
 #ifdef EIGEN_PARSED_BY_DOXYGEN
@@ -57,29 +63,34 @@ const ScalarMultipleReturnType operator*(const RealScalar& scalar) const;
 #endif
 
 /** \returns an expression of \c *this divided by the scalar value \a scalar */
-inline const CwiseUnaryOp<internal::scalar_quotient1_op<typename internal::traits<Derived>::Scalar>, const Derived>
+EIGEN_DEVICE_FUNC
+inline const ScalarQuotient1ReturnType
 operator/(const Scalar& scalar) const
 {
-  return CwiseUnaryOp<internal::scalar_quotient1_op<Scalar>, const Derived>
-    (derived(), internal::scalar_quotient1_op<Scalar>(scalar));
+  return ScalarQuotient1ReturnType(derived(), internal::scalar_quotient1_op<Scalar>(scalar));
 }
 
 /** Overloaded for efficient real matrix times complex scalar value */
-inline const CwiseUnaryOp<internal::scalar_multiple2_op<Scalar,std::complex<Scalar> >, const Derived>
+EIGEN_DEVICE_FUNC
+inline const ScalarComplexMultipleReturnType
 operator*(const std::complex<Scalar>& scalar) const
 {
-  return CwiseUnaryOp<internal::scalar_multiple2_op<Scalar,std::complex<Scalar> >, const Derived>
-    (*static_cast<const Derived*>(this), internal::scalar_multiple2_op<Scalar,std::complex<Scalar> >(scalar));
+  return ScalarComplexMultipleReturnType(derived(), internal::scalar_multiple2_op<Scalar,std::complex<Scalar> >(scalar));
 }
 
+EIGEN_DEVICE_FUNC
 inline friend const ScalarMultipleReturnType
 operator*(const Scalar& scalar, const StorageBaseType& matrix)
 { return matrix*scalar; }
 
+EIGEN_DEVICE_FUNC
 inline friend const CwiseUnaryOp<internal::scalar_multiple2_op<Scalar,std::complex<Scalar> >, const Derived>
 operator*(const std::complex<Scalar>& scalar, const StorageBaseType& matrix)
 { return matrix*scalar; }
 
+
+template<class NewType> struct CastXpr { typedef typename internal::cast_return_type<Derived,const CwiseUnaryOp<internal::scalar_cast_op<Scalar, NewType>, const Derived> >::type Type; };
+
 /** \returns an expression of *this with the \a Scalar type casted to
   * \a NewScalar.
   *
@@ -88,15 +99,17 @@ operator*(const std::complex<Scalar>& scalar, const StorageBaseType& matrix)
   * \sa class CwiseUnaryOp
   */
 template<typename NewType>
-typename internal::cast_return_type<Derived,const CwiseUnaryOp<internal::scalar_cast_op<typename internal::traits<Derived>::Scalar, NewType>, const Derived> >::type
+EIGEN_DEVICE_FUNC
+typename CastXpr<NewType>::Type
 cast() const
 {
-  return derived();
+  return typename CastXpr<NewType>::Type(derived());
 }
 
 /** \returns an expression of the complex conjugate of \c *this.
   *
   * \sa adjoint() */
+EIGEN_DEVICE_FUNC
 inline ConjugateReturnType
 conjugate() const
 {
@@ -106,14 +119,16 @@ conjugate() const
 /** \returns a read-only expression of the real part of \c *this.
   *
   * \sa imag() */
+EIGEN_DEVICE_FUNC
 inline RealReturnType
-real() const { return derived(); }
+real() const { return RealReturnType(derived()); }
 
 /** \returns an read-only expression of the imaginary part of \c *this.
   *
   * \sa real() */
+EIGEN_DEVICE_FUNC
 inline const ImagReturnType
-imag() const { return derived(); }
+imag() const { return ImagReturnType(derived()); }
 
 /** \brief Apply a unary operator coefficient-wise
   * \param[in]  func  Functor implementing the unary operator
@@ -135,6 +150,7 @@ imag() const { return derived(); }
   * \sa class CwiseUnaryOp, class CwiseBinaryOp
   */
 template<typename CustomUnaryOp>
+EIGEN_DEVICE_FUNC
 inline const CwiseUnaryOp<CustomUnaryOp, const Derived>
 unaryExpr(const CustomUnaryOp& func = CustomUnaryOp()) const
 {
@@ -153,6 +169,7 @@ unaryExpr(const CustomUnaryOp& func = CustomUnaryOp()) const
   * \sa class CwiseUnaryOp, class CwiseBinaryOp
   */
 template<typename CustomViewOp>
+EIGEN_DEVICE_FUNC
 inline const CwiseUnaryView<CustomViewOp, const Derived>
 unaryViewExpr(const CustomViewOp& func = CustomViewOp()) const
 {
@@ -162,11 +179,13 @@ unaryViewExpr(const CustomViewOp& func = CustomViewOp()) const
 /** \returns a non const expression of the real part of \c *this.
   *
   * \sa imag() */
+EIGEN_DEVICE_FUNC
 inline NonConstRealReturnType
-real() { return derived(); }
+real() { return NonConstRealReturnType(derived()); }
 
 /** \returns a non const expression of the imaginary part of \c *this.
   *
   * \sa real() */
+EIGEN_DEVICE_FUNC
 inline NonConstImagReturnType
-imag() { return derived(); }
+imag() { return NonConstImagReturnType(derived()); }
diff --git a/nuparu/include/Eigen/src/plugins/MatrixCwiseBinaryOps.h b/nuparu/include/Eigen/src/plugins/MatrixCwiseBinaryOps.h
index 3a737df7..6dd2e119 100644
--- a/nuparu/include/Eigen/src/plugins/MatrixCwiseBinaryOps.h
+++ b/nuparu/include/Eigen/src/plugins/MatrixCwiseBinaryOps.h
@@ -18,6 +18,7 @@
   * \sa class CwiseBinaryOp, cwiseAbs2
   */
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE const EIGEN_CWISE_PRODUCT_RETURN_TYPE(Derived,OtherDerived)
 cwiseProduct(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
@@ -37,6 +38,7 @@ cwiseProduct(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
   * \sa cwiseNotEqual(), isApprox(), isMuchSmallerThan()
   */
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
 inline const CwiseBinaryOp<std::equal_to<Scalar>, const Derived, const OtherDerived>
 cwiseEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
@@ -56,6 +58,7 @@ cwiseEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
   * \sa cwiseEqual(), isApprox(), isMuchSmallerThan()
   */
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
 inline const CwiseBinaryOp<std::not_equal_to<Scalar>, const Derived, const OtherDerived>
 cwiseNotEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
@@ -70,6 +73,7 @@ cwiseNotEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
   * \sa class CwiseBinaryOp, max()
   */
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const OtherDerived>
 cwiseMin(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
@@ -80,10 +84,11 @@ cwiseMin(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
   *
   * \sa class CwiseBinaryOp, min()
   */
+EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const ConstantReturnType>
 cwiseMin(const Scalar &other) const
 {
-  return cwiseMin(Derived::PlainObject::Constant(rows(), cols(), other));
+  return cwiseMin(Derived::Constant(rows(), cols(), other));
 }
 
 /** \returns an expression of the coefficient-wise max of *this and \a other
@@ -94,6 +99,7 @@ cwiseMin(const Scalar &other) const
   * \sa class CwiseBinaryOp, min()
   */
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const OtherDerived>
 cwiseMax(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
@@ -104,10 +110,11 @@ cwiseMax(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
   *
   * \sa class CwiseBinaryOp, min()
   */
+EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const ConstantReturnType>
 cwiseMax(const Scalar &other) const
 {
-  return cwiseMax(Derived::PlainObject::Constant(rows(), cols(), other));
+  return cwiseMax(Derived::Constant(rows(), cols(), other));
 }
 
 
@@ -119,8 +126,27 @@ cwiseMax(const Scalar &other) const
   * \sa class CwiseBinaryOp, cwiseProduct(), cwiseInverse()
   */
 template<typename OtherDerived>
+EIGEN_DEVICE_FUNC
 EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>
 cwiseQuotient(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
 {
   return CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
 }
+
+typedef CwiseBinaryOp<internal::scalar_cmp_op<Scalar,internal::cmp_EQ>, const Derived, const ConstantReturnType> CwiseScalarEqualReturnType;
+
+/** \returns an expression of the coefficient-wise == operator of \c *this and a scalar \a s
+  *
+  * \warning this performs an exact comparison, which is generally a bad idea with floating-point types.
+  * In order to check for equality between two vectors or matrices with floating-point coefficients, it is
+  * generally a far better idea to use a fuzzy comparison as provided by isApprox() and
+  * isMuchSmallerThan().
+  *
+  * \sa cwiseEqual(const MatrixBase<OtherDerived> &) const
+  */
+EIGEN_DEVICE_FUNC
+inline const CwiseScalarEqualReturnType
+cwiseEqual(const Scalar& s) const
+{
+  return CwiseScalarEqualReturnType(derived(), Derived::Constant(rows(), cols(), s), internal::scalar_cmp_op<Scalar,internal::cmp_EQ>());
+}
diff --git a/nuparu/include/Eigen/src/plugins/MatrixCwiseUnaryOps.h b/nuparu/include/Eigen/src/plugins/MatrixCwiseUnaryOps.h
index 0cf0640b..e16bb374 100644
--- a/nuparu/include/Eigen/src/plugins/MatrixCwiseUnaryOps.h
+++ b/nuparu/include/Eigen/src/plugins/MatrixCwiseUnaryOps.h
@@ -8,7 +8,14 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-// This file is a base class plugin containing matrix specifics coefficient wise functions.
+// This file is included into the body of the base classes supporting matrix specific coefficient-wise functions.
+// This includes MatrixBase and SparseMatrixBase.
+
+typedef CwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived> CwiseAbsReturnType;
+typedef CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const Derived> CwiseAbs2ReturnType;
+typedef CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived> CwiseSqrtReturnType;
+typedef CwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived> CwiseSignReturnType;
+typedef CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived> CwiseInverseReturnType;
 
 /** \returns an expression of the coefficient-wise absolute value of \c *this
   *
@@ -17,8 +24,9 @@
   *
   * \sa cwiseAbs2()
   */
-EIGEN_STRONG_INLINE const CwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived>
-cwiseAbs() const { return derived(); }
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseAbsReturnType
+cwiseAbs() const { return CwiseAbsReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise squared absolute value of \c *this
   *
@@ -27,8 +35,9 @@ cwiseAbs() const { return derived(); }
   *
   * \sa cwiseAbs()
   */
-EIGEN_STRONG_INLINE const CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const Derived>
-cwiseAbs2() const { return derived(); }
+EIGEN_DEVICE_FUNC
+EIGEN_STRONG_INLINE const CwiseAbs2ReturnType
+cwiseAbs2() const { return CwiseAbs2ReturnType(derived()); }
 
 /** \returns an expression of the coefficient-wise square root of *this.
   *
@@ -37,31 +46,28 @@ cwiseAbs2() const { return derived(); }
   *
   * \sa cwisePow(), cwiseSquare()
   */
-inline const CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived>
-cwiseSqrt() const { return derived(); }
+EIGEN_DEVICE_FUNC
+inline const CwiseSqrtReturnType
+cwiseSqrt() const { return CwiseSqrtReturnType(derived()); }
 
-/** \returns an expression of the coefficient-wise inverse of *this.
+/** \returns an expression of the coefficient-wise signum of *this.
   *
-  * Example: \include MatrixBase_cwiseInverse.cpp
-  * Output: \verbinclude MatrixBase_cwiseInverse.out
+  * Example: \include MatrixBase_cwiseSign.cpp
+  * Output: \verbinclude MatrixBase_cwiseSign.out
   *
-  * \sa cwiseProduct()
   */
-inline const CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived>
-cwiseInverse() const { return derived(); }
+EIGEN_DEVICE_FUNC
+inline const CwiseSignReturnType
+cwiseSign() const { return CwiseSignReturnType(derived()); }
 
-/** \returns an expression of the coefficient-wise == operator of \c *this and a scalar \a s
+
+/** \returns an expression of the coefficient-wise inverse of *this.
   *
-  * \warning this performs an exact comparison, which is generally a bad idea with floating-point types.
-  * In order to check for equality between two vectors or matrices with floating-point coefficients, it is
-  * generally a far better idea to use a fuzzy comparison as provided by isApprox() and
-  * isMuchSmallerThan().
+  * Example: \include MatrixBase_cwiseInverse.cpp
+  * Output: \verbinclude MatrixBase_cwiseInverse.out
   *
-  * \sa cwiseEqual(const MatrixBase<OtherDerived> &) const
+  * \sa cwiseProduct()
   */
-inline const CwiseUnaryOp<std::binder1st<std::equal_to<Scalar> >, const Derived>
-cwiseEqual(const Scalar& s) const
-{
-  return CwiseUnaryOp<std::binder1st<std::equal_to<Scalar> >,const Derived>
-          (derived(), std::bind1st(std::equal_to<Scalar>(), s));
-}
+EIGEN_DEVICE_FUNC
+inline const CwiseInverseReturnType
+cwiseInverse() const { return CwiseInverseReturnType(derived()); }
diff --git a/nuparu/include/openvdb/tools/AttributeArray.h b/nuparu/include/openvdb/tools/AttributeArray.h
new file mode 100644
index 00000000..9675e76b
--- /dev/null
+++ b/nuparu/include/openvdb/tools/AttributeArray.h
@@ -0,0 +1,1573 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of Double Negative Visual Effects nor the names
+// of its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file AttributeArray.h
+///
+/// @authors Dan Bailey, Mihai Alden, Peter Cucka
+///
+/// @brief  Attribute Array storage templated on type and compression codec.
+///
+
+
+#ifndef OPENVDB_TOOLS_ATTRIBUTE_ARRAY_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_ATTRIBUTE_ARRAY_HAS_BEEN_INCLUDED
+
+#include <openvdb_points/Types.h>
+#include <openvdb/math/QuantizedUnitVec.h>
+#include <openvdb/util/Name.h>
+#include <openvdb/util/logging.h>
+#include <openvdb/io/io.h> // MappedFile
+#include <openvdb/io/Compression.h> // COMPRESS_BLOSC
+
+#include <openvdb_points/tools/IndexIterator.h>
+
+#include <tbb/spin_mutex.h>
+#include <tbb/atomic.h>
+
+#include <boost/scoped_array.hpp>
+
+#include <string>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+
+
+// Add new typedef for a Name pair
+typedef std::pair<Name, Name> NamePair;
+
+namespace tools {
+
+
+////////////////////////////////////////
+
+// Attribute Compression methods
+
+
+namespace attribute_compression {
+
+/// @brief Returns true if compression is available
+bool canCompress();
+
+/// @brief Retrieves the size the compressed buffer will have once uncompressed
+///
+/// @param buffer the compressed buffer
+int uncompressedSize(const char* buffer);
+
+/// @brief Retrieves the size the uncompressed buffer will have once compressed
+///
+/// @param buffer the uncompressed buffer
+/// @param typeSize the size of the data type
+/// @param uncompressedBytes number of uncompressed bytes
+int compressedSize(const char* buffer, const size_t typeSize, const int uncompressedBytes);
+
+/// @brief Compress and return the compressed buffer.
+///
+/// @param buffer the buffer to compress
+/// @param typeSize the size of the data type
+/// @param uncompressedBytes number of uncompressed bytes
+/// @param compressedBytes number of compressed bytes (written to this variable)
+/// @param cleanup if true, the supplied buffer will be deleted prior to allocating new memory
+char* compress( char* buffer, const size_t typeSize,
+                const int uncompressedBytes, int& compressedBytes,
+                const bool cleanup = false);
+
+/// @brief Compress and return the compressed buffer.
+///
+/// @param buffer the buffer to compress
+/// @param typeSize the size of the data type
+/// @param uncompressedBytes number of uncompressed bytes
+/// @param compressedBytes number of compressed bytes (written to this variable)
+///
+/// @note Unlike the non-const buffer version, the buffer will never be deleted.
+char* compress( const char* buffer, const size_t typeSize,
+                const int uncompressedBytes, int& compressedBytes);
+
+/// @brief Decompress and return the uncompressed buffer.
+///
+/// @param buffer the buffer to decompress
+/// @param expectedBytes the number of bytes expected once the buffer is decompressed
+/// @param cleanup if true, the supplied buffer will be deleted prior to allocating new memory
+char* decompress(char* buffer, const int expectedBytes, const bool cleanup = false);
+
+/// @brief Decompress and return the uncompressed buffer.
+///
+/// @param buffer the buffer to decompress
+/// @param expectedBytes the number of bytes expected once the buffer is decompressed
+///
+/// @note Unlike the non-const buffer version, the buffer will never be deleted.
+char* decompress(const char* buffer, const int expectedBytes);
+
+} // namespace attribute_compression
+
+
+////////////////////////////////////////
+
+// Utility methods
+
+template <typename IntegerT, typename FloatT>
+inline IntegerT
+floatingPointToFixedPoint(const FloatT s) // quantizes s onto the range of the unsigned type IntegerT, clamping values outside [0,1)
+{
+    BOOST_STATIC_ASSERT(boost::is_unsigned<IntegerT>::value);
+    if (!(s >= FloatT(0.0))) return std::numeric_limits<IntegerT>::min(); // negative values and NaN clamp to the minimum (a NaN would otherwise reach the integer cast below, which is undefined)
+    if (s >= FloatT(1.0)) return std::numeric_limits<IntegerT>::max(); // values at or above 1 saturate at the maximum
+    return IntegerT(std::floor(s * FloatT(std::numeric_limits<IntegerT>::max())));
+}
+
+
+template <typename FloatT, typename IntegerT>
+inline FloatT fixedPointToFloatingPoint(const IntegerT s) // rescales an unsigned fixed-point value by the maximum of IntegerT
+{
+    BOOST_STATIC_ASSERT(boost::is_unsigned<IntegerT>::value);
+    const FloatT fullScale = FloatT((std::numeric_limits<IntegerT>::max())); // largest storable value, as a float
+    return FloatT(s) / fullScale;
+}
+
+
+template <typename IntegerVectorT, typename FloatT>
+inline IntegerVectorT
+floatingPointToFixedPoint(const math::Vec3<FloatT>& v) // component-wise form of the scalar conversion
+{
+    typedef typename IntegerVectorT::ValueType ElemT; // integer type of each output component
+    return IntegerVectorT(floatingPointToFixedPoint<ElemT>(v.x()),
+                          floatingPointToFixedPoint<ElemT>(v.y()),
+                          floatingPointToFixedPoint<ElemT>(v.z()));
+}
+
+template <typename FloatVectorT, typename IntegerT>
+inline FloatVectorT
+fixedPointToFloatingPoint(const math::Vec3<IntegerT>& v) // component-wise form of the scalar conversion
+{
+    typedef typename FloatVectorT::ValueType ElemT; // floating-point type of each output component
+    return FloatVectorT(fixedPointToFloatingPoint<ElemT>(v.x()),
+                        fixedPointToFloatingPoint<ElemT>(v.y()),
+                        fixedPointToFloatingPoint<ElemT>(v.z()));
+}
+
+
+////////////////////////////////////////
+
+// Attribute codec schemes
+
+template<typename StorageType_>
+struct NullAttributeCodec // codec scheme reported under the name "null"
+{
+    typedef StorageType_ StorageType; // the storage type is the template argument itself
+    template<typename ValueType> static void decode(const StorageType&, ValueType&); // reads a StorageType, writes a ValueType
+    template<typename ValueType> static void encode(const StorageType&, ValueType&); // NOTE(review): the other codecs declare encode(const value&, StorageType&); confirm this parameter order against the out-of-line definition
+    static const char* name() { return "null"; }
+};
+
+
+template<typename IntType>
+struct FixedPointAttributeCodec // codec scheme reported under the name "fxpt"
+{
+    typedef IntType StorageType; // values are stored as IntType
+    template<typename ValueType> static void decode(const StorageType&, ValueType&); // reads a StorageType, writes a ValueType
+    template<typename ValueType> static void encode(const ValueType&, StorageType&); // reads a ValueType, writes a StorageType
+    static const char* name() { return "fxpt"; }
+};
+
+
+struct UnitVecAttributeCodec // codec scheme reported under the name "uvec"
+{
+    typedef uint16_t StorageType; // each vector is stored in a single 16-bit value
+    template<typename T> static void decode(const StorageType&, math::Vec3<T>&); // reads a StorageType, writes a Vec3<T>
+    template<typename T> static void encode(const math::Vec3<T>&, StorageType&); // reads a Vec3<T>, writes a StorageType
+    static const char* name() { return "uvec"; }
+};
+
+
+////////////////////////////////////////
+
+
+/// Base class for storing attribute data
+class AttributeArray
+{
+protected:
+    struct AccessorBase;
+    template <typename T> struct Accessor;
+
+    typedef boost::shared_ptr<AccessorBase>             AccessorBasePtr;
+
+public:
+    enum Flag { TRANSIENT = 0x1, HIDDEN = 0x2, GROUP=0x4, WRITEUNIFORM=0x8,
+                WRITEMEMCOMPRESS=0x10, WRITEDISKCOMPRESS=0x20, OUTOFCORE=0x40 };
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    struct FileInfo
+    {
+        FileInfo(): bufpos(0), bytes(0) {}
+        std::streamoff bufpos;
+        Index64 bytes;
+        io::MappedFile::Ptr mapping;
+        boost::shared_ptr<io::StreamMetadata> meta;
+    };
+#endif
+
+    typedef boost::shared_ptr<AttributeArray>           Ptr;
+    typedef boost::shared_ptr<const AttributeArray>     ConstPtr;
+
+    template <typename> friend class AttributeHandle;
+
+    typedef Ptr (*FactoryMethod)(size_t);
+
+    AttributeArray() : mCompressedBytes(0), mFlags(0) {}
+    virtual ~AttributeArray() {}
+
+    /// Return a copy of this attribute.
+    virtual AttributeArray::Ptr copy() const = 0;
+
+    /// Return an uncompressed copy of this attribute (will return a copy if not compressed).
+    virtual AttributeArray::Ptr copyUncompressed() const = 0;
+
+    /// Return the length of this array.
+    virtual size_t size() const = 0;
+
+    /// Return the number of bytes of memory used by this attribute.
+    virtual size_t memUsage() const = 0;
+
+    /// Create a new attribute array of the given (registered) type and length.
+    static Ptr create(const NamePair& type, size_t length);
+    /// Return @c true if the given attribute type name is registered.
+    static bool isRegistered(const NamePair& type);
+    /// Clear the attribute type registry.
+    static void clearRegistry();
+
+    /// Return the name of this attribute's type.
+    virtual const NamePair& type() const = 0;
+    /// Return @c true if this attribute is of the same type as the template parameter.
+    template<typename AttributeArrayType>
+    bool isType() const { return this->type() == AttributeArrayType::attributeType(); }
+
+    /// Return @c true if this attribute has a value type the same as the template parameter
+    template<typename ValueType>
+    bool hasValueType() const { return this->type().first == typeNameAsString<ValueType>();}
+
+    /// Set value at given index @a n from @a sourceIndex of another @a sourceArray
+    virtual void set(const Index n, const AttributeArray& sourceArray, const Index sourceIndex) = 0;
+
+    /// Return @c true if this array is stored as a single uniform value.
+    virtual bool isUniform() const = 0;
+    /// @brief  If this array is uniform, replace it with an array of length size().
+    /// @param  fill if true, assign the uniform value to each element of the array.
+    virtual void expand(bool fill = true) = 0;
+    /// Replace the existing array with a uniform zero value.
+    virtual void collapse() = 0;
+    /// Compact the existing array to become uniform if all values are identical
+    virtual bool compact() = 0;
+
+    /// Return @c true if this array is compressed.
+    bool isCompressed() const { return mCompressedBytes != 0; }
+    /// Compress the attribute array.
+    virtual bool compress() = 0;
+    /// Uncompress the attribute array.
+    virtual bool decompress() = 0;
+
+    /// @brief   Specify whether this attribute should be hidden (e.g., from UI or iterators).
+    /// @details This is useful if the attribute is used for blind data or as scratch space
+    ///          for a calculation.
+    /// @note    Attributes are not hidden by default.
+    void setHidden(bool state);
+    /// Return @c true if this attribute is hidden (e.g., from UI or iterators).
+    bool isHidden() const { return bool(mFlags & HIDDEN); }
+
+    /// @brief Specify whether this attribute should only exist in memory
+    ///        and not be serialized during stream output.
+    /// @note  Attributes are not transient by default.
+    void setTransient(bool state);
+    /// Return @c true if this attribute is not serialized during stream output.
+    bool isTransient() const { return bool(mFlags & TRANSIENT); }
+
+    /// @brief Retrieve the attribute array flags
+    uint16_t flags() const { return mFlags; }
+
+    IndexIter beginIndex() const;
+
+    /// Read attribute metadata and buffers from a stream.
+    virtual void read(std::istream&) = 0;
+    /// Write attribute metadata and buffers to a stream.
+    virtual void write(std::ostream&) const = 0;
+
+    /// Ensures all data is in-core
+    virtual void loadData() const = 0;
+
+    /// Check the compressed bytes and flags. If they are equal, perform a deeper
+    /// comparison check necessary on the inherited types (TypedAttributeArray)
+    /// Requires non operator implementation due to inheritance
+    bool operator==(const AttributeArray& other) const;
+    bool operator!=(const AttributeArray& other) const { return !this->operator==(other); }
+
+private:
+    /// Virtual function used by the comparison operator to perform
+    /// comparisons on inherited types
+    virtual bool isEqual(const AttributeArray& other) const = 0;
+
+protected:
+    /// Obtain an Accessor that stores getter and setter functors.
+    virtual AccessorBasePtr getAccessor() const = 0;
+
+    /// Register an attribute type along with a factory function.
+    static void registerType(const NamePair& type, FactoryMethod);
+    /// Remove an attribute type from the registry.
+    static void unregisterType(const NamePair& type);
+
+    size_t mCompressedBytes;
+    uint16_t mFlags;
+
+    /// Out-of-core data
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    boost::shared_ptr<FileInfo> mFileInfo;
+#endif
+}; // class AttributeArray
+
+
+////////////////////////////////////////
+
+
+/// Type-erased base of Accessor<T>, used where the value type is not available
+struct AttributeArray::AccessorBase { };
+
+/// Templated Accessor stores the typed function pointers that AttributeHandles
+/// bind to (getter, setter, collapser and filler)
+template <typename T>
+struct AttributeArray::Accessor : public AttributeArray::AccessorBase
+{
+    typedef T (*GetterPtr)(const AttributeArray* array, const Index n);
+    typedef void (*SetterPtr)(AttributeArray* array, const Index n, const T& value);
+    typedef void (*ValuePtr)(AttributeArray* array, const T& value);
+
+    Accessor(GetterPtr getter, SetterPtr setter, ValuePtr collapser, ValuePtr filler) :
+        mGetter(getter), mSetter(setter), mCollapser(collapser), mFiller(filler) { }
+
+    GetterPtr mGetter;      ///< reads the value at an index
+    SetterPtr mSetter;      ///< writes the value at an index
+    ValuePtr  mCollapser;   ///< function bound as the "collapse to value" operation
+    ValuePtr  mFiller;      ///< function bound as the "fill with value" operation
+}; // struct AttributeArray::Accessor
+
+
+////////////////////////////////////////
+
+
+/// Typed class for storing attribute data as an array of Codec::StorageType values
+template<typename ValueType_, typename Codec_ = NullAttributeCodec<ValueType_> >
+class TypedAttributeArray: public AttributeArray
+{
+public:
+    typedef boost::shared_ptr<TypedAttributeArray>          Ptr;
+    typedef boost::shared_ptr<const TypedAttributeArray>    ConstPtr;
+
+    typedef ValueType_                  ValueType;
+    typedef Codec_                      Codec;
+    typedef typename Codec::StorageType StorageType;
+
+    //////////
+
+    /// Default constructor, always constructs a uniform attribute.
+    explicit TypedAttributeArray(size_t n = 1,
+        const ValueType& uniformValue = zeroVal<ValueType>());
+    /// Deep copy constructor (optionally decompress during copy).
+    TypedAttributeArray(const TypedAttributeArray&, bool uncompress = false);
+    /// Deep copy assignment operator.
+    TypedAttributeArray& operator=(const TypedAttributeArray&);
+
+    virtual ~TypedAttributeArray() { this->deallocate(); }
+
+    /// Return a copy of this attribute.
+    virtual AttributeArray::Ptr copy() const;
+
+    /// Return an uncompressed copy of this attribute (will just return a copy if not compressed).
+    virtual AttributeArray::Ptr copyUncompressed() const;
+
+    /// Return a new attribute array of the given length @a n with uniform value zero.
+    static Ptr create(size_t n);
+
+    /// Cast an AttributeArray to TypedAttributeArray<T> (throws TypeError on a type mismatch)
+    static TypedAttributeArray& cast(AttributeArray& attributeArray);
+
+    /// Cast a const AttributeArray to TypedAttributeArray<T> (throws TypeError on a type mismatch)
+    static const TypedAttributeArray& cast(const AttributeArray& attributeArray);
+
+    /// Return the name of this attribute's type (includes codec)
+    static const NamePair& attributeType();
+    /// Return the name of this attribute's type.
+    virtual const NamePair& type() const { return attributeType(); }
+
+    /// Return @c true if this attribute type is registered.
+    static bool isRegistered();
+    /// Register this attribute type along with a factory function.
+    static void registerType();
+    /// Remove this attribute type from the registry.
+    static void unregisterType();
+
+    /// Return the length of this array.
+    virtual size_t size() const { return mSize; };
+
+    /// Return the number of bytes of memory used by this attribute.
+    virtual size_t memUsage() const;
+
+    /// Return the value at index @a n (assumes uncompressed and in-core)
+    ValueType getUnsafe(Index n) const;
+    /// Return the value at index @a n
+    ValueType get(Index n) const;
+    /// Return the @a value at index @a n (assumes uncompressed and in-core)
+    template<typename T> void getUnsafe(Index n, T& value) const;
+    /// Return the @a value at index @a n
+    template<typename T> void get(Index n, T& value) const;
+
+    /// Static equivalent to getUnsafe() that static_casts array to this TypedAttributeArray
+    /// (assumes uncompressed and in-core)
+    static ValueType getUnsafe(const AttributeArray* array, const Index n);
+
+    /// Set @a value at the given index @a n (assumes uncompressed and in-core)
+    void setUnsafe(Index n, const ValueType& value);
+    /// Set @a value at the given index @a n
+    void set(Index n, const ValueType& value);
+    /// Set @a value at the given index @a n (assumes uncompressed and in-core)
+    template<typename T> void setUnsafe(Index n, const T& value);
+    /// Set @a value at the given index @a n
+    template<typename T> void set(Index n, const T& value);
+
+    /// Static equivalent to setUnsafe() that static_casts array to this TypedAttributeArray
+    /// (assumes uncompressed and in-core)
+    static void setUnsafe(AttributeArray* array, const Index n, const ValueType& value);
+
+    /// Set value at given index @a n from @a sourceIndex of another @a sourceArray
+    virtual void set(const Index n, const AttributeArray& sourceArray, const Index sourceIndex);
+
+    /// Return @c true if this array is stored as a single uniform value.
+    virtual bool isUniform() const { return mIsUniform; }
+    /// @brief  Replace the single value storage with an array of length size().
+    /// @note   Non-uniform attributes are unchanged.
+    /// @param  fill toggle to initialize the array elements with the pre-expanded value.
+    virtual void expand(bool fill = true);
+    /// Replace the existing array with a uniform zero value.
+    virtual void collapse();
+    /// Compact the existing array to become uniform if all values are identical
+    virtual bool compact();
+
+    /// Replace the existing array with the given uniform value.
+    void collapse(const ValueType& uniformValue);
+    /// @brief Fill the existing array with the given value.
+    /// @note Identical to collapse() except a non-uniform array will not become uniform.
+    void fill(const ValueType& value);
+
+    /// Static equivalent to collapse() that static_casts array to this TypedAttributeArray
+    static void collapse(AttributeArray* array, const ValueType& value);
+    /// Static equivalent to fill() that static_casts array to this TypedAttributeArray
+    static void fill(AttributeArray* array, const ValueType& value);
+
+    /// Compress the attribute array.
+    virtual bool compress();
+    /// Uncompress the attribute array.
+    virtual bool decompress();
+
+    /// Read attribute data from a stream.
+    virtual void read(std::istream& is);
+    /// Write attribute data to a stream.
+    virtual void write(std::ostream& os) const;
+
+    /// Return @c true if this buffer's values have not yet been read from disk.
+    inline bool isOutOfCore() const;
+
+    /// Ensures all data is in-core
+    virtual void loadData() const;
+
+protected:
+    virtual AccessorBasePtr getAccessor() const;
+
+private:
+    /// Load data from memory-mapped file.
+    inline void doLoad() const;
+    /// Load data from memory-mapped file (unsafe as this function is not protected by a mutex).
+    inline void doLoadUnsafe() const;
+
+    /// Toggle out-of-core state
+    inline void setOutOfCore(const bool);
+
+    /// Compare this array's data to another attribute array. Used by the base class comparison operator
+    virtual bool isEqual(const AttributeArray& other) const;
+
+    size_t arrayMemUsage() const;
+    void allocate(const size_t size);
+    void deallocate();
+
+    /// Helper function for use with registerType()
+    static AttributeArray::Ptr factory(size_t n) { return TypedAttributeArray::create(n); }
+
+    static tbb::atomic<const NamePair*> sTypeName;  ///< type name pair, built by attributeType()
+    StorageType*    mData;          ///< data buffer (may hold compressed bytes; null when nothing is loaded)
+    size_t          mSize;          ///< logical element count
+    bool            mIsUniform;     ///< true when a single stored element represents every value
+    tbb::spin_mutex mMutex;         ///< locked around buffer reallocation and loading
+}; // class TypedAttributeArray
+
+
+////////////////////////////////////////
+
+
+/// AttributeHandles provide access to specific TypedAttributeArray methods without needing
+/// to know the compression codec, however these methods also incur the cost of a function pointer
+template <typename T>
+class AttributeHandle
+{
+public:
+    typedef boost::shared_ptr<AttributeHandle<T> > Ptr;
+
+protected:
+    typedef T (*GetterPtr)(const AttributeArray* array, const Index n);
+    typedef void (*SetterPtr)(AttributeArray* array, const Index n, const T& value);
+    typedef void (*ValuePtr)(AttributeArray* array, const T& value);
+
+public:
+    static Ptr create(const AttributeArray& array, const bool preserveCompression = true);
+    /// NOTE(review): preserveCompression presumably selects the use of mLocalArray - confirm.
+    AttributeHandle(const AttributeArray& array, const bool preserveCompression = true);
+
+    bool isUniform() const;
+    /// Return the value at index @a n as type @c T.
+    T get(Index n) const;
+
+protected:
+    const AttributeArray* mArray;   ///< the array this handle reads from
+
+    GetterPtr mGetter;      ///< typed getter function pointer
+    SetterPtr mSetter;      ///< typed setter function pointer
+    ValuePtr  mCollapser;   ///< typed collapse function pointer
+    ValuePtr  mFiller;      ///< typed fill function pointer
+
+private:
+    // local copy of AttributeArray (to preserve compression)
+    AttributeArray::Ptr mLocalArray;
+}; // class AttributeHandle
+
+
+/// Write-able version of AttributeHandle
+template <typename T>
+class AttributeWriteHandle : public AttributeHandle<T>
+{
+public:
+    typedef boost::shared_ptr<AttributeWriteHandle<T> > Ptr;
+
+    static Ptr create(AttributeArray& array);
+    /// Construct a write handle bound to the mutable @a array.
+    AttributeWriteHandle(AttributeArray& array);
+
+    /// @brief  If this array is uniform, replace it with an array of length size().
+    /// @param  fill if true, assign the uniform value to each element of the array.
+    void expand(bool fill = true);
+
+    /// Replace the existing array with a uniform value (zero if none provided).
+    void collapse();
+    void collapse(const T& uniformValue);
+
+    /// Compact the existing array to become uniform if all values are identical
+    bool compact();
+
+    /// @brief Fill the existing array with the given value.
+    /// @note Identical to collapse() except a non-uniform array will not become uniform.
+    void fill(const T& value);
+    /// Set @a value at the given index @a n
+    void set(Index n, const T& value);
+}; // class AttributeWriteHandle
+
+
+typedef AttributeHandle<float> AttributeHandleROF;          ///< read-only float handle
+typedef AttributeWriteHandle<float> AttributeHandleRWF;     ///< read-write float handle
+
+typedef AttributeHandle<Vec3f> AttributeHandleROVec3f;      ///< read-only Vec3f handle
+typedef AttributeWriteHandle<Vec3f> AttributeHandleRWVec3f; ///< read-write Vec3f handle
+
+
+////////////////////////////////////////
+
+
+// Attribute codec implementation
+
+
+/// Null codec decode: converts the stored value to @c ValueType with a plain static_cast.
+template<typename StorageType_>
+template<typename ValueType>
+inline void NullAttributeCodec<StorageType_>::decode(const StorageType& data, ValueType& val)
+{
+    val = static_cast<ValueType>(data);
+}
+
+
+/// Null codec encode: copies @a val into @a data through a static_cast to @c StorageType.
+/// NOTE(review): the parameter types look swapped relative to the other codecs - confirm.
+template<typename StorageType_> template<typename ValueType>
+inline void NullAttributeCodec<StorageType_>::encode(const StorageType& val, ValueType& data)
+{
+    data = static_cast<StorageType>(val);
+}
+
+
+/// Decode fixed-point storage to floating point and undo the +0.5 offset applied by encode().
+template<typename IntType>
+template<typename ValueType>
+inline void
+FixedPointAttributeCodec<IntType>::decode(const StorageType& data, ValueType& val)
+{
+    const ValueType halfOffset(0.5);
+    // the range is shifted to -0.5 => 0.5 as this is most commonly used for position
+    val = fixedPointToFloatingPoint<ValueType>(data);
+    val -= halfOffset;
+}
+
+
+/// Encode a floating-point value as fixed point after offsetting it by +0.5.
+/// The offset moves the -0.5 => 0.5 range (most commonly used for position)
+/// before the value is handed to floatingPointToFixedPoint().
+template<typename IntType>
+template<typename ValueType>
+inline void
+FixedPointAttributeCodec<IntType>::encode(const ValueType& val, StorageType& data)
+{
+    const ValueType shifted = val + ValueType(0.5);
+    data = floatingPointToFixedPoint<StorageType>(shifted);
+}
+
+
+/// Decode a packed unit vector via math::QuantizedUnitVec::unpack().
+template<typename T>
+inline void UnitVecAttributeCodec::decode(const StorageType& data, math::Vec3<T>& val)
+{
+    val = math::QuantizedUnitVec::unpack(data);
+}
+
+
+/// Encode a vector into packed storage via math::QuantizedUnitVec::pack().
+template<typename T>
+inline void UnitVecAttributeCodec::encode(const math::Vec3<T>& val, StorageType& data)
+{
+    data = math::QuantizedUnitVec::pack(val);
+}
+
+
+////////////////////////////////////////
+
+// TypedAttributeArray implementation
+
+template<typename ValueType_, typename Codec_>
+tbb::atomic<const NamePair*> TypedAttributeArray<ValueType_, Codec_>::sTypeName; // lazily initialised by attributeType()
+
+
+/// Construct a uniform array of logical length @a n (clamped to at least 1)
+/// whose single stored element is the encoded @a uniformValue.
+template<typename ValueType_, typename Codec_>
+TypedAttributeArray<ValueType_, Codec_>::TypedAttributeArray(size_t n, const ValueType& uniformValue)
+    : AttributeArray()
+    , mData(new StorageType[1])
+    , mSize(n < size_t(1) ? size_t(1) : n)
+    , mIsUniform(true)
+    , mMutex()
+{
+    Codec::encode(uniformValue, mData[0]);
+}
+
+
+/// Deep copy of @a rhs; with @a uncompress set, compressed data is decompressed if possible.
+template<typename ValueType_, typename Codec_>
+TypedAttributeArray<ValueType_, Codec_>::TypedAttributeArray(const TypedAttributeArray& rhs, bool uncompress)
+    : AttributeArray(rhs)
+    , mData(NULL)
+    , mSize(rhs.mSize)
+    , mIsUniform(rhs.mIsUniform)
+    , mMutex()
+{
+    using attribute_compression::decompress;
+    using attribute_compression::uncompressedSize;
+    // disable uncompress if data is not compressed
+    if (!this->isCompressed())  uncompress = false;
+
+    if (this->isOutOfCore()) {
+        // nothing to copy: read() leaves mData null for delay-loaded arrays, uniform
+        // ones included, so this test must come before rhs.mData is dereferenced
+    } else if (mIsUniform) {
+        this->allocate(1);
+        mData[0] = rhs.mData[0];
+    } else if (this->isCompressed()) {
+        char* buffer = 0;
+        if (uncompress) {
+            rhs.doLoad();
+            const char* charBuffer = reinterpret_cast<char*>(rhs.mData);
+            buffer = decompress(charBuffer, uncompressedSize(charBuffer));
+        }
+        if (buffer)         mCompressedBytes = 0;
+        else {
+            // decompression wasn't requested or failed so deep copy instead
+            buffer = new char[mCompressedBytes];
+            memcpy(buffer, rhs.mData, mCompressedBytes);
+        }
+        assert(buffer);
+        mData = reinterpret_cast<StorageType*>(buffer);
+    } else {
+        this->allocate(mSize);
+        memcpy(mData, rhs.mData, mSize * sizeof(StorageType));
+    }
+}
+
+
+/// Deep copy assignment; returns *this. Out-of-core sources share rhs's file info instead.
+template<typename ValueType_, typename Codec_>
+TypedAttributeArray<ValueType_, Codec_>&
+TypedAttributeArray<ValueType_, Codec_>::operator=(const TypedAttributeArray& rhs)
+{
+    if (&rhs != this) {
+        tbb::spin_mutex::scoped_lock lock(mMutex);
+        this->deallocate();
+        mFlags = rhs.mFlags;
+        mCompressedBytes = rhs.mCompressedBytes;
+        mSize = rhs.mSize;
+        mIsUniform = rhs.mIsUniform;
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+        if (rhs.isOutOfCore()) { // checked first: delay-loaded arrays hold no buffer
+            mFileInfo = rhs.mFileInfo;
+        } else
+#endif
+        if (mIsUniform) {
+            this->allocate(1);
+            mData[0] = rhs.mData[0];
+        } else if (this->isCompressed()) {
+            char* buffer = new char[mCompressedBytes];
+            memcpy(buffer, rhs.mData, mCompressedBytes);
+            mData = reinterpret_cast<StorageType*>(buffer);
+        } else {
+            this->allocate(mSize);
+            memcpy(mData, rhs.mData, mSize * sizeof(StorageType));
+        }
+    }
+    return *this;
+}
+
+
+/// Return the (value type, codec_storage type) name pair, building it on first use.
+template<typename ValueType_, typename Codec_>
+inline const NamePair& TypedAttributeArray<ValueType_, Codec_>::attributeType()
+{
+    if (sTypeName == NULL) {
+        std::ostringstream valueName, codecName;
+        valueName << typeNameAsString<ValueType>();
+        codecName << Codec::name() << "_" << typeNameAsString<StorageType>();
+        NamePair* candidate = new NamePair(valueName.str(), codecName.str());
+        if (sTypeName.compare_and_swap(candidate, NULL) != NULL) delete candidate;
+    }
+    return *sTypeName;
+}
+
+
+/// Query the AttributeArray registry for this array type's name pair.
+template<typename ValueType_, typename Codec_>
+inline bool TypedAttributeArray<ValueType_, Codec_>::isRegistered()
+{
+    return AttributeArray::isRegistered(attributeType());
+}
+
+
+/// Register this array type in the AttributeArray registry with factory() as its creator.
+template<typename ValueType_, typename Codec_>
+inline void TypedAttributeArray<ValueType_, Codec_>::registerType()
+{
+    AttributeArray::registerType(attributeType(), TypedAttributeArray::factory);
+}
+
+
+/// Remove this array type's name pair from the AttributeArray registry.
+template<typename ValueType_, typename Codec_>
+inline void TypedAttributeArray<ValueType_, Codec_>::unregisterType()
+{
+    AttributeArray::unregisterType(attributeType());
+}
+
+
+/// Allocate a new uniform array of length @a n and return it as a shared pointer.
+template<typename ValueType_, typename Codec_>
+inline typename TypedAttributeArray<ValueType_, Codec_>::Ptr TypedAttributeArray<ValueType_, Codec_>::create(size_t n)
+{
+    return Ptr(new TypedAttributeArray<ValueType_, Codec_>(n));
+}
+
+/// Down-cast @a attributeArray to this typed array; throws TypeError on a type mismatch.
+template<typename ValueType_, typename Codec_>
+inline TypedAttributeArray<ValueType_, Codec_>&
+TypedAttributeArray<ValueType_, Codec_>::cast(AttributeArray& attributeArray)
+{
+    const bool sameType = attributeArray.isType<TypedAttributeArray>();
+    if (!sameType)  OPENVDB_THROW(TypeError, "Invalid Attribute Type");
+    return static_cast<TypedAttributeArray&>(attributeArray);
+}
+
+/// Const down-cast of @a attributeArray to this typed array; throws TypeError on a mismatch.
+template<typename ValueType_, typename Codec_>
+inline const TypedAttributeArray<ValueType_, Codec_>&
+TypedAttributeArray<ValueType_, Codec_>::cast(const AttributeArray& attributeArray)
+{
+    const bool sameType = attributeArray.isType<TypedAttributeArray>();
+    if (!sameType)  OPENVDB_THROW(TypeError, "Invalid Attribute Type");
+    return static_cast<const TypedAttributeArray&>(attributeArray);
+}
+
+/// Deep-copy this array (without requesting decompression) behind a base-class pointer.
+template<typename ValueType_, typename Codec_>
+AttributeArray::Ptr TypedAttributeArray<ValueType_, Codec_>::copy() const
+{
+    return AttributeArray::Ptr(new TypedAttributeArray(*this));
+}
+
+
+/// Deep-copy this array, asking the copy constructor to decompress the data.
+template<typename ValueType_, typename Codec_>
+AttributeArray::Ptr TypedAttributeArray<ValueType_, Codec_>::copyUncompressed() const
+{
+    return AttributeArray::Ptr(new TypedAttributeArray(*this, /*uncompress = */true));
+}
+
+
+/// Bytes held by the data buffer: one element when uniform, zero when out-of-core,
+/// the compressed byte count when compressed, otherwise mSize elements.
+template<typename ValueType_, typename Codec_>
+size_t TypedAttributeArray<ValueType_, Codec_>::arrayMemUsage() const
+{
+    const size_t elementBytes = sizeof(StorageType);
+    if (mIsUniform)             return elementBytes;
+    if (this->isOutOfCore())    return 0;
+    return this->isCompressed() ? mCompressedBytes : mSize * elementBytes;
+}
+
+
+/// Allocate a fresh buffer of @a size elements (asserts that no buffer is currently held).
+template<typename ValueType_, typename Codec_>
+void TypedAttributeArray<ValueType_, Codec_>::allocate(const size_t size)
+{
+    assert(mData == NULL);
+    mData = new StorageType[size];
+}
+
+
+/// Release the data buffer and, for delay-loaded arrays, drop the file association.
+template<typename ValueType_, typename Codec_>
+void TypedAttributeArray<ValueType_, Codec_>::deallocate()
+{
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    if (this->isOutOfCore()) {
+        // delay-loaded: clear the out-of-core flag and detach from the file
+        this->setOutOfCore(false);
+        mFileInfo.reset();
+    }
+#endif
+
+    // deleting a null pointer is a no-op, so no guard is needed
+    delete[] mData;
+    mData = NULL;
+}
+
+
+/// Size of this object plus the data buffer's footprint (zero when no buffer is held).
+template<typename ValueType_, typename Codec_>
+size_t TypedAttributeArray<ValueType_, Codec_>::memUsage() const
+{
+    return mData ? sizeof(*this) + this->arrayMemUsage() : sizeof(*this);
+}
+
+
+/// Decode element @a n (element 0 when uniform); compression/out-of-core are only asserted.
+template<typename ValueType_, typename Codec_>
+typename TypedAttributeArray<ValueType_, Codec_>::ValueType
+TypedAttributeArray<ValueType_, Codec_>::getUnsafe(Index n) const
+{
+    assert(!this->isCompressed());
+    assert(!this->isOutOfCore());
+    ValueType decoded;
+    Codec::decode(/*in=*/mData[mIsUniform ? 0 : n], /*out=*/decoded);
+    return decoded;
+}
+
+
+/// Return element @a n, first decompressing or loading the data if necessary.
+template<typename ValueType_, typename Codec_>
+typename TypedAttributeArray<ValueType_, Codec_>::ValueType
+TypedAttributeArray<ValueType_, Codec_>::get(Index n) const
+{
+    if (this->isCompressed())       const_cast<TypedAttributeArray*>(this)->decompress();
+    else if (this->isOutOfCore())   this->doLoad();
+    return this->getUnsafe(n);
+}
+
+
+/// Decode element @a n (element 0 when uniform) and convert it to @c T via static_cast.
+template<typename ValueType_, typename Codec_>
+template<typename T>
+void TypedAttributeArray<ValueType_, Codec_>::getUnsafe(Index n, T& val) const
+{
+    assert(!this->isCompressed());
+    assert(!this->isOutOfCore());
+
+    ValueType decoded;
+    Codec::decode(/*in=*/mData[mIsUniform ? 0 : n], /*out=*/decoded);
+    val = static_cast<T>(decoded);
+}
+
+
+/// Read element @a n into @a val, decompressing or loading the data first if needed.
+template<typename ValueType_, typename Codec_>
+template<typename T>
+void TypedAttributeArray<ValueType_, Codec_>::get(Index n, T& val) const
+{
+    if (this->isCompressed())       const_cast<TypedAttributeArray*>(this)->decompress();
+    else if (this->isOutOfCore())   this->doLoad();
+
+    this->getUnsafe(n, val);
+}
+
+
+template<typename ValueType_, typename Codec_>
+typename TypedAttributeArray<ValueType_, Codec_>::ValueType
+TypedAttributeArray<ValueType_, Codec_>::getUnsafe(const AttributeArray* array, const Index n)
+{
+    return static_cast<const TypedAttributeArray*>(array)->getUnsafe(n); // unchecked down-cast
+}
+
+
+/// Encode @a val into element @a n, expanding a uniform array first.
+/// The array must be uncompressed and in-core (asserted only).
+template<typename ValueType_, typename Codec_>
+void TypedAttributeArray<ValueType_, Codec_>::setUnsafe(Index n, const ValueType& val)
+{
+    assert(!this->isCompressed());
+    assert(!this->isOutOfCore());
+    // a uniform array stores a single element, so it needs full storage before writing
+    if (mIsUniform)     this->expand();
+    Codec::encode(/*in=*/val, /*out=*/mData[n]);
+}
+
+
+/// Write @a val to element @a n, decompressing or loading the data first if needed.
+template<typename ValueType_, typename Codec_>
+void TypedAttributeArray<ValueType_, Codec_>::set(Index n, const ValueType& val)
+{
+    if (this->isCompressed())       this->decompress();
+    else if (this->isOutOfCore())   this->doLoad();
+
+    this->setUnsafe(n, val);
+}
+
+
+/// Convert @a val to ValueType and encode it into element @a n (expands uniform arrays).
+template<typename ValueType_, typename Codec_>
+template<typename T>
+void TypedAttributeArray<ValueType_, Codec_>::setUnsafe(Index n, const T& val)
+{
+    assert(!this->isCompressed());
+    assert(!this->isOutOfCore());
+
+    if (mIsUniform)     this->expand();
+
+    const ValueType converted = static_cast<ValueType>(val);
+    Codec::encode(/*in=*/converted, /*out=*/mData[n]);
+}
+
+
+/// Write @a val (of any convertible type) to element @a n, decompressing or loading first.
+template<typename ValueType_, typename Codec_>
+template<typename T>
+void TypedAttributeArray<ValueType_, Codec_>::set(Index n, const T& val)
+{
+    if (this->isCompressed())       this->decompress();
+    else if (this->isOutOfCore())   this->doLoad();
+
+    this->setUnsafe(n, val);
+}
+
+
+/// Static forwarder to setUnsafe(); @a array is down-cast without a type check.
+template<typename ValueType_, typename Codec_>
+void TypedAttributeArray<ValueType_, Codec_>::setUnsafe(AttributeArray* array, const Index n, const ValueType& value)
+{
+    static_cast<TypedAttributeArray*>(array)->setUnsafe(n, value);
+}
+
+
+/// Copy element @a sourceIndex of @a sourceArray into element @a n of this array.
+/// NOTE(review): @a sourceArray is static_cast without a type check - callers presumably
+/// guarantee that it has the same value type and codec; confirm.
+template<typename ValueType_, typename Codec_>
+void
+TypedAttributeArray<ValueType_, Codec_>::set(Index n, const AttributeArray& sourceArray, const Index sourceIndex)
+{
+    ValueType sourceValue;
+    static_cast<const TypedAttributeArray&>(sourceArray).get(sourceIndex, sourceValue);
+    this->set(n, sourceValue);
+}
+
+
+/// Replace the single uniform element with a full array of mSize elements.
+/// Non-uniform arrays are left untouched. With @a fill set, every element is
+/// assigned the stored (still encoded) uniform value.
+template<typename ValueType_, typename Codec_>
+void TypedAttributeArray<ValueType_, Codec_>::expand(bool fill)
+{
+    if (!mIsUniform)    return;
+
+    // remember the encoded uniform value before its storage is released
+    const StorageType uniformValue = mData[0];
+    {
+        tbb::spin_mutex::scoped_lock lock(mMutex);
+        this->deallocate();
+        this->allocate(mSize);
+    }
+    mCompressedBytes = 0;
+    mIsUniform = false;
+
+    if (!fill)  return;
+    for (size_t i = 0; i < mSize; ++i)  mData[i] = uniformValue;
+}
+
+
+/// Collapse the array to a uniform value when every element compares equal.
+/// @return true if the array is uniform on return, false if any element differs.
+template<typename ValueType_, typename Codec_>
+bool TypedAttributeArray<ValueType_, Codec_>::compact()
+{
+    if (mIsUniform)     return true;
+    // a single element differing from the first one rules out compaction
+    const ValueType_ first = this->get(0);
+    const size_t count = this->size();
+    for (size_t i = 1; i < count; ++i) {
+        if (this->get(i) != first)  return false;
+    }
+    this->collapse(first);
+    return true;
+}
+
+
+/// Collapse the array to the uniform zero value of ValueType.
+template<typename ValueType_, typename Codec_>
+void TypedAttributeArray<ValueType_, Codec_>::collapse()
+{
+    this->collapse(zeroVal<ValueType>());
+}
+
+
+/// Replace the array with one uniform element; also clears the stale compressed size.
+template<typename ValueType_, typename Codec_>
+void TypedAttributeArray<ValueType_, Codec_>::collapse(const ValueType& uniformValue)
+{
+    if (!mIsUniform) {
+        tbb::spin_mutex::scoped_lock lock(mMutex);
+        this->deallocate();
+        this->allocate(1);
+        mIsUniform = true; mCompressedBytes = 0; // the new one-element buffer is never compressed
+    }
+    Codec::encode(uniformValue, mData[0]);
+}
+
+
+/// Static forwarder to collapse(value); @a array is down-cast without a type check.
+template<typename ValueType_, typename Codec_>
+void TypedAttributeArray<ValueType_, Codec_>::collapse(AttributeArray* array, const ValueType& value)
+{
+    static_cast<TypedAttributeArray*>(array)->collapse(value);
+}
+
+
+/// Overwrite every stored element with @a value without changing uniformity.
+/// Compressed or out-of-core data is discarded (not loaded) and replaced by a
+/// fresh uncompressed buffer, since the old contents are about to be overwritten.
+template<typename ValueType_, typename Codec_>
+void TypedAttributeArray<ValueType_, Codec_>::fill(const ValueType& value)
+{
+    const size_t size = mIsUniform ? 1 : mSize;
+    if (this->isOutOfCore() || this->isCompressed()) {
+        tbb::spin_mutex::scoped_lock lock(mMutex);
+        this->deallocate();
+        this->allocate(size);
+        mCompressedBytes = 0;
+    }
+    for (size_t i = 0; i < size; ++i)   Codec::encode(value, mData[i]);
+}
+
+
+/// Static forwarder to fill(value); @a array is down-cast without a type check.
+template<typename ValueType_, typename Codec_>
+void TypedAttributeArray<ValueType_, Codec_>::fill(AttributeArray* array, const ValueType& value)
+{
+    static_cast<TypedAttributeArray*>(array)->fill(value);
+}
+
+
+/// Compress the in-memory buffer with attribute_compression::compress().
+/// @return true only if compression took place; false when canCompress() reports
+/// false, when the array is uniform or already compressed, or when compress() fails.
+template<typename ValueType_, typename Codec_>
+inline bool
+TypedAttributeArray<ValueType_, Codec_>::compress()
+{
+    using attribute_compression::canCompress;
+    using attribute_compression::compress;
+
+    // nothing to do for uniform or already-compressed data
+    if (!canCompress())                         return false;
+    if (mIsUniform || this->isCompressed())     return false;
+
+    tbb::spin_mutex::scoped_lock lock(mMutex);
+
+    // make sure delay-loaded data is in memory before compressing it
+    this->doLoadUnsafe();
+
+    const size_t typeSize = sizeof(typename Codec_::StorageType);
+    const int inBytes = int(mSize * sizeof(StorageType));
+    int outBytes;
+    char* charBuffer = reinterpret_cast<char*>(mData);
+    char* buffer = compress(charBuffer, typeSize, inBytes, outBytes, /*cleanup=*/true);
+    if (!buffer)    return false;
+
+    mData = reinterpret_cast<StorageType*>(buffer);
+    mCompressedBytes = size_t(outBytes);
+    return true;
+}
+
+
+/// Decompress the buffer while holding the mutex.
+/// @return true if data was decompressed, false if it was not compressed or
+/// attribute_compression::decompress() returned null.
+/// NOTE(review): the compressed buffer is not freed here - confirm that decompress() releases it.
+template<typename ValueType_, typename Codec_>
+inline bool TypedAttributeArray<ValueType_, Codec_>::decompress()
+{
+    using attribute_compression::decompress;
+    using attribute_compression::uncompressedSize;
+
+    tbb::spin_mutex::scoped_lock lock(mMutex);
+    if (!this->isCompressed())  return false;
+
+    this->doLoadUnsafe();
+    char* charBuffer = reinterpret_cast<char*>(this->mData);
+    char* buffer = decompress(charBuffer, uncompressedSize(charBuffer));
+    if (!buffer)    return false;
+
+    mData = reinterpret_cast<StorageType*>(buffer);
+    mCompressedBytes = 0;
+    return true;
+}
+
+
+/// True when the OUTOFCORE flag is set; always false in OpenVDB 2 ABI-compatible builds.
+template<typename ValueType_, typename Codec_>
+bool TypedAttributeArray<ValueType_, Codec_>::isOutOfCore() const
+{
+#ifdef OPENVDB_2_ABI_COMPATIBLE
+    return false;
+#else
+    return (mFlags & OUTOFCORE) != 0;
+#endif
+}
+
+
+/// Set or clear the OUTOFCORE flag; a no-op in OpenVDB 2 ABI-compatible builds.
+template<typename ValueType_, typename Codec_>
+void TypedAttributeArray<ValueType_, Codec_>::setOutOfCore(const bool b)
+{
+#ifdef OPENVDB_2_ABI_COMPATIBLE
+    (void) b;
+#else
+    if (b)  mFlags |= OUTOFCORE;
+    else    mFlags &= ~OUTOFCORE;
+#endif
+}
+
+
+/// Load delay-loaded data under the mutex; returns immediately if the array is in-core.
+template<typename ValueType_, typename Codec_>
+void TypedAttributeArray<ValueType_, Codec_>::doLoad() const
+{
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    if (this->isOutOfCore()) {
+        // mMutex is not mutable, so constness is cast away to lock it.
+        // This lock will be contended at most once, after which this buffer
+        // will no longer be out-of-core.
+        tbb::spin_mutex& mutex = const_cast<TypedAttributeArray*>(this)->mMutex;
+        tbb::spin_mutex::scoped_lock lock(mutex);
+        this->doLoadUnsafe();
+    }
+#endif
+}
+
+
+/// Public entry point that makes sure the data is in-core (forwards to doLoad()).
+template<typename ValueType_, typename Codec_>
+void TypedAttributeArray<ValueType_, Codec_>::loadData() const
+{
+    this->doLoad();
+}
+
+
+template<typename ValueType_, typename Codec_>
+void
+TypedAttributeArray<ValueType_, Codec_>::read(std::istream& is)
+{
+    using attribute_compression::decompress;
+    // Deserialise this array from @a is using the layout produced by write().
+    // header: total byte count, then flags, then element count
+    // (the stored byte count includes the flags and size fields, hence the subtraction)
+    Index64 bytes = Index64(0);
+    is.read(reinterpret_cast<char*>(&bytes), sizeof(Index64));
+    bytes = bytes - /*flags*/sizeof(Int16) - /*size*/sizeof(Index64);
+
+    Int16 flags = Int16(0);
+    is.read(reinterpret_cast<char*>(&flags), sizeof(Int16));
+    mFlags = flags;
+
+    Index64 size = Index64(0);
+    is.read(reinterpret_cast<char*>(&size), sizeof(Index64));
+    mSize = size;
+    // NOTE(review): the stream state is not checked after any of these reads - confirm callers validate it
+    char* buffer = new char[bytes];
+
+    // read uniform and compressed state
+
+    mIsUniform = mFlags & WRITEUNIFORM;
+    mCompressedBytes = mFlags & WRITEMEMCOMPRESS ? bytes : Index64(0);
+
+    // clear uniform and compress flags (they describe the stream, not the in-memory state)
+
+    mFlags &= Int16(~WRITEUNIFORM & ~WRITEMEMCOMPRESS);
+
+    tbb::spin_mutex::scoped_lock lock(mMutex);
+    // release any buffer (and file association) held from before this read
+    this->deallocate();
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    // If this array is being read from a memory-mapped file, delay loading of its data
+    // until the data is actually accessed.
+    io::MappedFile::Ptr mappedFile = io::getMappedFilePtr(is);
+    const bool delayLoad = (mappedFile.get() != NULL);
+
+    if (delayLoad) {
+        this->setOutOfCore(true);
+        mFileInfo.reset(new FileInfo);
+        mFileInfo->bufpos = is.tellg();
+        mFileInfo->mapping = mappedFile;
+        mFileInfo->bytes = bytes;
+        mFileInfo->meta = io::getStreamMetadataPtr(is);
+
+        // read and discard buffer (mData stays null; the stream still advances past the payload)
+        is.read(buffer, bytes);
+        delete[] buffer;
+        return;
+    }
+#endif
+
+    is.read(buffer, bytes);
+
+    // compressed on-disk
+
+    if (mFlags & WRITEDISKCOMPRESS) {
+
+        // decompress buffer; inBytes is the size computed from mSize and the storage type
+
+        const int inBytes = int(mSize * sizeof(StorageType));
+        char* newBuffer = decompress(buffer, inBytes, /*cleanup=*/true);
+        if (newBuffer)  buffer = newBuffer;
+    }
+
+    // set data to buffer (mData now owns it)
+
+    mData = reinterpret_cast<StorageType*>(buffer);
+
+    // clear the remaining write flag
+
+    mFlags &= Int16(~WRITEDISKCOMPRESS);
+}
+
+
+template<typename ValueType_, typename Codec_>
+void
+TypedAttributeArray<ValueType_, Codec_>::write(std::ostream& os) const
+{
+    // Layout: total bytes (Index64), flags (Int16), size (Index64), payload. Transient arrays write nothing.
+    using attribute_compression::compress;
+
+    if (this->isTransient())    return;
+
+    // Load first, then snapshot mFlags: loading out-of-core data clears WRITEDISKCOMPRESS and
+    // OUTOFCORE (see doLoadUnsafe), and an earlier snapshot would serialize those stale bits.
+    this->doLoad();
+
+    Int16 flags(mFlags);
+    Index64 size(mSize);
+    boost::scoped_array<char> compressedBuffer;
+    int compressedBytes = 0;
+
+    if (mIsUniform)
+    {
+        flags |= WRITEUNIFORM;
+    }
+    else if (this->isCompressed())
+    {
+        flags |= WRITEMEMCOMPRESS;
+    }
+    else if (io::getDataCompression(os) & io::COMPRESS_BLOSC)
+    {
+        const char* charBuffer = reinterpret_cast<const char*>(mData);
+        const size_t typeSize = sizeof(typename Codec_::StorageType);
+        const int inBytes = int(mSize * sizeof(StorageType));
+        compressedBuffer.reset(compress(charBuffer, typeSize, inBytes, compressedBytes));
+        if (compressedBuffer)   flags |= WRITEDISKCOMPRESS; // null result: payload goes out raw
+    }
+
+    // the byte count covers the flags and size fields as well as the payload
+    Index64 bytes = /*flags*/ sizeof(Int16) + /*size*/ sizeof(Index64);
+    bytes += compressedBuffer ? compressedBytes : this->arrayMemUsage();
+
+    os.write(reinterpret_cast<const char*>(&bytes), sizeof(Index64));
+    os.write(reinterpret_cast<const char*>(&flags), sizeof(Int16));
+    os.write(reinterpret_cast<const char*>(&size), sizeof(Index64));
+
+    if (compressedBuffer)   os.write(reinterpret_cast<const char*>(compressedBuffer.get()), compressedBytes);
+    else                    os.write(reinterpret_cast<const char*>(mData), this->arrayMemUsage());
+}
+
+
+template<typename ValueType_, typename Codec_>
+void
+TypedAttributeArray<ValueType_, Codec_>::doLoadUnsafe() const
+{
+    // Bring a delay-loaded array into memory from its memory-mapped file and clear the
+    // WRITEDISKCOMPRESS and OUTOFCORE flags. Does nothing when the array is not
+    // out-of-core, or when built with OPENVDB_2_ABI_COMPATIBLE. Caller holds the mutex.
+    using attribute_compression::decompress;
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    typedef TypedAttributeArray<ValueType_, Codec_> ThisType;
+
+    if (!(this->isOutOfCore()))     return;
+
+    // loading rewrites members from inside a const method
+    ThisType* mutableThis = const_cast<ThisType*>(this);
+
+    // read() records the mapping and buffer position when it defers a load
+    assert(mutableThis->mFileInfo);
+    assert(mutableThis->mFileInfo->mapping.get() != NULL);
+
+    FileInfo& fileInfo = *(mutableThis->mFileInfo);
+
+    // open a stream on the mapped file and seek to the recorded payload position
+    boost::shared_ptr<std::streambuf> streamBuf = fileInfo.mapping->createBuffer();
+    std::istream input(streamBuf.get());
+
+    const Index64 payloadBytes = fileInfo.bytes;
+    input.seekg(fileInfo.bufpos);
+
+    char* payload = new char[payloadBytes];
+    input.read(payload, payloadBytes);
+
+    // compressed on disk: switch to the decompressed buffer, or keep the bytes as
+    // read if decompress() returns null
+    if (mFlags & WRITEDISKCOMPRESS) {
+        const int inBytes = int(mSize * sizeof(StorageType));
+        char* inflated = decompress(payload, inBytes, /*cleanup=*/true);
+        if (inflated)  payload = inflated;
+    }
+
+    // the array takes over the buffer
+    mutableThis->mData = reinterpret_cast<StorageType*>(payload);
+
+    // clear write and out-of-core flags
+    mutableThis->mFlags &= Int16(~WRITEDISKCOMPRESS & ~OUTOFCORE);
+#endif
+}
+
+
+template<typename ValueType_, typename Codec_>
+AttributeArray::AccessorBasePtr
+TypedAttributeArray<ValueType_, Codec_>::getAccessor() const
+{
+    // Package the getUnsafe/setUnsafe/collapse/fill entry points for attribute handles.
+    // The faster 'unsafe' variants are used because attribute handles ensure the data
+    // is uncompressed and in-core when they are constructed.
+    typedef TypedAttributeArray<ValueType_, Codec_> ArrayT;
+    typedef AttributeArray::Accessor<ValueType_>    AccessorT;
+
+    return AccessorBasePtr(new AccessorT(
+        &ArrayT::getUnsafe, &ArrayT::setUnsafe, &ArrayT::collapse, &ArrayT::fill));
+}
+
+
+template<typename ValueType_, typename Codec_>
+bool
+TypedAttributeArray<ValueType_, Codec_>::isEqual(const AttributeArray& other) const
+{
+    // Equal: same concrete type, size and uniformity, and exactly equal elements (only the first if uniform).
+    const TypedAttributeArray<ValueType_, Codec_>* const otherT = dynamic_cast<const TypedAttributeArray<ValueType_, Codec_>* >(&other);
+    if(!otherT) return false;
+    if(this->mSize != otherT->mSize || this->mIsUniform != otherT->mIsUniform ||
+       *this->sTypeName != *otherT->sTypeName) return false;
+    // Load BOTH operands: a delay-loaded 'other' has no in-core buffer to compare against yet.
+    this->doLoad();
+    otherT->doLoad();
+    const StorageType *target = this->mData, *source = otherT->mData;
+    if (!target && !source) return true;
+    if (!target || !source) return false;
+    Index n = this->mIsUniform ? 1 : mSize;
+    while (n && math::isExactlyEqual(*target++, *source++)) --n;
+    return n == 0;
+}
+
+////////////////////////////////////////
+
+// AttributeHandle implementation
+
+template <typename T>
+typename AttributeHandle<T>::Ptr
+AttributeHandle<T>::create(const AttributeArray& array, const bool preserveCompression)
+{   // Factory: heap-allocates a read handle for @a array and returns it wrapped in Ptr.
+    return typename AttributeHandle<T>::Ptr(new AttributeHandle<T>(array, preserveCompression));
+}
+
+template <typename T>
+AttributeHandle<T>::AttributeHandle(const AttributeArray& array, const bool preserveCompression)
+    : mArray(&array)
+{
+    // Three steps: make the data resident, make it uncompressed (in a handle-owned copy
+    // when @a preserveCompression is set, otherwise in place on @a array), then cache
+    // the getter/setter/collapser/filler exposed by the array's accessor.
+    typedef AttributeArray::Accessor<T> AccessorT;
+
+    // 1. load data if delay-loaded
+    mArray->loadData();
+
+    // 2. handle in-memory compression
+    const bool compressed = array.isCompressed();
+    if (compressed && preserveCompression) {
+        // read from a decompressed local copy that is destroyed with the handle
+        mLocalArray = array.copyUncompressed();
+        mLocalArray->decompress();
+        mArray = mLocalArray.get();
+    }
+    else if (compressed) {
+        // decompress the caller's array itself, casting away its constness
+        const_cast<AttributeArray*>(mArray)->decompress();
+    }
+
+    // 3. bind getter, setter, collapser and filler
+    AttributeArray::AccessorBasePtr accessor = mArray->getAccessor();
+    assert(accessor);
+    AccessorT* typedAccessor = static_cast<AccessorT*>(accessor.get());
+    // NOTE(review): static_cast only yields null when accessor itself is null, so this
+    // guard cannot detect a type mismatch -- confirm whether a checked cast was intended.
+    if (!typedAccessor) {
+        OPENVDB_THROW(RuntimeError, "Cannot bind AttributeHandle due to mis-matching types.");
+    }
+    mGetter    = typedAccessor->mGetter;
+    mSetter    = typedAccessor->mSetter;
+    mCollapser = typedAccessor->mCollapser;
+    mFiller    = typedAccessor->mFiller;
+}
+
+
+template <typename T>
+T AttributeHandle<T>::get(Index n) const
+{   // Return element @a n of the array this handle is bound to.
+    return mGetter(mArray, n); // mGetter is copied from the array's accessor by the constructor
+}
+
+template <typename T>
+bool AttributeHandle<T>::isUniform() const
+{   // Report whether the bound array is uniform.
+    return mArray->isUniform(); // mArray is the handle's local copy when the constructor made one
+}
+
+////////////////////////////////////////
+
+// AttributeWriteHandle implementation
+
+template <typename T>
+typename AttributeWriteHandle<T>::Ptr
+AttributeWriteHandle<T>::create(AttributeArray& array)
+{   // Factory: heap-allocates a write handle for @a array and returns it wrapped in Ptr.
+    return typename AttributeWriteHandle<T>::Ptr(new AttributeWriteHandle<T>(array));
+}
+
+template <typename T>
+AttributeWriteHandle<T>::AttributeWriteHandle(AttributeArray& array)
+    : AttributeHandle<T>(array, /*preserveCompression = */ false) { } // false: the base decompresses @a array itself, never a copy
+
+template <typename T>
+void AttributeWriteHandle<T>::set(Index n, const T& value)
+{   // Store @a value at index @a n through the setter bound by the base class.
+    this->mSetter(const_cast<AttributeArray*>(this->mArray), n, value); // the handle was built from a non-const array
+}
+
+template <typename T>
+void AttributeWriteHandle<T>::expand(const bool fill)
+{   // Forward to AttributeArray::expand(fill) on the bound array.
+    const_cast<AttributeArray*>(this->mArray)->expand(fill); // the handle was built from a non-const array
+}
+
+template <typename T>
+void AttributeWriteHandle<T>::collapse()
+{   // Forward to the argument-less AttributeArray::collapse() on the bound array.
+    const_cast<AttributeArray*>(this->mArray)->collapse(); // the handle was built from a non-const array
+}
+
+template <typename T>
+bool AttributeWriteHandle<T>::compact()
+{   // Forward to AttributeArray::compact() on the bound array and pass its result through.
+    return const_cast<AttributeArray*>(this->mArray)->compact(); // the handle was built from a non-const array
+}
+
+template <typename T>
+void AttributeWriteHandle<T>::collapse(const T& uniformValue)
+{   // Collapse the bound array to @a uniformValue through the collapser bound by the base class.
+    this->mCollapser(const_cast<AttributeArray*>(this->mArray), uniformValue); // the handle was built from a non-const array
+}
+
+template <typename T>
+void AttributeWriteHandle<T>::fill(const T& value)
+{   // Fill the bound array with @a value through the filler bound by the base class.
+    this->mFiller(const_cast<AttributeArray*>(this->mArray), value); // the handle was built from a non-const array
+}
+
+} // namespace tools
+
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+
+#endif // OPENVDB_TOOLS_ATTRIBUTE_ARRAY_HAS_BEEN_INCLUDED
+
+
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb/tools/AttributeGroup.h b/nuparu/include/openvdb/tools/AttributeGroup.h
new file mode 100644
index 00000000..830be73a
--- /dev/null
+++ b/nuparu/include/openvdb/tools/AttributeGroup.h
@@ -0,0 +1,209 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of Double Negative Visual Effects nor the names
+// of its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file AttributeGroup.h
+///
+/// @authors Dan Bailey
+///
+/// @brief  Attribute Group access and filtering for iteration.
+///
+
+
+#ifndef OPENVDB_TOOLS_ATTRIBUTE_GROUP_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_ATTRIBUTE_GROUP_HAS_BEEN_INCLUDED
+
+#include <openvdb_points/tools/AttributeArray.h>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+
+typedef uint8_t GroupType;
+
+
+////////////////////////////////////////
+
+
+class GroupAttributeArray : public TypedAttributeArray<GroupType, NullAttributeCodec<GroupType> >
+{   // A TypedAttributeArray of GroupType (uint8_t) elements that can additionally be flagged as a group.
+public:
+    /// Default constructor, always constructs a uniform attribute.
+    explicit GroupAttributeArray(   size_t n = 1,
+                                    const ValueType& uniformValue = zeroVal<ValueType>());
+    /// Deep copy constructor (optionally decompress during copy).
+    GroupAttributeArray(const GroupAttributeArray& array,
+                        const bool decompress = false);
+
+    /// Cast an AttributeArray to GroupAttributeArray (throws TypeError if it is not one).
+    static GroupAttributeArray& cast(AttributeArray& attributeArray);
+
+    /// Cast a const AttributeArray to GroupAttributeArray (throws TypeError if it is not one).
+    static const GroupAttributeArray& cast(const AttributeArray& attributeArray);
+
+    /// Return @c true if the AttributeArray provided is a GroupAttributeArray flagged as a group.
+    static bool isGroup(const AttributeArray& attributeArray);
+
+    /// @brief Specify whether this attribute is for tracking group membership.
+    /// @note  Attributes are not group attributes by default.
+    void setGroup(bool state);
+    /// Return @c true if this attribute is for tracking groups (GROUP bit of mFlags is set).
+    bool isGroup() const { return bool(mFlags & GROUP); }
+
+}; // class GroupAttributeArray
+
+
+/// Checked downcast: throws TypeError unless @a attributeArray is a GroupAttributeArray.
+inline GroupAttributeArray&
+GroupAttributeArray::cast(AttributeArray& attributeArray)
+{
+    const bool isGroupArray = attributeArray.isType<GroupAttributeArray>();
+    if (!isGroupArray)  OPENVDB_THROW(TypeError, "Invalid Attribute Type");
+    return static_cast<GroupAttributeArray&>(attributeArray);
+}
+
+
+/// Checked const downcast: throws TypeError unless @a attributeArray is a GroupAttributeArray.
+inline const GroupAttributeArray&
+GroupAttributeArray::cast(const AttributeArray& attributeArray)
+{
+    const bool isGroupArray = attributeArray.isType<GroupAttributeArray>();
+    if (!isGroupArray)  OPENVDB_THROW(TypeError, "Invalid Attribute Type");
+    return static_cast<const GroupAttributeArray&>(attributeArray);
+}
+
+
+/// True only when @a attributeArray is a GroupAttributeArray whose own isGroup() is true.
+inline bool
+GroupAttributeArray::isGroup(const AttributeArray& attributeArray)
+{
+    return attributeArray.isType<GroupAttributeArray>()
+        && GroupAttributeArray::cast(attributeArray).isGroup();
+}
+
+
+////////////////////////////////////////
+
+
+class GroupHandle
+{   // Read-only group-membership view over a GroupAttributeArray.
+public:
+    // Dummy class that distinguishes an offset from a bitmask on construction
+    struct BitMask { };
+
+    typedef std::pair<size_t, uint8_t> GroupIndex;
+    /// Construct from a group @a offset, or (BitMask tag overload) from an explicit @a bitMask.
+    GroupHandle(const GroupAttributeArray& array, const GroupType& offset);
+    GroupHandle(const GroupAttributeArray& array, const GroupType& bitMask, BitMask);
+    /// Size and uniformity are forwarded from the referenced array.
+    size_t size() const { return mArray.size(); }
+    bool isUniform() const { return mArray.isUniform(); }
+    /// NOTE(review): presumably tests element @a n against mBitMask; the definition is not shown here.
+    bool get(Index n) const;
+
+protected:
+    const GroupAttributeArray& mArray; // held by reference: the array must outlive the handle
+    const GroupType mBitMask;
+}; // class GroupHandle
+
+
+////////////////////////////////////////
+
+
+class GroupWriteHandle : public GroupHandle
+{
+public:
+    /// Construct a writable handle on @a array for the group at @a offset.
+    GroupWriteHandle(GroupAttributeArray& array, const GroupType& offset);
+    /// Set group membership of element @a n (@a on is true for inclusion in the group).
+    void set(Index n, bool on);
+
+    /// @brief Set membership for the whole array and attempt to collapse.
+    ///
+    /// @param on True or false for inclusion in group
+    ///
+    /// @note This method guarantees that all attributes will have group membership
+    /// changed according to the input bool, however compaction will not be performed
+    /// if other groups that share the same underlying array are non-uniform.
+    /// @return @c true if the group array ends up being uniform.
+    bool collapse(bool on);
+
+}; // class GroupWriteHandle
+
+
+////////////////////////////////////////
+
+
+/// Index filtering on group membership
+class GroupFilter
+{
+public:
+    struct Data // creation-time parameters: the name that create() passes to leaf.groupHandle()
+    {
+        Data(const Name& _attribute)
+            : attribute(_attribute) { }
+        const Name attribute;
+    };
+    /// Wrap an existing group handle; the filter keeps its own copy of it.
+    GroupFilter(const GroupHandle& handle)
+        : mHandle(handle) { }
+    /// Build a filter from the handle returned by @a leaf.groupHandle(data.attribute).
+    template <typename LeafT>
+    static GroupFilter create(const LeafT& leaf, const Data& data) {
+        return GroupFilter(leaf.groupHandle(data.attribute));
+    }
+    /// Return the handle's get() result for the index that @a iter dereferences to.
+    template <typename IterT>
+    bool valid(const IterT& iter) const {
+        return mHandle.get(*iter);
+    }
+
+private:
+    const GroupHandle mHandle;
+}; // class GroupFilter
+
+
+////////////////////////////////////////
+
+
+} // namespace tools
+
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+
+#endif // OPENVDB_TOOLS_ATTRIBUTE_GROUP_HAS_BEEN_INCLUDED
+
+
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb/tools/AttributeSet.h b/nuparu/include/openvdb/tools/AttributeSet.h
new file mode 100644
index 00000000..3bb5aafd
--- /dev/null
+++ b/nuparu/include/openvdb/tools/AttributeSet.h
@@ -0,0 +1,405 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of Double Negative Visual Effects nor the names
+// of its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file AttributeSet.h
+///
+/// @authors Dan Bailey, Mihai Alden, Peter Cucka
+///
+/// @brief  Set of Attribute Arrays which tracks metadata about each array.
+///
+
+
+#ifndef OPENVDB_TOOLS_ATTRIBUTE_SET_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_ATTRIBUTE_SET_HAS_BEEN_INCLUDED
+
+#include <openvdb/version.h>
+#include <openvdb/metadata/MetaMap.h>
+
+#include <boost/integer_traits.hpp> // integer_traits
+#include <boost/shared_ptr.hpp> // shared_ptr
+
+#include <vector>
+
+#include <openvdb_points/tools/AttributeArray.h>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+
+////////////////////////////////////////
+
+
+/// Ordered collection of uniquely-named attribute arrays
+class AttributeSet
+{
+public:
+    enum { INVALID_POS = boost::integer_traits<size_t>::const_max }; // sentinel position: not found / replacement failed
+
+    typedef boost::shared_ptr<AttributeSet> Ptr;
+    typedef boost::shared_ptr<const AttributeSet> ConstPtr;
+
+    class Descriptor;
+
+    typedef boost::shared_ptr<Descriptor> DescriptorPtr;
+    typedef boost::shared_ptr<const Descriptor> DescriptorConstPtr;
+
+    //////////
+
+    struct Util
+    {
+        /// Attribute and type name pair.
+        struct NameAndType {
+            NameAndType(const std::string& n, const NamePair& t)
+                : name(n), type(t) {}
+            Name name;
+            NamePair type;
+        };
+
+        typedef std::vector<NameAndType> NameAndTypeVec;
+        typedef std::map<std::string, size_t> NameToPosMap;
+        typedef std::pair<size_t, uint8_t> GroupIndex;
+    };
+
+    //////////
+
+    AttributeSet();
+
+    /// Construct from the given descriptor.
+    explicit AttributeSet(const DescriptorPtr&, size_t arrayLength = 1);
+
+    /// Shallow copy constructor, the descriptor and attribute arrays will be shared.
+    AttributeSet(const AttributeSet&);
+
+    //@{
+    /// @brief  Return a reference to this attribute set's descriptor, which might
+    ///         be shared with other sets.
+    Descriptor& descriptor() { return *mDescr; }
+    const Descriptor& descriptor() const { return *mDescr; }
+    //@}
+
+    /// @brief Return a pointer to this attribute set's descriptor, which might be
+    /// shared with other sets.
+    DescriptorPtr descriptorPtr() const { return mDescr; }
+
+    /// Return the number of attributes in this set.
+    size_t size() const { return mAttrs.size(); }
+
+    /// Return the number of attributes with this flag set.
+    size_t size(const uint16_t flag) const;
+
+    /// Return the number of bytes of memory used by this attribute set.
+    size_t memUsage() const;
+
+    /// @brief  Return the position of the attribute array whose name is @a name,
+    ///         or @c INVALID_POS if no match is found.
+    size_t find(const std::string& name) const;
+
+    /// @brief  Replace the attribute array whose name is @a name.
+    /// @return The position of the updated attribute array or @c INVALID_POS
+    ///         if the given name does not exist or if the replacement failed because
+    ///         the new array type does not comply with the descriptor.
+    size_t replace(const std::string& name, const AttributeArray::Ptr&);
+
+    /// @brief  Replace the attribute array stored at position @a pos in this container.
+    /// @return The position of the updated attribute array or @c INVALID_POS
+    ///         if replacement failed because the new array type does not comply with
+    ///         the descriptor.
+    size_t replace(size_t pos, const AttributeArray::Ptr&);
+
+    //@{
+    /// @brief  Return a pointer to the attribute array whose name is @a name or
+    ///         a null pointer if no match is found.
+    const AttributeArray* getConst(const std::string& name) const;
+    const AttributeArray* get(const std::string& name) const;
+    AttributeArray*       get(const std::string& name);
+    //@}
+
+    //@{
+    /// @brief  Return a pointer to the attribute array stored at position @a pos
+    ///         in this set.
+    const AttributeArray* getConst(size_t pos) const;
+    const AttributeArray* get(size_t pos) const;
+    AttributeArray*       get(size_t pos);
+    //@}
+
+    //@{
+    /// @brief Return the group offset from the name or index of the group.
+    /// @details A group attribute array element is a single byte (8 bits), each bit
+    /// of which can denote a group. The group offset is the position of the bit that
+    /// denotes the requested group if all group attribute arrays in the set
+    /// (and only attribute arrays marked as group) were to be laid out linearly
+    /// according to their order in the set.
+    size_t groupOffset(const Name& groupName) const;
+    size_t groupOffset(const Util::GroupIndex& index) const;
+    //@}
+
+    /// Return the group index from the name of the group.
+    Util::GroupIndex groupIndex(const Name& groupName) const;
+    /// Return the group index from the offset of the group.
+    /// @note See the description of the offset in groupOffset().
+    Util::GroupIndex groupIndex(const size_t offset) const;
+
+    /// Create an iterator for iterating through point indices.
+    IndexIter beginIndex() const;
+
+    /// Return true if the attribute array stored at position @a pos is shared.
+    bool isShared(size_t pos) const;
+    /// @brief  If the attribute array stored at position @a pos is shared,
+    ///         replace the array with a deep copy of itself that is not
+    ///         shared with anyone else.
+    void makeUnique(size_t pos);
+
+    /// Append attribute @a attribute (simple method).
+    AttributeArray::Ptr appendAttribute(const Util::NameAndType& attribute,
+                                        Metadata::Ptr defaultValue = Metadata::Ptr());
+
+    /// Append attribute @a attribute (descriptor-sharing method).
+    /// Requires the current descriptor to match @a expected.
+    /// On append, the current descriptor is replaced with @a replacement.
+    AttributeArray::Ptr appendAttribute(const Util::NameAndType& attribute,
+                                        const Descriptor& expected, DescriptorPtr& replacement);
+
+    /// Drop attributes with @a pos indices (simple method).
+    /// Creates a new descriptor for this attribute set.
+    void dropAttributes(const std::vector<size_t>& pos);
+
+    /// Drop attributes with @a pos indices (descriptor-sharing method).
+    /// Requires the current descriptor to match @a expected.
+    /// On drop, the current descriptor is replaced with @a replacement.
+    void dropAttributes(const std::vector<size_t>& pos,
+                        const Descriptor& expected, DescriptorPtr& replacement);
+
+    /// Reorder the attribute set to match the provided descriptor.
+    /// Replaces own descriptor with @a replacement.
+    void reorderAttributes(const DescriptorPtr& replacement);
+
+    /// Rename attributes in the set to match the provided descriptor.
+    /// Replaces own descriptor with @a replacement.
+    void renameAttributes(const Descriptor& expected, DescriptorPtr& replacement);
+
+    /// Read the entire set from a stream.
+    void read(std::istream&);
+    /// Write the entire set to a stream.
+    void write(std::ostream&) const;
+
+    /// This will read the attribute descriptor from a stream, but no attribute data.
+    void readMetadata(std::istream&);
+    /// This will write the attribute descriptor to a stream, but no attribute data.
+    void writeMetadata(std::ostream&) const;
+
+    /// Read attribute data from a stream.
+    void readAttributes(std::istream&);
+    /// Write attribute data to a stream.
+    void writeAttributes(std::ostream&) const;
+
+    /// Compare the descriptors and attribute arrays of the two attribute sets.
+    /// Exits early if the descriptors do not match.
+    bool operator==(const AttributeSet& other) const;
+    bool operator!=(const AttributeSet& other) const { return !this->operator==(other); }
+
+private:
+    /// Disallow assignment, since it wouldn't be obvious whether the copy is deep or shallow.
+    AttributeSet& operator=(const AttributeSet&);
+
+    typedef std::vector<AttributeArray::Ptr> AttrArrayVec;
+
+    DescriptorPtr mDescr;
+    AttrArrayVec  mAttrs;
+}; // class AttributeSet
+
+////////////////////////////////////////
+
+
+/// @brief  An immutable object that stores name, type and AttributeSet position
+///         for a constant collection of attribute arrays.
+/// @note   The attribute name is actually mutable, but the attribute type
+///         and position can not be changed after creation.
+class AttributeSet::Descriptor
+{
+public:
+    typedef boost::shared_ptr<Descriptor> Ptr;
+
+    typedef Util::NameAndType             NameAndType;
+    typedef Util::NameAndTypeVec          NameAndTypeVec;
+    typedef Util::GroupIndex              GroupIndex;
+    typedef Util::NameToPosMap            NameToPosMap;
+    typedef NameToPosMap::const_iterator  ConstIterator;
+
+    /// Utility to build a NameAndType sequence; each add() returns *this so calls can be chained.
+    struct Inserter {
+        NameAndTypeVec vec;
+        Inserter& add(const NameAndType& nameAndType) {
+            vec.push_back(nameAndType); return *this;
+        }
+        Inserter& add(const Name& name, const NamePair& type) {
+            vec.push_back(NameAndType(name, type)); return *this;
+        }
+        Inserter& add(const NameAndTypeVec& other) {
+            for (NameAndTypeVec::const_iterator it = other.begin(), itEnd = other.end(); it != itEnd; ++it) {
+                vec.push_back(NameAndType(it->name, it->type));
+            }
+            return *this;
+        }
+    };
+
+    //////////
+
+    Descriptor();
+
+    /// Copy constructor
+    Descriptor(const Descriptor&);
+
+    /// Create a new descriptor from the given attribute and type name pairs.
+    static Ptr create(const NameAndTypeVec&);
+
+    /// Create a new descriptor from the given attribute and type name pairs
+    /// and copy the group maps and metamap.
+    static Ptr create(const NameAndTypeVec&, const NameToPosMap&, const MetaMap&);
+
+    /// Create a new descriptor from a position attribute type, assuming the name "P" (for convenience).
+    static Ptr create(const NamePair&);
+
+    Ptr duplicateAppend(const NameAndType& attribute) const;
+    Ptr duplicateAppend(const NameAndTypeVec& vec) const;
+    Ptr duplicateDrop(const std::vector<size_t>& pos) const;
+
+    /// Return the number of attributes in this descriptor.
+    size_t size() const { return mTypes.size(); }
+
+    /// Return the number of bytes of memory used by this descriptor.
+    size_t memUsage() const;
+
+    /// @brief  Return the position of the attribute array whose name is @a name,
+    ///         or @c INVALID_POS if no match is found.
+    size_t find(const std::string& name) const;
+
+    /// Rename an attribute array
+    size_t rename(const std::string& fromName, const std::string& toName);
+
+    /// Return the name of the attribute array's type.
+    const Name& valueType(size_t pos) const;
+    /// Return the type name pair of the attribute array at position @a pos.
+    const NamePair& type(size_t pos) const;
+
+    /// Retrieve metadata map
+    MetaMap& getMetadata();
+    const MetaMap& getMetadata() const;
+
+    /// Return true if the attribute has a default value
+    bool hasDefaultValue(const Name& name) const;
+    /// Get a default value for an existing attribute
+    template <typename ValueType>
+    ValueType getDefaultValue(const Name& name) const;
+    /// Set a default value for an existing attribute
+    void setDefaultValue(const Name& name, const Metadata& defaultValue);
+    /// Remove the default value if it exists
+    void removeDefaultValue(const Name& name);
+    /// Prune any default values for which the key is no longer present
+    void pruneUnusedDefaultValues();
+
+    /// Return true if this descriptor is equal to the given one.
+    bool operator==(const Descriptor&) const;
+    /// Return true if this descriptor is not equal to the given one.
+    bool operator!=(const Descriptor& rhs) const { return !this->operator==(rhs); }
+    /// Return true if this descriptor contains the same attributes
+    /// as the given descriptor, ignoring attribute order
+    bool hasSameAttributes(const Descriptor& rhs) const;
+
+    /// Return a reference to the name-to-position map.
+    const NameToPosMap& map() const { return mNameMap; }
+    /// Return a reference to the name-to-position group map.
+    const NameToPosMap& groupMap() const { return mGroupMap; }
+
+    /// Append to a vector of names and types from this Descriptor in position order
+    void appendTo(NameAndTypeVec& attrs) const;
+
+    /// Return @c true if group exists
+    bool hasGroup(const Name& group) const;
+    /// Define a group name to offset mapping
+    void setGroup(const Name& group, const size_t offset);
+    /// Drop any mapping keyed by group name
+    void dropGroup(const Name& group);
+    /// Clear all groups
+    void clearGroups();
+
+    /// Return a unique name for an attribute array based on given name
+    const Name uniqueName(const Name& name) const;
+
+    /// Serialize this descriptor to the given stream.
+    void write(std::ostream&) const;
+    /// Unserialize this descriptor from the given stream.
+    void read(std::istream&);
+
+private:
+    size_t insert(const std::string& name, const NamePair& typeName);
+    NameToPosMap                mNameMap;
+    std::vector<NamePair>       mTypes;
+    NameToPosMap                mGroupMap;
+    MetaMap                     mMetadata;
+}; // class Descriptor
+
+
+template <typename ValueType>
+ValueType
+AttributeSet::Descriptor::getDefaultValue(const Name& name) const
+{
+    // Look up the default stored for attribute @a name under the metadata key
+    // "default:<name>"; returns zeroVal<ValueType>() when no such typed entry is
+    // found. Throws LookupError if the descriptor has no attribute called @a name.
+    typedef typename TypedMetadata<ValueType>::ConstPtr MetadataPtr;
+
+    if (find(name) == INVALID_POS) {
+        OPENVDB_THROW(LookupError, "Cannot find attribute name to get default value.")
+    }
+
+    std::stringstream ss;
+    ss << "default:" << name;
+    MetadataPtr metadata = mMetadata.getMetadata<TypedMetadata<ValueType> >(ss.str());
+
+    if (metadata)   return metadata->value();
+    return zeroVal<ValueType>();
+}
+
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+
+#endif // OPENVDB_TOOLS_ATTRIBUTE_SET_HAS_BEEN_INCLUDED
+
+
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+
diff --git a/nuparu/include/openvdb/tools/IndexFilter.h b/nuparu/include/openvdb/tools/IndexFilter.h
new file mode 100644
index 00000000..4e45a5c6
--- /dev/null
+++ b/nuparu/include/openvdb/tools/IndexFilter.h
@@ -0,0 +1,161 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of Double Negative Visual Effects nor the names
+// of its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file IndexFilter.h
+///
+/// @authors Dan Bailey
+///
+/// @brief  Index filters primarily designed to be used with a FilterIndexIter.
+///
+
+
+#ifndef OPENVDB_TOOLS_INDEX_FILTER_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_INDEX_FILTER_HAS_BEEN_INCLUDED
+
+#include <openvdb/version.h>
+#include <openvdb/Types.h>
+
+#include <openvdb/math/Transform.h>
+
+#include <openvdb_points/tools/IndexIterator.h>
+#include <openvdb_points/tools/AttributeArray.h>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+
+////////////////////////////////////////
+
+
+// Random index filtering per leaf: an index is accepted when the next uniform draw is below Data::factor
+template <typename RandGenT>
+class RandomLeafFilter
+{
+public:
+    typedef std::map<openvdb::Coord, Index64> LeafSeedMap; // leaf origin -> seed for that leaf
+    typedef boost::uniform_01<RandGenT> Distribution;
+    typedef typename Distribution::result_type ResultT;
+
+    struct Data
+    {
+        Data(const ResultT _factor, const LeafSeedMap& _leafSeedMap)
+            : factor(_factor), leafSeedMap(_leafSeedMap) { }
+        const ResultT factor;
+        const LeafSeedMap& leafSeedMap; // held by reference: the map must outlive every filter built from it
+    };
+
+    RandomLeafFilter(const Data& data, const unsigned int seed)
+        : mData(data)
+        , mDistribution(RandGenT(seed)) { }
+
+    inline ResultT next() const { // draws advance the generator, hence the mutable member
+        return mDistribution();
+    }
+
+    template <typename LeafT>
+    static RandomLeafFilter create(const LeafT& leaf, const Data& data) {
+        const LeafSeedMap::const_iterator it = data.leafSeedMap.find(leaf.origin());
+        if (it == data.leafSeedMap.end()) {
+            OPENVDB_THROW(openvdb::KeyError, "Cannot find leaf origin in offset map for random filter");
+        }
+        return RandomLeafFilter(data, (unsigned int) it->second); // the 64-bit seed is narrowed here
+    }
+
+    template <typename IterT>
+    bool valid(const IterT&) const { // the iterator is ignored, each call consumes one random draw
+        return next() < mData.factor;
+    }
+
+private:
+    const Data mData;
+    mutable Distribution mDistribution; // mutable so const filters (as held by FilterIndexIter) can draw
+}; // class RandomLeafFilter
+
+
+// BBox index filtering: accepts points whose index-space position lies inside a world-space bounding box
+class BBoxFilter
+{
+public:
+    struct Data
+    {
+        Data(const openvdb::math::Transform& _transform,
+             const openvdb::BBoxd& _bboxWS)
+            : transform(_transform)
+            , bbox(transform.worldToIndex(_bboxWS)) { } // reads the member copy, which is declared (so initialized) first
+        const openvdb::math::Transform transform;
+        const openvdb::BBoxd bbox; // result of worldToIndex() on the world-space box, computed once
+    };
+
+    BBoxFilter( const Data& data,
+                const AttributeHandle<openvdb::Vec3f>::Ptr& positionHandle)
+        : mData(data)
+        , mPositionHandle(positionHandle) { }
+
+    template <typename LeafT>
+    static BBoxFilter create(const LeafT& leaf, const Data& data) { // reads positions from the leaf's "P" attribute
+        return BBoxFilter(data, AttributeHandle<openvdb::Vec3f>::create(leaf.constAttributeArray("P")));
+    }
+
+    template <typename IterT>
+    bool valid(const IterT& iter) const {
+        const openvdb::Coord ijk = iter.getCoord(); // the iterator must be able to provide a coordinate
+        const openvdb::Vec3f voxelIndexSpace = ijk.asVec3d();
+
+        // Retrieve point position in voxel space (the value stored for index *iter)
+        const openvdb::Vec3f& pointVoxelSpace = mPositionHandle->get(*iter);
+
+        // Compute point position in index space by offsetting with the voxel coordinate
+        const openvdb::Vec3f pointIndexSpace = pointVoxelSpace + voxelIndexSpace;
+
+        return mData.bbox.isInside(pointIndexSpace);
+    }
+
+private:
+    const Data mData;
+    const AttributeHandle<openvdb::Vec3f>::Ptr mPositionHandle;
+}; // class BBoxFilter
+
+
+////////////////////////////////////////
+
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+
+#endif // OPENVDB_TOOLS_INDEX_FILTER_HAS_BEEN_INCLUDED
+
+
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb/tools/IndexIterator.h b/nuparu/include/openvdb/tools/IndexIterator.h
new file mode 100644
index 00000000..06d70fe5
--- /dev/null
+++ b/nuparu/include/openvdb/tools/IndexIterator.h
@@ -0,0 +1,347 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of Double Negative Visual Effects nor the names
+// of its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file IndexIterator.h
+///
+/// @authors Dan Bailey
+///
+/// @brief  Index Iterators.
+///
+
+
+#ifndef OPENVDB_TOOLS_INDEX_ITERATOR_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_INDEX_ITERATOR_HAS_BEEN_INCLUDED
+
+#include <openvdb/version.h>
+#include <openvdb/Types.h>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+
+/// @brief Count up the number of times the iterator can iterate
+///
+/// @param iter the iterator.
+///
+/// @note counting by iteration is only performed where no specialized overload applies (e.g. when a dynamic filter is in use).
+template <typename IterT>
+inline Index64 iterCount(const IterT& iter);
+
+
+////////////////////////////////////////
+
+
+/// @brief A forward iterator over the array indices in the half-open range [item, end)
+class IndexIter
+{
+public:
+    IndexIter()
+        : mEnd(0), mItem(0) {} // empty range: test() is false
+    IndexIter(Index32 item, Index32 end)
+        : mEnd(end), mItem(item) {}
+    IndexIter(const IndexIter& other)
+        : mEnd(other.mEnd), mItem(other.mItem) { }
+
+    inline Index32 end() const { return mEnd; } // exclusive upper bound of the range
+
+    /// @brief Reset the beginning and end of the iterator.
+    inline void reset(Index32 item, Index32 end) {
+        mItem = item;
+        mEnd = end;
+    }
+
+    /// @brief  Returns the item to which this iterator is currently pointing.
+    inline Index32 operator*() { return mItem; }
+    inline Index32 operator*() const { return mItem; }
+
+    /// @brief  Return @c true if this iterator is not yet exhausted.
+    inline operator bool() const { return mItem < mEnd; }
+    inline bool test() const { return mItem < mEnd; }
+
+    /// @brief  Advance to the next item (prefix); no bounds check is made here.
+    inline IndexIter& operator++() {
+        ++mItem;
+        return *this;
+    }
+
+    /// @brief  Advance to the next item (postfix), returning a copy of the previous state.
+    inline IndexIter operator++(int /*dummy*/) {
+        IndexIter newIterator(*this);
+        this->operator++();
+        return newIterator;
+    }
+
+    /// @brief  Advance to the next item and return @c true if the iterator is not exhausted.
+    inline bool next() { this->operator++(); return this->test(); }
+    inline bool increment() { this->next(); return this->test(); }
+
+    /// Throw an error as Coord methods are not available on this iterator
+    inline Coord getCoord() const { OPENVDB_THROW(RuntimeError, "IndexIter does not provide a valid Coord, use a ValueIndexIter instead."); }
+    /// Throw an error as Coord methods are not available on this iterator
+    inline void getCoord(Coord&) const { OPENVDB_THROW(RuntimeError, "IndexIter does not provide a valid Coord, use a ValueIndexIter instead."); }
+
+    /// @brief Equality operators (only the current item is compared, the end is ignored)
+    inline bool operator==(const IndexIter& other) const { return mItem == other.mItem; }
+    inline bool operator!=(const IndexIter& other) const { return !this->operator==(other); }
+
+private:
+    Index32 mEnd, mItem;
+}; // class IndexIter
+
+
+/// @brief A forward iterator over array indices from a value iterator (such as ValueOnCIter)
+template <typename ValueIterT>
+class ValueIndexIter
+{
+public:
+    ValueIndexIter(ValueIterT& iter)
+        : mIndexIter(), mIter(iter), mParent(mIter.parent())
+    {
+        if (mIter) { // index range is [value at the previous offset (0 at offset 0), value at this offset)
+            Index32 start = mIter.offset() > 0 ? Index32(mParent.getValue(mIter.offset() - 1)) : Index32(0);
+            mIndexIter.reset(start, *mIter);
+            if (!mIndexIter.test())   this->operator++(); // first value has an empty range: move on
+        }
+    }
+    ValueIndexIter(const ValueIndexIter& other)
+        : mIndexIter(other.mIndexIter), mIter(other.mIter), mParent(other.mParent) { }
+
+    inline Index32 end() const { return mIndexIter.end(); }
+
+    inline void reset(Index32 item, Index32 end) { // only the inner index range is reset, not the value iterator
+        mIndexIter.reset(item, end);
+    }
+
+    /// @brief  Returns the item to which this iterator is currently pointing.
+    inline Index32 operator*() { return *mIndexIter; }
+    inline Index32 operator*() const { return *mIndexIter; }
+
+    /// @brief  Return @c true if this iterator is not yet exhausted.
+    inline operator bool() const { return mIter; }
+    inline bool test() const { return mIter; }
+
+    /// @brief  Advance to the next (valid) item (prefix).
+    inline ValueIndexIter& operator++() {
+        mIndexIter.next();
+        while (!mIndexIter.test() && mIter.next()) { // current range exhausted: step to the next value
+            mIndexIter.reset(mParent.getValue(mIter.offset() - 1), *mIter);
+        }
+        return *this;
+    }
+
+    /// @brief  Advance to the next (valid) item (postfix).
+    inline ValueIndexIter operator++(int /*dummy*/) {
+        ValueIndexIter newIterator(*this); // snapshot of this iterator before advancing
+        this->operator++();
+        return newIterator;
+    }
+
+    /// @brief  Advance to the next (valid) item.
+    inline bool next() { this->operator++(); return this->test(); }
+    inline bool increment() { this->next(); return this->test(); }
+
+    /// Return the coordinates of the item to which the value iterator is pointing.
+    inline Coord getCoord() const { return mIter.getCoord(); }
+    /// Return in @a xyz the coordinates of the item to which the value iterator is pointing.
+    inline void getCoord(Coord& xyz) const { xyz = mIter.getCoord(); }
+
+    /// Return the const index iterator
+    inline const IndexIter& indexIter() const { return mIndexIter; }
+    /// Return the const value iterator
+    inline const ValueIterT& valueIter() const { return mIter; }
+
+    /// @brief Equality operators (only the current index is compared)
+    bool operator==(const ValueIndexIter& other) const { return *mIndexIter == *other.mIndexIter; }
+    bool operator!=(const ValueIndexIter& other) const { return !this->operator==(other); }
+
+private:
+    IndexIter mIndexIter;
+    ValueIterT mIter;
+    const typename ValueIterT::NodeType& mParent; // node the value iterator walks, held by reference
+}; // ValueIndexIter
+
+
+/// IndexIterTraits provides the following for iterators of the three value
+/// types, i.e., for {Value}{On,Off,All}{CIter}:
+/// - a begin(leaf) function that returns an index iterator or an index value
+///   iterator for the leaf provided,
+///   eg IndexIterTraits<Tree, Tree::LeafNodeType::ValueOnCIter>::begin(leaf) returns
+///   leaf.beginIndexOn()
+/// - an Iterator typedef that aliases to the index iterator for this value type
+template<typename TreeT, typename ValueT> struct IndexIterTraits;
+
+template<typename TreeT>
+struct IndexIterTraits<TreeT, typename TreeT::LeafNodeType::ValueAllCIter> {
+    typedef IndexIter Iterator; // index iterator type paired with ValueAllCIter
+    static Iterator begin(const typename TreeT::LeafNodeType& leaf) {
+        return Iterator(leaf.beginIndexAll());
+    }
+}; // IndexIterTraits<TreeT, ValueAllCIter>
+
+template<typename TreeT>
+struct IndexIterTraits<TreeT, typename TreeT::LeafNodeType::ValueOnCIter> {
+    typedef typename TreeT::LeafNodeType::IndexOnIter Iterator; // index iterator type paired with ValueOnCIter
+    static Iterator begin(const typename TreeT::LeafNodeType& leaf) {
+        return Iterator(leaf.beginIndexOn());
+    }
+}; // IndexIterTraits<TreeT, ValueOnCIter>
+
+template<typename TreeT>
+struct IndexIterTraits<TreeT, typename TreeT::LeafNodeType::ValueOffCIter> {
+    typedef typename TreeT::LeafNodeType::IndexOffIter Iterator; // index iterator type paired with ValueOffCIter
+    static Iterator begin(const typename TreeT::LeafNodeType& leaf) {
+        return Iterator(leaf.beginIndexOff());
+    }
+}; // IndexIterTraits<TreeT, ValueOffCIter>
+
+
+/// @brief A forward iterator over array indices with filtering
+/// IteratorT can be either IndexIter or ValueIndexIter (or some custom index iterator)
+/// FilterT should be a struct or class with a const valid() member template that is passed the iterator
+/// Here's a simple filter example that only accepts even indices:
+///
+/// struct EvenIndexFilter
+/// {
+///     template <typename IterT> bool valid(const IterT& iter) const {
+///         return ((*iter) % 2) == 0;
+///     }
+/// };
+///
+template <typename IteratorT, typename FilterT>
+class FilterIndexIter
+{
+public:
+    FilterIndexIter(const IteratorT& iterator, const FilterT& filter)
+        : mIterator(iterator), mFilter(filter) { if (mIterator) { this->reset(*mIterator, mIterator.end()); } }
+    FilterIndexIter(const FilterIndexIter& other)
+        : mIterator(other.mIterator), mFilter(other.mFilter) { }
+
+    Index32 end() const { return mIterator.end(); }
+
+    /// @brief Reset the beginning and end of the iterator, then skip to the first item the filter accepts.
+    void reset(Index32 begin, Index32 end) {
+        mIterator.reset(begin, end);
+        while (mIterator.test() && !mFilter.template valid<IteratorT>(mIterator)) {
+            ++mIterator;
+        }
+    }
+
+    /// @brief  Returns the item to which this iterator is currently pointing.
+    Index32 operator*() { return *mIterator; }
+    Index32 operator*() const { return *mIterator; }
+
+    /// @brief  Return @c true if this iterator is not yet exhausted.
+    operator bool() const { return mIterator.test(); }
+    bool test() const { return mIterator.test(); }
+
+    /// @brief  Advance to the next item accepted by the filter, or to exhaustion (prefix).
+    FilterIndexIter& operator++() {
+        while (true) {
+            ++mIterator;
+            if (!mIterator.test() || mFilter.template valid<IteratorT>(mIterator)) {
+                break;
+            }
+        }
+        return *this;
+    }
+
+    /// @brief  Advance to the next (valid) item (postfix).
+    FilterIndexIter operator++(int /*dummy*/) {
+        FilterIndexIter newIterator(*this);
+        this->operator++();
+        return newIterator;
+    }
+
+    /// @brief  Advance to the next (valid) item.
+    bool next() { this->operator++(); return this->test(); }
+    bool increment() { this->next(); return this->test(); }
+
+    /// Return the const index iterator
+    inline const IteratorT& indexIter() const { return mIterator; }
+    /// Return the const filter
+    inline const FilterT& filter() const { return mFilter; }
+
+    /// @brief Equality operators (delegated to the wrapped iterator, the filter is not compared)
+    bool operator==(const FilterIndexIter& other) const { return mIterator == other.mIterator; }
+    bool operator!=(const FilterIndexIter& other) const { return !this->operator==(other); }
+
+private:
+    IteratorT mIterator;
+    const FilterT mFilter; // stored by value and const: valid() is always called on a const filter
+}; // class FilterIndexIter
+
+
+////////////////////////////////////////
+
+
+template <typename IterT>
+inline Index64 iterCount(const IterT& iter)
+{
+    Index64 count = 0; // number of items visited before a copy of iter is exhausted
+    for (IterT cursor(iter); cursor; ++cursor) { ++count; }
+    return count;
+}
+
+
+template <>
+inline Index64 iterCount(const IndexIter& iter)
+{
+    return iter.test() ? Index64(iter.end() - *iter) : Index64(0); // remaining indices, without iterating
+}
+
+
+template <typename T>
+inline Index64 iterCount(const ValueIndexIter<T>& iter)
+{
+    Index64 total = 0;
+    // sum, per remaining value, the value minus the value stored at the preceding offset (0 at offset 0)
+    for (T voxelIter(iter.valueIter()); voxelIter; ++voxelIter) {
+        total += *voxelIter - (voxelIter.offset() == 0 ? Index32(0) : Index32(voxelIter.parent().getValue(voxelIter.offset() - 1)));
+    }
+    return total;
+}
+
+
+////////////////////////////////////////
+
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+
+#endif // OPENVDB_TOOLS_INDEX_ITERATOR_HAS_BEEN_INCLUDED
+
+
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb/tools/PointAttribute.h b/nuparu/include/openvdb/tools/PointAttribute.h
new file mode 100644
index 00000000..5506b6d3
--- /dev/null
+++ b/nuparu/include/openvdb/tools/PointAttribute.h
@@ -0,0 +1,545 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of Double Negative Visual Effects nor the names
+// of its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @author Dan Bailey
+///
+/// @file PointAttribute.h
+///
+/// @brief  Point attribute manipulation in a VDB Point Grid.
+///
+
+
+#ifndef OPENVDB_TOOLS_POINT_ATTRIBUTE_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_POINT_ATTRIBUTE_HAS_BEEN_INCLUDED
+
+#include <openvdb/openvdb.h>
+
+#include <openvdb_points/tools/AttributeSet.h>
+#include <openvdb_points/tools/AttributeGroup.h>
+#include <openvdb_points/tools/PointDataGrid.h>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+/// @brief Appends a new attribute to the VDB tree.
+///
+/// @param tree          the PointDataTree to be appended to.
+/// @param newAttribute  name and type for the new attribute.
+/// @param defaultValue  metadata default attribute value
+/// @param hidden        mark attribute as hidden
+/// @param transient     mark attribute as transient
+/// @param group         mark attribute as group
+template <typename PointDataTree>
+inline void appendAttribute(PointDataTree& tree,
+                            const AttributeSet::Util::NameAndType& newAttribute,
+                            Metadata::Ptr defaultValue = Metadata::Ptr(),
+                            const bool hidden = false,
+                            const bool transient = false,
+                            const bool group = false);
+
+/// @brief Drops attributes from the VDB tree.
+///
+/// @param tree          the PointDataTree to be dropped from.
+/// @param indices       indices of the attributes to drop.
+template <typename PointDataTree>
+inline void dropAttributes( PointDataTree& tree,
+                            const std::vector<size_t>& indices);
+
+/// @brief Drops attributes from the VDB tree.
+///
+/// @param tree          the PointDataTree to be dropped from.
+/// @param names         names of the attributes to drop.
+template <typename PointDataTree>
+inline void dropAttributes( PointDataTree& tree,
+                            const std::vector<Name>& names);
+
+/// @brief Drop one attribute from the VDB tree (convenience method).
+///
+/// @param tree          the PointDataTree to be dropped from.
+/// @param index         index of the attribute to drop.
+template <typename PointDataTree>
+inline void dropAttribute(  PointDataTree& tree,
+                            const size_t& index);
+
+/// @brief Drop one attribute from the VDB tree (convenience method).
+///
+/// @param tree          the PointDataTree to be dropped from.
+/// @param name          name of the attribute to drop.
+template <typename PointDataTree>
+inline void dropAttribute(  PointDataTree& tree,
+                            const Name& name);
+
+/// @brief Rename attributes in a VDB tree.
+///
+/// @param tree          the PointDataTree.
+/// @param oldNames      a list of old attribute names to rename from.
+/// @param newNames      a list of new attribute names to rename to.
+///
+/// @note Number of oldNames must match the number of newNames.
+///
+/// @note Duplicate names and renaming group attributes are not allowed.
+template <typename PointDataTree>
+inline void renameAttributes(PointDataTree& tree,
+                            const std::vector<Name>& oldNames,
+                            const std::vector<Name>& newNames);
+
+/// @brief Rename an attribute in a VDB tree.
+///
+/// @param tree          the PointDataTree.
+/// @param oldName       the old attribute name to rename from.
+/// @param newName       the new attribute name to rename to.
+///
+/// @note newName must not already exist and must not be a group attribute.
+template <typename PointDataTree>
+inline void renameAttribute(PointDataTree& tree,
+                            const Name& oldName,
+                            const Name& newName);
+
+/// @brief Compact attributes in a VDB tree (if possible).
+///
+/// @param tree          the PointDataTree.
+template <typename PointDataTree>
+inline void compactAttributes(PointDataTree& tree);
+
+/// @brief Apply Blosc compression to one attribute in the VDB tree.
+///
+/// @param tree          the PointDataTree.
+/// @param name          name of the attribute to compress.
+template <typename PointDataTree>
+inline void bloscCompressAttribute( PointDataTree& tree,
+                                    const Name& name);
+
+////////////////////////////////////////
+
+
+namespace point_attribute_internal {
+
+template<typename PointDataTreeType>
+struct AppendAttributeOp {
+    /// Functor over a range of leaves: appends one attribute to every leaf and applies the requested flags.
+    typedef typename tree::LeafManager<PointDataTreeType>       LeafManagerT;
+    typedef typename LeafManagerT::LeafRange                    LeafRangeT;
+    typedef AttributeSet::Descriptor::NameAndType               NameAndType;
+
+    AppendAttributeOp(  PointDataTreeType& tree,
+                        const NameAndType& newAttribute,
+                        AttributeSet::DescriptorPtr& descriptor,
+                        const bool hidden = false,
+                        const bool transient = false,
+                        const bool group = false)
+        : mTree(tree)
+        , mNewAttribute(newAttribute)
+        , mDescriptor(descriptor)
+        , mHidden(hidden)
+        , mTransient(transient)
+        , mGroup(group) { }
+
+    void operator()(const LeafRangeT& range) const {
+        // each leaf's current descriptor is handed over as "expected" together with the shared new descriptor
+        for (typename LeafRangeT::Iterator leaf=range.begin(); leaf; ++leaf) {
+
+            const AttributeSet::Descriptor& expected = leaf->attributeSet().descriptor();
+
+            AttributeArray::Ptr attribute = leaf->appendAttribute(mNewAttribute, expected, mDescriptor);
+
+            if (mHidden)      attribute->setHidden(true);
+            if (mTransient)   attribute->setTransient(true);
+
+            if (mGroup) {
+                GroupAttributeArray::cast(*attribute).setGroup(true);
+            }
+        }
+    }
+
+    //////////
+
+    PointDataTreeType&              mTree;
+    const NameAndType&              mNewAttribute; // held by reference: must outlive this functor
+    AttributeSet::DescriptorPtr&    mDescriptor;   // held by reference: must outlive this functor
+    const bool                      mHidden;
+    const bool                      mTransient;
+    const bool                      mGroup;
+}; // class AppendAttributeOp
+
+
+////////////////////////////////////////
+
+
+template<typename PointDataTreeType>
+struct DropAttributesOp {
+    /// Functor over a range of leaves: drops the attributes at the given indices from every leaf.
+    typedef typename tree::LeafManager<PointDataTreeType>       LeafManagerT;
+    typedef typename LeafManagerT::LeafRange                    LeafRangeT;
+    typedef std::vector<size_t>                                 Indices;
+
+    DropAttributesOp(   PointDataTreeType& tree,
+                        const Indices& indices,
+                        AttributeSet::DescriptorPtr& descriptor)
+        : mTree(tree)
+        , mIndices(indices)
+        , mDescriptor(descriptor) { }
+
+    void operator()(const LeafRangeT& range) const {
+        // each leaf's current descriptor is handed over as "expected" together with the shared new descriptor
+        for (typename LeafRangeT::Iterator leaf=range.begin(); leaf; ++leaf) {
+
+            const AttributeSet::Descriptor& expected = leaf->attributeSet().descriptor();
+
+            leaf->dropAttributes(mIndices, expected, mDescriptor);
+        }
+    }
+
+    //////////
+
+    PointDataTreeType&              mTree;
+    const Indices&                  mIndices;    // held by reference: must outlive this functor
+    AttributeSet::DescriptorPtr&    mDescriptor; // held by reference: must outlive this functor
+}; // class DropAttributesOp
+
+
+////////////////////////////////////////
+
+
+template<typename PointDataTreeType>
+struct CompactAttributesOp {
+
+    typedef typename tree::LeafManager<PointDataTreeType>       LeafManagerT;
+    typedef typename LeafManagerT::LeafRange                    LeafRangeT;
+
+    CompactAttributesOp() { }
+
+    /// Call compactAttributes() on every leaf in @a range.
+    void operator()(const LeafRangeT& range) const {
+        typename LeafRangeT::Iterator leafIter = range.begin();
+        for ( ; leafIter; ++leafIter)   leafIter->compactAttributes();
+    }
+}; // class CompactAttributesOp
+
+
+////////////////////////////////////////
+
+
+template<typename PointDataTreeType>
+struct BloscCompressAttributesOp {
+
+    typedef typename tree::LeafManager<PointDataTreeType>       LeafManagerT;
+    typedef typename LeafManagerT::LeafRange                    LeafRangeT;
+    typedef std::vector<size_t>                                 Indices;
+
+    /// @param tree     the tree being processed (only the reference is stored)
+    /// @param indices  positions of the attribute arrays to compress in every leaf
+    BloscCompressAttributesOp(  PointDataTreeType& tree,
+                                const Indices& indices)
+        : mTree(tree)
+        , mIndices(indices) { }
+
+    /// Call compress() on each selected attribute array of every leaf in @a range.
+    void operator()(const LeafRangeT& range) const {
+        const Indices::size_type indexCount = mIndices.size();
+
+        for (typename LeafRangeT::Iterator leafIter = range.begin(); leafIter; ++leafIter) {
+            for (Indices::size_type n = 0; n < indexCount; ++n) {
+                leafIter->attributeArray(mIndices[n]).compress();
+            }
+        }
+    }
+
+    //////////
+
+    PointDataTreeType&              mTree;
+    const Indices&                  mIndices;
+}; // class BloscCompressAttributesOp
+
+
+} // namespace point_attribute_internal
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataTree>
+inline void appendAttribute(PointDataTree& tree,
+                            const AttributeSet::Util::NameAndType& newAttribute,
+                            Metadata::Ptr defaultValue,
+                            const bool hidden, const bool transient, const bool group)
+{
+    typedef AttributeSet::Util::NameAndTypeVec                    NameAndTypeVec;
+    typedef AttributeSet::Descriptor                              Descriptor;
+
+    using point_attribute_internal::AppendAttributeOp;
+
+    typename PointDataTree::LeafCIter iter = tree.cbeginLeaf();
+    // a tree without leaves is left untouched
+    if (!iter)  return;
+
+    // do not append a non-unique attribute (the name is looked up in the first leaf's descriptor)
+
+    const Descriptor& descriptor = iter->attributeSet().descriptor();
+    const size_t index = descriptor.find(newAttribute.name);
+
+    if (index != AttributeSet::INVALID_POS) {
+        OPENVDB_THROW(KeyError, "Cannot append an attribute with a non-unique name - " << newAttribute.name << ".");
+    }
+
+    // create a new attribute descriptor by duplicating the first leaf's one with the attribute appended
+    NameAndTypeVec vec;
+    vec.push_back(newAttribute);
+
+    Descriptor::Ptr newDescriptor = descriptor.duplicateAppend(vec);
+
+    // store the attribute default value in the descriptor metadata (skipped for a null defaultValue)
+
+    if (defaultValue) {
+        newDescriptor->setDefaultValue(newAttribute.name, *defaultValue);
+    }
+
+    // insert attributes using the new descriptor, visiting the leaves with tbb::parallel_for
+
+    AppendAttributeOp<PointDataTree> append(tree, newAttribute, newDescriptor, hidden, transient, group);
+    tbb::parallel_for(typename tree::template LeafManager<PointDataTree>(tree).leafRange(), append);
+}
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataTree>
+inline void dropAttributes( PointDataTree& tree,
+                            const std::vector<size_t>& indices)
+{
+    typedef typename tree::LeafManager<PointDataTree>       LeafManagerT;
+    typedef AttributeSet::Descriptor                        Descriptor;
+
+    using point_attribute_internal::DropAttributesOp;
+
+    typename PointDataTree::LeafCIter iter = tree.cbeginLeaf();
+    // a tree without leaves is left untouched
+    if (!iter)  return;
+
+    const Descriptor& descriptor = iter->attributeSet().descriptor();
+
+    // throw if position index present in the indices as this attribute is mandatory
+
+    const size_t positionIndex = descriptor.find("P");
+    if (positionIndex!= AttributeSet::INVALID_POS &&
+        std::find(indices.begin(), indices.end(), positionIndex) != indices.end()) {
+        OPENVDB_THROW(KeyError, "Cannot drop mandatory position attribute.");
+    }
+
+    // drop the attributes from every leaf using a new descriptor that no longer contains them
+
+    Descriptor::Ptr newDescriptor = descriptor.duplicateDrop(indices);
+    tbb::parallel_for(LeafManagerT(tree).leafRange(), DropAttributesOp<PointDataTree>(tree, indices, newDescriptor));
+}
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataTree>
+inline void dropAttributes( PointDataTree& tree,
+                            const std::vector<Name>& names)
+{
+    // Resolves each name to its descriptor position, then defers to the index-based overload.
+    // A tree without leaves has no attributes to drop.
+    typename PointDataTree::LeafCIter firstLeaf = tree.cbeginLeaf();
+    if (!firstLeaf)  return;
+
+    const AttributeSet::Descriptor& descriptor = firstLeaf->attributeSet().descriptor();
+
+    std::vector<size_t> positions;
+
+    for (size_t n = 0; n < names.size(); ++n) {
+        const Name& name = names[n];
+        const size_t position = descriptor.find(name);
+
+        // every requested name must exist in the descriptor
+        if (position == AttributeSet::INVALID_POS) {
+            OPENVDB_THROW(KeyError, "Cannot drop an attribute that does not exist - " << name << ".");
+        }
+        positions.push_back(position);
+    }
+
+    dropAttributes(tree, positions);
+}
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataTree>
+inline void dropAttribute(  PointDataTree& tree,
+                            const size_t& index)
+{
+    // wrap the single index in a vector and reuse the multi-attribute overload
+    const std::vector<size_t> singleIndex(1, index);
+    dropAttributes(tree, singleIndex);
+}
+
+
+template <typename PointDataTree>
+inline void dropAttribute(  PointDataTree& tree,
+                            const Name& name)
+{
+    // wrap the single name in a vector and reuse the multi-attribute overload
+    const std::vector<Name> singleName(1, name);
+    dropAttributes(tree, singleName);
+}
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataTree>
+inline void renameAttributes(   PointDataTree& tree,
+                                const std::vector<Name>& oldNames,
+                                const std::vector<Name>& newNames)
+{
+    typedef AttributeSet::Descriptor                        DescriptorT;
+    // each old name is paired with the new name at the same position
+    const size_t renameCount = oldNames.size();
+    if (renameCount != newNames.size()) {
+        OPENVDB_THROW(ValueError, "Mis-matching sizes of name vectors, cannot rename attributes.");
+    }
+
+    typename PointDataTree::LeafIter leafIter = tree.beginLeaf();
+    if (!leafIter)  return;
+
+    // validate against the first leaf and record the renames in a copy of its descriptor
+    const AttributeSet& firstSet = leafIter->attributeSet();
+    const DescriptorT& oldDescriptor = firstSet.descriptor();
+    AttributeSet::DescriptorPtr renamedDescriptor(new DescriptorT(oldDescriptor));
+
+    for (size_t n = 0; n != renameCount; ++n) {
+        const Name& from = oldNames[n];
+        const Name& to = newNames[n];
+        // the source attribute must exist ...
+        if (oldDescriptor.find(from) == AttributeSet::INVALID_POS) {
+            OPENVDB_THROW(KeyError, "Cannot find requested attribute - " << from << ".");
+        }
+        // ... and the target name must not already be present in the original descriptor
+        if (oldDescriptor.find(to) != AttributeSet::INVALID_POS) {
+            OPENVDB_THROW(KeyError, "Cannot rename attribute as new name already exists - " << to << ".");
+        }
+        // group attributes are rejected
+        const AttributeArray* sourceArray = firstSet.getConst(from);
+        assert(sourceArray);
+        if (GroupAttributeArray::isGroup(*sourceArray)) {
+            OPENVDB_THROW(KeyError, "Cannot rename group attribute - " << from << ".");
+        }
+        renamedDescriptor->rename(from, to);
+    }
+    // hand the renamed descriptor to every leaf, starting with the first
+    for (; leafIter; ++leafIter) {
+        leafIter->renameAttributes(oldDescriptor, renamedDescriptor);
+    }
+}
+
+
+template <typename PointDataTree>
+inline void renameAttribute(PointDataTree& tree,
+                            const Name& oldName,
+                            const Name& newName)
+{
+    // forward to the multi-attribute overload with one-element name lists
+    const std::vector<Name> from(1, oldName);
+    const std::vector<Name> to(1, newName);
+
+    renameAttributes(tree, from, to);
+}
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataTree>
+inline void compactAttributes(PointDataTree& tree)
+{
+    using point_attribute_internal::CompactAttributesOp;
+    typedef typename tree::LeafManager<PointDataTree>       LeafManagerT;
+
+    // nothing to do for a tree without leaves; otherwise run the op over all leaves in parallel
+    typename PointDataTree::LeafIter firstLeaf = tree.beginLeaf();
+    if (!firstLeaf)  return;
+
+    tbb::parallel_for(LeafManagerT(tree).leafRange(), CompactAttributesOp<PointDataTree>());
+}
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataTree>
+inline void bloscCompressAttribute( PointDataTree& tree,
+                                    const Name& name)
+{
+    // Applies BloscCompressAttributesOp for the named attribute to every leaf of the tree.
+    typedef AttributeSet::Descriptor                        DescriptorT;
+    typedef typename tree::LeafManager<PointDataTree>       LeafManagerT;
+
+    using point_attribute_internal::BloscCompressAttributesOp;
+
+    // without leaves there is no descriptor to consult and nothing to compress
+    typename PointDataTree::LeafCIter firstLeaf = tree.cbeginLeaf();
+    if (!firstLeaf)  return;
+
+    // look the attribute up in the first leaf's descriptor; unknown names are an error
+    const DescriptorT& descriptor = firstLeaf->attributeSet().descriptor();
+    const size_t position = descriptor.find(name);
+
+    if (position == AttributeSet::INVALID_POS) {
+        OPENVDB_THROW(KeyError, "Cannot find requested attribute - " << name << ".");
+    }
+
+    // the op takes a list of positions, so wrap the single position in a vector
+    std::vector<size_t> positions(1, position);
+
+    // run the op over all leaf nodes in parallel
+    BloscCompressAttributesOp<PointDataTree> compressOp(tree, positions);
+    tbb::parallel_for(LeafManagerT(tree).leafRange(), compressOp);
+}
+
+////////////////////////////////////////
+
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+
+#endif // OPENVDB_TOOLS_POINT_ATTRIBUTE_HAS_BEEN_INCLUDED
+
+
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb/tools/PointConversion.h b/nuparu/include/openvdb/tools/PointConversion.h
new file mode 100644
index 00000000..5593919d
--- /dev/null
+++ b/nuparu/include/openvdb/tools/PointConversion.h
@@ -0,0 +1,414 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of Double Negative Visual Effects nor the names
+// of its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @author Dan Bailey
+///
+/// @file PointConversion.h
+///
+/// @brief  Convert existing points and attributes into VDB Point Data grids and attributes.
+///
+
+
+#ifndef OPENVDB_TOOLS_POINT_CONVERSION_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_POINT_CONVERSION_HAS_BEEN_INCLUDED
+
+#include <openvdb/math/Transform.h>
+
+#include <openvdb/tools/PointIndexGrid.h>
+
+#include <openvdb_points/tools/AttributeSet.h>
+#include <openvdb_points/tools/PointDataGrid.h>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+
+/// @brief  Localises points with position into a @c PointDataGrid in two stages:
+///         allocation of the leaf attribute data and population of the positions.
+///
+/// @param  pointIndexGrid  a PointIndexGrid into the points.
+/// @param  positions       list of world space point positions.
+/// @param  positionType    the type of the position (includes compression info).
+/// @param  xform           world to index space transform.
+/// @param  positionDefaultValue metadata default position value
+///
+/// @note   The position data must be supplied in a Point-Partitioner compatible
+///         data structure. A convenience PointAttributeVector class is offered.
+///
+/// @note   The position data is populated separately to perform world space to
+///         voxel space conversion and apply quantisation.
+///
+/// @note   A @c PointIndexGrid to the points must be supplied to perform this
+///         operation. Typically this is built implicitly by the PointDataGrid constructor.
+
+template<typename PointDataGridT, typename PositionArrayT, typename PointIndexGridT>
+inline typename PointDataGridT::Ptr
+createPointDataGrid(const PointIndexGridT& pointIndexGrid, const PositionArrayT& positions,
+                    const openvdb::NamePair& positionType, const math::Transform& xform,
+                    Metadata::Ptr positionDefaultValue = Metadata::Ptr());
+
+
+/// @brief  Convenience method to create a @c PointDataGrid from a std::vector of
+///         point positions.
+///
+/// @param  positions     list of world space point positions.
+/// @param  positionType  the type of the position (includes compression info).
+/// @param  xform         world to index space transform.
+/// @param  positionDefaultValue metadata default position value
+///
+/// @note   This method implicitly wraps the std::vector for a Point-Partitioner compatible
+///         data structure and creates the required @c PointIndexGrid to the points.
+
+template <typename PointDataGridT, typename ValueT>
+inline typename PointDataGridT::Ptr
+createPointDataGrid(const std::vector<ValueT>& positions,
+                    const openvdb::NamePair& positionType, const math::Transform& xform,
+                    Metadata::Ptr positionDefaultValue = Metadata::Ptr());
+
+
+/// @brief  Stores point attribute data in an existing @c PointDataGrid attribute.
+///
+/// @param  tree            the PointDataTree to be populated.
+/// @param  pointIndexTree  a PointIndexTree into the points.
+/// @param  attributeName   the name of the VDB Points attribute to be populated.
+/// @param  data            a wrapper to the attribute data.
+///
+/// @note   A @c PointIndexGrid to the points must be supplied to perform this
+///         operation. This is required to ensure the same point index ordering.
+
+template <typename PointDataTreeT, typename PointIndexTreeT, typename PointArrayT>
+inline void
+populateAttribute(  PointDataTreeT& tree, const PointIndexTreeT& pointIndexTree,
+                    const openvdb::Name& attributeName, const PointArrayT& data);
+
+
+////////////////////////////////////////
+
+
+/// @brief Point-partitioner compatible STL vector attribute wrapper for convenience
+/// @note  Only a reference to the vector is held, so the vector must outlive the wrapper.
+template<typename ValueType>
+class PointAttributeVector {
+public:
+    typedef ValueType PosType;
+    typedef ValueType value_type;
+
+    /// Wrap @a data without copying it
+    PointAttributeVector(const std::vector<value_type>& data): mData(data) { }
+    /// Number of elements in the wrapped vector
+    size_t size() const { return mData.size(); }
+    /// Copy element @a n into @a xyz (the index is not range-checked)
+    void getPos(size_t n, ValueType& xyz) const { xyz = mData[n]; }
+    /// Copy element @a n into @a value via assignment to @c T
+    template <typename T> void get(size_t n, T& value) const { value = mData[n]; }
+private:
+    const std::vector<value_type>& mData;
+}; // PointAttributeVector
+
+
+////////////////////////////////////////
+
+
+namespace point_conversion_internal {
+
+/// @brief Functor for tbb::parallel_for that calls initializeAttributes() on each point data
+///        leaf, passing the number of indices held by the point index leaf at the same origin.
+///        Point data leaves with no matching point index leaf are skipped.
+template<typename PointDataTreeType, typename PointIndexTreeType>
+struct InitialiseAttributesOp {
+
+    typedef typename tree::LeafManager<PointDataTreeType> LeafManagerT;
+    typedef typename LeafManagerT::LeafRange LeafRangeT;
+
+    typedef typename PointIndexTreeType::LeafNodeType PointIndexLeafNode;
+    typedef typename PointIndexLeafNode::IndexArray IndexArray;
+
+    /// @note All three arguments are held by reference and must outlive this functor.
+    InitialiseAttributesOp( PointDataTreeType& tree,
+                            const PointIndexTreeType& pointIndexTree,
+                            const AttributeSet::Descriptor::Ptr& attributeDescriptor)
+        : mTree(tree)
+        , mPointIndexTree(pointIndexTree)
+        , mAttributeDescriptor(attributeDescriptor) { }
+
+    /// Initialise the attributes of every leaf in @a range that has a point index leaf
+    void operator()(const LeafRangeT& range) const {
+        for (typename LeafRangeT::Iterator leaf=range.begin(); leaf; ++leaf) {
+
+            // obtain the PointIndexLeafNode (using the origin of the current leaf)
+
+            const PointIndexLeafNode* pointIndexLeaf = mPointIndexTree.probeConstLeaf(leaf->origin());
+
+            if (!pointIndexLeaf)    continue;
+
+            // initialise the attribute storage for as many points as there are indices
+            const Index64 pointCount = pointIndexLeaf->indices().size();
+
+            leaf->initializeAttributes(mAttributeDescriptor, pointCount);
+        }
+    }
+
+    //////////
+
+    const PointDataTreeType&                mTree;
+    const PointIndexTreeType&               mPointIndexTree;
+    const AttributeSet::Descriptor::Ptr&    mAttributeDescriptor;
+};
+
+/// @brief Functor for tbb::parallel_for that writes the "P" attribute of each point data leaf
+///        from a list of world space positions, in the order given by the point index leaf.
+template<   typename PointDataTreeType,
+            typename PointIndexTreeType,
+            typename PositionListType>
+struct PopulatePositionAttributeOp {
+
+    typedef typename tree::LeafManager<PointDataTreeType> LeafManagerT;
+    typedef typename LeafManagerT::LeafRange LeafRangeT;
+
+    typedef typename PointIndexTreeType::LeafNodeType PointIndexLeafNode;
+    typedef typename PointIndexLeafNode::IndexArray IndexArray;
+
+    typedef typename PositionListType::value_type ValueType;
+
+    /// @note All three arguments are held by reference and must outlive this functor.
+    PopulatePositionAttributeOp(const PointIndexTreeType& pointIndexTree,
+                                const math::Transform& transform,
+                                const PositionListType& positions)
+        : mPointIndexTree(pointIndexTree)
+        , mTransform(transform)
+        , mPositions(positions) { }
+
+    /// Populate the positions of every leaf in @a range that has a point index leaf
+    void operator()(const typename LeafManagerT::LeafRange& range) const {
+        for (typename LeafManagerT::LeafRange::Iterator leaf=range.begin(); leaf; ++leaf) {
+            // obtain the PointIndexLeafNode (using the origin of the current leaf)
+            const PointIndexLeafNode* pointIndexLeaf = mPointIndexTree.probeConstLeaf(leaf->origin());
+            if (!pointIndexLeaf)    continue;
+
+            // no 'template' disambiguator: attributeArray() is called without template arguments
+            typename AttributeWriteHandle<Vec3f>::Ptr attributeWriteHandle =
+                AttributeWriteHandle<Vec3f>::create(leaf->attributeArray("P"));
+
+            // positions are written consecutively, following the order of the point indices
+            Index64 index = 0;
+            const IndexArray& indices = pointIndexLeaf->indices();
+
+            for (typename IndexArray::const_iterator it = indices.begin(), it_end = indices.end(); it != it_end; ++it)
+            {
+                ValueType positionWorldSpace;
+                mPositions.getPos(*it, positionWorldSpace);
+                const ValueType positionIndexSpace = mTransform.worldToIndex(positionWorldSpace);
+
+                // store the offset of the position from its rounded index space coordinate
+                const ValueType positionVoxelSpace = ValueType(
+                            positionIndexSpace.x() - math::Round(positionIndexSpace.x()),
+                            positionIndexSpace.y() - math::Round(positionIndexSpace.y()),
+                            positionIndexSpace.z() - math::Round(positionIndexSpace.z()));
+
+                attributeWriteHandle->set(index, Vec3f(positionVoxelSpace));
+                index++;
+            }
+        }
+    }
+
+    //////////
+
+    const PointIndexTreeType&   mPointIndexTree;
+    const math::Transform&      mTransform;
+    const PositionListType&     mPositions;
+};
+
+/// @brief Functor for tbb::parallel_for that copies values from an attribute list into the
+///        named attribute of each point data leaf, in the order given by the point index leaf.
+template<   typename PointDataTreeType,
+            typename PointIndexTreeType,
+            typename AttributeListType>
+struct PopulateAttributeOp {
+
+    typedef typename tree::LeafManager<PointDataTreeType> LeafManagerT;
+    typedef typename LeafManagerT::LeafRange LeafRangeT;
+
+    typedef typename PointIndexTreeType::LeafNodeType PointIndexLeafNode;
+    typedef typename PointIndexLeafNode::IndexArray IndexArray;
+
+    typedef typename AttributeListType::value_type ValueType;
+
+    /// @note All three arguments are held by reference and must outlive this functor.
+    PopulateAttributeOp(const PointIndexTreeType& pointIndexTree,
+                        const AttributeListType& data,
+                        const openvdb::Name& attributeName)
+        : mPointIndexTree(pointIndexTree)
+        , mData(data)
+        , mAttributeName(attributeName) { }
+
+    /// Populate the attribute of every leaf in @a range that has a point index leaf
+    void operator()(const LeafRangeT& range) const {
+        typedef typename AttributeWriteHandle<ValueType>::Ptr WriteHandlePtr;
+        typedef typename IndexArray::const_iterator IndexCIter;
+
+        for (typename LeafRangeT::Iterator leaf=range.begin(); leaf; ++leaf) {
+            // find the point index leaf at the origin of the current leaf, if there is one
+            const PointIndexLeafNode* indexLeaf = mPointIndexTree.probeConstLeaf(leaf->origin());
+            if (!indexLeaf)    continue;
+
+            WriteHandlePtr handle =
+                AttributeWriteHandle<ValueType>::create(leaf->attributeArray(mAttributeName));
+
+            // values are written consecutively, following the order of the point indices
+            const IndexArray& pointIndices = indexLeaf->indices();
+            Index64 target = 0;
+            for (IndexCIter src = pointIndices.begin(), srcEnd = pointIndices.end(); src != srcEnd; ++src)
+            {
+                ValueType value;
+                mData.template get<ValueType>(*src, value);
+                handle->set(target, value);
+                ++target;
+            }
+        }
+    }
+
+    //////////
+
+    const PointIndexTreeType&   mPointIndexTree;
+    const AttributeListType&    mData;
+    const openvdb::Name&        mAttributeName;
+};
+
+} // namespace point_conversion_internal
+
+
+////////////////////////////////////////
+
+
+template<typename PointDataGridT, typename PositionArrayT, typename PointIndexGridT>
+inline typename PointDataGridT::Ptr
+createPointDataGrid(const PointIndexGridT& pointIndexGrid, const PositionArrayT& positions,
+                    const openvdb::NamePair& positionType, const math::Transform& xform,
+                    Metadata::Ptr positionDefaultValue)
+{
+    using point_conversion_internal::InitialiseAttributesOp;
+    using point_conversion_internal::PopulatePositionAttributeOp;
+
+    typedef typename PointDataGridT::TreeType                       DataTreeT;
+    typedef typename PointIndexGridT::TreeType                      IndexTreeT;
+    typedef typename tree::template LeafManager<DataTreeT>          LeafManagerT;
+    typedef typename LeafManagerT::LeafRange                        LeafRangeT;
+    typedef InitialiseAttributesOp<DataTreeT, IndexTreeT>           InitialiseOpT;
+    typedef PopulatePositionAttributeOp<DataTreeT,
+                                        IndexTreeT,
+                                        PositionArrayT>             PopulateOpT;
+
+    // construct the Tree using a topology copy of the PointIndexGrid; both parallel
+    // passes below run over the leaves of this new tree
+
+    const IndexTreeT& indexTree = pointIndexGrid.tree();
+    typename DataTreeT::Ptr dataTree(new DataTreeT(indexTree));
+
+    LeafManagerT leafManager(*dataTree);
+    LeafRangeT leaves = leafManager.leafRange();
+
+    // create the attribute descriptor from the position type and, if one was provided,
+    // register the default value for the position attribute
+
+    AttributeSet::Descriptor::Ptr descriptor = AttributeSet::Descriptor::create(positionType);
+    if (positionDefaultValue) {
+        descriptor->setDefaultValue("P", *positionDefaultValue);
+    }
+
+    // stage one: create point attribute storage on each leaf
+
+    InitialiseOpT initialiseOp(*dataTree, indexTree, descriptor);
+    tbb::parallel_for(leaves, initialiseOp);
+
+    // stage two: populate the position attribute
+
+    PopulateOpT populateOp(indexTree, xform, positions);
+    tbb::parallel_for(leaves, populateOp);
+
+    // wrap the tree in a grid carrying a copy of the transform
+    typename PointDataGridT::Ptr grid = PointDataGridT::create(dataTree);
+    grid->setTransform(xform.copy());
+    return grid;
+}
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataGridT, typename ValueT>
+inline typename PointDataGridT::Ptr
+createPointDataGrid(const std::vector<ValueT>& positions,
+                    const openvdb::NamePair& positionType,
+                    const math::Transform& xform,
+                    Metadata::Ptr positionDefaultValue)
+{
+    // wrap the vector, build a point index grid for it, then defer to the main overload
+    const PointAttributeVector<ValueT> wrappedPositions(positions);
+    PointIndexGrid::Ptr indexGrid = createPointIndexGrid<PointIndexGrid>(wrappedPositions, xform);
+    return createPointDataGrid<PointDataGridT>(*indexGrid, wrappedPositions, positionType, xform, positionDefaultValue);
+}
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataTreeT, typename PointIndexTreeT, typename PointArrayT>
+inline void
+populateAttribute(  PointDataTreeT& tree, const PointIndexTreeT& pointIndexTree,
+                    const openvdb::Name& attributeName, const PointArrayT& data)
+{
+    using point_conversion_internal::PopulateAttributeOp;
+
+    // the leaf manager must be built on the caller's tree type (PointDataTreeT) so that its
+    // leaf range matches the range type PopulateAttributeOp is instantiated for
+    typedef typename tree::template LeafManager<PointDataTreeT> LeafManagerT;
+
+    // populate the attribute on every leaf in parallel
+    PopulateAttributeOp<PointDataTreeT, PointIndexTreeT, PointArrayT> populate(pointIndexTree, data, attributeName);
+    tbb::parallel_for(LeafManagerT(tree).leafRange(), populate);
+}
+
+
+////////////////////////////////////////
+
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+
+#endif // OPENVDB_TOOLS_POINT_CONVERSION_HAS_BEEN_INCLUDED
+
+
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb/tools/PointCount.h b/nuparu/include/openvdb/tools/PointCount.h
new file mode 100644
index 00000000..a3ada00e
--- /dev/null
+++ b/nuparu/include/openvdb/tools/PointCount.h
@@ -0,0 +1,277 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of Double Negative Visual Effects nor the names
+// of its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @author Dan Bailey
+///
+/// @file PointCount.h
+///
+/// @brief  Various point counting methods using a VDB Point Grid.
+///
+
+
+#ifndef OPENVDB_TOOLS_POINT_COUNT_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_POINT_COUNT_HAS_BEEN_INCLUDED
+
+#include <openvdb/openvdb.h>
+
+#include <openvdb_points/tools/AttributeSet.h>
+#include <openvdb_points/tools/PointDataGrid.h>
+#include <openvdb_points/tools/PointAttribute.h>
+
+#include <boost/ptr_container/ptr_vector.hpp>
+
+#include <tbb/parallel_reduce.h>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+
+/// @brief Total points in the PointDataTree
+/// @param tree PointDataTree.
+/// @param inCoreOnly if true, points in out-of-core leaf nodes are not counted
+template <typename PointDataTreeT>
+Index64 pointCount(const PointDataTreeT& tree, const bool inCoreOnly = false);
+
+
+/// @brief Total active points in the PointDataTree
+/// @param tree PointDataTree.
+/// @param inCoreOnly if true, points in out-of-core leaf nodes are not counted
+template <typename PointDataTreeT>
+Index64 activePointCount(const PointDataTreeT& tree, const bool inCoreOnly = false);
+
+
+/// @brief Total inactive points in the PointDataTree
+/// @param tree PointDataTree.
+/// @param inCoreOnly if true, points in out-of-core leaf nodes are not counted
+template <typename PointDataTreeT>
+Index64 inactivePointCount(const PointDataTreeT& tree, const bool inCoreOnly = false);
+
+
+/// @brief Total points in the group in the PointDataTree
+/// @param tree PointDataTree.
+/// @param name group name.
+/// @param inCoreOnly if true, points in out-of-core leaf nodes are not counted
+template <typename PointDataTreeT>
+Index64 groupPointCount(const PointDataTreeT& tree, const Name& name, const bool inCoreOnly = false);
+
+
+/// @brief Total active points in the group in the PointDataTree
+/// @param tree PointDataTree.
+/// @param name group name.
+/// @param inCoreOnly if true, points in out-of-core leaf nodes are not counted
+template <typename PointDataTreeT>
+Index64 activeGroupPointCount(const PointDataTreeT& tree, const Name& name, const bool inCoreOnly = false);
+
+
+/// @brief Total inactive points in the group in the PointDataTree
+/// @param tree PointDataTree.
+/// @param name group name.
+/// @param inCoreOnly if true, points in out-of-core leaf nodes are not counted
+template <typename PointDataTreeT>
+Index64 inactiveGroupPointCount(const PointDataTreeT& tree, const Name& name, const bool inCoreOnly = false);
+
+
+////////////////////////////////////////
+
+
+namespace point_count_internal {
+
+/// @brief Reduction functor for tbb::parallel_reduce that accumulates iterCount() over the
+///        filtered index iterator of each leaf in a range.
+template <  typename PointDataTreeT,
+            typename ValueIterT,
+            typename FilterT>
+struct PointCountOp
+{
+    typedef typename tree::LeafManager<const PointDataTreeT>    LeafManagerT;
+    typedef IndexIterTraits<PointDataTreeT, ValueIterT>         IndexIteratorFromLeafT;
+    typedef typename IndexIteratorFromLeafT::Iterator           IndexIterator;
+    typedef typename FilterT::Data                              FilterDataT;
+    typedef FilterIndexIter<IndexIterator, FilterT>             Iterator;
+
+    /// @note @a filterData is held by reference and must outlive this functor.
+    PointCountOp(const FilterDataT& filterData, const bool inCoreOnly = false)
+        : mFilterData(filterData), mInCoreOnly(inCoreOnly) { }
+
+    /// Add the filtered count of every leaf in @a range to the running total @a size
+    Index64 operator()(const typename LeafManagerT::LeafRange& range, Index64 size) const {
+        typedef typename LeafManagerT::LeafRange::Iterator LeafIterT;
+        for (LeafIterT leaf = range.begin(); leaf; ++leaf) {
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+            // out-of-core leaves are skipped when only in-core points are requested
+            if (mInCoreOnly && leaf->buffer().isOutOfCore())     continue;
+#endif
+            IndexIterator leafIndices(IndexIteratorFromLeafT::begin(*leaf));
+            FilterT leafFilter(FilterT::create(*leaf, mFilterData));
+            Iterator filtered(leafIndices, leafFilter);
+            size += iterCount(filtered);
+        }
+        return size;
+    }
+
+    static Index64 join(Index64 size1, Index64 size2) { return size1 + size2; } ///< sum two partial totals
+
+private:
+    const FilterDataT& mFilterData;
+    const bool mInCoreOnly;
+}; // struct PointCountOp
+
+
+template <typename PointDataTreeT, typename FilterT, typename ValueIterT>
+Index64 threadedFilterPointCount(   const PointDataTreeT& tree,
+                                    const typename FilterT::Data& filter,
+                                    const bool inCoreOnly = false)
+{
+    typedef point_count_internal::PointCountOp< PointDataTreeT, ValueIterT, FilterT> CountOpT;
+    typedef typename tree::LeafManager<const PointDataTreeT> LeafManagerT;
+    // reduce the per-leaf filtered counts in parallel, starting from zero
+    LeafManagerT leaves(tree);
+    return tbb::parallel_reduce(leaves.leafRange(), Index64(0), CountOpT(filter, inCoreOnly), CountOpT::join);
+}
+
+
+template <typename PointDataTreeT, typename FilterT>
+Index64 filterPointCount(const PointDataTreeT& tree, const typename FilterT::Data& filter,
+                         const bool inCoreOnly = false)
+{
+    // delegate to the threaded count, iterating each leaf with its ValueAllCIter
+    typedef typename PointDataTreeT::LeafNodeType LeafT;
+    return threadedFilterPointCount<PointDataTreeT, FilterT, typename LeafT::ValueAllCIter>(tree, filter, inCoreOnly);
+}
+
+
+template <typename PointDataTreeT, typename FilterT>
+Index64 filterActivePointCount( const PointDataTreeT& tree, const typename FilterT::Data& filter,
+                                const bool inCoreOnly = false)
+{
+    // delegate to the threaded count, iterating each leaf with its ValueOnCIter
+    typedef typename PointDataTreeT::LeafNodeType LeafT;
+    return threadedFilterPointCount<PointDataTreeT, FilterT, typename LeafT::ValueOnCIter>(tree, filter, inCoreOnly);
+}
+
+
+template <typename PointDataTreeT, typename FilterT>
+Index64 filterInactivePointCount(   const PointDataTreeT& tree, const typename FilterT::Data& filter,
+                                    const bool inCoreOnly = false)
+{
+    // delegate to the threaded count, iterating each leaf with its ValueOffCIter
+    typedef typename PointDataTreeT::LeafNodeType LeafT;
+    return threadedFilterPointCount<PointDataTreeT, FilterT, typename LeafT::ValueOffCIter>(tree, filter, inCoreOnly);
+}
+
+
+} // namespace point_count_internal
+
+
+template <typename PointDataTreeT>
+Index64 pointCount(const PointDataTreeT& tree, const bool inCoreOnly)
+{
+    (void) inCoreOnly; // otherwise unused when OPENVDB_2_ABI_COMPATIBLE is defined
+    Index64 total = 0;
+    for (typename PointDataTreeT::LeafCIter leaf = tree.cbeginLeaf(); leaf; ++leaf) {
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+        if (inCoreOnly && leaf->buffer().isOutOfCore())     continue;
+#endif
+        total += leaf->pointCount();
+    }
+    return total;
+}
+
+
+template <typename PointDataTreeT>
+Index64 activePointCount(const PointDataTreeT& tree, const bool inCoreOnly)
+{
+    (void) inCoreOnly; // otherwise unused when OPENVDB_2_ABI_COMPATIBLE is defined
+    Index64 total = 0;
+    for (typename PointDataTreeT::LeafCIter leaf = tree.cbeginLeaf(); leaf; ++leaf) {
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+        if (inCoreOnly && leaf->buffer().isOutOfCore())     continue;
+#endif
+        total += leaf->onPointCount();
+    }
+    return total;
+}
+
+
+template <typename PointDataTreeT>
+Index64 inactivePointCount(const PointDataTreeT& tree, const bool inCoreOnly)
+{
+    (void) inCoreOnly; // otherwise unused when OPENVDB_2_ABI_COMPATIBLE is defined
+    Index64 total = 0;
+    for (typename PointDataTreeT::LeafCIter leaf = tree.cbeginLeaf(); leaf; ++leaf) {
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+        if (inCoreOnly && leaf->buffer().isOutOfCore())     continue;
+#endif
+        total += leaf->offPointCount();
+    }
+    return total;
+}
+
+
+template <typename PointDataTreeT>
+Index64 groupPointCount(const PointDataTreeT& tree, const Name& name, const bool inCoreOnly)
+{
+    const GroupFilter::Data filterData(name);   // GroupFilter data built from the group name
+    return point_count_internal::filterPointCount<PointDataTreeT, GroupFilter>(tree, filterData, inCoreOnly);
+}
+
+
+template <typename PointDataTreeT>
+Index64 activeGroupPointCount(const PointDataTreeT& tree, const Name& name, const bool inCoreOnly)
+{
+    const GroupFilter::Data filterData(name);   // GroupFilter data built from the group name
+    return point_count_internal::filterActivePointCount<PointDataTreeT, GroupFilter>(tree, filterData, inCoreOnly);
+}
+
+
+template <typename PointDataTreeT>
+Index64 inactiveGroupPointCount(const PointDataTreeT& tree, const Name& name, const bool inCoreOnly)
+{
+    const GroupFilter::Data filterData(name);   // GroupFilter data built from the group name
+    return point_count_internal::filterInactivePointCount<PointDataTreeT, GroupFilter>(tree, filterData, inCoreOnly);
+}
+
+
+////////////////////////////////////////
+
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+
+#endif // OPENVDB_TOOLS_POINT_COUNT_HAS_BEEN_INCLUDED
+
+
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb/tools/PointDataGrid.h b/nuparu/include/openvdb/tools/PointDataGrid.h
new file mode 100644
index 00000000..2a691cfd
--- /dev/null
+++ b/nuparu/include/openvdb/tools/PointDataGrid.h
@@ -0,0 +1,920 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of Double Negative Visual Effects nor the names
+// of its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @author Dan Bailey, Nick Avramoussis, Matt Warner
+///
+/// @file PointDataGrid.h
+///
+/// @brief  Attribute-owned data structure for points. Point attributes are
+///         stored in leaf nodes and ordered by voxel for fast random and
+///         sequential access.
+///
+
+
+#ifndef OPENVDB_TOOLS_POINT_DATA_GRID_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_POINT_DATA_GRID_HAS_BEEN_INCLUDED
+
+#include <openvdb/Grid.h>
+#include <openvdb/tree/Tree.h>
+#include <openvdb/tree/LeafNode.h>
+
+#include <openvdb/tools/PointIndexGrid.h>
+
+#include <openvdb_points/tools/AttributeSet.h>
+#include <openvdb_points/tools/AttributeGroup.h>
+
+#include <utility> // std::pair, std::make_pair
+
+
+class TestPointDataLeaf;
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+
+// forward declaration
+namespace tree {
+    template<Index, typename> struct SameLeafConfig;
+}
+
+
+////////////////////////////////////////
+
+
+namespace tools {
+
+
+// forward declaration
+template<typename T, Index Log2Dim> class PointDataLeafNode;
+
+/// @brief Point data tree configured to match the default VDB configurations.
+typedef tree::Tree<tree::RootNode<tree::InternalNode<tree::InternalNode
+    <PointDataLeafNode<PointDataIndex32, 3>, 4>, 5> > > PointDataTree;
+
+
+/// @brief Point data grid.
+typedef Grid<PointDataTree> PointDataGrid;
+
+
+////////////////////////////////////////
+
+// Internal utilities
+namespace point_data_grid_internal {
+/// @brief Selects std::unique_ptr<T> when OPENVDB_HAS_CXX11 is defined, std::auto_ptr<T> otherwise.
+template<typename T>
+struct UniquePtr
+{
+#ifdef OPENVDB_HAS_CXX11
+    typedef std::unique_ptr<T>  type;
+#else
+    typedef std::auto_ptr<T>    type; // fallback when OPENVDB_HAS_CXX11 is not defined
+#endif
+};
+} // namespace point_data_grid_internal
+
+
+template <typename T, Index Log2Dim>
+class PointDataLeafNode : public tree::LeafNode<T, Log2Dim> {
+
+public:
+    typedef PointDataLeafNode<T, Log2Dim>           LeafNodeType;
+    typedef boost::shared_ptr<PointDataLeafNode>    Ptr;
+
+    typedef T                                       ValueType;
+    typedef std::pair<ValueType, ValueType>         ValueTypePair;
+    typedef std::vector<ValueType>                  IndexArray;
+
+    typedef AttributeSet::Descriptor                Descriptor;
+
+    ////////////////////////////////////////
+
+    // The following methods had to be copied from the LeafNode class
+    // to make the derived PointDataLeafNode class compatible with the tree structure.
+
+    typedef tree::LeafNode<T, Log2Dim>    BaseLeaf;
+    typedef util::NodeMask<Log2Dim> NodeMaskType;
+
+    using BaseLeaf::LOG2DIM;
+    using BaseLeaf::TOTAL;
+    using BaseLeaf::DIM;
+    using BaseLeaf::NUM_VALUES;
+    using BaseLeaf::NUM_VOXELS;
+    using BaseLeaf::SIZE;
+    using BaseLeaf::LEVEL;
+
+    /// Default constructor
+    PointDataLeafNode()
+        : BaseLeaf()
+        , mAttributeSet(new AttributeSet) { }
+
+    ~PointDataLeafNode() { }
+
+    /// Construct using deep copy of other PointDataLeafNode
+    explicit PointDataLeafNode(const PointDataLeafNode& other)
+        : BaseLeaf(other)
+        , mAttributeSet(new AttributeSet(*other.mAttributeSet)) { }
+
+    /// Construct using supplied origin, value and active status
+    explicit
+    PointDataLeafNode(const Coord& coords, const T& value = zeroVal<T>(), bool active = false)
+        : BaseLeaf(coords, value, active)
+        , mAttributeSet(new AttributeSet) { }
+
+    /// Construct using supplied origin, value and active status
+    /// use attribute map from another PointDataLeafNode
+    PointDataLeafNode(const PointDataLeafNode& other, const Coord& coords, const T& value = zeroVal<T>(), bool active = false)
+        : BaseLeaf(coords, value, active)
+        , mAttributeSet(new AttributeSet(*other.mAttributeSet)) { }
+
+    // Copy-construct from a PointIndexLeafNode with the same configuration but a different ValueType.
+    template<typename OtherValueType>
+    PointDataLeafNode(const tools::PointIndexLeafNode<OtherValueType, Log2Dim>& other)
+        : BaseLeaf(other)
+        , mAttributeSet(new AttributeSet) { }
+
+    // Copy-construct from a LeafNode with the same configuration but a different ValueType.
+    // Used for topology copies - explicitly sets the value (background) to zeroVal
+    template <typename ValueType>
+    PointDataLeafNode(const tree::LeafNode<ValueType, Log2Dim>& other, const T& /*value*/, TopologyCopy)
+        : BaseLeaf(other, zeroVal<T>(), TopologyCopy())
+        , mAttributeSet(new AttributeSet) { }
+
+    // Copy-construct from a LeafNode with the same configuration but a different ValueType.
+    // Used for topology copies - explicitly sets the on and off value (background) to zeroVal
+    template <typename ValueType>
+    PointDataLeafNode(const tree::LeafNode<ValueType, Log2Dim>& other, const T& /*offValue*/, const T& /*onValue*/, TopologyCopy)
+        : BaseLeaf(other, zeroVal<T>(), zeroVal<T>(), TopologyCopy())
+        , mAttributeSet(new AttributeSet) { }
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    PointDataLeafNode(PartialCreate, const Coord& coords,
+        const T& value = zeroVal<T>(), bool active = false)
+        : BaseLeaf(PartialCreate(), coords, value, active)
+        , mAttributeSet(new AttributeSet) { }
+#endif
+
+public:
+
+    /// Retrieve the attribute set.
+    const AttributeSet& attributeSet() const { return *mAttributeSet; }
+
+    /// @brief Create a new attribute set. Existing attributes will be removed.
+    void initializeAttributes(const Descriptor::Ptr& descriptor, const size_t arrayLength);
+    /// @brief Clear the attribute set.
+    void clearAttributes(const bool updateValueMask = true);
+
+    /// @brief Returns @c true if an attribute with this index exists.
+    /// @param pos Index of the attribute
+    bool hasAttribute(const size_t pos) const;
+    /// @brief Returns @c true if an attribute with this name exists.
+    /// @param attributeName    Name of the attribute
+    bool hasAttribute(const Name& attributeName) const;
+
+    /// @brief Append an attribute to the leaf.
+    /// @param attribute Name and type of the attribute to append.
+    /// @param expected Existing descriptor is expected to match this parameter.
+    /// @param replacement New descriptor to replace the existing one.
+    AttributeArray::Ptr appendAttribute(const AttributeSet::Util::NameAndType& attribute,
+                                        const Descriptor& expected, Descriptor::Ptr& replacement);
+    /// @brief Drop list of attributes.
+    /// @param pos vector of attribute indices to drop
+    /// @param expected Existing descriptor is expected to match this parameter.
+    /// @param replacement New descriptor to replace the existing one.
+    void dropAttributes(const std::vector<size_t>& pos,
+                        const Descriptor& expected, Descriptor::Ptr& replacement);
+    /// @brief Reorder attribute set.
+    /// @param replacement New descriptor to replace the existing one.
+    void reorderAttributes(const Descriptor::Ptr& replacement);
+    /// @brief Rename attributes in attribute set (order must remain the same).
+    /// @param expected Existing descriptor is expected to match this parameter.
+    /// @param replacement New descriptor to replace the existing one.
+    void renameAttributes(const Descriptor& expected, Descriptor::Ptr& replacement);
+    /// @brief Compact all attributes in attribute set.
+    void compactAttributes();
+
+    /// @brief Swap the underlying attribute set with the given @a attributeSet.
+    /// This leaf will assume ownership of the given attribute set. The descriptors must
+    /// match and the voxel offsets values will need updating if the point order is different.
+    void swap(AttributeSet* attributeSet);
+
+    /// @brief Sets all of the voxel offset values on this leaf, from the given vector
+    /// of @a offsets. If @a updateValueMask is true, then the active value mask will
+    /// be updated so voxels with points are active and empty voxels are inactive.
+    void setOffsets(const std::vector<ValueType>& offsets, const bool updateValueMask = true);
+
+    /// @brief Throws an error if the voxel values on this leaf are not monotonically
+    /// increasing or within the bounds of the attribute arrays
+    void validateOffsets() const;
+
+    //@{
+    /// @brief Attribute array at index @a pos (mutable only through the non-const overload); throws LookupError if @a pos is out of range.
+    AttributeArray& attributeArray(const size_t pos);
+    const AttributeArray& attributeArray(const size_t pos) const;
+    const AttributeArray& constAttributeArray(const size_t pos) const;
+    //@}
+    //@{
+    /// @brief Attribute array called @a attributeName (mutable only through the non-const overload); throws LookupError if it is not found.
+    AttributeArray& attributeArray(const Name& attributeName);
+    const AttributeArray& attributeArray(const Name& attributeName) const;
+    const AttributeArray& constAttributeArray(const Name& attributeName) const;
+    //@}
+
+    /// @brief Read-only group handle from group index
+    GroupHandle groupHandle(const AttributeSet::Descriptor::GroupIndex& index) const;
+    /// @brief Read-only group handle from group name
+    GroupHandle groupHandle(const Name& group) const;
+    /// @brief Read-write group handle from group index
+    GroupWriteHandle groupWriteHandle(const AttributeSet::Descriptor::GroupIndex& index);
+    /// @brief Read-write group handle from group name
+    GroupWriteHandle groupWriteHandle(const Name& name);
+
+    /// @brief Compute the total point count for the leaf
+    Index64 pointCount() const;
+    /// @brief Compute the total active (on) point count for the leaf
+    Index64 onPointCount() const;
+    /// @brief Compute the total inactive (off) point count for the leaf
+    Index64 offPointCount() const;
+    /// @brief Compute the point count in a specific group for the leaf
+    Index64 groupPointCount(const Name& groupName) const;
+
+    /// @brief Activate voxels with non-zero points, deactivate voxels with zero points.
+    void updateValueMask();
+
+    ////////////////////////////////////////
+
+    void setOffsetOn(Index offset, const ValueType& val);
+    void setOffsetOnly(Index offset, const ValueType& val);
+
+    /// @brief Return @c true if the given node (which may have a different @c ValueType
+    /// than this node) has the same active value topology as this node.
+    template<typename OtherType, Index OtherLog2Dim>
+    bool hasSameTopology(const PointDataLeafNode<OtherType, OtherLog2Dim>* other) const {
+        return BaseLeaf::hasSameTopology(other);
+    }
+
+    /// Equality: the base leaf comparison (BaseLeaf::operator==) must hold first;
+    /// only when it does are the two attribute sets compared by value.
+    bool operator==(const PointDataLeafNode& other) const {
+        const bool sameBase = BaseLeaf::operator==(other);
+        return sameBase && (*mAttributeSet == *other.mAttributeSet);
+    }
+
+    bool operator!=(const PointDataLeafNode& other) const { return !(other == *this); }
+
+    void addLeaf(PointDataLeafNode*) {}
+    template<typename AccessorT>
+    void addLeafAndCache(PointDataLeafNode*, AccessorT&) {}
+
+    //@{
+    /// @brief Return a pointer to this node.
+    PointDataLeafNode* touchLeaf(const Coord&) { return this; }
+    template<typename AccessorT>
+    PointDataLeafNode* touchLeafAndCache(const Coord&, AccessorT&) { return this; }
+
+    template<typename NodeT, typename AccessorT>
+    NodeT* probeNodeAndCache(const Coord&, AccessorT&)
+    {
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if (!(boost::is_same<NodeT,PointDataLeafNode>::value)) return NULL;
+        return reinterpret_cast<NodeT*>(this);
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+    PointDataLeafNode* probeLeaf(const Coord&) { return this; }
+    template<typename AccessorT>
+    PointDataLeafNode* probeLeafAndCache(const Coord&, AccessorT&) { return this; }
+    //@}
+
+    //@{
+    /// @brief Return a @c const pointer to this node.
+    const PointDataLeafNode* probeConstLeaf(const Coord&) const { return this; }
+    template<typename AccessorT>
+    const PointDataLeafNode* probeConstLeafAndCache(const Coord&, AccessorT&) const { return this; }
+    template<typename AccessorT>
+    const PointDataLeafNode* probeLeafAndCache(const Coord&, AccessorT&) const { return this; }
+    const PointDataLeafNode* probeLeaf(const Coord&) const { return this; }
+    template<typename NodeT, typename AccessorT>
+    const NodeT* probeConstNodeAndCache(const Coord&, AccessorT&) const
+    {
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if (!(boost::is_same<NodeT,PointDataLeafNode>::value)) return NULL;
+        return reinterpret_cast<const NodeT*>(this);
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+    //@}
+
+    // I/O methods
+
+    void readTopology(std::istream& is, bool fromHalf = false);
+    void writeTopology(std::ostream& os, bool toHalf = false) const;
+
+    void readBuffers(std::istream& is, bool fromHalf = false);
+    void readBuffers(std::istream& is, const CoordBBox&, bool fromHalf = false);
+    void writeBuffers(std::ostream& os, bool toHalf = false) const;
+
+
+    Index64 memUsage() const;
+
+    ////////////////////////////////////////
+
+    // Disable all write methods to avoid unintentional changes
+    // to the point-array offsets.
+
+    void assertNonmodifiable() {
+        assert(false && "Cannot modify voxel values in a PointDataTree.");
+    }
+
+    void setActiveState(const Coord& xyz, bool on) { BaseLeaf::setActiveState(xyz, on); }
+    void setActiveState(Index offset, bool on) { BaseLeaf::setActiveState(offset, on); }
+
+    void setValueOnly(const Coord&, const ValueType&) { assertNonmodifiable(); }
+    void setValueOnly(Index, const ValueType&) { assertNonmodifiable(); }
+
+    void setValueOff(const Coord& xyz) { BaseLeaf::setValueOff(xyz); }
+    void setValueOff(Index offset) { BaseLeaf::setValueOff(offset); }
+
+    void setValueOff(const Coord&, const ValueType&) { assertNonmodifiable(); }
+    void setValueOff(Index, const ValueType&) { assertNonmodifiable(); }
+
+    void setValueOn(const Coord& xyz) { BaseLeaf::setValueOn(xyz); }
+    void setValueOn(Index offset) {  BaseLeaf::setValueOn(offset); }
+
+    void setValueOn(const Coord&, const ValueType&) { assertNonmodifiable(); }
+    void setValueOn(Index, const ValueType&) { assertNonmodifiable(); }
+
+    void setValue(const Coord&, const ValueType&) { assertNonmodifiable(); }
+
+    void setValuesOn() { BaseLeaf::setValuesOn(); }
+    void setValuesOff() { BaseLeaf::setValuesOff(); }
+
+    template<typename ModifyOp>
+    void modifyValue(Index, const ModifyOp&) { assertNonmodifiable(); }
+
+    template<typename ModifyOp>
+    void modifyValue(const Coord&, const ModifyOp&) { assertNonmodifiable(); }
+
+    template<typename ModifyOp>
+    void modifyValueAndActiveState(const Coord&, const ModifyOp&) { assertNonmodifiable(); }
+
+    void clip(const CoordBBox&, const ValueType&) { assertNonmodifiable(); }
+
+    void fill(const CoordBBox&, const ValueType&, bool) { assertNonmodifiable(); }
+    void fill(const ValueType&) {} // NOTE(review): silent no-op, unlike the asserting overloads -- confirm this is intended
+    void fill(const ValueType&, bool) { assertNonmodifiable(); }
+
+    template<typename AccessorT>
+    void setValueOnlyAndCache(const Coord&, const ValueType&, AccessorT&) {assertNonmodifiable();}
+
+    template<typename ModifyOp, typename AccessorT>
+    void modifyValueAndActiveStateAndCache(const Coord&, const ModifyOp&, AccessorT&) {
+        assertNonmodifiable();
+    }
+
+    template<typename AccessorT>
+    void setValueOffAndCache(const Coord&, const ValueType&, AccessorT&) { assertNonmodifiable(); }
+
+    template<typename AccessorT>
+    void setActiveStateAndCache(const Coord& xyz, bool on, AccessorT& parent) { BaseLeaf::setActiveStateAndCache(xyz, on, parent); }
+
+    void resetBackground(const ValueType&, const ValueType&) { assertNonmodifiable(); }
+
+    void signedFloodFill(const ValueType&) { assertNonmodifiable(); }
+    void signedFloodFill(const ValueType&, const ValueType&) { assertNonmodifiable(); }
+
+    void negate() { assertNonmodifiable(); }
+
+    friend class ::TestPointDataLeaf;
+
+    typedef typename BaseLeaf::ValueOn ValueOn;
+    typedef typename BaseLeaf::ValueOff ValueOff;
+    typedef typename BaseLeaf::ValueAll ValueAll;
+
+private:
+    point_data_grid_internal::UniquePtr<AttributeSet>::type mAttributeSet;
+
+protected:
+    typedef typename BaseLeaf::ChildOn ChildOn;
+    typedef typename BaseLeaf::ChildOff ChildOff;
+    typedef typename BaseLeaf::ChildAll ChildAll;
+
+    typedef typename NodeMaskType::OnIterator    MaskOnIterator;
+    typedef typename NodeMaskType::OffIterator   MaskOffIterator;
+    typedef typename NodeMaskType::DenseIterator MaskDenseIterator;
+
+    // During topology-only construction, access is needed
+    // to protected/private members of other template instances.
+    template<typename, Index> friend class PointDataLeafNode;
+
+    friend class tree::IteratorBase<MaskOnIterator, PointDataLeafNode>;
+    friend class tree::IteratorBase<MaskOffIterator, PointDataLeafNode>;
+    friend class tree::IteratorBase<MaskDenseIterator, PointDataLeafNode>;
+
+public:
+
+    typedef typename BaseLeaf::template ValueIter<
+        MaskOnIterator, PointDataLeafNode, const ValueType, ValueOn> ValueOnIter;
+    typedef typename BaseLeaf::template ValueIter<
+        MaskOnIterator, const PointDataLeafNode, const ValueType, ValueOn> ValueOnCIter;
+    typedef typename BaseLeaf::template ValueIter<
+        MaskOffIterator, PointDataLeafNode, const ValueType, ValueOff> ValueOffIter;
+    typedef typename BaseLeaf::template ValueIter<
+        MaskOffIterator,const PointDataLeafNode,const ValueType,ValueOff> ValueOffCIter;
+    typedef typename BaseLeaf::template ValueIter<
+        MaskDenseIterator, PointDataLeafNode, const ValueType, ValueAll> ValueAllIter;
+    typedef typename BaseLeaf::template ValueIter<
+        MaskDenseIterator,const PointDataLeafNode,const ValueType,ValueAll> ValueAllCIter;
+    typedef typename BaseLeaf::template ChildIter<
+        MaskOnIterator, PointDataLeafNode, ChildOn> ChildOnIter;
+    typedef typename BaseLeaf::template ChildIter<
+        MaskOnIterator, const PointDataLeafNode, ChildOn> ChildOnCIter;
+    typedef typename BaseLeaf::template ChildIter<
+        MaskOffIterator, PointDataLeafNode, ChildOff> ChildOffIter;
+    typedef typename BaseLeaf::template ChildIter<
+        MaskOffIterator, const PointDataLeafNode, ChildOff> ChildOffCIter;
+    typedef typename BaseLeaf::template DenseIter<
+        PointDataLeafNode, ValueType, ChildAll> ChildAllIter;
+    typedef typename BaseLeaf::template DenseIter<
+        const PointDataLeafNode, const ValueType, ChildAll> ChildAllCIter;
+
+    typedef openvdb::tools::IndexIter IndexIter;
+    typedef ValueIndexIter<ValueOnCIter> IndexOnIter;
+    typedef ValueIndexIter<ValueOffCIter> IndexOffIter;
+
+    /// @brief Leaf index iterator
+    IndexIter beginIndexAll() const;
+    IndexOnIter beginIndexOn() const;
+    IndexOffIter beginIndexOff() const;
+    /// @brief Leaf index iterator from voxel
+    IndexIter beginIndex(const unsigned index) const;
+    IndexIter beginIndex(const Coord& ijk) const;
+
+#define VMASK_ this->getValueMask()
+    ValueOnCIter  cbeginValueOn() const  { return ValueOnCIter(VMASK_.beginOn(), this); }
+    ValueOnCIter   beginValueOn() const  { return ValueOnCIter(VMASK_.beginOn(), this); }
+    ValueOnIter    beginValueOn()        { return ValueOnIter(VMASK_.beginOn(), this); }
+    ValueOffCIter cbeginValueOff() const { return ValueOffCIter(VMASK_.beginOff(), this); }
+    ValueOffCIter  beginValueOff() const { return ValueOffCIter(VMASK_.beginOff(), this); }
+    ValueOffIter   beginValueOff()       { return ValueOffIter(VMASK_.beginOff(), this); }
+    ValueAllCIter cbeginValueAll() const { return ValueAllCIter(VMASK_.beginDense(), this); }
+    ValueAllCIter  beginValueAll() const { return ValueAllCIter(VMASK_.beginDense(), this); }
+    ValueAllIter   beginValueAll()       { return ValueAllIter(VMASK_.beginDense(), this); }
+
+    ValueOnCIter  cendValueOn() const    { return ValueOnCIter(VMASK_.endOn(), this); }
+    ValueOnCIter   endValueOn() const    { return ValueOnCIter(VMASK_.endOn(), this); }
+    ValueOnIter    endValueOn()          { return ValueOnIter(VMASK_.endOn(), this); }
+    ValueOffCIter cendValueOff() const   { return ValueOffCIter(VMASK_.endOff(), this); }
+    ValueOffCIter  endValueOff() const   { return ValueOffCIter(VMASK_.endOff(), this); }
+    ValueOffIter   endValueOff()         { return ValueOffIter(VMASK_.endOff(), this); }
+    ValueAllCIter cendValueAll() const   { return ValueAllCIter(VMASK_.endDense(), this); }
+    ValueAllCIter  endValueAll() const   { return ValueAllCIter(VMASK_.endDense(), this); }
+    ValueAllIter   endValueAll()         { return ValueAllIter(VMASK_.endDense(), this); }
+    // beginChildOn/Off are built from endOn()/endOff(), i.e. empty ranges (presumably because a leaf has no children -- TODO confirm).
+    ChildOnCIter  cbeginChildOn() const  { return ChildOnCIter(VMASK_.endOn(), this); }
+    ChildOnCIter   beginChildOn() const  { return ChildOnCIter(VMASK_.endOn(), this); }
+    ChildOnIter    beginChildOn()        { return ChildOnIter(VMASK_.endOn(), this); }
+    ChildOffCIter cbeginChildOff() const { return ChildOffCIter(VMASK_.endOff(), this); }
+    ChildOffCIter  beginChildOff() const { return ChildOffCIter(VMASK_.endOff(), this); }
+    ChildOffIter   beginChildOff()       { return ChildOffIter(VMASK_.endOff(), this); }
+    ChildAllCIter cbeginChildAll() const { return ChildAllCIter(VMASK_.beginDense(), this); }
+    ChildAllCIter  beginChildAll() const { return ChildAllCIter(VMASK_.beginDense(), this); }
+    ChildAllIter   beginChildAll()       { return ChildAllIter(VMASK_.beginDense(), this); }
+
+    ChildOnCIter  cendChildOn() const    { return ChildOnCIter(VMASK_.endOn(), this); }
+    ChildOnCIter   endChildOn() const    { return ChildOnCIter(VMASK_.endOn(), this); }
+    ChildOnIter    endChildOn()          { return ChildOnIter(VMASK_.endOn(), this); }
+    ChildOffCIter cendChildOff() const   { return ChildOffCIter(VMASK_.endOff(), this); }
+    ChildOffCIter  endChildOff() const   { return ChildOffCIter(VMASK_.endOff(), this); }
+    ChildOffIter   endChildOff()         { return ChildOffIter(VMASK_.endOff(), this); }
+    ChildAllCIter cendChildAll() const   { return ChildAllCIter(VMASK_.endDense(), this); }
+    ChildAllCIter  endChildAll() const   { return ChildAllCIter(VMASK_.endDense(), this); }
+    ChildAllIter   endChildAll()         { return ChildAllIter(VMASK_.endDense(), this); }
+#undef VMASK_
+}; // class PointDataLeafNode
+
+////////////////////////////////////////
+
+// PointDataLeafNode implementation
+
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::initializeAttributes(const Descriptor::Ptr& descriptor, const size_t arrayLength)
+{
+    mAttributeSet.reset(new AttributeSet(descriptor, arrayLength));
+}
+
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::clearAttributes(const bool updateValueMask)
+{
+    // Replace the attribute set by a zero-length one that reuses the current descriptor.
+    mAttributeSet.reset(new AttributeSet(mAttributeSet->descriptorPtr(), 0));
+
+    // Reset the offset stored in every voxel to zero.
+    Index offset = 0;
+    while (offset != LeafNodeType::NUM_VALUES) {
+        this->setOffsetOnly(offset++, 0);
+    }
+
+    // Optionally switch every voxel off as well.
+    if (updateValueMask) { this->setValuesOff(); }
+}
+
+template<typename T, Index Log2Dim>
+inline bool
+PointDataLeafNode<T, Log2Dim>::hasAttribute(const size_t pos) const
+{
+    return pos < mAttributeSet->size();
+}
+
+template<typename T, Index Log2Dim>
+inline bool
+PointDataLeafNode<T, Log2Dim>::hasAttribute(const Name& attributeName) const
+{
+    const size_t pos = mAttributeSet->find(attributeName);
+    return pos != AttributeSet::INVALID_POS;
+}
+
+template<typename T, Index Log2Dim>
+inline AttributeArray::Ptr
+PointDataLeafNode<T, Log2Dim>::appendAttribute(const AttributeSet::Util::NameAndType& attribute,
+                     const Descriptor& expected, Descriptor::Ptr& replacement)
+{
+    return mAttributeSet->appendAttribute(attribute, expected, replacement);
+}
+
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::dropAttributes(const std::vector<size_t>& pos,
+                    const Descriptor& expected, Descriptor::Ptr& replacement)
+{
+    mAttributeSet->dropAttributes(pos, expected, replacement);
+}
+
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::reorderAttributes(const Descriptor::Ptr& replacement)
+{
+    mAttributeSet->reorderAttributes(replacement);
+}
+
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::renameAttributes(const Descriptor& expected, Descriptor::Ptr& replacement)
+{
+    mAttributeSet->renameAttributes(expected, replacement);
+}
+
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::compactAttributes()
+{
+    for (size_t i = 0; i < mAttributeSet->size(); i++) {
+        AttributeArray* array = mAttributeSet->get(i);
+        array->compact();
+    }
+}
+
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::swap(AttributeSet* attributeSet)
+{
+    if (!attributeSet) {
+        OPENVDB_THROW(ValueError, "Cannot swap with a null attribute set");
+    }
+    // Descriptors must compare equal; if this throws, ownership of attributeSet has not been taken.
+    if (mAttributeSet->descriptor() != attributeSet->descriptor()) {
+        OPENVDB_THROW(ValueError, "Attribute set descriptors are not equal.");
+    }
+    // Both checks passed: take ownership; reset() destroys the set held until now.
+    mAttributeSet.reset(attributeSet);
+}
+
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::setOffsets(const std::vector<ValueType>& offsets, const bool updateValueMask)
+{
+    if (offsets.size() != LeafNodeType::NUM_VALUES) { // exactly one offset per voxel is required
+        OPENVDB_THROW(ValueError, "Offset vector size doesn't match number of voxels.");
+    }
+    // Iterate with Index (the type setOffsetOnly() takes) so no size_t value is narrowed.
+    for (Index index = 0; index < LeafNodeType::NUM_VALUES; ++index) {
+        this->setOffsetOnly(index, offsets[index]);
+    }
+    // Optionally bring the active mask in line with the offsets just written.
+    if (updateValueMask) this->updateValueMask();
+}
+
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::validateOffsets() const
+{
+    // Ensure the offsets never decrease (Index matches getValue(Index), so no size_t narrowing)
+    for (Index index = 1; index < BaseLeaf::SIZE; ++index) {
+        if (this->getValue(index-1) > this->getValue(index)) {
+            OPENVDB_THROW(ValueError, "Voxel offset values are not monotonically increasing");
+        }
+    }
+
+    // Ensure all attribute arrays are of equal length (each one is compared with its predecessor)
+    for (size_t attributeIndex = 1; attributeIndex < mAttributeSet->size(); ++attributeIndex ) {
+        if (mAttributeSet->getConst(attributeIndex-1)->size() != mAttributeSet->getConst(attributeIndex)->size()) {
+            OPENVDB_THROW(ValueError, "Attribute arrays have inconsistent length");
+        }
+    }
+
+    // Ensure the last voxel's offset value matches the size of each attribute array (skipped when there are no attributes)
+    if (mAttributeSet->size() > 0 && this->getValue(BaseLeaf::SIZE-1) != mAttributeSet->getConst(0)->size()) {
+        OPENVDB_THROW(ValueError, "Last voxel offset value does not match attribute array length");
+    }
+}
+
+template<typename T, Index Log2Dim>
+inline AttributeArray&
+PointDataLeafNode<T, Log2Dim>::attributeArray(const size_t pos)
+{
+    if (pos >= mAttributeSet->size())             OPENVDB_THROW(LookupError, "Attribute Out Of Range - " << pos);
+    return *mAttributeSet->get(pos);
+}
+
+template<typename T, Index Log2Dim>
+inline const AttributeArray&
+PointDataLeafNode<T, Log2Dim>::attributeArray(const size_t pos) const
+{
+    if (pos >= mAttributeSet->size())             OPENVDB_THROW(LookupError, "Attribute Out Of Range - " << pos);
+    return *mAttributeSet->getConst(pos);
+}
+
+template<typename T, Index Log2Dim>
+inline const AttributeArray&
+PointDataLeafNode<T, Log2Dim>::constAttributeArray(const size_t pos) const
+{
+    return this->attributeArray(pos);
+}
+
+template<typename T, Index Log2Dim>
+inline AttributeArray&
+PointDataLeafNode<T, Log2Dim>::attributeArray(const Name& attributeName)
+{
+    const size_t pos = mAttributeSet->find(attributeName);
+    if (pos == AttributeSet::INVALID_POS)         OPENVDB_THROW(LookupError, "Attribute Not Found - " << attributeName);
+    return *mAttributeSet->get(pos);
+}
+
+template<typename T, Index Log2Dim>
+inline const AttributeArray&
+PointDataLeafNode<T, Log2Dim>::attributeArray(const Name& attributeName) const
+{
+    const size_t pos = mAttributeSet->find(attributeName);
+    if (pos == AttributeSet::INVALID_POS)         OPENVDB_THROW(LookupError, "Attribute Not Found - " << attributeName);
+    return *mAttributeSet->getConst(pos);
+}
+
+template<typename T, Index Log2Dim>
+inline const AttributeArray&
+PointDataLeafNode<T, Log2Dim>::constAttributeArray(const Name& attributeName) const
+{
+    return this->attributeArray(attributeName);
+}
+
+template<typename T, Index Log2Dim>
+inline GroupHandle
+PointDataLeafNode<T, Log2Dim>::groupHandle(const AttributeSet::Descriptor::GroupIndex& index) const
+{
+    // index.first picks the attribute array (attributeArray() throws LookupError if it is
+    // out of range); index.second is forwarded unchanged to the handle.
+    const AttributeArray& attribute = this->attributeArray(index.first);
+    assert(GroupAttributeArray::isGroup(attribute));
+
+    return GroupHandle(GroupAttributeArray::cast(attribute), index.second);
+}
+
+template<typename T, Index Log2Dim>
+inline GroupHandle
+PointDataLeafNode<T, Log2Dim>::groupHandle(const Name& name) const
+{
+    const AttributeSet::Descriptor::GroupIndex index = this->attributeSet().groupIndex(name);
+    return this->groupHandle(index);
+}
+
+template<typename T, Index Log2Dim>
+inline GroupWriteHandle
+PointDataLeafNode<T, Log2Dim>::groupWriteHandle(const AttributeSet::Descriptor::GroupIndex& index)
+{
+    // index.first picks the attribute array (attributeArray() throws LookupError if it is
+    // out of range); index.second is forwarded unchanged to the handle.
+    AttributeArray& attribute = this->attributeArray(index.first);
+    assert(GroupAttributeArray::isGroup(attribute));
+
+    return GroupWriteHandle(GroupAttributeArray::cast(attribute), index.second);
+}
+
+template<typename T, Index Log2Dim>
+inline GroupWriteHandle
+PointDataLeafNode<T, Log2Dim>::groupWriteHandle(const Name& name)
+{
+    const AttributeSet::Descriptor::GroupIndex index = this->attributeSet().groupIndex(name);
+    return this->groupWriteHandle(index);
+}
+
+template<typename T, Index Log2Dim>
+inline IndexIter
+PointDataLeafNode<T, Log2Dim>::beginIndexAll() const
+{
+    // the range starts at zero and ends at the value stored for the last voxel,
+    // which beginIndex() likewise uses as the end of that voxel's own range
+    return IndexIter(ValueType(0), this->getValue(NUM_VOXELS - 1));
+}
+
+template<typename T, Index Log2Dim>
+inline typename PointDataLeafNode<T, Log2Dim>::IndexOnIter
+PointDataLeafNode<T, Log2Dim>::beginIndexOn() const
+{
+    ValueOnCIter onIter(this->cbeginValueOn()); // const "value on" iterator of this leaf
+    return IndexOnIter(onIter);                 // wrapped into the index iterator type
+}
+
+template<typename T, Index Log2Dim>
+inline typename PointDataLeafNode<T, Log2Dim>::IndexOffIter
+PointDataLeafNode<T, Log2Dim>::beginIndexOff() const
+{
+    ValueOffCIter offIter(this->cbeginValueOff()); // const "value off" iterator of this leaf
+    return IndexOffIter(offIter);                  // wrapped into the index iterator type
+}
+
+template<typename T, Index Log2Dim>
+inline IndexIter
+PointDataLeafNode<T, Log2Dim>::beginIndex(const unsigned index) const
+{
+    assert(index < BaseLeaf::SIZE);
+    // start: value stored for the preceding voxel (zero for voxel 0); end: value of this voxel
+    const ValueType first = (index != 0) ? this->getValue(index - 1) : ValueType(0);
+    return IndexIter(first, this->getValue(index));
+}
+
+template<typename T, Index Log2Dim>
+inline IndexIter
+PointDataLeafNode<T, Log2Dim>::beginIndex(const Coord& ijk) const
+{
+    return this->beginIndex(LeafNodeType::coordToOffset(ijk)); // coordinate -> linear offset, then the offset overload
+}
+
+template<typename T, Index Log2Dim>
+inline Index64
+PointDataLeafNode<T, Log2Dim>::pointCount() const
+{
+    return iterCount(this->beginIndexAll()); // counts the iterator that spans the whole leaf
+}
+
+template<typename T, Index Log2Dim>
+inline Index64
+PointDataLeafNode<T, Log2Dim>::onPointCount() const
+{
+    if (this->isEmpty())    return 0;   // isEmpty(): no "on" points to count
+    if (!this->isDense())   return iterCount(this->beginIndexOn());
+    return this->pointCount();          // isDense(): the "on" count equals the total count
+}
+
+template<typename T, Index Log2Dim>
+inline Index64
+PointDataLeafNode<T, Log2Dim>::offPointCount() const
+{
+    if (this->isEmpty())    return this->pointCount();  // isEmpty(): every point counts as "off"
+    if (!this->isDense())   return iterCount(this->beginIndexOff());
+    return 0;                                           // isDense(): no "off" points
+}
+
+template<typename T, Index Log2Dim>
+inline Index64
+PointDataLeafNode<T, Log2Dim>::groupPointCount(const Name& groupName) const
+{
+    // count the indices of this leaf that pass a GroupFilter built for the named group
+    IndexIter everyIndex = this->beginIndexAll();
+    GroupFilter inGroup(GroupFilter::create(*this, GroupFilter::Data(groupName)));
+    return iterCount(FilterIndexIter<IndexIter, GroupFilter>(everyIndex, inGroup));
+}
+
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::updateValueMask()
+{
+    ValueType previous = 0; // value stored for the preceding voxel (zero before the first)
+    for (Index n = 0; n < LeafNodeType::NUM_VALUES; ++n) {
+        ValueType current = this->getValue(n);
+        this->setValueMask(n, (current - previous) > 0); // on when the difference is positive
+        previous = current;
+    }
+}
+
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::setOffsetOn(Index offset, const ValueType& val)
+{
+    this->buffer().setValue(offset, val);   // store the value directly in the leaf buffer
+    this->setValueMaskOn(offset);           // then switch the value mask on for the same offset
+}
+
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::setOffsetOnly(Index offset, const ValueType& val)
+{
+    this->buffer().setValue(offset, val); // buffer write only; unlike setOffsetOn() the value mask is not touched here
+}
+
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::readTopology(std::istream& is, bool fromHalf)
+{
+    BaseLeaf::readTopology(is, fromHalf); // pure pass-through to the base leaf; nothing point-specific is read here
+}
+
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::writeTopology(std::ostream& os, bool toHalf) const
+{
+    BaseLeaf::writeTopology(os, toHalf); // pure pass-through to the base leaf; nothing point-specific is written here
+}
+
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::readBuffers(std::istream& is, bool fromHalf)
+{
+    BaseLeaf::readBuffers(is, fromHalf);
+    // the attribute set is read from the same stream, directly after the base-class buffers
+    mAttributeSet->read(is);
+}
+
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::readBuffers(std::istream& is, const CoordBBox& bbox, bool fromHalf)
+{
+    // Read and clip voxel values (no clipping yet).
+    BaseLeaf::readBuffers(is, bbox, fromHalf);
+    // the attribute set is then read from the same stream; bbox is not passed on to it
+    mAttributeSet->read(is);
+}
+
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::writeBuffers(std::ostream& os, bool toHalf) const
+{
+    BaseLeaf::writeBuffers(os, toHalf);
+    // the attribute set follows the base-class buffers, the same order readBuffers() consumes
+    mAttributeSet->write(os);
+}
+
+template<typename T, Index Log2Dim>
+inline Index64
+PointDataLeafNode<T, Log2Dim>::memUsage() const
+{
+    return BaseLeaf::memUsage() + mAttributeSet->memUsage(); // base leaf usage plus the attribute set's own figure
+}
+
+} // namespace tools
+
+////////////////////////////////////////
+
+namespace tree
+{
+
+/// Helper metafunction used to implement LeafNode::SameConfiguration (which, as an inner class,
+/// can't be independently specialized): a PointDataLeafNode with the same Log2Dim yields true.
+template<Index Dim1, typename T2>
+struct SameLeafConfig<Dim1, tools::PointDataLeafNode<T2, Dim1> > { static const bool value = true; };
+
+} // namespace tree
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+
+#endif // OPENVDB_TOOLS_POINT_DATA_GRID_HAS_BEEN_INCLUDED
+
+
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb/tools/PointGroup.h b/nuparu/include/openvdb/tools/PointGroup.h
new file mode 100644
index 00000000..413fc6a2
--- /dev/null
+++ b/nuparu/include/openvdb/tools/PointGroup.h
@@ -0,0 +1,671 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of Double Negative Visual Effects nor the names
+// of its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @author Dan Bailey
+///
+/// @file PointGroup.h
+///
+/// @brief  Point group manipulation in a VDB Point Grid.
+///
+
+
+#ifndef OPENVDB_TOOLS_POINT_GROUP_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_POINT_GROUP_HAS_BEEN_INCLUDED
+
+#include <openvdb/openvdb.h>
+
+#include <openvdb_points/tools/AttributeSet.h>
+#include <openvdb_points/tools/PointDataGrid.h>
+#include <openvdb_points/tools/PointAttribute.h>
+
+#include <boost/ptr_container/ptr_vector.hpp>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+/// @brief Appends a new empty group to the VDB tree.
+///
+/// @param tree          the PointDataTree to be appended to.
+/// @param group         name of the new group.
+template <typename PointDataTree>
+inline void appendGroup(PointDataTree& tree,
+                        const Name& group);
+
+/// @brief Appends new empty groups to the VDB tree.
+///
+/// @param tree          the PointDataTree to be appended to.
+/// @param groups        names of the new groups.
+template <typename PointDataTree>
+inline void appendGroups(PointDataTree& tree,
+                         const std::vector<Name>& groups);
+
+/// @brief Drops an existing group from the VDB tree.
+///
+/// @param tree          the PointDataTree to be dropped from.
+/// @param group         name of the group.
+/// @param compact       compact attributes if possible to reduce memory - if dropping
+///                      more than one group, compacting once at the end will be faster
+template <typename PointDataTree>
+inline void dropGroup(  PointDataTree& tree,
+                        const Name& group,
+                        const bool compact = true);
+
+/// @brief Drops existing groups from the VDB tree, the tree is compacted after dropping.
+///
+/// @param tree          the PointDataTree to be dropped from.
+/// @param groups        names of the groups.
+template <typename PointDataTree>
+inline void dropGroups( PointDataTree& tree,
+                        const std::vector<Name>& groups);
+
+/// @brief Drops all existing groups from the VDB tree, the tree is compacted after dropping.
+///
+/// @param tree          the PointDataTree to be dropped from.
+template <typename PointDataTree>
+inline void dropGroups( PointDataTree& tree);
+
+/// @brief Compacts existing groups of a VDB Tree to use less memory if possible.
+///
+/// @param tree          the PointDataTree to be compacted.
+template <typename PointDataTree>
+inline void compactGroups(PointDataTree& tree);
+
+/// @brief Sets group membership from a PointIndexTree-ordered vector.
+/// @throw LookupError if @p membership and the point count differ or @p group does not exist.
+/// @param tree          the PointDataTree.
+/// @param indexTree     the PointIndexTree; its leaf index arrays select entries of @p membership.
+/// @param membership    @c true if the point is in the group (one entry per point in @p tree).
+/// @param group         the name of the group, which must already exist on the tree.
+/// @param remove        if @c true also perform removal of points from the group.
+template <typename PointDataTree, typename PointIndexTree>
+inline void setGroup(   PointDataTree& tree,
+                        const PointIndexTree& indexTree,
+                        const std::vector<bool>& membership,
+                        const Name& group,
+                        const bool remove = false);
+
+
+////////////////////////////////////////
+
+
+namespace point_group_internal {
+
+
+/// Copy a group attribute value from one group offset to another
+template<typename PointDataTreeType>
+struct CopyGroupOp {
+
+    typedef typename tree::LeafManager<PointDataTreeType>       LeafManagerT;
+    typedef typename LeafManagerT::LeafRange                    LeafRangeT;
+    typedef AttributeSet::Descriptor::NameAndType               NameAndType;
+    typedef AttributeSet::Descriptor::GroupIndex                GroupIndex;
+
+    /// @param tree         tree being processed (only stored, not used by operator())
+    /// @param targetIndex  group index that is written to
+    /// @param sourceIndex  group index that is read from
+    CopyGroupOp(PointDataTreeType& tree, const GroupIndex& targetIndex, const GroupIndex& sourceIndex)
+        : mTree(tree), mTargetIndex(targetIndex), mSourceIndex(sourceIndex) { }
+
+    /// For every leaf in @p range, copy each point's membership from the source
+    /// group index to the target group index
+    void operator()(const LeafRangeT& range) const
+    {
+        for (typename LeafRangeT::Iterator leafIt = range.begin(); leafIt; ++leafIt) {
+
+            GroupHandle from(leafIt->groupHandle(mSourceIndex));
+            GroupWriteHandle to(leafIt->groupWriteHandle(mTargetIndex));
+
+            for (IndexIter pointIt = leafIt->beginIndexAll(); pointIt; ++pointIt) {
+                to.set(*pointIt, from.get(*pointIt));
+            }
+        }
+    }
+
+    //////////
+
+    PointDataTreeType&      mTree;
+    const GroupIndex        mTargetIndex;
+    const GroupIndex        mSourceIndex;
+};
+
+
+/// Set membership on or off for the specified group
+template <typename PointDataTree, bool Member>
+struct SetGroupOp
+{
+    typedef typename tree::LeafManager<PointDataTree>   LeafManagerT;
+    typedef AttributeSet::Descriptor::GroupIndex        GroupIndex;
+
+    /// @param index  group index whose membership is overwritten in every visited leaf
+    SetGroupOp(const GroupIndex& index) : mIndex(index) { }
+
+    /// Collapse the group to the compile-time @c Member value for each leaf in @p range
+    void operator()(const typename LeafManagerT::LeafRange& range) const
+    {
+        typedef typename LeafManagerT::LeafRange::Iterator LeafIterT;
+
+        for (LeafIterT leafIt = range.begin(); leafIt; ++leafIt) {
+            // a write handle onto this leaf's group attribute array
+            GroupWriteHandle handle(leafIt->groupWriteHandle(mIndex));
+
+            // collapse() presumably assigns Member to every point of the leaf - confirm
+            handle.collapse(Member);
+        }
+    }
+
+    //////////
+
+    const GroupIndex        mIndex;
+}; // struct SetGroupOp
+
+
+/// Assign group membership leaf-by-leaf from a bool vector addressed via a PointIndexTree.
+/// With @c Remove every visited point is overwritten; without it points are only added.
+template <typename PointDataTree, typename PointIndexTree, bool Remove>
+struct SetGroupFromIndexOp
+{
+    typedef typename tree::LeafManager<PointDataTree>   LeafManagerT;
+    typedef typename LeafManagerT::LeafRange            LeafRangeT;
+    typedef typename PointIndexTree::LeafNodeType       PointIndexLeafNode;
+    typedef typename PointIndexLeafNode::IndexArray     IndexArray;
+    typedef AttributeSet::Descriptor::GroupIndex        GroupIndex;
+    typedef std::vector<bool>                           BoolArray;
+
+    /// @param indexTree   tree whose leaves provide the index arrays (held by reference)
+    /// @param membership  one flag per point, looked up with at() (held by reference)
+    /// @param index       group index to write to (copied)
+    SetGroupFromIndexOp(const PointIndexTree& indexTree,
+                        const BoolArray& membership,
+                        const GroupIndex& index)
+        : mIndexTree(indexTree), mMembership(membership), mIndex(index) { }
+
+    /// Update the group of every leaf in @p range that has a matching index leaf
+    void operator()(const LeafRangeT& range) const
+    {
+        typedef typename LeafRangeT::Iterator       LeafIterT;
+        typedef typename IndexArray::const_iterator IndexCIterT;
+
+        for (LeafIterT leafIt = range.begin(); leafIt; ++leafIt) {
+
+            // the index leaf is located through the origin of the data leaf;
+            // data leaves without a counterpart are skipped
+            const PointIndexLeafNode* indexLeaf = mIndexTree.probeConstLeaf(leafIt->origin());
+            if (!indexLeaf)     continue;
+
+            GroupWriteHandle handle(leafIt->groupWriteHandle(mIndex));
+
+            // entry n of the index array is used as the position in the membership
+            // vector for point n of this leaf
+
+            const IndexArray& lookup = indexLeaf->indices();
+            Index64 n = 0;
+
+            for (IndexCIterT it = lookup.begin(), itEnd = lookup.end(); it != itEnd; ++it, ++n) {
+
+                const bool member = mMembership.at(*it);
+
+                // members are always switched on; non-members are only switched
+                // off when Remove is enabled, otherwise they are left as they are
+                if (Remove || member)   handle.set(n, member);
+            }
+        }
+    }
+
+    //////////
+
+    const PointIndexTree& mIndexTree;
+    const BoolArray& mMembership;
+    const GroupIndex mIndex;
+}; // struct SetGroupFromIndexOp
+
+
+////////////////////////////////////////
+
+
+/// Convenience class with methods for analyzing group data
+class GroupInfo
+{
+public:
+    typedef AttributeSet::Descriptor Descriptor;
+    /// @note only a reference to @p attributeSet is stored, so it must outlive this object
+    GroupInfo(const AttributeSet& attributeSet)
+        : mAttributeSet(attributeSet) { }
+
+    /// Return the number of bits in one group value, sizeof(GroupType) * CHAR_BIT (typically 8)
+    static size_t groupBits() { return sizeof(GroupType) * CHAR_BIT; }
+
+    /// Return the number of empty group slots which correlates to the number of groups
+    /// that can be stored without increasing the number of group attribute arrays
+    size_t unusedGroups() const
+    {
+        // compute total slots (one slot per bit of the group attributes)
+
+        const size_t groupAttributes = mAttributeSet.size(AttributeArray::GROUP);
+        // without any group attribute array there are no slots at all
+        if (groupAttributes == 0)   return 0;
+
+        const size_t totalSlots = groupAttributes * this->groupBits();
+
+        // compute slots in use (one per entry of the descriptor's group map)
+
+        const AttributeSet::Descriptor::NameToPosMap& groupMap = mAttributeSet.descriptor().groupMap();
+        const size_t usedSlots = groupMap.size();
+
+        return totalSlots - usedSlots;
+    }
+
+    /// Return @c true if there are sufficient empty slots to allow compacting
+    bool canCompactGroups() const
+    {
+        // can compact if at least as many groups are unused as fit in one group attribute array
+
+        return this->unusedGroups() >= this->groupBits();
+    }
+
+    /// Return the next empty group slot, i.e. the smallest offset missing from the group map
+    size_t nextUnusedOffset() const
+    {
+        const Descriptor::NameToPosMap& groupMap = mAttributeSet.descriptor().groupMap();
+
+        // build a list of group indices
+
+        std::vector<size_t> indices;
+        for (Descriptor::ConstIterator  it = groupMap.begin(),
+                                        endIt = groupMap.end(); it != endIt; ++it) {
+            indices.push_back(it->second);
+        }
+        // sorted so that the scan below can compare each entry against a running counter
+        std::sort(indices.begin(), indices.end());
+
+        // return first index not present
+
+        size_t offset = 0;
+        for (std::vector<size_t>::const_iterator    it = indices.begin(),
+                                                    endIt = indices.end(); it != endIt; ++it) {
+            if (*it != offset)     break;
+            offset++;
+        }
+
+        return offset;
+    }
+
+    /// Append to the @p indices vector the indices correlating to the group attribute arrays
+    void populateGroupIndices(std::vector<size_t>& indices) const
+    {
+        const Descriptor::NameToPosMap& map = mAttributeSet.descriptor().map();
+
+        for (Descriptor::ConstIterator  it = map.begin(),
+                                        itEnd = map.end(); it != itEnd; ++it) {
+            // the array is fetched via it->first; it->second is what gets recorded for group arrays
+            const AttributeArray* array = mAttributeSet.getConst(it->first);
+            if (GroupAttributeArray::isGroup(*array)) {
+                indices.push_back(it->second);
+            }
+        }
+    }
+
+    /// Determine if a move is required to efficiently compact the data and store the
+    /// source name, offset and the target offset in the input parameters
+    bool requiresMove(Name& sourceName, size_t& sourceOffset, size_t& targetOffset) const {
+        // targetOffset is always written, even when the function ends up returning false
+        targetOffset = this->nextUnusedOffset();
+
+        const Descriptor::NameToPosMap& groupMap = mAttributeSet.descriptor().groupMap();
+
+        typedef Descriptor::NameToPosMap::const_reverse_iterator ReverseMapIterator;
+        // the group map is walked back to front and the first qualifying entry wins
+        for (ReverseMapIterator it = groupMap.rbegin(),
+                                itEnd = groupMap.rend(); it != itEnd; ++it) {
+
+            // move only required if source comes after the target
+
+            if (it->second >= targetOffset) {
+                sourceName = it->first;
+                sourceOffset = it->second;
+                return true;
+            }
+        }
+        // no group sits at or beyond the target: sourceName and sourceOffset stay untouched
+        return false;
+    }
+
+private:
+    const AttributeSet& mAttributeSet;
+}; // class GroupInfo
+
+
+} // namespace point_group_internal
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataTree>
+inline void appendGroup(PointDataTree& tree, const Name& group)
+{
+    typedef AttributeSet::Descriptor                              Descriptor;
+    typedef AttributeSet::Util::NameAndType                       NameAndType;
+
+    using point_attribute_internal::AppendAttributeOp;
+    using point_group_internal::GroupInfo;
+    // registers the group name in the descriptor, adding a group attribute array first if none has a free slot
+    if (group.empty()) {
+        OPENVDB_THROW(KeyError, "Cannot use an empty group name as a key.");
+    }
+
+    typename PointDataTree::LeafCIter iter = tree.cbeginLeaf();
+    // nothing to append to in a tree without leaf nodes
+    if (!iter)  return;
+    // only the first leaf's attribute set is inspected (presumably representative of all leaves - confirm)
+    const AttributeSet& attributeSet = iter->attributeSet();
+    Descriptor::Ptr descriptor = attributeSet.descriptorPtr();
+    GroupInfo groupInfo(attributeSet);
+
+    // don't add if group already exists
+
+    if (descriptor->hasGroup(group))    return;
+
+    // add a new group attribute if there are no unused groups
+
+    if (groupInfo.unusedGroups() == 0) {
+
+        // find a new internal group name
+
+        const NameAndType groupAttribute(descriptor->uniqueName("__group"), GroupAttributeArray::attributeType());
+        // from here on 'descriptor' refers to the result of duplicateAppend(), not the original
+        descriptor = descriptor->duplicateAppend(groupAttribute);
+
+        // insert new group attribute
+        // (the op receives the new descriptor and is run over the tree's leaf range in parallel)
+        AppendAttributeOp<PointDataTree> append(tree, groupAttribute, descriptor,
+                                                /*hidden=*/false, /*transient=*/false, /*group=*/true);
+        tbb::parallel_for(typename tree::template LeafManager<PointDataTree>(tree).leafRange(), append);
+    }
+
+    // ensure that there are now available groups
+    // NOTE(review): relies on groupInfo's AttributeSet reference reflecting the append above - confirm
+    assert(groupInfo.unusedGroups() > 0);
+
+    // find next unused offset
+
+    const size_t offset = groupInfo.nextUnusedOffset();
+
+    // add the group mapping to the descriptor
+
+    descriptor->setGroup(group, offset);
+}
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataTree>
+inline void appendGroups(PointDataTree& tree,
+                         const std::vector<Name>& groups)
+{
+    // TODO: could be more efficient by appending multiple groups at once
+    // instead of one-by-one, however this is likely not that common a use case
+    // groups are appended in the order given, each through appendGroup()
+    const size_t total = groups.size();
+    for (size_t n = 0; n < total; ++n) {
+        appendGroup(tree, groups[n]);
+    }
+}
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataTree>
+inline void dropGroup(PointDataTree& tree, const Name& group, const bool compact)
+{
+    // an empty name is rejected before anything else is looked at
+    if (group.empty()) {
+        OPENVDB_THROW(KeyError, "Cannot use an empty group name as a key.");
+    }
+
+    // a tree without leaf nodes has no descriptor to update
+    typename PointDataTree::LeafCIter firstLeaf = tree.cbeginLeaf();
+    if (!firstLeaf)     return;
+
+    // the group mapping is removed through the descriptor held by the first
+    // leaf's attribute set (presumably shared by all leaves - confirm)
+    AttributeSet::Descriptor::Ptr descriptor = firstLeaf->attributeSet().descriptorPtr();
+    descriptor->dropGroup(group);
+
+    // compaction is optional so that callers dropping several groups
+    // can defer it and compact just once
+    if (!compact)   return;
+    compactGroups(tree);
+}
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataTree>
+inline void dropGroups( PointDataTree& tree,
+                        const std::vector<Name>& groups)
+{
+    // drop each group in the order given, without compacting in between
+    const size_t total = groups.size();
+    for (size_t n = 0; n < total; ++n) {
+        dropGroup(tree, groups[n], /*compact=*/false);
+    }
+
+    // a single compaction pass at the end covers all of the dropped groups
+    compactGroups(tree);
+}
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataTree>
+inline void dropGroups( PointDataTree& tree)
+{
+    using point_group_internal::GroupInfo;
+
+    // nothing to drop when the tree has no leaf nodes
+
+    typename PointDataTree::LeafCIter firstLeaf = tree.cbeginLeaf();
+    if (!firstLeaf)     return;
+
+    const AttributeSet& firstSet = firstLeaf->attributeSet();
+    const GroupInfo info(firstSet);
+
+    // step 1: forget every group mapping held by the descriptor
+
+    AttributeSet::Descriptor::Ptr descriptor = firstSet.descriptorPtr();
+    descriptor->clearGroups();
+
+    // step 2: collect the indices of the attribute arrays that hold group data
+
+    std::vector<size_t> groupArrays;
+    info.populateGroupIndices(groupArrays);
+
+    // step 3: drop exactly those attribute arrays from the tree
+    dropAttributes(tree, groupArrays);
+}
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataTree>
+inline void compactGroups(PointDataTree& tree)
+{
+    typedef AttributeSet::Descriptor                              Descriptor;
+    typedef Descriptor::GroupIndex                                GroupIndex;
+
+    using point_group_internal::CopyGroupOp;
+    using point_group_internal::GroupInfo;
+
+    typename PointDataTree::LeafCIter iter = tree.cbeginLeaf();
+    // nothing to compact in a tree without leaf nodes
+    if (!iter)  return;
+
+    const AttributeSet& attributeSet = iter->attributeSet();
+    Descriptor::Ptr descriptor = attributeSet.descriptorPtr();
+    GroupInfo groupInfo(attributeSet);
+
+    // early exit if not possible to compact
+
+    if (!groupInfo.canCompactGroups())    return;
+
+    // generate a list of group offsets and move them (one-by-one)
+    // TODO: improve this algorithm to move multiple groups per array at once
+    // though this is likely not that common a use case
+
+    Name sourceName;
+    size_t sourceOffset, targetOffset;
+    // the three variables above are outputs of requiresMove(), assigned before the loop body reads them
+    while (groupInfo.requiresMove(sourceName, sourceOffset, targetOffset)) {
+        // resolve both plain offsets to GroupIndex values
+        const GroupIndex sourceIndex = attributeSet.groupIndex(sourceOffset);
+        const GroupIndex targetIndex = attributeSet.groupIndex(targetOffset);
+        // copy the membership across all leaves in parallel, then repoint the name at its new offset
+        CopyGroupOp<PointDataTree> copy(tree, targetIndex, sourceIndex);
+        tbb::parallel_for(typename tree::template LeafManager<PointDataTree>(tree).leafRange(), copy);
+
+        descriptor->setGroup(sourceName, targetOffset);
+    }
+
+    // drop unused attribute arrays
+
+    std::vector<size_t> indices;
+    groupInfo.populateGroupIndices(indices);
+    // one whole attribute array can go per groupBits() unused slots (integer division)
+    const size_t totalAttributesToDrop = groupInfo.unusedGroups() / groupInfo.groupBits();
+
+    assert(totalAttributesToDrop <= indices.size());
+    // the arrays to drop are taken from the tail of the collected index list
+    std::vector<size_t> indicesToDrop(indices.end() - totalAttributesToDrop, indices.end());
+
+    dropAttributes(tree, indicesToDrop);
+}
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataTree, typename PointIndexTree>
+inline void setGroup(   PointDataTree& tree,
+                        const PointIndexTree& indexTree,
+                        const std::vector<bool>& membership,
+                        const Name& group,
+                        const bool remove)
+{
+    typedef AttributeSet::Descriptor Descriptor;
+    typedef typename tree::template LeafManager<PointDataTree> LeafManagerT;
+
+    if (membership.size() != pointCount(tree)) {
+        OPENVDB_THROW(LookupError, "Membership vector size must match number of points.");
+    }
+
+    using point_group_internal::SetGroupFromIndexOp;
+
+    // nothing to update in a tree without leaf nodes
+    typename PointDataTree::LeafCIter iter = tree.cbeginLeaf();
+    if (!iter)  return;
+
+    const AttributeSet& attributeSet = iter->attributeSet();
+    const Descriptor& descriptor = attributeSet.descriptor();
+
+    if (!descriptor.hasGroup(group)) {
+        OPENVDB_THROW(LookupError, "Group must exist on Tree before defining membership.");
+    }
+
+    const Descriptor::GroupIndex index = attributeSet.groupIndex(group);
+
+    // the Remove flag must mirror the remove argument: with it every point is overwritten
+    // (non-members are cleared), without it membership is only ever switched on
+    if (remove) {
+        SetGroupFromIndexOp<PointDataTree,
+                            PointIndexTree, true> set(indexTree, membership, index);
+        tbb::parallel_for(LeafManagerT(tree).leafRange(), set);
+    }
+    else {
+        SetGroupFromIndexOp<PointDataTree,
+                            PointIndexTree, false> set(indexTree, membership, index);
+        tbb::parallel_for(LeafManagerT(tree).leafRange(), set);
+    }
+}
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataTree>
+inline void setGroup(   PointDataTree& tree,
+                        const Name& group,
+                        const bool member = true)
+{
+    typedef typename tree::template LeafManager<PointDataTree> LeafManagerT;
+    typedef AttributeSet::Descriptor::GroupIndex GroupIndex;
+
+    using point_group_internal::SetGroupOp;
+
+    // nothing to update in a tree without leaf nodes
+    typename PointDataTree::LeafCIter firstLeaf = tree.cbeginLeaf();
+    if (!firstLeaf)     return;
+
+    // the group is resolved once, against the attribute set of the first leaf
+    const AttributeSet& firstSet = firstLeaf->attributeSet();
+    if (!firstSet.descriptor().hasGroup(group)) {
+        OPENVDB_THROW(LookupError, "Group must exist on Tree before defining membership.");
+    }
+    const GroupIndex groupIndex = firstSet.groupIndex(group);
+
+    // Member is a template argument of SetGroupOp, so the run-time flag picks the instantiation
+    if (!member) {
+        tbb::parallel_for(LeafManagerT(tree).leafRange(), SetGroupOp<PointDataTree, false>(groupIndex));
+        return;
+    }
+    tbb::parallel_for(LeafManagerT(tree).leafRange(), SetGroupOp<PointDataTree, true>(groupIndex));
+}
+
+
+////////////////////////////////////////
+
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+
+#endif // OPENVDB_TOOLS_POINT_GROUP_HAS_BEEN_INCLUDED
+
+
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb/tools/PointLoad.h b/nuparu/include/openvdb/tools/PointLoad.h
new file mode 100644
index 00000000..65ebaaca
--- /dev/null
+++ b/nuparu/include/openvdb/tools/PointLoad.h
@@ -0,0 +1,161 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of Double Negative Visual Effects nor the names
+// of its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @author Dan Bailey
+///
+/// @file PointLoad.h
+///
+/// @brief  Various point loading methods using a VDB Point Grid.
+///
+
+
+#ifndef OPENVDB_TOOLS_POINT_LOAD_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_POINT_LOAD_HAS_BEEN_INCLUDED
+
+#include <openvdb/openvdb.h>
+
+#include <openvdb_points/tools/AttributeSet.h>
+#include <openvdb_points/tools/PointDataGrid.h>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+
+/// @brief Loads all leaf node voxel data in the given grid.
+///
+/// @param grid  the Grid to be loaded.
+/// @note This method wraps readNonresidentBuffers().
+template <typename PointDataGridT>
+void loadPoints(PointDataGridT& grid);
+
+
+/// @brief Loads all leaf node voxel data in the given grid that
+/// overlap with mask grid leaf nodes.
+///
+/// @param grid  the Grid to be loaded.
+/// @param mask  the mask to denote region of points to load
+template <typename PointDataGridT, typename MaskGridT>
+void loadPoints(PointDataGridT& grid, const MaskGridT& mask);
+
+
+/// @brief Load the leaf node voxel data in the given grid that
+/// overlap with a world-space bounding box.
+///
+/// @param grid  the Grid to be loaded.
+/// @param bbox  the bbox to denote region of points to load
+///
+/// @note Does not clip to the bounding box, leaf nodes with any
+/// overlap will be loaded.
+template <typename PointDataGridT>
+void loadPoints(PointDataGridT& grid, const BBoxd& bbox);
+
+
+////////////////////////////////////////
+
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+template <typename PointDataGridT>
+void loadPoints(PointDataGridT& grid)
+{
+    grid.constTree().readNonresidentBuffers(); // whole-tree load, delegated entirely to the tree
+}
+#else
+template <typename PointDataGridT>
+void loadPoints(PointDataGridT&)
+{
+    // out-of-core not supported with ABI 2, so this variant is intentionally a no-op
+}
+#endif
+
+
+template <typename PointDataGridT, typename MaskGridT>
+void loadPoints(PointDataGridT& grid, const MaskGridT& mask)
+{
+    typedef typename PointDataGridT::TreeType       PointDataTreeT;
+    typedef typename PointDataTreeT::LeafNodeType   PointDataLeafT;
+    typedef typename MaskGridT::TreeType::LeafCIter MaskLeafCIter;
+
+    // an accessor is used for the repeated leaf look-ups into the point tree
+    tree::ValueAccessor<const PointDataTreeT> pointsAcc(grid.constTree());
+
+    // visit the mask leaf by leaf and probe the point tree at each leaf origin
+    for (MaskLeafCIter maskLeaf = mask.constTree().cbeginLeaf(); maskLeaf; ++maskLeaf) {
+        const PointDataLeafT* pointLeaf = pointsAcc.probeConstLeaf(maskLeaf->origin());
+
+        // only leaves that exist and report being out of core are touched; requesting
+        // the buffer data is presumably what triggers the actual load - confirm
+        if (pointLeaf && pointLeaf->buffer().isOutOfCore())     pointLeaf->buffer().data();
+    }
+}
+
+
+template <typename PointDataGridT>
+void loadPoints(PointDataGridT& grid, const BBoxd& bbox)
+{
+    typedef typename PointDataGridT::template ValueConverter<bool>::Type BoolGridT;
+    // MaskGrid introduced in OpenVDB 3.2, so a BoolGrid is used as the mask type here
+    typedef BoolGrid MaskType;
+
+    // Step 1: express the world-space box in the index space of the input grid.
+    Vec3d indexMin, indexMax;
+    math::calculateBounds(grid.constTransform(), bbox.min(), bbox.max(), indexMin, indexMax);
+    const CoordBBox indexRegion(Coord::floor(indexMin), Coord::floor(indexMax));
+
+    // Step 2: build a boolean grid that is active and true inside that region only.
+    BoolGridT regionMask(/*background=*/false);
+    regionMask.fill(indexRegion, /*value=*/true, /*active=*/true);
+
+    // Step 3: start from the topology of the input grid and keep only the part
+    // that also lies inside the region mask.
+    MaskType::Ptr loadMask = MaskType::create(/*background=*/false);
+    loadMask->topologyUnion(grid);
+    loadMask->topologyIntersection(regionMask);
+
+    // Step 4: defer to the mask-based overload, which works on whole leaf nodes.
+    loadPoints(grid, *loadMask);
+}
+
+
+////////////////////////////////////////
+
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+
+#endif // OPENVDB_TOOLS_POINT_LOAD_HAS_BEEN_INCLUDED
+
+
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/Exceptions.h b/nuparu/include/openvdb_new/Exceptions.h
new file mode 100644
index 00000000..c2a38386
--- /dev/null
+++ b/nuparu/include/openvdb_new/Exceptions.h
@@ -0,0 +1,112 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_EXCEPTIONS_HAS_BEEN_INCLUDED
+#define OPENVDB_EXCEPTIONS_HAS_BEEN_INCLUDED
+#include <exception>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <openvdb/version.h>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+/// @brief Base class of the library's exceptions; stores a "<type>: <message>" string.
+class OPENVDB_API Exception: public std::exception
+{
+public:
+    /// Return the stored message as a C string (never a null pointer).
+    virtual const char* what() const throw()
+    {
+        try { return mMessage.c_str(); } catch (...) {}
+        return "";
+    }
+    virtual ~Exception() throw() {}
+
+protected:
+    Exception() throw() {}
+    explicit Exception(const char* eType, const std::string* const msg = NULL) throw()
+    {
+        try {
+            if (eType) mMessage = eType;
+            if (msg) mMessage += ": " + (*msg);
+        } catch (...) {} // string operations may throw; a throw() constructor must not
+    }
+
+private:
+    std::string mMessage;
+};
+
+/// Define class @a _classname, derived from Exception, whose message starts with the class name.
+#define OPENVDB_EXCEPTION(_classname) \
+class OPENVDB_API _classname: public Exception \
+{ \
+public: \
+    _classname() throw() : Exception( #_classname ) {} \
+    explicit _classname(const std::string &msg) throw() : Exception( #_classname , &msg) {} \
+}
+
+// Exception types generated with the macro above
+OPENVDB_EXCEPTION(ArithmeticError);
+OPENVDB_EXCEPTION(IllegalValueException);
+OPENVDB_EXCEPTION(IndexError);
+OPENVDB_EXCEPTION(IoError);
+OPENVDB_EXCEPTION(KeyError);
+OPENVDB_EXCEPTION(LookupError);
+OPENVDB_EXCEPTION(NotImplementedError);
+OPENVDB_EXCEPTION(ReferenceError);
+OPENVDB_EXCEPTION(RuntimeError);
+OPENVDB_EXCEPTION(TypeError);
+OPENVDB_EXCEPTION(ValueError);
+
+// The generator macro is removed here so it is not visible to code that includes this header.
+#undef OPENVDB_EXCEPTION
+
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+/// Throw @a exception built from @a message streamed via operator<<; if streaming fails the text is empty.
+#define OPENVDB_THROW(exception, message) \
+{ \
+    std::string _openvdb_throw_msg; \
+    try { \
+        std::ostringstream _openvdb_throw_os; \
+        _openvdb_throw_os << message; \
+        _openvdb_throw_msg = _openvdb_throw_os.str(); \
+    } catch (...) {} \
+    throw exception(_openvdb_throw_msg); \
+} // OPENVDB_THROW
+
+#endif // OPENVDB_EXCEPTIONS_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/Grid.h b/nuparu/include/openvdb_new/Grid.h
new file mode 100644
index 00000000..0ac8adaf
--- /dev/null
+++ b/nuparu/include/openvdb_new/Grid.h
@@ -0,0 +1,1404 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_GRID_HAS_BEEN_INCLUDED
+#define OPENVDB_GRID_HAS_BEEN_INCLUDED
+
+#include <iostream>
+#include <set>
+#include <vector>
+#include <boost/static_assert.hpp>
+#include <boost/type_traits/remove_const.hpp>
+#include <boost/type_traits/is_floating_point.hpp>
+#include <openvdb/Types.h>
+#include <openvdb/util/Name.h>
+#include <openvdb/math/Transform.h>
+#include <openvdb/tree/Tree.h>
+#include <openvdb/metadata/MetaMap.h>
+#include <openvdb/Exceptions.h>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+
+typedef tree::TreeBase TreeBase;
+
+template<typename> class Grid; // forward declaration
+
+
+/// @brief Create a new grid of type @c GridType with a given background value.
+///
+/// @note Calling createGrid<GridType>(background) is equivalent to calling
+/// GridType::create(background).
+template<typename GridType>
+inline typename GridType::Ptr createGrid(const typename GridType::ValueType& background);
+
+
+/// @brief Create a new grid of type @c GridType with background value zero.
+///
+/// @note Calling createGrid<GridType>() is equivalent to calling GridType::create().
+template<typename GridType>
+inline typename GridType::Ptr createGrid();
+
+
+/// @brief Create a new grid of the appropriate type that wraps the given tree.
+///
+/// @note This function can be called without specifying the template argument,
+/// i.e., as createGrid(tree).
+template<typename TreePtrType>
+inline typename Grid<typename TreePtrType::element_type>::Ptr createGrid(TreePtrType);
+
+
+/// @brief Create a new grid of type @c GridType classified as a "Level Set",
+/// i.e., a narrow-band level set.
+///
+/// @note @c GridType::ValueType must be a floating-point scalar.
+///
+/// @param voxelSize  the size of a voxel in world units
+/// @param halfWidth  the half width of the narrow band in voxel units
+///
+/// @details The voxel size and the narrow band half width define the grid's
+/// background value as halfWidth*voxelSize.  The transform is linear
+/// with a uniform scaling only corresponding to the specified voxel size.
+///
+/// @note It is generally advisable to specify a half-width of the narrow band
+/// that is larger than one voxel unit, otherwise zero crossings are not guaranteed.
+template<typename GridType>
+typename GridType::Ptr createLevelSet(
+    Real voxelSize = 1.0, Real halfWidth = LEVEL_SET_HALF_WIDTH);
+
+
+////////////////////////////////////////
+
+/// @brief Abstract base class for typed grids
+/// @details Holds the transform and (via MetaMap) the metadata; subclasses supply the tree.
+class OPENVDB_API GridBase: public MetaMap
+{
+public:
+    typedef boost::shared_ptr<GridBase> Ptr;
+    typedef boost::shared_ptr<const GridBase> ConstPtr;
+    /// Signature of the factory functions accepted by registerGrid().
+    typedef Ptr (*GridFactory)();
+
+
+    virtual ~GridBase() {}
+
+    /// @brief Return a new grid of the same type as this grid and whose
+    /// metadata and transform are deep copies of this grid's.
+    virtual GridBase::Ptr copyGrid(CopyPolicy treePolicy = CP_SHARE) const = 0;
+
+    /// Return a new grid whose metadata, transform and tree are deep copies of this grid's.
+    virtual GridBase::Ptr deepCopyGrid() const = 0;
+
+
+    //
+    // Registry methods
+    //
+    /// Create a new grid of the given (registered) type.
+    static Ptr createGrid(const Name& type);
+
+    /// Return @c true if the given grid type name is registered.
+    static bool isRegistered(const Name &type);
+
+    /// Clear the grid type registry.
+    static void clearRegistry();
+
+
+    //
+    // Grid type methods
+    //
+    /// Return the name of this grid's type.
+    virtual Name type() const = 0;
+    /// Return the name of the type of a voxel's value (e.g., "float" or "vec3d").
+    virtual Name valueType() const = 0;
+
+    /// Return @c true if this grid is of the same type as the template parameter.
+    template<typename GridType>
+    bool isType() const { return (this->type() == GridType::gridType()); }
+
+    //@{
+    /// @brief Return the result of downcasting a GridBase pointer to a Grid pointer
+    /// of the specified type, or return a null pointer if the types are incompatible.
+    template<typename GridType>
+    static typename GridType::Ptr grid(const GridBase::Ptr&);
+    template<typename GridType>
+    static typename GridType::ConstPtr grid(const GridBase::ConstPtr&);
+    template<typename GridType>
+    static typename GridType::ConstPtr constGrid(const GridBase::Ptr&);
+    template<typename GridType>
+    static typename GridType::ConstPtr constGrid(const GridBase::ConstPtr&);
+    //@}
+
+    //@{
+    /// @brief Return a pointer to this grid's tree, which might be
+    /// shared with other grids.  The pointer is guaranteed to be non-null.
+    TreeBase::Ptr baseTreePtr();
+    TreeBase::ConstPtr baseTreePtr() const { return this->constBaseTreePtr(); }
+    virtual TreeBase::ConstPtr constBaseTreePtr() const = 0;
+    //@}
+
+    //@{
+    /// @brief Return a reference to this grid's tree, which might be
+    /// shared with other grids.
+    /// @note Calling setTree() on this grid invalidates all references
+    /// previously returned by this method.
+    TreeBase& baseTree() { return const_cast<TreeBase&>(this->constBaseTree()); }
+    const TreeBase& baseTree() const { return this->constBaseTree(); }
+    const TreeBase& constBaseTree() const { return *(this->constBaseTreePtr()); }
+    //@}
+
+    /// @brief Associate the given tree with this grid, in place of its existing tree.
+    /// @throw ValueError if the tree pointer is null
+    /// @throw TypeError if the tree is not of the appropriate type
+    /// @note Invalidates all references previously returned by baseTree()
+    /// or constBaseTree().
+    virtual void setTree(TreeBase::Ptr) = 0;
+
+    /// Set a new tree with the same background value as the previous tree.
+    virtual void newTree() = 0;
+
+    /// Return @c true if this grid contains only background voxels.
+    virtual bool empty() const = 0;
+    /// Empty this grid, setting all voxels to the background.
+    virtual void clear() = 0;
+
+    /// @brief Reduce the memory footprint of this grid by increasing its sparseness
+    /// either losslessly (@a tolerance = 0) or lossily (@a tolerance > 0).
+    /// @details With @a tolerance > 0, sparsify regions where voxels have the same
+    /// active state and have values that differ by no more than the tolerance
+    /// (converted to this grid's value type).
+    virtual void pruneGrid(float tolerance = 0.0) = 0;
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    /// @brief Clip this grid to the given world-space bounding box.
+    /// @details Voxels that lie outside the bounding box are set to the background.
+    /// @warning Clipping a level set will likely produce a grid that is
+    /// no longer a valid level set.
+    void clipGrid(const BBoxd&);
+
+    /// @brief Clip this grid to the given index-space bounding box.
+    /// @details Voxels that lie outside the bounding box are set to the background.
+    /// @warning Clipping a level set will likely produce a grid that is
+    /// no longer a valid level set.
+    virtual void clip(const CoordBBox&) = 0;
+#endif
+
+
+    //
+    // Metadata
+    //
+    /// Return this grid's user-specified name.
+    std::string getName() const;
+    /// Specify a name for this grid.
+    void setName(const std::string&);
+
+    /// Return the user-specified description of this grid's creator.
+    std::string getCreator() const;
+    /// Provide a description of this grid's creator.
+    void setCreator(const std::string&);
+
+    /// @brief Return @c true if this grid should be written out with floating-point
+    /// voxel values (including components of vectors) quantized to 16 bits.
+    bool saveFloatAsHalf() const;
+    void setSaveFloatAsHalf(bool); ///< Presumably sets the flag read by saveFloatAsHalf().
+
+    /// Return the class of volumetric data (level set, fog volume, etc.) stored in this grid.
+    GridClass getGridClass() const;
+    /// Specify the class of volumetric data (level set, fog volume, etc.) stored in this grid.
+    void setGridClass(GridClass);
+    /// Remove the setting specifying the class of this grid's volumetric data.
+    void clearGridClass();
+
+    /// Return the metadata string value for the given class of volumetric data.
+    static std::string gridClassToString(GridClass);
+    /// Return a formatted string version of the grid class.
+    static std::string gridClassToMenuName(GridClass);
+    /// @brief Return the class of volumetric data specified by the given string.
+    /// @details If the string is not one of the ones returned by gridClassToString(),
+    /// return @c GRID_UNKNOWN.
+    static GridClass stringToGridClass(const std::string&);
+
+    /// @brief Return the type of vector data (invariant, covariant, etc.) stored
+    /// in this grid, assuming that this grid contains a vector-valued tree.
+    VecType getVectorType() const;
+    /// @brief Specify the type of vector data (invariant, covariant, etc.) stored
+    /// in this grid, assuming that this grid contains a vector-valued tree.
+    void setVectorType(VecType);
+    /// Remove the setting specifying the type of vector data stored in this grid.
+    void clearVectorType();
+
+    /// Return the metadata string value for the given type of vector data.
+    static std::string vecTypeToString(VecType);
+    /// Return a string listing examples of the given type of vector data
+    /// (e.g., "Gradient/Normal", given VEC_COVARIANT).
+    static std::string vecTypeExamples(VecType);
+    /// @brief Return a string describing how the given type of vector data is affected
+    /// by transformations (e.g., "Does not transform", given VEC_INVARIANT).
+    static std::string vecTypeDescription(VecType);
+    static VecType stringToVecType(const std::string&); ///< Presumably inverts vecTypeToString().
+
+    /// Return @c true if this grid's voxel values are in world space and should be
+    /// affected by transformations, @c false if they are in local space and should
+    /// not be affected by transformations.
+    bool isInWorldSpace() const;
+    /// Specify whether this grid's voxel values are in world space or in local space.
+    void setIsInWorldSpace(bool);
+
+    // Standard metadata field names
+    // (These fields should normally not be accessed directly, but rather
+    // via the accessor methods above, when available.)
+    // Note: Visual C++ requires these declarations to be separate statements.
+    static const char* const META_GRID_CLASS;
+    static const char* const META_GRID_CREATOR;
+    static const char* const META_GRID_NAME;
+    static const char* const META_SAVE_HALF_FLOAT;
+    static const char* const META_IS_LOCAL_SPACE;
+    static const char* const META_VECTOR_TYPE;
+    static const char* const META_FILE_BBOX_MIN;
+    static const char* const META_FILE_BBOX_MAX;
+    static const char* const META_FILE_COMPRESSION;
+    static const char* const META_FILE_MEM_BYTES;
+    static const char* const META_FILE_VOXEL_COUNT;
+
+
+    //
+    // Statistics
+    //
+    /// Return the number of active voxels.
+    virtual Index64 activeVoxelCount() const = 0;
+
+    /// Return the axis-aligned bounding box of all active voxels. If
+    /// the grid is empty a default bbox is returned.
+    virtual CoordBBox evalActiveVoxelBoundingBox() const = 0;
+
+    /// Return the dimensions of the axis-aligned bounding box of all active voxels.
+    virtual Coord evalActiveVoxelDim() const = 0;
+
+    /// Return the number of bytes of memory used by this grid.
+    virtual Index64 memUsage() const = 0;
+
+    /// @brief Add metadata to this grid comprising the current values
+    /// of statistics like the active voxel count and bounding box.
+    /// @note This metadata is not automatically kept up-to-date with
+    /// changes to this grid.
+    void addStatsMetadata();
+    /// @brief Return a new MetaMap containing just the metadata that
+    /// was added to this grid with addStatsMetadata().
+    /// @details If addStatsMetadata() was never called on this grid,
+    /// return an empty MetaMap.
+    MetaMap::Ptr getStatsMetadata() const;
+
+
+    //
+    // Transform methods
+    //
+    //@{
+    /// @brief Return a pointer to this grid's transform, which might be
+    /// shared with other grids.
+    math::Transform::Ptr transformPtr() { return mTransform; }
+    math::Transform::ConstPtr transformPtr() const { return mTransform; }
+    math::Transform::ConstPtr constTransformPtr() const { return mTransform; }
+    //@}
+    //@{
+    /// @brief Return a reference to this grid's transform, which might be
+    /// shared with other grids.
+    /// @note Calling setTransform() on this grid invalidates all references
+    /// previously returned by this method.
+    math::Transform& transform() { return *mTransform; }
+    const math::Transform& transform() const { return *mTransform; }
+    const math::Transform& constTransform() const { return *mTransform; }
+    //@}
+    /// @brief Associate the given transform with this grid, in place of
+    /// its existing transform.
+    /// @throw ValueError if the transform pointer is null
+    /// @note Invalidates all references previously returned by transform()
+    /// or constTransform().
+    void setTransform(math::Transform::Ptr);
+
+    /// Return the size of this grid's voxels.
+    Vec3d voxelSize() const { return transform().voxelSize(); }
+    /// @brief Return the size of this grid's voxel at position (x, y, z).
+    /// @note Frustum and perspective transforms have position-dependent voxel size.
+    Vec3d voxelSize(const Vec3d& xyz) const { return transform().voxelSize(xyz); }
+    /// Return true if the voxels in world space are uniformly sized cubes
+    bool hasUniformVoxels() const { return mTransform->hasUniformScale(); }
+    //@{
+    /// Apply this grid's transform to the given coordinates.
+    Vec3d indexToWorld(const Vec3d& xyz) const { return transform().indexToWorld(xyz); }
+    Vec3d indexToWorld(const Coord& ijk) const { return transform().indexToWorld(ijk); }
+    //@}
+    /// Apply the inverse of this grid's transform to the given coordinates.
+    Vec3d worldToIndex(const Vec3d& xyz) const { return transform().worldToIndex(xyz); }
+
+
+    //
+    // I/O methods
+    //
+    /// @brief Read the grid topology from a stream.
+    /// This will read only the grid structure, not the actual data buffers.
+    virtual void readTopology(std::istream&) = 0;
+    /// @brief Write the grid topology to a stream.
+    /// This will write only the grid structure, not the actual data buffers.
+    virtual void writeTopology(std::ostream&) const = 0;
+
+    /// Read all data buffers for this grid.
+    virtual void readBuffers(std::istream&) = 0;
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    /// Read all of this grid's data buffers that intersect the given index-space bounding box.
+    virtual void readBuffers(std::istream&, const CoordBBox&) = 0;
+    /// @brief Read all of this grid's data buffers that are not yet resident in memory
+    /// (because delayed loading is in effect).
+    /// @details If this grid was read from a memory-mapped file, this operation
+    /// disconnects the grid from the file.
+    /// @sa io::File::open, io::MappedFile
+    virtual void readNonresidentBuffers() const = 0;
+#endif
+    /// Write out all data buffers for this grid.
+    virtual void writeBuffers(std::ostream&) const = 0;
+
+    /// Read in the transform for this grid.
+    void readTransform(std::istream& is) { transform().read(is); }
+    /// Write out the transform for this grid.
+    void writeTransform(std::ostream& os) const { transform().write(os); }
+
+    /// Output a human-readable description of this grid.
+    virtual void print(std::ostream& = std::cout, int verboseLevel = 1) const = 0;
+
+
+protected:
+    /// @brief Initialize with an identity linear transform.
+    GridBase(): mTransform(math::Transform::createLinearTransform()) {}
+
+    /// @brief Deep copy another grid's metadata and transform.
+    GridBase(const GridBase& other): MetaMap(other), mTransform(other.mTransform->copy()) {}
+
+    /// @brief Copy another grid's metadata but share its transform.
+    GridBase(const GridBase& other, ShallowCopy): MetaMap(other), mTransform(other.mTransform) {}
+
+    /// Register a grid type along with a factory function.
+    static void registerGrid(const Name& type, GridFactory);
+    /// Remove a grid type from the registry.
+    static void unregisterGrid(const Name& type);
+
+
+private:
+    math::Transform::Ptr mTransform; ///< Returned by transform()/transformPtr(); may be shared.
+}; // class GridBase
+
+
+////////////////////////////////////////
+
+// Vector of shared pointers to grids, its iterators, and a shared pointer to the vector
+typedef std::vector<GridBase::Ptr>      GridPtrVec;
+typedef GridPtrVec::iterator            GridPtrVecIter;
+typedef GridPtrVec::const_iterator      GridPtrVecCIter;
+typedef boost::shared_ptr<GridPtrVec>   GridPtrVecPtr;
+// Same as above, but the vector elements point to const grids
+typedef std::vector<GridBase::ConstPtr> GridCPtrVec;
+typedef GridCPtrVec::iterator           GridCPtrVecIter;
+typedef GridCPtrVec::const_iterator     GridCPtrVecCIter;
+typedef boost::shared_ptr<GridCPtrVec>  GridCPtrVecPtr;
+// Set of shared pointers to grids, its iterators, and a shared pointer to the set
+typedef std::set<GridBase::Ptr>         GridPtrSet;
+typedef GridPtrSet::iterator            GridPtrSetIter;
+typedef GridPtrSet::const_iterator      GridPtrSetCIter;
+typedef boost::shared_ptr<GridPtrSet>   GridPtrSetPtr;
+// Same as above, but the set elements point to const grids
+typedef std::set<GridBase::ConstPtr>    GridCPtrSet;
+typedef GridCPtrSet::iterator           GridCPtrSetIter;
+typedef GridCPtrSet::const_iterator     GridCPtrSetCIter;
+typedef boost::shared_ptr<GridCPtrSet>  GridCPtrSetPtr;
+
+
+/// @brief Unary predicate: @c true for a non-null grid pointer whose getName() equals @c name
+struct OPENVDB_API GridNamePred
+{
+    GridNamePred(const Name& gridName): name(gridName) {}
+    bool operator()(const GridBase::ConstPtr& p) const { return p ? (p->getName() == name) : false; }
+    Name name;
+};
+//@{
+/// Return the first element of @a container accepted by GridNamePred(name), else an empty pointer.
+template<typename GridPtrContainerT>
+inline typename GridPtrContainerT::value_type
+findGridByName(const GridPtrContainerT& container, const Name& name)
+{
+    typedef typename GridPtrContainerT::const_iterator CIter;
+    const CIter last = container.end();
+    const CIter hit = std::find_if(container.begin(), last, GridNamePred(name));
+    return (hit != last) ? *hit : typename GridPtrContainerT::value_type();
+}
+
+/// Walk the map's values in iteration order and return the first non-null grid whose
+/// name is @a name, or a default-constructed GridPtrT if no value matches.
+template<typename KeyT, typename GridPtrT>
+inline GridPtrT
+findGridByName(const std::map<KeyT, GridPtrT>& container, const Name& name)
+{
+    typename std::map<KeyT, GridPtrT>::const_iterator entry = container.begin();
+    const typename std::map<KeyT, GridPtrT>::const_iterator last = container.end();
+    while (entry != last) {
+        if (entry->second && entry->second->getName() == name) return entry->second;
+        ++entry;
+    }
+    return GridPtrT();
+}
+//@}
+
+
+////////////////////////////////////////
+
+
+/// @brief Container class that associates a tree with a transform and metadata
+template<typename _TreeType>
+class Grid: public GridBase
+{
+public:
+    typedef boost::shared_ptr<Grid>                       Ptr;
+    typedef boost::shared_ptr<const Grid>                 ConstPtr;
+
+    typedef _TreeType                                     TreeType;
+    typedef typename _TreeType::Ptr                       TreePtrType;
+    typedef typename _TreeType::ConstPtr                  ConstTreePtrType;
+    typedef typename _TreeType::ValueType                 ValueType;
+    typedef typename _TreeType::BuildType                 BuildType;
+
+    typedef typename _TreeType::ValueOnIter               ValueOnIter;
+    typedef typename _TreeType::ValueOnCIter              ValueOnCIter;
+    typedef typename _TreeType::ValueOffIter              ValueOffIter;
+    typedef typename _TreeType::ValueOffCIter             ValueOffCIter;
+    typedef typename _TreeType::ValueAllIter              ValueAllIter;
+    typedef typename _TreeType::ValueAllCIter             ValueAllCIter;
+
+    typedef typename tree::ValueAccessor<_TreeType, true>        Accessor;
+    typedef typename tree::ValueAccessor<const _TreeType, true>  ConstAccessor;
+    typedef typename tree::ValueAccessor<_TreeType, false>       UnsafeAccessor;
+    typedef typename tree::ValueAccessor<const _TreeType, false> ConstUnsafeAccessor;
+
+    /// @brief ValueConverter<T>::Type is the type of a grid having the same
+    /// hierarchy as this grid but a different value type, T.
+    ///
+    /// For example, FloatGrid::ValueConverter<double>::Type is equivalent to DoubleGrid.
+    /// @note If the source grid type is a template argument, it might be necessary
+    /// to write "typename SourceGrid::template ValueConverter<T>::Type".
+    template<typename OtherValueType>
+    struct ValueConverter {
+        typedef Grid<typename TreeType::template ValueConverter<OtherValueType>::Type> Type;
+    };
+
+    /// Return a new grid with the given background value.
+    static Ptr create(const ValueType& background);
+    /// Return a new grid with background value zero.
+    static Ptr create();
+    /// @brief Return a new grid that contains the given tree.
+    /// @throw ValueError if the tree pointer is null
+    static Ptr create(TreePtrType);
+    /// @brief Return a new, empty grid with the same transform and metadata as the
+    /// given grid and with background value zero.
+    static Ptr create(const GridBase& other);
+
+
+    /// Construct a new grid with background value zero.
+    Grid();
+    /// Construct a new grid with the given background value.
+    explicit Grid(const ValueType& background);
+    /// @brief Construct a new grid that shares the given tree and associates with it
+    /// an identity linear transform.
+    /// @throw ValueError if the tree pointer is null
+    explicit Grid(TreePtrType);
+    /// Deep copy another grid's metadata, transform and tree.
+    Grid(const Grid&);
+    /// @brief Deep copy the metadata, transform and tree of another grid whose tree
+    /// configuration is the same as this grid's but whose value type is different.
+    /// Cast the other grid's values to this grid's value type.
+    /// @throw TypeError if the other grid's tree configuration doesn't match this grid's
+    /// or if this grid's ValueType is not constructible from the other grid's ValueType.
+    template<typename OtherTreeType>
+    explicit Grid(const Grid<OtherTreeType>&);
+    /// Deep copy another grid's metadata, but share its tree and transform.
+    Grid(const Grid&, ShallowCopy);
+    /// @brief Deep copy another grid's metadata and transform, but construct a new tree
+    /// with background value zero.
+    explicit Grid(const GridBase&);
+
+    virtual ~Grid() {}
+
+    //@{
+    /// @brief Return a new grid of the same type as this grid and whose
+    /// metadata and transform are deep copies of this grid's.
+    /// @details If @a treePolicy is @c CP_NEW, give the new grid a new, empty tree;
+    /// if @c CP_SHARE, the new grid shares this grid's tree and transform;
+    /// if @c CP_COPY, the new grid's tree is a deep copy of this grid's tree and transform
+    Ptr copy(CopyPolicy treePolicy = CP_SHARE) const;
+    virtual GridBase::Ptr copyGrid(CopyPolicy treePolicy = CP_SHARE) const;
+    //@}
+    //@{
+    /// Return a new grid whose metadata, transform and tree are deep copies of this grid's.
+    Ptr deepCopy() const { return Ptr(new Grid(*this)); }
+    virtual GridBase::Ptr deepCopyGrid() const { return this->deepCopy(); }
+    //@}
+
+    /// Return the name of this grid's type.
+    virtual Name type() const { return this->gridType(); }
+    /// Return the name of this type of grid.
+    static Name gridType() { return TreeType::treeType(); }
+
+
+    //
+    // Voxel access methods
+    //
+    /// Return the name of the type of a voxel's value (e.g., "float" or "vec3d").
+    virtual Name valueType() const { return tree().valueType(); }
+
+    /// @brief Return this grid's background value.
+    ///
+    /// @note Use tools::changeBackground to efficiently modify the background values.
+    const ValueType& background() const { return mTree->background(); }
+
+    /// Return @c true if this grid contains only inactive background voxels.
+    virtual bool empty() const { return tree().empty(); }
+    /// Empty this grid, so that all voxels become inactive background voxels.
+    virtual void clear() { tree().clear(); }
+
+    /// @brief Return an accessor that provides random read and write access
+    /// to this grid's voxels. The accessor is safe in the sense that
+    /// it is registered by the tree of this grid.
+    Accessor getAccessor() { return Accessor(tree()); }
+    /// @brief Return an accessor that provides random read and write access
+    /// to this grid's voxels. The accessor is unsafe in the sense that
+    /// it is not registered by the tree of this grid. In some rare
+    /// cases this can give a performance advantage over a registered
+    /// accessor but it is unsafe if the tree topology is modified.
+    ///
+    /// @warning Only use this method if you're an expert and know the
+    /// risks of using an unregistered accessor (see tree/ValueAccessor.h)
+    UnsafeAccessor getUnsafeAccessor() { return UnsafeAccessor(tree()); }
+    //@{
+    /// Return an accessor that provides random read-only access to this grid's voxels.
+    ConstAccessor getAccessor() const { return ConstAccessor(tree()); }
+    ConstAccessor getConstAccessor() const { return ConstAccessor(tree()); }
+    //@}
+    /// @brief Return an accessor that provides random read-only access
+    /// to this grid's voxels. The accessor is unsafe in the sense that
+    /// it is not registered by the tree of this grid. In some rare
+    /// cases this can give a performance advantage over a registered
+    /// accessor but it is unsafe if the tree topology is modified.
+    ///
+    /// @warning Only use this method if you're an expert and know the
+    /// risks of using an unregistered accessor (see tree/ValueAccessor.h)
+    ConstUnsafeAccessor getConstUnsafeAccessor() const { return ConstUnsafeAccessor(tree()); }
+
+    //@{
+    /// Return an iterator that visits every active value (tile and voxel) of this grid.
+    ValueOnIter   beginValueOn()       { return this->tree().beginValueOn(); }
+    ValueOnCIter  beginValueOn() const { return this->tree().cbeginValueOn(); }
+    ValueOnCIter cbeginValueOn() const { return this->tree().cbeginValueOn(); }
+    //@}
+    //@{
+    /// Return an iterator that visits every inactive value (tile and voxel) of this grid.
+    ValueOffIter   beginValueOff()       { return this->tree().beginValueOff(); }
+    ValueOffCIter  beginValueOff() const { return this->tree().cbeginValueOff(); }
+    ValueOffCIter cbeginValueOff() const { return this->tree().cbeginValueOff(); }
+    //@}
+    //@{
+    /// Return an iterator that visits every value (tile and voxel) of this grid.
+    ValueAllIter   beginValueAll()       { return this->tree().beginValueAll(); }
+    ValueAllCIter  beginValueAll() const { return this->tree().cbeginValueAll(); }
+    ValueAllCIter cbeginValueAll() const { return this->tree().cbeginValueAll(); }
+    //@}
+
+    /// Return, via @a minVal and @a maxVal, the minimum and maximum active values in this grid.
+    void evalMinMax(ValueType& minVal, ValueType& maxVal) const;
+
+    /// @brief Set all voxels within a given axis-aligned box to a constant value.
+    /// @param bbox    inclusive coordinates of opposite corners of an axis-aligned box
+    /// @param value   the value to which to set voxels within the box
+    /// @param active  if true, mark voxels within the box as active,
+    ///                otherwise mark them as inactive
+    /// @param sparse  if false, active tiles are voxelized, i.e. only active voxels
+    ///                are generated from the fill operation. Defaults to true.
+    /// @note If @a sparse is true, this operation generates a sparse, but not always
+    /// optimally sparse, representation of the filled box.  Follow fill operations
+    /// with a prune() operation for optimal sparseness.
+    void fill(const CoordBBox& bbox, const ValueType& value, bool active = true, bool sparse = true);
+
+    /// Reduce the memory footprint of this grid by increasing its sparseness (@a tolerance defaults to zero).
+    virtual void pruneGrid(float tolerance = 0.0);
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    /// @brief Clip this grid to the given index-space bounding box.
+    /// @details Voxels that lie outside the bounding box are set to the background.
+    /// @warning Clipping a level set will likely produce a grid that is no longer
+    /// a valid level set.  (Not declared when OPENVDB_2_ABI_COMPATIBLE is defined.)
+    virtual void clip(const CoordBBox&);
+#endif
+
+    /// @brief Efficiently merge another grid into this grid using the given merge
+    /// policy, which defaults to MERGE_ACTIVE_STATES.
+    /// @details This operation is primarily intended to combine grids that are mostly non-overlapping
+    /// (for example, intermediate grids from computations parallelized across disjoint regions of space).
+    /// @warning This operation always empties the other grid.
+    void merge(Grid& other, MergePolicy policy = MERGE_ACTIVE_STATES);
+
+    /// @brief Union this grid's set of active values with the active values
+    /// of the other grid, whose value type may differ from this grid's.
+    /// @details A resulting value is active if the corresponding value
+    /// was already active OR if it is active in the other grid. Also, a resulting
+    /// value maps to a voxel if the corresponding value already mapped to a voxel
+    /// OR if it is a voxel in the other grid. Thus, a resulting value can only
+    /// map to a tile if the corresponding value already mapped to a tile
+    /// AND if it is a tile value in the other grid.
+    ///
+    /// @note This operation modifies only active states, not values.
+    /// Specifically, active tiles and voxels in this grid are not changed, and
+    /// tiles or voxels that were inactive in this grid but active in the other grid
+    /// are marked as active in this grid but left with their original values.
+    template<typename OtherTreeType>
+    void topologyUnion(const Grid<OtherTreeType>& other);
+
+    /// @brief Intersect this grid's set of active values with the active values
+    /// of the other grid, whose value type may differ from this grid's.
+    /// @details A resulting value is active only if the corresponding
+    /// value was already active AND if it is active in the other grid. Also, a
+    /// resulting value maps to a voxel if the corresponding value
+    /// already mapped to an active voxel in either of the two grids
+    /// and it maps to an active tile or voxel in the other grid.
+    ///
+    /// @note This operation can delete branches of this grid that overlap with
+    /// inactive tiles in the other grid.  Also, because it can deactivate voxels,
+    /// it can create leaf nodes with no active values.  Thus, it is recommended
+    /// to prune this grid after calling this method.
+    template<typename OtherTreeType>
+    void topologyIntersection(const Grid<OtherTreeType>& other);
+
+    /// @brief Difference this grid's set of active values with the active values
+    /// of the other grid, whose value type may differ from this grid's.
+    /// @details After this method is called, a voxel in this grid is active
+    /// only if it was active to begin with and the corresponding voxel
+    /// in the other grid was inactive.
+    ///
+    /// @note This operation can delete branches of this grid that overlap with
+    /// active tiles in the other grid.  Also, because it can deactivate voxels,
+    /// it can create leaf nodes with no active values.  Thus, it is recommended
+    /// to prune this grid after calling this method.
+    template<typename OtherTreeType>
+    void topologyDifference(const Grid<OtherTreeType>& other);
+
+    //
+    // Statistics
+    //
+    /// Return the number of active voxels, as counted by this grid's tree.
+    virtual Index64 activeVoxelCount() const { return this->tree().activeVoxelCount(); }
+    /// Return the axis-aligned bounding box of all active voxels, as evaluated by this grid's tree.
+    virtual CoordBBox evalActiveVoxelBoundingBox() const;
+    /// Return the dimensions of the axis-aligned bounding box of all active voxels (a default-constructed Coord when the tree's evalActiveVoxelDim() returns false).
+    virtual Coord evalActiveVoxelDim() const;
+
+    /// Return the number of bytes of memory used by this grid's tree.
+    /// @todo Add transform().memUsage()
+    virtual Index64 memUsage() const { return this->tree().memUsage(); }
+
+
+    //
+    // Tree methods
+    //
+    //@{
+    /// @brief Return the (possibly shared with other grids) pointer to this
+    /// grid's tree.  The pointer is guaranteed to be non-null.
+    TreePtrType treePtr() { return this->mTree; }
+    ConstTreePtrType treePtr() const { return this->mTree; }
+    ConstTreePtrType constTreePtr() const { return this->mTree; }
+    virtual TreeBase::ConstPtr constBaseTreePtr() const { return this->mTree; }
+    //@}
+    //@{
+    /// @brief Return a reference to this grid's tree (the tree itself might
+    /// be shared with other grids).
+    /// @note A reference obtained here is invalidated by a subsequent call
+    /// to setTree() on this grid.
+    TreeType& tree() { return *(this->mTree); }
+    const TreeType& tree() const { return *(this->mTree); }
+    const TreeType& constTree() const { return *(this->mTree); }
+    //@}
+
+    /// @brief Replace this grid's existing tree with the given tree.
+    /// @throw ValueError if the tree pointer is null
+    /// @throw TypeError if the tree's type() differs from TreeType::treeType()
+    /// @note Invalidates all references previously returned by baseTree(),
+    /// constBaseTree(), tree() or constTree().
+    virtual void setTree(TreeBase::Ptr);
+
+    /// @brief Replace this grid's existing tree with a new, empty tree.
+    /// @note The new tree is constructed with this grid's current background value.
+    virtual void newTree();
+
+
+    //
+    // I/O methods
+    //
+    /// @brief Read the grid topology from a stream.
+    /// @details Only the grid structure is read, not the actual data buffers.
+    virtual void readTopology(std::istream&);
+    /// @brief Write the grid topology to a stream.
+    /// @details Only the grid structure is written, not the actual data buffers.
+    virtual void writeTopology(std::ostream&) const;
+
+    /// Read all of this grid's data buffers from the given stream.
+    virtual void readBuffers(std::istream&);
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    /// Read only those data buffers of this grid that intersect the given index-space bounding box.
+    virtual void readBuffers(std::istream&, const CoordBBox&);
+    /// @brief Read all of this grid's data buffers that are not yet resident in memory
+    /// (because delayed loading is in effect).
+    /// @details If this grid was read from a memory-mapped file, this operation
+    /// disconnects the grid from the file.  Note that the method is const.
+    /// @sa io::File::open, io::MappedFile
+    virtual void readNonresidentBuffers() const;
+#endif
+    /// Write all of this grid's data buffers to the given stream.
+    virtual void writeBuffers(std::ostream&) const;
+
+    /// Output a human-readable description of this grid (to std::cout at verbosity 1 by default).
+    virtual void print(std::ostream& = std::cout, int verboseLevel = 1) const;
+
+
+    //
+    // Registry methods
+    //
+    /// Return @c true if this grid type, identified by gridType(), is registered.
+    static bool isRegistered() { return GridBase::isRegistered(Grid::gridType()); }
+    /// Register this grid type, identified by gridType(), together with its factory function.
+    static void registerGrid() { GridBase::registerGrid(Grid::gridType(), &Grid::factory); }
+    /// Remove this grid type, identified by gridType(), from the registry.
+    static void unregisterGrid() { GridBase::unregisterGrid(Grid::gridType()); }
+
+
+private:
+    /// Assignment is disallowed (declared private), since it wouldn't be obvious whether the copy is deep or shallow.
+    Grid& operator=(const Grid& other);
+
+    /// Factory handed to GridBase::registerGrid() by registerGrid(); creates a default grid.
+    static GridBase::Ptr factory() { return GridBase::Ptr(Grid::create()); }
+
+    TreePtrType mTree;
+}; // class Grid
+
+
+////////////////////////////////////////
+
+
+/// @brief Cast a generic grid pointer to a pointer to a grid of a concrete class.
+///
+/// @return a null pointer if the input pointer is null or if it points to
+/// a grid that is not of type @c GridType.
+///
+/// @note Calling gridPtrCast<GridType>(grid) is equivalent to calling
+/// GridBase::grid<GridType>(grid).
+template<typename GridType>
+inline typename GridType::Ptr
+gridPtrCast(const GridBase::Ptr& grid)
+{
+    return GridBase::grid<GridType>(grid);
+}
+
+
+/// @brief Cast a generic const grid pointer to a const pointer to a grid
+/// of a concrete class.
+///
+/// @return a null pointer if the input pointer is null or if it points to
+/// a grid that is not of type @c GridType.
+///
+/// @note Calling gridConstPtrCast<GridType>(grid) is equivalent to calling
+/// GridBase::constGrid<GridType>(grid).
+template<typename GridType>
+inline typename GridType::ConstPtr
+gridConstPtrCast(const GridBase::ConstPtr& grid)
+{
+    return GridBase::constGrid<GridType>(grid);
+}
+
+
+////////////////////////////////////////
+
+
+/// @{
+/// @brief Deep-copy the given grid and return the copy as a pointer to
+/// @c GridType, provided that @c GridType is the grid's concrete type.
+///
+/// @return a null pointer if the input pointer is null or if it points to
+/// a grid that is not of type @c GridType.
+template<typename GridType>
+inline typename GridType::Ptr
+deepCopyTypedGrid(const GridBase::ConstPtr& grid)
+{
+    if (grid && grid->isType<GridType>()) return gridPtrCast<GridType>(grid->deepCopyGrid());
+    return typename GridType::Ptr();
+}
+
+// Reference overload: yields a null pointer when the grid is not of type GridType.
+template<typename GridType>
+inline typename GridType::Ptr
+deepCopyTypedGrid(const GridBase& grid)
+{
+    if (grid.isType<GridType>()) return gridPtrCast<GridType>(grid.deepCopyGrid());
+    return typename GridType::Ptr();
+}
+/// @}
+
+
+////////////////////////////////////////
+
+
+//@{
+/// @brief Adapter that lets code templated on a Tree type accept either
+/// a Tree type or a Grid type (see the partial specializations of this struct).
+template<typename TreeT>
+struct TreeAdapter
+{
+    typedef TreeT                               TreeType;
+    typedef typename boost::remove_const<TreeType>::type NonConstTreeType;
+    typedef typename TreeType::Ptr              TreePtrType;
+    typedef typename TreeType::ConstPtr         ConstTreePtrType;
+    typedef typename NonConstTreeType::Ptr      NonConstTreePtrType;
+    typedef Grid<TreeType>                      GridType;
+    typedef Grid<NonConstTreeType>              NonConstGridType;
+    typedef typename GridType::Ptr              GridPtrType;
+    typedef typename NonConstGridType::Ptr      NonConstGridPtrType;
+    typedef typename GridType::ConstPtr         ConstGridPtrType;
+    typedef typename TreeType::ValueType        ValueType;
+    typedef typename tree::ValueAccessor<TreeType>         AccessorType;
+    typedef typename tree::ValueAccessor<const TreeType>   ConstAccessorType;
+    typedef typename tree::ValueAccessor<NonConstTreeType> NonConstAccessorType;
+
+    static TreeType& tree(TreeType& t) { return t; }
+    static TreeType& tree(GridType& g) { return g.tree(); }
+    static const TreeType& tree(const TreeType& t) { return t; }
+    static const TreeType& tree(const GridType& g) { return g.tree(); }
+    static const TreeType& constTree(TreeType& t) { return t; }
+    static const TreeType& constTree(GridType& g) { return g.constTree(); }
+    static const TreeType& constTree(const TreeType& t) { return t; }
+    static const TreeType& constTree(const GridType& g) { return g.constTree(); }
+};
+
+
+/// Partial specialization for Grid types: the adapted tree type is the grid's tree type.
+template<typename TreeT>
+struct TreeAdapter<Grid<TreeT> >
+{
+    typedef TreeT                               TreeType;
+    typedef typename boost::remove_const<TreeType>::type NonConstTreeType;
+    typedef typename TreeType::Ptr              TreePtrType;
+    typedef typename TreeType::ConstPtr         ConstTreePtrType;
+    typedef typename NonConstTreeType::Ptr      NonConstTreePtrType;
+    typedef Grid<TreeType>                      GridType;
+    typedef Grid<NonConstTreeType>              NonConstGridType;
+    typedef typename GridType::Ptr              GridPtrType;
+    typedef typename NonConstGridType::Ptr      NonConstGridPtrType;
+    typedef typename GridType::ConstPtr         ConstGridPtrType;
+    typedef typename TreeType::ValueType        ValueType;
+    typedef typename tree::ValueAccessor<TreeType>         AccessorType;
+    typedef typename tree::ValueAccessor<const TreeType>   ConstAccessorType;
+    typedef typename tree::ValueAccessor<NonConstTreeType> NonConstAccessorType;
+
+    static TreeType& tree(TreeType& t) { return t; }
+    static TreeType& tree(GridType& g) { return g.tree(); }
+    static const TreeType& tree(const TreeType& t) { return t; }
+    static const TreeType& tree(const GridType& g) { return g.tree(); }
+    static const TreeType& constTree(TreeType& t) { return t; }
+    static const TreeType& constTree(GridType& g) { return g.constTree(); }
+    static const TreeType& constTree(const TreeType& t) { return t; }
+    static const TreeType& constTree(const GridType& g) { return g.constTree(); }
+};
+
+/// Partial specialization for ValueAccessor types; tree() additionally accepts an accessor.
+template<typename TreeT>
+struct TreeAdapter<tree::ValueAccessor<TreeT> >
+{
+    typedef TreeT                               TreeType;
+    typedef typename boost::remove_const<TreeType>::type NonConstTreeType;
+    typedef typename TreeType::Ptr              TreePtrType;
+    typedef typename TreeType::ConstPtr         ConstTreePtrType;
+    typedef typename NonConstTreeType::Ptr      NonConstTreePtrType;
+    typedef Grid<TreeType>                      GridType;
+    typedef Grid<NonConstTreeType>              NonConstGridType;
+    typedef typename GridType::Ptr              GridPtrType;
+    typedef typename NonConstGridType::Ptr      NonConstGridPtrType;
+    typedef typename GridType::ConstPtr         ConstGridPtrType;
+    typedef typename TreeType::ValueType        ValueType;
+    typedef typename tree::ValueAccessor<TreeType>         AccessorType;
+    typedef typename tree::ValueAccessor<const TreeType>   ConstAccessorType;
+    typedef typename tree::ValueAccessor<NonConstTreeType> NonConstAccessorType;
+
+    static TreeType& tree(TreeType& t) { return t; }
+    static TreeType& tree(GridType& g) { return g.tree(); }
+    static TreeType& tree(AccessorType& a) { return a.tree(); }
+    static const TreeType& tree(const TreeType& t) { return t; }
+    static const TreeType& tree(const GridType& g) { return g.tree(); }
+    static const TreeType& tree(const AccessorType& a) { return a.tree(); }
+    static const TreeType& constTree(TreeType& t) { return t; }
+    static const TreeType& constTree(GridType& g) { return g.constTree(); }
+    static const TreeType& constTree(const TreeType& t) { return t; }
+    static const TreeType& constTree(const GridType& g) { return g.constTree(); }
+};
+
+//@}
+
+
+////////////////////////////////////////
+
+
+template<typename GridType>
+inline typename GridType::Ptr
+GridBase::grid(const GridBase::Ptr& grid)
+{
+    // Compare type names rather than using a dynamic_pointer_cast: the string
+    // comparison is slower, but it is safer when pointers cross dso boundaries,
+    // as they do in many Houdini nodes.
+    const bool typeMatches = (grid && grid->type() == GridType::gridType());
+    if (!typeMatches) return typename GridType::Ptr();
+    return boost::static_pointer_cast<GridType>(grid);
+}
+
+// Const overload: cast away constness, reuse the non-const overload, then restore constness.
+template<typename GridType>
+inline typename GridType::ConstPtr
+GridBase::grid(const GridBase::ConstPtr& grid)
+{
+    const GridBase::Ptr nonConstGrid = boost::const_pointer_cast<GridBase>(grid);
+    return boost::const_pointer_cast<const GridType>(GridBase::grid<GridType>(nonConstGrid));
+}
+
+// Return the result of grid<GridType>() for the given pointer as a pointer to const GridType.
+template<typename GridType>
+inline typename GridType::ConstPtr
+GridBase::constGrid(const GridBase::Ptr& grid)
+{
+    return boost::const_pointer_cast<const GridType>(GridBase::grid<GridType>(grid));
+}
+
+// Const-input overload: cast away constness, reuse grid<GridType>(), then restore constness.
+template<typename GridType>
+inline typename GridType::ConstPtr
+GridBase::constGrid(const GridBase::ConstPtr& grid)
+{
+    const GridBase::Ptr nonConstGrid = boost::const_pointer_cast<GridBase>(grid);
+    return boost::const_pointer_cast<const GridType>(GridBase::grid<GridType>(nonConstGrid));
+}
+
+// Non-const tree access: removes constness from the pointer returned by constBaseTreePtr().
+inline TreeBase::Ptr
+GridBase::baseTreePtr()
+{
+    return boost::const_pointer_cast<TreeBase>(this->constBaseTreePtr());
+}
+
+// Replace this grid's transform; a null pointer is rejected with a ValueError.
+inline void
+GridBase::setTransform(math::Transform::Ptr xform)
+{
+    if (xform) { mTransform = xform; return; }
+    OPENVDB_THROW(ValueError, "Transform pointer is null");
+}
+
+
+////////////////////////////////////////
+
+// Default constructor: the grid starts out with a new, default-constructed tree.
+template<typename TreeT>
+inline Grid<TreeT>::Grid():
+    mTree(new TreeType)
+{}
+
+// Construct a grid whose new tree is created with the given background value.
+template<typename TreeT>
+inline Grid<TreeT>::Grid(const ValueType &background):
+    mTree(new TreeType(background))
+{}
+
+// Use the given tree pointer as this grid's tree; a null pointer raises a ValueError.
+template<typename TreeT>
+inline Grid<TreeT>::Grid(TreePtrType tree): mTree(tree)
+{
+    if (!mTree) OPENVDB_THROW(ValueError, "Tree pointer is null");
+}
+
+// Copy constructor: the new grid's tree is the result of other.mTree->copy().
+template<typename TreeT>
+inline Grid<TreeT>::Grid(const Grid& other):
+    GridBase(other),
+    mTree(boost::static_pointer_cast<TreeType>(other.mTree->copy()))
+{
+}
+
+// Converting constructor: builds a new TreeType from the other grid's constTree().
+template<typename TreeT>
+template<typename OtherTreeType>
+inline Grid<TreeT>::Grid(const Grid<OtherTreeType>& other):
+    GridBase(other),
+    mTree(new TreeType(other.constTree()))
+{
+}
+
+// Shallow-copy constructor: the new grid holds the same tree pointer as the other grid.
+template<typename TreeT>
+inline Grid<TreeT>::Grid(const Grid& other, ShallowCopy):
+    GridBase(other, ShallowCopy()),
+    mTree(other.mTree)
+{
+}
+
+// Construct from a generic grid: copy-constructs the GridBase part, with a new default tree.
+template<typename TreeT>
+inline Grid<TreeT>::Grid(const GridBase& other):
+    GridBase(other),
+    mTree(new TreeType)
+{
+}
+
+
+// static -- create a grid whose background value is zeroVal<ValueType>().
+template<typename TreeT>
+inline typename Grid<TreeT>::Ptr
+Grid<TreeT>::create()
+{
+    return Ptr(new Grid(zeroVal<ValueType>()));
+}
+
+
+template<typename TreeT>
+inline typename Grid<TreeT>::Ptr
+Grid<TreeT>::create(const ValueType& background) // static
+{
+    Ptr newGrid(new Grid(background));
+    return newGrid;
+}
+
+
+template<typename TreeT>
+inline typename Grid<TreeT>::Ptr
+Grid<TreeT>::create(TreePtrType tree) // static
+{
+    Ptr newGrid(new Grid(tree));
+    return newGrid;
+}
+
+
+template<typename TreeT>
+inline typename Grid<TreeT>::Ptr
+Grid<TreeT>::create(const GridBase& other) // static
+{
+    Ptr newGrid(new Grid(other));
+    return newGrid;
+}
+
+
+////////////////////////////////////////
+
+
+template<typename TreeT>
+inline typename Grid<TreeT>::Ptr
+Grid<TreeT>::copy(CopyPolicy treePolicy) const
+{
+    // Remains null if treePolicy is none of the three policies handled below.
+    Ptr result;
+
+    if (treePolicy == CP_COPY) {
+        // Copy-construct the grid, which also copies the tree.
+        result.reset(new Grid(*this));
+    } else if (treePolicy == CP_NEW || treePolicy == CP_SHARE) {
+        // Both policies start from a shallow copy that shares this grid's tree...
+        result.reset(new Grid(*this, ShallowCopy()));
+        // ...and CP_NEW then replaces the shared tree with a new one.
+        if (treePolicy == CP_NEW) result->newTree();
+    }
+
+    return result;
+}
+
+// GridBase-typed variant of copy(): forwards the policy and converts the resulting pointer.
+template<typename TreeT>
+inline GridBase::Ptr
+Grid<TreeT>::copyGrid(CopyPolicy treePolicy) const
+{
+    return GridBase::Ptr(this->copy(treePolicy));
+}
+
+
+////////////////////////////////////////
+
+// Install the given tree after checking that it is non-null and of type TreeType.
+template<typename TreeT>
+inline void
+Grid<TreeT>::setTree(TreeBase::Ptr tree)
+{
+    if (!tree) OPENVDB_THROW(ValueError, "Tree pointer is null");
+    if (!(tree->type() == TreeType::treeType())) {
+        OPENVDB_THROW(TypeError, "Cannot assign a tree of type "
+            + tree->type() + " to a grid of type " + this->type());
+    }
+    mTree = boost::static_pointer_cast<TreeType>(tree);
+}
+
+// Swap in a new tree constructed with this grid's current background value.
+template<typename TreeT>
+inline void
+Grid<TreeT>::newTree()
+{
+    mTree = TreePtrType(new TreeType(this->background()));
+}
+
+
+////////////////////////////////////////
+
+// Forward the fill request, with all four arguments, to this grid's tree.
+template<typename TreeT>
+inline void
+Grid<TreeT>::fill(const CoordBBox& bbox, const ValueType& value, bool active, bool sparse)
+{
+    this->tree().fill(bbox, value, active, sparse);
+}
+
+template<typename TreeT>
+inline void
+Grid<TreeT>::pruneGrid(float tolerance)
+{   // The float tolerance is added to zeroVal<ValueType>() and converted to ValueType.
+    this->tree().prune(ValueType(zeroVal<ValueType>() + tolerance));
+}
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+template<typename TreeT>
+inline void
+Grid<TreeT>::clip(const CoordBBox& bbox)
+{
+    this->tree().clip(bbox); // clipping is delegated to the tree
+}
+#endif
+
+// Merge the other grid's tree into this grid's tree under the given policy.
+template<typename TreeT>
+inline void
+Grid<TreeT>::merge(Grid& other, MergePolicy policy)
+{
+    this->tree().merge(other.tree(), policy);
+}
+
+// Delegate the topology union to the two grids' trees.
+template<typename TreeT>
+template<typename OtherTreeType>
+inline void
+Grid<TreeT>::topologyUnion(const Grid<OtherTreeType>& other)
+{
+    this->tree().topologyUnion(other.constTree());
+}
+
+// Delegate the topology intersection to the two grids' trees.
+template<typename TreeT>
+template<typename OtherTreeType>
+inline void
+Grid<TreeT>::topologyIntersection(const Grid<OtherTreeType>& other)
+{
+    this->tree().topologyIntersection(other.constTree());
+}
+
+// Delegate the topology difference to the two grids' trees.
+template<typename TreeT>
+template<typename OtherTreeType>
+inline void
+Grid<TreeT>::topologyDifference(const Grid<OtherTreeType>& other)
+{
+    this->tree().topologyDifference(other.constTree());
+}
+
+
+////////////////////////////////////////
+
+// The tree computes the extrema and writes them into the two output arguments.
+template<typename TreeT>
+inline void
+Grid<TreeT>::evalMinMax(ValueType& minVal, ValueType& maxVal) const
+{
+    this->tree().evalMinMax(minVal, maxVal);
+}
+
+// Let the tree fill in a default-constructed box and return it by value.
+template<typename TreeT>
+inline CoordBBox
+Grid<TreeT>::evalActiveVoxelBoundingBox() const
+{
+    CoordBBox activeBBox;
+    this->tree().evalActiveVoxelBoundingBox(activeBBox);
+    return activeBBox;
+}
+
+// Return the tree's result, or a default Coord when the tree's call returns false.
+template<typename TreeT>
+inline Coord
+Grid<TreeT>::evalActiveVoxelDim() const
+{
+    Coord activeDim;
+    if (this->tree().evalActiveVoxelDim(activeDim)) return activeDim;
+    return Coord();
+}
+
+
+////////////////////////////////////////
+
+
+/// @internal Consider using the stream tagging mechanism (see io::Archive)
+/// to specify the float precision, but note that the setting is per-grid.
+
+template<typename TreeT>
+inline void
+Grid<TreeT>::readTopology(std::istream& is)
+{   // This grid's saveFloatAsHalf() setting is forwarded along with the stream.
+    this->tree().readTopology(is, this->saveFloatAsHalf());
+}
+
+// This grid's saveFloatAsHalf() setting is forwarded to the tree along with the stream.
+template<typename TreeT>
+inline void
+Grid<TreeT>::writeTopology(std::ostream& os) const
+{
+    this->tree().writeTopology(os, this->saveFloatAsHalf());
+}
+
+// This grid's saveFloatAsHalf() setting is forwarded to the tree along with the stream.
+template<typename TreeT>
+inline void
+Grid<TreeT>::readBuffers(std::istream& is)
+{
+    this->tree().readBuffers(is, this->saveFloatAsHalf());
+}
+
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+
+template<typename TreeT>
+inline void
+Grid<TreeT>::readBuffers(std::istream& is, const CoordBBox& bbox)
+{   // Same as the unclipped overload, with the bounding box passed through to the tree.
+    this->tree().readBuffers(is, bbox, this->saveFloatAsHalf());
+}
+
+// Delegates to the tree; this calls the tree's method through a const reference.
+template<typename TreeT>
+inline void
+Grid<TreeT>::readNonresidentBuffers() const
+{
+    this->tree().readNonresidentBuffers();
+}
+
+#endif // !OPENVDB_2_ABI_COMPATIBLE
+
+// This grid's saveFloatAsHalf() setting is forwarded to the tree along with the stream.
+template<typename TreeT>
+inline void
+Grid<TreeT>::writeBuffers(std::ostream& os) const
+{
+    this->tree().writeBuffers(os, this->saveFloatAsHalf());
+}
+
+
+template<typename TreeT>
+inline void
+Grid<TreeT>::print(std::ostream& os, int verboseLevel) const
+{
+    this->tree().print(os, verboseLevel);
+
+    // One line per metadata entry; the ": value" part is skipped for null or empty values.
+    if (metaCount() > 0) {
+        os << "Additional metadata:" << std::endl;
+        const ConstMetaIterator metaEnd = endMeta();
+        for (ConstMetaIterator meta = beginMeta(); meta != metaEnd; ++meta) {
+            os << "  " << meta->first;
+            const std::string text = meta->second ? meta->second->str() : std::string();
+            if (!text.empty()) os << ": " << text;
+            os << "\n";
+        }
+    }
+
+    os << "Transform:" << std::endl;
+    transform().print(os, /*indent=*/"  ");
+    os << std::endl;
+}
+
+
+////////////////////////////////////////
+
+// Free-function shorthand for GridType::create(background).
+template<typename GridType>
+inline typename GridType::Ptr
+createGrid(const typename GridType::ValueType& background)
+{
+    return GridType::create(background);
+}
+
+// Free-function shorthand for GridType::create().
+template<typename GridType>
+inline typename GridType::Ptr
+createGrid()
+{
+    return GridType::create();
+}
+
+
+template<typename TreePtrType>
+inline typename Grid<typename TreePtrType::element_type>::Ptr
+createGrid(TreePtrType tree)
+{
+    // The grid type is deduced from the pointer's element_type (the tree type).
+    return Grid<typename TreePtrType::element_type>::create(tree);
+}
+
+
+template<typename GridType>
+typename GridType::Ptr
+createLevelSet(Real voxelSize, Real halfWidth)
+{
+    typedef typename GridType::ValueType ValueT;
+
+    // The grid's value type must be a floating-point scalar; enforced at compile time.
+    BOOST_STATIC_ASSERT(boost::is_floating_point<ValueT>::value);
+
+    const ValueT background = static_cast<ValueT>(voxelSize * halfWidth);
+    typename GridType::Ptr levelSet = GridType::create(background);
+    levelSet->setTransform(math::Transform::createLinearTransform(voxelSize));
+    levelSet->setGridClass(GRID_LEVEL_SET);
+    return levelSet;
+}
+
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_GRID_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/Metadata.h b/nuparu/include/openvdb_new/Metadata.h
new file mode 100644
index 00000000..82a81ee3
--- /dev/null
+++ b/nuparu/include/openvdb_new/Metadata.h
@@ -0,0 +1,41 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_METADATA_HAS_BEEN_INCLUDED
+#define OPENVDB_METADATA_HAS_BEEN_INCLUDED
+
+#include <openvdb/metadata/Metadata.h>
+#include <openvdb/metadata/StringMetadata.h>
+
+#endif // OPENVDB_METADATA_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/Platform.h b/nuparu/include/openvdb_new/Platform.h
new file mode 100644
index 00000000..d9fac2ff
--- /dev/null
+++ b/nuparu/include/openvdb_new/Platform.h
@@ -0,0 +1,210 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+///
+/// @file Platform.h
+
+#ifndef OPENVDB_PLATFORM_HAS_BEEN_INCLUDED
+#define OPENVDB_PLATFORM_HAS_BEEN_INCLUDED
+
+#include "PlatformConfig.h"
+/// Stringize @a directive and hand the resulting string to the @c _Pragma operator.
+#define PRAGMA(directive) _Pragma(#directive)
+
+/// OPENVDB_DEPRECATED marks a function as deprecated.
+/// Put it immediately in front of the function's signature,
+/// e.g., "OPENVDB_DEPRECATED void functionName();".
+/// Any earlier definition of the macro is discarded first; @c #undef of a
+/// name that is not currently a macro is ignored, so no guard is needed.
+#undef OPENVDB_DEPRECATED
+#if defined(_MSC_VER)
+    #define OPENVDB_DEPRECATED  __declspec(deprecated)
+#else
+    #define OPENVDB_DEPRECATED  __attribute__ ((deprecated))
+#endif
+
+/// Nonzero if the GCC version is >= MAJOR.MINOR, else 0 (arguments are parenthesized).
+#if defined(__GNUC__)
+    #define OPENVDB_CHECK_GCC(MAJOR, MINOR) \
+        (__GNUC__ > (MAJOR) || (__GNUC__ == (MAJOR) && __GNUC_MINOR__ >= (MINOR)))
+#else
+    #define OPENVDB_CHECK_GCC(MAJOR, MINOR) 0
+#endif
+
+/// Defines OPENVDB_HAS_CXX11 (as 1) if there are sufficient C++0x/C++11 features.
+#ifdef __INTEL_COMPILER
+    #ifdef __INTEL_CXX11_MODE__
+        #define OPENVDB_HAS_CXX11 1
+    #endif
+#elif defined(__clang__) // clang: the macro is set only when _LIBCPP_VERSION is defined
+    #ifndef _LIBCPP_VERSION
+        #include <ciso646> // presumably included so that libc++ can define _LIBCPP_VERSION
+    #endif
+    #ifdef _LIBCPP_VERSION
+        #define OPENVDB_HAS_CXX11 1
+    #endif
+#elif defined(__GXX_EXPERIMENTAL_CXX0X__) || (__cplusplus > 199711L)
+    #define OPENVDB_HAS_CXX11 1
+#elif defined(_MSC_VER)
+    #if (_MSC_VER >= 1700)
+        #define OPENVDB_HAS_CXX11 1
+    #endif
+#endif
+#if defined(__GNUC__) && !OPENVDB_CHECK_GCC(4, 4)
+    // ICC uses GCC's standard library headers, so a C++11-capable ICC may sit on a too-old GCC.
+    // NOTE(review): clang also defines __GNUC__ (as 4.2 by default?) -- confirm this is intended.
+    #undef OPENVDB_HAS_CXX11
+#endif
+
+/// Some compilers need explicit specializations of function templates to carry a
+/// storage qualifier (static inline) to avoid multiply-defined-symbol link errors.
+/// The macro expands to @c static unless the compiler identifies as GCC >= 4.4.
+#if !(defined(__GNUC__) && OPENVDB_CHECK_GCC(4, 4))
+    #define OPENVDB_STATIC_SPECIALIZATION static
+#else
+    #define OPENVDB_STATIC_SPECIALIZATION
+#endif
+
+
+/// To silence compiler diagnostics about unreachable code, wrap the affected
+/// statements in OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN/_END, for example:
+/// @code
+/// template<typename NodeType>
+/// void processNode(NodeType& node)
+/// {
+///     OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+///     if (NodeType::LEVEL == 0) return; // ignore leaf nodes
+///     int i = 0;
+///     ...
+///     OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+/// }
+/// @endcode
+/// Here <tt>NodeType::LEVEL == 0</tt> is a compile-time constant expression, so in
+/// some template instantiations everything after the @c return is unreachable.
+#if defined(__INTEL_COMPILER)
+    // ICC: suppress remarks 111 ("statement is unreachable"), 128 ("loop is not
+    // reachable"), 185 ("dynamic initialization in unreachable code") and
+    // 280 ("selector expression is constant") until the matching _END.
+    #define OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN \
+        PRAGMA(warning (push)) \
+        PRAGMA(warning (disable:111)) \
+        PRAGMA(warning (disable:128)) \
+        PRAGMA(warning (disable:185)) \
+        PRAGMA(warning (disable:280))
+    #define OPENVDB_NO_UNREACHABLE_CODE_WARNING_END \
+        PRAGMA(warning (pop))
+#elif defined(__clang__)
+    #define OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN \
+        PRAGMA(clang diagnostic push) \
+        PRAGMA(clang diagnostic ignored "-Wunreachable-code")
+    #define OPENVDB_NO_UNREACHABLE_CODE_WARNING_END \
+        PRAGMA(clang diagnostic pop)
+#else
+    #define OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+    #define OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+#endif
+
+
+/// Visual C++ only provides constants such as M_PI when _USE_MATH_DEFINES is defined.
+/// @note The core library is built with it, but hcustom 12.1 doesn't define it,
+/// so the definition here is still needed for HDK operators.
+#if !defined(_USE_MATH_DEFINES)
+    #define _USE_MATH_DEFINES
+#endif
+
+/// Visual C++ releases older than Visual Studio 2013 (_MSC_VER < 1800) lack
+/// round() and spell copysign() as _copysign(), so both are supplied here.
+/// Newer releases declare round() and copysign() in their own math headers;
+/// injecting global replacements there would collide with those declarations,
+/// hence the version check.
+#if defined(_MSC_VER) && (_MSC_VER < 1800)
+    #include <boost/math/special_functions/round.hpp>
+    #include <float.h>
+    using boost::math::round;
+    static inline double copysign(double x, double y) { return _copysign(x, y); }
+#endif
+
+/// Fixed-width integer types such as uint64_t are taken from boost/cstdint.hpp
+/// for portability, because Visual C++ does not have stdint.h.
+#include <boost/cstdint.hpp>
+using boost::int8_t;
+using boost::uint8_t;
+using boost::int16_t;
+using boost::uint16_t;
+using boost::int32_t;
+using boost::uint32_t;
+using boost::int64_t;
+using boost::uint64_t;
+
+/// Helper macros for defining library symbol visibility.
+/// _WIN32 is tested first so that GCC-compatible compilers targeting Windows
+/// get exactly one definition; unknown toolchains get empty definitions.
+#undef OPENVDB_EXPORT
+#undef OPENVDB_IMPORT
+#if defined(_WIN32)
+    #ifdef OPENVDB_DLL
+        #define OPENVDB_EXPORT __declspec(dllexport)
+        #define OPENVDB_IMPORT __declspec(dllimport)
+    #else
+        #define OPENVDB_EXPORT
+        #define OPENVDB_IMPORT
+    #endif
+#elif defined(__GNUC__)
+    #define OPENVDB_EXPORT __attribute__((visibility("default")))
+    #define OPENVDB_IMPORT __attribute__((visibility("default")))
+#else
+    #define OPENVDB_EXPORT
+    #define OPENVDB_IMPORT
+#endif
+
+/// Every class and public free-standing function has to be tagged explicitly
+/// with \<lib\>_API in order to be exported.  \<lib\>_PRIVATE is defined while
+/// that particular library itself is being built, which selects the export
+/// flavor of the macro; otherwise the import flavor is selected.
+/// Stale definitions are dropped first (@c #undef of an undefined name is ignored).
+#undef OPENVDB_API
+#if defined(OPENVDB_PRIVATE)
+    #define OPENVDB_API OPENVDB_EXPORT
+#else
+    #define OPENVDB_API OPENVDB_IMPORT
+#endif
+
+/// The same scheme, keyed on OPENVDB_HOUDINI_PRIVATE, for OPENVDB_HOUDINI_API.
+#undef OPENVDB_HOUDINI_API
+#if defined(OPENVDB_HOUDINI_PRIVATE)
+    #define OPENVDB_HOUDINI_API OPENVDB_EXPORT
+#else
+    #define OPENVDB_HOUDINI_API OPENVDB_IMPORT
+#endif
+
+#endif // OPENVDB_PLATFORM_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/PlatformConfig.h b/nuparu/include/openvdb_new/PlatformConfig.h
new file mode 100644
index 00000000..23276edb
--- /dev/null
+++ b/nuparu/include/openvdb_new/PlatformConfig.h
@@ -0,0 +1,57 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+///
+/// @file PlatformConfig.h
+
+#ifndef OPENVDB_PLATFORMCONFIG_HAS_BEEN_INCLUDED
+#define OPENVDB_PLATFORMCONFIG_HAS_BEEN_INCLUDED
+
+// Configuration that applies to Windows builds only
+#if defined(_WIN32)
+
+    // OpenVDB is assumed to be built as a DLL whenever the CRT is linked
+    // dynamically (_DLL), unless OPENVDB_STATICLIB is defined.
+    #if defined(_DLL) && !(defined(OPENVDB_STATICLIB) || defined(OPENVDB_DLL))
+        #define OPENVDB_DLL
+    #endif
+
+    // OpenEXR is assumed to be linked dynamically, unless
+    // OPENVDB_OPENEXR_STATICLIB is defined.
+    #if !(defined(OPENVDB_OPENEXR_STATICLIB) || defined(OPENEXR_DLL))
+        #define OPENEXR_DLL
+    #endif
+
+#endif // defined(_WIN32)
+
+#endif // OPENVDB_PLATFORMCONFIG_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/Types.h b/nuparu/include/openvdb_new/Types.h
new file mode 100644
index 00000000..0a2ec16d
--- /dev/null
+++ b/nuparu/include/openvdb_new/Types.h
@@ -0,0 +1,504 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_TYPES_HAS_BEEN_INCLUDED
+#define OPENVDB_TYPES_HAS_BEEN_INCLUDED
+
+#include "version.h"
+#include "Platform.h"
+#include <OpenEXR/half.h>
+#include <openvdb/math/Math.h>
+#include <openvdb/math/BBox.h>
+#include <openvdb/math/Quat.h>
+#include <openvdb/math/Vec2.h>
+#include <openvdb/math/Vec3.h>
+#include <openvdb/math/Vec4.h>
+#include <openvdb/math/Mat3.h>
+#include <openvdb/math/Mat4.h>
+#include <openvdb/math/Coord.h>
+#include <boost/type_traits/is_convertible.hpp>
+#include <boost/type_traits/is_integral.hpp>
+#include <boost/static_assert.hpp>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+
+// Scalar types (Index and Int are aliases of the 32-bit variants; Real is double)
+typedef uint32_t            Index32;
+typedef uint64_t            Index64;
+typedef Index32             Index;
+typedef int16_t             Int16;
+typedef int32_t             Int32;
+typedef int64_t             Int64;
+typedef Int32               Int;
+typedef unsigned char       Byte;
+typedef double              Real;
+
+// Vec2 types (suffix R = Real, I = Index32, f = float, H = half; i/s/d come from math)
+typedef math::Vec2<Real>    Vec2R;
+typedef math::Vec2<Index32> Vec2I;
+typedef math::Vec2<float>   Vec2f;
+typedef math::Vec2<half>    Vec2H;
+using math::Vec2i;
+using math::Vec2s;
+using math::Vec2d;
+
+// Vec3 types (same suffix scheme as the Vec2 types)
+typedef math::Vec3<Real>    Vec3R;
+typedef math::Vec3<Index32> Vec3I;
+typedef math::Vec3<float>   Vec3f;
+typedef math::Vec3<half>    Vec3H;
+using math::Vec3i;
+using math::Vec3s;
+using math::Vec3d;
+// Coordinate and bounding-box types re-exported from the math namespace
+using math::Coord;
+using math::CoordBBox;
+typedef math::BBox<Vec3d>   BBoxd;
+
+// Vec4 types (same suffix scheme as the Vec2 types)
+typedef math::Vec4<Real>    Vec4R;
+typedef math::Vec4<Index32> Vec4I;
+typedef math::Vec4<float>   Vec4f;
+typedef math::Vec4<half>    Vec4H;
+using math::Vec4i;
+using math::Vec4s;
+using math::Vec4d;
+
+// Mat3 matrix type over Real (presumably 3x3 -- see openvdb/math/Mat3.h)
+typedef math::Mat3<Real>    Mat3R;
+
+// Mat4 matrix types over Real, double and float (presumably 4x4 -- see openvdb/math/Mat4.h)
+typedef math::Mat4<Real>    Mat4R;
+typedef math::Mat4<double>  Mat4d;
+typedef math::Mat4<float>   Mat4s;
+
+// Quaternion over Real
+typedef math::Quat<Real>    QuatR;
+
+// Empty tag type standing for a voxel with a binary mask value, e.g. the active state
+class ValueMask {};
+
+
+////////////////////////////////////////
+
+
+/// @brief  Thin integer wrapper whose purpose is to make PointIndexGrid
+///         and PointDataGrid distinct from Int32Grid and Int64Grid
+/// @note   @c Kind carries no data; different values merely yield different types.
+template<typename IntT, Index Kind>
+struct PointIndex
+{
+    BOOST_STATIC_ASSERT(boost::is_integral<IntT>::value); // integral types only
+
+    typedef IntT IntType;
+
+    /// Wrap @a value (zero by default); the conversion is implicit.
+    PointIndex(IntType value = IntType(0)): mIndex(value) {}
+    /// Unwrap to the underlying integer; the conversion is implicit.
+    operator IntType() const { return mIndex; }
+
+    /// Needed to support the <tt>(zeroVal<PointIndex>() + val)</tt> idiom.
+    template<typename T>
+    PointIndex operator+(T rhs) { const PointIndex sum(mIndex + IntType(rhs)); return sum; }
+
+private:
+    IntType mIndex; // the wrapped integer
+};
+
+// Kind 0 yields the PointIndex types, Kind 1 the PointDataIndex types.
+typedef PointIndex<Index32, 0> PointIndex32;
+typedef PointIndex<Index64, 0> PointIndex64;
+typedef PointIndex<Index32, 1> PointDataIndex32;
+typedef PointIndex<Index64, 1> PointDataIndex64;
+
+
+////////////////////////////////////////
+
+/// Compile-time traits: Size 1 / IsVec false by default, 2/3/4 and true for math::Vec2/3/4.
+template<typename T> struct VecTraits {
+    typedef T ElementType;
+    static const int Size = 1;
+    static const bool IsVec = false;
+};
+template<typename ElemT> struct VecTraits<math::Vec2<ElemT> > {
+    typedef ElemT ElementType;
+    static const int Size = 2;
+    static const bool IsVec = true;
+};
+template<typename ElemT> struct VecTraits<math::Vec3<ElemT> > {
+    typedef ElemT ElementType;
+    static const int Size = 3;
+    static const bool IsVec = true;
+};
+template<typename ElemT> struct VecTraits<math::Vec4<ElemT> > {
+    typedef ElemT ElementType;
+    static const int Size = 4;
+    static const bool IsVec = true;
+};
+
+
+////////////////////////////////////////
+
+
+/// @brief CanConvertType<FromType, ToType>::value is @c true if a value
+/// of type @a ToType can be constructed from a value of type @a FromType.
+///
+/// @note @c boost::is_convertible tests for implicit convertibility only.
+/// What we want is the equivalent of C++11's @c std::is_constructible,
+/// which allows for explicit conversions as well.  Not all compilers support
+/// @c std::is_constructible yet, so for now, types that can only be converted
+/// explicitly have to be indicated with specializations of this template.
+template<typename FromType, typename ToType>
+struct CanConvertType { enum { value = boost::is_convertible<FromType, ToType>::value }; };
+
+// Specializations for vector types, which can be constructed (but only explicitly)
+// from values of their own ValueTypes or values convertible to their ValueTypes
+template<typename T> struct CanConvertType<T, math::Vec2<T> > { enum { value = true }; };
+template<typename T> struct CanConvertType<T, math::Vec3<T> > { enum { value = true }; };
+template<typename T> struct CanConvertType<T, math::Vec4<T> > { enum { value = true }; };
+template<typename T> struct CanConvertType<math::Vec2<T>, math::Vec2<T> > { enum {value = true}; };
+template<typename T> struct CanConvertType<math::Vec3<T>, math::Vec3<T> > { enum {value = true}; };
+template<typename T> struct CanConvertType<math::Vec4<T>, math::Vec4<T> > { enum {value = true}; };
+template<typename T0, typename T1>
+struct CanConvertType<T0, math::Vec2<T1> > { enum { value = CanConvertType<T0, T1>::value }; };
+template<typename T0, typename T1>
+struct CanConvertType<T0, math::Vec3<T1> > { enum { value = CanConvertType<T0, T1>::value }; };
+template<typename T0, typename T1>
+struct CanConvertType<T0, math::Vec4<T1> > { enum { value = CanConvertType<T0, T1>::value }; };
+template<> struct CanConvertType<PointIndex32, PointDataIndex32> { enum {value = true}; };
+template<> struct CanConvertType<PointDataIndex32, PointIndex32> { enum {value = true}; };
+template<typename T>
+struct CanConvertType<T, ValueMask> { enum {value = CanConvertType<T, bool>::value}; };
+template<typename T>
+struct CanConvertType<ValueMask, T> { enum {value = CanConvertType<bool, T>::value}; };
+// Both partial specializations above match <ValueMask, ValueMask>; this removes the ambiguity.
+template<> struct CanConvertType<ValueMask, ValueMask> { enum {value = true}; };
+
+////////////////////////////////////////
+
+
+// New enumerators must go at the *end* of this list; update NUM_GRID_CLASSES to match.
+enum GridClass {
+    GRID_UNKNOWN    = 0,
+    GRID_LEVEL_SET  = 1,
+    GRID_FOG_VOLUME = 2,
+    GRID_STAGGERED  = 3
+};
+enum { NUM_GRID_CLASSES = GRID_STAGGERED + 1 }; // one past the last GridClass value
+
+static const Real LEVEL_SET_HALF_WIDTH = 3; // NOTE(review): unit not stated here (voxels?)
+
+/// A vector's type governs how transforms are applied to it:
+/// <dl>
+/// <dt><b>Invariant</b>
+/// <dd>Is left untransformed (e.g., tuple, uvw, color)
+///
+/// <dt><b>Covariant</b>
+/// <dd>Uses the inverse-transpose transformation with @e w = 0, so translation
+///     is ignored (e.g., gradient/normal)
+///
+/// <dt><b>Covariant Normalize</b>
+/// <dd>Uses the inverse-transpose transformation with @e w = 0, so translation
+///     is ignored; vectors are renormalized (e.g., unit normal)
+///
+/// <dt><b>Contravariant Relative</b>
+/// <dd>Uses the "regular" transformation with @e w = 0, so translation is ignored
+///     (e.g., displacement, velocity, acceleration)
+///
+/// <dt><b>Contravariant Absolute</b>
+/// <dd>Uses the "regular" transformation with @e w = 1, so the vector translates (e.g., position)
+/// </dl>
+enum VecType {
+    VEC_INVARIANT              = 0,
+    VEC_COVARIANT              = 1,
+    VEC_COVARIANT_NORMALIZE    = 2,
+    VEC_CONTRAVARIANT_RELATIVE = 3,
+    VEC_CONTRAVARIANT_ABSOLUTE = 4
+};
+enum { NUM_VEC_TYPES = VEC_CONTRAVARIANT_ABSOLUTE + 1 };
+
+
+/// Selects how grids are merged during certain (typically multithreaded) operations.
+/// <dl>
+/// <dt><b>MERGE_ACTIVE_STATES</b>
+/// <dd>Wherever any input grid is active, the output grid is active.
+///
+/// <dt><b>MERGE_NODES</b>
+/// <dd>Wherever any input grid's tree has a node, the output grid's tree has
+///     a node too, regardless of any active states.
+///
+/// <dt><b>MERGE_ACTIVE_STATES_AND_NODES</b>
+/// <dd>Wherever any input grid is active, the output grid is active, and wherever
+///     any input grid's tree has a node, the output grid's tree has a node too.
+/// </dl>
+enum MergePolicy {
+    MERGE_ACTIVE_STATES           = 0,
+    MERGE_NODES                   = 1,
+    MERGE_ACTIVE_STATES_AND_NODES = 2
+};
+
+
+////////////////////////////////////////
+
+/// Return a name for @c T: typeid(T).name() by default, a fixed string for the types below.
+template<typename T> const char* typeNameAsString()                 { return typeid(T).name(); }
+template<> inline const char* typeNameAsString<bool>()              { return "bool"; }
+template<> inline const char* typeNameAsString<ValueMask>()         { return "mask"; }
+template<> inline const char* typeNameAsString<float>()             { return "float"; }
+template<> inline const char* typeNameAsString<double>()            { return "double"; }
+template<> inline const char* typeNameAsString<int32_t>()           { return "int32"; }
+template<> inline const char* typeNameAsString<uint32_t>()          { return "uint32"; }
+template<> inline const char* typeNameAsString<int64_t>()           { return "int64"; }
+template<> inline const char* typeNameAsString<Vec2i>()             { return "vec2i"; }
+template<> inline const char* typeNameAsString<Vec2s>()             { return "vec2s"; }
+template<> inline const char* typeNameAsString<Vec2d>()             { return "vec2d"; }
+template<> inline const char* typeNameAsString<Vec3i>()             { return "vec3i"; }
+template<> inline const char* typeNameAsString<Vec3f>()             { return "vec3s"; }
+template<> inline const char* typeNameAsString<Vec3d>()             { return "vec3d"; }
+template<> inline const char* typeNameAsString<std::string>()       { return "string"; }
+template<> inline const char* typeNameAsString<Mat4s>()             { return "mat4s"; }
+template<> inline const char* typeNameAsString<Mat4d>()             { return "mat4d"; }
+template<> inline const char* typeNameAsString<PointIndex32>()      { return "ptidx32"; }
+template<> inline const char* typeNameAsString<PointIndex64>()      { return "ptidx64"; }
+template<> inline const char* typeNameAsString<PointDataIndex32>()  { return "ptdataidx32"; }
+template<> inline const char* typeNameAsString<PointDataIndex64>()  { return "ptdataidx64"; }
+
+
+////////////////////////////////////////
+
+/// @brief This struct collects both input and output arguments to "grid combiner" functors
+/// used with the tree::TypedGrid::combineExtended() and combine2Extended() methods.
+/// AValueType and BValueType are the value types of the two grids being combined.
+///
+/// @see openvdb/tree/Tree.h for usage information.
+///
+/// Setter methods return references to this object, to facilitate the following usage:
+/// @code
+///     CombineArgs<float> args;
+///     myCombineOp(args.setARef(aVal).setBRef(bVal).setAIsActive(true).setBIsActive(false));
+/// @endcode
+template<typename AValueType, typename BValueType = AValueType>
+class CombineArgs
+{
+public:
+    typedef AValueType AValueT;
+    typedef BValueType BValueT;
+
+    CombineArgs()
+        : mAValPtr(NULL), mBValPtr(NULL), mResultVal(), mResultValPtr(&mResultVal)
+        , mAIsActive(false), mBIsActive(false), mResultIsActive(false)
+    {}
+
+    /// Use this constructor when the result value is stored externally.
+    CombineArgs(const AValueType& a, const BValueType& b, AValueType& result,
+                bool aOn = false, bool bOn = false)
+        : mAValPtr(&a), mBValPtr(&b), mResultVal(), mResultValPtr(&result)
+        , mAIsActive(aOn), mBIsActive(bOn)
+    { this->updateResultActive(); }
+
+    /// Use this constructor when the result value should be stored in this struct.
+    CombineArgs(const AValueType& a, const BValueType& b, bool aOn = false, bool bOn = false)
+        : mAValPtr(&a), mBValPtr(&b), mResultVal(), mResultValPtr(&mResultVal)
+        , mAIsActive(aOn), mBIsActive(bOn)
+    { this->updateResultActive(); }
+
+    /// Copy construction and assignment rebind an internally stored result to the new object.
+    CombineArgs(const CombineArgs& other)
+        : mAValPtr(other.mAValPtr), mBValPtr(other.mBValPtr), mResultVal(other.mResultVal)
+        , mResultValPtr(other.mResultValPtr == &other.mResultVal ? &mResultVal : other.mResultValPtr)
+        , mAIsActive(other.mAIsActive), mBIsActive(other.mBIsActive)
+        , mResultIsActive(other.mResultIsActive)
+    {}
+
+    CombineArgs& operator=(const CombineArgs& other)
+    {
+        mAValPtr = other.mAValPtr;
+        mBValPtr = other.mBValPtr;
+        mResultVal = other.mResultVal;
+        mResultValPtr = (other.mResultValPtr == &other.mResultVal) ? &mResultVal : other.mResultValPtr;
+        mAIsActive = other.mAIsActive;
+        mBIsActive = other.mBIsActive;
+        mResultIsActive = other.mResultIsActive;
+        return *this;
+    }
+
+    /// Get the A input value.
+    const AValueType& a() const { return *mAValPtr; }
+    /// Get the B input value.
+    const BValueType& b() const { return *mBValPtr; }
+    //@{
+    /// Get the output value.
+    const AValueType& result() const { return *mResultValPtr; }
+    AValueType& result() { return *mResultValPtr; }
+    //@}
+    /// Set the output value.
+    CombineArgs& setResult(const AValueType& val) { *mResultValPtr = val; return *this; }
+
+    /// Redirect the A value to a new external source.
+    CombineArgs& setARef(const AValueType& a) { mAValPtr = &a; return *this; }
+    /// Redirect the B value to a new external source.
+    CombineArgs& setBRef(const BValueType& b) { mBValPtr = &b; return *this; }
+    /// Redirect the result value to a new external destination.
+    CombineArgs& setResultRef(AValueType& val) { mResultValPtr = &val; return *this; }
+
+    /// @return true if the A value is active
+    bool aIsActive() const { return mAIsActive; }
+    /// @return true if the B value is active
+    bool bIsActive() const { return mBIsActive; }
+    /// @return true if the output value is active
+    bool resultIsActive() const { return mResultIsActive; }
+    /// Set the active state of the A value.
+    CombineArgs& setAIsActive(bool b) { mAIsActive = b; updateResultActive(); return *this; }
+    /// Set the active state of the B value.
+    CombineArgs& setBIsActive(bool b) { mBIsActive = b; updateResultActive(); return *this; }
+    /// Set the active state of the output value.
+    CombineArgs& setResultIsActive(bool b) { mResultIsActive = b; return *this; }
+
+protected:
+    /// By default, the result value is active if either of the input values is active,
+    /// but this behavior can be overridden by calling setResultIsActive().
+    void updateResultActive() { mResultIsActive = mAIsActive || mBIsActive; }
+
+    const AValueType* mAValPtr;   // pointer to input value from A grid
+    const BValueType* mBValPtr;   // pointer to input value from B grid
+    AValueType mResultVal;        // computed output value (unused if stored externally)
+    AValueType* mResultValPtr;    // pointer to either mResultVal or an external value
+    bool mAIsActive, mBIsActive;  // active states of A and B values
+    bool mResultIsActive;         // computed active state (default: A active || B active)
+};
+
+/// This struct adapts a "grid combiner" functor to swap the A and B grid values
+/// (e.g., so that if the original functor computes a + 2 * b, the adapted functor
+/// will compute b + 2 * a).  The result's active state is forwarded in both directions.
+template<typename ValueType, typename CombineOp>
+struct SwappedCombineOp
+{
+    SwappedCombineOp(CombineOp& _op): op(_op) {}
+    void operator()(CombineArgs<ValueType>& args)
+    {
+        CombineArgs<ValueType> swappedArgs(args.b(), args.a(), args.result(),
+            args.bIsActive(), args.aIsActive());
+        swappedArgs.setResultIsActive(args.resultIsActive()); // start from the caller's state
+        op(swappedArgs);
+        args.setResultIsActive(swappedArgs.resultIsActive()); // hand the functor's choice back
+    }
+    CombineOp& op; ///< the wrapped functor, held by reference
+};
+
+
+////////////////////////////////////////
+
+
+/// A copy constructor can treat a member that is stored as a shared pointer
+/// in one of several ways:
+/// <dl>
+/// <dt><b>CP_NEW</b>
+/// <dd>Leave the member uncopied and default construct a fresh member object.
+///
+/// <dt><b>CP_SHARE</b>
+/// <dd>Copy just the shared pointer, so that the same member is shared by
+///     the original and the new object.
+///
+/// <dt><b>CP_COPY</b>
+/// <dd>Make a deep copy of the member.
+/// </dl>
+enum CopyPolicy { CP_NEW = 0, CP_SHARE = 1, CP_COPY = 2 };
+
+
+// Empty tag class: selects a shallow copy constructor rather than
+// a deep copy constructor
+class ShallowCopy {};
+// Empty tag class: selects a topology copy constructor rather than
+// a deep copy constructor
+class TopologyCopy {};
+// Empty tag class: selects the constructor used during file input
+class PartialCreate {};
+
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+
+#if defined(__ICC)
+// ICC only: each START macro disables one warning (1710/1711/1712); FINISH restores its default.
+// Use these defines to bracket a region of code that has safe static accesses.
+// Keep the region as small as possible.
+#define OPENVDB_START_THREADSAFE_STATIC_REFERENCE   __pragma(warning(disable:1710))
+#define OPENVDB_FINISH_THREADSAFE_STATIC_REFERENCE  __pragma(warning(default:1710))
+#define OPENVDB_START_THREADSAFE_STATIC_WRITE       __pragma(warning(disable:1711))
+#define OPENVDB_FINISH_THREADSAFE_STATIC_WRITE      __pragma(warning(default:1711))
+#define OPENVDB_START_THREADSAFE_STATIC_ADDRESS     __pragma(warning(disable:1712))
+#define OPENVDB_FINISH_THREADSAFE_STATIC_ADDRESS    __pragma(warning(default:1712))
+
+// Use these defines to bracket a region of code that has unsafe static accesses; keep it small.
+// NOTE(review): the expansions are identical to the THREADSAFE variants above -- intended?
+#define OPENVDB_START_NON_THREADSAFE_STATIC_REFERENCE   __pragma(warning(disable:1710))
+#define OPENVDB_FINISH_NON_THREADSAFE_STATIC_REFERENCE  __pragma(warning(default:1710))
+#define OPENVDB_START_NON_THREADSAFE_STATIC_WRITE       __pragma(warning(disable:1711))
+#define OPENVDB_FINISH_NON_THREADSAFE_STATIC_WRITE      __pragma(warning(default:1711))
+#define OPENVDB_START_NON_THREADSAFE_STATIC_ADDRESS     __pragma(warning(disable:1712))
+#define OPENVDB_FINISH_NON_THREADSAFE_STATIC_ADDRESS    __pragma(warning(default:1712))
+
+// Simpler version for one-line cases: CODE is wrapped between the disable and restore pragmas
+#define OPENVDB_THREADSAFE_STATIC_REFERENCE(CODE) \
+    __pragma(warning(disable:1710)); CODE; __pragma(warning(default:1710))
+#define OPENVDB_THREADSAFE_STATIC_WRITE(CODE) \
+    __pragma(warning(disable:1711)); CODE; __pragma(warning(default:1711))
+#define OPENVDB_THREADSAFE_STATIC_ADDRESS(CODE) \
+    __pragma(warning(disable:1712)); CODE; __pragma(warning(default:1712))
+
+#else // any compiler other than ICC: the bracket macros are empty, the one-liners yield CODE
+
+#define OPENVDB_START_THREADSAFE_STATIC_REFERENCE
+#define OPENVDB_FINISH_THREADSAFE_STATIC_REFERENCE
+#define OPENVDB_START_THREADSAFE_STATIC_WRITE
+#define OPENVDB_FINISH_THREADSAFE_STATIC_WRITE
+#define OPENVDB_START_THREADSAFE_STATIC_ADDRESS
+#define OPENVDB_FINISH_THREADSAFE_STATIC_ADDRESS
+
+#define OPENVDB_START_NON_THREADSAFE_STATIC_REFERENCE
+#define OPENVDB_FINISH_NON_THREADSAFE_STATIC_REFERENCE
+#define OPENVDB_START_NON_THREADSAFE_STATIC_WRITE
+#define OPENVDB_FINISH_NON_THREADSAFE_STATIC_WRITE
+#define OPENVDB_START_NON_THREADSAFE_STATIC_ADDRESS
+#define OPENVDB_FINISH_NON_THREADSAFE_STATIC_ADDRESS
+
+#define OPENVDB_THREADSAFE_STATIC_REFERENCE(CODE) CODE
+#define OPENVDB_THREADSAFE_STATIC_WRITE(CODE) CODE
+#define OPENVDB_THREADSAFE_STATIC_ADDRESS(CODE) CODE
+
+#endif // defined(__ICC)
+
+#endif // OPENVDB_TYPES_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/io/Archive.h b/nuparu/include/openvdb_new/io/Archive.h
new file mode 100644
index 00000000..a195fc8b
--- /dev/null
+++ b/nuparu/include/openvdb_new/io/Archive.h
@@ -0,0 +1,228 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_IO_ARCHIVE_HAS_BEEN_INCLUDED
+#define OPENVDB_IO_ARCHIVE_HAS_BEEN_INCLUDED
+
+#include <openvdb/Platform.h>
+#include <iosfwd>
+#include <map>
+#include <string>
+#include <boost/uuid/uuid.hpp>
+#include <boost/cstdint.hpp>
+#include <boost/scoped_ptr.hpp>
+#include <boost/shared_ptr.hpp>
+#include <openvdb/Grid.h>
+#include <openvdb/metadata/MetaMap.h>
+#include <openvdb/version.h> // for VersionId
+#include "Compression.h" // for COMPRESS_ZIP, etc.
+
+
+class TestFile;
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace io {
+
+class GridDescriptor;
+
+
+/// @brief Base class for grid serializers/unserializers; it holds the stream's version, UUID and compression state.
+class OPENVDB_API Archive
+{
+public:
+    typedef boost::shared_ptr<Archive> Ptr;
+    typedef boost::shared_ptr<const Archive> ConstPtr;
+
+    static const uint32_t DEFAULT_COMPRESSION_FLAGS; // value is defined out of line (not in this header)
+
+    Archive();
+    virtual ~Archive();
+
+    /// @brief Return a copy of this archive.
+    virtual Ptr copy() const;
+
+    /// @brief Return the UUID that was most recently written (or read,
+    /// if no UUID has been written yet).
+    std::string getUniqueTag() const;
+    /// @brief Return @c true if the given UUID matches this archive's UUID.
+    bool isIdentical(const std::string& uuidStr) const;
+
+    /// @brief Return the file format version number of the input stream.
+    uint32_t fileVersion() const { return mFileVersion; }
+    /// @brief Return the (major, minor) version number of the library that was
+    /// used to write the input stream.
+    VersionId libraryVersion() const { return mLibraryVersion; }
+    /// @brief Return a string of the form "<major>.<minor>/<format>", giving the
+    /// library and file format version numbers associated with the input stream.
+    std::string version() const;
+
+    /// @brief Return @c true if trees shared by multiple grids are written out
+    /// only once, @c false if they are written out once per grid.
+    bool isInstancingEnabled() const { return mEnableInstancing; }
+    /// @brief Specify whether trees shared by multiple grids should be
+    /// written out only once (@c true) or once per grid (@c false).
+    /// @note Instancing is enabled by default.
+    void setInstancingEnabled(bool b) { mEnableInstancing = b; }
+
+    /// Return @c true if the OpenVDB library includes support for the Blosc compressor.
+    static bool hasBloscCompression();
+
+    /// Return a bit mask specifying compression options for the data stream.
+    uint32_t compression() const { return mCompression; }
+    /// @brief Specify whether and how the data stream should be compressed.
+    /// @param c bitwise OR (e.g., COMPRESS_ZIP | COMPRESS_ACTIVE_MASK) of
+    ///     compression option flags (see Compression.h for the available flags)
+    /// @note Not all combinations of compression options are supported.
+    void setCompression(uint32_t c) { mCompression = c; }
+
+    /// @brief Return @c true if grid statistics (active voxel count and
+    /// bounding box, etc.) are computed and written as grid metadata.
+    bool isGridStatsMetadataEnabled() const { return mEnableGridStats; }
+    /// @brief Specify whether grid statistics (active voxel count and
+    /// bounding box, etc.) should be computed and written as grid metadata.
+    void setGridStatsMetadataEnabled(bool b) { mEnableGridStats = b; }
+
+    /// @brief Write the given grids to this archive's output stream (this base-class version is a no-op).
+    virtual void write(const GridCPtrVec&, const MetaMap& = MetaMap()) const {}
+
+    /// @brief Return @c true if delayed loading is enabled.
+    /// @details If enabled, delayed loading can be disabled for individual files,
+    /// but not vice-versa.
+    /// @note Define the environment variable @c OPENVDB_DISABLE_DELAYED_LOAD
+    /// to disable delayed loading unconditionally.
+    static bool isDelayedLoadingEnabled();
+
+protected:
+    /// @brief Return @c true if the input stream contains grid offsets
+    /// that allow for random access or partial reading.
+    bool inputHasGridOffsets() const { return mInputHasGridOffsets; }
+    void setInputHasGridOffsets(bool b) { mInputHasGridOffsets = b; } ///< Set the flag returned by inputHasGridOffsets().
+
+    /// @brief Tag the given input stream with the input file format version number.
+    ///
+    /// The tag can be retrieved with getFormatVersion().
+    /// @sa getFormatVersion()
+    void setFormatVersion(std::istream&);
+
+    /// @brief Tag the given input stream with the version number of
+    /// the library with which the input stream was created.
+    ///
+    /// The tag can be retrieved with getLibraryVersion().
+    /// @sa getLibraryVersion()
+    void setLibraryVersion(std::istream&);
+
+    /// @brief Tag the given input stream with flags indicating whether
+    /// the input stream contains compressed data and how it is compressed.
+    void setDataCompression(std::istream&);
+
+    /// @brief Tag an output stream with flags specifying only those
+    /// compression options that are applicable to the given grid.
+    void setGridCompression(std::ostream&, const GridBase&) const;
+    /// @brief Read in the compression flags for a grid and
+    /// tag the given input stream with those flags.
+    static void readGridCompression(std::istream&);
+
+    /// Read in and return the number of grids on the input stream.
+    static int32_t readGridCount(std::istream&);
+
+    /// Populate the given grid from the input stream.
+    static void readGrid(GridBase::Ptr, const GridDescriptor&, std::istream&);
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    /// @brief Populate the given grid from the input stream, but only where it
+    /// intersects the given world-space bounding box.
+    static void readGrid(GridBase::Ptr, const GridDescriptor&, std::istream&, const BBoxd&);
+    /// @brief Populate the given grid from the input stream, but only where it
+    /// intersects the given index-space bounding box.
+    static void readGrid(GridBase::Ptr, const GridDescriptor&, std::istream&, const CoordBBox&);
+#endif
+
+    typedef std::map<Name /*uniqueName*/, GridBase::Ptr> NamedGridMap;
+
+    /// @brief If the grid represented by the given grid descriptor
+    /// is an instance, connect it with its instance parent.
+    void connectInstance(const GridDescriptor&, const NamedGridMap&) const;
+
+    /// Write the given grid descriptor and grid to an output stream
+    /// and update the GridDescriptor offsets.
+    /// @param seekable  if true, the output stream supports seek operations
+    void writeGrid(GridDescriptor&, GridBase::ConstPtr, std::ostream&, bool seekable) const;
+    /// Write the given grid descriptor and grid metadata to an output stream
+    /// and update the GridDescriptor offsets, but don't write the grid's tree,
+    /// since it is shared with another grid.
+    /// @param seekable  if true, the output stream supports seek operations
+    void writeGridInstance(GridDescriptor&, GridBase::ConstPtr,
+        std::ostream&, bool seekable) const;
+
+    /// @brief Read the magic number, version numbers, UUID, etc. from the given input stream.
+    /// @return @c true if the input UUID differs from the previously-read UUID.
+    bool readHeader(std::istream&);
+    /// @brief Write the magic number, version numbers, UUID, etc. to the given output stream.
+    /// @param seekable  if true, the output stream supports seek operations
+    /// @todo This method should not be const since it actually redefines the UUID!
+    void writeHeader(std::ostream&, bool seekable) const;
+
+    //@{
+    /// Write the given grids to an output stream.
+    void write(std::ostream&, const GridPtrVec&, bool seekable, const MetaMap& = MetaMap()) const;
+    void write(std::ostream&, const GridCPtrVec&, bool seekable, const MetaMap& = MetaMap()) const;
+    //@}
+
+private:
+    friend class ::TestFile; // the global-namespace TestFile class may access private members
+
+    /// The version of the file that was read
+    uint32_t mFileVersion;
+    /// The version of the library that was used to create the file that was read
+    VersionId mLibraryVersion;
+    /// 16-byte (128-bit) UUID
+    mutable boost::uuids::uuid mUuid;// needs to be mutable since writeHeader is const!
+    /// Flag indicating whether the input stream contains grid offsets
+    /// and therefore supports partial reading
+    bool mInputHasGridOffsets;
+    /// Flag indicating whether a tree shared by multiple grids should be
+    /// written out only once (true) or once per grid (false)
+    bool mEnableInstancing;
+    /// Flags indicating whether and how the data stream is compressed
+    uint32_t mCompression;
+    /// Flag indicating whether grid statistics metadata should be written
+    bool mEnableGridStats;
+}; // class Archive
+
+} // namespace io
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_IO_ARCHIVE_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/io/Compression.h b/nuparu/include/openvdb_new/io/Compression.h
new file mode 100644
index 00000000..405d61ac
--- /dev/null
+++ b/nuparu/include/openvdb_new/io/Compression.h
@@ -0,0 +1,619 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_IO_COMPRESSION_HAS_BEEN_INCLUDED
+#define OPENVDB_IO_COMPRESSION_HAS_BEEN_INCLUDED
+
+#include <openvdb/Types.h>
+#include <openvdb/math/Math.h> // for negative()
+#include "io.h" // for getDataCompression(), etc.
+#include <boost/scoped_array.hpp>
+#include <algorithm>
+#include <iostream>
+#include <string>
+#include <vector>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace io {
+
+/// @brief OR-able bit flags for compression options on input and output streams
+/// @details
+/// <dl>
+/// <dt><tt>COMPRESS_NONE</tt>
+/// <dd>On write, don't compress data.<br>
+///     On read, the input stream contains uncompressed data.
+///
+/// <dt><tt>COMPRESS_ZIP</tt>
+/// <dd>When writing grids other than level sets or fog volumes, apply
+///     ZLIB compression to internal and leaf node value buffers.<br>
+///     When reading grids other than level sets or fog volumes, indicate that
+///     the value buffers of internal and leaf nodes are ZLIB-compressed.<br>
+///     ZLIB compresses well but is slow.
+///
+/// <dt><tt>COMPRESS_ACTIVE_MASK</tt>
+/// <dd>When writing a grid of any class, don't output a node's inactive values
+///     if it has two or fewer distinct values.  Instead, output minimal information
+///     to permit the lossless reconstruction of inactive values.<br>
+///     On read, nodes might have been stored without inactive values.
+///     Where necessary, reconstruct inactive values from available information.
+///
+/// <dt><tt>COMPRESS_BLOSC</tt>
+/// <dd>When writing grids other than level sets or fog volumes, apply
+///     Blosc compression to internal and leaf node value buffers.<br>
+///     When reading grids other than level sets or fog volumes, indicate that
+///     the value buffers of internal and leaf nodes are Blosc-compressed.<br>
+///     Blosc is much faster than ZLIB and produces comparable file sizes.
+/// </dl>
+enum {
+    COMPRESS_NONE           = 0,      // no compression bits set
+    COMPRESS_ZIP            = 1 << 0, // ZLIB-compress node value buffers
+    COMPRESS_ACTIVE_MASK    = 1 << 1, // leave out reconstructible inactive values
+    COMPRESS_BLOSC          = 1 << 2  // Blosc-compress node value buffers
+};
+
+/// Return a string describing the given compression flags.
+OPENVDB_API std::string compressionToString(uint32_t flags);
+
+
+////////////////////////////////////////
+
+
+/// @internal Values of the one-byte, per-node code that records which extra data
+/// (selection mask and/or inactive values) is stored to rebuild inactive values
+enum {
+    NO_MASK_OR_INACTIVE_VALS     = 0, // no inactive values, or all of them equal +background
+    NO_MASK_AND_MINUS_BG         = 1, // every inactive value equals -background
+    NO_MASK_AND_ONE_INACTIVE_VAL = 2, // every inactive value equals one non-background value
+    MASK_AND_NO_INACTIVE_VALS    = 3, // mask chooses between -background and +background
+    MASK_AND_ONE_INACTIVE_VAL    = 4, // mask chooses between background and one other value
+    MASK_AND_TWO_INACTIVE_VALS   = 5, // mask chooses between two non-background values
+    NO_MASK_AND_ALL_VALS         = 6  // more than two inactive values: no mask compression
+};
+
+
+////////////////////////////////////////
+
+
+/// @brief Traits class that maps a value type to its 16-bit half float analogue.
+/// @details Specialized below for float, double and the 2- and 3-vectors of each.
+template<typename T>
+struct RealToHalf {
+    enum { isReal = false }; // the primary template covers non-floating-point types
+    typedef T HalfT; // such a type serves as its own half float analogue
+    static HalfT convert(const T& value) { return value; }
+};
+template<> struct RealToHalf<float> {
+    enum { isReal = true };
+    typedef half HalfT;
+    static HalfT convert(float value) { return HalfT(value); }
+};
+template<> struct RealToHalf<double> {
+    enum { isReal = true };
+    typedef half HalfT;
+    // Narrow to float first, since a half can only be constructed from a float.
+    static HalfT convert(double value) { return HalfT(static_cast<float>(value)); }
+};
+template<> struct RealToHalf<Vec2s> {
+    enum { isReal = true };
+    typedef Vec2H HalfT;
+    static HalfT convert(const Vec2s& value) { return HalfT(value); }
+};
+template<> struct RealToHalf<Vec2d> {
+    enum { isReal = true };
+    typedef Vec2H HalfT;
+    // Go through a float vector first, since a half can only be constructed from a float.
+    static HalfT convert(const Vec2d& value) { return HalfT(Vec2s(value)); }
+};
+template<> struct RealToHalf<Vec3s> {
+    enum { isReal = true };
+    typedef Vec3H HalfT;
+    static HalfT convert(const Vec3s& value) { return HalfT(value); }
+};
+template<> struct RealToHalf<Vec3d> {
+    enum { isReal = true };
+    typedef Vec3H HalfT;
+    // Go through a float vector first, since a half can only be constructed from a float.
+    static HalfT convert(const Vec3d& value) { return HalfT(Vec3s(value)); }
+};
+
+
+/// Convert @a val to its half float analogue and back, returning the result as a @c T.
+template<typename T>
+inline T truncateRealToHalf(const T& val)
+{
+    const typename RealToHalf<T>::HalfT narrowed = RealToHalf<T>::convert(val);
+    return T(narrowed);
+}
+
+
+////////////////////////////////////////
+
+
+OPENVDB_API void zipToStream(std::ostream&, const char* data, size_t numBytes);
+OPENVDB_API void unzipFromStream(std::istream&, char* data, size_t numBytes);
+OPENVDB_API void bloscToStream(std::ostream&, const char* data, size_t valSize, size_t numVals);
+OPENVDB_API void bloscFromStream(std::istream&, char* data, size_t numBytes);
+
+/// @brief Read @a count values of type @c T from a stream into @a data.
+/// @param is           the input stream
+/// @param data         the contiguous array into which to read the values
+/// @param count        the number of elements to read in
+/// @param compression  bit flags describing how the data is compressed; COMPRESS_BLOSC
+///                     is tested first, then COMPRESS_ZIP, else the bytes are read raw
+/// @throw IoError if COMPRESS_BLOSC is requested but OpenVDB was compiled without Blosc support.
+/// @details This default implementation is instantiated only for types
+/// whose size can be determined by the sizeof() operator.
+template<typename T>
+inline void readData(std::istream& is, T* data, Index count, uint32_t compression)
+{
+    char* const rawBytes = reinterpret_cast<char*>(data);
+    const size_t numBytes = sizeof(T) * count;
+    if (compression & COMPRESS_BLOSC) {
+        bloscFromStream(is, rawBytes, numBytes);
+    } else if (compression & COMPRESS_ZIP) {
+        unzipFromStream(is, rawBytes, numBytes);
+    } else {
+        is.read(rawBytes, numBytes);
+    }
+}
+
+/// Specialization for std::string input.  For each string, a formatted length is
+/// extracted from the stream, then that many characters plus one trailing byte
+/// are read; the trailing byte is not stored in the output string.
+template<>
+inline void
+readData<std::string>(std::istream& is, std::string* data, Index count, uint32_t /*compression*/)
+{
+    for (Index n = 0; n < count; ++n) {
+        size_t len = 0;
+        is >> len;
+        const size_t numChars = len+1;
+        std::string buffer(numChars, ' ');
+        is.read(&buffer[0], numChars);
+        data[n].assign(buffer, 0, len);
+    }
+}
+
+/// HalfReader exposes a static function, read(), that mirrors readData(), above,
+/// but that, for floating-point types, reads 16-bit half float values and promotes
+/// them to full float.  It is a class rather than a function because
+/// only classes, not functions, can be partially specialized.
+template<bool IsReal, typename T> struct HalfReader;
+/// Non-floating-point types: forward straight to readData() (no half to float promotion)
+template<typename T>
+struct HalfReader</*IsReal=*/false, T> {
+    static inline void read(std::istream& is, T* data, Index count, uint32_t compression) {
+        readData(is, data, count, compression);
+    }
+};
+/// Floating-point types: read half floats, then widen them into the output array
+template<typename T>
+struct HalfReader</*IsReal=*/true, T> {
+    typedef typename RealToHalf<T>::HalfT HalfT;
+    static inline void read(std::istream& is, T* data, Index count, uint32_t compression) {
+        if (count < 1) return;
+        // Stage the half float values in a scratch array of the stored type.
+        std::vector<HalfT> staged(count);
+        readData<HalfT>(is, &staged[0], count, compression);
+        for (Index i = 0; i < count; ++i) data[i] = staged[i];
+    }
+};
+
+
+/// @brief Write @a count values of type @c T from @a data to a stream.
+/// @param os           the output stream
+/// @param data         the contiguous array of data to write
+/// @param count        the number of elements to write out
+/// @param compression  bit flags selecting how to compress the data; COMPRESS_BLOSC
+///                     is tested first, then COMPRESS_ZIP, else the bytes are written raw
+/// @throw IoError if @a compression is COMPRESS_BLOSC but OpenVDB was compiled
+/// without Blosc support.
+/// @details This default implementation is instantiated only for types
+/// whose size can be determined by the sizeof() operator.
+template<typename T>
+inline void writeData(std::ostream &os, const T *data, Index count, uint32_t compression)
+{
+    const char* const rawBytes = reinterpret_cast<const char*>(data);
+    if (compression & COMPRESS_BLOSC) {
+        bloscToStream(os, rawBytes, sizeof(T), count);
+    } else if (compression & COMPRESS_ZIP) {
+        zipToStream(os, rawBytes, sizeof(T) * count);
+    } else {
+        os.write(rawBytes, sizeof(T) * count);
+    }
+}
+
+/// Specialization for std::string output: each string is emitted as its formatted
+/// length followed by its characters and the terminating null character.
+template<>
+inline void
+writeData<std::string>(std::ostream& os, const std::string* data, Index count,
+    uint32_t /*compression*/) ///< @todo add compression
+{
+    for (Index n = 0; n < count; ++n) {
+        const std::string& str = data[n];
+        os << str.size();
+        os.write(str.c_str(), str.size()+1);
+    }
+}
+
+/// HalfWriter exposes a static function, write(), that mirrors writeData(), above,
+/// but that, for floating-point types, first quantizes the values to 16-bit
+/// half float.  It is a class rather than a function because
+/// only classes, not functions, can be partially specialized.
+template<bool IsReal, typename T> struct HalfWriter;
+/// Non-floating-point types: forward straight to writeData() (no quantization)
+template<typename T>
+struct HalfWriter</*IsReal=*/false, T> {
+    static inline void write(std::ostream& os, const T* data, Index count, uint32_t compression) {
+        writeData(os, data, count, compression);
+    }
+};
+/// Floating-point types: quantize to half float, then write the half float array
+template<typename T>
+struct HalfWriter</*IsReal=*/true, T> {
+    typedef typename RealToHalf<T>::HalfT HalfT;
+    static inline void write(std::ostream& os, const T* data, Index count, uint32_t compression) {
+        if (count < 1) return;
+        // Build a scratch array holding the half float analogue of every input value.
+        std::vector<HalfT> quantized(count);
+        for (Index n = 0; n < count; ++n) quantized[n] = RealToHalf<T>::convert(data[n]);
+        writeData<HalfT>(os, &quantized[0], count, compression);
+    }
+};
+#ifdef _MSC_VER
+/// Full specialization for double, provided to avoid double to float warnings in MSVC
+template<>
+struct HalfWriter</*IsReal=*/true, double> {
+    typedef RealToHalf<double>::HalfT HalfT;
+    static inline void write(std::ostream& os, const double* data, Index count,
+        uint32_t compression)
+    {
+        if (count < 1) return;
+        // Build a scratch array holding the half float analogue of every input value.
+        std::vector<HalfT> quantized(count);
+        for (Index n = 0; n < count; ++n) quantized[n] = RealToHalf<double>::convert(data[n]);
+        writeData<HalfT>(os, &quantized[0], count, compression);
+    }
+};
+#endif // _MSC_VER
+
+
+////////////////////////////////////////
+
+
+/// @brief Populate the given buffer with @a destCount values of type @c ValueT
+/// read from the given stream, taking into account that the stream might
+/// have been compressed via one of several supported schemes.
+/// [Mainly for internal use]
+/// @param is         a stream from which to read data (possibly compressed,
+///                   depending on the stream's compression settings)
+/// @param destBuf    a buffer into which to read values of type @c ValueT
+/// @param destCount  the number of values to be stored in the buffer
+/// @param valueMask  a bitmask (typically, a node's value mask) indicating
+///                   which positions in the buffer correspond to active values
+/// @param fromHalf   if true, read 16-bit half floats from the input stream
+///                   and convert them to full floats
+template<typename ValueT, typename MaskT>
+inline void
+readCompressedValues(std::istream& is, ValueT* destBuf, Index destCount,
+    const MaskT& valueMask, bool fromHalf)
+{
+    // Get the stream's compression settings.
+    const uint32_t compression = getDataCompression(is);
+    const bool maskCompressed = compression & COMPRESS_ACTIVE_MASK;
+
+    int8_t metadata = NO_MASK_AND_ALL_VALS; // kept when the file version predates the metadata byte
+    if (getFormatVersion(is) >= OPENVDB_FILE_VERSION_NODE_MASK_COMPRESSION) {
+        // Read the flag that specifies what, if any, additional metadata
+        // (selection mask and/or inactive value(s)) is saved.
+        is.read(reinterpret_cast<char*>(&metadata), /*bytes=*/1);
+    }
+
+    ValueT background = zeroVal<ValueT>(); // fallback when the stream supplies no background pointer
+    if (const void* bgPtr = getGridBackgroundValuePtr(is)) {
+        background = *static_cast<const ValueT*>(bgPtr);
+    }
+    ValueT inactiveVal1 = background; // used below where the selection mask is on
+    ValueT inactiveVal0 = // used below where the selection mask is off
+        ((metadata == NO_MASK_OR_INACTIVE_VALS) ? background : math::negative(background));
+
+    if (metadata == NO_MASK_AND_ONE_INACTIVE_VAL ||
+        metadata == MASK_AND_ONE_INACTIVE_VAL ||
+        metadata == MASK_AND_TWO_INACTIVE_VALS)
+    {
+        // Read one of at most two distinct inactive values.
+        is.read(reinterpret_cast<char*>(&inactiveVal0), sizeof(ValueT));
+        if (metadata == MASK_AND_TWO_INACTIVE_VALS) {
+            // Read the second of two distinct inactive values.
+            is.read(reinterpret_cast<char*>(&inactiveVal1), sizeof(ValueT));
+        }
+    }
+
+    MaskT selectionMask; // stays default-constructed unless loaded below
+    if (metadata == MASK_AND_NO_INACTIVE_VALS ||
+        metadata == MASK_AND_ONE_INACTIVE_VAL ||
+        metadata == MASK_AND_TWO_INACTIVE_VALS)
+    {
+        // For use in mask compression (only), read the bitmask that selects
+        // between two distinct inactive values.
+        selectionMask.load(is);
+    }
+
+    ValueT* tempBuf = destBuf; // read straight into destBuf unless a temporary buffer is needed
+    boost::scoped_array<ValueT> scopedTempBuf;
+
+    Index tempCount = destCount;
+    if (maskCompressed && metadata != NO_MASK_AND_ALL_VALS
+        && getFormatVersion(is) >= OPENVDB_FILE_VERSION_NODE_MASK_COMPRESSION)
+    {
+        tempCount = valueMask.countOn();
+        if (tempCount != destCount) {
+            // If this node has inactive voxels, allocate a temporary buffer
+            // into which to read just the active values.
+            scopedTempBuf.reset(new ValueT[tempCount]);
+            tempBuf = scopedTempBuf.get();
+        }
+    }
+
+    // Read in the buffer.
+    if (fromHalf) {
+        HalfReader<RealToHalf<ValueT>::isReal, ValueT>::read(is, tempBuf, tempCount, compression);
+    } else {
+        readData<ValueT>(is, tempBuf, tempCount, compression);
+    }
+
+    // If mask compression is enabled and the number of active values read into
+    // the temp buffer is smaller than the size of the destination buffer,
+    // then there are missing (inactive) values.
+    if (maskCompressed && tempCount != destCount) {
+        // Restore inactive values, using the background value and, if available,
+        // the inside/outside mask.  (For fog volumes, the destination buffer is assumed
+        // to be initialized to background value zero, so inactive values can be ignored.)
+        for (Index destIdx = 0, tempIdx = 0; destIdx < MaskT::SIZE; ++destIdx) { // spans MaskT::SIZE, not destCount
+            if (valueMask.isOn(destIdx)) {
+                // Copy a saved active value into this node's buffer.
+                destBuf[destIdx] = tempBuf[tempIdx];
+                ++tempIdx;
+            } else {
+                // Reconstruct an unsaved inactive value and copy it into this node's buffer.
+                destBuf[destIdx] = (selectionMask.isOn(destIdx) ? inactiveVal1 : inactiveVal0);
+            }
+        }
+    }
+}
+
+
+/// Write @a srcCount values of type @c ValueT to the given stream, optionally
+/// after compressing the values via one of several supported schemes.
+/// [Mainly for internal use]
+/// @param os         a stream to which to write data (possibly compressed, depending
+///                   on the stream's compression settings)
+/// @param srcBuf     a buffer containing values of type @c ValueT to be written
+/// @param srcCount   the number of values stored in the buffer
+/// @param valueMask  a bitmask (typically, a node's value mask) indicating
+///                   which positions in the buffer correspond to active values
+/// @param childMask  a bitmask (typically, a node's child mask) indicating
+///                   which positions in the buffer correspond to child node pointers
+/// @param toHalf     if true, convert floating-point values to 16-bit half floats
+template<typename ValueT, typename MaskT>
+inline void
+writeCompressedValues(std::ostream& os, ValueT* srcBuf, Index srcCount,
+    const MaskT& valueMask, const MaskT& childMask, bool toHalf)
+{
+    struct Local {
+        // Comparison function for values
+        static inline bool eq(const ValueT& a, const ValueT& b) {
+            return math::isExactlyEqual(a, b);
+        }
+    };
+
+    // Get the stream's compression settings.
+    const uint32_t compress = getDataCompression(os);
+    const bool maskCompress = compress & COMPRESS_ACTIVE_MASK;
+
+    Index tempCount = srcCount;
+    ValueT* tempBuf = srcBuf;
+    boost::scoped_array<ValueT> scopedTempBuf;
+
+    int8_t metadata = NO_MASK_AND_ALL_VALS;
+
+    if (!maskCompress) {
+        os.write(reinterpret_cast<const char*>(&metadata), /*bytes=*/1);
+    } else {
+        // A valid level set's inactive values are either +background (outside)
+        // or -background (inside), and a fog volume's inactive values are all zero.
+        // Rather than write out all of these values, we can store just the active values
+        // (given that the value mask specifies their positions) and, if necessary,
+        // an inside/outside bitmask.
+
+        const ValueT zero = zeroVal<ValueT>();
+        ValueT background = zero;
+        if (const void* bgPtr = getGridBackgroundValuePtr(os)) {
+            background = *static_cast<const ValueT*>(bgPtr);
+        }
+
+        /// @todo Consider all values, not just inactive values?
+        ValueT inactiveVal[2] = { background, background };
+        int numUniqueInactiveVals = 0;
+        for (typename MaskT::OffIterator it = valueMask.beginOff();
+            numUniqueInactiveVals < 3 && it; ++it)
+        {
+            const Index32 idx = it.pos();
+
+            // Skip inactive values that are actually child node pointers.
+            if (childMask.isOn(idx)) continue;
+
+            const ValueT& val = srcBuf[idx];
+            const bool unique = !(
+                (numUniqueInactiveVals > 0 && Local::eq(val, inactiveVal[0])) ||
+                (numUniqueInactiveVals > 1 && Local::eq(val, inactiveVal[1]))
+            );
+            if (unique) {
+                if (numUniqueInactiveVals < 2) inactiveVal[numUniqueInactiveVals] = val;
+                ++numUniqueInactiveVals;
+            }
+        }
+
+        metadata = NO_MASK_OR_INACTIVE_VALS;
+
+        if (numUniqueInactiveVals == 1) {
+            if (!Local::eq(inactiveVal[0], background)) {
+                if (Local::eq(inactiveVal[0], math::negative(background))) {
+                    metadata = NO_MASK_AND_MINUS_BG;
+                } else {
+                    metadata = NO_MASK_AND_ONE_INACTIVE_VAL;
+                }
+            }
+        } else if (numUniqueInactiveVals == 2) {
+            metadata = NO_MASK_OR_INACTIVE_VALS;
+            if (!Local::eq(inactiveVal[0], background) && !Local::eq(inactiveVal[1], background)) {
+                // If neither inactive value is equal to the background, both values
+                // need to be saved, along with a mask that selects between them.
+                metadata = MASK_AND_TWO_INACTIVE_VALS;
+
+            } else if (Local::eq(inactiveVal[1], background)) {
+                if (Local::eq(inactiveVal[0], math::negative(background))) {
+                    // If the second inactive value is equal to the background and
+                    // the first is equal to -background, neither value needs to be saved,
+                    // but save a mask that selects between -background and +background.
+                    metadata = MASK_AND_NO_INACTIVE_VALS;
+                } else {
+                    // If the second inactive value is equal to the background, only
+                    // the first value needs to be saved, along with a mask that selects
+                    // between it and the background.
+                    metadata = MASK_AND_ONE_INACTIVE_VAL;
+                }
+            } else if (Local::eq(inactiveVal[0], background)) {
+                if (Local::eq(inactiveVal[1], math::negative(background))) {
+                    // If the first inactive value is equal to the background and
+                    // the second is equal to -background, neither value needs to be saved,
+                    // but save a mask that selects between -background and +background.
+                    metadata = MASK_AND_NO_INACTIVE_VALS;
+                    std::swap(inactiveVal[0], inactiveVal[1]);
+                } else {
+                    // If the first inactive value is equal to the background, swap it
+                    // with the second value and save only that value, along with a mask
+                    // that selects between it and the background.
+                    std::swap(inactiveVal[0], inactiveVal[1]);
+                    metadata = MASK_AND_ONE_INACTIVE_VAL;
+                }
+            }
+        } else if (numUniqueInactiveVals > 2) {
+            metadata = NO_MASK_AND_ALL_VALS;
+        }
+
+        os.write(reinterpret_cast<const char*>(&metadata), /*bytes=*/1);
+
+        if (metadata == NO_MASK_AND_ONE_INACTIVE_VAL ||
+            metadata == MASK_AND_ONE_INACTIVE_VAL ||
+            metadata == MASK_AND_TWO_INACTIVE_VALS)
+        {
+            if (!toHalf) {
+                // Write one of at most two distinct inactive values.
+                os.write(reinterpret_cast<const char*>(&inactiveVal[0]), sizeof(ValueT));
+                if (metadata == MASK_AND_TWO_INACTIVE_VALS) {
+                    // Write the second of two distinct inactive values.
+                    os.write(reinterpret_cast<const char*>(&inactiveVal[1]), sizeof(ValueT));
+                }
+            } else {
+                // Write one of at most two distinct inactive values.
+                ValueT truncatedVal = static_cast<ValueT>(truncateRealToHalf(inactiveVal[0]));
+                os.write(reinterpret_cast<const char*>(&truncatedVal), sizeof(ValueT));
+                if (metadata == MASK_AND_TWO_INACTIVE_VALS) {
+                    // Write the second of two distinct inactive values.
+                    truncatedVal = truncateRealToHalf(inactiveVal[1]);
+                    os.write(reinterpret_cast<const char*>(&truncatedVal), sizeof(ValueT));
+                }
+            }
+        }
+
+        if (metadata == NO_MASK_AND_ALL_VALS) {
+            // If there are more than two unique inactive values, the entire input buffer
+            // needs to be saved (both active and inactive values).
+            /// @todo Save the selection mask as long as most of the inactive values
+            /// are one of two values?
+        } else {
+            // Create a new array to hold just the active values.
+            scopedTempBuf.reset(new ValueT[srcCount]);
+            tempBuf = scopedTempBuf.get();
+
+            if (metadata == NO_MASK_OR_INACTIVE_VALS ||
+                metadata == NO_MASK_AND_MINUS_BG ||
+                metadata == NO_MASK_AND_ONE_INACTIVE_VAL)
+            {
+                // Copy active values to the contiguous array.
+                tempCount = 0;
+                for (typename MaskT::OnIterator it = valueMask.beginOn(); it; ++it, ++tempCount) {
+                    tempBuf[tempCount] = srcBuf[it.pos()];
+                }
+            } else {
+                // Copy active values to a new, contiguous array and populate a bitmask
+                // that selects between two distinct inactive values.
+                MaskT selectionMask;
+                tempCount = 0;
+                for (Index srcIdx = 0; srcIdx < srcCount; ++srcIdx) {
+                    if (valueMask.isOn(srcIdx)) { // active value
+                        tempBuf[tempCount] = srcBuf[srcIdx];
+                        ++tempCount;
+                    } else { // inactive value
+                        if (Local::eq(srcBuf[srcIdx], inactiveVal[1])) {
+                            selectionMask.setOn(srcIdx); // inactive value 1
+                        } // else inactive value 0
+                    }
+                }
+                assert(tempCount == valueMask.countOn());
+
+                // Write out the mask that selects between two inactive values.
+                selectionMask.save(os);
+            }
+        }
+    }
+
+    // Write out the buffer.
+    if (toHalf) {
+        HalfWriter<RealToHalf<ValueT>::isReal, ValueT>::write(os, tempBuf, tempCount, compress);
+    } else {
+        writeData(os, tempBuf, tempCount, compress);
+    }
+}
+
+} // namespace io
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_IO_COMPRESSION_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/io/File.h b/nuparu/include/openvdb_new/io/File.h
new file mode 100644
index 00000000..90689685
--- /dev/null
+++ b/nuparu/include/openvdb_new/io/File.h
@@ -0,0 +1,276 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file File.h
+
+#ifndef OPENVDB_IO_FILE_HAS_BEEN_INCLUDED
+#define OPENVDB_IO_FILE_HAS_BEEN_INCLUDED
+
+#include "io.h" // for MappedFile::Notifier
+#include "Archive.h"
+#include "GridDescriptor.h"
+#include <iosfwd>
+#include <map>
+#include <string>
+#include <boost/scoped_ptr.hpp>
+
+
+class TestFile;
+class TestStream;
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace io {
+
+/// Grid archive associated with a file on disk
+class OPENVDB_API File: public Archive
+{
+public:
+    typedef std::multimap<Name, GridDescriptor> NameMap;
+    typedef NameMap::const_iterator NameMapCIter;
+
+    explicit File(const std::string& filename);
+    virtual ~File();
+
+    /// @brief Copy constructor
+    /// @details The copy will be closed and will not reference the same
+    /// file descriptor as the original.
+    File(const File& other);
+    /// @brief Assignment
+    /// @details After assignment, this File will be closed and will not
+    /// reference the same file descriptor as the source File.
+    File& operator=(const File& other);
+
+    /// @brief Return a copy of this archive.
+    /// @details The copy will be closed and will not reference the same
+    /// file descriptor as the original.
+    virtual boost::shared_ptr<Archive> copy() const;
+
+    /// @brief Return the name of the file with which this archive is associated.
+    /// @details The file does not necessarily exist on disk yet.
+    const std::string& filename() const;
+
+    /// @brief Open the file, read the file header and the file-level metadata,
+    /// and populate the grid descriptors, but do not load any grids into memory.
+    /// @details If @a delayLoad is true, map the file into memory and enable delayed loading
+    /// of grids, and if a notifier is provided, call it when the file gets unmapped.
+    /// @note Define the environment variable @c OPENVDB_DISABLE_DELAYED_LOAD to disable
+    /// delayed loading unconditionally.
+    /// @throw IoError if the file is not a valid VDB file.
+    /// @return @c true if the file's UUID has changed since it was last read.
+    /// @see setCopyMaxBytes
+    bool open(bool delayLoad = true, const MappedFile::Notifier& = MappedFile::Notifier());
+
+    /// Return @c true if the file has been opened for reading.
+    bool isOpen() const;
+
+    /// Close the file once we are done reading from it.
+    void close();
+
+    /// @brief Return this file's current size on disk in bytes.
+    /// @throw IoError if the file size cannot be determined.
+    Index64 getSize() const;
+
+    /// @brief Return the size in bytes above which this file will not be
+    /// automatically copied during delayed loading.
+    Index64 copyMaxBytes() const;
+    /// @brief If this file is opened with delayed loading enabled, make a private copy
+    /// of the file if its size in bytes is less than the specified value.
+    /// @details Making a private copy ensures that the file can't change on disk
+    /// before it has been fully read.
+    /// @warning If the file is larger than this size, it is the user's responsibility
+    /// to ensure that it does not change on disk before it has been fully read.
+    /// Undefined behavior and/or a crash might result otherwise.
+    /// @note Copying is enabled by default, but it can be disabled for individual files
+    /// by setting the maximum size to zero bytes.  A default size limit can be specified
+    /// by setting the environment variable @c OPENVDB_DELAYED_LOAD_COPY_MAX_BYTES
+    /// to the desired number of bytes.
+    void setCopyMaxBytes(Index64 bytes);
+
+    /// Return @c true if a grid of the given name exists in this file.
+    bool hasGrid(const Name&) const;
+
+    /// Return (in a newly created MetaMap) the file-level metadata.
+    MetaMap::Ptr getMetadata() const;
+
+    /// Read the entire contents of the file and return a list of grid pointers.
+    GridPtrVecPtr getGrids() const;
+
+    /// @brief Read just the grid metadata and transforms from the file and return a list
+    /// of pointers to grids that are empty except for their metadata and transforms.
+    /// @throw IoError if this file is not open for reading.
+    GridPtrVecPtr readAllGridMetadata();
+
+    /// @brief Read a grid's metadata and transform only.
+    /// @return A pointer to a grid that is empty except for its metadata and transform.
+    /// @throw IoError if this file is not open for reading.
+    /// @throw KeyError if no grid with the given name exists in this file.
+    GridBase::Ptr readGridMetadata(const Name&);
+
+    /// @brief Read a grid's metadata, topology, transform, etc., but not
+    /// any of its leaf node data blocks.
+    /// @return the grid pointer to the partially loaded grid.
+    /// @note This returns a @c const pointer, so that the grid can't be
+    /// changed before its data blocks have been loaded.  A non-<tt>const</tt>
+    /// pointer is only returned when readGrid() is called.
+    GridBase::ConstPtr readGridPartial(const Name&);
+
+    /// Read an entire grid, including all of its data blocks.
+    GridBase::Ptr readGrid(const Name&);
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    /// @brief Read a grid, including its data blocks, but only where it
+    /// intersects the given world-space bounding box.
+    GridBase::Ptr readGrid(const Name&, const BBoxd&);
+#endif
+
+    /// @todo GridPtrVec readAllGridsPartial(const Name&)
+    /// @todo GridPtrVec readAllGrids(const Name&)
+
+    /// @brief Write the grids in the given container to the file whose name
+    /// was given in the constructor.
+    virtual void write(const GridCPtrVec&, const MetaMap& = MetaMap()) const;
+
+    /// @brief Write the grids in the given container (any type whose begin() and end()
+    /// yield iterators over grid pointers) to the file whose name was given in the constructor.
+    template<typename GridPtrContainerT>
+    void write(const GridPtrContainerT&, const MetaMap& = MetaMap()) const;
+
+    /// A const iterator that iterates over all names in the file. This is only
+    /// valid once the file has been opened.
+    class NameIterator
+    {
+    public:
+        NameIterator(const NameMapCIter& iter): mIter(iter) {}
+        ~NameIterator() {}
+
+        NameIterator& operator++() { mIter++; return *this; }
+
+        bool operator==(const NameIterator& iter) const { return mIter == iter.mIter; }
+        bool operator!=(const NameIterator& iter) const { return mIter != iter.mIter; }
+
+        Name operator*() const { return this->gridName(); } ///< Same as gridName()
+
+        Name gridName() const { return GridDescriptor::nameAsString(mIter->second.uniqueName()); }
+
+    private:
+        NameMapCIter mIter;
+    };
+
+    /// @return a NameIterator to iterate over all grid names in the file.
+    NameIterator beginName() const;
+
+    /// @return the ending iterator for all grid names in the file.
+    NameIterator endName() const;
+
+private:
+    /// Read in all grid descriptors that are stored in the given stream.
+    void readGridDescriptors(std::istream&);
+
+    /// @brief Return an iterator to the descriptor for the grid with the given name.
+    /// If the name is non-unique, return an iterator to the first matching descriptor.
+    NameMapCIter findDescriptor(const Name&) const;
+
+    /// Return a newly created, empty grid of the type specified by the given grid descriptor.
+    GridBase::Ptr createGrid(const GridDescriptor&) const;
+
+    /// @brief Read a grid, including its data blocks, but only where it
+    /// intersects the given world-space bounding box.
+    GridBase::Ptr readGridByName(const Name&, const BBoxd&);
+
+    /// Read in and return the partially-populated grid specified by the given grid descriptor.
+    GridBase::ConstPtr readGridPartial(const GridDescriptor&, bool readTopology) const;
+
+    /// Read in and return the grid specified by the given grid descriptor.
+    GridBase::Ptr readGrid(const GridDescriptor&) const;
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    /// Read in and return the region of the grid specified by the given grid descriptor
+    /// that intersects the given world-space bounding box.
+    GridBase::Ptr readGrid(const GridDescriptor&, const BBoxd&) const;
+    /// Read in and return the region of the grid specified by the given grid descriptor
+    /// that intersects the given index-space bounding box.
+    GridBase::Ptr readGrid(const GridDescriptor&, const CoordBBox&) const;
+#endif
+
+    /// @brief Partially populate the given grid by reading its metadata and transform and,
+    /// if the grid is not an instance, its tree structure, but not the tree's leaf nodes.
+    void readGridPartial(GridBase::Ptr, std::istream&, bool isInstance, bool readTopology) const;
+
+    /// @brief Retrieve a grid from @c mNamedGrids.  Return a null pointer
+    /// if @c mNamedGrids was not populated (because this file is random-access).
+    /// @throw KeyError if no grid with the given name exists in this file.
+    GridBase::Ptr retrieveCachedGrid(const Name&) const;
+
+    void writeGrids(const GridCPtrVec&, const MetaMap&) const; ///< Called by both write() overloads
+
+    MetaMap::Ptr fileMetadata();
+    MetaMap::ConstPtr fileMetadata() const;
+
+    const NameMap& gridDescriptors() const;
+    NameMap& gridDescriptors();
+
+    std::istream& inputStream() const;
+
+    friend class ::TestFile; // global-scope test classes are granted private access
+    friend class ::TestStream;
+
+    struct Impl; // only declared here; the definition is not part of this header
+    boost::scoped_ptr<Impl> mImpl;
+};
+
+
+////////////////////////////////////////
+
+
+inline void
+File::write(const GridCPtrVec& grids, const MetaMap& meta) const
+{
+    writeGrids(grids, meta); // hand both arguments, unchanged, to the private writer
+}
+
+
+template<typename GridPtrContainerT>
+inline void
+File::write(const GridPtrContainerT& container, const MetaMap& meta) const
+{
+    // Build the const-pointer list straight from the iterator range, so this header
+    // no longer depends on <algorithm> and <iterator>, neither of which it includes.
+    const GridCPtrVec grids(container.begin(), container.end());
+    this->writeGrids(grids, meta);
+}
+
+} // namespace io
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_IO_FILE_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/io/GridDescriptor.h b/nuparu/include/openvdb_new/io/GridDescriptor.h
new file mode 100644
index 00000000..364a2b0d
--- /dev/null
+++ b/nuparu/include/openvdb_new/io/GridDescriptor.h
@@ -0,0 +1,135 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_IO_GRIDDESCRIPTOR_HAS_BEEN_INCLUDED
+#define OPENVDB_IO_GRIDDESCRIPTOR_HAS_BEEN_INCLUDED
+
+#include <openvdb/Grid.h>
+#include <iostream>
+#include <boost/cstdint.hpp>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace io {
+
+/// This class stores useful information that describes a grid on disk.
+/// It can be used to retrieve I/O information about the grid such as
+/// offsets into the file where the grid is located, its type, etc.
+class OPENVDB_API GridDescriptor
+{
+public:
+    GridDescriptor();
+    GridDescriptor(const Name& name, const Name& gridType, bool saveFloatAsHalf = false);
+
+    ~GridDescriptor();
+
+    const Name& gridType() const { return mGridType; }
+    const Name& gridName() const { return mGridName; }
+    const Name& uniqueName() const { return mUniqueName; }
+
+    const Name& instanceParentName() const { return mInstanceParentName; }
+    void setInstanceParentName(const Name& name) { mInstanceParentName = name; }
+    bool isInstance() const { return !mInstanceParentName.empty(); } ///< true if a parent is set
+
+    bool saveFloatAsHalf() const { return mSaveFloatAsHalf; }
+
+    void setGridPos(boost::int64_t pos) { mGridPos = pos; }
+    boost::int64_t getGridPos() const { return mGridPos; }
+
+    void setBlockPos(boost::int64_t pos) { mBlockPos = pos; }
+    boost::int64_t getBlockPos() const { return mBlockPos; }
+
+    void setEndPos(boost::int64_t pos) { mEndPos = pos; }
+    boost::int64_t getEndPos() const { return mEndPos; }
+
+    // These methods seek to the right position in the given stream.
+    void seekToGrid(std::istream&) const;
+    void seekToBlocks(std::istream&) const;
+    void seekToEnd(std::istream&) const;
+
+    void seekToGrid(std::ostream&) const;
+    void seekToBlocks(std::ostream&) const;
+    void seekToEnd(std::ostream&) const;
+
+    /// @brief Write out this descriptor's header information (all data except for
+    /// stream offsets).
+    void writeHeader(std::ostream&) const;
+
+    /// @brief Since positions into the stream are known at a later time, they are
+    /// written out separately.
+    void writeStreamPos(std::ostream&) const;
+
+    /// @brief Read a grid descriptor from the given stream.
+    /// @return an empty grid of the type specified by the grid descriptor.
+    GridBase::Ptr read(std::istream&);
+
+    /// @brief Append the number @a n to the given name (separated by an ASCII
+    /// "record separator" character) and return the resulting name.
+    static Name addSuffix(const Name&, int n);
+    /// @brief Strip from the given name any suffix that is separated by an ASCII
+    /// "record separator" character and return the resulting name.
+    static Name stripSuffix(const Name&);
+    /// @brief Given a name with suffix N, return "name[N]", otherwise just return "name".
+    /// Use this to produce a human-readable string from a descriptor's unique name.
+    static std::string nameAsString(const Name&);
+    /// @brief Given a string of the form "name[N]", return "name" with the suffix N
+    /// (separated by an ASCII "record separator" character).  Otherwise just return
+    /// the string as is.
+    static Name stringAsUniqueName(const std::string&);
+
+private:
+    /// Name of the grid
+    Name mGridName;
+    /// Unique name for this descriptor
+    Name mUniqueName;
+    /// If nonempty, the name of another grid that shares this grid's tree
+    Name mInstanceParentName;
+    /// The type of the grid
+    Name mGridType;
+    /// Are floats quantized to 16 bits on disk?
+    bool mSaveFloatAsHalf;
+    /// Location in the stream where the grid data is stored
+    boost::int64_t mGridPos;
+    /// Location in the stream where the grid blocks are stored
+    boost::int64_t mBlockPos;
+    /// Location in the stream where the next grid descriptor begins
+    boost::int64_t mEndPos;
+};
+
+} // namespace io
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_IO_GRIDDESCRIPTOR_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/io/Queue.h b/nuparu/include/openvdb_new/io/Queue.h
new file mode 100644
index 00000000..7ac298ab
--- /dev/null
+++ b/nuparu/include/openvdb_new/io/Queue.h
@@ -0,0 +1,277 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file Queue.h
+/// @author Peter Cucka
+
+#ifndef OPENVDB_IO_QUEUE_HAS_BEEN_INCLUDED
+#define OPENVDB_IO_QUEUE_HAS_BEEN_INCLUDED
+
+#include <openvdb/Types.h>
+#include <openvdb/Grid.h>
+#include <boost/function.hpp>
+#include <boost/shared_ptr.hpp>
+#include <algorithm> // for std::copy
+#include <iterator> // for std::back_inserter
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace io {
+
+class Archive;
+
+/// @brief Queue for asynchronous output of grids to files or streams
+///
+/// @warning The queue holds shared pointers to grids.  It is not safe
+/// to modify a grid that has been placed in the queue.  Instead,
+/// make a deep copy of the grid (Grid::deepCopy()).
+///
+/// @par Example:
+/// @code
+/// #include <boost/bind.hpp>
+/// #include <tbb/concurrent_hash_map.h>
+/// #include <openvdb/openvdb.h>
+/// #include <openvdb/io/Queue.h>
+///
+/// using openvdb::io::Queue;
+///
+/// struct MyNotifier
+/// {
+///     // Use a concurrent container, because queue callback functions
+///     // must be thread-safe.
+///     typedef tbb::concurrent_hash_map<Queue::Id, std::string> FilenameMap;
+///     FilenameMap filenames;
+///
+///     // Callback function that prints the status of a completed task.
+///     void callback(Queue::Id id, Queue::Status status)
+///     {
+///         const bool ok = (status == Queue::SUCCEEDED);
+///         FilenameMap::accessor acc;
+///         if (filenames.find(acc, id)) {
+///             std::cout << (ok ? "wrote " : "failed to write ")
+///                 << acc->second << std::endl;
+///             filenames.erase(acc);
+///         }
+///     }
+/// };
+///
+/// int main()
+/// {
+///     // Construct an object to receive notifications from the queue.
+///     // The object's lifetime must exceed the queue's.
+///     MyNotifier notifier;
+///
+///     Queue queue;
+///
+///     // Register the callback() method of the MyNotifier object
+///     // to receive notifications of completed tasks.
+///     queue.addNotifier(boost::bind(&MyNotifier::callback, &notifier, _1, _2));
+///
+///     // Queue grids for output (e.g., for each step of a simulation).
+///     for (int step = 1; step <= 10; ++step) {
+///         openvdb::FloatGrid::Ptr grid = ...;
+///
+///         std::ostringstream os;
+///         os << "mygrid." << step << ".vdb";
+///         const std::string filename = os.str();
+///
+///         Queue::Id id = queue.writeGrid(grid, openvdb::io::File(filename));
+///
+///         // Associate the filename with the ID of the queued task.
+///         MyNotifier::FilenameMap::accessor acc;
+///         notifier.filenames.insert(acc, id);
+///         acc->second = filename;
+///     }
+/// }
+/// @endcode
+/// Output:
+/// @code
+/// wrote mygrid.1.vdb
+/// wrote mygrid.2.vdb
+/// wrote mygrid.4.vdb
+/// wrote mygrid.3.vdb
+/// ...
+/// wrote mygrid.10.vdb
+/// @endcode
+/// Note that tasks do not necessarily complete in the order in which they were queued.
+class OPENVDB_API Queue
+{
+public:
+    /// Default maximum queue length (see setCapacity())
+    static const Index32 DEFAULT_CAPACITY = 100;
+    /// @brief Default maximum time in seconds to wait to queue a task
+    /// when the queue is full (see setTimeout())
+    static const Index32 DEFAULT_TIMEOUT = 120; // seconds
+
+    /// ID number of a queued task or of a registered notification callback
+    typedef Index32 Id;
+
+    /// Status of a queued task
+    enum Status { UNKNOWN, PENDING, SUCCEEDED, FAILED };
+
+
+    /// Construct a queue with the given capacity.
+    explicit Queue(Index32 capacity = DEFAULT_CAPACITY);
+    /// Block until all queued tasks complete (successfully or unsuccessfully).
+    ~Queue();
+
+    /// @brief Return @c true if the queue is empty.
+    bool empty() const;
+    /// @brief Return the number of tasks currently in the queue.
+    Index32 size() const;
+
+    /// @brief Return the maximum number of tasks allowed in the queue.
+    /// @details Once the queue has reached its maximum size, adding
+    /// a new task will block until an existing task has executed.
+    Index32 capacity() const;
+    /// Set the maximum number of tasks allowed in the queue.
+    void setCapacity(Index32);
+
+    /// Return the maximum number of seconds to wait to queue a task when the queue is full.
+    Index32 timeout() const;
+    /// Set the maximum number of seconds to wait to queue a task when the queue is full.
+    void setTimeout(Index32 seconds = DEFAULT_TIMEOUT);
+
+    /// @brief Return the status of the task with the given ID.
+    /// @note Querying the status of a task that has already completed
+    /// (whether successfully or not) removes the task from the status registry.
+    /// Subsequent queries of its status will return UNKNOWN.
+    Status status(Id) const;
+
+    typedef boost::function<void (Id, Status)> Notifier;
+    /// @brief Register a function that will be called with a task's ID
+    /// and status when that task completes, whether successfully or not.
+    /// @return an ID that can be passed to removeNotifier() to deregister the function
+    /// @details When multiple notifiers are registered, they are called
+    /// in the order in which they were registered.
+    /// @warning Notifiers are called from worker threads, so they must be thread-safe
+    /// and their lifetimes must exceed that of the queue.  They must also not call,
+    /// directly or indirectly, addNotifier(), removeNotifier() or clearNotifiers(),
+    /// as that can result in a deadlock.
+    Id addNotifier(Notifier);
+    /// Deregister the notifier with the given ID.
+    void removeNotifier(Id);
+    /// Deregister all notifiers.
+    void clearNotifiers();
+
+    /// @brief Queue a single grid for output to a file or stream.
+    /// @param grid  the grid to be serialized
+    /// @param archive  the io::File or io::Stream to which to output the grid
+    /// @param fileMetadata  optional file-level metadata
+    /// @return an ID with which the status of the queued task can be queried
+    /// @throw RuntimeError if the task cannot be queued within the time limit
+    /// (see setTimeout()) because the queue is full
+    /// @par Example:
+    /// @code
+    /// openvdb::FloatGrid::Ptr grid = ...;
+    ///
+    /// openvdb::io::Queue queue;
+    ///
+    /// // Write the grid to the file mygrid.vdb.
+    /// queue.writeGrid(grid, openvdb::io::File("mygrid.vdb"));
+    ///
+    /// // Stream the grid to a binary string.
+    /// std::ostringstream ostr(std::ios_base::binary);
+    /// queue.writeGrid(grid, openvdb::io::Stream(ostr));
+    /// @endcode
+    Id writeGrid(GridBase::ConstPtr grid, const Archive& archive,
+        const MetaMap& fileMetadata = MetaMap());
+
+    /// @brief Queue a container of grids for output to a file or stream.
+    /// @param grids  any iterable container of grid pointers
+    ///     (e.g., a GridPtrVec or GridPtrSet)
+    /// @param archive  the io::File or io::Stream to which to output the grids
+    /// @param fileMetadata  optional file-level metadata
+    /// @return an ID with which the status of the queued task can be queried
+    /// @throw RuntimeError if the task cannot be queued within the time limit
+    /// (see setTimeout()) because the queue is full
+    /// @par Example:
+    /// @code
+    /// openvdb::FloatGrid::Ptr floatGrid = ...;
+    /// openvdb::BoolGrid::Ptr boolGrid = ...;
+    /// openvdb::GridPtrVec grids;
+    /// grids.push_back(floatGrid);
+    /// grids.push_back(boolGrid);
+    ///
+    /// openvdb::io::Queue queue;
+    ///
+    /// // Write the grids to the file mygrid.vdb.
+    /// queue.write(grids, openvdb::io::File("mygrid.vdb"));
+    ///
+    /// // Stream the grids to a (binary) string.
+    /// std::ostringstream ostr(std::ios_base::binary);
+    /// queue.write(grids, openvdb::io::Stream(ostr));
+    /// @endcode
+    template<typename GridPtrContainer>
+    Id write(const GridPtrContainer& grids, const Archive& archive,
+        const MetaMap& fileMetadata = MetaMap());
+
+private:
+    // Disallow copying of instances of this class.
+    Queue(const Queue&);
+    Queue& operator=(const Queue&);
+
+    Id writeGridVec(const GridCPtrVec&, const Archive&, const MetaMap&);
+
+    struct Impl;
+    boost::shared_ptr<Impl> mImpl;
+}; // class Queue
+
+
+template<typename GridPtrContainer>
+inline Queue::Id
+Queue::write(const GridPtrContainer& container,
+    const Archive& archive, const MetaMap& metadata)
+{
+    // Gather the pointers into a vector of const grid pointers before queuing.
+    GridCPtrVec constGrids(container.begin(), container.end());
+    return writeGridVec(constGrids, archive, metadata);
+}
+
+// Full specialization for GridCPtrVec: the grids are already in the required form
+template<>
+inline Queue::Id
+Queue::write<GridCPtrVec>(const GridCPtrVec& grids,
+    const Archive& archive, const MetaMap& metadata)
+{
+    return writeGridVec(grids, archive, metadata); // forwarded as is, no copy
+}
+
+} // namespace io
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_IO_QUEUE_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/io/Stream.h b/nuparu/include/openvdb_new/io/Stream.h
new file mode 100644
index 00000000..161e0c7f
--- /dev/null
+++ b/nuparu/include/openvdb_new/io/Stream.h
@@ -0,0 +1,120 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_IO_STREAM_HAS_BEEN_INCLUDED
+#define OPENVDB_IO_STREAM_HAS_BEEN_INCLUDED
+
+#include "Archive.h"
+#include <boost/scoped_ptr.hpp>
+#include <iosfwd>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace io {
+
+class GridDescriptor;
+
+
+/// Grid archive associated with arbitrary input and output streams (not necessarily files)
+class OPENVDB_API Stream: public Archive
+{
+public:
+    /// @brief Read grids from an input stream.
+    /// @details If @a delayLoad is true, map the contents of the input stream
+    /// into memory and enable delayed loading of grids.
+    /// @note Define the environment variable @c OPENVDB_DISABLE_DELAYED_LOAD
+    /// to disable delayed loading unconditionally.
+    explicit Stream(std::istream&, bool delayLoad = true);
+
+    /// Construct an archive for stream output without specifying an output stream.
+    Stream();
+    /// Construct an archive for output to the given stream.
+    explicit Stream(std::ostream&);
+
+    Stream(const Stream&);
+    Stream& operator=(const Stream&);
+
+    virtual ~Stream();
+
+    /// @brief Return a copy of this archive.
+    virtual Archive::Ptr copy() const;
+
+    /// Return the file-level metadata in a newly created MetaMap.
+    MetaMap::Ptr getMetadata() const;
+
+    /// Return pointers to the grids that were read from the input stream.
+    GridPtrVecPtr getGrids();
+
+    /// @brief Write the grids in the given container to this archive's output stream.
+    /// @throw ValueError if this archive was constructed without specifying an output stream.
+    virtual void write(const GridCPtrVec&, const MetaMap& = MetaMap()) const;
+
+    /// @brief Write the grids in any iterable container of grid pointers to this archive's output stream.
+    /// @throw ValueError if this archive was constructed without specifying an output stream.
+    template<typename GridPtrContainerT>
+    void write(const GridPtrContainerT&, const MetaMap& = MetaMap()) const;
+
+private:
+    /// Create a new grid of the type specified by the given descriptor,
+    /// then populate the grid from the given input stream.
+    /// @return the newly created grid.
+    GridBase::Ptr readGrid(const GridDescriptor&, std::istream&) const;
+
+    void writeGrids(std::ostream&, const GridCPtrVec&, const MetaMap&) const;
+
+    // Private implementation, held through an opaque pointer
+    struct Impl;
+    boost::scoped_ptr<Impl> mImpl;
+};
+
+
+////////////////////////////////////////
+
+
+template<typename GridPtrContainerT>
+inline void
+Stream::write(const GridPtrContainerT& container, const MetaMap& metadata) const
+{
+    // Copy into a vector of const grid pointers and call the non-template overload.
+    const GridCPtrVec constGrids(container.begin(), container.end());
+    this->write(constGrids, metadata);
+}
+
+} // namespace io
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_IO_STREAM_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/io/TempFile.h b/nuparu/include/openvdb_new/io/TempFile.h
new file mode 100644
index 00000000..94f52a4c
--- /dev/null
+++ b/nuparu/include/openvdb_new/io/TempFile.h
@@ -0,0 +1,81 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file TempFile.h
+
+#ifndef OPENVDB_IO_TEMPFILE_HAS_BEEN_INCLUDED
+#define OPENVDB_IO_TEMPFILE_HAS_BEEN_INCLUDED
+
+#include <openvdb/version.h>
+#include <boost/scoped_ptr.hpp>
+#include <ostream>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace io {
+
+/// @brief Output stream (a std::ostream) that writes to a unique temporary file
+class OPENVDB_API TempFile: public std::ostream
+{
+public:
+    /// @brief Create and open a unique file.
+    /// @details On UNIX systems, the file is created in the directory specified by
+    /// the environment variable @c OPENVDB_TEMP_DIR, if that variable is defined,
+    /// or else in the directory specified by @c TMPDIR, if that variable is defined.
+    /// Otherwise (and on non-UNIX systems), the file is created in the system default
+    /// temporary directory.
+    TempFile();
+    ~TempFile();
+
+    /// Return the path to the temporary file.
+    const std::string& filename() const;
+
+    /// Return @c true if the file is open for writing.
+    bool is_open() const;
+
+    /// Close the file.
+    void close();
+
+private:
+    struct TempFileImpl;
+    boost::scoped_ptr<TempFileImpl> mImpl;
+};
+
+} // namespace io
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_IO_TEMPFILE_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/io/io.h b/nuparu/include/openvdb_new/io/io.h
new file mode 100644
index 00000000..f40f2032
--- /dev/null
+++ b/nuparu/include/openvdb_new/io/io.h
@@ -0,0 +1,277 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_IO_IO_HAS_BEEN_INCLUDED
+#define OPENVDB_IO_IO_HAS_BEEN_INCLUDED
+
+#include <openvdb/Platform.h>
+#include <openvdb/version.h>
+#include <boost/any.hpp>
+#include <boost/function.hpp>
+#include <boost/scoped_ptr.hpp>
+#include <boost/shared_ptr.hpp>
+#include <iosfwd> // for std::ios_base
+#include <map>
+#include <string>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+
+class MetaMap;
+
+namespace io {
+
+/// @brief Container for metadata describing how to unserialize grids from and/or
+/// serialize grids to a stream (which file format, compression scheme, etc. to use)
+/// @details This class is mainly for internal use.
+class OPENVDB_API StreamMetadata
+{
+public:
+    typedef boost::shared_ptr<StreamMetadata> Ptr;
+    typedef boost::shared_ptr<const StreamMetadata> ConstPtr;
+
+    StreamMetadata();
+    StreamMetadata(const StreamMetadata&);
+    explicit StreamMetadata(std::ios_base&);
+    ~StreamMetadata();
+
+    StreamMetadata& operator=(const StreamMetadata&);
+
+    /// @brief Transfer metadata items directly to the given stream.
+    /// @todo Deprecate direct transfer; use StreamMetadata structs everywhere.
+    void transferTo(std::ios_base&) const;
+    /// File format version number (cf. io::getFormatVersion())
+    uint32_t fileVersion() const;
+    void setFileVersion(uint32_t);
+    /// Library version number (cf. io::getLibraryVersion())
+    VersionId libraryVersion() const;
+    void setLibraryVersion(VersionId);
+    /// Compression option flags (cf. io::getDataCompression())
+    uint32_t compression() const;
+    void setCompression(uint32_t);
+    /// Class of the grid being read or written (cf. io::getGridClass())
+    uint32_t gridClass() const;
+    void setGridClass(uint32_t);
+    /// Pointer to the grid's background value (cf. io::getGridBackgroundValuePtr())
+    const void* backgroundPtr() const;
+    void setBackgroundPtr(const void*);
+    /// Whether floats are quantized to 16 bits (cf. io::getHalfFloat())
+    bool halfFloat() const;
+    void setHalfFloat(bool);
+    /// Whether grid statistics are stored as metadata (cf. io::getWriteGridStatsMetadata())
+    bool writeGridStats() const;
+    void setWriteGridStats(bool);
+
+    //@{
+    /// @brief Return a (reference to a) copy of the metadata of the grid currently
+    /// being read or written.
+    /// @details Some grid metadata might duplicate information returned by
+    /// gridClass(), backgroundPtr() and other accessors, but those values
+    /// are not guaranteed to be kept in sync.
+    MetaMap& gridMetadata();
+    const MetaMap& gridMetadata() const;
+    //@}
+
+    typedef std::map<std::string, boost::any> AuxDataMap;
+    //@{
+    /// @brief Return a map that can be populated with arbitrary user data.
+    AuxDataMap& auxData();
+    const AuxDataMap& auxData() const;
+    //@}
+
+    /// Return a string describing this stream metadata.
+    std::string str() const;
+
+private:
+    struct Impl;
+    boost::scoped_ptr<Impl> mImpl;
+}; // class StreamMetadata
+
+
+/// Write a description of the given metadata to an output stream.
+std::ostream& operator<<(std::ostream&, const StreamMetadata&);
+/// Write the given auxiliary data map to an output stream.
+std::ostream& operator<<(std::ostream&, const StreamMetadata::AuxDataMap&);
+
+
+////////////////////////////////////////
+
+
+class File;
+
+/// @brief Handle to control the lifetime of a memory-mapped .vdb file
+class OPENVDB_API MappedFile
+{
+public:
+    typedef boost::shared_ptr<MappedFile> Ptr;
+
+    ~MappedFile();
+
+    /// Return the filename of the mapped file.
+    std::string filename() const;
+
+    /// @brief Return a new stream buffer for the mapped file.
+    /// @details Typical usage is
+    /// @code
+    /// openvdb::io::MappedFile::Ptr mappedFile = ...;
+    /// boost::shared_ptr<std::streambuf> buf = mappedFile->createBuffer();
+    /// std::istream istrm(buf.get());
+    /// // Read from istrm...
+    /// @endcode
+    /// The buffer must persist as long as the stream is open.
+    boost::shared_ptr<std::streambuf> createBuffer() const;
+
+    typedef boost::function<void(std::string /*filename*/)> Notifier;
+    /// @brief Register a function that will be called with this file's name
+    /// when the file is unmapped.
+    void setNotifier(const Notifier&);
+    /// Deregister the notifier.
+    void clearNotifier();
+
+private:
+    friend class File;
+    // The constructor is private, so only this class and its friend File can create instances.
+    explicit MappedFile(const std::string& filename, bool autoDelete = false);
+
+    MappedFile(const MappedFile&); // not copyable
+    MappedFile& operator=(const MappedFile&); // not copyable
+    // Private implementation, held through an opaque pointer
+    class Impl;
+    boost::scoped_ptr<Impl> mImpl;
+}; // class MappedFile
+
+
+////////////////////////////////////////
+
+
+/// Return a string (possibly empty) describing the given system error code.
+std::string getErrorString(int errorNum);
+
+
+/// Return a string (possibly empty) describing the most recent system error.
+std::string getErrorString();
+
+
+////////////////////////////////////////
+
+
+/// @brief Return the file format version number associated with the given input stream.
+/// @sa File::setFormatVersion()
+OPENVDB_API uint32_t getFormatVersion(std::ios_base&);
+
+/// @brief Return the (major, minor) library version number associated with the given input stream.
+/// @sa File::setLibraryVersion()
+OPENVDB_API VersionId getLibraryVersion(std::ios_base&);
+
+/// @brief Return a string of the form "<major>.<minor>/<format>", giving the library
+/// and file format version numbers associated with the given input stream.
+OPENVDB_API std::string getVersion(std::ios_base&);
+
+/// Associate the current file format and library version numbers with the given input stream.
+OPENVDB_API void setCurrentVersion(std::istream&);
+
+/// @brief Associate specific file format and library version numbers with the given stream.
+/// @details This is typically called immediately after reading a header that contains
+/// the version numbers.  Data read subsequently can then be interpreted appropriately.
+OPENVDB_API void setVersion(std::ios_base&, const VersionId& libraryVersion, uint32_t fileVersion);
+
+/// @brief Return a bitwise OR of compression option flags (COMPRESS_ZIP,
+/// COMPRESS_ACTIVE_MASK, etc.) specifying whether and how input data is compressed
+/// or output data should be compressed.
+OPENVDB_API uint32_t getDataCompression(std::ios_base&);
+/// @brief Associate with the given stream a bitwise OR of compression option flags
+/// (COMPRESS_ZIP, COMPRESS_ACTIVE_MASK, etc.) specifying whether and how input data
+/// is compressed or output data should be compressed.
+OPENVDB_API void setDataCompression(std::ios_base&, uint32_t compressionFlags);
+
+/// @brief Return the class (GRID_LEVEL_SET, GRID_UNKNOWN, etc.) of the grid
+/// currently being read from or written to the given stream.
+OPENVDB_API uint32_t getGridClass(std::ios_base&);
+/// @brief Associate with the given stream the class (GRID_LEVEL_SET, GRID_UNKNOWN, etc.)
+/// of the grid currently being read or written.
+OPENVDB_API void setGridClass(std::ios_base&, uint32_t);
+
+/// @brief Return true if floating-point values should be quantized to 16 bits when writing
+/// to the given stream or promoted back from 16-bit to full precision when reading from it.
+OPENVDB_API bool getHalfFloat(std::ios_base&);
+/// @brief Specify whether floating-point values should be quantized to 16 bits when writing
+/// to the given stream or promoted back from 16-bit to full precision when reading from it.
+OPENVDB_API void setHalfFloat(std::ios_base&, bool);
+
+/// @brief Return a pointer to the background value of the grid
+/// currently being read from or written to the given stream.
+OPENVDB_API const void* getGridBackgroundValuePtr(std::ios_base&);
+/// @brief Specify (a pointer to) the background value of the grid
+/// currently being read from or written to the given stream.
+/// @note The pointer must remain valid until the entire grid has been read or written.
+OPENVDB_API void setGridBackgroundValuePtr(std::ios_base&, const void* background);
+
+/// @brief Return @c true if grid statistics (active voxel count and bounding box, etc.)
+/// should be computed and stored as grid metadata when writing to the given stream.
+OPENVDB_API bool getWriteGridStatsMetadata(std::ios_base&);
+/// @brief Specify whether to compute grid statistics (active voxel count and bounding box, etc.)
+/// and store them as grid metadata when writing to the given stream.
+OPENVDB_API void setWriteGridStatsMetadata(std::ios_base&, bool writeGridStats);
+
+/// @brief Return a shared pointer to the memory-mapped file with which the given stream
+/// is associated, or a null pointer if the stream is not associated with a memory-mapped file.
+OPENVDB_API boost::shared_ptr<MappedFile> getMappedFilePtr(std::ios_base&);
+/// @brief Associate the given stream with (a shared pointer to) a memory-mapped file.
+/// @note The shared pointer object (not just the io::MappedFile object to which it points)
+/// must remain valid until the file is closed.
+OPENVDB_API void setMappedFilePtr(std::ios_base&, boost::shared_ptr<MappedFile>&);
+
+/// @brief Return a shared pointer to an object that stores metadata (file format,
+/// compression scheme, etc.) for use when reading from or writing to the given stream.
+OPENVDB_API boost::shared_ptr<StreamMetadata> getStreamMetadataPtr(std::ios_base&);
+/// @brief Associate the given stream with (a shared pointer to) an object that stores
+/// metadata (file format, compression scheme, etc.) for use when reading from
+/// or writing to the stream.
+/// @details If @a transfer is true, copy metadata from the object directly to the stream
+/// (for backward compatibility with older versions of the library).
+/// @note The shared pointer object (not just the io::StreamMetadata object to which it points)
+/// must remain valid until the file is closed.
+OPENVDB_API void setStreamMetadataPtr(std::ios_base&,
+    boost::shared_ptr<StreamMetadata>&, bool transfer = true);
+/// @brief Dissociate the given stream from its metadata object (if it has one)
+/// and return a shared pointer to the object.
+OPENVDB_API boost::shared_ptr<StreamMetadata> clearStreamMetadataPtr(std::ios_base&);
+
+} // namespace io
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_IO_IO_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/math/BBox.h b/nuparu/include/openvdb_new/math/BBox.h
new file mode 100644
index 00000000..15cc8bd6
--- /dev/null
+++ b/nuparu/include/openvdb_new/math/BBox.h
@@ -0,0 +1,467 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_MATH_BBOX_HAS_BEEN_INCLUDED
+#define OPENVDB_MATH_BBOX_HAS_BEEN_INCLUDED
+
+#include "Math.h" // for math::isApproxEqual() and math::Tolerance()
+#include "Vec3.h"
+#include <ostream>
+#include <algorithm> // for min/max
+#include <boost/type_traits/is_integral.hpp>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace math {
+
+/// @brief Axis-aligned bounding box
+template<typename Vec3T>
+class BBox
+{
+public:
+    typedef Vec3T Vec3Type;
+    typedef Vec3T ValueType;
+    typedef Vec3T VectorType;
+    typedef typename Vec3Type::ValueType ElementType;
+
+    /// @brief Default constructor creates an invalid BBox
+    BBox();
+
+    /// @brief Constructor based on a minimum and maximum point.
+    BBox(const Vec3T& xyzMin, const Vec3T& xyzMax);
+
+    /// @brief Constructor based on a minimum and maximum point.
+    /// If sorted is false the points will be sorted by x,y,z component.
+    BBox(const Vec3T& xyzMin, const Vec3T& xyzMax, bool sorted);
+
+    /// @brief Construct a cubical BBox from a minimum coordinate and a
+    /// single edge length.
+    /// @note inclusive for integral <tt>ElementType</tt>s
+    BBox(const Vec3T& xyzMin, const ElementType& length);
+
+    /// @brief Constructor based on a raw array of six values (min x,y,z then
+    /// max x,y,z). If sorted is false the points will be sorted by x,y,z component.
+    explicit BBox(const ElementType* xyz, bool sorted = true);
+
+    /// @brief Copy constructor
+    BBox(const BBox& other);
+
+    /// @brief Sort the min/max by x,y,z component.
+    void sort();
+
+    /// @brief Return a const reference to the minimum point of the BBox
+    const Vec3T& min() const { return mMin; }
+
+    /// @brief Return a const reference to the maximum point of the BBox
+    const Vec3T& max() const { return mMax; }
+
+    /// @brief Return a non-const reference to the minimum point of the BBox
+    Vec3T& min() { return mMin; }
+
+    /// @brief Return a non-const reference to the maximum point of the BBox
+    Vec3T& max() { return mMax; }
+
+    /// @brief Return true if the two BBox'es are identical
+    bool operator==(const BBox& rhs) const;
+
+    /// @brief Return true if the two BBox'es are not identical
+    bool operator!=(const BBox& rhs) const { return !(*this == rhs); }
+
+    /// @brief Return true if the BBox is empty, i.e. has no
+    /// (positive) volume.
+    bool empty() const;
+
+    /// @brief Return true if the BBox has a (positive) volume.
+    bool hasVolume() const { return !this->empty(); }
+
+    /// @brief Return true if the BBox is valid, i.e. has a (positive) volume.
+    operator bool() const { return !this->empty(); }
+
+    /// @brief Return true if all components of mMin <= mMax,
+    /// i.e. the volume is not negative.
+    /// @note For floating point values a tolerance is used for this test.
+    bool isSorted() const;
+
+    /// @brief Return the center point of the BBox
+    Vec3d getCenter() const;
+
+    /// @brief Return the extents of the BBox, i.e. the length per axis
+    /// for floating-point values or the number of grid points per axis
+    /// for integral values.
+    /// @note inclusive for integral <tt>ElementType</tt>s
+    Vec3T extents() const;
+
+    /// @brief Return the volume spanned by this BBox.
+    ElementType volume() const { Vec3T e = this->extents(); return e[0] * e[1] * e[2]; }
+
+    /// Return the index (0, 1 or 2) of the longest axis.
+    size_t maxExtent() const { return MaxIndex(mMax - mMin); }
+
+    /// Return the index (0, 1 or 2) of the shortest axis.
+    size_t minExtent() const { return MinIndex(mMax - mMin); }
+
+    /// Return @c true if point (x, y, z) is inside this bounding box.
+    bool isInside(const Vec3T& xyz) const;
+
+    /// Return @c true if the given bounding box is inside this bounding box.
+    bool isInside(const BBox&) const;
+
+    /// Return @c true if the given bounding box overlaps with this bounding box.
+    bool hasOverlap(const BBox&) const;
+
+    /// Pad this bounding box.
+    void expand(ElementType padding);
+
+    /// Expand this bounding box to enclose point (x, y, z).
+    void expand(const Vec3T& xyz);
+
+    /// Union this bounding box with the given bounding box.
+    void expand(const BBox&);
+    /// @brief Union this bbox with the cubical bbox defined from xyzMin and
+    /// length
+    /// @note inclusive for integral <tt>ElementType</tt>s
+    void expand(const Vec3T& xyzMin, const ElementType& length);
+
+    /// Translate this bounding box by \f$(t_x, t_y, t_z)\f$.
+    void translate(const Vec3T& t);
+
+    /// Apply a map to this bounding box
+    template<typename MapType>
+    BBox applyMap(const MapType& map) const;
+
+    /// Apply the inverse of a map to this bounding box
+    template<typename MapType>
+    BBox applyInverseMap(const MapType& map) const;
+
+    /// Unserialize this bounding box from the given stream.
+    void read(std::istream& is) { mMin.read(is); mMax.read(is); }
+
+    /// Serialize this bounding box to the given stream.
+    void write(std::ostream& os) const { mMin.write(os); mMax.write(os); }
+
+private:
+    Vec3T mMin, mMax;
+}; // class BBox
+
+
+////////////////////////////////////////
+
+
+template<typename Vec3T>
+inline
+BBox<Vec3T>::BBox()
+    // Start from inverted extremes so that the new box reports itself as empty.
+    : mMin(std::numeric_limits<ElementType>::max())
+    , mMax(-std::numeric_limits<ElementType>::max())
+{}
+
+template<typename Vec3T>
+inline
+BBox<Vec3T>::BBox(const Vec3T& xyzMin, const Vec3T& xyzMax)
+    : mMin(xyzMin)
+    , mMax(xyzMax)
+{}
+
+template<typename Vec3T>
+inline
+BBox<Vec3T>::BBox(const Vec3T& xyzMin, const Vec3T& xyzMax, bool sorted)
+    : mMin(xyzMin), mMax(xyzMax)
+{
+    if (!sorted) { this->sort(); } // reorder only if the caller did not vouch for the order
+}
+
+template<typename Vec3T>
+inline
+BBox<Vec3T>::BBox(const Vec3T& xyzMin, const ElementType& length)
+    : mMin(xyzMin), mMax(xyzMin)
+{
+    // Integral bounds are inclusive, so the max corner lies length-1 steps from the min.
+    const ElementType step = boost::is_integral<ElementType>::value ? length-1 : length;
+    for (int axis = 0; axis < 3; ++axis) {
+        mMax[axis] += step;
+    }
+}
+
+// Read six contiguous values: xyz[0..2] is the min corner, xyz[3..5] the max corner.
+template<typename Vec3T>
+inline BBox<Vec3T>::BBox(const ElementType* xyz, bool sorted)
+    : mMin(xyz[0], xyz[1], xyz[2])
+    , mMax(xyz[3], xyz[4], xyz[5])
+{
+    if (!sorted) { this->sort(); }
+}
+
+
+// Copy both corners from other.
+template<typename Vec3T>
+inline BBox<Vec3T>::BBox(const BBox& other)
+    : mMin(other.mMin)
+    , mMax(other.mMax)
+{}
+
+
+////////////////////////////////////////
+
+
+// Empty when min > max on any axis (integral types) or min >= max (all other types).
+template<typename Vec3T>
+inline bool BBox<Vec3T>::empty() const
+{
+    const bool inclusive = boost::is_integral<ElementType>::value;
+    for (int axis = 0; axis < 3; ++axis) {
+        if (inclusive ? (mMin[axis] > mMax[axis]) : (mMin[axis] >= mMax[axis])) return true;
+    }
+    return false;
+}
+
+
+// Integral boxes compare their corners exactly; all other boxes compare them
+// approximately via math::isApproxEqual().
+template<typename Vec3T>
+inline bool BBox<Vec3T>::operator==(const BBox& rhs) const
+{
+    if (!boost::is_integral<ElementType>::value) {
+        return math::isApproxEqual(mMin, rhs.min()) && math::isApproxEqual(mMax, rhs.max());
+    }
+    return mMin == rhs.min() && mMax == rhs.max();
+}
+
+
+// Per axis, put the smaller coordinate in mMin and the larger one in mMax.
+template<typename Vec3T>
+inline void BBox<Vec3T>::sort()
+{
+    for (int axis = 0; axis < 3; ++axis) {
+        const ElementType lo = mMin[axis], hi = mMax[axis];
+        mMin[axis] = std::min(lo, hi);
+        mMax[axis] = std::max(lo, hi);
+    }
+}
+
+
+// True when min <= max on every axis; non-integral types accept min < max + tolerance.
+template<typename Vec3T>
+inline bool BBox<Vec3T>::isSorted() const
+{
+    if (!boost::is_integral<ElementType>::value) {
+        const ElementType eps = math::Tolerance<ElementType>::value();
+        return mMin[0] < (mMax[0] + eps) && mMin[1] < (mMax[1] + eps) &&
+               mMin[2] < (mMax[2] + eps);
+    }
+    return mMin[0] <= mMax[0] && mMin[1] <= mMax[1] && mMin[2] <= mMax[2];
+}
+
+
+template<typename Vec3T>
+inline Vec3d BBox<Vec3T>::getCenter() const
+{
+    const Vec3d lo(mMin.asPointer()), hi(mMax.asPointer()); // corners converted to Vec3d
+    return (lo + hi) * 0.5; // midpoint of the two corners
+}
+
+
+// Return the size of the box along each axis.
+template<typename Vec3T>
+inline Vec3T BBox<Vec3T>::extents() const
+{
+    if (!boost::is_integral<ElementType>::value) {
+        return (mMax - mMin);
+    }
+    // Integral corners are both inclusive, so each axis spans one more than max - min.
+    return (mMax - mMin) + Vec3T(1, 1, 1);
+}
+
+////////////////////////////////////////
+
+
+// Point-in-box test. Integral boxes are closed intervals; other boxes widen each
+// bound by math::Tolerance<ElementType>::value() and compare strictly.
+template<typename Vec3T>
+inline bool BBox<Vec3T>::isInside(const Vec3T& xyz) const
+{
+    if (!boost::is_integral<ElementType>::value) {
+        const ElementType eps = math::Tolerance<ElementType>::value();
+        return xyz[0] > (mMin[0]-eps) && xyz[0] < (mMax[0]+eps) &&
+               xyz[1] > (mMin[1]-eps) && xyz[1] < (mMax[1]+eps) &&
+               xyz[2] > (mMin[2]-eps) && xyz[2] < (mMax[2]+eps);
+    }
+    return mMin[0] <= xyz[0] && xyz[0] <= mMax[0] &&
+           mMin[1] <= xyz[1] && xyz[1] <= mMax[1] &&
+           mMin[2] <= xyz[2] && xyz[2] <= mMax[2];
+}
+
+
+// Box-in-box test. Integral boxes may share faces with this box; for other types
+// b must clear every face of this box by more than math::Tolerance (strict test).
+template<typename Vec3T>
+inline bool BBox<Vec3T>::isInside(const BBox& b) const
+{
+    if (!boost::is_integral<ElementType>::value) {
+        const ElementType eps = math::Tolerance<ElementType>::value();
+        return (b.min()[0]-eps) > mMin[0]  && (b.max()[0]+eps) < mMax[0] &&
+               (b.min()[1]-eps) > mMin[1]  && (b.max()[1]+eps) < mMax[1] &&
+               (b.min()[2]-eps) > mMin[2]  && (b.max()[2]+eps) < mMax[2];
+    }
+    return mMin[0] <= b.min()[0] && b.max()[0] <= mMax[0] &&
+           mMin[1] <= b.min()[1] && b.max()[1] <= mMax[1] &&
+           mMin[2] <= b.min()[2] && b.max()[2] <= mMax[2];
+}
+
+
+// Overlap test. Integral boxes overlap when their closed intervals intersect on all
+// axes; other types widen b by math::Tolerance on each side and compare strictly.
+template<typename Vec3T>
+inline bool BBox<Vec3T>::hasOverlap(const BBox& b) const
+{
+    if (!boost::is_integral<ElementType>::value) {
+        const ElementType eps = math::Tolerance<ElementType>::value();
+        return mMax[0] > (b.min()[0]-eps) && mMin[0] < (b.max()[0]+eps) &&
+               mMax[1] > (b.min()[1]-eps) && mMin[1] < (b.max()[1]+eps) &&
+               mMax[2] > (b.min()[2]-eps) && mMin[2] < (b.max()[2]+eps);
+    }
+    return b.min()[0] <= mMax[0] && mMin[0] <= b.max()[0] &&
+           b.min()[1] <= mMax[1] && mMin[1] <= b.max()[1] &&
+           b.min()[2] <= mMax[2] && mMin[2] <= b.max()[2];
+}
+
+
+////////////////////////////////////////
+
+
+// Grow the box by |dx| on every side; the sign of dx is ignored.
+template<typename Vec3T>
+inline void BBox<Vec3T>::expand(ElementType dx)
+{
+    const ElementType pad = std::abs(dx);
+    for (int axis = 0; axis < 3; ++axis) {
+        mMin[axis] -= pad;
+        mMax[axis] += pad;
+    }
+}
+
+
+// Grow the box just enough to contain the point xyz.
+template<typename Vec3T>
+inline void BBox<Vec3T>::expand(const Vec3T& xyz)
+{
+    for (int axis = 0; axis < 3; ++axis) {
+        if (xyz[axis] < mMin[axis]) mMin[axis] = xyz[axis];
+        if (mMax[axis] < xyz[axis]) mMax[axis] = xyz[axis];
+    }
+}
+
+
+// Grow the box to the union of itself and b.
+template<typename Vec3T>
+inline void BBox<Vec3T>::expand(const BBox& b)
+{
+    for (int axis = 0; axis < 3; ++axis) {
+        if (b.min()[axis] < mMin[axis]) mMin[axis] = b.min()[axis];
+        if (mMax[axis] < b.max()[axis]) mMax[axis] = b.max()[axis];
+    }
+}
+
+// Grow the box to also cover the cube with min corner xyzMin and edge `length`
+// (for integral types the cube's max corner is length-1 away, as corners are inclusive).
+template<typename Vec3T>
+inline void BBox<Vec3T>::expand(const Vec3T& xyzMin, const ElementType& length)
+{
+    const ElementType reach = boost::is_integral<ElementType>::value ? length-1 : length;
+    for (int axis = 0; axis < 3; ++axis) {
+        mMin[axis] = std::min(mMin[axis], xyzMin[axis]);
+        mMax[axis] = std::max(mMax[axis], xyzMin[axis] + reach);
+    } }
+
+
+// Shift both corners by dx. The min corner is updated before the max corner.
+template<typename Vec3T>
+inline void BBox<Vec3T>::translate(const Vec3T& dx)
+{
+    mMin += dx;
+    mMax += dx;
+}
+
+// Push all eight corners through map.applyMap() and return the box enclosing the results.
+template<typename Vec3T>
+template<typename MapType>
+inline BBox<Vec3T> BBox<Vec3T>::applyMap(const MapType& map) const
+{
+    typedef Vec3<double> Vec3R;
+    BBox<Vec3T> mapped; // default-constructed; grown by one corner at a time below
+    mapped.expand(map.applyMap(Vec3R(mMin[0], mMin[1], mMin[2])));
+    mapped.expand(map.applyMap(Vec3R(mMin[0], mMin[1], mMax[2])));
+    mapped.expand(map.applyMap(Vec3R(mMin[0], mMax[1], mMin[2])));
+    mapped.expand(map.applyMap(Vec3R(mMax[0], mMin[1], mMin[2])));
+    mapped.expand(map.applyMap(Vec3R(mMax[0], mMax[1], mMin[2])));
+    mapped.expand(map.applyMap(Vec3R(mMax[0], mMin[1], mMax[2])));
+    mapped.expand(map.applyMap(Vec3R(mMin[0], mMax[1], mMax[2])));
+    mapped.expand(map.applyMap(Vec3R(mMax[0], mMax[1], mMax[2])));
+    return mapped;
+}
+
+// Push all eight corners through map.applyInverseMap() and return the enclosing box.
+template<typename Vec3T>
+template<typename MapType>
+inline BBox<Vec3T> BBox<Vec3T>::applyInverseMap(const MapType& map) const
+{
+    typedef Vec3<double> Vec3R;
+    BBox<Vec3T> unmapped; // default-constructed; grown by one corner at a time below
+    unmapped.expand(map.applyInverseMap(Vec3R(mMin[0], mMin[1], mMin[2])));
+    unmapped.expand(map.applyInverseMap(Vec3R(mMin[0], mMin[1], mMax[2])));
+    unmapped.expand(map.applyInverseMap(Vec3R(mMin[0], mMax[1], mMin[2])));
+    unmapped.expand(map.applyInverseMap(Vec3R(mMax[0], mMin[1], mMin[2])));
+    unmapped.expand(map.applyInverseMap(Vec3R(mMax[0], mMax[1], mMin[2])));
+    unmapped.expand(map.applyInverseMap(Vec3R(mMax[0], mMin[1], mMax[2])));
+    unmapped.expand(map.applyInverseMap(Vec3R(mMin[0], mMax[1], mMax[2])));
+    unmapped.expand(map.applyInverseMap(Vec3R(mMax[0], mMax[1], mMax[2])));
+    return unmapped;
+}
+
+////////////////////////////////////////
+
+
+// Write the box to os as "<min> -> <max>" and return the stream for chaining.
+template<typename Vec3T>
+inline std::ostream& operator<<(std::ostream& os, const BBox<Vec3T>& b)
+{
+    os << b.min() << " -> " << b.max();
+    return os;
+}
+
+} // namespace math
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_MATH_BBOX_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/math/ConjGradient.h b/nuparu/include/openvdb_new/math/ConjGradient.h
new file mode 100644
index 00000000..6e752fef
--- /dev/null
+++ b/nuparu/include/openvdb_new/math/ConjGradient.h
@@ -0,0 +1,1841 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+/// @file   ConjGradient.h
+/// @authors D.J. Hill, Peter Cucka
+/// @brief  Preconditioned conjugate gradient solver (solves @e Ax = @e b using
+///         the conjugate gradient method with one of a selection of preconditioners)
+
+#ifndef OPENVDB_MATH_CONJGRADIENT_HAS_BEEN_INCLUDED
+#define OPENVDB_MATH_CONJGRADIENT_HAS_BEEN_INCLUDED
+
+#include <openvdb/Exceptions.h>
+#include <openvdb/Types.h>
+#include <openvdb/util/logging.h>
+#include <openvdb/util/NullInterrupter.h>
+#include "Math.h" // for Abs(), isZero(), Max(), Sqrt()
+#include <boost/shared_ptr.hpp>
+#include <boost/scoped_array.hpp>
+#include <tbb/parallel_for.h>
+#include <tbb/parallel_reduce.h>
+#include <algorithm> // for std::lower_bound()
+#include <cassert>
+#include <cmath> // for std::isfinite()
+#include <limits>
+#include <sstream>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace math {
+namespace pcg {
+
+typedef Index32 SizeType; // index and size type used for vector lengths and matrix rows/columns
+
+typedef tbb::blocked_range<SizeType> SizeRange; // index sub-range handed to the TBB functors
+
+template<typename ValueType> class Vector;
+
+template<typename ValueType, SizeType STENCIL_SIZE> class SparseStencilMatrix;
+
+template<typename ValueType> class Preconditioner;
+template<typename MatrixType> class JacobiPreconditioner;
+template<typename MatrixType> class IncompleteCholeskyPreconditioner;
+
+/// Information about the state of a conjugate gradient solution (also used to pass termination conditions)
+struct State {
+    bool    success;        // ignored when this State is passed as termination conditions
+    int     iterations;     // as a termination condition: the maximum number of iterations
+    double  relativeError;  // as a termination condition: ||b - Ax|| / ||b|| that denotes convergence
+    double  absoluteError;  // as a termination condition: ||b - Ax|| that denotes convergence
+};
+
+
+/// @brief Return default termination conditions for a conjugate gradient solver.
+/// @details At most 50 iterations, relative error 1e-6, absolute error 100 * epsilon.
+template<typename ValueType>
+inline State terminationDefaults()
+{
+    State defaults;
+    defaults.success = false;
+    defaults.iterations = 50;
+    defaults.relativeError = 1.0e-6;
+    defaults.absoluteError = 100.0 * std::numeric_limits<ValueType>::epsilon();
+    return defaults;
+}
+
+
+////////////////////////////////////////
+
+
+/// @brief Solve @e Ax = @e b via the preconditioned conjugate gradient method.
+///
+/// @param A  a symmetric, positive-definite, @e N x @e N matrix
+/// @param b  a vector of size @e N
+/// @param x  a vector of size @e N
+/// @param preconditioner  a Preconditioner matrix
+/// @param termination  termination conditions given as a State object with the following fields:
+///     <dl>
+///     <dt><i>success</i>
+///     <dd>ignored
+///     <dt><i>iterations</i>
+///     <dd>the maximum number of iterations, with or without convergence
+///     <dt><i>relativeError</i>
+///     <dd>the relative error ||<i>b</i> &minus; <i>A</i>@f$\hat{x}@f$|| / ||<i>b</i>||
+///         that denotes convergence
+///     <dt><i>absoluteError</i>
+///     <dd>the absolute error ||<i>b</i> &minus; <i>A</i>@f$\hat{x}@f$|| that denotes convergence
+///     </dl>
+/// @throw ArithmeticError if either @a x or @a b is not of the appropriate size.
+template<typename PositiveDefMatrix>
+inline State
+solve(
+    const PositiveDefMatrix& A,
+    const Vector<typename PositiveDefMatrix::ValueType>& b,
+    Vector<typename PositiveDefMatrix::ValueType>& x,
+    Preconditioner<typename PositiveDefMatrix::ValueType>& preconditioner,
+    const State& termination = terminationDefaults<typename PositiveDefMatrix::ValueType>());
+
+
+/// @brief Solve @e Ax = @e b via the preconditioned conjugate gradient method.
+///
+/// @param A  a symmetric, positive-definite, @e N x @e N matrix
+/// @param b  a vector of size @e N
+/// @param x  a vector of size @e N
+/// @param preconditioner  a Preconditioner matrix
+/// @param termination  termination conditions given as a State object with the following fields:
+///     <dl>
+///     <dt><i>success</i>
+///     <dd>ignored
+///     <dt><i>iterations</i>
+///     <dd>the maximum number of iterations, with or without convergence
+///     <dt><i>relativeError</i>
+///     <dd>the relative error ||<i>b</i> &minus; <i>A</i>@f$\hat{x}@f$|| / ||<i>b</i>||
+///         that denotes convergence
+///     <dt><i>absoluteError</i>
+///     <dd>the absolute error ||<i>b</i> &minus; <i>A</i>@f$\hat{x}@f$|| denoting convergence</dl>
+/// @param interrupter  an object adhering to the util::NullInterrupter interface
+///     with which computation can be interrupted
+///
+/// @throw ArithmeticError if either @a x or @a b is not of the appropriate size.
+/// @throw RuntimeError if the computation is interrupted.
+template<typename PositiveDefMatrix, typename Interrupter>
+inline State
+solve(
+    const PositiveDefMatrix& A,
+    const Vector<typename PositiveDefMatrix::ValueType>& b,
+    Vector<typename PositiveDefMatrix::ValueType>& x,
+    Preconditioner<typename PositiveDefMatrix::ValueType>& preconditioner,
+    Interrupter& interrupter,
+    const State& termination = terminationDefaults<typename PositiveDefMatrix::ValueType>());
+
+
+////////////////////////////////////////
+
+
+/// Lightweight, variable-length vector that owns a heap-allocated array of @c T
+template<typename T>
+class Vector
+{
+public:
+    typedef T ValueType;
+    typedef boost::shared_ptr<Vector> Ptr;
+
+    /// Construct an empty vector (no storage is allocated).
+    Vector(): mData(NULL), mSize(0) {}
+    /// Construct a vector of @a n elements, with uninitialized values.
+    Vector(SizeType n): mData(new T[n]), mSize(n) {}
+    /// Construct a vector of @a n elements and initialize each element to the given value.
+    Vector(SizeType n, const ValueType& val): mData(new T[n]), mSize(n) { this->fill(val); }
+
+    ~Vector() { mSize = 0; delete[] mData; mData = NULL; } // releases the owned array
+
+    /// Deep copy the given vector.
+    Vector(const Vector&);
+    /// Deep copy the given vector.
+    Vector& operator=(const Vector&);
+
+    /// Return the number of elements in this vector.
+    SizeType size() const { return mSize; }
+    /// Return @c true if this vector has no elements.
+    bool empty() const { return (mSize == 0); }
+
+    /// @brief Reset this vector to have @a n elements, with uninitialized values.
+    /// @warning All of this vector's existing values will be lost.
+    void resize(SizeType n);
+
+    /// Swap internal storage with another vector, which need not be the same size.
+    void swap(Vector& other) { std::swap(mData, other.mData); std::swap(mSize, other.mSize); }
+
+    /// Set all elements of this vector to @a value.
+    void fill(const ValueType& value);
+
+    //@{
+    /// @brief Multiply each element of this vector by @a s.
+    template<typename Scalar> void scale(const Scalar& s);
+    template<typename Scalar> Vector& operator*=(const Scalar& s) { this->scale(s); return *this; }
+    //@}
+
+    /// Return the dot product of this vector with the given vector, which must be the same size.
+    ValueType dot(const Vector&) const;
+
+    /// Return the infinity norm of this vector.
+    ValueType infNorm() const;
+    /// Return the L2 norm of this vector.
+    ValueType l2Norm() const { return Sqrt(this->dot(*this)); }
+
+    /// Return @c true if every element of this vector has a finite value.
+    bool isFinite() const;
+
+    /// @brief Return @c true if this vector is equivalent to the given vector
+    /// to within the specified tolerance.
+    template<typename OtherValueType>
+    bool eq(const Vector<OtherValueType>& other,
+        ValueType eps = Tolerance<ValueType>::value()) const;
+
+    /// Return a string representation of this vector.
+    std::string str() const;
+
+    //@{
+    /// @brief Return the value of this vector's ith element (the index is not bounds-checked).
+    inline T& at(SizeType i) { return mData[i]; }
+    inline const T& at(SizeType i) const { return mData[i]; }
+    inline T& operator[](SizeType i) { return this->at(i); }
+    inline const T& operator[](SizeType i) const { return this->at(i); }
+    //@}
+
+    //@{
+    /// @brief Return a pointer to this vector's elements.
+    inline T* data() { return mData; }
+    inline const T* data() const { return mData; }
+    inline const T* constData() const { return mData; }
+    //@}
+
+private:
+    // Functors for use with tbb::parallel_for()
+    template<typename Scalar> struct ScaleOp;
+    struct DeterministicDotProductOp;
+    // Functors for use with tbb::parallel_reduce()
+    template<typename OtherValueType> struct EqOp;
+    struct InfNormOp;
+    struct IsFiniteOp;
+
+    T* mData;       // owned array of mSize elements (NULL for a default-constructed vector)
+    SizeType mSize; // number of elements in mData
+};
+
+typedef Vector<float> VectorS;  // vector of float elements
+typedef Vector<double> VectorD; // vector of double elements
+//typedef Vector<double> LinearVector;
+
+
+////////////////////////////////////////
+
+
+/// @brief Sparse, square matrix representing a 3D stencil operator of size @a STENCIL_SIZE
+/// @details The implementation is a variation on compressed row storage (CRS).
+template<typename ValueType_, SizeType STENCIL_SIZE>
+class SparseStencilMatrix
+{
+public:
+    typedef ValueType_ ValueType;
+    typedef Vector<ValueType> VectorType;
+    typedef boost::shared_ptr<SparseStencilMatrix> Ptr;
+
+    class ConstValueIter;
+    class ConstRow;
+    class RowEditor;
+
+    static const ValueType sZeroValue; // returned by ConstValueIter::operator*() for an empty row
+
+    /// Construct an @a n x @a n matrix with at most @a STENCIL_SIZE nonzero elements per row.
+    SparseStencilMatrix(SizeType n);
+
+    /// Deep copy the given matrix.
+    SparseStencilMatrix(const SparseStencilMatrix&);
+
+    //@{
+    /// Return the number of rows in this matrix.
+    SizeType numRows() const { return mNumRows; }
+    SizeType size() const { return mNumRows; }
+    //@}
+
+    /// @brief Set the value at the given coordinates.
+    /// @warning It is not safe to set values in the same row simultaneously
+    /// from multiple threads.
+    void setValue(SizeType row, SizeType col, const ValueType&);
+
+    //@{
+    /// @brief Return the value at the given coordinates.
+    /// @warning It is not safe to get values from a row while another thread
+    /// is setting values in that row.
+    const ValueType& getValue(SizeType row, SizeType col) const;
+    const ValueType& operator()(SizeType row, SizeType col) const;
+    //@}
+
+    /// Return a read-only view onto the given row of this matrix.
+    ConstRow getConstRow(SizeType row) const;
+
+    /// Return a read/write view onto the given row of this matrix.
+    RowEditor getRowEditor(SizeType row);
+
+    //@{
+    /// @brief Multiply all elements in the matrix by @a s.
+    template<typename Scalar> void scale(const Scalar& s);
+    template<typename Scalar>
+    SparseStencilMatrix& operator*=(const Scalar& s) { this->scale(s); return *this; }
+    //@}
+
+    /// @brief Multiply this matrix by @a inVec and return the result in @a resultVec.
+    /// @throw ArithmeticError if either @a inVec or @a resultVec is not of size @e N,
+    /// where @e N x @e N is the size of this matrix.
+    template<typename VecValueType>
+    void vectorMultiply(const Vector<VecValueType>& inVec, Vector<VecValueType>& resultVec) const;
+
+    /// @brief Multiply this matrix by the vector represented by the array @a inVec
+    /// and return the result in @a resultVec.
+    /// @warning Both @a inVec and @a resultVec must have at least @e N elements,
+    /// where @e N x @e N is the size of this matrix.
+    template<typename VecValueType>
+    void vectorMultiply(const VecValueType* inVec, VecValueType* resultVec) const;
+
+    /// @brief Return @c true if this matrix is equivalent to the given matrix
+    /// to within the specified tolerance.
+    template<typename OtherValueType>
+    bool eq(const SparseStencilMatrix<OtherValueType, STENCIL_SIZE>& other,
+        ValueType eps = Tolerance<ValueType>::value()) const;
+
+    /// Return @c true if every element of this matrix has a finite value.
+    bool isFinite() const;
+
+    /// Return a string representation of this matrix.
+    std::string str() const;
+
+private:
+    struct RowData { // mutable view of one row's storage
+        RowData(ValueType* v, SizeType* c, SizeType& s): mVals(v), mCols(c), mSize(s) {}
+        ValueType* mVals; SizeType* mCols; SizeType& mSize; // values, column indices, entry count
+    };
+
+    struct ConstRowData { // read-only view of one row's storage
+        ConstRowData(const ValueType* v, const SizeType* c, const SizeType& s):
+            mVals(v), mCols(c), mSize(s) {}
+        const ValueType* mVals; const SizeType* mCols; const SizeType& mSize;
+    };
+
+    /// Base class for row accessors
+    template<typename DataType_ = RowData>
+    class RowBase
+    {
+    public:
+        typedef DataType_ DataType;
+
+        static SizeType capacity() { return STENCIL_SIZE; }
+
+        RowBase(const DataType& data): mData(data) {}
+
+        bool empty() const { return (mData.mSize == 0); }
+        const SizeType& size() const { return mData.mSize; }
+
+        const ValueType& getValue(SizeType columnIdx, bool& active) const;
+        const ValueType& getValue(SizeType columnIdx) const;
+
+        /// Return an iterator over the stored values in this row.
+        ConstValueIter cbegin() const;
+
+        /// @brief Return @c true if this row is equivalent to the given row
+        /// to within the specified tolerance.
+        template<typename OtherDataType>
+        bool eq(const RowBase<OtherDataType>& other,
+            ValueType eps = Tolerance<ValueType>::value()) const;
+
+        /// @brief Return the dot product of this row with the first
+        /// @a vecSize elements of @a inVec.
+        /// @warning @a inVec must have at least @a vecSize elements.
+        template<typename VecValueType>
+        VecValueType dot(const VecValueType* inVec, SizeType vecSize) const;
+
+        /// Return the dot product of this row with the given vector.
+        template<typename VecValueType>
+        VecValueType dot(const Vector<VecValueType>& inVec) const;
+
+        /// Return a string representation of this row.
+        std::string str() const;
+
+    protected:
+        friend class ConstValueIter;
+
+        const ValueType& value(SizeType i) const { return mData.mVals[i]; }
+        SizeType column(SizeType i) const { return mData.mCols[i]; }
+
+        /// @brief Return the array index of the first column index that is
+        /// equal to <i>or greater than</i> the given column index.
+        /// @note If @a columnIdx is larger than any existing column index,
+        /// the return value will point beyond the end of the array.
+        SizeType find(SizeType columnIdx) const;
+
+        DataType mData;
+    };
+
+    typedef RowBase<ConstRowData> ConstRowBase;
+
+public:
+    /// Iterator over the stored values in a row of this matrix
+    class ConstValueIter
+    {
+    public:
+        const ValueType& operator*() const
+        {
+            if (mData.mSize == 0) return SparseStencilMatrix::sZeroValue; // empty row
+            return mData.mVals[mCursor];
+        }
+
+        SizeType column() const { return mData.mCols[mCursor]; }
+
+        void increment() { mCursor++; }
+        ConstValueIter& operator++() { increment(); return *this; }
+        operator bool() const { return (mCursor < mData.mSize); } // true while on a stored entry
+
+        void reset() { mCursor = 0; }
+
+    private:
+        friend class SparseStencilMatrix;
+        ConstValueIter(const RowData& d): mData(d.mVals, d.mCols, d.mSize), mCursor(0) {}
+        ConstValueIter(const ConstRowData& d): mData(d), mCursor(0) {}
+
+        const ConstRowData mData;
+        SizeType mCursor; // position of the current entry within the row
+    };
+
+
+    /// Read-only accessor to a row of this matrix
+    class ConstRow: public ConstRowBase
+    {
+    public:
+        ConstRow(const ValueType* valueHead, const SizeType* columnHead, const SizeType& rowSize);
+    }; // class ConstRow
+
+
+    /// Read/write accessor to a row of this matrix
+    class RowEditor: public RowBase<>
+    {
+    public:
+        RowEditor(ValueType* valueHead, SizeType* columnHead, SizeType& rowSize, SizeType colSize);
+
+        /// Set the number of entries in this row to zero.
+        void clear();
+
+        /// @brief Set the value of the entry in the specified column.
+        /// @return the current number of entries stored in this row.
+        SizeType setValue(SizeType column, const ValueType& value);
+
+        //@{
+        /// @brief Scale all of the entries in this row.
+        template<typename Scalar> void scale(const Scalar&);
+        template<typename Scalar>
+        RowEditor& operator*=(const Scalar& s) { this->scale(s); return *this; }
+        //@}
+
+    private:
+        const SizeType mNumColumns; // used only for bounds checking
+    }; // class RowEditor
+
+private:
+    // Functors for use with tbb::parallel_for()
+    struct MatrixCopyOp;
+    template<typename VecValueType> struct VecMultOp;
+    template<typename Scalar> struct RowScaleOp;
+
+    // Functors for use with tbb::parallel_reduce()
+    struct IsFiniteOp;
+    template<typename OtherValueType> struct EqOp;
+
+    const SizeType                  mNumRows;
+    boost::scoped_array<ValueType>  mValueArray;
+    boost::scoped_array<SizeType>   mColumnIdxArray;
+    boost::scoped_array<SizeType>   mRowSizeArray;
+}; // class SparseStencilMatrix
+
+
+////////////////////////////////////////
+
+
+/// Abstract base class for conjugate gradient preconditioners (subclasses implement apply())
+template<typename T>
+class Preconditioner
+{
+public:
+    typedef T ValueType;
+    typedef boost::shared_ptr<Preconditioner> Ptr;
+
+    template<SizeType STENCIL_SIZE> Preconditioner(const SparseStencilMatrix<T, STENCIL_SIZE>&) {}
+    virtual ~Preconditioner() {}
+
+    virtual bool isValid() const { return true; } // the base implementation always returns true
+
+    /// @brief Apply this preconditioner to a residue vector:
+    ///     @e z = <i>M</i><sup><small>&minus;1</small></sup><i>r</i>
+    /// @param      r  residue vector
+    /// @param[out] z  preconditioned residue vector
+    virtual void apply(const Vector<T>& r, Vector<T>& z) = 0;
+};
+
+
+////////////////////////////////////////
+
+
+namespace internal {
+
+// Body for tbb::parallel_for(): copies from[i] to to[i] for every index i of a sub-range
+template<typename T>
+struct CopyOp
+{
+    CopyOp(const T* from_, T* to_): from(from_), to(to_) {}
+
+    void operator()(const SizeRange& range) const {
+        for (SizeType i = range.begin(), stop = range.end(); i < stop; ++i) { to[i] = from[i]; }
+    }
+
+    const T* from; // source array (read only)
+    T* to;         // destination array
+};
+
+
+// Body for tbb::parallel_for(): assigns val to data[i] for every index i of a sub-range
+template<typename T>
+struct FillOp
+{
+    FillOp(T* data_, const T& val_): data(data_), val(val_) {}
+
+    void operator()(const SizeRange& range) const {
+        for (SizeType i = range.begin(), stop = range.end(); i < stop; ++i) { data[i] = val; }
+    }
+
+    T* data;     // array being filled
+    const T val; // fill value, stored by copy
+};
+
+
+// Body for tbb::parallel_for(): writes out[i] = a * x[i] + y[i] for each index of a sub-range
+template<typename T>
+struct LinearOp
+{
+    LinearOp(const T& a_, const T* x_, const T* y_, T* out_): a(a_), x(x_), y(y_), out(out_) {}
+
+    void operator()(const SizeRange& range) const {
+        if (isExactlyEqual(a, T(1))) { // a == 1: plain sum, no multiplication
+            for (SizeType i = range.begin(), stop = range.end(); i < stop; ++i) out[i] = x[i] + y[i];
+        } else if (isExactlyEqual(a, T(-1))) { // a == -1: negate instead of multiplying
+            for (SizeType i = range.begin(), stop = range.end(); i < stop; ++i) out[i] = -x[i] + y[i];
+        } else { // general case
+            for (SizeType i = range.begin(), stop = range.end(); i < stop; ++i) out[i] = a * x[i] + y[i];
+        }
+    }
+
+    const T a, *x, *y; // scale factor and the two (read-only) input arrays
+    T* out;            // output array
+};
+
+} // namespace internal
+
+
+////////////////////////////////////////
+
+
+// Summarize state as "[succeeded with ]rel. err. R, abs. err. A after N iteration[s]".
+inline std::ostream& operator<<(std::ostream& os, const State& state)
+{
+    const char* const prefix = state.success ? "succeeded with " : "";
+    const char* const plural = (state.iterations == 1) ? "" : "s";
+    return os << prefix << "rel. err. " << state.relativeError << ", abs. err. "
+              << state.absoluteError << " after " << state.iterations << " iteration" << plural;
+}
+
+
+////////////////////////////////////////
+
+
+// Deep copy: allocate other.mSize elements, then copy them with a parallel loop.
+template<typename T>
+inline Vector<T>::Vector(const Vector& other): mData(new T[other.mSize]), mSize(other.mSize)
+{
+    const internal::CopyOp<T> copier(/*from=*/other.mData, /*to=*/mData);
+    tbb::parallel_for(SizeRange(0, mSize), copier);
+}
+
+
+// Deep copy other into this vector, reallocating only when the sizes differ.
+template<typename T>
+inline Vector<T>& Vector<T>::operator=(const Vector<T>& other)
+{
+    if (this == &other) return *this; // self-assignment: nothing to copy
+
+    if (mSize != other.mSize) {
+        // Allocate the new buffer before releasing the old one, so that if new[]
+        // throws, this vector still owns a valid buffer of its recorded size.
+        T* replacement = new T[other.mSize];
+        delete[] mData;
+        mData = replacement;
+        mSize = other.mSize;
+    }
+    tbb::parallel_for(SizeRange(0, mSize),
+        internal::CopyOp<T>(/*from=*/other.mData, /*to=*/mData));
+    return *this;
+}
+
+
+// Reallocate to n uninitialized elements; a no-op when the size is unchanged.
+template<typename T>
+inline void Vector<T>::resize(SizeType n)
+{
+    if (n == mSize) return;
+    T* replacement = new T[n]; // allocate first: a throwing new[] leaves this vector intact
+    delete[] mData;            // delete[] on a null pointer is a no-op
+    mData = replacement;
+    mSize = n;
+}
+
+
+// Assign value to every element using a parallel loop over [0, mSize).
+template<typename T>
+inline void Vector<T>::fill(const ValueType& value)
+{
+    tbb::parallel_for(SizeRange(0, mSize), internal::FillOp<T>(mData, value));
+}
+
+
+template<typename T>
+template<typename Scalar>
+struct Vector<T>::ScaleOp // body for tbb::parallel_for(): multiplies data[i] by s in place
+{
+    ScaleOp(T* data_, const Scalar& s_): data(data_), s(s_) {}
+
+    void operator()(const SizeRange& range) const {
+        for (SizeType i = range.begin(), stop = range.end(); i < stop; ++i) { data[i] *= s; }
+    }
+
+    T* data;        // array being scaled
+    const Scalar s; // scale factor, stored by copy
+};
+
+
+// Multiply every element by s using a parallel loop over [0, mSize).
+template<typename T>
+template<typename Scalar>
+inline void Vector<T>::scale(const Scalar& s)
+{
+    tbb::parallel_for(SizeRange(0, mSize), ScaleOp<Scalar>(mData, s));
+}
+
+
+// Body for tbb::parallel_for() over bin indices. The arrays are split into binCount
+// contiguous bins of arraySize / binCount elements each (the last bin also takes the
+// remainder), and the partial dot product of bin n is written to reducetmp[n].
+template<typename T>
+struct Vector<T>::DeterministicDotProductOp
+{
+    DeterministicDotProductOp(const T* a_, const T* b_,
+                              const SizeType binCount_, const SizeType arraySize_, T* reducetmp_)
+        : a(a_), b(b_), binCount(binCount_), arraySize(arraySize_), reducetmp(reducetmp_) {}
+
+    void operator()(const SizeRange& range) const
+    {
+        const SizeType perBin = arraySize / binCount;
+        const SizeType lastBin = binCount - 1;
+
+        for (SizeType bin = range.begin(), stop = range.end(); bin < stop; ++bin) {
+            // Element range [first, last) covered by this bin
+            const SizeType first = bin * perBin;
+            const SizeType last = (bin == lastBin) ? arraySize : first + perBin;
+
+            // Accumulate this bin's partial sum in element order
+            T partial = zeroVal<T>();
+            for (SizeType i = first; i < last; ++i) {
+                partial += a[i] * b[i];
+            }
+            reducetmp[bin] = partial;
+        }
+    }
+
+    const T* a;               // first operand
+    const T* b;               // second operand
+    const SizeType binCount;  // total number of bins
+    const SizeType arraySize; // number of elements in a and in b
+    T* reducetmp;             // output: one partial sum per bin
+};
+
+// Dot product whose summation order depends only on the array size. Arrays with fewer
+// than 1024 elements are summed serially; larger arrays are split into 100 bins whose
+// partial sums are computed in parallel and then accumulated serially in bin order.
+template<typename T>
+inline T Vector<T>::dot(const Vector<T>& other) const
+{
+    assert(this->size() == other.size());
+
+    const T* lhs = this->data();
+    const T* rhs = other.data();
+    const SizeType count = this->size();
+
+    T total = zeroVal<T>();
+
+    if (count < 1024) {
+        // Small array: a plain serial loop is used.
+
+        for (SizeType i = 0; i < count; ++i) {
+            total += lhs[i] * rhs[i];
+        }
+        return total;
+    }
+
+    // Large array: segment both arrays into a predetermined number of bins and
+    // compute one partial sum per bin in parallel...
+
+    const SizeType binCount = 100;
+    T partialSums[binCount];
+
+    tbb::parallel_for(SizeRange(0, binCount),
+        DeterministicDotProductOp(lhs, rhs, binCount, count, partialSums));
+
+    // ...then accumulate the final result in series, in bin order.
+
+    for (SizeType bin = 0; bin < binCount; ++bin) {
+        total += partialSums[bin];
+    }
+
+    return total;
+}
+
+
+/// Functor for tbb::parallel_reduce() that tracks the largest absolute element value.
+template<typename T>
+struct Vector<T>::InfNormOp
+{
+    InfNormOp(const T* data_): data(data_) {}
+
+    T operator()(const SizeRange& range, T maxValue) const {
+        for (SizeType i = range.begin(), iEnd = range.end(); i < iEnd; ++i) {
+            maxValue = Max(maxValue, Abs(data[i]));
+        }
+        return maxValue;
+    }
+
+    static T join(T max1, T max2) { return Max(max1, max2); }
+
+    const T* data;
+};
+
+
+/// Return the infinity norm (largest absolute element value) of this vector.
+template<typename T>
+inline T
+Vector<T>::infNorm() const
+{
+    // Reduce in parallel over all elements, seeding the running maximum with zero.
+    return tbb::parallel_reduce(SizeRange(0, this->size()), /*seed=*/zeroVal<T>(),
+        InfNormOp(this->data()), InfNormOp::join);
+}
+
+
+/// Functor for tbb::parallel_reduce() that tests whether all elements are finite.
+template<typename T>
+struct Vector<T>::IsFiniteOp
+{
+    IsFiniteOp(const T* data_): data(data_) {}
+
+    bool operator()(const SizeRange& range, bool finite) const
+    {
+        // A non-finite value was already found elsewhere; skip the scan.
+        if (!finite) return false;
+        for (SizeType i = range.begin(), iEnd = range.end(); i < iEnd; ++i) {
+            if (!std::isfinite(data[i])) return false;
+        }
+        return true;
+    }
+
+    static bool join(bool finite1, bool finite2) { return (finite1 && finite2); }
+    const T* data;
+};
+
+
+/// Return @c true if std::isfinite() holds for every element of this vector.
+template<typename T>
+inline bool
+Vector<T>::isFinite() const
+{
+    // Reduce in parallel over all elements, starting from "finite".
+    return tbb::parallel_reduce(SizeRange(0, this->size()), /*seed=*/true,
+        IsFiniteOp(this->data()), IsFiniteOp::join);
+}
+
+
+/// Functor for tbb::parallel_reduce() that compares two arrays with isApproxEqual().
+template<typename T>
+template<typename OtherValueType>
+struct Vector<T>::EqOp
+{
+    EqOp(const T* a_, const OtherValueType* b_, T e): a(a_), b(b_), eps(e) {}
+
+    bool operator()(const SizeRange& range, bool equal) const
+    {
+        // A mismatch was already found elsewhere; skip the scan.
+        if (!equal) return false;
+        for (SizeType i = range.begin(), iEnd = range.end(); i < iEnd; ++i) {
+            if (!isApproxEqual(a[i], b[i], eps)) return false;
+        }
+        return true;
+    }
+
+    static bool join(bool eq1, bool eq2) { return (eq1 && eq2); }
+    const T* a;
+    const OtherValueType* b;
+    const T eps;
+};
+
+
+/// Return @c true if @a other has the same size and all elements match to within @a eps.
+template<typename T>
+template<typename OtherValueType>
+inline bool
+Vector<T>::eq(const Vector<OtherValueType>& other, ValueType eps) const
+{
+    if (this->size() != other.size()) return false;
+    return tbb::parallel_reduce(SizeRange(0, this->size()), /*seed=*/true,
+        EqOp<OtherValueType>(this->data(), other.data(), eps), EqOp<OtherValueType>::join);
+}
+
+
+/// Return a string of the form "[v0, v1, ...]" listing the elements of this vector.
+template<typename T>
+inline std::string Vector<T>::str() const
+{
+    std::ostringstream out;
+    out << "[";
+    const SizeType count = this->size();
+    for (SizeType i = 0; i < count; ++i) {
+        if (i > 0) out << ", ";
+        out << (*this)[i];
+    }
+    out << "]";
+    return out.str();
+}
+
+
+////////////////////////////////////////
+
+
+template<typename ValueType, SizeType STENCIL_SIZE> // shared zero, returned by reference for absent entries
+const ValueType SparseStencilMatrix<ValueType, STENCIL_SIZE>::sZeroValue = zeroVal<ValueType>();
+
+
+/// Construct a matrix with @a numRows rows and storage for STENCIL_SIZE entries per row.
+template<typename ValueType, SizeType STENCIL_SIZE>
+inline SparseStencilMatrix<ValueType, STENCIL_SIZE>::SparseStencilMatrix(SizeType numRows)
+    : mNumRows(numRows)
+    , mValueArray(new ValueType[mNumRows * STENCIL_SIZE])
+    , mColumnIdxArray(new SizeType[mNumRows * STENCIL_SIZE])
+    , mRowSizeArray(new SizeType[mNumRows])
+{
+    // Put the matrix in a null state by setting the size of every row to zero.
+    SizeType* const rowSizes = mRowSizeArray.get();
+    tbb::parallel_for(SizeRange(0, mNumRows), internal::FillOp<SizeType>(rowSizes, /*value=*/0));
+}
+
+
+/// Functor for tbb::parallel_for() that copies value and column-index storage between matrices.
+template<typename ValueType, SizeType STENCIL_SIZE>
+struct SparseStencilMatrix<ValueType, STENCIL_SIZE>::MatrixCopyOp
+{
+    MatrixCopyOp(const SparseStencilMatrix& from_, SparseStencilMatrix& to_):
+        from(&from_), to(&to_) {}
+
+    void operator()(const SizeRange& range) const
+    {
+        const ValueType* const srcValues = from->mValueArray.get();
+        const SizeType* const srcColumns = from->mColumnIdxArray.get();
+        ValueType* const dstValues = to->mValueArray.get();
+        SizeType* const dstColumns = to->mColumnIdxArray.get();
+        for (SizeType i = range.begin(), iEnd = range.end(); i < iEnd; ++i) {
+            dstValues[i] = srcValues[i];
+            dstColumns[i] = srcColumns[i];
+        }
+    }
+    const SparseStencilMatrix* from; SparseStencilMatrix* to;
+};
+
+
+/// Copy constructor: allocate new storage and copy all values, column indices and row sizes.
+template<typename ValueType, SizeType STENCIL_SIZE>
+inline
+SparseStencilMatrix<ValueType, STENCIL_SIZE>::SparseStencilMatrix(const SparseStencilMatrix& other)
+    : mNumRows(other.mNumRows)
+    , mValueArray(new ValueType[mNumRows * STENCIL_SIZE])
+    , mColumnIdxArray(new SizeType[mNumRows * STENCIL_SIZE])
+    , mRowSizeArray(new SizeType[mNumRows])
+{
+    // Values and column indices share one flat index space of numRows * STENCIL_SIZE slots.
+    const SizeType slotCount = mNumRows * STENCIL_SIZE;
+    tbb::parallel_for(SizeRange(0, slotCount), MatrixCopyOp(/*from=*/other, /*to=*/*this));
+
+    // The per-row entry counts live in a separate array of numRows elements.
+    tbb::parallel_for(SizeRange(0, mNumRows),
+        internal::CopyOp<SizeType>(/*from=*/other.mRowSizeArray.get(), /*to=*/mRowSizeArray.get()));
+}
+
+
+/// Set the entry at (@a row, @a col) to @a val via a RowEditor for that row.
+template<typename ValueType, SizeType STENCIL_SIZE>
+inline void SparseStencilMatrix<ValueType, STENCIL_SIZE>::setValue(
+    SizeType row, SizeType col, const ValueType& val)
+{
+    assert(row < mNumRows);
+    this->getRowEditor(row).setValue(col, val);
+}
+
+
+/// Return the value stored at (@a row, @a col), as reported by that row's getValue().
+template<typename ValueType, SizeType STENCIL_SIZE>
+inline const ValueType&
+SparseStencilMatrix<ValueType, STENCIL_SIZE>::getValue(SizeType row, SizeType col) const {
+    assert(row < mNumRows);
+    return this->getConstRow(row).getValue(col);
+}
+
+
+/// Shorthand for getValue(row, col).
+template<typename ValueType, SizeType STENCIL_SIZE>
+inline const ValueType&
+SparseStencilMatrix<ValueType, STENCIL_SIZE>::operator()(SizeType row, SizeType col) const {
+    return this->getValue(row, col);
+}
+
+
+/// Functor for tbb::parallel_for() that scales each row in its range by a scalar.
+template<typename ValueType, SizeType STENCIL_SIZE>
+template<typename Scalar>
+struct SparseStencilMatrix<ValueType, STENCIL_SIZE>::RowScaleOp
+{
+    RowScaleOp(SparseStencilMatrix& m, const Scalar& s_): mat(&m), s(s_) {}
+
+    void operator()(const SizeRange& range) const
+    {
+        for (SizeType rowIdx = range.begin(), rowEnd = range.end(); rowIdx < rowEnd; ++rowIdx) {
+            mat->getRowEditor(rowIdx).scale(s);
+        }
+    }
+
+    SparseStencilMatrix* mat;
+    const Scalar s;
+};
+
+
+/// Scale every row of this matrix by @a s.
+template<typename ValueType, SizeType STENCIL_SIZE>
+template<typename Scalar>
+inline void
+SparseStencilMatrix<ValueType, STENCIL_SIZE>::scale(const Scalar& s) {
+    // Rows are independent of one another, so they are scaled in parallel.
+    tbb::parallel_for(SizeRange(0, mNumRows), RowScaleOp<Scalar>(*this, s));
+}
+
+
+/// Functor for tbb::parallel_for() that stores the dot product of each row with a vector.
+template<typename ValueType, SizeType STENCIL_SIZE>
+template<typename VecValueType>
+struct SparseStencilMatrix<ValueType, STENCIL_SIZE>::VecMultOp
+{
+    VecMultOp(const SparseStencilMatrix& m, const VecValueType* i, VecValueType* o):
+        mat(&m), in(i), out(o) {}
+
+    void operator()(const SizeRange& range) const
+    {
+        for (SizeType rowIdx = range.begin(), rowEnd = range.end(); rowIdx < rowEnd; ++rowIdx) {
+            out[rowIdx] = mat->getConstRow(rowIdx).dot(in, mat->numRows());
+        }
+    }
+
+    const SparseStencilMatrix* mat;
+    const VecValueType* in;
+    VecValueType* out;
+};
+
+
+/// Multiply this matrix by @a inVec into @a resultVec after checking both sizes against numRows.
+template<typename ValueType, SizeType STENCIL_SIZE>
+template<typename VecValueType>
+inline void SparseStencilMatrix<ValueType, STENCIL_SIZE>::vectorMultiply(
+    const Vector<VecValueType>& inVec, Vector<VecValueType>& resultVec) const
+{
+    if (inVec.size() != mNumRows) {
+        OPENVDB_THROW(ArithmeticError, "matrix and input vector have incompatible sizes ("
+            << mNumRows << "x" << mNumRows << " vs. " << inVec.size() << ")");
+    }
+    if (resultVec.size() != mNumRows) {
+        OPENVDB_THROW(ArithmeticError, "matrix and result vector have incompatible sizes ("
+            << mNumRows << "x" << mNumRows << " vs. " << resultVec.size() << ")");
+    }
+    // Both sizes are valid; delegate to the raw-pointer overload.
+    vectorMultiply(inVec.data(), resultVec.data());
+}
+
+
+/// Multiply this matrix by the raw array @a inVec, writing one result per row to @a resultVec.
+/// No size checks are made here; @a resultVec is written at indices 0 to numRows - 1.
+template<typename ValueType, SizeType STENCIL_SIZE>
+template<typename VecValueType>
+inline void SparseStencilMatrix<ValueType, STENCIL_SIZE>::vectorMultiply(
+    const VecValueType* inVec, VecValueType* resultVec) const
+{
+    // Each row's dot product is independent, so rows are processed in parallel.
+    tbb::parallel_for(SizeRange(0, mNumRows), VecMultOp<VecValueType>(*this, inVec, resultVec));
+}
+
+
+/// Functor for tbb::parallel_reduce() that compares two matrices row by row.
+template<typename ValueType, SizeType STENCIL_SIZE>
+template<typename OtherValueType>
+struct SparseStencilMatrix<ValueType, STENCIL_SIZE>::EqOp
+{
+    EqOp(const SparseStencilMatrix& a_,
+        const SparseStencilMatrix<OtherValueType, STENCIL_SIZE>& b_, ValueType e):
+        a(&a_), b(&b_), eps(e) {}
+
+    bool operator()(const SizeRange& range, bool equal) const
+    {
+        // A mismatch was already found elsewhere; skip the scan.
+        if (!equal) return false;
+        for (SizeType rowIdx = range.begin(), rowEnd = range.end(); rowIdx < rowEnd; ++rowIdx) {
+            if (!a->getConstRow(rowIdx).eq(b->getConstRow(rowIdx), eps)) return false;
+        }
+        return true;
+    }
+
+    static bool join(bool eq1, bool eq2) { return (eq1 && eq2); }
+    const SparseStencilMatrix* a;
+    const SparseStencilMatrix<OtherValueType, STENCIL_SIZE>* b;
+    const ValueType eps;
+};
+
+
+/// Return @c true if @a other has the same number of rows and every row matches to within @a eps.
+template<typename ValueType, SizeType STENCIL_SIZE>
+template<typename OtherValueType>
+inline bool
+SparseStencilMatrix<ValueType, STENCIL_SIZE>::eq(
+    const SparseStencilMatrix<OtherValueType, STENCIL_SIZE>& other, ValueType eps) const
+{
+    if (this->numRows() != other.numRows()) return false;
+    return tbb::parallel_reduce(SizeRange(0, this->numRows()), /*seed=*/true,
+        EqOp<OtherValueType>(*this, other, eps), EqOp<OtherValueType>::join);
+}
+
+
+/// Functor for tbb::parallel_reduce() that tests whether all stored values are finite.
+template<typename ValueType, SizeType STENCIL_SIZE>
+struct SparseStencilMatrix<ValueType, STENCIL_SIZE>::IsFiniteOp
+{
+    IsFiniteOp(const SparseStencilMatrix& m): mat(&m) {}
+
+    bool operator()(const SizeRange& range, bool finite) const
+    {
+        if (!finite) return false; // a non-finite value was already found elsewhere
+        for (SizeType rowIdx = range.begin(), rowEnd = range.end(); rowIdx < rowEnd; ++rowIdx) {
+            const ConstRow row = mat->getConstRow(rowIdx);
+            for (ConstValueIter valueIt = row.cbegin(); valueIt; ++valueIt) {
+                if (!std::isfinite(*valueIt)) return false;
+            }
+        }
+        return true;
+    }
+
+    static bool join(bool finite1, bool finite2) { return (finite1 && finite2); }
+
+    const SparseStencilMatrix* mat;
+};
+
+
+/// Return @c true if std::isfinite() holds for every stored value of this matrix.
+template<typename ValueType, SizeType STENCIL_SIZE>
+inline bool
+SparseStencilMatrix<ValueType, STENCIL_SIZE>::isFinite() const
+{
+    // Reduce in parallel over the rows, starting from "finite".
+    return tbb::parallel_reduce(SizeRange(0, this->numRows()), /*seed=*/true,
+        IsFiniteOp(*this), IsFiniteOp::join);
+}
+
+
+/// Return a multi-line string with one "<row index>: <row contents>" line per row.
+template<typename ValueType, SizeType STENCIL_SIZE>
+inline std::string
+SparseStencilMatrix<ValueType, STENCIL_SIZE>::str() const {
+    std::ostringstream out;
+    for (SizeType rowIdx = 0, rowEnd = this->size(); rowIdx < rowEnd; ++rowIdx) {
+        out << rowIdx << ": " << this->getConstRow(rowIdx).str() << "\n";
+    }
+    return out.str();
+}
+
+
+/// Return an editor for row @a i that refers directly to this matrix's storage.
+template<typename ValueType, SizeType STENCIL_SIZE>
+inline typename SparseStencilMatrix<ValueType, STENCIL_SIZE>::RowEditor
+SparseStencilMatrix<ValueType, STENCIL_SIZE>::getRowEditor(SizeType i) {
+    assert(i < mNumRows);
+    const SizeType offset = i * STENCIL_SIZE; // start of this row in the flat arrays
+    return RowEditor(&mValueArray[offset], &mColumnIdxArray[offset], mRowSizeArray[i], mNumRows);
+}
+
+
+/// Return a read-only accessor for row @a i that refers directly to this matrix's storage.
+template<typename ValueType, SizeType STENCIL_SIZE>
+inline typename SparseStencilMatrix<ValueType, STENCIL_SIZE>::ConstRow
+SparseStencilMatrix<ValueType, STENCIL_SIZE>::getConstRow(SizeType i) const {
+    assert(i < mNumRows);
+    const SizeType offset = i * STENCIL_SIZE; // start of this row in the flat arrays
+    return ConstRow(&mValueArray[offset], &mColumnIdxArray[offset], mRowSizeArray[i]);
+}
+
+
+/// Return the offset of the first stored column index >= @a columnIdx, or the row size if none.
+template<typename ValueType, SizeType STENCIL_SIZE>
+template<typename DataType>
+inline SizeType
+SparseStencilMatrix<ValueType, STENCIL_SIZE>::RowBase<DataType>::find(SizeType columnIdx) const
+{
+    if (this->empty()) return mData.mSize;
+    // Binary search; this assumes that the column indices are sorted in ascending order.
+    const SizeType* const first = mData.mCols;
+    const SizeType* const last = first + mData.mSize;
+    const SizeType* const hit = std::lower_bound(first, last, columnIdx);
+    return static_cast<SizeType>(hit - first);
+}
+
+
+/// Return the value stored in column @a columnIdx and set @a active to @c true, or
+/// return a reference to the shared zero value and set @a active to @c false.
+template<typename ValueType, SizeType STENCIL_SIZE>
+template<typename DataType>
+inline const ValueType&
+SparseStencilMatrix<ValueType, STENCIL_SIZE>::RowBase<DataType>::getValue(
+    SizeType columnIdx, bool& active) const
+{
+    const SizeType offset = this->find(columnIdx);
+    active = (offset < this->size() && this->column(offset) == columnIdx);
+    if (active) return this->value(offset);
+    // No entry is stored for this column.
+    return SparseStencilMatrix::sZeroValue;
+}
+
+/// Return the value stored in column @a columnIdx, or the shared zero value if there is none.
+template<typename ValueType, SizeType STENCIL_SIZE>
+template<typename DataType>
+inline const ValueType&
+SparseStencilMatrix<ValueType, STENCIL_SIZE>::RowBase<DataType>::getValue(SizeType columnIdx) const
+{
+    const SizeType offset = this->find(columnIdx);
+    const bool found = (offset < this->size() && this->column(offset) == columnIdx);
+    if (found) return this->value(offset);
+    return SparseStencilMatrix::sZeroValue;
+}
+
+
+/// Return a ConstValueIter constructed from this row's data.
+template<typename ValueType, SizeType STENCIL_SIZE>
+template<typename DataType>
+inline typename SparseStencilMatrix<ValueType, STENCIL_SIZE>::ConstValueIter
+SparseStencilMatrix<ValueType, STENCIL_SIZE>::RowBase<DataType>::cbegin() const {
+    return ConstValueIter(mData);
+}
+
+
+/// Return @c true if both rows store the same columns with values equal to within @a eps.
+template<typename ValueType, SizeType STENCIL_SIZE>
+template<typename DataType>
+template<typename OtherDataType>
+inline bool
+SparseStencilMatrix<ValueType, STENCIL_SIZE>::RowBase<DataType>::eq(
+    const RowBase<OtherDataType>& other, ValueType eps) const
+{
+    if (this->size() != other.size()) return false;
+    for (ConstValueIter it = cbegin(), oit = other.cbegin(); it || oit; ++it, ++oit) {
+        if (it.column() != oit.column() || !isApproxEqual(*it, *oit, eps)) return false;
+    }
+    return true;
+}
+
+
+/// Return the sum of value * inVec[column] over the first min(vecSize, size()) stored entries.
+template<typename ValueType, SizeType STENCIL_SIZE>
+template<typename DataType>
+template<typename VecValueType>
+inline VecValueType SparseStencilMatrix<ValueType, STENCIL_SIZE>::RowBase<DataType>::dot(
+    const VecValueType* inVec, SizeType vecSize) const
+{
+    VecValueType sum = zeroVal<VecValueType>();
+    for (SizeType i = 0, iEnd = std::min(vecSize, this->size()); i < iEnd; ++i) {
+        sum += static_cast<VecValueType>(this->value(i) * inVec[this->column(i)]);
+    }
+    return sum;
+}
+
+/// Return the dot product of this row with @a inVec (forwards to the raw-pointer overload).
+template<typename ValueType, SizeType STENCIL_SIZE>
+template<typename DataType>
+template<typename VecValueType>
+inline VecValueType
+SparseStencilMatrix<ValueType, STENCIL_SIZE>::RowBase<DataType>::dot(
+    const Vector<VecValueType>& inVec) const {
+    return dot(inVec.data(), inVec.size());
+}
+
+
+/// Return a string listing this row's stored entries as "(column, value)" pairs.
+template<typename ValueType, SizeType STENCIL_SIZE>
+template<typename DataType>
+inline std::string
+SparseStencilMatrix<ValueType, STENCIL_SIZE>::RowBase<DataType>::str() const {
+    std::ostringstream out;
+    const SizeType count = this->size();
+    for (SizeType i = 0; i < count; ++i) {
+        if (i > 0) out << ", ";
+        out << "(" << this->column(i) << ", " << this->value(i) << ")";
+    }
+    return out.str();
+}
+
+
+/// Construct a read-only view of one row from its value, column-index and size storage.
+template<typename ValueType, SizeType STENCIL_SIZE>
+inline
+SparseStencilMatrix<ValueType, STENCIL_SIZE>::ConstRow::ConstRow(
+    const ValueType* valueHead, const SizeType* columnHead, const SizeType& rowSize):
+    ConstRowBase(ConstRowData(const_cast<ValueType*>(valueHead),
+        const_cast<SizeType*>(columnHead), const_cast<SizeType&>(rowSize)))
+{}
+
+
+/// Construct an editable view of one row; @a colSize is stored as mNumColumns.
+template<typename ValueType, SizeType STENCIL_SIZE>
+inline
+SparseStencilMatrix<ValueType, STENCIL_SIZE>::RowEditor::RowEditor(
+    ValueType* valueHead, SizeType* columnHead, SizeType& rowSize, SizeType colSize):
+    RowBase<>(RowData(valueHead, columnHead, rowSize)), mNumColumns(colSize)
+{}
+
+
+/// Mark this row as empty by resetting its size to zero.
+template<typename ValueType, SizeType STENCIL_SIZE>
+inline void
+SparseStencilMatrix<ValueType, STENCIL_SIZE>::RowEditor::clear() {
+    // mSize is a reference, so this assignment changes the underlying matrix.
+    RowBase<>::mData.mSize = 0;
+}
+
+
+/// Set the value of the entry in column @a column, inserting a new entry if needed.
+/// @return the number of entries stored in this row after the update
+template<typename ValueType, SizeType STENCIL_SIZE>
+inline SizeType
+SparseStencilMatrix<ValueType, STENCIL_SIZE>::RowEditor::setValue(
+    SizeType column, const ValueType& value)
+{
+    assert(column < mNumColumns);
+
+    RowData& row = RowBase<>::mData;
+
+    // Locate the first stored column index that is not less than the target.
+    // find() never returns more than the current row size.
+    const SizeType pos = this->find(column);
+
+    const bool exists = (pos < row.mSize && row.mCols[pos] == column);
+    if (exists) {
+        // Overwrite in place; the row size is unchanged.
+        row.mVals[pos] = value;
+        return row.mSize;
+    }
+
+    // A new entry is required, so check that the row still has a free slot.
+    assert(row.mSize < this->capacity());
+
+    // Open a gap at pos by moving the tail one slot to the right, back to front.
+    // When pos equals the row size (append), there is no tail and the loop is skipped.
+    for (SizeType i = row.mSize; i > pos; --i) {
+        row.mVals[i] = row.mVals[i - 1];
+        row.mCols[i] = row.mCols[i - 1];
+    }
+    row.mVals[pos] = value;
+    row.mCols[pos] = column;
+
+    // Count the newly inserted entry.
+    ++row.mSize;
+
+    return row.mSize;
+}
+
+
+/// Multiply every stored value in this row by @a s.
+template<typename ValueType, SizeType STENCIL_SIZE>
+template<typename Scalar>
+inline void SparseStencilMatrix<ValueType, STENCIL_SIZE>::RowEditor::scale(const Scalar& s) {
+    // Index with SizeType, the type size() yields, instead of narrowing to a signed int.
+    for (SizeType idx = 0, N = this->size(); idx < N; ++idx) {
+        RowBase<>::mData.mVals[idx] *= s;
+    }
+}
+
+
+////////////////////////////////////////
+
+
+/// Diagonal (Jacobi) preconditioner
+template<typename MatrixType>
+class JacobiPreconditioner: public Preconditioner<typename MatrixType::ValueType>
+{
+private:
+    struct InitOp;
+    struct ApplyOp;
+
+public:
+    typedef typename MatrixType::ValueType ValueType;
+    typedef Preconditioner<ValueType> BaseType;
+    typedef Vector<ValueType> VectorType;
+    typedef boost::shared_ptr<JacobiPreconditioner> Ptr;
+
+    JacobiPreconditioner(const MatrixType& A): BaseType(A), mDiag(A.numRows())
+    {
+        // Fill mDiag with the reciprocals of the diagonal entries of A.
+        tbb::parallel_for(SizeRange(0, A.numRows()), InitOp(A, mDiag.data()));
+    }
+
+    virtual ~JacobiPreconditioner() {}
+
+    /// Set each element of @a z to the matching element of @a r times the stored reciprocal.
+    virtual void apply(const Vector<ValueType>& r, Vector<ValueType>& z)
+    {
+        const SizeType count = mDiag.size();
+        assert(r.size() == z.size());
+        assert(r.size() == count);
+
+        tbb::parallel_for(SizeRange(0, count), ApplyOp(mDiag.data(), r.data(), z.data()));
+    }
+
+    /// Return @c true if all of the stored reciprocals are finite.
+    bool isFinite() const { return mDiag.isFinite(); }
+
+private:
+    // Functor for use with tbb::parallel_for(): vec[n] = 1 / mat(n, n)
+    struct InitOp
+    {
+        InitOp(const MatrixType& m, ValueType* v): mat(&m), vec(v) {}
+        void operator()(const SizeRange& range) const {
+            for (SizeType row = range.begin(), rowEnd = range.end(); row < rowEnd; ++row) {
+                const ValueType diagVal = mat->getValue(row, row);
+                assert(!isApproxZero(diagVal, ValueType(0.0001)));
+                vec[row] = static_cast<ValueType>(1.0 / diagVal);
+            }
+        }
+        const MatrixType* mat; ValueType* vec;
+    };
+
+    // Functor for use with tbb::parallel_for(): out[n] = x[n] * y[n]
+    struct ApplyOp
+    {
+        ApplyOp(const ValueType* x_, const ValueType* y_, ValueType* out_):
+            x(x_), y(y_), out(out_) {}
+        void operator()(const SizeRange& range) const {
+            for (SizeType i = range.begin(), iEnd = range.end(); i < iEnd; ++i) out[i] = x[i] * y[i];
+        }
+        const ValueType *x, *y; ValueType* out;
+    };
+
+    // Reciprocals of the diagonal entries of the matrix
+    VectorType mDiag;
+}; // class JacobiPreconditioner
+
+
+/// Preconditioner using incomplete Cholesky factorization
+template<typename MatrixType>
+class IncompleteCholeskyPreconditioner: public Preconditioner<typename MatrixType::ValueType>
+{
+private:
+    struct CopyToLowerOp;
+    struct TransposeOp;
+
+public:
+    typedef typename MatrixType::ValueType ValueType;
+    typedef Preconditioner<ValueType> BaseType;
+    typedef Vector<ValueType> VectorType;
+    typedef boost::shared_ptr<IncompleteCholeskyPreconditioner> Ptr;
+    typedef SparseStencilMatrix<ValueType, 4>    TriangularMatrix;
+    typedef typename TriangularMatrix::ConstRow  TriangleConstRow;
+    typedef typename TriangularMatrix::RowEditor TriangleRowEditor;
+
+    IncompleteCholeskyPreconditioner(const MatrixType& matrix)
+        : BaseType(matrix)
+        , mLowerTriangular(matrix.numRows())
+        , mUpperTriangular(matrix.numRows())
+        , mTempVec(matrix.numRows())
+    {
+        // Size of matrix
+        const SizeType numRows = mLowerTriangular.numRows();
+
+        // Copy the lower triangle of the matrix (including the diagonal) into mLowerTriangular.
+        tbb::parallel_for(SizeRange(0, numRows), CopyToLowerOp(matrix, mLowerTriangular));
+
+        // Build the Incomplete Cholesky Matrix
+        //
+        // Algorithm:
+        //
+        // for (k = 0; k < size; ++k) {
+        //     A(k,k) = sqrt(A(k,k));
+        //     for (i = k+1; i < size; ++i) {
+        //         if (A(i,k) == 0) continue;
+        //         A(i,k) = A(i,k) / A(k,k);
+        //     }
+        //     for (j = k+1; j < size; ++j) {
+        //         for (i = j; i < size; ++i) {
+        //             if (A(i,j) == 0) continue;
+        //             A(i,j) -= A(i,k)*A(j,k);
+        //         }
+        //     }
+        // }
+
+        mPassedCompatibilityCondition = true;
+
+        for (SizeType k = 0; k < numRows; ++k) {
+
+            TriangleConstRow crow_k = mLowerTriangular.getConstRow(k);
+            ValueType diagonalValue = crow_k.getValue(k);
+
+            // Test if the matrix build has failed (diagonal entry below 1e-5, including negatives).
+            if (diagonalValue < 1.e-5) {
+                mPassedCompatibilityCondition = false;
+                break;
+            }
+
+            diagonalValue = Sqrt(diagonalValue);
+
+            TriangleRowEditor row_k = mLowerTriangular.getRowEditor(k);
+            row_k.setValue(k, diagonalValue);
+
+            // Exploit the fact that the matrix is symmetric.
+            typename MatrixType::ConstRow srcRow = matrix.getConstRow(k);
+            typename MatrixType::ConstValueIter citer = srcRow.cbegin();
+            for ( ; citer; ++citer) {
+                SizeType ii = citer.column();
+                if (ii < k+1) continue; // look above diagonal
+
+                TriangleRowEditor row_ii = mLowerTriangular.getRowEditor(ii);
+
+                row_ii.setValue(k, *citer / diagonalValue);
+            }
+
+            // for (j = k+1; j < size; ++j) replaced by row iter below
+            citer.reset(); // k,j entries
+            for ( ; citer; ++citer) {
+                SizeType j = citer.column();
+                if (j < k+1) continue;
+
+                TriangleConstRow row_j = mLowerTriangular.getConstRow(j);
+                ValueType a_jk = row_j.getValue(k);  // a_jk is non zero if a_kj is non zero
+
+                // Entry (i,j) is non-zero if matrix(j,i) is nonzero
+
+                typename MatrixType::ConstRow mask = matrix.getConstRow(j);
+                typename MatrixType::ConstValueIter maskIter = mask.cbegin();
+                for ( ; maskIter; ++maskIter) {
+                    SizeType i = maskIter.column();
+                    if (i < j) continue;
+
+                    TriangleConstRow crow_i = mLowerTriangular.getConstRow(i);
+                    ValueType a_ij = crow_i.getValue(j);
+                    ValueType a_ik = crow_i.getValue(k);
+                    TriangleRowEditor row_i = mLowerTriangular.getRowEditor(i);
+                    a_ij -= a_ik * a_jk;
+
+                    row_i.setValue(j, a_ij);
+                }
+            }
+        }
+
+        // Build the transpose of the IC matrix: mUpperTriangular
+        tbb::parallel_for(SizeRange(0, numRows),
+            TransposeOp(matrix, mLowerTriangular, mUpperTriangular));
+    }
+
+    virtual ~IncompleteCholeskyPreconditioner() {}
+
+    virtual bool isValid() const { return mPassedCompatibilityCondition; }
+
+    virtual void apply(const Vector<ValueType>& rVec, Vector<ValueType>& zVec)
+    {
+        if (!mPassedCompatibilityCondition) {
+            OPENVDB_THROW(ArithmeticError, "invalid Cholesky decomposition");
+        }
+
+        // Solve mLowerTriangular * mUpperTriangular * zVec = rVec in two triangular sweeps.
+
+        SizeType size = mLowerTriangular.numRows();
+
+        zVec.fill(zeroVal<ValueType>());
+        ValueType* zData = zVec.data();
+
+        if (size == 0) return;
+
+        assert(rVec.size() == size);
+        assert(zVec.size() == size);
+
+        // Zero the preallocated temp vector (mTempVec is sized once in the constructor).
+        mTempVec.fill(zeroVal<ValueType>());
+        ValueType* tmpData = mTempVec.data();
+        const ValueType* rData = rVec.data();
+
+        // Solve mLowerTriangular * tmp = rVec;
+        for (SizeType i = 0; i < size; ++i) {
+            typename TriangularMatrix::ConstRow row = mLowerTriangular.getConstRow(i);
+            ValueType diagonal = row.getValue(i);
+            ValueType dot = row.dot(mTempVec);
+            tmpData[i] = (rData[i] - dot) / diagonal;
+            if (!std::isfinite(tmpData[i])) {
+                OPENVDB_LOG_DEBUG_RUNTIME("1 diagonal was " << diagonal);
+                OPENVDB_LOG_DEBUG_RUNTIME("1a diagonal " << row.getValue(i));
+            }
+        }
+
+        // Solve mUpperTriangular * zVec = tmp;
+        for (SizeType ii = 0; ii < size; ++ii) {
+            SizeType i = size - 1 - ii;
+            typename TriangularMatrix::ConstRow row = mUpperTriangular.getConstRow(i);
+            ValueType diagonal = row.getValue(i);
+            ValueType dot = row.dot(zVec);
+            zData[i] = (tmpData[i] - dot) / diagonal;
+            if (!std::isfinite(zData[i])) {
+                OPENVDB_LOG_DEBUG_RUNTIME("2 diagonal was " << diagonal);
+            }
+        }
+    }
+
+    const TriangularMatrix& lowerMatrix() const { return mLowerTriangular; }
+    const TriangularMatrix& upperMatrix() const { return mUpperTriangular; }
+
+private:
+    // Functor for use with tbb::parallel_for()
+    struct CopyToLowerOp
+    {
+        CopyToLowerOp(const MatrixType& m, TriangularMatrix& l): mat(&m), lower(&l) {}
+        void operator()(const SizeRange& range) const {
+            for (SizeType n = range.begin(), N = range.end(); n < N; ++n) {
+                typename TriangularMatrix::RowEditor outRow = lower->getRowEditor(n);
+                outRow.clear();
+                typename MatrixType::ConstRow inRow = mat->getConstRow(n);
+                for (typename MatrixType::ConstValueIter it = inRow.cbegin(); it; ++it) {
+                    if (it.column() > n) continue; // skip above diagonal
+                    outRow.setValue(it.column(), *it);
+                }
+            }
+        }
+        const MatrixType* mat; TriangularMatrix* lower;
+    };
+
+    // Functor for use with tbb::parallel_for()
+    struct TransposeOp
+    {
+        TransposeOp(const MatrixType& m, const TriangularMatrix& l, TriangularMatrix& u):
+            mat(&m), lower(&l), upper(&u) {}
+        void operator()(const SizeRange& range) const {
+            for (SizeType n = range.begin(), N = range.end(); n < N; ++n) {
+                typename TriangularMatrix::RowEditor outRow = upper->getRowEditor(n);
+                outRow.clear();
+                // Use the fact that matrix is symmetric.
+                typename MatrixType::ConstRow inRow = mat->getConstRow(n);
+                for (typename MatrixType::ConstValueIter it = inRow.cbegin(); it; ++it) {
+                    const SizeType column = it.column();
+                    if (column < n) continue; // only set upper triangle
+                    outRow.setValue(column, lower->getValue(column, n));
+                }
+            }
+        }
+        const MatrixType* mat; const TriangularMatrix* lower; TriangularMatrix* upper;
+    };
+
+    TriangularMatrix  mLowerTriangular;
+    TriangularMatrix  mUpperTriangular;
+    Vector<ValueType> mTempVec;
+    bool              mPassedCompatibilityCondition;
+}; // class IncompleteCholeskyPreconditioner
+
+
+////////////////////////////////////////
+
+
+namespace internal {
+
+/// Compute @e ax + @e y over raw arrays of @a size elements,
+/// writing the result to @a resultVec.
+template<typename T>
+inline void axpy(const T& a, const T* xVec, const T* yVec, T* resultVec, SizeType size)
+{
+    tbb::parallel_for(SizeRange(0, size), LinearOp<T>(a, xVec, yVec, resultVec));
+}
+
+/// Compute @e ax + @e y for equally sized vectors, writing the result to @a result.
+template<typename T>
+inline void axpy(const T& a, const Vector<T>& xVec, const Vector<T>& yVec, Vector<T>& result)
+{
+    const SizeType size = xVec.size();
+    assert(size == yVec.size());
+    assert(size == result.size());
+    axpy(a, xVec.data(), yVec.data(), result.data(), size);
+}
+
+
+/// Compute @e r = @e b &minus; @e Ax over raw arrays.
+/// @a r holds the intermediate product before the subtraction.
+template<typename MatrixOperator, typename VecValueType>
+inline void computeResidual(const MatrixOperator& A, const VecValueType* x,
+    const VecValueType* b, VecValueType* r)
+{
+    // First store the product A * x in r ...
+    A.vectorMultiply(x, r);
+    // ... then overwrite r in place with b - r.
+    tbb::parallel_for(SizeRange(0, A.numRows()), LinearOp<VecValueType>(-1.0, r, b, r));
+}
+
+/// Compute @e r = @e b &minus; @e Ax; all three vectors must have A.numRows() elements.
+template<typename MatrixOperator, typename T>
+inline void
+computeResidual(const MatrixOperator& A, const Vector<T>& x, const Vector<T>& b, Vector<T>& r)
+{
+    assert(x.size() == b.size());
+    assert(x.size() == r.size());
+    assert(x.size() == A.numRows());
+    // Delegate to the raw-pointer overload.
+    computeResidual(A, x.data(), b.data(), r.data());
+}
+
+} // namespace internal
+
+
+////////////////////////////////////////
+
+
+/// Convenience overload of solve() that runs with a util::NullInterrupter.
+template<typename PositiveDefMatrix>
+inline State solve(
+    const PositiveDefMatrix& Amat,
+    const Vector<typename PositiveDefMatrix::ValueType>& bVec,
+    Vector<typename PositiveDefMatrix::ValueType>& xVec,
+    Preconditioner<typename PositiveDefMatrix::ValueType>& precond,
+    const State& termination)
+{
+    util::NullInterrupter nullInterrupter;
+    return solve(Amat, bVec, xVec, precond, nullInterrupter, termination);
+}
+
+
+template<typename PositiveDefMatrix, typename Interrupter>
+inline State
+solve(
+    const PositiveDefMatrix& Amat,
+    const Vector<typename PositiveDefMatrix::ValueType>& bVec,
+    Vector<typename PositiveDefMatrix::ValueType>& xVec,
+    Preconditioner<typename PositiveDefMatrix::ValueType>& precond,
+    Interrupter& interrupter,
+    const State& termination)
+{
+    typedef typename PositiveDefMatrix::ValueType ValueType;
+    typedef Vector<ValueType> VectorType;
+    // Start from a "not converged" result; it is returned unchanged for an empty system.
+    State result;
+    result.success = false;
+    result.iterations = 0;
+    result.relativeError = 0.0;
+    result.absoluteError = 0.0;
+
+    const SizeType size = Amat.numRows();
+    if (size == 0) {
+        OPENVDB_LOG_WARN("pcg::solve(): matrix has dimension zero");
+        return result;
+    }
+    if (size != bVec.size()) {
+        OPENVDB_THROW(ArithmeticError, "A and b have incompatible sizes ("
+            << size << "x" << size << " vs. " << bVec.size() << ")");
+    }
+    if (size != xVec.size()) {
+        OPENVDB_THROW(ArithmeticError, "A and x have incompatible sizes ("
+            << size << "x" << size << " vs. " << xVec.size() << ")");
+    }
+
+    // Work vectors, each with one entry per matrix row
+    VectorType zVec(size); // transformed residual (M^-1 r)
+    VectorType pVec(size); // search direction
+    VectorType qVec(size); // A * p
+
+    // Infinity norm of b (the source); 1 is substituted when it is zero so the division is safe.
+    const ValueType tmp = bVec.infNorm();
+    const ValueType infNormOfB = isZero(tmp) ? 1.f : tmp;
+
+    // Compute rVec: residual = b - Ax, using the caller's xVec as the initial guess.
+    VectorType rVec(size); // vector of residuals
+
+    internal::computeResidual(Amat, xVec, bVec, rVec);
+
+    assert(rVec.isFinite());
+
+    // Normalize the residual norm with the source norm; only the relative tolerance is tested here.
+    result.absoluteError = static_cast<double>(rVec.infNorm());
+    result.relativeError = static_cast<double>(result.absoluteError / infNormOfB);
+    if (result.relativeError <= termination.relativeError) {
+        result.success = true;
+        return result;
+    }
+
+    // Iterations of the CG solve
+
+    ValueType rDotZPrev(1); // inner product <r,z> from the previous iteration
+
+    // Keep track of the minimum error to monitor convergence.
+    ValueType minL2Error = std::numeric_limits<ValueType>::max();
+    ValueType l2Error;
+
+    int iteration = 0;
+    for ( ; iteration < termination.iterations; ++iteration) {
+
+        if (interrupter.wasInterrupted()) {
+            OPENVDB_THROW(RuntimeError, "conjugate gradient solver was interrupted");
+        }
+
+        OPENVDB_LOG_DEBUG_RUNTIME("pcg::solve() " << result);
+
+        result.iterations = iteration + 1;
+
+        // Apply preconditioner to residual
+        // z_{k} = M^-1 r_{k}
+        precond.apply(rVec, zVec);
+
+        // <r,z>
+        const ValueType rDotZ = rVec.dot(zVec);
+        assert(std::isfinite(rDotZ));
+
+        if (0 == iteration) {
+            // First pass: the search direction is the preconditioned residual itself.
+            pVec = zVec;
+        } else {
+            const ValueType beta = rDotZ / rDotZPrev;
+            // p = beta * p + z
+            internal::axpy(beta, pVec, zVec, /*result=*/pVec);
+        }
+
+        // q_{k} = A p_{k}
+        Amat.vectorMultiply(pVec, qVec);
+
+        // alpha = <r_{k-1}, z_{k-1}> / <p_{k},q_{k}>
+        const ValueType pAp = pVec.dot(qVec);
+        assert(std::isfinite(pAp));
+
+        const ValueType alpha = rDotZ / pAp;
+        rDotZPrev = rDotZ;
+
+        // x_{k} = x_{k-1} + alpha * p_{k}
+        internal::axpy(alpha, pVec, xVec, /*result=*/xVec);
+
+        // r_{k} = r_{k-1} - alpha * q_{k}
+        internal::axpy(-alpha, qVec, rVec, /*result=*/rVec);
+
+        // update tolerances
+        l2Error = rVec.l2Norm();
+        minL2Error = Min(l2Error, minL2Error);
+
+        result.absoluteError = static_cast<double>(rVec.infNorm());
+        result.relativeError = static_cast<double>(result.absoluteError / infNormOfB);
+
+        if (l2Error > 2 * minL2Error) {
+            // The solution started to diverge (residual doubled relative to the best seen).
+            result.success = false;
+            break;
+        }
+        if (!std::isfinite(result.absoluteError)) {
+            // Total divergence of solution
+            result.success = false;
+            break;
+        }
+        if (result.absoluteError <= termination.absoluteError) {
+            // Convergence (absolute tolerance met)
+            result.success = true;
+            break;
+        }
+        if (result.relativeError <= termination.relativeError) {
+            // Convergence (relative tolerance met)
+            result.success = true;
+            break;
+        }
+    }
+    OPENVDB_LOG_DEBUG_RUNTIME("pcg::solve() " << result);
+
+    return result;
+}
+
+} // namespace pcg
+} // namespace math
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_MATH_CONJGRADIENT_HAS_BEEN_INCLUDED
+
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
diff --git a/nuparu/include/openvdb_new/math/Coord.h b/nuparu/include/openvdb_new/math/Coord.h
new file mode 100644
index 00000000..01a4893c
--- /dev/null
+++ b/nuparu/include/openvdb_new/math/Coord.h
@@ -0,0 +1,528 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_MATH_COORD_HAS_BEEN_INCLUDED
+#define OPENVDB_MATH_COORD_HAS_BEEN_INCLUDED
+
+#include <openvdb/Platform.h>
+#include "Math.h"
+#include "Vec3.h"
+
+namespace tbb { class split; } // forward declaration
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace math {
+
+/// @brief Signed (x, y, z) coordinates held as three 32-bit integers
+class Coord
+{
+public:
+    typedef int32_t Int32;
+    typedef uint32_t Index32;
+    typedef Vec3<Int32> Vec3i;
+    typedef Vec3<Index32> Vec3I;
+
+    typedef Int32 ValueType;
+    typedef std::numeric_limits<ValueType> Limits;
+    // Constructors: each one forwards its components to reset().
+    Coord() { this->reset(0); }
+    explicit Coord(Int32 xyz) { this->reset(xyz); }
+    Coord(Int32 x, Int32 y, Int32 z) { this->reset(x, y, z); }
+    explicit Coord(const Vec3i& v) { this->reset(v[0], v[1], v[2]); }
+    explicit Coord(const Vec3I& v)
+    {
+        this->reset(Int32(v[0]), Int32(v[1]), Int32(v[2])); // unsigned to signed
+    }
+    explicit Coord(const Int32* v) { this->reset(v[0], v[1], v[2]); }
+
+    /// @brief Return the coordinate whose three components are Limits::min()
+    static Coord min() { const Int32 lowest = Limits::min(); return Coord(lowest); }
+
+    /// @brief Return the coordinate whose three components are Limits::max()
+    static Coord max() { const Int32 highest = Limits::max(); return Coord(highest); }
+
+    /// @brief Round @a xyz to the closest integer coordinates (cell centered conversion).
+    template<typename T> static Coord round(const Vec3<T>& xyz)
+    {
+        const Int32 i = Int32(Round(xyz[0])), j = Int32(Round(xyz[1])), k = Int32(Round(xyz[2]));
+        return Coord(i, j, k);
+    }
+    /// @brief Largest integer coordinates not greater than @a xyz (node centered conversion).
+    template<typename T> static Coord floor(const Vec3<T>& xyz)
+    {
+        const Int32 i = Int32(Floor(xyz[0])), j = Int32(Floor(xyz[1])), k = Int32(Floor(xyz[2]));
+        return Coord(i, j, k);
+    }
+
+    /// @brief Smallest integer coordinates not less than @a xyz (components are rounded up).
+    template<typename T> static Coord ceil(const Vec3<T>& xyz)
+    {
+        const Int32 i = Int32(Ceil(xyz[0])), j = Int32(Ceil(xyz[1])), k = Int32(Ceil(xyz[2]));
+        return Coord(i, j, k);
+    }
+
+    Coord& reset(Int32 x, Int32 y, Int32 z)
+    {
+        // Chain the per-axis setters; each of them returns *this.
+        return this->setX(x).setY(y).setZ(z);
+    }
+    Coord& reset(Int32 xyz) { return this->reset(xyz, xyz, xyz); }
+
+    Coord& setX(Int32 x) { mVec[0] = x; return *this; }
+    Coord& setY(Int32 y) { mVec[1] = y; return *this; }
+    Coord& setZ(Int32 z) { mVec[2] = z; return *this; }
+
+    Coord& offset(Int32 dx, Int32 dy, Int32 dz)
+    {
+        // Same as adding the coordinate (dx, dy, dz) in place.
+        return *this += Coord(dx, dy, dz);
+    }
+    Coord& offset(Int32 n) { return this->offset(n, n, n); }
+    Coord offsetBy(Int32 dx, Int32 dy, Int32 dz) const
+    {
+        return Coord(this->x() + dx, this->y() + dy, this->z() + dz);
+    }
+    Coord offsetBy(Int32 n) const { return this->offsetBy(n, n, n); }
+
+    Coord& operator+=(const Coord& rhs)
+    {
+        mVec[0] += rhs.x(); mVec[1] += rhs.y(); mVec[2] += rhs.z();
+        return *this;
+    }
+    Coord& operator-=(const Coord& rhs)
+    {
+        mVec[0] -= rhs.x(); mVec[1] -= rhs.y(); mVec[2] -= rhs.z();
+        return *this;
+    }
+    Coord operator+(const Coord& rhs) const
+    {
+        return Coord(*this) += rhs;
+    }
+    Coord operator-(const Coord& rhs) const
+    {
+        return Coord(*this) -= rhs;
+    }
+    Coord operator-() const { return Coord(-this->x(), -this->y(), -this->z()); }
+
+    Coord  operator>> (size_t n) const { return Coord(*this) >>= n; }
+    Coord  operator<< (size_t n) const { return Coord(*this) <<= n; }
+    Coord& operator<<=(size_t n) { for (int i = 0; i < 3; ++i) mVec[i] <<= n; return *this; }
+    Coord& operator>>=(size_t n) { for (int i = 0; i < 3; ++i) mVec[i] >>= n; return *this; }
+    Coord  operator&  (Int32 n) const { return Coord(*this) &= n; }
+    Coord  operator|  (Int32 n) const { return Coord(*this) |= n; }
+    Coord& operator&= (Int32 n) { for (int i = 0; i < 3; ++i) mVec[i] &= n; return *this; }
+    Coord& operator|= (Int32 n) { for (int i = 0; i < 3; ++i) mVec[i] |= n; return *this; }
+
+    Int32 x() const { return mVec[0]; }
+    Int32 y() const { return mVec[1]; }
+    Int32 z() const { return mVec[2]; }
+    Int32 operator[](size_t i) const { assert(i < 3); return mVec[i]; } // index checked in debug
+    Int32& x() { return mVec[0]; }
+    Int32& y() { return mVec[1]; }
+    Int32& z() { return mVec[2]; }
+    Int32& operator[](size_t i) { assert(i < 3); return mVec[i]; } // index checked in debug
+
+    const Int32* data() const { return mVec; }
+    Int32* data() { return mVec; }
+    const Int32* asPointer() const { return this->data(); }
+    Int32* asPointer() { return this->data(); }
+    Vec3d asVec3d() const { return Vec3d(double(x()), double(y()), double(z())); }
+    Vec3s asVec3s() const { return Vec3s(float(x()), float(y()), float(z())); }
+    Vec3i asVec3i() const { return Vec3i(x(), y(), z()); }
+    Vec3I asVec3I() const { return Vec3I(Index32(x()), Index32(y()), Index32(z())); }
+    void asXYZ(Int32& x, Int32& y, Int32& z) const { x = mVec[0]; y = mVec[1]; z = mVec[2]; }
+
+    bool operator==(const Coord& rhs) const
+    {
+        return !(mVec[0] != rhs.mVec[0] || mVec[1] != rhs.mVec[1] || mVec[2] != rhs.mVec[2]);
+    }
+    bool operator!=(const Coord& rhs) const { return !(*this == rhs); }
+
+    /// Lexicographic less than (x is compared first, then y, then z)
+    bool operator<(const Coord& rhs) const
+    {
+        if (this->x() != rhs.x()) return this->x() < rhs.x();
+        if (this->y() != rhs.y()) return this->y() < rhs.y();
+        return this->z() < rhs.z();
+    }
+    /// Lexicographic less than or equal to (x is compared first, then y, then z)
+    bool operator<=(const Coord& rhs) const
+    {
+        if (this->x() != rhs.x()) return this->x() < rhs.x();
+        if (this->y() != rhs.y()) return this->y() < rhs.y();
+        return this->z() <= rhs.z();
+    }
+    /// Lexicographic greater than
+    bool operator>(const Coord& rhs) const { return rhs < *this; }
+    /// Lexicographic greater than or equal to
+    bool operator>=(const Coord& rhs) const { return rhs <= *this; }
+
+    /// Replace each component by the smaller of itself and the matching one in @a other.
+    void minComponent(const Coord& other)
+    {
+        if (other.mVec[0] < mVec[0]) mVec[0] = other.mVec[0];
+        if (other.mVec[1] < mVec[1]) mVec[1] = other.mVec[1];
+        if (other.mVec[2] < mVec[2]) mVec[2] = other.mVec[2];
+    }
+
+    /// Replace each component by the larger of itself and the matching one in @a other.
+    void maxComponent(const Coord& other)
+    {
+        if (mVec[0] < other.mVec[0]) mVec[0] = other.mVec[0];
+        if (mVec[1] < other.mVec[1]) mVec[1] = other.mVec[1];
+        if (mVec[2] < other.mVec[2]) mVec[2] = other.mVec[2];
+    }
+
+    /// Return the component-wise minimum of the two Coords.
+    static inline Coord minComponent(const Coord& lhs, const Coord& rhs)
+    {
+        Coord lowest(lhs);
+        lowest.minComponent(rhs);
+        return lowest;
+    }
+
+    /// Return the component-wise maximum of the two Coords.
+    static inline Coord maxComponent(const Coord& lhs, const Coord& rhs)
+    {
+        Coord highest(lhs);
+        highest.maxComponent(rhs);
+        return highest;
+    }
+
+    /// Return true if at least one component of @a a is smaller than the
+    /// corresponding component of @a b (this is not a lexicographic comparison).
+    static inline bool lessThan(const Coord& a, const Coord& b)
+    {
+        return a.x() < b.x() || a.y() < b.y() || a.z() < b.z();
+    }
+
+    /// @brief Return the index (0, 1 or 2) with the smallest value.
+    size_t minIndex() const { return MinIndex(mVec); }
+
+    /// @brief Return the index (0, 1 or 2) with the largest value.
+    size_t maxIndex() const { return MaxIndex(mVec); }
+    // Stream I/O copies the sizeof(mVec) bytes of the component array verbatim.
+    void read(std::istream& is) { is.read(reinterpret_cast<char*>(this->data()), sizeof(mVec)); }
+    void write(std::ostream& os) const
+    {
+        os.write(reinterpret_cast<const char*>(this->data()), sizeof(mVec));
+    }
+
+private:
+
+    Int32 mVec[3];
+}; // class Coord
+
+
+////////////////////////////////////////
+
+
+/// @brief Axis-aligned bounding box of signed integer coordinates
+/// @note The range of the integer coordinates, [min, max], is inclusive.
+/// Thus, a bounding box with min = max is not empty but rather encloses
+/// a single coordinate.
+class CoordBBox
+{
+public:
+    typedef uint64_t         Index64;
+    typedef Coord::ValueType ValueType;
+
+    /// @brief Iterator over Coord domain covered by a CoordBBox
+    ///
+    /// @note If ZYX is true Z is the fastest moving coordinate, else
+    /// it is the X coordinate, i.e. XYZ traversal
+    template<bool ZYX>
+    class Iterator {
+    public:
+        /// @brief C-tor from a bounding box; iteration starts at the box's minimum corner
+        Iterator(const CoordBBox &b) : mPos(b.min()), mMin(b.min()), mMax(b.max()) {}
+        /// @brief Increments iterator to point to the next coordinate
+        /// @note Stops at the last + 1 coordinate of the bounding box
+        /// as defined by the template parameter.
+        Iterator& operator++() {
+            ZYX ? this->next<2,1,0>() : this->next<0,1,2>();
+            return *this;
+        }
+        /// @brief Return true if the iterator still points to a valid coordinate
+        operator bool() const {
+            return ZYX ? mPos[0] <= mMax[0] : mPos[2] <= mMax[2];
+        }
+        /// @brief Return a const reference to the coordinate currently pointed to
+        const Coord& operator*() const { return mPos; }
+    private:
+        // Advance with axis a moving fastest, then b, then c (the slowest axis).
+        template<size_t a, size_t b, size_t c>
+        inline void next() {
+            if ( mPos[a] < mMax[a] )  {//by far this is the most common case
+                ++mPos[a];
+            } else if ( mPos[b] < mMax[b] )  {
+                mPos[a] = mMin[a];
+                ++mPos[b];
+            } else if ( mPos[c] <= mMax[c] ) {
+                mPos[a] = mMin[a];
+                mPos[b] = mMin[b];
+                ++mPos[c];
+            }
+        }
+        Coord mPos, mMin, mMax;
+    };// CoordBBox::Iterator
+
+    /// @brief The default constructor produces an empty bounding box.
+    CoordBBox(): mMin(Coord::max()), mMax(Coord::min()) {}
+    /// @brief Construct a bounding box with the given @a min and @a max bounds.
+    CoordBBox(const Coord& min, const Coord& max): mMin(min), mMax(max) {}
+    /// @brief Splitting constructor for use in TBB ranges; halves @a other along its longest axis
+    /// @note The other bounding box is assumed to be divisible.
+    CoordBBox(CoordBBox& other, const tbb::split&): mMin(other.mMin), mMax(other.mMax)
+    {
+        assert(this->is_divisible());
+        const size_t n = this->maxExtent();
+        mMax[n] = mMin[n] + ((mMax[n] - mMin[n]) >> 1); // midpoint without forming min + max
+        other.mMin[n] = mMax[n] + 1;
+    }
+    /// @brief Return the cube with minimum corner @a min and @a dim coordinates along each axis.
+    static CoordBBox createCube(const Coord& min, ValueType dim)
+    {
+        return CoordBBox(min, min.offsetBy(dim - 1));
+    }
+
+    /// Return an "infinite" bounding box, as defined by the Coord value range.
+    static CoordBBox inf() { return CoordBBox(Coord::min(), Coord::max()); }
+    /// Read-only and mutable access to the (inclusive) minimum and maximum corners.
+    const Coord& min() const { return mMin; }
+    const Coord& max() const { return mMax; }
+
+    Coord& min() { return mMin; }
+    Coord& max() { return mMax; }
+    /// Reset to the empty box, to the given bounds, or to a cube with @a dim coordinates per axis.
+    void reset() { mMin = Coord::max(); mMax = Coord::min(); }
+    void reset(const Coord& min, const Coord& max) { mMin = min; mMax = max; }
+    void resetToCube(const Coord& min, ValueType dim) { mMin = min; mMax = min.offsetBy(dim - 1); }
+
+    /// @note The start coordinate is inclusive.
+    Coord getStart() const { return mMin; }
+    /// @note The end coordinate is exclusive.
+    Coord getEnd() const { return mMax.offsetBy(1); }
+
+    bool operator==(const CoordBBox& rhs) const { return mMin == rhs.mMin && mMax == rhs.mMax; }
+    bool operator!=(const CoordBBox& rhs) const { return !(*this == rhs); }
+    /// Return @c true if min exceeds max along at least one axis.
+    bool empty() const { return (mMin[0] > mMax[0] || mMin[1] > mMax[1] || mMin[2] > mMax[2]); }
+    //@{
+    /// Return @c true if this bounding box is nonempty
+    operator bool() const { return !this->empty(); }
+    bool hasVolume() const { return !this->empty(); }
+    //@}
+    /// Return the floating-point position of the center of this bounding box.
+    /// @note Both corners are converted to double before being added, so the sum cannot overflow.
+    Vec3d getCenter() const { return 0.5 * (mMin.asVec3d() + mMax.asVec3d()); }
+
+    /// @brief Return the dimensions of the coordinates spanned by this bounding box.
+    /// @note Since coordinates are inclusive, a bounding box with min = max
+    /// has dimensions of (1, 1, 1).
+    Coord dim() const { return mMax.offsetBy(1) - mMin; }
+    /// @todo deprecate - use dim instead
+    Coord extents() const { return this->dim(); }
+    /// @brief Return the integer volume of coordinates spanned by this bounding box.
+    /// @note Since coordinates are inclusive, a bounding box with min = max has volume one.
+    Index64 volume() const
+    {
+        const Coord d = this->dim();
+        return Index64(d[0]) * Index64(d[1]) * Index64(d[2]);
+    }
+    /// Return @c true if this bounding box can be subdivided [mainly for use by TBB].
+    bool is_divisible() const { return mMin[0]<mMax[0] && mMin[1]<mMax[1] && mMin[2]<mMax[2]; }
+
+    /// @brief Return the index (0, 1 or 2) of the shortest axis.
+    size_t minExtent() const { return this->dim().minIndex(); }
+
+    /// @brief Return the index (0, 1 or 2) of the longest axis.
+    size_t maxExtent() const { return this->dim().maxIndex(); }
+
+    /// Return @c true if point (x, y, z) is inside this bounding box.
+    bool isInside(const Coord& xyz) const
+    {
+        return !(Coord::lessThan(xyz,mMin) || Coord::lessThan(mMax,xyz));
+    }
+
+    /// Return @c true if the given bounding box is inside this bounding box.
+    bool isInside(const CoordBBox& b) const
+    {
+        return !(Coord::lessThan(b.mMin,mMin) || Coord::lessThan(mMax,b.mMax));
+    }
+
+    /// Return @c true if the given bounding box overlaps with this bounding box.
+    bool hasOverlap(const CoordBBox& b) const
+    {
+        return !(Coord::lessThan(mMax,b.mMin) || Coord::lessThan(b.mMax,mMin));
+    }
+
+    /// Pad this bounding box with the specified padding.
+    void expand(ValueType padding)
+    {
+        mMin.offset(-padding);
+        mMax.offset( padding);
+    }
+
+    /// Return a new instance that is expanded by the specified padding.
+    CoordBBox expandBy(ValueType padding) const
+    {
+        return CoordBBox(mMin.offsetBy(-padding),mMax.offsetBy(padding));
+    }
+
+    /// Expand this bounding box to enclose point (x, y, z).
+    void expand(const Coord& xyz)
+    {
+        mMin.minComponent(xyz);
+        mMax.maxComponent(xyz);
+    }
+
+    /// Union this bounding box with the given bounding box.
+    void expand(const CoordBBox& bbox)
+    {
+        mMin.minComponent(bbox.min());
+        mMax.maxComponent(bbox.max());
+    }
+    /// Intersect this bounding box with the given bounding box.
+    void intersect(const CoordBBox& bbox)
+    {
+        mMin.maxComponent(bbox.min());
+        mMax.minComponent(bbox.max());
+    }
+    /// @brief Union this bounding box with the cubical bounding box
+    /// of the given size and with the given minimum coordinates.
+    void expand(const Coord& min, Coord::ValueType dim)
+    {
+        mMin.minComponent(min);
+        mMax.maxComponent(min.offsetBy(dim-1));
+    }
+    /// Translate this bounding box by @f$(t_x, t_y, t_z)@f$.
+    void translate(const Coord& t) { mMin += t; mMax += t; }
+
+    //@{
+    /// @brief Bit-wise operations performed on both the min and max members
+    CoordBBox  operator>> (size_t n) const { return CoordBBox(mMin>>n, mMax>>n); }
+    CoordBBox  operator<< (size_t n) const { return CoordBBox(mMin<<n, mMax<<n); }
+    CoordBBox& operator<<=(size_t n) { mMin <<= n; mMax <<= n; return *this; }
+    CoordBBox& operator>>=(size_t n) { mMin >>= n; mMax >>= n; return *this; }
+    CoordBBox  operator&  (Coord::Int32 n) const { return CoordBBox(mMin & n, mMax & n); }
+    CoordBBox  operator|  (Coord::Int32 n) const { return CoordBBox(mMin | n, mMax | n); }
+    CoordBBox& operator&= (Coord::Int32 n) { mMin &= n; mMax &= n; return *this; }
+    CoordBBox& operator|= (Coord::Int32 n) { mMin |= n; mMax |= n; return *this; }
+    //@}
+
+    /// Unserialize this bounding box from the given stream (min corner first, then max).
+    void read(std::istream& is) { mMin.read(is); mMax.read(is); }
+    /// Serialize this bounding box to the given stream (min corner first, then max).
+    void write(std::ostream& os) const { mMin.write(os); mMax.write(os); }
+
+private:
+    Coord mMin, mMax;
+}; // class CoordBBox
+
+
+////////////////////////////////////////
+
+
+inline std::ostream& operator<<(std::ostream& os, const Coord& xyz)
+{
+    return os << xyz.asVec3i(); // printed through the stream operator of Vec3i
+}
+
+
+//@{
+/// Allow a Coord and a Vec3 to be added together, in either operand order.
+template<typename T>
+inline Vec3<typename promote<T, typename Coord::ValueType>::type>
+operator+(const Vec3<T>& v0, const Coord& v1)
+{
+    Vec3<typename promote<T, typename Coord::ValueType>::type> sum(v0); // promoted copy of v0
+    sum[0] += v1.x();
+    sum[1] += v1.y();
+    sum[2] += v1.z();
+    return sum;
+}
+
+template<typename T>
+inline Vec3<typename promote<T, typename Coord::ValueType>::type>
+operator+(const Coord& v1, const Vec3<T>& v0)
+{
+    Vec3<typename promote<T, typename Coord::ValueType>::type> total(v0); // promoted copy of v0
+    total[0] += v1.x();
+    total[1] += v1.y();
+    total[2] += v1.z();
+    return total;
+}
+//@}
+
+
+//@{
+/// Allow a Coord to be subtracted from a Vec3, or a Vec3 from a Coord.
+template <typename T>
+inline Vec3<typename promote<T, Coord::ValueType>::type>
+operator-(const Vec3<T>& v0, const Coord& v1)
+{
+    Vec3<typename promote<T, Coord::ValueType>::type> diff(v0); // promoted copy of v0
+    diff[0] -= v1.x();
+    diff[1] -= v1.y();
+    diff[2] -= v1.z();
+    return diff;
+}
+
+template <typename T>
+inline Vec3<typename promote<T, Coord::ValueType>::type>
+operator-(const Coord& v1, const Vec3<T>& v0)
+{
+    Vec3<typename promote<T, Coord::ValueType>::type> reversed(v0); // will hold v0 - v1
+    reversed[0] -= v1.x();
+    reversed[1] -= v1.y();
+    reversed[2] -= v1.z();
+    return -reversed; // negating v0 - v1 yields v1 - v0
+}
+//@}
+
+inline std::ostream&
+operator<<(std::ostream& os, const CoordBBox& b)
+{
+    // Printed as "<min> -> <max>", each corner through the Coord stream operator.
+    return os << b.min() << " -> " << b.max();
+}
+
+} // namespace math
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_MATH_COORD_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/math/DDA.h b/nuparu/include/openvdb_new/math/DDA.h
new file mode 100644
index 00000000..c4242b14
--- /dev/null
+++ b/nuparu/include/openvdb_new/math/DDA.h
@@ -0,0 +1,373 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file DDA.h
+///
+/// @author Ken Museth
+///
+/// @brief Digital Differential Analyzers specialized for VDB.
+
+#ifndef OPENVDB_MATH_DDA_HAS_BEEN_INCLUDED
+#define OPENVDB_MATH_DDA_HAS_BEEN_INCLUDED
+
+#include "Coord.h"
+#include "Math.h"
+#include "Vec3.h"
+#include <iostream>// for std::ostream
+#include <limits>// for std::numeric_limits<Type>::max()
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace math {
+
+/// @brief A Digital Differential Analyzer specialized for OpenVDB grids
+/// @note Conceptually similar to Bresenham's line algorithm applied
+/// to a 3D Ray intersecting OpenVDB nodes or voxels. Log2Dim = 0
+/// corresponds to a voxel and Log2Dim a tree node of size 2^Log2Dim.
+///
+/// @note As used in this class, RayT must provide the nested types RealType
+/// and Vec3Type and the methods t0(), t1(), dir(), invDir() and operator()(time).
+/// (Presumably satisfied by the library's own Ray class -- confirm against Ray.h.)
+template<typename RayT, Index Log2Dim = 0>
+class DDA
+{
+public:
+    typedef typename RayT::RealType RealType;
+    typedef RealType                RealT;
+    typedef typename RayT::Vec3Type Vec3Type;
+    typedef Vec3Type                Vec3T;
+
+    /// @brief Default constructor: does no set-up of its own, call init() before use.
+    DDA() {}
+
+    DDA(const RayT& ray) { this->init(ray); }
+
+    DDA(const RayT& ray, RealT startTime) { this->init(ray, startTime); }
+
+    DDA(const RayT& ray, RealT startTime, RealT maxTime) { this->init(ray, startTime, maxTime); }
+    /// @brief (Re)initialize the traversal of @a ray over the times [startTime, maxTime].
+    inline void init(const RayT& ray, RealT startTime, RealT maxTime)
+    {
+        assert(startTime <= maxTime);
+        static const int DIM = 1 << Log2Dim;// step size in index space, 2^Log2Dim
+        mT0 = startTime;
+        mT1 = maxTime;
+        const Vec3T &pos = ray(mT0), &dir = ray.dir(), &inv = ray.invDir();
+        mVoxel = Coord::floor(pos) & (~(DIM-1));// floor, then mask off the low Log2Dim bits
+        for (int axis = 0; axis < 3; ++axis) {
+            if (math::isZero(dir[axis])) {//handles dir = +/- 0
+                mStep[axis]  = 0;//dummy value
+                mNext[axis]  = std::numeric_limits<RealT>::max();//i.e. disabled!
+                mDelta[axis] = std::numeric_limits<RealT>::max();//dummy value
+            } else if (inv[axis] > 0) {
+                mStep[axis]  = DIM;//advance towards increasing index
+                mNext[axis]  = mT0 + (mVoxel[axis] + DIM - pos[axis]) * inv[axis];//time at far face
+                mDelta[axis] = mStep[axis] * inv[axis];//time added per step on this axis
+            } else {
+                mStep[axis]  = -DIM;//advance towards decreasing index
+                mNext[axis]  = mT0 + (mVoxel[axis] - pos[axis]) * inv[axis];//time at near face
+                mDelta[axis] = mStep[axis] * inv[axis];//negative step times negative inverse
+            }
+        }
+    }
+
+    inline void init(const RayT& ray) { this->init(ray, ray.t0(), ray.t1()); }
+
+    inline void init(const RayT& ray, RealT startTime) { this->init(ray, startTime, ray.t1()); }
+
+    /// @brief Increment the voxel index to next intersected voxel or node
+    /// and returns true if the step in time does not exceed maxTime.
+    inline bool step()
+    {
+        const int stepAxis = static_cast<int>(math::MinIndex(mNext));// presumably smallest mNext
+        mT0 = mNext[stepAxis];
+        mNext[stepAxis]  += mDelta[stepAxis];
+        mVoxel[stepAxis] += mStep[stepAxis];
+        return mT0 <= mT1;
+    }
+
+    /// @brief Return the index coordinates of the next node or voxel
+    /// intersected by the ray. If Log2Dim = 0 the return value is the
+    /// actual signed coordinate of the voxel, else it is the origin
+    /// of the corresponding VDB tree node or tile.
+    /// @note Incurs no computational overhead.
+    inline const Coord& voxel() const { return mVoxel; }
+
+    /// @brief Return the time (parameterized along the Ray) of the
+    /// first hit of a tree node of size 2^Log2Dim.
+    /// @details This value is initialized to startTime or ray.t0()
+    /// depending on the constructor used.
+    /// @note Incurs no computational overhead.
+    inline RealType time() const { return mT0; }
+
+    /// @brief Return the maximum time (parameterized along the Ray).
+    inline RealType maxTime() const { return mT1; }
+
+    /// @brief Return the time (parameterized along the Ray) of the
+    /// second (i.e. next) hit of a tree node of size 2^Log2Dim.
+    /// @note Incurs a (small) computational overhead.
+    inline RealType next() const { return math::Min(mT1, mNext[0], mNext[1], mNext[2]); }
+
+    /// @brief Print information about this DDA for debugging.
+    /// @param os    a stream to which to write textual information.
+    void print(std::ostream& os = std::cout) const
+      {
+          os << "Dim=" << (1<<Log2Dim) << " time=" << mT0 << " next()="
+             << this->next() << " voxel=" << mVoxel << " next=" << mNext
+             << " delta=" << mDelta << " step=" << mStep << std::endl;
+      }
+
+private:
+    RealT mT0, mT1;// current time and maximum time along the ray
+    Coord mVoxel, mStep;// current index coordinates and per-axis index increment
+    Vec3T mDelta, mNext;// per-axis time increment and per-axis time of the next crossing
+}; // class DDA
+
+/// @brief Output streaming of the DDA class.
+/// @note Primarily intended for debugging.
+template<typename RayT, Index Log2Dim>
+inline std::ostream& operator<<(std::ostream& os, const DDA<RayT, Log2Dim>& dda)
+{
+    os << "Dim=" << (1<<Log2Dim);
+    os << " time=" << dda.time() << " next()=" << dda.next();
+    return os << " voxel=" << dda.voxel();
+}
+
+/////////////////////////////////////////// LevelSetHDDA ////////////////////////////////////////////
+
+
+/// @brief Hierarchical Digital Differential Analyzer helper for ray intersections
+/// with level sets: visits the nodes at NodeLevel and recurses one level down.
+template<typename TreeT, int NodeLevel>
+struct LevelSetHDDA
+{
+    typedef typename TreeT::RootNodeType::NodeChainType ChainT;
+    typedef typename boost::mpl::at<ChainT, boost::mpl::int_<NodeLevel> >::type NodeT;
+
+    template <typename TesterT>
+    static bool test(TesterT& tester)
+    {
+        typedef math::DDA<typename TesterT::RayT, NodeT::TOTAL> NodeDDA;
+        NodeDDA dda(tester.ray());
+        do {
+            if (!tester.template hasNode<NodeT>(dda.voxel())) continue;// no node: just step
+            tester.setRange(dda.time(), dda.next());
+            if (LevelSetHDDA<TreeT, NodeLevel-1>::test(tester)) return true;
+        } while(dda.step());
+        return false;
+    }
+};
+
+/// @brief Specialization of Hierarchical Digital Differential Analyzer class that
+/// intersects a ray against the voxels of a level set (NodeLevel -1 ends the recursion)
+template<typename TreeT>
+struct LevelSetHDDA<TreeT, -1>
+{
+    template <typename TesterT>
+    static bool test(TesterT& tester)
+    {
+        math::DDA<typename TesterT::RayT, 0> dda(tester.ray());// Log2Dim = 0: voxel steps
+        tester.init(dda.time());// hand the tester the starting time before the first call
+        do { if (tester(dda.voxel(), dda.next())) return true; } while(dda.step());
+        return false;// the tester never returned true before the DDA ran out
+    }
+};
+
+//////////////////////////////////////////// VolumeHDDA /////////////////////////////////////////////
+
+/// @brief Helper class that implements Hierarchical Digital Differential Analyzers
+/// for ray intersections against a generic volume.
+///
+/// @details The template argument ChildNodeLevel specifies the entry
+/// upper node level used for the hierarchical ray-marching. The final
+/// lowest level is always the leaf node level, i.e. not the voxel level!
+template <typename TreeT, typename RayT, int ChildNodeLevel>
+class VolumeHDDA
+{
+public:
+
+    typedef typename TreeT::RootNodeType::NodeChainType ChainT;
+    typedef typename boost::mpl::at<ChainT, boost::mpl::int_<ChildNodeLevel> >::type NodeT;
+    typedef typename RayT::TimeSpan TimeSpanT;
+
+    VolumeHDDA() {}
+    /// Return the time-span filled in by the private march(); (-1, -1) if ray.valid() fails.
+    template <typename AccessorT>
+    TimeSpanT march(RayT& ray, AccessorT &acc)
+    {
+        TimeSpanT t(-1, -1);
+        if (ray.valid()) this->march(ray, acc, t);
+        return t;
+    }
+
+    /// Collect time-spans into @a times, which is cleared first. ListT is a list of
+    /// RayT::TimeSpan and is required to have the two methods clear() and push_back(),
+    /// e.g. std::vector<typename RayT::TimeSpan> or std::deque<typename RayT::TimeSpan>.
+    /// NOTE(review): unlike march(), ray.valid() is not checked here -- confirm intended.
+    template <typename AccessorT, typename ListT>
+    void hits(RayT& ray, AccessorT &acc, ListT& times)
+    {
+        TimeSpanT t(-1,-1);
+        times.clear();
+        this->hits(ray, acc, times, t);
+        if (t.valid()) times.push_back(t);//flush a span still pending after the traversal
+    }
+
+private:
+
+    friend class VolumeHDDA<TreeT, RayT, ChildNodeLevel+1>;
+    /// Worker for march(): @a t carries the span under construction across levels.
+    template <typename AccessorT>
+    bool march(RayT& ray, AccessorT &acc, TimeSpanT& t)
+    {
+        mDDA.init(ray);
+        do {
+            if (acc.template probeConstNode<NodeT>(mDDA.voxel()) != NULL) {//child node
+                ray.setTimes(mDDA.time(), mDDA.next());//restrict the ray to this node
+                if (mHDDA.march(ray, acc, t)) return true;//terminate
+            } else if (acc.isValueOn(mDDA.voxel())) {//hit an active tile
+                if (t.t0<0) t.t0 = mDDA.time();//this is the first hit so set t0
+            } else if (t.t0>=0) {//hit an inactive tile after hitting active values
+                t.t1 = mDDA.time();//set end of active ray segment
+                if (t.valid()) return true;//terminate
+                t.set(-1, -1);//reset to an empty and invalid time-span
+            }
+        } while (mDDA.step());
+        if (t.t0>=0) t.t1 = mDDA.maxTime();//span still open: close it at the DDA's max time
+        return false;
+    }
+
+    /// Worker for hits(): each completed valid span is appended to @a times; @a t
+    /// carries the span under construction across levels. ListT is a list of
+    /// RayT::TimeSpan and is required to have the method push_back(), e.g.
+    /// std::vector<typename RayT::TimeSpan> or std::deque<typename RayT::TimeSpan>.
+    template <typename AccessorT, typename ListT>
+    void hits(RayT& ray, AccessorT &acc, ListT& times, TimeSpanT& t)
+    {
+        mDDA.init(ray);
+        do {
+            if (acc.template probeConstNode<NodeT>(mDDA.voxel()) != NULL) {//child node
+                ray.setTimes(mDDA.time(), mDDA.next());//restrict the ray to this node
+                mHDDA.hits(ray, acc, times, t);
+            } else if (acc.isValueOn(mDDA.voxel())) {//hit an active tile
+                if (t.t0<0) t.t0 = mDDA.time();//this is the first hit so set t0
+            } else if (t.t0>=0) {//hit an inactive tile after hitting active values
+                t.t1 = mDDA.time();//set end of active ray segment
+                if (t.valid()) times.push_back(t);
+                t.set(-1,-1);//reset to an empty and invalid time-span
+            }
+        } while (mDDA.step());
+        if (t.t0>=0) t.t1 = mDDA.maxTime();//span still open: close it at the DDA's max time
+    }
+
+    math::DDA<RayT, NodeT::TOTAL> mDDA;//DDA stepping in units of NodeT::TOTAL (Log2Dim)
+    VolumeHDDA<TreeT, RayT, ChildNodeLevel-1> mHDDA;//marcher for the next lower level
+};
+
+/// @brief Specialization of Hierarchical Digital Differential Analyzer
+/// class that intersects against the leafs or tiles of a generic volume.
+template <typename TreeT, typename RayT>
+class VolumeHDDA<TreeT, RayT, 0>
+{
+public:
+
+    typedef typename TreeT::LeafNodeType LeafT;
+    typedef typename RayT::TimeSpan TimeSpanT;
+
+    VolumeHDDA() {}
+    /// Return the time-span filled in by the private march(); (-1, -1) if ray.valid() fails.
+    template <typename AccessorT>
+    TimeSpanT march(RayT& ray, AccessorT &acc)
+    {
+        TimeSpanT t(-1, -1);
+        if (ray.valid()) this->march(ray, acc, t);
+        return t;
+    }
+    /// Collect time-spans into @a times (cleared first); ray.valid() is not checked here.
+    template <typename AccessorT, typename ListT>
+    void hits(RayT& ray, AccessorT &acc, ListT& times)
+    {
+        TimeSpanT t(-1,-1);
+        times.clear();
+        this->hits(ray, acc, times, t);
+        if (t.valid()) times.push_back(t);//flush a span still pending after the traversal
+    }
+
+private:
+
+    friend class VolumeHDDA<TreeT, RayT, 1>;
+    /// Worker for march(): @a t carries the span under construction.
+    template <typename AccessorT>
+    bool march(RayT& ray, AccessorT &acc, TimeSpanT& t)
+    {
+        mDDA.init(ray);
+        do {
+            if (acc.template probeConstNode<LeafT>(mDDA.voxel()) ||
+                acc.isValueOn(mDDA.voxel())) {//hit a leaf or an active tile
+                if (t.t0<0) t.t0 = mDDA.time();//this is the first hit
+            } else if (t.t0>=0) {//hit an inactive tile after hitting active values
+                t.t1 = mDDA.time();//set end of active ray segment
+                if (t.valid()) return true;//terminate
+                t.set(-1, -1);//reset to an empty and invalid time-span
+            }
+        } while (mDDA.step());
+        if (t.t0>=0) t.t1 = mDDA.maxTime();//span still open: close it at the DDA's max time
+        return false;
+    }
+    /// Worker for hits(): each completed valid span is appended to @a times.
+    template <typename AccessorT, typename ListT>
+    void hits(RayT& ray, AccessorT &acc, ListT& times, TimeSpanT& t)
+    {
+        mDDA.init(ray);
+        do {
+            if (acc.template probeConstNode<LeafT>(mDDA.voxel()) ||
+                acc.isValueOn(mDDA.voxel())) {//hit a leaf or an active tile
+                if (t.t0<0) t.t0 = mDDA.time();//this is the first hit
+            } else if (t.t0>=0) {//hit an inactive tile after hitting active values
+                t.t1 = mDDA.time();//set end of active ray segment
+                if (t.valid()) times.push_back(t);
+                t.set(-1, -1);//reset to an empty and invalid time-span
+            }
+        } while (mDDA.step());
+        if (t.t0>=0) t.t1 = mDDA.maxTime();//span still open: close it at the DDA's max time
+    }
+    math::DDA<RayT, LeafT::TOTAL> mDDA;//DDA stepping in units of LeafT::TOTAL (Log2Dim)
+};
+
+} // namespace math
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_MATH_DDA_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/math/FiniteDifference.h b/nuparu/include/openvdb_new/math/FiniteDifference.h
new file mode 100644
index 00000000..28fdea3a
--- /dev/null
+++ b/nuparu/include/openvdb_new/math/FiniteDifference.h
@@ -0,0 +1,2376 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file FiniteDifference.h
+
+#ifndef OPENVDB_MATH_FINITEDIFFERENCE_HAS_BEEN_INCLUDED
+#define OPENVDB_MATH_FINITEDIFFERENCE_HAS_BEEN_INCLUDED
+
+#include <openvdb/Types.h>
+#include "Math.h"
+#include "Coord.h"
+#include "Vec3.h"
+
+#include <boost/algorithm/string/case_conv.hpp>
+#include <boost/algorithm/string/trim.hpp>
+
+#ifdef DWA_OPENVDB
+#include <simd/Simd.h>
+#endif
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace math {
+
+
+////////////////////////////////////////
+
+
+/// @brief Different discrete schemes used in the first derivatives.
+// Add new items to the *end* of this list, and update NUM_DS_SCHEMES.
+enum DScheme {
+    UNKNOWN_DS = -1,
+    CD_2NDT =  0,   // center difference,    2nd order, but the result must be divided by 2
+    CD_2ND,         // center difference,    2nd order
+    CD_4TH,         // center difference,    4th order
+    CD_6TH,         // center difference,    6th order
+    FD_1ST,         // forward difference,   1st order
+    FD_2ND,         // forward difference,   2nd order
+    FD_3RD,         // forward difference,   3rd order
+    BD_1ST,         // backward difference,  1st order
+    BD_2ND,         // backward difference,  2nd order
+    BD_3RD,         // backward difference,  3rd order
+    FD_WENO5,       // forward difference,   weno5
+    BD_WENO5,       // backward difference,  weno5
+    FD_HJWENO5,     // forward difference,   HJ-weno5
+    BD_HJWENO5      // backward difference,  HJ-weno5
+};
+
+enum { NUM_DS_SCHEMES = BD_HJWENO5 + 1 }; // count of schemes, UNKNOWN_DS (-1) not included
+
+
+inline std::string
+dsSchemeToString(DScheme dss)
+{
+    // Each enumerator maps to its lower-case identifier.
+    switch (dss) {
+        case UNKNOWN_DS:    return "unknown_ds";
+        case CD_2NDT:       return "cd_2ndt";
+        case CD_2ND:        return "cd_2nd";
+        case CD_4TH:        return "cd_4th";
+        case CD_6TH:        return "cd_6th";
+        case FD_1ST:        return "fd_1st";
+        case FD_2ND:        return "fd_2nd";
+        case FD_3RD:        return "fd_3rd";
+        case BD_1ST:        return "bd_1st";
+        case BD_2ND:        return "bd_2nd";
+        case BD_3RD:        return "bd_3rd";
+        case FD_WENO5:      return "fd_weno5";
+        case BD_WENO5:      return "bd_weno5";
+        case FD_HJWENO5:    return "fd_hjweno5";
+        case BD_HJWENO5:    return "bd_hjweno5";
+    }
+    return std::string(); // a value outside the enumeration yields an empty string
+}
+
+inline DScheme
+stringToDScheme(const std::string& s)
+{
+    // Work on a normalized copy: surrounding whitespace trimmed, then lower-cased.
+    std::string key = s;
+    boost::trim(key);
+    boost::to_lower(key);
+
+    // Return the first scheme whose canonical name equals the normalized input.
+    if (key == dsSchemeToString(CD_2NDT))
+        return CD_2NDT;
+    if (key == dsSchemeToString(CD_2ND))
+        return CD_2ND;
+    if (key == dsSchemeToString(CD_4TH))
+        return CD_4TH;
+    if (key == dsSchemeToString(CD_6TH))
+        return CD_6TH;
+    if (key == dsSchemeToString(FD_1ST))
+        return FD_1ST;
+    if (key == dsSchemeToString(FD_2ND))
+        return FD_2ND;
+    if (key == dsSchemeToString(FD_3RD))
+        return FD_3RD;
+    if (key == dsSchemeToString(BD_1ST))
+        return BD_1ST;
+    if (key == dsSchemeToString(BD_2ND))
+        return BD_2ND;
+    if (key == dsSchemeToString(BD_3RD))
+        return BD_3RD;
+    if (key == dsSchemeToString(FD_WENO5))
+        return FD_WENO5;
+    if (key == dsSchemeToString(BD_WENO5))
+        return BD_WENO5;
+    if (key == dsSchemeToString(FD_HJWENO5))
+        return FD_HJWENO5;
+    if (key == dsSchemeToString(BD_HJWENO5))
+        return BD_HJWENO5;
+
+    // Nothing matched (the literal "unknown_ds" also ends up here).
+    return UNKNOWN_DS;
+}
+
+inline std::string
+dsSchemeToMenuName(DScheme dss)
+{
+    // Each enumerator maps to a human-readable label.
+    switch (dss) {
+        case UNKNOWN_DS:    return "Unknown DS scheme";
+        case CD_2NDT:       return "Twice 2nd-order center difference";
+        case CD_2ND:        return "2nd-order center difference";
+        case CD_4TH:        return "4th-order center difference";
+        case CD_6TH:        return "6th-order center difference";
+        case FD_1ST:        return "1st-order forward difference";
+        case FD_2ND:        return "2nd-order forward difference";
+        case FD_3RD:        return "3rd-order forward difference";
+        case BD_1ST:        return "1st-order backward difference";
+        case BD_2ND:        return "2nd-order backward difference";
+        case BD_3RD:        return "3rd-order backward difference";
+        case FD_WENO5:      return "5th-order WENO forward difference";
+        case BD_WENO5:      return "5th-order WENO backward difference";
+        case FD_HJWENO5:    return "5th-order HJ-WENO forward difference";
+        case BD_HJWENO5:    return "5th-order HJ-WENO backward difference";
+    }
+    return std::string(); // a value outside the enumeration yields an empty string
+}
+
+
+
+////////////////////////////////////////
+
+
+/// @brief Different discrete schemes used in the second derivatives.
+// Add new items to the *end* of this list, and update NUM_DD_SCHEMES.
+enum DDScheme {
+    UNKNOWN_DD  = -1,   // placeholder value, not counted in NUM_DD_SCHEMES
+    CD_SECOND   =  0,   // center difference, 2nd order
+    CD_FOURTH,          // center difference, 4th order
+    CD_SIXTH            // center difference, 6th order
+};
+
+enum { NUM_DD_SCHEMES = CD_SIXTH + 1 }; // count of schemes, UNKNOWN_DD (-1) not included
+
+
+////////////////////////////////////////
+
+
+/// @brief Biased Gradients are limited to non-centered differences
+// Add new items to the *end* of this list, and update NUM_BIAS_SCHEMES.
+enum BiasedGradientScheme {
+    UNKNOWN_BIAS    = -1,   // placeholder value, not counted in NUM_BIAS_SCHEMES
+    FIRST_BIAS      = 0,    // uses FD_1ST & BD_1ST
+    SECOND_BIAS,            // uses FD_2ND & BD_2ND
+    THIRD_BIAS,             // uses FD_3RD & BD_3RD
+    WENO5_BIAS,             // uses WENO5
+    HJWENO5_BIAS            // uses HJWENO5
+};
+
+enum { NUM_BIAS_SCHEMES = HJWENO5_BIAS + 1 }; // count of schemes, UNKNOWN_BIAS not included
+
+inline std::string
+biasedGradientSchemeToString(BiasedGradientScheme bgs)
+{
+    // Each enumerator maps to its lower-case identifier.
+    switch (bgs) {
+        case UNKNOWN_BIAS:  return "unknown_bias";
+        case FIRST_BIAS:    return "first_bias";
+        case SECOND_BIAS:   return "second_bias";
+        case THIRD_BIAS:    return "third_bias";
+        case WENO5_BIAS:    return "weno5_bias";
+        case HJWENO5_BIAS:  return "hjweno5_bias";
+    }
+    return std::string(); // a value outside the enumeration yields an empty string
+}
+
+inline BiasedGradientScheme
+stringToBiasedGradientScheme(const std::string& s)
+{
+    // Work on a normalized copy: surrounding whitespace trimmed, then lower-cased.
+    std::string key = s;
+    boost::trim(key);
+    boost::to_lower(key);
+
+    // Return the first scheme whose canonical name equals the normalized input.
+    if (key == biasedGradientSchemeToString(FIRST_BIAS))
+        return FIRST_BIAS;
+    if (key == biasedGradientSchemeToString(SECOND_BIAS))
+        return SECOND_BIAS;
+    if (key == biasedGradientSchemeToString(THIRD_BIAS))
+        return THIRD_BIAS;
+    if (key == biasedGradientSchemeToString(WENO5_BIAS))
+        return WENO5_BIAS;
+    if (key == biasedGradientSchemeToString(HJWENO5_BIAS))
+        return HJWENO5_BIAS;
+
+    return UNKNOWN_BIAS; // nothing matched (the literal "unknown_bias" also ends up here)
+}
+
+inline std::string
+biasedGradientSchemeToMenuName(BiasedGradientScheme bgs)
+{
+    // Each enumerator maps to a human-readable label.
+    switch (bgs) {
+        case UNKNOWN_BIAS:  return "Unknown biased gradient";
+        case FIRST_BIAS:    return "1st-order biased gradient";
+        case SECOND_BIAS:   return "2nd-order biased gradient";
+        case THIRD_BIAS:    return "3rd-order biased gradient";
+        case WENO5_BIAS:    return "5th-order WENO biased gradient";
+        case HJWENO5_BIAS:  return "5th-order HJ-WENO biased gradient";
+    }
+    return std::string(); // a value outside the enumeration yields an empty string
+}
+
+////////////////////////////////////////
+
+
+/// @brief Temporal integration schemes
+// Add new items to the *end* of this list, and update NUM_TEMPORAL_SCHEMES.
+enum TemporalIntegrationScheme {
+    UNKNOWN_TIS = -1,//placeholder value, not counted in NUM_TEMPORAL_SCHEMES
+    TVD_RK1,//same as explicit Euler integration
+    TVD_RK2,//menu name: "2nd-order Runge-Kutta"
+    TVD_RK3//menu name: "3rd-order Runge-Kutta"
+};
+
+enum { NUM_TEMPORAL_SCHEMES = TVD_RK3 + 1 }; // count of schemes, UNKNOWN_TIS not included
+
+inline std::string
+temporalIntegrationSchemeToString(TemporalIntegrationScheme tis)
+{
+    // Each enumerator maps to its lower-case identifier.
+    switch (tis) {
+        case UNKNOWN_TIS:   return "unknown_tis";
+        case TVD_RK1:       return "tvd_rk1";
+        case TVD_RK2:       return "tvd_rk2";
+        case TVD_RK3:       return "tvd_rk3";
+    }
+    return std::string(); // a value outside the enumeration yields an empty string
+}
+
+inline TemporalIntegrationScheme
+stringToTemporalIntegrationScheme(const std::string& s)
+{
+    // Work on a normalized copy: surrounding whitespace trimmed, then lower-cased.
+    std::string key = s;
+    boost::trim(key);
+    boost::to_lower(key);
+
+    // Return the first scheme whose canonical name equals the normalized input.
+    if (key == temporalIntegrationSchemeToString(TVD_RK1))
+        return TVD_RK1;
+    if (key == temporalIntegrationSchemeToString(TVD_RK2))
+        return TVD_RK2;
+    if (key == temporalIntegrationSchemeToString(TVD_RK3))
+        return TVD_RK3;
+
+    // Nothing matched (the literal "unknown_tis" also ends up here).
+    return UNKNOWN_TIS;
+}
+
+inline std::string
+temporalIntegrationSchemeToMenuName(TemporalIntegrationScheme tis)
+{
+    // Each enumerator maps to a human-readable label.
+    switch (tis) {
+        case UNKNOWN_TIS:   return "Unknown temporal integration";
+        case TVD_RK1:       return "Forward Euler";
+        case TVD_RK2:       return "2nd-order Runge-Kutta";
+        case TVD_RK3:       return "3rd-order Runge-Kutta";
+    }
+    return std::string(); // a value outside the enumeration yields an empty string
+}
+
+
+//@}
+
+
+/// @brief Implementation of nominally fifth-order finite-difference WENO
+/// @details This function returns the numerical flux.  See "High Order Finite Difference and
+/// Finite Volume WENO Schemes and Discontinuous Galerkin Methods for CFD" - Chi-Wang Shu
+/// ICASE Report No 2001-11 (page 6).  Also see ICASE No 97-65 for a more complete reference
+/// (Shu, 1997).
+/// Given v1 = f(x-2dx), v2 = f(x-dx), v3 = f(x), v4 = f(x+dx) and v5 = f(x+2dx),
+/// return an interpolated value f(x+dx/2) with the special property that
+/// ( f(x+dx/2) - f(x-dx/2) ) / dx  = df/dx (x) + error,
+/// where the error lies between O(dx) and O(dx^5), and is fifth-order in smooth regions.
+template<typename ValueType>
+inline ValueType
+WENO5(const ValueType& v1, const ValueType& v2, const ValueType& v3,
+    const ValueType& v4, const ValueType& v5, float scale2 = 0.01f)
+{
+    const double C = 13.0 / 12.0;
+    // WENO is formulated for non-dimensional equations; the optional scale2 is a
+    // reference value (squared) for the function being interpolated, e.g. if 'v'
+    // is of order 1000 then scale2 = 10^6 is ok.
+    // NOTE(review): this comment used to advise scale2 = 1, but the default is 0.01f -- confirm.
+    const double eps = 1e-6 * scale2;
+    // {\tilde \omega_k} = \gamma_k / ( \beta_k + \epsilon)^2 (see Shu's ICASE report)
+    const double A1=0.1/math::Pow2(C*math::Pow2(v1-2*v2+v3)+0.25*math::Pow2(v1-4*v2+3.0*v3)+eps),
+                 A2=0.6/math::Pow2(C*math::Pow2(v2-2*v3+v4)+0.25*math::Pow2(v2-v4)+eps),
+                 A3=0.3/math::Pow2(C*math::Pow2(v3-2*v4+v5)+0.25*math::Pow2(3.0*v3-4*v4+v5)+eps);
+
+    return static_cast<ValueType>(static_cast<ValueType>(// sum is cast before the division
+        A1*(2.0*v1 - 7.0*v2 + 11.0*v3) +
+        A2*(5.0*v3 -     v2 +  2.0*v4) +
+        A3*(2.0*v3 + 5.0*v4 -      v5))/(6.0*(A1+A2+A3)));
+}
+
+/// @brief Squared gradient norm from per-axis difference pairs (presumably one-sided -- confirm).
+template <typename Real>
+inline Real GodunovsNormSqrd(bool isOutside,
+                             Real dP_xm, Real dP_xp,
+                             Real dP_ym, Real dP_yp,
+                             Real dP_zm, Real dP_zp)
+{
+    using math::Max;
+    using math::Min;
+    using math::Pow2;
+
+    const Real zero(0);
+    Real dPLen2;// accumulates one squared term per axis
+    if (isOutside) { // outside
+        dPLen2  = Max(Pow2(Max(dP_xm, zero)), Pow2(Min(dP_xp,zero))); // (dP/dx)2
+        dPLen2 += Max(Pow2(Max(dP_ym, zero)), Pow2(Min(dP_yp,zero))); // (dP/dy)2
+        dPLen2 += Max(Pow2(Max(dP_zm, zero)), Pow2(Min(dP_zp,zero))); // (dP/dz)2
+    } else { // inside
+        dPLen2  = Max(Pow2(Min(dP_xm, zero)), Pow2(Max(dP_xp,zero))); // (dP/dx)2
+        dPLen2 += Max(Pow2(Min(dP_ym, zero)), Pow2(Max(dP_yp,zero))); // (dP/dy)2
+        dPLen2 += Max(Pow2(Min(dP_zm, zero)), Pow2(Max(dP_zp,zero))); // (dP/dz)2
+    }
+    return dPLen2; // |\nabla\phi|^2
+}
+
+/// @brief Deprecated alias (misspelled name) that forwards to GodunovsNormSqrd.
+template <typename Real>
+OPENVDB_DEPRECATED inline Real GudonovsNormSqrd(bool isOutside,
+                                                Real dP_xm, Real dP_xp,
+                                                Real dP_ym, Real dP_yp,
+                                                Real dP_zm, Real dP_zp)
+{ return GodunovsNormSqrd(isOutside, dP_xm, dP_xp, dP_ym, dP_yp, dP_zm, dP_zp); }
+
+template<typename Real>
+inline Real
+GodunovsNormSqrd(bool isOutside, const Vec3<Real>& gradient_m, const Vec3<Real>& gradient_p)
+{
+    // Forward the per-axis components of both gradients, interleaved as
+    // (m.x, p.x, m.y, p.y, m.z, p.z), to the six-scalar overload.
+    const Vec3<Real> &m = gradient_m, &p = gradient_p;
+    return GodunovsNormSqrd<Real>(isOutside, m[0], p[0], m[1], p[1], m[2], p[2]);
+}
+
+template<typename Real>
+OPENVDB_DEPRECATED inline Real GudonovsNormSqrd(bool isOutside,
+                                                const Vec3<Real>& gradient_m,
+                                                const Vec3<Real>& gradient_p)
+{
+    return GodunovsNormSqrd<Real>(isOutside, gradient_m, gradient_p);// deprecated misspelled alias
+}
+
+
+#ifdef DWA_OPENVDB
+inline simd::Float4 simdMin(const simd::Float4& a, const simd::Float4& b) {
+    return simd::Float4(_mm_min_ps(a.base(), b.base()));
+}
+inline simd::Float4 simdMax(const simd::Float4& a, const simd::Float4& b) {
+    return simd::Float4(_mm_max_ps(a.base(), b.base()));
+}
+
+inline float simdSum(const simd::Float4& v);
+
+inline simd::Float4 Pow2(const simd::Float4& v) { return v * v; }
+
+template<>
+inline simd::Float4
+WENO5<simd::Float4>(const simd::Float4& v1, const simd::Float4& v2, const simd::Float4& v3,
+                    const simd::Float4& v4, const simd::Float4& v5, float scale2)
+{
+    using math::Pow2;
+    typedef simd::Float4 F4;
+    const F4
+        C(13.f / 12.f),
+        eps(1.0e-6f * scale2),
+        two(2.0), three(3.0), four(4.0), five(5.0), fourth(0.25),
+        A1 = F4(0.1f) / Pow2(C*Pow2(v1-two*v2+v3) + fourth*Pow2(v1-four*v2+three*v3) + eps),
+        A2 = F4(0.6f) / Pow2(C*Pow2(v2-two*v3+v4) + fourth*Pow2(v2-v4) + eps),
+        A3 = F4(0.3f) / Pow2(C*Pow2(v3-two*v4+v5) + fourth*Pow2(three*v3-four*v4+v5) + eps);
+    return (A1 * (two * v1 - F4(7.0) * v2 + F4(11.0) * v3) +
+            A2 * (five * v3 - v2 + two * v4) +
+            A3 * (two * v3 + five * v4 - v5)) / (F4(6.0) * (A1 + A2 + A3));
+}
+
+
+inline float
+simdSum(const simd::Float4& v)
+{
+    // temp = { v3+v3, v2+v2, v1+v3, v0+v2 }
+    __m128 temp = _mm_add_ps(v.base(), _mm_movehl_ps(v.base(), v.base()));
+    // temp = { v3+v3, v2+v2, v1+v3, (v0+v2)+(v1+v3) }
+    temp = _mm_add_ss(temp, _mm_shuffle_ps(temp, temp, 1));
+    return _mm_cvtss_f32(temp);
+}
+
+// Float4 overload of GodunovsNormSqrd.
+// For each lane the squared one-sided terms are clamped against zero and the
+// larger of the two is kept; which side is clamped with max and which with
+// min depends on isOutside.  The lanes are then added with simdSum.
+// NOTE(review): the original author's remark says the result "should be
+// v[0]+v[1]+v[2]", while simdSum adds all four lanes -- presumably callers
+// keep the fourth lane at zero; confirm.
+inline float
+GodunovsNormSqrd(bool isOutside, const simd::Float4& dP_m, const simd::Float4& dP_p)
+{
+    const simd::Float4 zero(0.0);
+    if (isOutside) {
+        const simd::Float4 outer =
+            simdMax(math::Pow2(simdMax(dP_m, zero)), math::Pow2(simdMin(dP_p, zero)));
+        return simdSum(outer);
+    }
+    const simd::Float4 inner =
+        simdMax(math::Pow2(simdMin(dP_m, zero)), math::Pow2(simdMax(dP_p, zero)));
+    return simdSum(inner);
+}
+
+// Deprecated spelling kept for callers; forwards unchanged to the Float4
+// overload of GodunovsNormSqrd.
+OPENVDB_DEPRECATED inline float GudonovsNormSqrd(bool isOutside,
+                                                 const simd::Float4& dP_m,
+                                                 const simd::Float4& dP_p)
+{
+    const float normSqrd = GodunovsNormSqrd(isOutside, dP_m, dP_p);
+    return normSqrd;
+}
+#endif
+
+// Finite-difference operator D1, selected at compile time by the DScheme
+// template argument.  This primary template only declares the interface;
+// the per-scheme behavior comes from the explicit specializations of D1.
+template<DScheme DiffScheme>
+struct D1
+{
+    // random access version
+    // Each overload takes an accessor and a coordinate ijk and returns an
+    // Accessor::ValueType for the X, Y or Z axis respectively.
+    template<typename Accessor>
+    static typename Accessor::ValueType inX(const Accessor& grid, const Coord& ijk);
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inY(const Accessor& grid, const Coord& ijk);
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inZ(const Accessor& grid, const Coord& ijk);
+
+    // stencil access version
+    // Each overload takes a stencil object and returns a Stencil::ValueType
+    // for the X, Y or Z axis respectively.
+    template<typename Stencil>
+    static typename Stencil::ValueType inX(const Stencil& S);
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inY(const Stencil& S);
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inZ(const Stencil& S);
+};
+
+// D1 specialization for the CD_2NDT scheme: the difference of the samples
+// one step ahead and one step behind, with no scaling applied.
+template<>
+struct D1<CD_2NDT>
+{
+    // Difference operator: returns xp1 - xm1.
+    template <typename ValueType>
+    static ValueType difference(const ValueType& xp1, const ValueType& xm1)
+    {
+        return xp1 - xm1;
+    }
+
+    // Random access versions: read the two neighbors of ijk along one axis
+    // through the accessor and difference them.
+    template<typename Accessor>
+    static typename Accessor::ValueType inX(const Accessor& grid, const Coord& ijk)
+    {
+        const Coord ahead = ijk.offsetBy(1, 0, 0), behind = ijk.offsetBy(-1, 0, 0);
+        return difference(grid.getValue(ahead), grid.getValue(behind));
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inY(const Accessor& grid, const Coord& ijk)
+    {
+        const Coord ahead = ijk.offsetBy(0, 1, 0), behind = ijk.offsetBy( 0, -1, 0);
+        return difference(grid.getValue(ahead), grid.getValue(behind));
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inZ(const Accessor& grid, const Coord& ijk)
+    {
+        const Coord ahead = ijk.offsetBy(0, 0, 1), behind = ijk.offsetBy( 0, 0, -1);
+        return difference(grid.getValue(ahead), grid.getValue(behind));
+    }
+
+    // Stencil access versions: same samples, read by compile-time offset.
+    template<typename Stencil>
+    static typename Stencil::ValueType inX(const Stencil& S)
+    {
+        return difference(S.template getValue< 1, 0, 0>(),
+                          S.template getValue<-1, 0, 0>());
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inY(const Stencil& S)
+    {
+        return difference(S.template getValue< 0, 1, 0>(),
+                          S.template getValue< 0,-1, 0>());
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inZ(const Stencil& S)
+    {
+        return difference(S.template getValue< 0, 0, 1>(),
+                          S.template getValue< 0, 0,-1>());
+    }
+};
+
+// D1 specialization for the CD_2ND scheme: half the difference of the
+// samples one step ahead and one step behind.
+template<>
+struct D1<CD_2ND>
+{
+    // Difference operator: returns (xp1 - xm1) * 0.5.
+    template <typename ValueType>
+    static ValueType difference(const ValueType& xp1, const ValueType& xm1)
+    {
+        return (xp1 - xm1)*ValueType(0.5);
+    }
+
+    // Random access versions: read the two neighbors of ijk along one axis
+    // through the accessor and difference them.
+    template<typename Accessor>
+    static typename Accessor::ValueType inX(const Accessor& grid, const Coord& ijk)
+    {
+        const Coord ahead = ijk.offsetBy(1, 0, 0), behind = ijk.offsetBy(-1, 0, 0);
+        return difference(grid.getValue(ahead), grid.getValue(behind));
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inY(const Accessor& grid, const Coord& ijk)
+    {
+        const Coord ahead = ijk.offsetBy(0, 1, 0), behind = ijk.offsetBy( 0, -1, 0);
+        return difference(grid.getValue(ahead), grid.getValue(behind));
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inZ(const Accessor& grid, const Coord& ijk)
+    {
+        const Coord ahead = ijk.offsetBy(0, 0, 1), behind = ijk.offsetBy( 0, 0, -1);
+        return difference(grid.getValue(ahead), grid.getValue(behind));
+    }
+
+    // Stencil access versions: same samples, read by compile-time offset.
+    template<typename Stencil>
+    static typename Stencil::ValueType inX(const Stencil& S)
+    {
+        return difference(S.template getValue< 1, 0, 0>(),
+                          S.template getValue<-1, 0, 0>());
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inY(const Stencil& S)
+    {
+        return difference(S.template getValue< 0, 1, 0>(),
+                          S.template getValue< 0,-1, 0>());
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inZ(const Stencil& S)
+    {
+        return difference(S.template getValue< 0, 0, 1>(),
+                          S.template getValue< 0, 0,-1>());
+    }
+};
+
+// D1 specialization for the CD_4TH scheme: a four-sample difference using
+// the neighbors at +2, +1, -1 and -2 along one axis.
+template<>
+struct D1<CD_4TH>
+{
+    // Difference operator: returns 2/3 * (xp1 - xm1) + 1/12 * (xm2 - xp2).
+    template <typename ValueType>
+    static ValueType difference(const ValueType& xp2, const ValueType& xp1,
+                                const ValueType& xm1, const ValueType& xm2)
+    {
+        return ValueType(2./3.)*(xp1 - xm1) + ValueType(1./12.)*(xm2 - xp2);
+    }
+
+    // Random access versions: the four neighbors of ijk are read through the
+    // accessor in the order +2, +1, -1, -2.
+    template<typename Accessor>
+    static typename Accessor::ValueType inX(const Accessor& grid, const Coord& ijk)
+    {
+        const Coord p2 = ijk.offsetBy( 2,0,0), p1 = ijk.offsetBy( 1,0,0);
+        const Coord m1 = ijk.offsetBy(-1,0,0), m2 = ijk.offsetBy(-2,0,0);
+        return difference(grid.getValue(p2), grid.getValue(p1),
+                          grid.getValue(m1), grid.getValue(m2));
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inY(const Accessor& grid, const Coord& ijk)
+    {
+        const Coord p2 = ijk.offsetBy( 0, 2, 0), p1 = ijk.offsetBy( 0, 1, 0);
+        const Coord m1 = ijk.offsetBy( 0,-1, 0), m2 = ijk.offsetBy( 0,-2, 0);
+        return difference(grid.getValue(p2), grid.getValue(p1),
+                          grid.getValue(m1), grid.getValue(m2));
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inZ(const Accessor& grid, const Coord& ijk)
+    {
+        const Coord p2 = ijk.offsetBy( 0, 0, 2), p1 = ijk.offsetBy( 0, 0, 1);
+        const Coord m1 = ijk.offsetBy( 0, 0,-1), m2 = ijk.offsetBy( 0, 0,-2);
+        return difference(grid.getValue(p2), grid.getValue(p1),
+                          grid.getValue(m1), grid.getValue(m2));
+    }
+
+    // Stencil access versions: same samples, read by compile-time offset.
+    template<typename Stencil>
+    static typename Stencil::ValueType inX(const Stencil& S)
+    {
+        return difference(
+            S.template getValue< 2, 0, 0>(), S.template getValue< 1, 0, 0>(),
+            S.template getValue<-1, 0, 0>(), S.template getValue<-2, 0, 0>());
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inY(const Stencil& S)
+    {
+        return difference(
+            S.template getValue< 0, 2, 0>(), S.template getValue< 0, 1, 0>(),
+            S.template getValue< 0,-1, 0>(), S.template getValue< 0,-2, 0>());
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inZ(const Stencil& S)
+    {
+        return difference(
+            S.template getValue< 0, 0, 2>(), S.template getValue< 0, 0, 1>(),
+            S.template getValue< 0, 0,-1>(), S.template getValue< 0, 0,-2>());
+    }
+};
+
+// D1 specialization for the CD_6TH scheme: a six-sample difference using
+// the neighbors at +3, +2, +1, -1, -2 and -3 along one axis.
+template<>
+struct D1<CD_6TH>
+{
+    // Difference operator: returns
+    //   3/4 * (xp1 - xm1) - 0.15 * (xp2 - xm2) + 1/60 * (xp3 - xm3).
+    template <typename ValueType>
+    static ValueType difference(const ValueType& xp3, const ValueType& xp2, const ValueType& xp1,
+                                const ValueType& xm1, const ValueType& xm2, const ValueType& xm3)
+    {
+        return ValueType(3./4.)*(xp1 - xm1) - ValueType(0.15)*(xp2 - xm2)
+            + ValueType(1./60.)*(xp3-xm3);
+    }
+
+    // Random access versions: the six neighbors of ijk are read through the
+    // accessor in the order +3, +2, +1, -1, -2, -3.
+    template<typename Accessor>
+    static typename Accessor::ValueType inX(const Accessor& grid, const Coord& ijk)
+    {
+        const Coord p3 = ijk.offsetBy( 3,0,0), p2 = ijk.offsetBy( 2,0,0);
+        const Coord p1 = ijk.offsetBy( 1,0,0), m1 = ijk.offsetBy(-1,0,0);
+        const Coord m2 = ijk.offsetBy(-2,0,0), m3 = ijk.offsetBy(-3,0,0);
+        return difference(grid.getValue(p3), grid.getValue(p2), grid.getValue(p1),
+                          grid.getValue(m1), grid.getValue(m2), grid.getValue(m3));
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inY(const Accessor& grid, const Coord& ijk)
+    {
+        const Coord p3 = ijk.offsetBy( 0, 3, 0), p2 = ijk.offsetBy( 0, 2, 0);
+        const Coord p1 = ijk.offsetBy( 0, 1, 0), m1 = ijk.offsetBy( 0,-1, 0);
+        const Coord m2 = ijk.offsetBy( 0,-2, 0), m3 = ijk.offsetBy( 0,-3, 0);
+        return difference(grid.getValue(p3), grid.getValue(p2), grid.getValue(p1),
+                          grid.getValue(m1), grid.getValue(m2), grid.getValue(m3));
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inZ(const Accessor& grid, const Coord& ijk)
+    {
+        const Coord p3 = ijk.offsetBy( 0, 0, 3), p2 = ijk.offsetBy( 0, 0, 2);
+        const Coord p1 = ijk.offsetBy( 0, 0, 1), m1 = ijk.offsetBy( 0, 0,-1);
+        const Coord m2 = ijk.offsetBy( 0, 0,-2), m3 = ijk.offsetBy( 0, 0,-3);
+        return difference(grid.getValue(p3), grid.getValue(p2), grid.getValue(p1),
+                          grid.getValue(m1), grid.getValue(m2), grid.getValue(m3));
+    }
+
+    // Stencil access versions: same samples, read by compile-time offset.
+    template<typename Stencil>
+    static typename Stencil::ValueType inX(const Stencil& S)
+    {
+        return difference(
+            S.template getValue< 3, 0, 0>(), S.template getValue< 2, 0, 0>(),
+            S.template getValue< 1, 0, 0>(), S.template getValue<-1, 0, 0>(),
+            S.template getValue<-2, 0, 0>(), S.template getValue<-3, 0, 0>());
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inY(const Stencil& S)
+    {
+        return difference(
+            S.template getValue< 0, 3, 0>(), S.template getValue< 0, 2, 0>(),
+            S.template getValue< 0, 1, 0>(), S.template getValue< 0,-1, 0>(),
+            S.template getValue< 0,-2, 0>(), S.template getValue< 0,-3, 0>());
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inZ(const Stencil& S)
+    {
+        return difference(
+            S.template getValue< 0, 0, 3>(), S.template getValue< 0, 0, 2>(),
+            S.template getValue< 0, 0, 1>(), S.template getValue< 0, 0,-1>(),
+            S.template getValue< 0, 0,-2>(), S.template getValue< 0, 0,-3>());
+    }
+};
+
+
+// D1 specialization for the FD_1ST scheme: the sample one step ahead minus
+// the sample at the current location.
+template<>
+struct D1<FD_1ST>
+{
+    // Difference operator: returns xp1 - xp0.
+    template <typename ValueType>
+    static ValueType difference(const ValueType& xp1, const ValueType& xp0)
+    {
+        return xp1 - xp0;
+    }
+
+    // Random access versions: read the +1 neighbor and ijk itself through
+    // the accessor.
+    template<typename Accessor>
+    static typename Accessor::ValueType inX(const Accessor& grid, const Coord& ijk)
+    {
+        const Coord ahead = ijk.offsetBy(1, 0, 0);
+        return difference(grid.getValue(ahead), grid.getValue(ijk));
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inY(const Accessor& grid, const Coord& ijk)
+    {
+        const Coord ahead = ijk.offsetBy(0, 1, 0);
+        return difference(grid.getValue(ahead), grid.getValue(ijk));
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inZ(const Accessor& grid, const Coord& ijk)
+    {
+        const Coord ahead = ijk.offsetBy(0, 0, 1);
+        return difference(grid.getValue(ahead), grid.getValue(ijk));
+    }
+
+    // Stencil access versions: same samples, read by compile-time offset.
+    template<typename Stencil>
+    static typename Stencil::ValueType inX(const Stencil& S)
+    {
+        return difference(S.template getValue< 1, 0, 0>(),
+                          S.template getValue< 0, 0, 0>());
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inY(const Stencil& S)
+    {
+        return difference(S.template getValue< 0, 1, 0>(),
+                          S.template getValue< 0, 0, 0>());
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inZ(const Stencil& S)
+    {
+        return difference(S.template getValue< 0, 0, 1>(),
+                          S.template getValue< 0, 0, 0>());
+    }
+};
+
+
+// D1 specialization for the FD_2ND scheme: a three-sample difference using
+// the current location and the neighbors at +1 and +2 along one axis.
+template<>
+struct D1<FD_2ND>
+{
+    // Difference operator: returns 2 * xp1 - (0.5 * xp2 + 3/2 * xp0).
+    template <typename ValueType>
+    static ValueType difference(const ValueType& xp2, const ValueType& xp1, const ValueType& xp0)
+    {
+        return ValueType(2)*xp1 -(ValueType(0.5)*xp2 + ValueType(3./2.)*xp0);
+    }
+
+    // Random access versions: read the +2 and +1 neighbors and ijk itself
+    // through the accessor.
+    template<typename Accessor>
+    static typename Accessor::ValueType inX(const Accessor& grid, const Coord& ijk)
+    {
+        const Coord p2 = ijk.offsetBy(2,0,0), p1 = ijk.offsetBy(1,0,0);
+        return difference(grid.getValue(p2), grid.getValue(p1), grid.getValue(ijk));
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inY(const Accessor& grid, const Coord& ijk)
+    {
+        const Coord p2 = ijk.offsetBy(0,2,0), p1 = ijk.offsetBy(0,1,0);
+        return difference(grid.getValue(p2), grid.getValue(p1), grid.getValue(ijk));
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inZ(const Accessor& grid, const Coord& ijk)
+    {
+        const Coord p2 = ijk.offsetBy(0,0,2), p1 = ijk.offsetBy(0,0,1);
+        return difference(grid.getValue(p2), grid.getValue(p1), grid.getValue(ijk));
+    }
+
+    // Stencil access versions: same samples, read by compile-time offset.
+    template<typename Stencil>
+    static typename Stencil::ValueType inX(const Stencil& S)
+    {
+        return difference(
+            S.template getValue< 2, 0, 0>(), S.template getValue< 1, 0, 0>(),
+            S.template getValue< 0, 0, 0>());
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inY(const Stencil& S)
+    {
+        return difference(
+            S.template getValue< 0, 2, 0>(), S.template getValue< 0, 1, 0>(),
+            S.template getValue< 0, 0, 0>());
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inZ(const Stencil& S)
+    {
+        return difference(
+            S.template getValue< 0, 0, 2>(), S.template getValue< 0, 0, 1>(),
+            S.template getValue< 0, 0, 0>());
+    }
+};
+
+
+// D1 specialization for the FD_3RD scheme: a four-sample difference using
+// the current location and the neighbors at +1, +2 and +3 along one axis.
+template<>
+struct D1<FD_3RD>
+{
+    // Difference operator: returns
+    //   xp3/3 - 1.5 * xp2 + 3 * xp1 - 11 * xp0 / 6,
+    // evaluated with double literals and cast back to ValueType.
+    template<typename ValueType>
+    static ValueType difference(const ValueType& xp3, const ValueType& xp2,
+                                const ValueType& xp1, const ValueType& xp0)
+    {
+        return static_cast<ValueType>(xp3/3.0 - 1.5*xp2 + 3.0*xp1 - 11.0*xp0/6.0);
+    }
+
+    // Random access versions: read the +3, +2 and +1 neighbors and ijk
+    // itself through the accessor.
+    template<typename Accessor>
+    static typename Accessor::ValueType inX(const Accessor& grid, const Coord& ijk)
+    {
+        const Coord p3 = ijk.offsetBy(3,0,0), p2 = ijk.offsetBy(2,0,0), p1 = ijk.offsetBy(1,0,0);
+        return difference(grid.getValue(p3), grid.getValue(p2),
+                          grid.getValue(p1), grid.getValue(ijk));
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inY(const Accessor& grid, const Coord& ijk)
+    {
+        const Coord p3 = ijk.offsetBy(0,3,0), p2 = ijk.offsetBy(0,2,0), p1 = ijk.offsetBy(0,1,0);
+        return difference(grid.getValue(p3), grid.getValue(p2),
+                          grid.getValue(p1), grid.getValue(ijk));
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inZ(const Accessor& grid, const Coord& ijk)
+    {
+        const Coord p3 = ijk.offsetBy(0,0,3), p2 = ijk.offsetBy(0,0,2), p1 = ijk.offsetBy(0,0,1);
+        return difference(grid.getValue(p3), grid.getValue(p2),
+                          grid.getValue(p1), grid.getValue(ijk));
+    }
+
+    // Stencil access versions: same samples, read by compile-time offset.
+    template<typename Stencil>
+    static typename Stencil::ValueType inX(const Stencil& S)
+    {
+        return difference(
+            S.template getValue< 3, 0, 0>(), S.template getValue< 2, 0, 0>(),
+            S.template getValue< 1, 0, 0>(), S.template getValue< 0, 0, 0>());
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inY(const Stencil& S)
+    {
+        return difference(
+            S.template getValue< 0, 3, 0>(), S.template getValue< 0, 2, 0>(),
+            S.template getValue< 0, 1, 0>(), S.template getValue< 0, 0, 0>());
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inZ(const Stencil& S)
+    {
+        return difference(
+            S.template getValue< 0, 0, 3>(), S.template getValue< 0, 0, 2>(),
+            S.template getValue< 0, 0, 1>(), S.template getValue< 0, 0, 0>());
+    }
+};
+
+
+// D1 specialization for the BD_1ST scheme: the negated FD_1ST difference,
+// applied to the sample one step behind and the sample at the current
+// location.
+template<>
+struct D1<BD_1ST>
+{
+    // Difference operator: returns -D1<FD_1ST>::difference(xm1, xm0).
+    template <typename ValueType>
+    static ValueType difference(const ValueType& xm1, const ValueType& xm0)
+    {
+        return -D1<FD_1ST>::difference(xm1, xm0);
+    }
+
+    // Random access versions: read the -1 neighbor and ijk itself through
+    // the accessor.
+    template<typename Accessor>
+    static typename Accessor::ValueType inX(const Accessor& grid, const Coord& ijk)
+    {
+        const Coord behind = ijk.offsetBy(-1,0,0);
+        return difference(grid.getValue(behind), grid.getValue(ijk));
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inY(const Accessor& grid, const Coord& ijk)
+    {
+        const Coord behind = ijk.offsetBy(0,-1,0);
+        return difference(grid.getValue(behind), grid.getValue(ijk));
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inZ(const Accessor& grid, const Coord& ijk)
+    {
+        const Coord behind = ijk.offsetBy(0, 0,-1);
+        return difference(grid.getValue(behind), grid.getValue(ijk));
+    }
+
+    // Stencil access versions: same samples, read by compile-time offset.
+    template<typename Stencil>
+    static typename Stencil::ValueType inX(const Stencil& S)
+    {
+        return difference(S.template getValue<-1, 0, 0>(),
+                          S.template getValue< 0, 0, 0>());
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inY(const Stencil& S)
+    {
+        return difference(S.template getValue< 0,-1, 0>(),
+                          S.template getValue< 0, 0, 0>());
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inZ(const Stencil& S)
+    {
+        return difference(S.template getValue< 0, 0,-1>(),
+                          S.template getValue< 0, 0, 0>());
+    }
+};
+
+
+// D1 specialization for the BD_2ND scheme: the negated FD_2ND difference,
+// applied to the current location and the neighbors at -1 and -2.
+template<>
+struct D1<BD_2ND>
+{
+    // Difference operator: returns -D1<FD_2ND>::difference(xm2, xm1, xm0).
+    template <typename ValueType>
+    static ValueType difference(const ValueType& xm2, const ValueType& xm1, const ValueType& xm0)
+    {
+        return -D1<FD_2ND>::difference(xm2, xm1, xm0);
+    }
+
+    // Random access versions: read the -2 and -1 neighbors and ijk itself
+    // through the accessor.
+    template<typename Accessor>
+    static typename Accessor::ValueType inX(const Accessor& grid, const Coord& ijk)
+    {
+        const Coord m2 = ijk.offsetBy(-2,0,0), m1 = ijk.offsetBy(-1,0,0);
+        return difference(grid.getValue(m2), grid.getValue(m1), grid.getValue(ijk));
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inY(const Accessor& grid, const Coord& ijk)
+    {
+        const Coord m2 = ijk.offsetBy(0,-2,0), m1 = ijk.offsetBy(0,-1,0);
+        return difference(grid.getValue(m2), grid.getValue(m1), grid.getValue(ijk));
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inZ(const Accessor& grid, const Coord& ijk)
+    {
+        const Coord m2 = ijk.offsetBy(0,0,-2), m1 = ijk.offsetBy(0,0,-1);
+        return difference(grid.getValue(m2), grid.getValue(m1), grid.getValue(ijk));
+    }
+
+    // Stencil access versions: same samples, read by compile-time offset.
+    template<typename Stencil>
+    static typename Stencil::ValueType inX(const Stencil& S)
+    {
+        return difference(
+            S.template getValue<-2, 0, 0>(), S.template getValue<-1, 0, 0>(),
+            S.template getValue< 0, 0, 0>());
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inY(const Stencil& S)
+    {
+        return difference(
+            S.template getValue< 0,-2, 0>(), S.template getValue< 0,-1, 0>(),
+            S.template getValue< 0, 0, 0>());
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inZ(const Stencil& S)
+    {
+        return difference(
+            S.template getValue< 0, 0,-2>(), S.template getValue< 0, 0,-1>(),
+            S.template getValue< 0, 0, 0>());
+    }
+};
+
+
+// D1 specialization for the BD_3RD scheme: the negated FD_3RD difference,
+// applied to the current location and the neighbors at -1, -2 and -3.
+template<>
+struct D1<BD_3RD>
+{
+    // Difference operator: returns
+    // -D1<FD_3RD>::difference(xm3, xm2, xm1, xm0).
+    template <typename ValueType>
+    static ValueType difference(const ValueType& xm3, const ValueType& xm2,
+                                const ValueType& xm1, const ValueType& xm0)
+    {
+        return -D1<FD_3RD>::difference(xm3, xm2, xm1, xm0);
+    }
+
+    // Random access versions: read the -3, -2 and -1 neighbors and ijk
+    // itself through the accessor.
+    template<typename Accessor>
+    static typename Accessor::ValueType inX(const Accessor& grid, const Coord& ijk)
+    {
+        const Coord m3 = ijk.offsetBy(-3,0,0), m2 = ijk.offsetBy(-2,0,0);
+        const Coord m1 = ijk.offsetBy(-1,0,0);
+        return difference(grid.getValue(m3), grid.getValue(m2),
+                          grid.getValue(m1), grid.getValue(ijk));
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inY(const Accessor& grid, const Coord& ijk)
+    {
+        const Coord m3 = ijk.offsetBy( 0,-3,0), m2 = ijk.offsetBy( 0,-2,0);
+        const Coord m1 = ijk.offsetBy( 0,-1,0);
+        return difference(grid.getValue(m3), grid.getValue(m2),
+                          grid.getValue(m1), grid.getValue(ijk));
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inZ(const Accessor& grid, const Coord& ijk)
+    {
+        const Coord m3 = ijk.offsetBy( 0, 0,-3), m2 = ijk.offsetBy( 0, 0,-2);
+        const Coord m1 = ijk.offsetBy( 0, 0,-1);
+        return difference(grid.getValue(m3), grid.getValue(m2),
+                          grid.getValue(m1), grid.getValue(ijk));
+    }
+
+    // Stencil access versions: same samples, read by compile-time offset.
+    template<typename Stencil>
+    static typename Stencil::ValueType inX(const Stencil& S)
+    {
+        return difference(
+            S.template getValue<-3, 0, 0>(), S.template getValue<-2, 0, 0>(),
+            S.template getValue<-1, 0, 0>(), S.template getValue< 0, 0, 0>());
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inY(const Stencil& S)
+    {
+        return difference(
+            S.template getValue< 0,-3, 0>(), S.template getValue< 0,-2, 0>(),
+            S.template getValue< 0,-1, 0>(), S.template getValue< 0, 0, 0>());
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inZ(const Stencil& S)
+    {
+        return difference(
+            S.template getValue< 0, 0,-3>(), S.template getValue< 0, 0,-2>(),
+            S.template getValue< 0, 0,-1>(), S.template getValue< 0, 0, 0>());
+    }
+};
+
+// D1 specialization for the FD_WENO5 scheme: the difference of two WENO5
+// evaluations over six samples at offsets +3, +2, +1, 0, -1 and -2.
+template<>
+struct D1<FD_WENO5>
+{
+    // Difference operator: WENO5 over (xp3, xp2, xp1, xp0, xm1) minus WENO5
+    // over the same window shifted by one sample, (xp2, xp1, xp0, xm1, xm2).
+    template <typename ValueType>
+    static ValueType difference(const ValueType& xp3, const ValueType& xp2,
+                                const ValueType& xp1, const ValueType& xp0,
+                                const ValueType& xm1, const ValueType& xm2)
+    {
+        return WENO5<ValueType>(xp3, xp2, xp1, xp0, xm1)
+              - WENO5<ValueType>(xp2, xp1, xp0, xm1, xm2);
+    }
+
+    // Random access versions: the six samples are read through the accessor
+    // in the order +3, +2, +1, 0, -1, -2 and stored as Accessor::ValueType.
+    template<typename Accessor>
+    static typename Accessor::ValueType inX(const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueT;
+        const ValueT p3 = grid.getValue(ijk.offsetBy(3,0,0));
+        const ValueT p2 = grid.getValue(ijk.offsetBy(2,0,0));
+        const ValueT p1 = grid.getValue(ijk.offsetBy(1,0,0));
+        const ValueT p0 = grid.getValue(ijk);
+        const ValueT m1 = grid.getValue(ijk.offsetBy(-1,0,0));
+        const ValueT m2 = grid.getValue(ijk.offsetBy(-2,0,0));
+        return difference(p3, p2, p1, p0, m1, m2);
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inY(const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueT;
+        const ValueT p3 = grid.getValue(ijk.offsetBy(0,3,0));
+        const ValueT p2 = grid.getValue(ijk.offsetBy(0,2,0));
+        const ValueT p1 = grid.getValue(ijk.offsetBy(0,1,0));
+        const ValueT p0 = grid.getValue(ijk);
+        const ValueT m1 = grid.getValue(ijk.offsetBy(0,-1,0));
+        const ValueT m2 = grid.getValue(ijk.offsetBy(0,-2,0));
+        return difference(p3, p2, p1, p0, m1, m2);
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inZ(const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueT;
+        const ValueT p3 = grid.getValue(ijk.offsetBy(0,0,3));
+        const ValueT p2 = grid.getValue(ijk.offsetBy(0,0,2));
+        const ValueT p1 = grid.getValue(ijk.offsetBy(0,0,1));
+        const ValueT p0 = grid.getValue(ijk);
+        const ValueT m1 = grid.getValue(ijk.offsetBy(0,0,-1));
+        const ValueT m2 = grid.getValue(ijk.offsetBy(0,0,-2));
+        return difference(p3, p2, p1, p0, m1, m2);
+    }
+
+    // Stencil access versions: same samples, read by compile-time offset; the
+    // result of difference() is cast to Stencil::ValueType.
+    template<typename Stencil>
+    static typename Stencil::ValueType inX(const Stencil& S)
+    {
+        typedef typename Stencil::ValueType ValueT;
+        return static_cast<ValueT>(difference(
+            S.template getValue< 3, 0, 0>(), S.template getValue< 2, 0, 0>(),
+            S.template getValue< 1, 0, 0>(), S.template getValue< 0, 0, 0>(),
+            S.template getValue<-1, 0, 0>(), S.template getValue<-2, 0, 0>()));
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inY(const Stencil& S)
+    {
+        typedef typename Stencil::ValueType ValueT;
+        return static_cast<ValueT>(difference(
+            S.template getValue< 0, 3, 0>(), S.template getValue< 0, 2, 0>(),
+            S.template getValue< 0, 1, 0>(), S.template getValue< 0, 0, 0>(),
+            S.template getValue< 0,-1, 0>(), S.template getValue< 0,-2, 0>()));
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inZ(const Stencil& S)
+    {
+        typedef typename Stencil::ValueType ValueT;
+        return static_cast<ValueT>(difference(
+            S.template getValue< 0, 0, 3>(), S.template getValue< 0, 0, 2>(),
+            S.template getValue< 0, 0, 1>(), S.template getValue< 0, 0, 0>(),
+            S.template getValue< 0, 0,-1>(), S.template getValue< 0, 0,-2>()));
+    }
+};
+
+// D1 specialization for the FD_HJWENO5 scheme: a single WENO5 evaluation
+// applied to the five consecutive differences of six samples at offsets
+// +3, +2, +1, 0, -1 and -2.
+template<>
+struct D1<FD_HJWENO5>
+{
+    // Difference operator: passes the adjacent-sample differences
+    // (xp3-xp2, xp2-xp1, xp1-xp0, xp0-xm1, xm1-xm2) to WENO5.
+    template <typename ValueType>
+    static ValueType difference(const ValueType& xp3, const ValueType& xp2,
+                                const ValueType& xp1, const ValueType& xp0,
+                                const ValueType& xm1, const ValueType& xm2)
+    {
+        return WENO5<ValueType>(xp3 - xp2,
+                                xp2 - xp1,
+                                xp1 - xp0,
+                                xp0-xm1,
+                                xm1-xm2);
+    }
+
+    // Random access versions: the six samples are read through the accessor
+    // in the order +3, +2, +1, 0, -1, -2 and stored as Accessor::ValueType.
+    template<typename Accessor>
+    static typename Accessor::ValueType inX(const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueT;
+        const ValueT p3 = grid.getValue(ijk.offsetBy(3,0,0));
+        const ValueT p2 = grid.getValue(ijk.offsetBy(2,0,0));
+        const ValueT p1 = grid.getValue(ijk.offsetBy(1,0,0));
+        const ValueT p0 = grid.getValue(ijk);
+        const ValueT m1 = grid.getValue(ijk.offsetBy(-1,0,0));
+        const ValueT m2 = grid.getValue(ijk.offsetBy(-2,0,0));
+        return difference(p3, p2, p1, p0, m1, m2);
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inY(const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueT;
+        const ValueT p3 = grid.getValue(ijk.offsetBy(0,3,0));
+        const ValueT p2 = grid.getValue(ijk.offsetBy(0,2,0));
+        const ValueT p1 = grid.getValue(ijk.offsetBy(0,1,0));
+        const ValueT p0 = grid.getValue(ijk);
+        const ValueT m1 = grid.getValue(ijk.offsetBy(0,-1,0));
+        const ValueT m2 = grid.getValue(ijk.offsetBy(0,-2,0));
+        return difference(p3, p2, p1, p0, m1, m2);
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inZ(const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueT;
+        const ValueT p3 = grid.getValue(ijk.offsetBy(0,0,3));
+        const ValueT p2 = grid.getValue(ijk.offsetBy(0,0,2));
+        const ValueT p1 = grid.getValue(ijk.offsetBy(0,0,1));
+        const ValueT p0 = grid.getValue(ijk);
+        const ValueT m1 = grid.getValue(ijk.offsetBy(0,0,-1));
+        const ValueT m2 = grid.getValue(ijk.offsetBy(0,0,-2));
+        return difference(p3, p2, p1, p0, m1, m2);
+    }
+
+    // Stencil access versions: same samples, read by compile-time offset.
+    template<typename Stencil>
+    static typename Stencil::ValueType inX(const Stencil& S)
+    {
+        return difference(
+            S.template getValue< 3, 0, 0>(), S.template getValue< 2, 0, 0>(),
+            S.template getValue< 1, 0, 0>(), S.template getValue< 0, 0, 0>(),
+            S.template getValue<-1, 0, 0>(), S.template getValue<-2, 0, 0>());
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inY(const Stencil& S)
+    {
+        return difference(
+            S.template getValue< 0, 3, 0>(), S.template getValue< 0, 2, 0>(),
+            S.template getValue< 0, 1, 0>(), S.template getValue< 0, 0, 0>(),
+            S.template getValue< 0,-1, 0>(), S.template getValue< 0,-2, 0>());
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inZ(const Stencil& S)
+    {
+        return difference(
+            S.template getValue< 0, 0, 3>(), S.template getValue< 0, 0, 2>(),
+            S.template getValue< 0, 0, 1>(), S.template getValue< 0, 0, 0>(),
+            S.template getValue< 0, 0,-1>(), S.template getValue< 0, 0,-2>());
+    }
+};
+
+template<>
+struct D1<BD_WENO5>
+{
+    /// Six-sample difference kernel. Arguments run from offset -3 (xm3) up to
+    /// offset +2 (xp2). The result is the negation of D1<FD_WENO5>::difference
+    /// evaluated on the same arguments in the same order.
+    template<typename ValueType>
+    static ValueType difference(const ValueType& xm3, const ValueType& xm2, const ValueType& xm1,
+                                const ValueType& xm0, const ValueType& xp1, const ValueType& xp2)
+    {
+        return -D1<FD_WENO5>::difference(xm3, xm2, xm1, xm0, xp1, xp2);
+    }
+
+
+    // ---- random access versions: samples are read via grid.getValue() ----
+
+    /// Difference along x at ijk, built from the samples at x-offsets -3..+2.
+    template<typename Accessor>
+    static typename Accessor::ValueType inX(const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueT;
+        const ValueT m3 = grid.getValue(ijk.offsetBy(-3, 0, 0));
+        const ValueT m2 = grid.getValue(ijk.offsetBy(-2, 0, 0));
+        const ValueT m1 = grid.getValue(ijk.offsetBy(-1, 0, 0));
+        const ValueT c0 = grid.getValue(ijk);
+        const ValueT p1 = grid.getValue(ijk.offsetBy( 1, 0, 0));
+        const ValueT p2 = grid.getValue(ijk.offsetBy( 2, 0, 0));
+
+        return difference(m3, m2, m1, c0, p1, p2);
+    }
+
+    /// Difference along y at ijk, built from the samples at y-offsets -3..+2.
+    template<typename Accessor>
+    static typename Accessor::ValueType inY(const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueT;
+        const ValueT m3 = grid.getValue(ijk.offsetBy(0, -3, 0));
+        const ValueT m2 = grid.getValue(ijk.offsetBy(0, -2, 0));
+        const ValueT m1 = grid.getValue(ijk.offsetBy(0, -1, 0));
+        const ValueT c0 = grid.getValue(ijk);
+        const ValueT p1 = grid.getValue(ijk.offsetBy(0,  1, 0));
+        const ValueT p2 = grid.getValue(ijk.offsetBy(0,  2, 0));
+
+        return difference(m3, m2, m1, c0, p1, p2);
+    }
+
+    /// Difference along z at ijk, built from the samples at z-offsets -3..+2.
+    template<typename Accessor>
+    static typename Accessor::ValueType inZ(const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueT;
+        const ValueT m3 = grid.getValue(ijk.offsetBy(0, 0, -3));
+        const ValueT m2 = grid.getValue(ijk.offsetBy(0, 0, -2));
+        const ValueT m1 = grid.getValue(ijk.offsetBy(0, 0, -1));
+        const ValueT c0 = grid.getValue(ijk);
+        const ValueT p1 = grid.getValue(ijk.offsetBy(0, 0,  1));
+        const ValueT p2 = grid.getValue(ijk.offsetBy(0, 0,  2));
+
+        return difference(m3, m2, m1, c0, p1, p2);
+    }
+
+    // ---- stencil access versions: samples are read via S.getValue<i,j,k>() ----
+
+    /// Difference along x, built from the stencil samples at x-offsets -3..+2.
+    template<typename Stencil>
+    static typename Stencil::ValueType inX(const Stencil& S)
+    {
+        typedef typename Stencil::ValueType ValueT;
+        const ValueT m3 = S.template getValue<-3, 0, 0>();
+        const ValueT m2 = S.template getValue<-2, 0, 0>();
+        const ValueT m1 = S.template getValue<-1, 0, 0>();
+        const ValueT c0 = S.template getValue< 0, 0, 0>();
+        const ValueT p1 = S.template getValue< 1, 0, 0>();
+        const ValueT p2 = S.template getValue< 2, 0, 0>();
+
+        return difference(m3, m2, m1, c0, p1, p2);
+    }
+
+    /// Difference along y, built from the stencil samples at y-offsets -3..+2.
+    template<typename Stencil>
+    static typename Stencil::ValueType inY(const Stencil& S)
+    {
+        typedef typename Stencil::ValueType ValueT;
+        const ValueT m3 = S.template getValue< 0,-3, 0>();
+        const ValueT m2 = S.template getValue< 0,-2, 0>();
+        const ValueT m1 = S.template getValue< 0,-1, 0>();
+        const ValueT c0 = S.template getValue< 0, 0, 0>();
+        const ValueT p1 = S.template getValue< 0, 1, 0>();
+        const ValueT p2 = S.template getValue< 0, 2, 0>();
+
+        return difference(m3, m2, m1, c0, p1, p2);
+    }
+
+    /// Difference along z, built from the stencil samples at z-offsets -3..+2.
+    template<typename Stencil>
+    static typename Stencil::ValueType inZ(const Stencil& S)
+    {
+        typedef typename Stencil::ValueType ValueT;
+        const ValueT m3 = S.template getValue< 0, 0,-3>();
+        const ValueT m2 = S.template getValue< 0, 0,-2>();
+        const ValueT m1 = S.template getValue< 0, 0,-1>();
+        const ValueT c0 = S.template getValue< 0, 0, 0>();
+        const ValueT p1 = S.template getValue< 0, 0, 1>();
+        const ValueT p2 = S.template getValue< 0, 0, 2>();
+
+        return difference(m3, m2, m1, c0, p1, p2);
+    }
+};
+
+
+template<>
+struct D1<BD_HJWENO5>
+{
+    /// Six-sample difference kernel taking the samples at offsets -3..+2
+    /// (xm3 first, xp2 last). It returns the negated value of
+    /// D1<FD_HJWENO5>::difference called with the identical argument list.
+    template<typename ValueType>
+    static ValueType difference(const ValueType& xm3, const ValueType& xm2, const ValueType& xm1,
+                                const ValueType& xm0, const ValueType& xp1, const ValueType& xp2)
+    {
+        return -D1<FD_HJWENO5>::difference(xm3, xm2, xm1, xm0, xp1, xp2);
+    }
+
+    // ---- random access versions ----
+
+    /// Reads the six x-neighbours (offsets -3..+2) of ijk and applies difference().
+    template<typename Accessor>
+    static typename Accessor::ValueType inX(const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueT;
+        const ValueT vm3 = grid.getValue(ijk.offsetBy(-3, 0, 0));
+        const ValueT vm2 = grid.getValue(ijk.offsetBy(-2, 0, 0));
+        const ValueT vm1 = grid.getValue(ijk.offsetBy(-1, 0, 0));
+        const ValueT v0  = grid.getValue(ijk);
+        const ValueT vp1 = grid.getValue(ijk.offsetBy( 1, 0, 0));
+        const ValueT vp2 = grid.getValue(ijk.offsetBy( 2, 0, 0));
+
+        return difference(vm3, vm2, vm1, v0, vp1, vp2);
+    }
+
+    /// Reads the six y-neighbours (offsets -3..+2) of ijk and applies difference().
+    template<typename Accessor>
+    static typename Accessor::ValueType inY(const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueT;
+        const ValueT vm3 = grid.getValue(ijk.offsetBy(0, -3, 0));
+        const ValueT vm2 = grid.getValue(ijk.offsetBy(0, -2, 0));
+        const ValueT vm1 = grid.getValue(ijk.offsetBy(0, -1, 0));
+        const ValueT v0  = grid.getValue(ijk);
+        const ValueT vp1 = grid.getValue(ijk.offsetBy(0,  1, 0));
+        const ValueT vp2 = grid.getValue(ijk.offsetBy(0,  2, 0));
+
+        return difference(vm3, vm2, vm1, v0, vp1, vp2);
+    }
+
+    /// Reads the six z-neighbours (offsets -3..+2) of ijk and applies difference().
+    template<typename Accessor>
+    static typename Accessor::ValueType inZ(const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueT;
+        const ValueT vm3 = grid.getValue(ijk.offsetBy(0, 0, -3));
+        const ValueT vm2 = grid.getValue(ijk.offsetBy(0, 0, -2));
+        const ValueT vm1 = grid.getValue(ijk.offsetBy(0, 0, -1));
+        const ValueT v0  = grid.getValue(ijk);
+        const ValueT vp1 = grid.getValue(ijk.offsetBy(0, 0,  1));
+        const ValueT vp2 = grid.getValue(ijk.offsetBy(0, 0,  2));
+
+        return difference(vm3, vm2, vm1, v0, vp1, vp2);
+    }
+
+    // ---- stencil access versions ----
+
+    /// Same as the random-access inX, with samples taken from the stencil.
+    template<typename Stencil>
+    static typename Stencil::ValueType inX(const Stencil& S)
+    {
+        typedef typename Stencil::ValueType ValueT;
+        const ValueT vm3 = S.template getValue<-3, 0, 0>();
+        const ValueT vm2 = S.template getValue<-2, 0, 0>();
+        const ValueT vm1 = S.template getValue<-1, 0, 0>();
+        const ValueT v0  = S.template getValue< 0, 0, 0>();
+        const ValueT vp1 = S.template getValue< 1, 0, 0>();
+        const ValueT vp2 = S.template getValue< 2, 0, 0>();
+
+        return difference(vm3, vm2, vm1, v0, vp1, vp2);
+    }
+
+    /// Same as the random-access inY, with samples taken from the stencil.
+    template<typename Stencil>
+    static typename Stencil::ValueType inY(const Stencil& S)
+    {
+        typedef typename Stencil::ValueType ValueT;
+        const ValueT vm3 = S.template getValue< 0,-3, 0>();
+        const ValueT vm2 = S.template getValue< 0,-2, 0>();
+        const ValueT vm1 = S.template getValue< 0,-1, 0>();
+        const ValueT v0  = S.template getValue< 0, 0, 0>();
+        const ValueT vp1 = S.template getValue< 0, 1, 0>();
+        const ValueT vp2 = S.template getValue< 0, 2, 0>();
+
+        return difference(vm3, vm2, vm1, v0, vp1, vp2);
+    }
+
+    /// Same as the random-access inZ, with samples taken from the stencil.
+    template<typename Stencil>
+    static typename Stencil::ValueType inZ(const Stencil& S)
+    {
+        typedef typename Stencil::ValueType ValueT;
+        const ValueT vm3 = S.template getValue< 0, 0,-3>();
+        const ValueT vm2 = S.template getValue< 0, 0,-2>();
+        const ValueT vm1 = S.template getValue< 0, 0,-1>();
+        const ValueT v0  = S.template getValue< 0, 0, 0>();
+        const ValueT vp1 = S.template getValue< 0, 0, 1>();
+        const ValueT vp2 = S.template getValue< 0, 0, 2>();
+
+        return difference(vm3, vm2, vm1, v0, vp1, vp2);
+    }
+};
+
+
+/// Generic per-component first difference of a vector-valued grid: the
+/// D1<DiffScheme> operator is applied to the whole vector value and
+/// component @a n of the result is returned.
+template<DScheme DiffScheme>
+struct D1Vec
+{
+    // ---- random access versions ----
+
+    /// Component n of D1<DiffScheme>::inX(grid, ijk).
+    template<typename Accessor>
+    static typename Accessor::ValueType::value_type
+    inX(const Accessor& grid, const Coord& ijk, int n)
+    {
+        const typename Accessor::ValueType deriv = D1<DiffScheme>::inX(grid, ijk);
+        return deriv[n];
+    }
+
+    /// Component n of D1<DiffScheme>::inY(grid, ijk).
+    template<typename Accessor>
+    static typename Accessor::ValueType::value_type
+    inY(const Accessor& grid, const Coord& ijk, int n)
+    {
+        const typename Accessor::ValueType deriv = D1<DiffScheme>::inY(grid, ijk);
+        return deriv[n];
+    }
+
+    /// Component n of D1<DiffScheme>::inZ(grid, ijk).
+    template<typename Accessor>
+    static typename Accessor::ValueType::value_type
+    inZ(const Accessor& grid, const Coord& ijk, int n)
+    {
+        const typename Accessor::ValueType deriv = D1<DiffScheme>::inZ(grid, ijk);
+        return deriv[n];
+    }
+
+
+    // ---- stencil access versions ----
+
+    /// Component n of D1<DiffScheme>::inX(S).
+    template<typename Stencil>
+    static typename Stencil::ValueType::value_type inX(const Stencil& S, int n)
+    {
+        const typename Stencil::ValueType deriv = D1<DiffScheme>::inX(S);
+        return deriv[n];
+    }
+
+    /// Component n of D1<DiffScheme>::inY(S).
+    template<typename Stencil>
+    static typename Stencil::ValueType::value_type inY(const Stencil& S, int n)
+    {
+        const typename Stencil::ValueType deriv = D1<DiffScheme>::inY(S);
+        return deriv[n];
+    }
+
+    /// Component n of D1<DiffScheme>::inZ(S).
+    template<typename Stencil>
+    static typename Stencil::ValueType::value_type inZ(const Stencil& S, int n)
+    {
+        const typename Stencil::ValueType deriv = D1<DiffScheme>::inZ(S);
+        return deriv[n];
+    }
+};
+
+
+/// CD_2NDT specialization: only component @a n of the two neighbouring
+/// vector samples (offsets +1 and -1) is extracted, and the scalar kernel
+/// D1<CD_2NDT>::difference is applied to that pair.
+template<>
+struct D1Vec<CD_2NDT>
+{
+
+    // ---- random access versions ----
+
+    template<typename Accessor>
+    static typename Accessor::ValueType::value_type
+    inX(const Accessor& grid, const Coord& ijk, int n)
+    {
+        typedef typename Accessor::ValueType::value_type ElemT;
+        const ElemT ahead  = grid.getValue(ijk.offsetBy( 1, 0, 0))[n];
+        const ElemT behind = grid.getValue(ijk.offsetBy(-1, 0, 0))[n];
+        return D1<CD_2NDT>::difference(ahead, behind);
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType::value_type
+    inY(const Accessor& grid, const Coord& ijk, int n)
+    {
+        typedef typename Accessor::ValueType::value_type ElemT;
+        const ElemT ahead  = grid.getValue(ijk.offsetBy(0,  1, 0))[n];
+        const ElemT behind = grid.getValue(ijk.offsetBy(0, -1, 0))[n];
+        return D1<CD_2NDT>::difference(ahead, behind);
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType::value_type
+    inZ(const Accessor& grid, const Coord& ijk, int n)
+    {
+        typedef typename Accessor::ValueType::value_type ElemT;
+        const ElemT ahead  = grid.getValue(ijk.offsetBy(0, 0,  1))[n];
+        const ElemT behind = grid.getValue(ijk.offsetBy(0, 0, -1))[n];
+        return D1<CD_2NDT>::difference(ahead, behind);
+    }
+
+    // ---- stencil access versions ----
+
+    template<typename Stencil>
+    static typename Stencil::ValueType::value_type inX(const Stencil& S, int n)
+    {
+        typedef typename Stencil::ValueType::value_type ElemT;
+        const ElemT ahead  = S.template getValue< 1, 0, 0>()[n];
+        const ElemT behind = S.template getValue<-1, 0, 0>()[n];
+        return D1<CD_2NDT>::difference(ahead, behind);
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType::value_type inY(const Stencil& S, int n)
+    {
+        typedef typename Stencil::ValueType::value_type ElemT;
+        const ElemT ahead  = S.template getValue< 0, 1, 0>()[n];
+        const ElemT behind = S.template getValue< 0,-1, 0>()[n];
+        return D1<CD_2NDT>::difference(ahead, behind);
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType::value_type inZ(const Stencil& S, int n)
+    {
+        typedef typename Stencil::ValueType::value_type ElemT;
+        const ElemT ahead  = S.template getValue< 0, 0, 1>()[n];
+        const ElemT behind = S.template getValue< 0, 0,-1>()[n];
+        return D1<CD_2NDT>::difference(ahead, behind);
+    }
+};
+
+/// CD_2ND specialization: component @a n of the vector samples at offsets
+/// +1 and -1 along the chosen axis is passed to D1<CD_2ND>::difference.
+template<>
+struct D1Vec<CD_2ND>
+{
+
+    // ---- random access versions ----
+
+    template<typename Accessor>
+    static typename Accessor::ValueType::value_type
+    inX(const Accessor& grid, const Coord& ijk, int n)
+    {
+        typedef typename Accessor::ValueType::value_type ElemT;
+        const ElemT plus  = grid.getValue(ijk.offsetBy( 1, 0, 0))[n];
+        const ElemT minus = grid.getValue(ijk.offsetBy(-1, 0, 0))[n];
+        return D1<CD_2ND>::difference(plus, minus);
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType::value_type
+    inY(const Accessor& grid, const Coord& ijk, int n)
+    {
+        typedef typename Accessor::ValueType::value_type ElemT;
+        const ElemT plus  = grid.getValue(ijk.offsetBy(0,  1, 0))[n];
+        const ElemT minus = grid.getValue(ijk.offsetBy(0, -1, 0))[n];
+        return D1<CD_2ND>::difference(plus, minus);
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType::value_type
+    inZ(const Accessor& grid, const Coord& ijk, int n)
+    {
+        typedef typename Accessor::ValueType::value_type ElemT;
+        const ElemT plus  = grid.getValue(ijk.offsetBy(0, 0,  1))[n];
+        const ElemT minus = grid.getValue(ijk.offsetBy(0, 0, -1))[n];
+        return D1<CD_2ND>::difference(plus, minus);
+    }
+
+
+    // ---- stencil access versions ----
+
+    template<typename Stencil>
+    static typename Stencil::ValueType::value_type inX(const Stencil& S, int n)
+    {
+        typedef typename Stencil::ValueType::value_type ElemT;
+        const ElemT plus  = S.template getValue< 1, 0, 0>()[n];
+        const ElemT minus = S.template getValue<-1, 0, 0>()[n];
+        return D1<CD_2ND>::difference(plus, minus);
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType::value_type inY(const Stencil& S, int n)
+    {
+        typedef typename Stencil::ValueType::value_type ElemT;
+        const ElemT plus  = S.template getValue< 0, 1, 0>()[n];
+        const ElemT minus = S.template getValue< 0,-1, 0>()[n];
+        return D1<CD_2ND>::difference(plus, minus);
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType::value_type inZ(const Stencil& S, int n)
+    {
+        typedef typename Stencil::ValueType::value_type ElemT;
+        const ElemT plus  = S.template getValue< 0, 0, 1>()[n];
+        const ElemT minus = S.template getValue< 0, 0,-1>()[n];
+        return D1<CD_2ND>::difference(plus, minus);
+    }
+};
+
+
+/// CD_4TH specialization: component @a n of the vector samples at offsets
+/// +2, +1, -1, -2 along the chosen axis is passed, in that order, to
+/// D1<CD_4TH>::difference.
+template<>
+struct D1Vec<CD_4TH> {
+
+    // ---- random access versions ----
+
+    template<typename Accessor>
+    static typename Accessor::ValueType::value_type
+    inX(const Accessor& grid, const Coord& ijk, int n)
+    {
+        typedef typename Accessor::ValueType::value_type ElemT;
+        const ElemT p2 = grid.getValue(ijk.offsetBy( 2, 0, 0))[n];
+        const ElemT p1 = grid.getValue(ijk.offsetBy( 1, 0, 0))[n];
+        const ElemT m1 = grid.getValue(ijk.offsetBy(-1, 0, 0))[n];
+        const ElemT m2 = grid.getValue(ijk.offsetBy(-2, 0, 0))[n];
+        return D1<CD_4TH>::difference(p2, p1, m1, m2);
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType::value_type
+    inY(const Accessor& grid, const Coord& ijk, int n)
+    {
+        typedef typename Accessor::ValueType::value_type ElemT;
+        const ElemT p2 = grid.getValue(ijk.offsetBy(0,  2, 0))[n];
+        const ElemT p1 = grid.getValue(ijk.offsetBy(0,  1, 0))[n];
+        const ElemT m1 = grid.getValue(ijk.offsetBy(0, -1, 0))[n];
+        const ElemT m2 = grid.getValue(ijk.offsetBy(0, -2, 0))[n];
+        return D1<CD_4TH>::difference(p2, p1, m1, m2);
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType::value_type
+    inZ(const Accessor& grid, const Coord& ijk, int n)
+    {
+        typedef typename Accessor::ValueType::value_type ElemT;
+        const ElemT p2 = grid.getValue(ijk.offsetBy(0, 0,  2))[n];
+        const ElemT p1 = grid.getValue(ijk.offsetBy(0, 0,  1))[n];
+        const ElemT m1 = grid.getValue(ijk.offsetBy(0, 0, -1))[n];
+        const ElemT m2 = grid.getValue(ijk.offsetBy(0, 0, -2))[n];
+        return D1<CD_4TH>::difference(p2, p1, m1, m2);
+    }
+
+    // ---- stencil access versions ----
+
+    template<typename Stencil>
+    static typename Stencil::ValueType::value_type inX(const Stencil& S, int n)
+    {
+        typedef typename Stencil::ValueType::value_type ElemT;
+        const ElemT p2 = S.template getValue< 2, 0, 0>()[n];
+        const ElemT p1 = S.template getValue< 1, 0, 0>()[n];
+        const ElemT m1 = S.template getValue<-1, 0, 0>()[n];
+        const ElemT m2 = S.template getValue<-2, 0, 0>()[n];
+        return D1<CD_4TH>::difference(p2, p1, m1, m2);
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType::value_type inY(const Stencil& S, int n)
+    {
+        typedef typename Stencil::ValueType::value_type ElemT;
+        const ElemT p2 = S.template getValue< 0, 2, 0>()[n];
+        const ElemT p1 = S.template getValue< 0, 1, 0>()[n];
+        const ElemT m1 = S.template getValue< 0,-1, 0>()[n];
+        const ElemT m2 = S.template getValue< 0,-2, 0>()[n];
+        return D1<CD_4TH>::difference(p2, p1, m1, m2);
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType::value_type inZ(const Stencil& S, int n)
+    {
+        typedef typename Stencil::ValueType::value_type ElemT;
+        const ElemT p2 = S.template getValue< 0, 0, 2>()[n];
+        const ElemT p1 = S.template getValue< 0, 0, 1>()[n];
+        const ElemT m1 = S.template getValue< 0, 0,-1>()[n];
+        const ElemT m2 = S.template getValue< 0, 0,-2>()[n];
+        return D1<CD_4TH>::difference(p2, p1, m1, m2);
+    }
+};
+
+
+/// CD_6TH specialization: component @a n of the vector samples at offsets
+/// +3, +2, +1, -1, -2, -3 along the chosen axis is passed, in that order,
+/// to D1<CD_6TH>::difference.
+template<>
+struct D1Vec<CD_6TH>
+{
+    // ---- random access versions ----
+
+    template<typename Accessor>
+    static typename Accessor::ValueType::value_type
+    inX(const Accessor& grid, const Coord& ijk, int n)
+    {
+        typedef typename Accessor::ValueType::value_type ElemT;
+        const ElemT p3 = grid.getValue(ijk.offsetBy( 3, 0, 0))[n];
+        const ElemT p2 = grid.getValue(ijk.offsetBy( 2, 0, 0))[n];
+        const ElemT p1 = grid.getValue(ijk.offsetBy( 1, 0, 0))[n];
+        const ElemT m1 = grid.getValue(ijk.offsetBy(-1, 0, 0))[n];
+        const ElemT m2 = grid.getValue(ijk.offsetBy(-2, 0, 0))[n];
+        const ElemT m3 = grid.getValue(ijk.offsetBy(-3, 0, 0))[n];
+        return D1<CD_6TH>::difference(p3, p2, p1, m1, m2, m3);
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType::value_type
+    inY(const Accessor& grid, const Coord& ijk, int n)
+    {
+        typedef typename Accessor::ValueType::value_type ElemT;
+        const ElemT p3 = grid.getValue(ijk.offsetBy(0,  3, 0))[n];
+        const ElemT p2 = grid.getValue(ijk.offsetBy(0,  2, 0))[n];
+        const ElemT p1 = grid.getValue(ijk.offsetBy(0,  1, 0))[n];
+        const ElemT m1 = grid.getValue(ijk.offsetBy(0, -1, 0))[n];
+        const ElemT m2 = grid.getValue(ijk.offsetBy(0, -2, 0))[n];
+        const ElemT m3 = grid.getValue(ijk.offsetBy(0, -3, 0))[n];
+        return D1<CD_6TH>::difference(p3, p2, p1, m1, m2, m3);
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType::value_type
+    inZ(const Accessor& grid, const Coord& ijk, int n)
+    {
+        typedef typename Accessor::ValueType::value_type ElemT;
+        const ElemT p3 = grid.getValue(ijk.offsetBy(0, 0,  3))[n];
+        const ElemT p2 = grid.getValue(ijk.offsetBy(0, 0,  2))[n];
+        const ElemT p1 = grid.getValue(ijk.offsetBy(0, 0,  1))[n];
+        const ElemT m1 = grid.getValue(ijk.offsetBy(0, 0, -1))[n];
+        const ElemT m2 = grid.getValue(ijk.offsetBy(0, 0, -2))[n];
+        const ElemT m3 = grid.getValue(ijk.offsetBy(0, 0, -3))[n];
+        return D1<CD_6TH>::difference(p3, p2, p1, m1, m2, m3);
+    }
+
+
+    // ---- stencil access versions ----
+
+    template<typename Stencil>
+    static typename Stencil::ValueType::value_type inX(const Stencil& S, int n)
+    {
+        typedef typename Stencil::ValueType::value_type ElemT;
+        const ElemT p3 = S.template getValue< 3, 0, 0>()[n];
+        const ElemT p2 = S.template getValue< 2, 0, 0>()[n];
+        const ElemT p1 = S.template getValue< 1, 0, 0>()[n];
+        const ElemT m1 = S.template getValue<-1, 0, 0>()[n];
+        const ElemT m2 = S.template getValue<-2, 0, 0>()[n];
+        const ElemT m3 = S.template getValue<-3, 0, 0>()[n];
+        return D1<CD_6TH>::difference(p3, p2, p1, m1, m2, m3);
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType::value_type inY(const Stencil& S, int n)
+    {
+        typedef typename Stencil::ValueType::value_type ElemT;
+        const ElemT p3 = S.template getValue< 0, 3, 0>()[n];
+        const ElemT p2 = S.template getValue< 0, 2, 0>()[n];
+        const ElemT p1 = S.template getValue< 0, 1, 0>()[n];
+        const ElemT m1 = S.template getValue< 0,-1, 0>()[n];
+        const ElemT m2 = S.template getValue< 0,-2, 0>()[n];
+        const ElemT m3 = S.template getValue< 0,-3, 0>()[n];
+        return D1<CD_6TH>::difference(p3, p2, p1, m1, m2, m3);
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType::value_type inZ(const Stencil& S, int n)
+    {
+        typedef typename Stencil::ValueType::value_type ElemT;
+        const ElemT p3 = S.template getValue< 0, 0, 3>()[n];
+        const ElemT p2 = S.template getValue< 0, 0, 2>()[n];
+        const ElemT p1 = S.template getValue< 0, 0, 1>()[n];
+        const ElemT m1 = S.template getValue< 0, 0,-1>()[n];
+        const ElemT m2 = S.template getValue< 0, 0,-2>()[n];
+        const ElemT m3 = S.template getValue< 0, 0,-3>()[n];
+        return D1<CD_6TH>::difference(p3, p2, p1, m1, m2, m3);
+    }
+};
+
+/// Primary template for the second-difference operators, selected by a
+/// DDScheme value. Only declarations appear here; the definitions are
+/// supplied by the explicit specializations (e.g. D2<CD_SECOND>).
+template<DDScheme DiffScheme>
+struct D2
+{
+    // ---- random access versions: value read from grid at / around ijk ----
+
+    // pure second differences along one axis
+    template<typename Accessor>
+    static typename Accessor::ValueType inX(const Accessor& grid, const Coord& ijk);
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inY(const Accessor& grid, const Coord& ijk);
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inZ(const Accessor& grid, const Coord& ijk);
+
+    // cross (mixed) differences over a pair of axes
+    template<typename Accessor>
+    static typename Accessor::ValueType inXandY(const Accessor& grid, const Coord& ijk);
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inXandZ(const Accessor& grid, const Coord& ijk);
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inYandZ(const Accessor& grid, const Coord& ijk);
+
+
+    // ---- stencil access versions: values read from stencil S ----
+
+    // pure second differences along one axis
+    template<typename Stencil>
+    static typename Stencil::ValueType inX(const Stencil& S);
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inY(const Stencil& S);
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inZ(const Stencil& S);
+
+    // cross (mixed) differences over a pair of axes
+    template<typename Stencil>
+    static typename Stencil::ValueType inXandY(const Stencil& S);
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inXandZ(const Stencil& S);
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inYandZ(const Stencil& S);
+};
+
+/// Second-difference operators built from the three-point kernel
+/// (x[+1] + x[-1] - 2*x[0]) and the four-corner cross kernel.
+/// No division by a grid spacing is performed in this struct.
+template<>
+struct D2<CD_SECOND>
+{
+
+    /// The difference operator: xp1 + xm1 - 2*xp0.
+    template <typename ValueType>
+    static ValueType difference(const ValueType& xp1, const ValueType& xp0, const ValueType& xm1)
+    {
+        return xp1 + xm1 - ValueType(2)*xp0;
+    }
+
+    /// The cross-difference operator on the four diagonal neighbours:
+    /// 0.25 * (xpyp + xmym - xpym - xmyp).
+    template <typename ValueType>
+    static ValueType crossdifference(const ValueType& xpyp, const ValueType& xpym,
+                                     const ValueType& xmyp, const ValueType& xmym)
+    {
+        return ValueType(0.25)*(xpyp + xmym - xpym - xmyp);
+    }
+
+    // ---- random access versions ----
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inX(const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueT;
+        const ValueT next = grid.getValue(ijk.offsetBy( 1, 0, 0));
+        const ValueT here = grid.getValue(ijk);
+        const ValueT prev = grid.getValue(ijk.offsetBy(-1, 0, 0));
+        return difference(next, here, prev);
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inY(const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueT;
+        const ValueT next = grid.getValue(ijk.offsetBy(0,  1, 0));
+        const ValueT here = grid.getValue(ijk);
+        const ValueT prev = grid.getValue(ijk.offsetBy(0, -1, 0));
+        return difference(next, here, prev);
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inZ(const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueT;
+        const ValueT next = grid.getValue(ijk.offsetBy(0, 0,  1));
+        const ValueT here = grid.getValue(ijk);
+        const ValueT prev = grid.getValue(ijk.offsetBy(0, 0, -1));
+        return difference(next, here, prev);
+    }
+
+    // cross derivatives: the four diagonal neighbours in the named plane
+    template<typename Accessor>
+    static typename Accessor::ValueType inXandY(const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueT;
+        const ValueT pp = grid.getValue(ijk.offsetBy( 1,  1, 0));
+        const ValueT pm = grid.getValue(ijk.offsetBy( 1, -1, 0));
+        const ValueT mp = grid.getValue(ijk.offsetBy(-1,  1, 0));
+        const ValueT mm = grid.getValue(ijk.offsetBy(-1, -1, 0));
+        return crossdifference(pp, pm, mp, mm);
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inXandZ(const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueT;
+        const ValueT pp = grid.getValue(ijk.offsetBy( 1, 0,  1));
+        const ValueT pm = grid.getValue(ijk.offsetBy( 1, 0, -1));
+        const ValueT mp = grid.getValue(ijk.offsetBy(-1, 0,  1));
+        const ValueT mm = grid.getValue(ijk.offsetBy(-1, 0, -1));
+        return crossdifference(pp, pm, mp, mm);
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inYandZ(const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueT;
+        const ValueT pp = grid.getValue(ijk.offsetBy(0,  1,  1));
+        const ValueT pm = grid.getValue(ijk.offsetBy(0,  1, -1));
+        const ValueT mp = grid.getValue(ijk.offsetBy(0, -1,  1));
+        const ValueT mm = grid.getValue(ijk.offsetBy(0, -1, -1));
+        return crossdifference(pp, pm, mp, mm);
+    }
+
+
+    // ---- stencil access versions ----
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inX(const Stencil& S)
+    {
+        typedef typename Stencil::ValueType ValueT;
+        const ValueT next = S.template getValue< 1, 0, 0>();
+        const ValueT here = S.template getValue< 0, 0, 0>();
+        const ValueT prev = S.template getValue<-1, 0, 0>();
+        return difference(next, here, prev);
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inY(const Stencil& S)
+    {
+        typedef typename Stencil::ValueType ValueT;
+        const ValueT next = S.template getValue< 0, 1, 0>();
+        const ValueT here = S.template getValue< 0, 0, 0>();
+        const ValueT prev = S.template getValue< 0,-1, 0>();
+        return difference(next, here, prev);
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inZ(const Stencil& S)
+    {
+        typedef typename Stencil::ValueType ValueT;
+        const ValueT next = S.template getValue< 0, 0, 1>();
+        const ValueT here = S.template getValue< 0, 0, 0>();
+        const ValueT prev = S.template getValue< 0, 0,-1>();
+        return difference(next, here, prev);
+    }
+
+    // cross derivatives: the four diagonal neighbours in the named plane
+    template<typename Stencil>
+    static typename Stencil::ValueType inXandY(const Stencil& S)
+    {
+        typedef typename Stencil::ValueType ValueT;
+        const ValueT pp = S.template getValue< 1, 1, 0>();
+        const ValueT pm = S.template getValue< 1,-1, 0>();
+        const ValueT mp = S.template getValue<-1, 1, 0>();
+        const ValueT mm = S.template getValue<-1,-1, 0>();
+        return crossdifference(pp, pm, mp, mm);
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inXandZ(const Stencil& S)
+    {
+        typedef typename Stencil::ValueType ValueT;
+        const ValueT pp = S.template getValue< 1, 0, 1>();
+        const ValueT pm = S.template getValue< 1, 0,-1>();
+        const ValueT mp = S.template getValue<-1, 0, 1>();
+        const ValueT mm = S.template getValue<-1, 0,-1>();
+        return crossdifference(pp, pm, mp, mm);
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inYandZ(const Stencil& S)
+    {
+        typedef typename Stencil::ValueType ValueT;
+        const ValueT pp = S.template getValue< 0, 1, 1>();
+        const ValueT pm = S.template getValue< 0, 1,-1>();
+        const ValueT mp = S.template getValue< 0,-1, 1>();
+        const ValueT mm = S.template getValue< 0,-1,-1>();
+        return crossdifference(pp, pm, mp, mm);
+    }
+};
+
+
+/// Second-difference operators built from a five-point kernel along one axis
+/// and a 16-sample cross kernel over two axes. No division by a grid spacing
+/// is performed in this struct.
+template<>
+struct D2<CD_FOURTH>
+{
+
+    /// The difference operator:
+    /// -1/12*(xp2 + xm2) + 4/3*(xp1 + xm1) - 2.5*xp0.
+    template <typename ValueType>
+    static ValueType difference(const ValueType& xp2, const ValueType& xp1, const ValueType& xp0,
+                                const ValueType& xm1, const ValueType& xm2) {
+        return ValueType(-1./12.)*(xp2 + xm2) + ValueType(4./3.)*(xp1 + xm1) -ValueType(2.5)*xp0;
+    }
+
+    /// The cross-difference operator. Each argument is named x<offset>y<offset>
+    /// (p = plus, m = minus) for its position along the two axes involved.
+    /// The samples at second-axis offsets +/-1 are combined first, then the
+    /// samples at offsets +/-2, and the two partial results are blended with
+    /// weights 2/3 and -1/12.
+    template <typename ValueType>
+    static ValueType crossdifference(const ValueType& xp2yp2, const ValueType& xp2yp1,
+                                     const ValueType& xp2ym1, const ValueType& xp2ym2,
+                                     const ValueType& xp1yp2, const ValueType& xp1yp1,
+                                     const ValueType& xp1ym1, const ValueType& xp1ym2,
+                                     const ValueType& xm2yp2, const ValueType& xm2yp1,
+                                     const ValueType& xm2ym1, const ValueType& xm2ym2,
+                                     const ValueType& xm1yp2, const ValueType& xm1yp1,
+                                     const ValueType& xm1ym1, const ValueType& xm1ym2 ) {
+        // contribution of the samples at second-axis offsets +1 / -1
+        ValueType nearRows =
+            ValueType(2./3.0)*(xp1yp1 - xm1yp1 - xp1ym1 + xm1ym1)-
+            ValueType(1./12.)*(xp2yp1 - xm2yp1 - xp2ym1 + xm2ym1);
+        // contribution of the samples at second-axis offsets +2 / -2
+        ValueType farRows =
+            ValueType(2./3.0)*(xp1yp2 - xm1yp2 - xp1ym2 + xm1ym2)-
+            ValueType(1./12.)*(xp2yp2 - xm2yp2 - xp2ym2 + xm2ym2);
+
+        return ValueType(2./3.)*nearRows - ValueType(1./12.)*farRows;
+    }
+
+
+
+    // ---- random access versions ----
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inX(const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueT;
+        const ValueT p2 = grid.getValue(ijk.offsetBy( 2, 0, 0));
+        const ValueT p1 = grid.getValue(ijk.offsetBy( 1, 0, 0));
+        const ValueT c0 = grid.getValue(ijk);
+        const ValueT m1 = grid.getValue(ijk.offsetBy(-1, 0, 0));
+        const ValueT m2 = grid.getValue(ijk.offsetBy(-2, 0, 0));
+        return difference(p2, p1, c0, m1, m2);
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inY(const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueT;
+        const ValueT p2 = grid.getValue(ijk.offsetBy(0,  2, 0));
+        const ValueT p1 = grid.getValue(ijk.offsetBy(0,  1, 0));
+        const ValueT c0 = grid.getValue(ijk);
+        const ValueT m1 = grid.getValue(ijk.offsetBy(0, -1, 0));
+        const ValueT m2 = grid.getValue(ijk.offsetBy(0, -2, 0));
+        return difference(p2, p1, c0, m1, m2);
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inZ(const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueT;
+        const ValueT p2 = grid.getValue(ijk.offsetBy(0, 0,  2));
+        const ValueT p1 = grid.getValue(ijk.offsetBy(0, 0,  1));
+        const ValueT c0 = grid.getValue(ijk);
+        const ValueT m1 = grid.getValue(ijk.offsetBy(0, 0, -1));
+        const ValueT m2 = grid.getValue(ijk.offsetBy(0, 0, -2));
+        return difference(p2, p1, c0, m1, m2);
+    }
+
+    // cross derivatives: D1<CD_4TH> along the first axis is evaluated at
+    // offsets +/-1 and +/-2 along the second axis, and the two differences
+    // are blended with weights 2/3 and -1/12.
+    template<typename Accessor>
+    static typename Accessor::ValueType inXandY(const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType  ValueType;
+        const ValueType nearDiff =
+            D1<CD_4TH>::inX(grid, ijk.offsetBy(0, 1, 0)) -
+            D1<CD_4TH>::inX(grid, ijk.offsetBy(0,-1, 0));
+        const ValueType farDiff =
+            D1<CD_4TH>::inX(grid, ijk.offsetBy(0, 2, 0)) -
+            D1<CD_4TH>::inX(grid, ijk.offsetBy(0,-2, 0));
+        return ValueType(2./3.)*nearDiff - ValueType(1./12.)*farDiff;
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inXandZ(const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType  ValueType;
+        const ValueType nearDiff =
+            D1<CD_4TH>::inX(grid, ijk.offsetBy(0, 0, 1)) -
+            D1<CD_4TH>::inX(grid, ijk.offsetBy(0, 0,-1));
+        const ValueType farDiff =
+            D1<CD_4TH>::inX(grid, ijk.offsetBy(0, 0, 2)) -
+            D1<CD_4TH>::inX(grid, ijk.offsetBy(0, 0,-2));
+        return ValueType(2./3.)*nearDiff - ValueType(1./12.)*farDiff;
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inYandZ(const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType  ValueType;
+        const ValueType nearDiff =
+            D1<CD_4TH>::inY(grid, ijk.offsetBy(0, 0, 1)) -
+            D1<CD_4TH>::inY(grid, ijk.offsetBy(0, 0,-1));
+        const ValueType farDiff =
+            D1<CD_4TH>::inY(grid, ijk.offsetBy(0, 0, 2)) -
+            D1<CD_4TH>::inY(grid, ijk.offsetBy(0, 0,-2));
+        return ValueType(2./3.)*nearDiff - ValueType(1./12.)*farDiff;
+    }
+
+
+    // ---- stencil access versions ----
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inX(const Stencil& S)
+    {
+        typedef typename Stencil::ValueType ValueT;
+        const ValueT p2 = S.template getValue< 2, 0, 0>();
+        const ValueT p1 = S.template getValue< 1, 0, 0>();
+        const ValueT c0 = S.template getValue< 0, 0, 0>();
+        const ValueT m1 = S.template getValue<-1, 0, 0>();
+        const ValueT m2 = S.template getValue<-2, 0, 0>();
+        return difference(p2, p1, c0, m1, m2);
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inY(const Stencil& S)
+    {
+        typedef typename Stencil::ValueType ValueT;
+        const ValueT p2 = S.template getValue< 0, 2, 0>();
+        const ValueT p1 = S.template getValue< 0, 1, 0>();
+        const ValueT c0 = S.template getValue< 0, 0, 0>();
+        const ValueT m1 = S.template getValue< 0,-1, 0>();
+        const ValueT m2 = S.template getValue< 0,-2, 0>();
+        return difference(p2, p1, c0, m1, m2);
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inZ(const Stencil& S)
+    {
+        typedef typename Stencil::ValueType ValueT;
+        const ValueT p2 = S.template getValue< 0, 0, 2>();
+        const ValueT p1 = S.template getValue< 0, 0, 1>();
+        const ValueT c0 = S.template getValue< 0, 0, 0>();
+        const ValueT m1 = S.template getValue< 0, 0,-1>();
+        const ValueT m2 = S.template getValue< 0, 0,-2>();
+        return difference(p2, p1, c0, m1, m2);
+    }
+
+    // cross derivatives: the 16 samples are listed in the parameter order of
+    // crossdifference(), i.e. first-axis offset +2, +1, -2, -1, and within
+    // each of those the second-axis offset +2, +1, -1, -2.
+    template<typename Stencil>
+    static typename Stencil::ValueType inXandY(const Stencil& S)
+    {
+        return crossdifference(
+            // x = +2
+            S.template getValue< 2, 2, 0>(), S.template getValue< 2, 1, 0>(),
+            S.template getValue< 2,-1, 0>(), S.template getValue< 2,-2, 0>(),
+            // x = +1
+            S.template getValue< 1, 2, 0>(), S.template getValue< 1, 1, 0>(),
+            S.template getValue< 1,-1, 0>(), S.template getValue< 1,-2, 0>(),
+            // x = -2
+            S.template getValue<-2, 2, 0>(), S.template getValue<-2, 1, 0>(),
+            S.template getValue<-2,-1, 0>(), S.template getValue<-2,-2, 0>(),
+            // x = -1
+            S.template getValue<-1, 2, 0>(), S.template getValue<-1, 1, 0>(),
+            S.template getValue<-1,-1, 0>(), S.template getValue<-1,-2, 0>() );
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inXandZ(const Stencil& S)
+    {
+        return crossdifference(
+            // x = +2
+            S.template getValue< 2, 0, 2>(), S.template getValue< 2, 0, 1>(),
+            S.template getValue< 2, 0,-1>(), S.template getValue< 2, 0,-2>(),
+            // x = +1
+            S.template getValue< 1, 0, 2>(), S.template getValue< 1, 0, 1>(),
+            S.template getValue< 1, 0,-1>(), S.template getValue< 1, 0,-2>(),
+            // x = -2
+            S.template getValue<-2, 0, 2>(), S.template getValue<-2, 0, 1>(),
+            S.template getValue<-2, 0,-1>(), S.template getValue<-2, 0,-2>(),
+            // x = -1
+            S.template getValue<-1, 0, 2>(), S.template getValue<-1, 0, 1>(),
+            S.template getValue<-1, 0,-1>(), S.template getValue<-1, 0,-2>() );
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inYandZ(const Stencil& S)
+    {
+        return crossdifference(
+            // y = +2
+            S.template getValue< 0, 2, 2>(), S.template getValue< 0, 2, 1>(),
+            S.template getValue< 0, 2,-1>(), S.template getValue< 0, 2,-2>(),
+            // y = +1
+            S.template getValue< 0, 1, 2>(), S.template getValue< 0, 1, 1>(),
+            S.template getValue< 0, 1,-1>(), S.template getValue< 0, 1,-2>(),
+            // y = -2
+            S.template getValue< 0,-2, 2>(), S.template getValue< 0,-2, 1>(),
+            S.template getValue< 0,-2,-1>(), S.template getValue< 0,-2,-2>(),
+            // y = -1
+            S.template getValue< 0,-1, 2>(), S.template getValue< 0,-1, 1>(),
+            S.template getValue< 0,-1,-1>(), S.template getValue< 0,-1,-2>() );
+    }
+};
+
+
+template<>
+struct D2<CD_SIXTH>
+{
+    // the difference operator
+    template <typename ValueType>
+    static ValueType difference(const ValueType& xp3, const ValueType& xp2, const ValueType& xp1,
+                                const ValueType& xp0,
+                                const ValueType& xm1, const ValueType& xm2, const ValueType& xm3)
+    {
+        return  ValueType(1./90.)*(xp3 + xm3) - ValueType(3./20.)*(xp2 + xm2)
+              + ValueType(1.5)*(xp1 + xm1) - ValueType(49./18.)*xp0;
+    }
+
+    template <typename ValueType>
+    static ValueType crossdifference( const ValueType& xp1yp1,const ValueType& xm1yp1,
+                                      const ValueType& xp1ym1,const ValueType& xm1ym1,
+                                      const ValueType& xp2yp1,const ValueType& xm2yp1,
+                                      const ValueType& xp2ym1,const ValueType& xm2ym1,
+                                      const ValueType& xp3yp1,const ValueType& xm3yp1,
+                                      const ValueType& xp3ym1,const ValueType& xm3ym1,
+                                      const ValueType& xp1yp2,const ValueType& xm1yp2,
+                                      const ValueType& xp1ym2,const ValueType& xm1ym2,
+                                      const ValueType& xp2yp2,const ValueType& xm2yp2,
+                                      const ValueType& xp2ym2,const ValueType& xm2ym2,
+                                      const ValueType& xp3yp2,const ValueType& xm3yp2,
+                                      const ValueType& xp3ym2,const ValueType& xm3ym2,
+                                      const ValueType& xp1yp3,const ValueType& xm1yp3,
+                                      const ValueType& xp1ym3,const ValueType& xm1ym3,
+                                      const ValueType& xp2yp3,const ValueType& xm2yp3,
+                                      const ValueType& xp2ym3,const ValueType& xm2ym3,
+                                      const ValueType& xp3yp3,const ValueType& xm3yp3,
+                                      const ValueType& xp3ym3,const ValueType& xm3ym3 )
+    {
+        ValueType tmp1 =
+            ValueType(0.7500)*(xp1yp1 - xm1yp1 - xp1ym1 + xm1ym1) -
+            ValueType(0.1500)*(xp2yp1 - xm2yp1 - xp2ym1 + xm2ym1) +
+            ValueType(1./60.)*(xp3yp1 - xm3yp1 - xp3ym1 + xm3ym1);
+
+        ValueType tmp2 =
+            ValueType(0.7500)*(xp1yp2 - xm1yp2 - xp1ym2 + xm1ym2) -
+            ValueType(0.1500)*(xp2yp2 - xm2yp2 - xp2ym2 + xm2ym2) +
+            ValueType(1./60.)*(xp3yp2 - xm3yp2 - xp3ym2 + xm3ym2);
+
+        ValueType tmp3 =
+            ValueType(0.7500)*(xp1yp3 - xm1yp3 - xp1ym3 + xm1ym3) -
+            ValueType(0.1500)*(xp2yp3 - xm2yp3 - xp2ym3 + xm2ym3) +
+            ValueType(1./60.)*(xp3yp3 - xm3yp3 - xp3ym3 + xm3ym3);
+
+        return ValueType(0.75)*tmp1 - ValueType(0.15)*tmp2 + ValueType(1./60)*tmp3;
+    }
+
+    // random access version
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inX(const Accessor& grid, const Coord& ijk)
+    {
+        return difference(
+            grid.getValue(ijk.offsetBy( 3, 0, 0)), grid.getValue(ijk.offsetBy( 2, 0, 0)),
+            grid.getValue(ijk.offsetBy( 1, 0, 0)), grid.getValue(ijk),
+            grid.getValue(ijk.offsetBy(-1, 0, 0)), grid.getValue(ijk.offsetBy(-2, 0, 0)),
+            grid.getValue(ijk.offsetBy(-3, 0, 0)) );
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inY(const Accessor& grid, const Coord& ijk)
+    {
+        return difference(
+            grid.getValue(ijk.offsetBy( 0, 3, 0)), grid.getValue(ijk.offsetBy( 0, 2, 0)),
+            grid.getValue(ijk.offsetBy( 0, 1, 0)), grid.getValue(ijk),
+            grid.getValue(ijk.offsetBy( 0,-1, 0)), grid.getValue(ijk.offsetBy( 0,-2, 0)),
+            grid.getValue(ijk.offsetBy( 0,-3, 0)) );
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inZ(const Accessor& grid, const Coord& ijk)
+    {
+
+        return difference(
+            grid.getValue(ijk.offsetBy( 0, 0, 3)), grid.getValue(ijk.offsetBy( 0, 0, 2)),
+            grid.getValue(ijk.offsetBy( 0, 0, 1)), grid.getValue(ijk),
+            grid.getValue(ijk.offsetBy( 0, 0,-1)), grid.getValue(ijk.offsetBy( 0, 0,-2)),
+            grid.getValue(ijk.offsetBy( 0, 0,-3)) );
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inXandY(const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueT;
+        ValueT tmp1 =
+            D1<CD_6TH>::inX(grid, ijk.offsetBy(0, 1, 0)) -
+            D1<CD_6TH>::inX(grid, ijk.offsetBy(0,-1, 0));
+        ValueT tmp2 =
+            D1<CD_6TH>::inX(grid, ijk.offsetBy(0, 2, 0)) -
+            D1<CD_6TH>::inX(grid, ijk.offsetBy(0,-2, 0));
+        ValueT tmp3 =
+            D1<CD_6TH>::inX(grid, ijk.offsetBy(0, 3, 0)) -
+            D1<CD_6TH>::inX(grid, ijk.offsetBy(0,-3, 0));
+        return ValueT(0.75*tmp1 - 0.15*tmp2 + 1./60*tmp3);
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inXandZ(const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueT;
+        ValueT tmp1 =
+            D1<CD_6TH>::inX(grid, ijk.offsetBy(0, 0, 1)) -
+            D1<CD_6TH>::inX(grid, ijk.offsetBy(0, 0,-1));
+        ValueT tmp2 =
+            D1<CD_6TH>::inX(grid, ijk.offsetBy(0, 0, 2)) -
+            D1<CD_6TH>::inX(grid, ijk.offsetBy(0, 0,-2));
+        ValueT tmp3 =
+            D1<CD_6TH>::inX(grid, ijk.offsetBy(0, 0, 3)) -
+            D1<CD_6TH>::inX(grid, ijk.offsetBy(0, 0,-3));
+        return ValueT(0.75*tmp1 - 0.15*tmp2 + 1./60*tmp3);
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType inYandZ(const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueT;
+        ValueT tmp1 =
+            D1<CD_6TH>::inY(grid, ijk.offsetBy(0, 0, 1)) -
+            D1<CD_6TH>::inY(grid, ijk.offsetBy(0, 0,-1));
+        ValueT tmp2 =
+            D1<CD_6TH>::inY(grid, ijk.offsetBy(0, 0, 2)) -
+            D1<CD_6TH>::inY(grid, ijk.offsetBy(0, 0,-2));
+        ValueT tmp3 =
+            D1<CD_6TH>::inY(grid, ijk.offsetBy(0, 0, 3)) -
+            D1<CD_6TH>::inY(grid, ijk.offsetBy(0, 0,-3));
+        return ValueT(0.75*tmp1 - 0.15*tmp2 + 1./60*tmp3);
+    }
+
+
+    // stencil access version
+    template<typename Stencil>
+    static typename Stencil::ValueType inX(const Stencil& S)
+    {
+        return difference( S.template getValue< 3, 0, 0>(),  S.template getValue< 2, 0, 0>(),
+                           S.template getValue< 1, 0, 0>(),  S.template getValue< 0, 0, 0>(),
+                           S.template getValue<-1, 0, 0>(),  S.template getValue<-2, 0, 0>(),
+                           S.template getValue<-3, 0, 0>() );
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inY(const Stencil& S)
+    {
+        return difference( S.template getValue< 0, 3, 0>(),  S.template getValue< 0, 2, 0>(),
+                           S.template getValue< 0, 1, 0>(),  S.template getValue< 0, 0, 0>(),
+                           S.template getValue< 0,-1, 0>(),  S.template getValue< 0,-2, 0>(),
+                           S.template getValue< 0,-3, 0>() );
+
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inZ(const Stencil& S)
+    {
+        return difference( S.template getValue< 0, 0, 3>(),  S.template getValue< 0, 0, 2>(),
+                           S.template getValue< 0, 0, 1>(),  S.template getValue< 0, 0, 0>(),
+                           S.template getValue< 0, 0,-1>(),  S.template getValue< 0, 0,-2>(),
+                           S.template getValue< 0, 0,-3>() );
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inXandY(const Stencil& S)
+    {
+        return crossdifference( S.template getValue< 1, 1, 0>(), S.template getValue<-1, 1, 0>(),
+                                S.template getValue< 1,-1, 0>(), S.template getValue<-1,-1, 0>(),
+                                S.template getValue< 2, 1, 0>(), S.template getValue<-2, 1, 0>(),
+                                S.template getValue< 2,-1, 0>(), S.template getValue<-2,-1, 0>(),
+                                S.template getValue< 3, 1, 0>(), S.template getValue<-3, 1, 0>(),
+                                S.template getValue< 3,-1, 0>(), S.template getValue<-3,-1, 0>(),
+                                S.template getValue< 1, 2, 0>(), S.template getValue<-1, 2, 0>(),
+                                S.template getValue< 1,-2, 0>(), S.template getValue<-1,-2, 0>(),
+                                S.template getValue< 2, 2, 0>(), S.template getValue<-2, 2, 0>(),
+                                S.template getValue< 2,-2, 0>(), S.template getValue<-2,-2, 0>(),
+                                S.template getValue< 3, 2, 0>(), S.template getValue<-3, 2, 0>(),
+                                S.template getValue< 3,-2, 0>(), S.template getValue<-3,-2, 0>(),
+                                S.template getValue< 1, 3, 0>(), S.template getValue<-1, 3, 0>(),
+                                S.template getValue< 1,-3, 0>(), S.template getValue<-1,-3, 0>(),
+                                S.template getValue< 2, 3, 0>(), S.template getValue<-2, 3, 0>(),
+                                S.template getValue< 2,-3, 0>(), S.template getValue<-2,-3, 0>(),
+                                S.template getValue< 3, 3, 0>(), S.template getValue<-3, 3, 0>(),
+                                S.template getValue< 3,-3, 0>(), S.template getValue<-3,-3, 0>() );
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inXandZ(const Stencil& S)
+    {
+        return crossdifference( S.template getValue< 1, 0, 1>(), S.template getValue<-1, 0, 1>(),
+                                S.template getValue< 1, 0,-1>(), S.template getValue<-1, 0,-1>(),
+                                S.template getValue< 2, 0, 1>(), S.template getValue<-2, 0, 1>(),
+                                S.template getValue< 2, 0,-1>(), S.template getValue<-2, 0,-1>(),
+                                S.template getValue< 3, 0, 1>(), S.template getValue<-3, 0, 1>(),
+                                S.template getValue< 3, 0,-1>(), S.template getValue<-3, 0,-1>(),
+                                S.template getValue< 1, 0, 2>(), S.template getValue<-1, 0, 2>(),
+                                S.template getValue< 1, 0,-2>(), S.template getValue<-1, 0,-2>(),
+                                S.template getValue< 2, 0, 2>(), S.template getValue<-2, 0, 2>(),
+                                S.template getValue< 2, 0,-2>(), S.template getValue<-2, 0,-2>(),
+                                S.template getValue< 3, 0, 2>(), S.template getValue<-3, 0, 2>(),
+                                S.template getValue< 3, 0,-2>(), S.template getValue<-3, 0,-2>(),
+                                S.template getValue< 1, 0, 3>(), S.template getValue<-1, 0, 3>(),
+                                S.template getValue< 1, 0,-3>(), S.template getValue<-1, 0,-3>(),
+                                S.template getValue< 2, 0, 3>(), S.template getValue<-2, 0, 3>(),
+                                S.template getValue< 2, 0,-3>(), S.template getValue<-2, 0,-3>(),
+                                S.template getValue< 3, 0, 3>(), S.template getValue<-3, 0, 3>(),
+                                S.template getValue< 3, 0,-3>(), S.template getValue<-3, 0,-3>() );
+    }
+
+    template<typename Stencil>
+    static typename Stencil::ValueType inYandZ(const Stencil& S)
+    {
+        return crossdifference( S.template getValue< 0, 1, 1>(), S.template getValue< 0,-1, 1>(),
+                                S.template getValue< 0, 1,-1>(), S.template getValue< 0,-1,-1>(),
+                                S.template getValue< 0, 2, 1>(), S.template getValue< 0,-2, 1>(),
+                                S.template getValue< 0, 2,-1>(), S.template getValue< 0,-2,-1>(),
+                                S.template getValue< 0, 3, 1>(), S.template getValue< 0,-3, 1>(),
+                                S.template getValue< 0, 3,-1>(), S.template getValue< 0,-3,-1>(),
+                                S.template getValue< 0, 1, 2>(), S.template getValue< 0,-1, 2>(),
+                                S.template getValue< 0, 1,-2>(), S.template getValue< 0,-1,-2>(),
+                                S.template getValue< 0, 2, 2>(), S.template getValue< 0,-2, 2>(),
+                                S.template getValue< 0, 2,-2>(), S.template getValue< 0,-2,-2>(),
+                                S.template getValue< 0, 3, 2>(), S.template getValue< 0,-3, 2>(),
+                                S.template getValue< 0, 3,-2>(), S.template getValue< 0,-3,-2>(),
+                                S.template getValue< 0, 1, 3>(), S.template getValue< 0,-1, 3>(),
+                                S.template getValue< 0, 1,-3>(), S.template getValue< 0,-1,-3>(),
+                                S.template getValue< 0, 2, 3>(), S.template getValue< 0,-2, 3>(),
+                                S.template getValue< 0, 2,-3>(), S.template getValue< 0,-2,-3>(),
+                                S.template getValue< 0, 3, 3>(), S.template getValue< 0,-3, 3>(),
+                                S.template getValue< 0, 3,-3>(), S.template getValue< 0,-3,-3>() );
+    }
+
+};
+
+} // end math namespace
+} // namespace OPENVDB_VERSION_NAME
+} // end openvdb namespace
+
+#endif // OPENVDB_MATH_FINITEDIFFERENCE_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/math/LegacyFrustum.h b/nuparu/include/openvdb_new/math/LegacyFrustum.h
new file mode 100644
index 00000000..b6b13c10
--- /dev/null
+++ b/nuparu/include/openvdb_new/math/LegacyFrustum.h
@@ -0,0 +1,196 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file math/LegacyFrustum.h
+
+#ifndef OPENVDB_MATH_LEGACYFRUSTUM_HAS_BEEN_INCLUDED
+#define OPENVDB_MATH_LEGACYFRUSTUM_HAS_BEEN_INCLUDED
+
+#include <iostream>
+#include <openvdb/Types.h> // for Real typedef
+#include "Coord.h"
+#include "Mat4.h"
+#include "Vec3.h"
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace math {
+namespace internal {
+
+/// @brief LegacyFrustum class used at DreamWorks for converting old vdb files.
+class LegacyFrustum
+{
+public:
+    LegacyFrustum(std::istream& is)
+    { // reads every field in sequence from is; the stream state is never checked here
+        // First read the old transform's base-class data: the "extents", stored as
+        // two triples of Vec3i::ValueType (minimum corner, then maximum corner)
+        Vec3i tmpMin, tmpMax;
+        is.read(reinterpret_cast<char*>(&tmpMin), sizeof(Vec3i::ValueType) * 3);
+        is.read(reinterpret_cast<char*>(&tmpMax), sizeof(Vec3i::ValueType) * 3);
+
+        Coord tmpMinCoord(tmpMin);
+        Coord tmpMaxCoord(tmpMax);
+
+        // set the extents
+        mExtents = CoordBBox(tmpMinCoord, tmpMaxCoord);
+
+        // read the old-frustum class member data
+        // (the tmp* matrices and tmpPadding are read only to advance the stream; they are not stored)
+        Mat4d tmpW2C, tmpC2S, tmpS2C, tmpWorldToLocal;
+        Mat4d tmpS2U, tmpXYLocalToUnit, tmpZLocalToUnit;
+        Real tmpWindow[6];
+        Real tmpPadding;
+
+        // mC2W, mLocalToWorld, mXYUnitToLocal and mZUnitToLocal are read directly into members
+
+        // read in each matrix (Mat4d::size * Mat4d::size values apiece); the order of the reads matters
+        is.read(reinterpret_cast<char*>(&tmpW2C),
+            sizeof(Mat4d::value_type) * Mat4d::size * Mat4d::size);
+        is.read(reinterpret_cast<char*>(&mC2W),
+            sizeof(Mat4d::value_type) * Mat4d::size * Mat4d::size);
+        is.read(reinterpret_cast<char*>(&tmpC2S),
+            sizeof(Mat4d::value_type) * Mat4d::size * Mat4d::size);
+        is.read(reinterpret_cast<char*>(&tmpS2C),
+            sizeof(Mat4d::value_type) * Mat4d::size * Mat4d::size);
+        is.read(reinterpret_cast<char*>(&tmpWorldToLocal),
+            sizeof(Mat4d::value_type) * Mat4d::size * Mat4d::size);
+        is.read(reinterpret_cast<char*>(&mLocalToWorld),
+            sizeof(Mat4d::value_type) * Mat4d::size * Mat4d::size);
+        // six window values; only entries [4] and [5] are used below (near and far plane)
+        is.read(reinterpret_cast<char*>(&tmpWindow[0]), sizeof(Real));
+        is.read(reinterpret_cast<char*>(&tmpWindow[1]), sizeof(Real));
+        is.read(reinterpret_cast<char*>(&tmpWindow[2]), sizeof(Real));
+        is.read(reinterpret_cast<char*>(&tmpWindow[3]), sizeof(Real));
+        is.read(reinterpret_cast<char*>(&tmpWindow[4]), sizeof(Real));
+        is.read(reinterpret_cast<char*>(&tmpWindow[5]), sizeof(Real));
+
+        is.read(reinterpret_cast<char*>(&tmpPadding), sizeof(Real));
+
+        is.read(reinterpret_cast<char*>(&tmpS2U),
+            sizeof(Mat4d::value_type) * Mat4d::size * Mat4d::size);
+        is.read(reinterpret_cast<char*>(&mXYUnitToLocal),
+            sizeof(Mat4d::value_type) * Mat4d::size * Mat4d::size);
+        is.read(reinterpret_cast<char*>(&tmpXYLocalToUnit),
+            sizeof(Mat4d::value_type) * Mat4d::size * Mat4d::size);
+        is.read(reinterpret_cast<char*>(&mZUnitToLocal),
+            sizeof(Mat4d::value_type) * Mat4d::size * Mat4d::size);
+        is.read(reinterpret_cast<char*>(&tmpZLocalToUnit),
+            sizeof(Mat4d::value_type) * Mat4d::size * Mat4d::size);
+
+
+        mNearPlane = tmpWindow[4];
+        mFarPlane  = tmpWindow[5];
+
+        // Map the unit-cube corners through unitToLocalFrustum() to get the near/far plane origins
+        // and x/y edge vectors. NOTE(review): the original comment called these world space; confirm.
+        mFrNearOrigin = unitToLocalFrustum(Vec3R(0,0,0));
+        mFrFarOrigin = unitToLocalFrustum(Vec3R(0,0,1));
+
+        Vec3d frNearXTip = unitToLocalFrustum(Vec3R(1,0,0));
+        Vec3d frNearYTip = unitToLocalFrustum(Vec3R(0,1,0));
+        mFrNearXBasis = frNearXTip - mFrNearOrigin;
+        mFrNearYBasis = frNearYTip - mFrNearOrigin;
+
+        Vec3R frFarXTip = unitToLocalFrustum(Vec3R(1,0,1));
+        Vec3R frFarYTip = unitToLocalFrustum(Vec3R(0,1,1));
+        mFrFarXBasis = frFarXTip - mFrFarOrigin;
+        mFrFarYBasis = frFarYTip - mFrFarOrigin;
+    }
+
+    ~LegacyFrustum() {}
+    /// Return mC2W, the second matrix read from the stream (presumably camera-to-world; confirm).
+    const Mat4d& getCamXForm() const {return mC2W; }
+    /// Return the far-plane value minus the near-plane value.
+    double getDepth() const {return (mFarPlane - mNearPlane); }
+    double getTaper() const {
+        // ratio of near-plane width to far-plane width; there is no guard against a zero far width
+        return   getNearPlaneWidth() / getFarPlaneWidth();
+    }
+    /// Distance between unitToWorld((0,0,0)) and unitToWorld((1,0,0)).
+    double getNearPlaneWidth() const {
+        double nearPlaneWidth  = (unitToWorld(Vec3d(0,0,0)) - unitToWorld(Vec3d(1,0,0))).length();
+        return nearPlaneWidth;
+    }
+    /// Distance between unitToWorld((0,0,1)) and unitToWorld((1,0,1)).
+    double getFarPlaneWidth() const {
+        double farPlaneWidth = (unitToWorld(Vec3d(0,0,1)) - unitToWorld(Vec3d(1,0,1))).length();
+        return farPlaneWidth;
+    }
+
+    double getNearPlaneDist() const { return mNearPlane; }
+    /// Return the index bounding box built from the two coordinate triples read first.
+    const CoordBBox& getBBox() const {return mExtents; }
+    /// Apply unitToLocal() and then transform the result by mLocalToWorld.
+    Vec3d unitToWorld(const Vec3d& in) const {return mLocalToWorld.transform( unitToLocal(in) ); }
+
+private:
+    LegacyFrustum() {} // private: outside code must construct from a stream
+
+    Vec3d unitToLocal(const Vec3d& U) const {
+
+        // We first find the local space coordinates
+        // of the unit point projected onto the near
+        // and far planes of the frustum by using a
+        // linear combination of the planes basis vectors
+        Vec3d nearLS = ( U[0] * mFrNearXBasis ) + ( U[1] * mFrNearYBasis ) + mFrNearOrigin;
+        Vec3d farLS  = ( U[0] * mFrFarXBasis  ) + ( U[1] * mFrFarYBasis  ) + mFrFarOrigin;
+
+        // then blend linearly between the near and far points by U[2] (0 gives near, 1 gives far)
+        return U[2] * farLS + ( 1.0 - U[2] ) * nearLS;
+    }
+    // Replace u's z with the z of mZUnitToLocal.transformH(u), then apply mXYUnitToLocal.transformH().
+    Vec3d unitToLocalFrustum(const Vec3d& u) const {
+        Vec3d fzu = mZUnitToLocal.transformH(u);
+        Vec3d fu = u;
+        fu[2] = fzu.z();
+        return mXYUnitToLocal.transformH(fu);
+    }
+
+private:
+    Mat4d mC2W, mLocalToWorld, mXYUnitToLocal, mZUnitToLocal;
+    CoordBBox mExtents;
+    Vec3d mFrNearXBasis, mFrNearYBasis, mFrFarXBasis, mFrFarYBasis;
+    Vec3d mFrNearOrigin, mFrFarOrigin;
+    double mNearPlane, mFarPlane;
+};
+
+} // namespace internal
+} // namespace math
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_MATH_LEGACYFRUSTUM_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/math/Maps.h b/nuparu/include/openvdb_new/math/Maps.h
new file mode 100644
index 00000000..17655946
--- /dev/null
+++ b/nuparu/include/openvdb_new/math/Maps.h
@@ -0,0 +1,2685 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file Maps.h
+
+#ifndef OPENVDB_MATH_MAPS_HAS_BEEN_INCLUDED
+#define OPENVDB_MATH_MAPS_HAS_BEEN_INCLUDED
+
+#include "Math.h"
+#include "Mat4.h"
+#include "Vec3.h"
+#include "BBox.h"
+#include "Coord.h"
+#include <openvdb/io/io.h> // for io::getFormatVersion()
+#include <openvdb/util/Name.h>
+#include <openvdb/Types.h>
+#include <boost/shared_ptr.hpp>
+#include <map>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace math {
+
+
+////////////////////////////////////////
+
+/// Forward declarations of the different map types (names only; no definitions at this point)
+
+class MapBase;
+class ScaleMap;
+class TranslationMap;
+class ScaleTranslateMap;
+class UniformScaleMap;
+class UniformScaleTranslateMap;
+class AffineMap;
+class UnitaryMap;
+class NonlinearFrustumMap;
+/// Class template taking two map types; only declared here.
+template<typename T1, typename T2> class CompoundMap;
+/// Named combinations of the map types above, expressed as (nested) CompoundMap instantiations.
+typedef CompoundMap<UnitaryMap, TranslationMap>                     UnitaryAndTranslationMap;
+typedef CompoundMap<CompoundMap<UnitaryMap, ScaleMap>, UnitaryMap>  SpectralDecomposedMap;
+typedef SpectralDecomposedMap                                       SymmetricMap;
+typedef CompoundMap<SymmetricMap, UnitaryAndTranslationMap>         FullyDecomposedMap;
+typedef CompoundMap<SymmetricMap, UnitaryMap>                       PolarDecomposedMap;
+
+
+////////////////////////////////////////
+
+/// Map traits
+/// is_linear<T>::value is false by default and true for the map types specialized below.
+template<typename T> struct is_linear                 { static const bool value = false; };
+template<> struct is_linear<AffineMap>                { static const bool value = true; };
+template<> struct is_linear<ScaleMap>                 { static const bool value = true; };
+template<> struct is_linear<UniformScaleMap>          { static const bool value = true; };
+template<> struct is_linear<UnitaryMap>               { static const bool value = true; };
+template<> struct is_linear<TranslationMap>           { static const bool value = true; };
+template<> struct is_linear<ScaleTranslateMap>        { static const bool value = true; };
+template<> struct is_linear<UniformScaleTranslateMap> { static const bool value = true; };
+/// A CompoundMap<T1, T2> is linear exactly when both T1 and T2 are linear.
+template<typename T1, typename T2> struct is_linear<CompoundMap<T1, T2> > {
+    static const bool value = is_linear<T1>::value && is_linear<T2>::value;
+};
+
+
+template<typename T> struct is_uniform_scale          { static const bool value = false; };
+template<> struct is_uniform_scale<UniformScaleMap>   { static const bool value = true; };
+/// is_uniform_scale_translate: false by default; true here for TranslationMap and UniformScaleTranslateMap.
+template<typename T> struct is_uniform_scale_translate       { static const bool value = false; };
+template<> struct is_uniform_scale_translate<TranslationMap> { static const bool value = true; };
+template<> struct is_uniform_scale_translate<UniformScaleTranslateMap> {
+    static const bool value = true;
+};
+
+
+template<typename T> struct is_scale                  { static const bool value = false; };
+template<> struct is_scale<ScaleMap>                  { static const bool value = true; };
+/// Like is_scale, is_scale_translate is false unless specialized (here only for ScaleTranslateMap).
+template<typename T> struct is_scale_translate        { static const bool value = false; };
+template<> struct is_scale_translate<ScaleTranslateMap> { static const bool value = true; };
+
+
+template<typename T> struct is_uniform_diagonal_jacobian { // OR of the two uniform-scale traits
+    static const bool value = is_uniform_scale<T>::value || is_uniform_scale_translate<T>::value;
+};
+/// True when T is flagged by is_scale or by is_scale_translate.
+template<typename T> struct is_diagonal_jacobian {
+    static const bool value = is_scale<T>::value || is_scale_translate<T>::value;
+};
+
+
+////////////////////////////////////////
+
+/// Utility methods (free functions; declared here, defined outside this header section)
+
+/// @brief Create a SymmetricMap from a symmetric matrix.
+/// Decomposes the map into Rotation Diagonal Rotation^T
+OPENVDB_API boost::shared_ptr<SymmetricMap> createSymmetricMap(const Mat3d& m);
+
+
+/// @brief General decomposition of a Matrix into a Unitary (e.g. rotation)
+/// following a Symmetric (e.g. stretch & shear); the result type also carries a TranslationMap
+OPENVDB_API boost::shared_ptr<FullyDecomposedMap> createFullyDecomposedMap(const Mat4d& m);
+
+
+/// @brief Decomposes a general linear map into a translation following a polar decomposition.
+///
+/// T U S where:
+///
+///  T: Translation
+///  U: Unitary (rotation or reflection)
+///  S: Symmetric
+/// NOTE(review): the argument is a Mat3d and PolarDecomposedMap has no TranslationMap part; confirm T.
+/// @note: the Symmetric is automatically decomposed into Q D Q^T, where
+/// Q is rotation and D is diagonal.
+OPENVDB_API boost::shared_ptr<PolarDecomposedMap> createPolarDecomposedMap(const Mat3d& m);
+
+
+/// @brief Reduces an AffineMap to a ScaleMap or a ScaleTranslateMap when it can.
+OPENVDB_API boost::shared_ptr<MapBase> simplify(boost::shared_ptr<AffineMap> affine);
+
+/// @brief Returns the left pseudoInverse of the input matrix when the 3x3 part is symmetric;
+/// otherwise it zeros the 3x3 and reverses the translation.
+OPENVDB_API Mat4d approxInverse(const Mat4d& mat);
+
+
+////////////////////////////////////////
+
+
+/// @brief Abstract base class for maps
+class OPENVDB_API MapBase
+{
+public:
+    typedef boost::shared_ptr<MapBase>       Ptr;
+    typedef boost::shared_ptr<const MapBase> ConstPtr;
+    typedef Ptr (*MapFactory)(); // zero-argument factory function returning a new map
+
+    virtual ~MapBase(){}
+    /// Return a shared pointer to an AffineMap for this map (details are left to each subclass).
+    virtual boost::shared_ptr<AffineMap> getAffineMap() const = 0;
+
+    /// Return the name of this map's concrete type (e.g., @c "AffineMap").
+    virtual Name type() const = 0;
+
+    /// Return @c true if this map is of concrete type @c MapT (e.g., AffineMap).
+    template<typename MapT> bool isType() const { return this->type() == MapT::mapType(); }
+
+    /// Return @c true if this map is equal to the given map.
+    virtual bool isEqual(const MapBase& other) const = 0;
+
+    /// Return @c true if this map is linear.
+    virtual bool isLinear() const = 0;
+    /// Return @c true if the spacing between the images of lattice points is uniform in all directions
+    virtual bool hasUniformScale() const = 0;
+    /// Apply this map, or its inverse, to the given vector.
+    virtual Vec3d applyMap(const Vec3d& in) const = 0;
+    virtual Vec3d applyInverseMap(const Vec3d& in) const = 0;
+
+    //@{
+    /// @brief Apply the Inverse Jacobian Transpose of this map to a vector.
+    /// For a linear map this is equivalent to applying the transpose of
+    /// inverse map excluding translation.
+    virtual Vec3d applyIJT(const Vec3d& in) const = 0;
+    virtual Vec3d applyIJT(const Vec3d& in, const Vec3d& domainPos) const = 0;
+    //@}
+    /// NOTE(review): applyIJC is undocumented here; presumably an inverse-Jacobian curvature term. Confirm.
+    virtual Mat3d applyIJC(const Mat3d& m) const = 0;
+    virtual Mat3d applyIJC(const Mat3d& m, const Vec3d& v, const Vec3d& domainPos) const = 0;
+
+
+    virtual double determinant() const = 0;
+    virtual double determinant(const Vec3d&) const = 0;
+
+
+    //@{
+    /// @brief Method to return the local size of a voxel.
+    /// When a location is specified as an argument, it is understood to
+    /// be in the domain of the map (i.e. index space)
+    virtual Vec3d voxelSize() const = 0;
+    virtual Vec3d voxelSize(const Vec3d&) const = 0;
+    //@}
+    /// Stream input and output, implemented by each concrete map.
+    virtual void read(std::istream&) = 0;
+    virtual void write(std::ostream&) const = 0;
+    /// Return a string for this map (presumably a human-readable description; confirm).
+    virtual std::string str() const = 0;
+    /// Return a copy of this map as a MapBase::Ptr.
+    virtual MapBase::Ptr copy() const = 0;
+
+    //@{
+    /// @brief Methods that return an updated map; all are const and return a MapBase::Ptr
+    virtual MapBase::Ptr preRotate(double radians, Axis axis = X_AXIS) const = 0;
+    virtual MapBase::Ptr preTranslate(const Vec3d&) const = 0;
+    virtual MapBase::Ptr preScale(const Vec3d&) const = 0;
+    virtual MapBase::Ptr preShear(double shear, Axis axis0, Axis axis1) const = 0;
+
+    virtual MapBase::Ptr postRotate(double radians, Axis axis = X_AXIS) const = 0;
+    virtual MapBase::Ptr postTranslate(const Vec3d&) const = 0;
+    virtual MapBase::Ptr postScale(const Vec3d&) const = 0;
+    virtual MapBase::Ptr postShear(double shear, Axis axis0, Axis axis1) const = 0;
+    //@}
+
+    //@{
+    /// @brief Apply the Jacobian of this map to a vector.
+    /// For a linear map this is equivalent to applying the map excluding translation.
+    /// @warning Houdini 12.5 uses an earlier version of OpenVDB, and maps created
+    /// with that version lack a virtual table entry for this method.  Do not call
+    /// this method from Houdini 12.5.
+    virtual Vec3d applyJacobian(const Vec3d& in) const = 0;
+    virtual Vec3d applyJacobian(const Vec3d& in, const Vec3d& domainPos) const = 0;
+    //@}
+
+    //@{
+    /// @brief Apply the InverseJacobian of this map to a vector.
+    /// For a linear map this is equivalent to applying the map inverse excluding translation.
+    /// @warning Houdini 12.5 uses an earlier version of OpenVDB, and maps created
+    /// with that version lack a virtual table entry for this method.  Do not call
+    /// this method from Houdini 12.5.
+    virtual Vec3d applyInverseJacobian(const Vec3d& in) const = 0;
+    virtual Vec3d applyInverseJacobian(const Vec3d& in, const Vec3d& domainPos) const = 0;
+    //@}
+
+
+    //@{
+    /// @brief Apply the Jacobian transpose of this map to a vector.
+    /// For a linear map this is equivalent to applying the transpose of the map
+    /// excluding translation.
+    /// @warning Houdini 12.5 uses an earlier version of OpenVDB, and maps created
+    /// with that version lack a virtual table entry for this method.  Do not call
+    /// this method from Houdini 12.5.
+    virtual Vec3d applyJT(const Vec3d& in) const = 0;
+    virtual Vec3d applyJT(const Vec3d& in, const Vec3d& domainPos) const = 0;
+    //@}
+
+    /// @brief Return a new map representing the inverse of this map.
+    /// @throw NotImplementedError if the map is a NonlinearFrustumMap.
+    /// @warning Houdini 12.5 uses an earlier version of OpenVDB, and maps created
+    /// with that version lack a virtual table entry for this method.  Do not call
+    /// this method from Houdini 12.5.
+    virtual MapBase::Ptr inverseMap() const = 0;
+
+protected:
+    MapBase() {}
+    /// Equality helper: true if @a other has MapT's type name and equals @a self via MapT's operator==.
+    template<typename MapT>
+    static bool isEqualBase(const MapT& self, const MapBase& other)
+    {
+        return other.isType<MapT>() && (self == *static_cast<const MapT*>(&other));
+    }
+};
+
+
+////////////////////////////////////////
+
+
+/// @brief Threadsafe singleton object for accessing the map type-name dictionary.
+/// Associates a map type-name with a factory function.
+class OPENVDB_API MapRegistry
+{
+public:
+    typedef std::map<Name, MapBase::MapFactory> MapDictionary;
+    /// Return a pointer to the registry object.
+    static MapRegistry* instance();
+
+    /// Create a new map of the given (registered) type name.
+    static MapBase::Ptr createMap(const Name&);
+
+    /// Return @c true if the given map type name is registered.
+    static bool isRegistered(const Name&);
+
+    /// Register a map type along with a factory function.
+    static void registerMap(const Name&, MapBase::MapFactory);
+
+    /// Remove a map type from the registry.
+    static void unregisterMap(const Name&);
+
+    /// Clear the map type registry.
+    static void clear();
+
+private:
+    MapRegistry() {} // private constructor: outside code cannot create a registry directly
+
+    static MapRegistry* staticInstance();
+
+    static MapRegistry* mInstance;
+    /// Dictionary from map type name to factory function.
+    MapDictionary mMap;
+};
+
+
+////////////////////////////////////////
+
+
+/// @brief A general linear transform using homogeneous coordinates to perform
+/// rotation, scaling, shear and translation
+class OPENVDB_API AffineMap: public MapBase
+{
+public:
+    typedef boost::shared_ptr<AffineMap>       Ptr;
+    typedef boost::shared_ptr<const AffineMap> ConstPtr;
+
+    AffineMap():
+        mMatrix(Mat4d::identity()),
+        mMatrixInv(Mat4d::identity()),
+        mJacobianInv(Mat3d::identity()),
+        mDeterminant(1),
+        mVoxelSize(Vec3d(1,1,1)),
+        mIsDiagonal(true),
+        mIsIdentity(true)
+        // the default constructor for translation is zero
+    {
+    }
+
+    AffineMap(const Mat3d& m) // Build the map from a 3x3 matrix placed into a 4x4 identity.
+    {
+        Mat4d mat4(Mat4d::identity());
+        mat4.setMat3(m);
+        mMatrix = mat4;
+        updateAcceleration(); // fills in every cached member; throws if the matrix is nearly singular
+    }
+
+    AffineMap(const Mat4d& m): mMatrix(m) // Build the map from a full 4x4 matrix, which must be affine.
+    {
+        if (!isAffine(m)) { // reject matrices that isAffine() does not accept
+            OPENVDB_THROW(ArithmeticError,
+                "Tried to initialize an affine transform from a non-affine 4x4 matrix");
+        }
+        updateAcceleration(); // fills in every cached member; throws if the matrix is nearly singular
+    }
+
+    AffineMap(const AffineMap& other):
+        MapBase(other),
+        mMatrix(other.mMatrix),
+        mMatrixInv(other.mMatrixInv),
+        mJacobianInv(other.mJacobianInv),
+        mDeterminant(other.mDeterminant),
+        mVoxelSize(other.mVoxelSize),
+        mIsDiagonal(other.mIsDiagonal),
+        mIsIdentity(other.mIsIdentity)
+    {
+    }
+
+    /// @brief Constructor that merges the matrices of two affine maps (@a first is applied before @a second)
+    AffineMap(const AffineMap& first, const AffineMap& second):
+        mMatrix(first.mMatrix * second.mMatrix) // applyMap() computes in * mMatrix, so first acts first
+    {
+        updateAcceleration(); // fills in every cached member; throws if the product is nearly singular
+    }
+
+    ~AffineMap() {}
+
+    /// Return a MapBase::Ptr to a new AffineMap
+    static MapBase::Ptr create() { return MapBase::Ptr(new AffineMap()); }
+    /// Return a MapBase::Ptr to a deep copy of this map
+    MapBase::Ptr copy() const { return MapBase::Ptr(new AffineMap(*this)); }
+
+    MapBase::Ptr inverseMap() const { return MapBase::Ptr(new AffineMap(mMatrixInv)); }
+
+    static bool isRegistered() { return MapRegistry::isRegistered(AffineMap::mapType()); }
+
+    static void registerMap()
+    {
+        MapRegistry::registerMap(
+            AffineMap::mapType(),
+            AffineMap::create);
+    }
+
+    Name type() const { return mapType(); }
+    static Name mapType() { return Name("AffineMap"); }
+
+    /// Return @c true (an AffineMap is always linear).
+    bool isLinear() const { return true; }
+
+    /// Return @c true if the 3x3 block, divided by cbrt(|det|), is unitary; @c false if det is ~0
+    bool hasUniformScale() const
+    {
+        Mat3d mat = mMatrix.getMat3();
+        const double det = mat.det();
+        if (isApproxEqual(det, double(0))) {
+            return false; // a (nearly) singular block cannot be normalized
+        } else {
+            mat *= (1.f / pow(std::abs(det),1./3.)); // divide out the cube root of |det|
+            return isUnitary(mat);
+        }
+    }
+
+    virtual bool isEqual(const MapBase& other) const { return isEqualBase(*this, other); }
+
+    bool operator==(const AffineMap& other) const
+    {
+        // Mat::eq() is an approximate comparison, so equality here is approximate too.
+        // Both the forward matrix and the cached inverse have to agree;
+        // the inverse is only examined when the forward matrices already match.
+        return mMatrix.eq(other.mMatrix) && mMatrixInv.eq(other.mMatrixInv);
+    }
+
+    bool operator!=(const AffineMap& other) const { return !(*this == other); }
+
+    AffineMap& operator=(const AffineMap& other) // Member-wise assignment; returns *this.
+    {
+        mMatrix = other.mMatrix;
+        mMatrixInv = other.mMatrixInv;
+
+        mJacobianInv = other.mJacobianInv; // cached values are copied, not recomputed
+        mDeterminant = other.mDeterminant;
+        mVoxelSize = other.mVoxelSize;
+        mIsDiagonal  = other.mIsDiagonal;
+        mIsIdentity  = other.mIsIdentity;
+        return *this;
+    }
+    /// Return the image of @c in under the map
+    Vec3d applyMap(const Vec3d& in) const { return in * mMatrix; }
+    /// Return the pre-image of @c in under the map
+    Vec3d applyInverseMap(const Vec3d& in) const {return in * mMatrixInv; }
+
+    /// Return the Jacobian of the map applied to @a in.
+    Vec3d applyJacobian(const Vec3d& in, const Vec3d&) const { return applyJacobian(in); }
+    /// Return the Jacobian of the map applied to @a in.
+    Vec3d applyJacobian(const Vec3d& in) const { return mMatrix.transform3x3(in); }
+
+    /// Return the Inverse Jacobian of the map applied to @a in (i.e. inverse map without translation); 2nd arg ignored
+    Vec3d applyInverseJacobian(const Vec3d& in, const Vec3d&) const { return applyInverseJacobian(in); }
+    /// Return the Inverse Jacobian of the map applied to @a in (i.e. inverse map without translation)
+    Vec3d applyInverseJacobian(const Vec3d& in) const { return mMatrixInv.transform3x3(in); }
+
+    /// Return the Jacobian Transpose of the map applied to @a in (the second argument is ignored).
+    /// This transforms range-space gradients to domain-space gradients
+    Vec3d applyJT(const Vec3d& in, const Vec3d&) const { return applyJT(in); }
+    /// Return the Jacobian Transpose of the map applied to @a in.
+    Vec3d applyJT(const Vec3d& in) const {
+        const double* m = mMatrix.asPointer(); // flat view of the 4x4; only entries 0-2, 4-6, 8-10 are read
+        return Vec3d( m[ 0] * in[0] + m[ 1] * in[1] + m[ 2] * in[2],
+                      m[ 4] * in[0] + m[ 5] * in[1] + m[ 6] * in[2],
+                      m[ 8] * in[0] + m[ 9] * in[1] + m[10] * in[2] );
+    }
+
+    /// Return the transpose of the inverse Jacobian of the map applied to @a in.
+    Vec3d applyIJT(const Vec3d& in, const Vec3d&) const { return applyIJT(in); }
+    /// Return the transpose of the inverse Jacobian of the map applied to @c in
+    Vec3d applyIJT(const Vec3d& in) const { return in * mJacobianInv; }
+    /// Return mJacobianInv^T * m * mJacobianInv (the Jacobian Curvature itself is zero for a linear map)
+    Mat3d applyIJC(const Mat3d& m) const {
+        return mJacobianInv.transpose()* m * mJacobianInv;
+    }
+    Mat3d applyIJC(const Mat3d& in, const Vec3d& , const Vec3d& ) const { // both Vec3d arguments are ignored
+        return applyIJC(in);
+    }
+    /// Return the determinant of the Jacobian, ignores argument
+    double determinant(const Vec3d& ) const { return determinant(); }
+    /// Return the determinant of the Jacobian
+    double determinant() const { return mDeterminant; }
+
+    //@{
+    /// @brief Return the lengths of the images of the segments
+    /// (0,0,0)-(1,0,0), (0,0,0)-(0,1,0) and (0,0,0)-(0,0,1).
+    Vec3d voxelSize() const { return mVoxelSize; }
+    Vec3d voxelSize(const Vec3d&) const { return voxelSize(); }
+    //@}
+
+    /// Return @c true if the underlying matrix is approximately an identity
+    bool isIdentity() const { return mIsIdentity; }
+    /// Return @c true if the underlying matrix is diagonal (cached result of math::isDiagonal on the 4x4)
+    bool isDiagonal() const { return mIsDiagonal; }
+    /// Return @c true if the map is equivalent to a ScaleMap
+    bool isScale() const { return isDiagonal(); }
+    /// Return @c true if the map is equivalent to a ScaleTranslateMap
+    bool isScaleTranslate() const { return math::isDiagonal(mMatrix.getMat3()); }
+
+
+    // Methods that modify the existing affine map
+
+    //@{
+    /// @brief Modify the existing affine map by pre-applying the given operation.
+    void accumPreRotation(Axis axis, double radians)
+    {
+        mMatrix.preRotate(axis, radians);
+        updateAcceleration();
+    }
+    void accumPreScale(const Vec3d& v)
+    {
+        mMatrix.preScale(v);
+        updateAcceleration();
+    }
+    void accumPreTranslation(const Vec3d& v)
+    {
+        mMatrix.preTranslate(v);
+        updateAcceleration();
+    }
+    void accumPreShear(Axis axis0, Axis axis1, double shear)
+    {
+        mMatrix.preShear(axis0, axis1, shear);
+        updateAcceleration();
+    }
+    //@}
+
+
+    //@{
+    /// @brief Modify the existing affine map by post-applying the given operation.
+    void accumPostRotation(Axis axis, double radians)
+    {
+        mMatrix.postRotate(axis, radians);
+        updateAcceleration();
+    }
+    void accumPostScale(const Vec3d& v)
+    {
+        mMatrix.postScale(v);
+        updateAcceleration();
+    }
+    void accumPostTranslation(const Vec3d& v)
+    {
+        mMatrix.postTranslate(v);
+        updateAcceleration();
+    }
+    void accumPostShear(Axis axis0, Axis axis1, double shear)
+    {
+        mMatrix.postShear(axis0, axis1, shear);
+        updateAcceleration();
+    }
+    //@}
+
+
+    /// read serialization
+    void read(std::istream& is)
+    {
+        mMatrix.read(is);
+        updateAcceleration();
+    }
+
+    /// write serialization
+    void write(std::ostream& os) const
+    {
+        mMatrix.write(os);
+    }
+
+    /// string serialization, useful for debugging
+    std::string str() const
+    {
+        std::ostringstream buffer;
+        buffer << " - mat4:\n" << mMatrix.str() << std::endl;
+        buffer << " - voxel dimensions: " << mVoxelSize << std::endl;
+        return buffer.str();
+    }
+
+    /// on-demand decomposition of the affine map
+    boost::shared_ptr<FullyDecomposedMap> createDecomposedMap()
+    {
+        return createFullyDecomposedMap(mMatrix);
+    }
+
+    /// Return AffineMap::Ptr to  a deep copy of the current AffineMap
+    AffineMap::Ptr getAffineMap() const { return AffineMap::Ptr(new AffineMap(*this)); }
+
+    /// Return AffineMap::Ptr to the inverse of this map
+    AffineMap::Ptr inverse() const { return AffineMap::Ptr(new AffineMap(mMatrixInv)); }
+
+
+    //@{
+    /// @brief  Return a MapBase::Ptr to a new map that is the result
+    /// of prepending the appropriate operation.
+    MapBase::Ptr preRotate(double radians, Axis axis = X_AXIS) const
+    {
+        AffineMap::Ptr affineMap = getAffineMap();
+        affineMap->accumPreRotation(axis, radians);
+        return simplify(affineMap);
+    }
+    MapBase::Ptr preTranslate(const Vec3d& t) const
+    {
+        AffineMap::Ptr affineMap = getAffineMap();
+        affineMap->accumPreTranslation(t);
+        return boost::static_pointer_cast<MapBase, AffineMap>(affineMap);
+    }
+    MapBase::Ptr preScale(const Vec3d& s) const
+    {
+        AffineMap::Ptr affineMap = getAffineMap();
+        affineMap->accumPreScale(s);
+        return boost::static_pointer_cast<MapBase, AffineMap>(affineMap);
+    }
+    MapBase::Ptr preShear(double shear, Axis axis0, Axis axis1) const
+    {
+        AffineMap::Ptr affineMap = getAffineMap();
+        affineMap->accumPreShear(axis0, axis1, shear);
+        return simplify(affineMap);
+    }
+    //@}
+
+
+    //@{
+    /// @brief  Return a MapBase::Ptr to a new map that is the result
+    /// of postfixing the appropriate operation.
+    MapBase::Ptr postRotate(double radians, Axis axis = X_AXIS) const
+    {
+        AffineMap::Ptr affineMap = getAffineMap();
+        affineMap->accumPostRotation(axis, radians);
+        return simplify(affineMap);
+    }
+    MapBase::Ptr postTranslate(const Vec3d& t) const
+    {
+        AffineMap::Ptr affineMap = getAffineMap();
+        affineMap->accumPostTranslation(t);
+        return boost::static_pointer_cast<MapBase, AffineMap>(affineMap);
+    }
+    MapBase::Ptr postScale(const Vec3d& s) const
+    {
+        AffineMap::Ptr affineMap = getAffineMap();
+        affineMap->accumPostScale(s);
+        return boost::static_pointer_cast<MapBase, AffineMap>(affineMap);
+    }
+    MapBase::Ptr postShear(double shear, Axis axis0, Axis axis1) const
+    {
+        AffineMap::Ptr affineMap = getAffineMap();
+        affineMap->accumPostShear(axis0, axis1, shear);
+        return simplify(affineMap);
+    }
+    //@}
+
+    /// Return the matrix representation of this AffineMap
+    Mat4d getMat4() const { return mMatrix;}
+    const Mat4d& getConstMat4() const {return mMatrix;}
+    const Mat3d& getConstJacobianInv() const {return mJacobianInv;}
+
+private:
+    void updateAcceleration() { // Recompute every cached member from mMatrix; throws if nearly singular.
+        Mat3d mat3 = mMatrix.getMat3();
+        mDeterminant = mat3.det();
+
+        if (std::abs(mDeterminant) < (3.0 * math::Tolerance<double>::value())) { // reject before inverting
+            OPENVDB_THROW(ArithmeticError,
+                "Tried to initialize an affine transform from a nearly singular matrix");
+        }
+        mMatrixInv = mMatrix.inverse();
+        mJacobianInv = mat3.inverse().transpose(); // inverse transpose of the 3x3 block
+        mIsDiagonal = math::isDiagonal(mMatrix);
+        mIsIdentity = math::isIdentity(mMatrix);
+        Vec3d pos = applyMap(Vec3d(0,0,0)); // image of the origin
+        mVoxelSize(0) = (applyMap(Vec3d(1,0,0)) - pos).length(); // length of the mapped unit x segment
+        mVoxelSize(1) = (applyMap(Vec3d(0,1,0)) - pos).length(); // length of the mapped unit y segment
+        mVoxelSize(2) = (applyMap(Vec3d(0,0,1)) - pos).length(); // length of the mapped unit z segment
+    }
+
+    // the underlying matrix
+    Mat4d  mMatrix;
+
+    // stored for acceleration
+    Mat4d  mMatrixInv;
+    Mat3d  mJacobianInv;
+    double mDeterminant;
+    Vec3d  mVoxelSize;
+    bool   mIsDiagonal, mIsIdentity;
+}; // class AffineMap
+
+
+////////////////////////////////////////
+
+
+/// @brief A specialized Affine transform that scales along the principal axis
+/// the scaling need not be uniform in the three-directions
+class OPENVDB_API ScaleMap: public MapBase
+{
+public:
+    typedef boost::shared_ptr<ScaleMap>       Ptr;
+    typedef boost::shared_ptr<const ScaleMap> ConstPtr;
+
+    ScaleMap(): MapBase(), mScaleValues(Vec3d(1,1,1)), mVoxelSize(Vec3d(1,1,1)),
+                mScaleValuesInverse(Vec3d(1,1,1)),
+                mInvScaleSqr(1,1,1), mInvTwiceScale(0.5,0.5,0.5){}
+
+    ScaleMap(const Vec3d& scale): // Build from per-axis scale factors; throws if their product is ~0.
+        MapBase(),
+        mScaleValues(scale),
+        mVoxelSize(Vec3d(std::abs(scale(0)),std::abs(scale(1)), std::abs(scale(2)))) // |scale| per axis
+    {
+        double determinant = scale[0]* scale[1] * scale[2];
+        if (std::abs(determinant) < 3.0 * math::Tolerance<double>::value()) { // guards the division below
+            OPENVDB_THROW(ArithmeticError, "Non-zero scale values required");
+        }
+        mScaleValuesInverse = 1.0 / mScaleValues;
+        mInvScaleSqr = mScaleValuesInverse * mScaleValuesInverse; // cached for getInvScaleSqr()
+        mInvTwiceScale = mScaleValuesInverse / 2; // cached for getInvTwiceScale()
+    }
+
+    ScaleMap(const ScaleMap& other): // Copy constructor: copies the scale and every cached value.
+        MapBase(other), // copy-construct the base, as AffineMap's copy constructor does
+        mScaleValues(other.mScaleValues),
+        mVoxelSize(other.mVoxelSize),
+        mScaleValuesInverse(other.mScaleValuesInverse),
+        mInvScaleSqr(other.mInvScaleSqr),
+        mInvTwiceScale(other.mInvTwiceScale)
+    {
+    }
+
+    ~ScaleMap() {}
+
+    /// Return a MapBase::Ptr to a new ScaleMap
+    static MapBase::Ptr create() { return MapBase::Ptr(new ScaleMap()); }
+    /// Return a MapBase::Ptr to a deep copy of this map
+    MapBase::Ptr copy() const { return MapBase::Ptr(new ScaleMap(*this)); }
+
+    MapBase::Ptr inverseMap() const { return MapBase::Ptr(new ScaleMap(mScaleValuesInverse)); }
+
+    static bool isRegistered() { return MapRegistry::isRegistered(ScaleMap::mapType()); }
+
+    static void registerMap()
+    {
+        MapRegistry::registerMap(
+            ScaleMap::mapType(),
+            ScaleMap::create);
+    }
+
+    Name type() const { return mapType(); }
+    static Name mapType() { return Name("ScaleMap"); }
+
+    /// Return @c true (a ScaleMap is always linear).
+    bool isLinear() const { return true; }
+
+    /// Return @c true if the values have the same magnitude (e.g. -1, 1, -1 would be a rotation).
+    bool hasUniformScale() const
+    {
+        const double tolerance = 5e-7;
+        const double absX = std::abs(mScaleValues.x());
+        // |y| and then |z| are compared against |x|; z is only tested when y already matched.
+        return isApproxEqual(absX, std::abs(mScaleValues.y()), tolerance)
+            && isApproxEqual(absX, std::abs(mScaleValues.z()), tolerance);
+    }
+
+    /// Return the image of @c in under the map
+    Vec3d applyMap(const Vec3d& in) const
+    {
+        return Vec3d(
+            in.x() * mScaleValues.x(),
+            in.y() * mScaleValues.y(),
+            in.z() * mScaleValues.z());
+    }
+    /// Return the pre-image of @c in under the map
+    Vec3d applyInverseMap(const Vec3d& in) const
+    {
+        return Vec3d(
+            in.x() * mScaleValuesInverse.x(),
+            in.y() * mScaleValuesInverse.y(),
+            in.z() * mScaleValuesInverse.z());
+    }
+    /// Return the Jacobian of the map applied to @a in.
+    Vec3d applyJacobian(const Vec3d& in, const Vec3d&) const { return applyJacobian(in); }
+    /// Return the Jacobian of the map applied to @a in.
+    Vec3d applyJacobian(const Vec3d& in) const { return applyMap(in); }
+
+    /// Return the Inverse Jacobian of the map applied to @a in (i.e. inverse map without translation); 2nd arg ignored
+    Vec3d applyInverseJacobian(const Vec3d& in, const Vec3d&) const { return applyInverseJacobian(in); }
+    /// Return the Inverse Jacobian of the map applied to @a in (i.e. inverse map without translation)
+    Vec3d applyInverseJacobian(const Vec3d& in) const { return applyInverseMap(in); }
+
+    /// Return the Jacobian Transpose of the map applied to @a in (the second argument is ignored).
+    /// This transforms range-space gradients to domain-space gradients
+    Vec3d applyJT(const Vec3d& in, const Vec3d&) const { return applyJT(in); }
+    /// Return the Jacobian Transpose of the map applied to @a in (forwards to applyMap()).
+    Vec3d applyJT(const Vec3d& in) const { return applyMap(in); }
+
+
+
+    /// @brief Return the transpose of the inverse Jacobian of the map applied to @a in.
+    /// @details Ignores second argument
+    Vec3d applyIJT(const Vec3d& in, const Vec3d&) const { return applyIJT(in);}
+    /// Return the transpose of the inverse Jacobian of the map applied to @c in
+    Vec3d applyIJT(const Vec3d& in) const { return applyInverseMap(in); }
+    /// Return @a in with row i and column i each scaled by 1/scale(i) (Jacobian Curvature is zero for a linear map)
+    Mat3d applyIJC(const Mat3d& in) const
+    {
+        Mat3d tmp;
+        for (int i = 0; i < 3; i++) { // first pass: fill tmp with the rows of in, each scaled
+            tmp.setRow(i, in.row(i) * mScaleValuesInverse(i));
+        }
+        for (int i = 0; i < 3; i++) { // second pass: scale the columns of the row-scaled result
+            tmp.setCol(i, tmp.col(i) * mScaleValuesInverse(i));
+        }
+        return tmp;
+    }
+    Mat3d applyIJC(const Mat3d& in, const Vec3d&, const Vec3d&) const { return applyIJC(in); }
+    /// Return the product of the scale values, ignores argument
+    double determinant(const Vec3d&) const { return determinant(); }
+    /// Return the product of the scale values
+    double determinant() const { return mScaleValues.x() * mScaleValues.y() * mScaleValues.z(); }
+
+    /// Return the scale values that define the map
+    const Vec3d& getScale() const {return mScaleValues;}
+
+    /// Return the square of the inverse scale.  Used to optimize some finite difference calculations
+    const Vec3d& getInvScaleSqr() const { return mInvScaleSqr; }
+    /// Return 1/(2 scale). Used to optimize some finite difference calculations
+    const Vec3d& getInvTwiceScale() const { return mInvTwiceScale; }
+    /// Return 1/(scale)
+    const Vec3d& getInvScale() const { return mScaleValuesInverse; }
+
+    //@{
+    /// @brief Returns the lengths of the images
+    /// of the segments
+    /// \f$(0,0,0)-(1,0,0)\f$, \f$(0,0,0)-(0,1,0)\f$, \f$(0,0,0)-(0,0,1)\f$
+    /// this is equivalent to the absolute values of the scale values
+    Vec3d voxelSize() const { return mVoxelSize; }
+    Vec3d voxelSize(const Vec3d&) const { return voxelSize(); }
+    //@}
+
+    /// read serialization
+    void read(std::istream& is)
+    {
+        mScaleValues.read(is);
+        mVoxelSize.read(is);
+        mScaleValuesInverse.read(is);
+        mInvScaleSqr.read(is);
+        mInvTwiceScale.read(is);
+    }
+    /// write serialization
+    void write(std::ostream& os) const
+    {
+        mScaleValues.write(os);
+        mVoxelSize.write(os);
+        mScaleValuesInverse.write(os);
+        mInvScaleSqr.write(os);
+        mInvTwiceScale.write(os);
+    }
+    /// String serialization of the scale and voxel dimensions, useful for debugging
+    std::string str() const
+    {
+        std::ostringstream buffer;
+        buffer << " - scale: " << mScaleValues << std::endl;
+        buffer << " - voxel dimensions: " << mVoxelSize << std::endl;
+        return buffer.str();
+    }
+
+    virtual bool isEqual(const MapBase& other) const { return isEqualBase(*this, other); }
+
+    bool operator==(const ScaleMap& other) const
+    {
+        // Only the scale vectors are compared; the other members are not examined.
+        // Vec3::eq() applies a tolerance, so this is an approximate test.
+        return mScaleValues.eq(other.mScaleValues);
+    }
+
+    bool operator!=(const ScaleMap& other) const { return !(*this == other); }
+
+    /// Return a AffineMap equivalent to this map
+    AffineMap::Ptr getAffineMap() const
+    {
+        return AffineMap::Ptr(new AffineMap(math::scale<Mat4d>(mScaleValues)));
+    }
+
+
+
+    //@{
+    /// @brief  Return a MapBase::Ptr to a new map that is the result
+    /// of prepending the appropriate operation to the existing map
+    MapBase::Ptr preRotate(double radians, Axis axis) const
+    {
+        AffineMap::Ptr affineMap = getAffineMap();
+        affineMap->accumPreRotation(axis, radians);
+        return simplify(affineMap);
+    }
+
+    MapBase::Ptr preTranslate(const Vec3d& tr) const;
+
+    MapBase::Ptr preScale(const Vec3d& v) const;
+
+    MapBase::Ptr preShear(double shear, Axis axis0, Axis axis1) const
+    {
+        AffineMap::Ptr affineMap = getAffineMap();
+        affineMap->accumPreShear(axis0, axis1, shear);
+        return simplify(affineMap);
+    }
+    //@}
+
+
+    //@{
+    /// @brief  Return a MapBase::Ptr to a new map that is the result
+    /// of postfixing the appropriate operation to the existing map.
+    MapBase::Ptr postRotate(double radians, Axis axis) const
+    {
+        AffineMap::Ptr affineMap = getAffineMap();
+        affineMap->accumPostRotation(axis, radians);
+        return simplify(affineMap);
+    }
+
+    MapBase::Ptr postTranslate(const Vec3d& tr) const;
+
+    MapBase::Ptr postScale(const Vec3d& v) const;
+
+    MapBase::Ptr postShear(double shear, Axis axis0, Axis axis1) const
+    {
+        AffineMap::Ptr affineMap = getAffineMap();
+        affineMap->accumPostShear(axis0, axis1, shear);
+        return simplify(affineMap);
+    }
+    //@}
+
+private:
+    Vec3d mScaleValues, mVoxelSize, mScaleValuesInverse, mInvScaleSqr, mInvTwiceScale;
+}; // class ScaleMap
+
+
+/// @brief A specialized Affine transform that scales along the principal axes;
+/// the scaling is uniform in the three directions
+class OPENVDB_API UniformScaleMap: public ScaleMap
+{
+public:
+    typedef boost::shared_ptr<UniformScaleMap>       Ptr;
+    typedef boost::shared_ptr<const UniformScaleMap> ConstPtr;
+
+    UniformScaleMap(): ScaleMap(Vec3d(1,1,1)) {} // default: unit scale
+    UniformScaleMap(double scale): ScaleMap(Vec3d(scale, scale, scale)) {} // same factor on all three axes
+    UniformScaleMap(const UniformScaleMap& other): ScaleMap(other) {}
+    ~UniformScaleMap() {}
+
+    /// Return a MapBase::Ptr to a new UniformScaleMap
+    static MapBase::Ptr create() { return MapBase::Ptr(new UniformScaleMap()); }
+    /// Return a MapBase::Ptr to a deep copy of this map
+    MapBase::Ptr copy() const { return MapBase::Ptr(new UniformScaleMap(*this)); }
+
+    MapBase::Ptr inverseMap() const // Return a new UniformScaleMap built from the inverse scale.
+    {
+        const Vec3d& invScale = getInvScale();
+        return MapBase::Ptr(new UniformScaleMap( invScale[0])); // uses component 0 only; assumes all three agree
+    }
+
+    static bool isRegistered() { return MapRegistry::isRegistered(UniformScaleMap::mapType()); }
+    static void registerMap() // Register create() under the name returned by mapType().
+    {
+        MapRegistry::registerMap(
+            UniformScaleMap::mapType(),
+            UniformScaleMap::create);
+    }
+
+    Name type() const { return mapType(); }
+    static Name mapType() { return Name("UniformScaleMap"); }
+
+    virtual bool isEqual(const MapBase& other) const { return isEqualBase(*this, other); }
+
+    bool operator==(const UniformScaleMap& other) const { return ScaleMap::operator==(other); }
+    bool operator!=(const UniformScaleMap& other) const { return !(*this == other); }
+
+    /// Return a MapBase::Ptr to a UniformScaleTranslateMap that is the result of
+    /// pre-translation on this map
+    MapBase::Ptr preTranslate(const Vec3d& tr) const;
+
+    /// Return a MapBase::Ptr to a UniformScaleTranslateMap that is the result of
+    /// post-translation on this map
+    MapBase::Ptr postTranslate(const Vec3d& tr) const;
+
+}; // class UniformScaleMap
+
+
+////////////////////////////////////////
+
+
+inline MapBase::Ptr
+ScaleMap::preScale(const Vec3d& v) const
+{
+    const Vec3d scaled(v * mScaleValues);
+    const bool uniform =
+        isApproxEqual(scaled[0], scaled[1]) && isApproxEqual(scaled[0], scaled[2]);
+    // Return the more specialized UniformScaleMap when all three factors agree.
+    if (uniform) return MapBase::Ptr(new UniformScaleMap(scaled[0]));
+    return MapBase::Ptr(new ScaleMap(scaled));
+}
+
+
+inline MapBase::Ptr
+ScaleMap::postScale(const Vec3d& v) const
+{ // Pre- and post-scaling are the same for a scale map, so this forwards to preScale().
+    return preScale(v);
+}
+
+
+/// @brief A specialized linear transform that performs a translation
+class OPENVDB_API TranslationMap: public MapBase
+{
+public:
+    typedef boost::shared_ptr<TranslationMap>       Ptr;
+    typedef boost::shared_ptr<const TranslationMap> ConstPtr;
+
+    // Default constructor is a translation by zero; the copy constructor copy-constructs the base too.
+    TranslationMap(): MapBase(), mTranslation(Vec3d(0,0,0)) {}
+    TranslationMap(const Vec3d& t): MapBase(), mTranslation(t) {}
+    TranslationMap(const TranslationMap& other): MapBase(other), mTranslation(other.mTranslation) {}
+
+    ~TranslationMap() {}
+
+    /// Return a MapBase::Ptr to a new TranslationMap
+    static MapBase::Ptr create() { return MapBase::Ptr(new TranslationMap()); }
+    /// Return a MapBase::Ptr to a deep copy of this map
+    MapBase::Ptr copy() const { return MapBase::Ptr(new TranslationMap(*this)); }
+
+    MapBase::Ptr inverseMap() const { return MapBase::Ptr(new TranslationMap(-mTranslation)); }
+
+    static bool isRegistered() { return MapRegistry::isRegistered(TranslationMap::mapType()); }
+
+    static void registerMap()
+    {
+        MapRegistry::registerMap(
+            TranslationMap::mapType(),
+            TranslationMap::create);
+    }
+
+    Name type() const { return mapType(); }
+    static Name mapType() { return Name("TranslationMap"); }
+
+    /// Return @c true (a TranslationMap is always linear).
+    bool isLinear() const { return true; }
+
+    /// Return @c true (by convention)
+    bool hasUniformScale() const { return true; }
+
+    /// Return the image of @c in under the map
+    Vec3d applyMap(const Vec3d& in) const { return in + mTranslation; }
+    /// Return the pre-image of @c in under the map
+    Vec3d applyInverseMap(const Vec3d& in) const { return in - mTranslation; }
+    /// Return the Jacobian of the map applied to @a in.
+    Vec3d applyJacobian(const Vec3d& in, const Vec3d&) const { return applyJacobian(in); }
+    /// Return the Jacobian of the map applied to @a in.
+    Vec3d applyJacobian(const Vec3d& in) const { return in; }
+
+    /// Return the Inverse Jacobian of the map applied to @a in (i.e. inverse map without translation); 2nd arg ignored
+    Vec3d applyInverseJacobian(const Vec3d& in, const Vec3d&) const { return applyInverseJacobian(in); }
+    /// Return the Inverse Jacobian of the map applied to @a in (i.e. inverse map without translation)
+    Vec3d applyInverseJacobian(const Vec3d& in) const { return in; }
+
+
+    /// Return the Jacobian Transpose of the map applied to @a in (the second argument is ignored).
+    /// This transforms range-space gradients to domain-space gradients
+    Vec3d applyJT(const Vec3d& in, const Vec3d&) const { return applyJT(in); }
+    /// Return the Jacobian Transpose of the map applied to @a in (returns @a in unchanged).
+    Vec3d applyJT(const Vec3d& in) const { return in; }
+
+    /// @brief Return the transpose of the inverse Jacobian (Identity for TranslationMap)
+    /// of the map applied to @c in, ignores second argument
+    Vec3d applyIJT(const Vec3d& in, const Vec3d& ) const { return applyIJT(in);}
+    /// @brief Return the transpose of the inverse Jacobian (Identity for TranslationMap)
+    /// of the map applied to @c in
+    Vec3d applyIJT(const Vec3d& in) const {return in;}
+    /// Return the Jacobian Curvature: zero for a linear map
+    Mat3d applyIJC(const Mat3d& mat) const {return mat;}
+    Mat3d applyIJC(const Mat3d& mat, const Vec3d&, const Vec3d&) const { return applyIJC(mat); }
+
+    /// Return @c 1
+    double determinant(const Vec3d& ) const { return determinant(); }
+    /// Return @c 1
+    double determinant() const { return 1.0; }
+
+    /// Return \f$ (1,1,1) \f$
+    Vec3d voxelSize() const { return Vec3d(1,1,1);}
+    /// Return \f$ (1,1,1) \f$
+    Vec3d voxelSize(const Vec3d&) const { return voxelSize();}
+
+    /// Return the translation vector
+    const Vec3d& getTranslation() const { return mTranslation; }
+    /// read serialization
+    void read(std::istream& is) { mTranslation.read(is); }
+    /// write serialization
+    void write(std::ostream& os) const { mTranslation.write(os); }
+
+    /// String serialization of the translation vector, useful for debugging
+    std::string str() const
+    {
+        std::ostringstream buffer;
+        buffer << " - translation: " << mTranslation << std::endl;
+        return buffer.str();
+    }
+
+    virtual bool isEqual(const MapBase& other) const { return isEqualBase(*this, other); }
+
+    bool operator==(const TranslationMap& other) const
+    {
+        // ::eq() uses a tolerance
+        return mTranslation.eq(other.mTranslation);
+    }
+
+    bool operator!=(const TranslationMap& other) const { return !(*this == other); }
+
+    /// Return an AffineMap::Ptr to an AffineMap equivalent to *this
+    AffineMap::Ptr getAffineMap() const
+    {
+        // Start from the 4x4 identity ...
+        Mat4d xform(Mat4d::identity());
+        // ... and set this map's offset as its translation.
+        xform.setTranslation(mTranslation);
+        return AffineMap::Ptr(new AffineMap(xform));
+    }
+
+    //@{
+    /// @brief Return a MapBase::Ptr to a new map that is the result
+    /// of prepending the appropriate operation.
+    MapBase::Ptr preRotate(double radians, Axis axis) const
+    {
+        AffineMap::Ptr affineMap = getAffineMap();
+        affineMap->accumPreRotation(axis, radians);
+        return simplify(affineMap);
+
+    }
+    MapBase::Ptr preTranslate(const Vec3d& t) const
+    {
+        return MapBase::Ptr(new TranslationMap(t + mTranslation));
+    }
+
+    MapBase::Ptr preScale(const Vec3d& v) const;
+
+    MapBase::Ptr preShear(double shear, Axis axis0, Axis axis1) const
+    {
+        AffineMap::Ptr affineMap = getAffineMap();
+        affineMap->accumPreShear(axis0, axis1, shear);
+        return simplify(affineMap);
+    }
+    //@}
+
+    //@{
+    /// @brief Return a MapBase::Ptr to a new map that is the result
+    /// of postfixing the appropriate operation.
+    MapBase::Ptr postRotate(double radians, Axis axis) const
+    {
+        AffineMap::Ptr affineMap = getAffineMap();
+        affineMap->accumPostRotation(axis, radians);
+        return simplify(affineMap);
+
+    }
+    MapBase::Ptr postTranslate(const Vec3d& t) const
+    { // post and pre are the same for this
+        return MapBase::Ptr(new TranslationMap(t + mTranslation));
+    }
+
+    MapBase::Ptr postScale(const Vec3d& v) const;
+
+    MapBase::Ptr postShear(double shear, Axis axis0, Axis axis1) const
+    {
+        AffineMap::Ptr affineMap = getAffineMap();
+        affineMap->accumPostShear(axis0, axis1, shear);
+        return simplify(affineMap);
+    }
+    //@}
+
+private:
+    Vec3d mTranslation;
+}; // class TranslationMap
+
+
+////////////////////////////////////////
+
+
+/// @brief A specialized Affine transform that scales along the principal axes
+/// (the scaling need not be uniform across the three directions) and then
+/// translates the result.
+class OPENVDB_API ScaleTranslateMap: public MapBase
+{
+public:
+    typedef boost::shared_ptr<ScaleTranslateMap>       Ptr;
+    typedef boost::shared_ptr<const ScaleTranslateMap> ConstPtr;
+
+    /// Construct the identity map (unit scale, zero translation).
+    ScaleTranslateMap():
+        MapBase(),
+        mTranslation(Vec3d(0,0,0)),
+        mScaleValues(Vec3d(1,1,1)),
+        mVoxelSize(Vec3d(1,1,1)),
+        mScaleValuesInverse(Vec3d(1,1,1)),
+        mInvScaleSqr(1,1,1),
+        mInvTwiceScale(0.5,0.5,0.5)
+    {
+    }
+
+    /// @brief Construct from per-axis scale factors and a translation.
+    /// @throw ArithmeticError if the product of the scale values is
+    /// (approximately) zero.
+    ScaleTranslateMap(const Vec3d& scale, const Vec3d& translate):
+        MapBase(),
+        mTranslation(translate),
+        mScaleValues(scale),
+        mVoxelSize(std::abs(scale(0)), std::abs(scale(1)), std::abs(scale(2)))
+    {
+        const double determinant = scale[0]* scale[1] * scale[2];
+        if (std::abs(determinant) < 3.0 * math::Tolerance<double>::value()) {
+            OPENVDB_THROW(ArithmeticError, "Non-zero scale values required");
+        }
+        // Cache the derived quantities returned by the getInv*() accessors.
+        mScaleValuesInverse = 1.0 / mScaleValues;
+        mInvScaleSqr = mScaleValuesInverse * mScaleValuesInverse;
+        mInvTwiceScale = mScaleValuesInverse / 2;
+    }
+
+    /// @brief Construct from an existing ScaleMap and TranslationMap.
+    /// @details The scale is taken as-is from @a scale; no determinant check
+    /// is repeated here.
+    ScaleTranslateMap(const ScaleMap& scale, const TranslationMap& translate):
+        MapBase(),
+        mTranslation(translate.getTranslation()),
+        mScaleValues(scale.getScale()),
+        mVoxelSize(std::abs(mScaleValues(0)),
+                         std::abs(mScaleValues(1)),
+                         std::abs(mScaleValues(2))),
+        mScaleValuesInverse(1.0 / scale.getScale())
+    {
+        mInvScaleSqr = mScaleValuesInverse * mScaleValuesInverse;
+        mInvTwiceScale = mScaleValuesInverse / 2;
+    }
+
+    /// Copy constructor.
+    ScaleTranslateMap(const ScaleTranslateMap& other):
+        // Forward to the base-class copy constructor, as UnitaryMap does.
+        MapBase(other),
+        mTranslation(other.mTranslation),
+        mScaleValues(other.mScaleValues),
+        mVoxelSize(other.mVoxelSize),
+        mScaleValuesInverse(other.mScaleValuesInverse),
+        mInvScaleSqr(other.mInvScaleSqr),
+        mInvTwiceScale(other.mInvTwiceScale)
+    {}
+
+    ~ScaleTranslateMap() {}
+
+    /// Return a MapBase::Ptr to a new ScaleTranslateMap
+    static MapBase::Ptr create() { return MapBase::Ptr(new ScaleTranslateMap()); }
+    /// Return a MapBase::Ptr to a deep copy of this map
+    MapBase::Ptr copy() const { return MapBase::Ptr(new ScaleTranslateMap(*this)); }
+
+    /// @brief Return a MapBase::Ptr to the inverse of this map.
+    /// @details The inverse scales by 1/scale and translates by
+    /// -translation/scale (component-wise).
+    MapBase::Ptr inverseMap() const
+    {
+        return MapBase::Ptr(new ScaleTranslateMap(
+            mScaleValuesInverse, -mScaleValuesInverse * mTranslation));
+    }
+
+    /// Return @c true if this map type is present in the MapRegistry.
+    static bool isRegistered() { return MapRegistry::isRegistered(ScaleTranslateMap::mapType()); }
+
+    /// Register this map type and its factory function with the MapRegistry.
+    static void registerMap()
+    {
+        MapRegistry::registerMap(
+            ScaleTranslateMap::mapType(),
+            ScaleTranslateMap::create);
+    }
+
+    /// Return @c ScaleTranslateMap
+    Name type() const { return mapType(); }
+    /// Return @c ScaleTranslateMap
+    static Name mapType() { return Name("ScaleTranslateMap"); }
+
+    /// Return @c true (a ScaleTranslateMap is always linear).
+    bool isLinear() const { return true; }
+
+    /// @brief Return @c true if the scale values have the same magnitude
+    /// (e.g., -1, 1, -1 would be a rotation).
+    bool hasUniformScale() const
+    {
+        bool value = isApproxEqual(
+            std::abs(mScaleValues.x()), std::abs(mScaleValues.y()), double(5e-7));
+        value = value && isApproxEqual(
+            std::abs(mScaleValues.x()), std::abs(mScaleValues.z()), double(5e-7));
+        return value;
+    }
+
+    /// Return the image of @c in under the map
+    Vec3d applyMap(const Vec3d& in) const
+    {
+        return Vec3d(
+            in.x() * mScaleValues.x() + mTranslation.x(),
+            in.y() * mScaleValues.y() + mTranslation.y(),
+            in.z() * mScaleValues.z() + mTranslation.z());
+    }
+    /// Return the pre-image of @c in under the map
+    Vec3d applyInverseMap(const Vec3d& in) const
+    {
+        return Vec3d(
+            (in.x() - mTranslation.x() ) * mScaleValuesInverse.x(),
+            (in.y() - mTranslation.y() ) * mScaleValuesInverse.y(),
+            (in.z() - mTranslation.z() ) * mScaleValuesInverse.z());
+    }
+
+    /// Return the Jacobian of the map applied to @a in. Ignores second argument
+    Vec3d applyJacobian(const Vec3d& in, const Vec3d&) const { return applyJacobian(in); }
+    /// Return the Jacobian of the map applied to @a in.
+    Vec3d applyJacobian(const Vec3d& in) const { return in * mScaleValues; }
+
+    /// Return the Inverse Jacobian of the map applied to @a in. (i.e. inverse map without translation)
+    Vec3d applyInverseJacobian(const Vec3d& in, const Vec3d&) const { return applyInverseJacobian(in); }
+    /// Return the Inverse Jacobian of the map applied to @a in. (i.e. inverse map without translation)
+    Vec3d applyInverseJacobian(const Vec3d& in) const { return in * mScaleValuesInverse; }
+
+    /// Return the Jacobian Transpose of the map applied to @a in.
+    /// This transforms range-space gradients to domain-space gradients
+    Vec3d applyJT(const Vec3d& in, const Vec3d&) const { return applyJT(in); }
+    /// Return the Jacobian Transpose of the map applied to @a in.
+    /// (Implemented by forwarding to applyJacobian(): the scale acts per component.)
+    Vec3d applyJT(const Vec3d& in) const { return applyJacobian(in); }
+
+    /// @brief Return the transpose of the inverse Jacobian of the map applied to @a in
+    /// @details Ignores second argument
+    Vec3d applyIJT(const Vec3d& in, const Vec3d& ) const { return applyIJT(in);}
+    /// Return the transpose of the inverse Jacobian of the map applied to @c in
+    Vec3d applyIJT(const Vec3d& in) const
+    {
+        return Vec3d(
+            in.x() * mScaleValuesInverse.x(),
+            in.y() * mScaleValuesInverse.y(),
+            in.z() * mScaleValuesInverse.z());
+    }
+    /// @brief Return @a in with element (i,j) multiplied by
+    /// 1/(scale_i * scale_j).
+    Mat3d applyIJC(const Mat3d& in) const
+    {
+        Mat3d tmp;
+        // First scale each row i by 1/scale_i ...
+        for (int i=0; i<3; i++){
+            tmp.setRow(i, in.row(i)*mScaleValuesInverse(i));
+        }
+        // ... then scale each column i of the intermediate result by 1/scale_i.
+        for (int i=0; i<3; i++){
+            tmp.setCol(i, tmp.col(i)*mScaleValuesInverse(i));
+        }
+        return tmp;
+    }
+    /// Same as applyIJC(in); ignores the second and third arguments
+    Mat3d applyIJC(const Mat3d& in, const Vec3d&, const Vec3d& ) const { return applyIJC(in); }
+
+    /// Return the product of the scale values, ignores argument
+    double determinant(const Vec3d& ) const { return determinant(); }
+    /// Return the product of the scale values
+    double determinant() const { return mScaleValues.x()*mScaleValues.y()*mScaleValues.z(); }
+    /// Return the absolute values of the scale values
+    Vec3d voxelSize() const { return mVoxelSize;}
+    /// Return the absolute values of the scale values, ignores
+    /// argument
+    Vec3d voxelSize(const Vec3d&) const { return voxelSize();}
+
+    /// Returns the scale values
+    const Vec3d& getScale() const { return mScaleValues; }
+    /// Returns the translation
+    const Vec3d& getTranslation() const { return mTranslation; }
+
+    /// Return 1/(scale^2), component-wise. Used to optimize some finite difference calculations
+    const Vec3d& getInvScaleSqr() const {return mInvScaleSqr;}
+    /// Return 1/(2 scale). Used to optimize some finite difference calculations
+    const Vec3d& getInvTwiceScale() const {return mInvTwiceScale;}
+    /// Return 1/(scale)
+    const Vec3d& getInvScale() const {return mScaleValuesInverse; }
+
+    /// read serialization (all six member vectors, in declaration order)
+    void read(std::istream& is)
+    {
+        mTranslation.read(is);
+        mScaleValues.read(is);
+        mVoxelSize.read(is);
+        mScaleValuesInverse.read(is);
+        mInvScaleSqr.read(is);
+        mInvTwiceScale.read(is);
+    }
+    /// write serialization (same field order as read())
+    void write(std::ostream& os) const
+    {
+        mTranslation.write(os);
+        mScaleValues.write(os);
+        mVoxelSize.write(os);
+        mScaleValuesInverse.write(os);
+        mInvScaleSqr.write(os);
+        mInvTwiceScale.write(os);
+    }
+    /// string serialization, useful for debugging
+    std::string str() const
+    {
+        std::ostringstream buffer;
+        buffer << " - translation: " << mTranslation << std::endl;
+        buffer << " - scale: " << mScaleValues << std::endl;
+        buffer << " - voxel dimensions: " << mVoxelSize << std::endl;
+        return buffer.str();
+    }
+
+    /// Compare against an arbitrary map by delegating to isEqualBase().
+    virtual bool isEqual(const MapBase& other) const { return isEqualBase(*this, other); }
+
+    /// Return @c true if scale and translation both match @a other.
+    bool operator==(const ScaleTranslateMap& other) const
+    {
+        // ::eq() uses a tolerance
+        if (!mScaleValues.eq(other.mScaleValues)) { return false; }
+        if (!mTranslation.eq(other.mTranslation)) { return false; }
+        return true;
+    }
+
+    bool operator!=(const ScaleTranslateMap& other) const { return !(*this == other); }
+
+    /// Return AffineMap::Ptr to an AffineMap equivalent to *this
+    AffineMap::Ptr getAffineMap() const
+    {
+        AffineMap::Ptr affineMap(new AffineMap(math::scale<Mat4d>(mScaleValues)));
+        affineMap->accumPostTranslation(mTranslation);
+        return affineMap;
+    }
+
+    //@{
+    /// @brief  Return a MapBase::Ptr to a new map that is the result
+    /// of prepending the appropriate operation.
+    MapBase::Ptr preRotate(double radians, Axis axis) const
+    {
+        AffineMap::Ptr affineMap = getAffineMap();
+        affineMap->accumPreRotation(axis, radians);
+        return simplify(affineMap);
+    }
+    MapBase::Ptr preTranslate(const Vec3d& t) const
+    {
+        // A translation applied before the scale is itself scaled.
+        const Vec3d& s = mScaleValues;
+        const Vec3d scaled_trans( t.x() * s.x(),
+                                  t.y() * s.y(),
+                                  t.z() * s.z() );
+        return MapBase::Ptr( new ScaleTranslateMap(mScaleValues, mTranslation + scaled_trans));
+    }
+
+    MapBase::Ptr preScale(const Vec3d& v) const;
+
+    MapBase::Ptr preShear(double shear, Axis axis0, Axis axis1) const
+    {
+        AffineMap::Ptr affineMap = getAffineMap();
+        affineMap->accumPreShear(axis0, axis1, shear);
+        return simplify(affineMap);
+    }
+    //@}
+
+    //@{
+    /// @brief  Return a MapBase::Ptr to a new map that is the result
+    /// of postfixing the appropriate operation.
+    MapBase::Ptr postRotate(double radians, Axis axis) const
+    {
+        AffineMap::Ptr affineMap = getAffineMap();
+        affineMap->accumPostRotation(axis, radians);
+        return simplify(affineMap);
+    }
+    MapBase::Ptr postTranslate(const Vec3d& t) const
+    {
+        return MapBase::Ptr( new ScaleTranslateMap(mScaleValues, mTranslation + t));
+    }
+
+    MapBase::Ptr postScale(const Vec3d& v) const;
+
+    MapBase::Ptr postShear(double shear, Axis axis0, Axis axis1) const
+    {
+        AffineMap::Ptr affineMap = getAffineMap();
+        affineMap->accumPostShear(axis0, axis1, shear);
+        return simplify(affineMap);
+    }
+    //@}
+
+private:
+    // mVoxelSize, mScaleValuesInverse, mInvScaleSqr and mInvTwiceScale are
+    // cached values derived from mScaleValues by the constructors.
+    Vec3d mTranslation, mScaleValues, mVoxelSize, mScaleValuesInverse,
+        mInvScaleSqr, mInvTwiceScale;
+}; // class ScaleTranslateMap
+
+
+inline MapBase::Ptr
+ScaleMap::postTranslate(const Vec3d& t) const
+{
+    // The translation is applied after the scale, so it is used unchanged.
+    MapBase::Ptr scaleThenShift(new ScaleTranslateMap(mScaleValues, t));
+    return scaleThenShift;
+}
+
+
+inline MapBase::Ptr
+ScaleMap::preTranslate(const Vec3d& t) const
+{
+    // A translation applied before the scale gets scaled itself, axis by axis.
+    const Vec3d offset(t.x() * mScaleValues.x(),
+                       t.y() * mScaleValues.y(),
+                       t.z() * mScaleValues.z());
+    MapBase::Ptr shiftThenScale(new ScaleTranslateMap(mScaleValues, offset));
+    return shiftThenScale;
+}
+
+
+/// @brief A specialized Affine transform that uniformly scales along the principal axes
+/// and then translates the result.
+class OPENVDB_API UniformScaleTranslateMap: public ScaleTranslateMap
+{
+public:
+    typedef boost::shared_ptr<UniformScaleTranslateMap>       Ptr;
+    typedef boost::shared_ptr<const UniformScaleTranslateMap> ConstPtr;
+
+    /// Construct with unit scale and zero translation.
+    UniformScaleTranslateMap():
+        ScaleTranslateMap(Vec3d(1,1,1), Vec3d(0,0,0))
+    {}
+
+    /// Construct from one scale factor, used for all three axes, and a translation.
+    UniformScaleTranslateMap(double scale, const Vec3d& translate):
+        ScaleTranslateMap(Vec3d(scale,scale,scale), translate)
+    {}
+
+    /// Construct from an existing UniformScaleMap and TranslationMap.
+    UniformScaleTranslateMap(const UniformScaleMap& scale, const TranslationMap& translate):
+        ScaleTranslateMap(scale.getScale(), translate.getTranslation())
+    {}
+
+    /// Copy constructor.
+    UniformScaleTranslateMap(const UniformScaleTranslateMap& other):
+        ScaleTranslateMap(other)
+    {}
+
+    ~UniformScaleTranslateMap() {}
+
+    /// Factory: return a MapBase::Ptr to a default-constructed UniformScaleTranslateMap.
+    static MapBase::Ptr create()
+    {
+        return MapBase::Ptr(new UniformScaleTranslateMap());
+    }
+
+    /// Return a MapBase::Ptr to a copy of this map.
+    MapBase::Ptr copy() const
+    {
+        return MapBase::Ptr(new UniformScaleTranslateMap(*this));
+    }
+
+    /// Return a MapBase::Ptr to the inverse map: scale 1/s, translation -t/s.
+    MapBase::Ptr inverseMap() const
+    {
+        // All three inverse-scale components are equal; use the first.
+        const double invScale = getInvScale()[0];
+        return MapBase::Ptr(
+            new UniformScaleTranslateMap(invScale, -invScale * getTranslation()));
+    }
+
+    /// Return @c true if this map type is present in the MapRegistry.
+    static bool isRegistered()
+    {
+        const Name typeName = UniformScaleTranslateMap::mapType();
+        return MapRegistry::isRegistered(typeName);
+    }
+
+    /// Register this map type and its factory function with the MapRegistry.
+    static void registerMap()
+    {
+        MapRegistry::registerMap(
+            UniformScaleTranslateMap::mapType(), UniformScaleTranslateMap::create);
+    }
+
+    /// Return @c UniformScaleTranslateMap
+    Name type() const { return mapType(); }
+    /// Return @c UniformScaleTranslateMap
+    static Name mapType() { return Name("UniformScaleTranslateMap"); }
+
+    /// Compare against an arbitrary map by delegating to isEqualBase().
+    virtual bool isEqual(const MapBase& other) const
+    {
+        return isEqualBase(*this, other);
+    }
+
+    /// Equality is decided entirely by the ScaleTranslateMap comparison.
+    bool operator==(const UniformScaleTranslateMap& other) const
+    {
+        return ScaleTranslateMap::operator==(other);
+    }
+
+    bool operator!=(const UniformScaleTranslateMap& other) const
+    {
+        return !(*this == other);
+    }
+
+    /// @brief Return a MapBase::Ptr to a UniformScaleTranslateMap that is
+    /// the result of prepending translation on this map.
+    MapBase::Ptr preTranslate(const Vec3d& t) const
+    {
+        // The prepended translation is scaled before being added.
+        const double s = this->getScale().x();
+        const Vec3d shifted = this->getTranslation() + s * t;
+        return MapBase::Ptr(new UniformScaleTranslateMap(s, shifted));
+    }
+
+    /// @brief Return a MapBase::Ptr to a UniformScaleTranslateMap that is
+    /// the result of postfixing translation on this map.
+    MapBase::Ptr postTranslate(const Vec3d& t) const
+    {
+        // The appended translation is added unscaled.
+        const double s = this->getScale().x();
+        return MapBase::Ptr(new UniformScaleTranslateMap(s, this->getTranslation() + t));
+    }
+}; // class UniformScaleTranslateMap
+
+
+inline MapBase::Ptr
+UniformScaleMap::postTranslate(const Vec3d& t) const
+{
+    // The translation follows the scale, so it is used unchanged.
+    const double s = this->getScale().x();
+    MapBase::Ptr scaleThenShift(new UniformScaleTranslateMap(s, t));
+    return scaleThenShift;
+}
+
+
+inline MapBase::Ptr
+UniformScaleMap::preTranslate(const Vec3d& t) const
+{
+    // The translation precedes the scale, so it is multiplied by the scale.
+    const double s = this->getScale().x();
+    MapBase::Ptr shiftThenScale(new UniformScaleTranslateMap(s, s*t));
+    return shiftThenScale;
+}
+
+
+inline MapBase::Ptr
+TranslationMap::preScale(const Vec3d& v) const
+{
+    // Use the uniform specialization only when all three factors agree.
+    const bool uniform = isApproxEqual(v[0],v[1]) && isApproxEqual(v[0],v[2]);
+    if (!uniform) {
+        return MapBase::Ptr(new ScaleTranslateMap(v, mTranslation));
+    }
+    // Scaling first leaves the existing translation untouched.
+    return MapBase::Ptr(new UniformScaleTranslateMap(v[0], mTranslation));
+}
+
+
+inline MapBase::Ptr
+TranslationMap::postScale(const Vec3d& v) const
+{
+    // Scaling after the translation also scales the translation.
+    const bool uniform = isApproxEqual(v[0],v[1]) && isApproxEqual(v[0],v[2]);
+    if (uniform) {
+        return MapBase::Ptr(new UniformScaleTranslateMap(v[0], v[0]*mTranslation));
+    }
+
+    const Vec3d scaledOffset(mTranslation.x()*v.x(),
+                             mTranslation.y()*v.y(),
+                             mTranslation.z()*v.z());
+    return MapBase::Ptr(new ScaleTranslateMap(v, scaledOffset));
+}
+
+
+inline MapBase::Ptr
+ScaleTranslateMap::preScale(const Vec3d& v) const
+{
+    // Scales multiply component-wise; the translation is unchanged.
+    const Vec3d combined( v * mScaleValues );
+    const bool uniform =
+        isApproxEqual(combined[0],combined[1]) && isApproxEqual(combined[0],combined[2]);
+    if (!uniform) {
+        return MapBase::Ptr( new ScaleTranslateMap(combined, mTranslation));
+    }
+    return MapBase::Ptr( new UniformScaleTranslateMap(combined[0], mTranslation));
+}
+
+
+inline MapBase::Ptr
+ScaleTranslateMap::postScale(const Vec3d& v) const
+{
+    // Scaling afterwards multiplies both the scale and the translation by v.
+    const Vec3d combined( v * mScaleValues );
+    const Vec3d scaledOffset( mTranslation.x()*v.x(),
+                              mTranslation.y()*v.y(),
+                              mTranslation.z()*v.z() );
+
+    const bool uniform =
+        isApproxEqual(combined[0],combined[1]) && isApproxEqual(combined[0],combined[2]);
+    if (!uniform) {
+        return MapBase::Ptr( new ScaleTranslateMap(combined, scaledOffset));
+    }
+    return MapBase::Ptr( new UniformScaleTranslateMap(combined[0], scaledOffset));
+}
+
+
+////////////////////////////////////////
+
+
+/// @brief A specialized linear transform that performs a unitary mapping,
+/// i.e., a rotation and/or reflection.
+class OPENVDB_API UnitaryMap: public MapBase
+{
+public:
+    typedef boost::shared_ptr<UnitaryMap>       Ptr;
+    typedef boost::shared_ptr<const UnitaryMap> ConstPtr;
+
+    /// Construct an identity map.
+    UnitaryMap(): mAffineMap(Mat4d::identity()) {}
+
+    /// Construct a rotation by @a radians about the vector @a axis.
+    UnitaryMap(const Vec3d& axis, double radians)
+    {
+        Mat3d rotation;
+        rotation.setToRotation(axis, radians);
+        mAffineMap = AffineMap(rotation);
+    }
+
+    /// Construct a rotation by @a radians about the coordinate axis @a axis.
+    UnitaryMap(Axis axis, double radians)
+    {
+        Mat4d rotation;
+        rotation.setToRotation(axis, radians);
+        mAffineMap = AffineMap(rotation);
+    }
+
+    /// @brief Construct from a 3x3 matrix.
+    /// @throw ArithmeticError if @a m does not pass isUnitary().
+    UnitaryMap(const Mat3d& m)
+    {
+        // Accept only a rotation and/or reflection.
+        if (!isUnitary(m)) {
+            OPENVDB_THROW(ArithmeticError, "Matrix initializing unitary map was not unitary");
+        }
+
+        // Embed the 3x3 block in an otherwise-identity 4x4 matrix.
+        Mat4d embedded(Mat4d::identity());
+        embedded.setMat3(m);
+        mAffineMap = AffineMap(embedded);
+    }
+
+    /// @brief Construct from a 4x4 matrix.
+    /// @throw ArithmeticError if @a m is not invertible, is not affine,
+    /// has a translation, or has a non-unitary 3x3 block.
+    UnitaryMap(const Mat4d& m)
+    {
+        // The checks run in this order and stop at the first failure, so
+        // the reported reason is always the earliest violated condition.
+        const char* failure = 0;
+        if (!isInvertible(m)) {
+            failure = "4x4 Matrix initializing unitary map was not unitary: not invertible";
+        } else if (!isAffine(m)) {
+            failure = "4x4 Matrix initializing unitary map was not unitary: not affine";
+        } else if (hasTranslation(m)) {
+            failure = "4x4 Matrix initializing unitary map was not unitary: had translation";
+        } else if (!isUnitary(m.getMat3())) {
+            failure = "4x4 Matrix initializing unitary map was not unitary";
+        }
+
+        if (failure) {
+            OPENVDB_THROW(ArithmeticError, failure);
+        }
+
+        mAffineMap = AffineMap(m);
+    }
+
+    /// Copy constructor.
+    UnitaryMap(const UnitaryMap& other):
+        MapBase(other),
+        mAffineMap(other.mAffineMap)
+    {}
+
+    /// @brief Construct from two unitary maps, combined by AffineMap's
+    /// two-map constructor (@a first, then @a second, in argument order).
+    UnitaryMap(const UnitaryMap& first, const UnitaryMap& second):
+        mAffineMap(*(first.getAffineMap()), *(second.getAffineMap()))
+    {}
+
+    ~UnitaryMap() {}
+
+    /// Factory: return a MapBase::Ptr to a default-constructed UnitaryMap.
+    static MapBase::Ptr create()
+    {
+        return MapBase::Ptr(new UnitaryMap());
+    }
+
+    /// Return a MapBase::Ptr to a copy of *this.
+    MapBase::Ptr copy() const
+    {
+        return MapBase::Ptr(new UnitaryMap(*this));
+    }
+
+    /// Return a MapBase::Ptr to a UnitaryMap built from the inverse matrix.
+    MapBase::Ptr inverseMap() const
+    {
+        const Mat4d inverted = mAffineMap.getMat4().inverse();
+        return MapBase::Ptr(new UnitaryMap(inverted));
+    }
+
+    /// Return @c true if this map type is present in the MapRegistry.
+    static bool isRegistered()
+    {
+        return MapRegistry::isRegistered(UnitaryMap::mapType());
+    }
+
+    /// Register this map type and its factory function with the MapRegistry.
+    static void registerMap()
+    {
+        MapRegistry::registerMap(UnitaryMap::mapType(), UnitaryMap::create);
+    }
+
+    /// Return @c UnitaryMap
+    Name type() const { return mapType(); }
+    /// Return @c UnitaryMap
+    static Name mapType() { return Name("UnitaryMap"); }
+
+    /// Return @c true (a UnitaryMap is always linear).
+    bool isLinear() const { return true; }
+
+    /// Return @c true (by convention).
+    bool hasUniformScale() const { return true; }
+
+    /// Compare against an arbitrary map by delegating to isEqualBase().
+    virtual bool isEqual(const MapBase& other) const
+    {
+        return isEqualBase(*this, other);
+    }
+
+    /// Two unitary maps are equal when their underlying affine maps are.
+    bool operator==(const UnitaryMap& other) const
+    {
+        return !(mAffineMap!=other.mAffineMap);
+    }
+
+    bool operator!=(const UnitaryMap& other) const
+    {
+        return !(*this == other);
+    }
+
+    /// Return the image of @c in under the map
+    Vec3d applyMap(const Vec3d& in) const
+    {
+        return mAffineMap.applyMap(in);
+    }
+    /// Return the pre-image of @c in under the map
+    Vec3d applyInverseMap(const Vec3d& in) const
+    {
+        return mAffineMap.applyInverseMap(in);
+    }
+
+    /// Return the Jacobian of the map applied to @a in. Ignores second argument
+    Vec3d applyJacobian(const Vec3d& in, const Vec3d&) const
+    {
+        return applyJacobian(in);
+    }
+    /// Return the Jacobian of the map applied to @a in.
+    Vec3d applyJacobian(const Vec3d& in) const
+    {
+        return mAffineMap.applyJacobian(in);
+    }
+
+    /// Return the Inverse Jacobian of the map applied to @a in
+    /// (i.e. inverse map without translation). Ignores second argument
+    Vec3d applyInverseJacobian(const Vec3d& in, const Vec3d&) const
+    {
+        return applyInverseJacobian(in);
+    }
+    /// Return the Inverse Jacobian of the map applied to @a in
+    /// (i.e. inverse map without translation)
+    Vec3d applyInverseJacobian(const Vec3d& in) const
+    {
+        return mAffineMap.applyInverseJacobian(in);
+    }
+
+    /// Return the Jacobian Transpose of the map applied to @a in.
+    /// This transforms range-space gradients to domain-space gradients.
+    /// Ignores second argument
+    Vec3d applyJT(const Vec3d& in, const Vec3d&) const
+    {
+        return applyJT(in);
+    }
+    /// Return the Jacobian Transpose of the map applied to @a in.
+    Vec3d applyJT(const Vec3d& in) const
+    {
+        // For a unitary map the transpose equals the inverse, so the
+        // inverse map is applied directly.
+        return applyInverseMap(in);
+    }
+
+    /// @brief Return the transpose of the inverse Jacobian of the map applied to @a in
+    /// @details Ignores second argument
+    Vec3d applyIJT(const Vec3d& in, const Vec3d& ) const
+    {
+        return applyIJT(in);
+    }
+    /// Return the transpose of the inverse Jacobian of the map applied to @c in
+    Vec3d applyIJT(const Vec3d& in) const
+    {
+        return mAffineMap.applyIJT(in);
+    }
+
+    /// Forward to AffineMap::applyIJC()
+    Mat3d applyIJC(const Mat3d& in) const
+    {
+        return mAffineMap.applyIJC(in);
+    }
+    /// Same as applyIJC(in); ignores the second and third arguments
+    Mat3d applyIJC(const Mat3d& in, const Vec3d&, const Vec3d& ) const
+    {
+        return applyIJC(in);
+    }
+
+    /// Return the determinant of the Jacobian, ignores argument
+    double determinant(const Vec3d& ) const
+    {
+        return determinant();
+    }
+    /// Return the determinant of the Jacobian
+    double determinant() const
+    {
+        return mAffineMap.determinant();
+    }
+
+    /// @brief Returns the lengths of the images
+    /// of the segments
+    /// \f$(0,0,0)-(1,0,0)\f$, \f$(0,0,0)-(0,1,0)\f$,
+    /// \f$(0,0,0)-(0,0,1)\f$
+    Vec3d voxelSize() const
+    {
+        return mAffineMap.voxelSize();
+    }
+    /// Same as voxelSize(); ignores argument
+    Vec3d voxelSize(const Vec3d&) const
+    {
+        return voxelSize();
+    }
+
+    /// read serialization (delegates to the underlying AffineMap)
+    void read(std::istream& is) { mAffineMap.read(is); }
+
+    /// write serialization (delegates to the underlying AffineMap)
+    void write(std::ostream& os) const { mAffineMap.write(os); }
+
+    /// string serialization, useful for debugging
+    std::string str() const
+    {
+        std::ostringstream out;
+        out << mAffineMap.str();
+        return out.str();
+    }
+
+    /// Return AffineMap::Ptr to an AffineMap equivalent to *this
+    AffineMap::Ptr getAffineMap() const
+    {
+        return AffineMap::Ptr(new AffineMap(mAffineMap));
+    }
+
+    //@{
+    /// @brief  Return a MapBase::Ptr to a new map that is the result
+    /// of prepending the appropriate operation.
+    MapBase::Ptr preRotate(double radians, Axis axis) const
+    {
+        // A rotation combined with a unitary map stays a UnitaryMap.
+        const UnitaryMap rotation(axis, radians);
+        return MapBase::Ptr(new UnitaryMap(rotation, *this));
+    }
+    MapBase::Ptr preTranslate(const Vec3d& t) const
+    {
+        AffineMap::Ptr general = getAffineMap();
+        general->accumPreTranslation(t);
+        return simplify(general);
+    }
+    MapBase::Ptr preScale(const Vec3d& v) const
+    {
+        AffineMap::Ptr general = getAffineMap();
+        general->accumPreScale(v);
+        return simplify(general);
+    }
+    MapBase::Ptr preShear(double shear, Axis axis0, Axis axis1) const
+    {
+        AffineMap::Ptr general = getAffineMap();
+        general->accumPreShear(axis0, axis1, shear);
+        return simplify(general);
+    }
+    //@}
+
+
+    //@{
+    /// @brief  Return a MapBase::Ptr to a new map that is the result
+    /// of postfixing the appropriate operation.
+    MapBase::Ptr postRotate(double radians, Axis axis) const
+    {
+        // A unitary map combined with a rotation stays a UnitaryMap.
+        const UnitaryMap rotation(axis, radians);
+        return MapBase::Ptr(new UnitaryMap(*this, rotation));
+    }
+    MapBase::Ptr postTranslate(const Vec3d& t) const
+    {
+        AffineMap::Ptr general = getAffineMap();
+        general->accumPostTranslation(t);
+        return simplify(general);
+    }
+    MapBase::Ptr postScale(const Vec3d& v) const
+    {
+        AffineMap::Ptr general = getAffineMap();
+        general->accumPostScale(v);
+        return simplify(general);
+    }
+    MapBase::Ptr postShear(double shear, Axis axis0, Axis axis1) const
+    {
+        AffineMap::Ptr general = getAffineMap();
+        general->accumPostShear(axis0, axis1, shear);
+        return simplify(general);
+    }
+    //@}
+
+private:
+    // The unitary transform is stored as, and evaluated through, an AffineMap.
+    AffineMap  mAffineMap;
+}; // class UnitaryMap
+
+
+////////////////////////////////////////
+
+
+/// @brief  This map is composed of three steps.
+/// First it will take a box of size (Lx X  Ly X Lz) defined by a member data bounding box
+/// and map it into a frustum with near plane (1 X Ly/Lx) and prescribed depth
+/// Then this frustum is transformed by an internal second map: most often a uniform scale,
+/// but other effects can be achieved by accumulating translation, shear and rotation: these
+/// are all applied to the second map
+class OPENVDB_API NonlinearFrustumMap: public MapBase
+{
+public:
+    typedef boost::shared_ptr<NonlinearFrustumMap>       Ptr;
+    typedef boost::shared_ptr<const NonlinearFrustumMap> ConstPtr;
+
+    NonlinearFrustumMap():
+        MapBase(), mBBox(Vec3d(0), Vec3d(1)), mTaper(1), mDepth(1)
+    {
+        // Defaults: box from (0,0,0) to (1,1,1), taper 1, depth 1.
+        // init() (defined elsewhere in this class) completes the setup.
+        init();
+    }
+
+    /// @brief Constructor that takes an index-space bounding box
+    /// to be mapped into a frustum with a given @a depth and @a taper
+    /// (defined as ratio of nearplane/farplane).
+    NonlinearFrustumMap(const BBoxd& bb, double taper, double depth):
+        MapBase(),
+        mBBox(bb),
+        mTaper(taper),
+        mDepth(depth)
+    {
+        // The second map is left default-constructed in this overload.
+        init();
+    }
+
+    /// @brief Constructor that takes an index-space bounding box
+    /// to be mapped into a frustum with a given @a depth and @a taper
+    /// (defined as ratio of nearplane/farplane).
+    /// @details This frustum is further modified by the @a secondMap,
+    /// intended to be a simple translation and rotation and uniform scale
+   NonlinearFrustumMap(const BBoxd& bb, double taper, double depth,
+        const MapBase::Ptr& secondMap):
+        mBBox(bb),
+        mTaper(taper),
+        mDepth(depth)
+    {
+        // The second map is stored as an AffineMap, so it has to be linear.
+        const bool secondIsLinear = secondMap->isLinear();
+        if (!secondIsLinear) {
+            OPENVDB_THROW(ArithmeticError,
+                "The second map in the Frustum transfrom must be linear");
+        }
+        AffineMap::Ptr secondAsAffine = secondMap->getAffineMap();
+        mSecondMap = *secondAsAffine;
+        init();
+    }
+
+    NonlinearFrustumMap(const NonlinearFrustumMap& other):
+        MapBase(),
+        mBBox(other.mBBox), mTaper(other.mTaper), mDepth(other.mDepth),
+        mSecondMap(other.mSecondMap),
+        mHasSimpleAffine(other.mHasSimpleAffine)
+    {
+        // Copy the defining parameters, then call init() just as the other
+        // constructors do.
+        init();
+    }
+
+    /// @brief Constructor from a camera frustum
+    ///
+    /// @param position the tip of the frustum (i.e., the camera's position).
+    /// @param direction a vector pointing from @a position toward the near plane.
+    /// @param up a non-unit vector describing the direction and extent of
+    ///     the frustum's intersection on the near plane.  The near-plane height
+    ///     is twice the length of @a up, and @a up must be orthogonal to @a direction.
+    /// @param aspect the aspect ratio of the frustum intersection with near plane
+    ///     defined as width / height
+    /// @param z_near,depth the distance from @a position along @a direction to the
+    ///     near and far planes of the frustum.
+    /// @param x_count the number of voxels across the width
+    ///     of the frustum's near face
+    /// @param z_count the number of voxels, aligned with @a direction,
+    ///     between the near and far planes
+    NonlinearFrustumMap(const Vec3d& position,
+                        const Vec3d& direction,
+                        const Vec3d& up,
+                        double aspect /* width / height */,
+                        double z_near, double depth,
+                        Coord::ValueType x_count, Coord::ValueType z_count) {
+
+        // Validate the inputs: depth, the length of up, and aspect must all be
+        // strictly positive, and up must be perpendicular to direction.
+        // The negated comparisons also reject NaN.
+        if (!(depth > 0)) {
+            OPENVDB_THROW(ArithmeticError,
+                "The frustum depth must be non-zero and positive");
+        }
+        if (!(up.length() > 0)) {
+            OPENVDB_THROW(ArithmeticError,
+                "The frustum height must be non-zero and positive");
+        }
+        if (!(aspect > 0)) {
+            OPENVDB_THROW(ArithmeticError,
+                "The frustum aspect ratio  must be non-zero and positive");
+        }
+        if (!(isApproxEqual(up.dot(direction), 0.))) {
+            OPENVDB_THROW(ArithmeticError,
+                "The frustum up orientation must be perpendicular to into-frustum direction");
+        }
+
+        // up spans half of the near plane's height; the width follows from the aspect ratio.
+        double near_plane_height = 2 * up.length();
+        double near_plane_width = aspect * near_plane_height;
+
+        // Derive the voxel count in y from the x count and the aspect ratio.
+        Coord::ValueType y_count = static_cast<int>(Round(x_count / aspect));
+
+        // Index-space box: from the origin to (x_count, y_count, z_count).
+        mBBox = BBoxd(Vec3d(0,0,0), Vec3d(x_count, y_count, z_count));
+        mDepth = depth / near_plane_width;  // depth non-dimensionalized on width
+        // NOTE(review): z_near is not validated; z_near == 0 divides by zero here.
+        double gamma = near_plane_width / z_near;
+        mTaper = 1./(mDepth*gamma + 1.);
+
+        Vec3d direction_unit = direction;
+        direction_unit.normalize();
+
+        // r1 rotates the +z axis onto the viewing direction.
+        Mat4d r1(Mat4d::identity());
+        r1.setToRotation(/*from*/Vec3d(0,0,1), /*to */direction_unit);
+        // r2 rotates the +y axis onto up, with up first transformed by the inverse of r1.
+        Mat4d r2(Mat4d::identity());
+        Vec3d temp = r1.inverse().transform(up);
+        r2.setToRotation(/*from*/Vec3d(0,1,0), /*to*/temp );
+        // Uniform scale by the near-plane width.
+        Mat4d scale = math::scale<Mat4d>(
+            Vec3d(near_plane_width, near_plane_width, near_plane_width));
+
+        // move the near plane to origin, rotate to align with axis, and scale down
+        // T_inv * R1_inv * R2_inv * scale_inv
+        // NOTE(review): the two lines above appear to describe the inverse of the
+        // matrix built below (scale * r2 * r1, then a translation) -- confirm.
+        Mat4d mat = scale * r2 * r1;
+        // The translation is the point at distance z_near from position along direction.
+        mat.setTranslation(position + z_near*direction_unit);
+
+        mSecondMap = AffineMap(mat);
+
+        init();
+    }
+
+    ~NonlinearFrustumMap()
+    {
+        // Empty: the destructor body performs no explicit cleanup.
+    }
+    /// Return a MapBase::Ptr to a new NonlinearFrustumMap
+    static MapBase::Ptr create()
+    {
+        // Factory function; this is what registerMap() hands to the MapRegistry.
+        MapBase::Ptr fresh(new NonlinearFrustumMap());
+        return fresh;
+    }
+    /// Return a MapBase::Ptr to a deep copy of this map
+    MapBase::Ptr copy() const
+    {
+        // Copy-construct a new map from *this.
+        MapBase::Ptr duplicate(new NonlinearFrustumMap(*this));
+        return duplicate;
+    }
+
+    /// @brief Not implemented, since there is currently no map type that can
+    /// represent the inverse of a frustum
+    /// @throw NotImplementedError
+    MapBase::Ptr inverseMap() const
+    {
+        // Always throws; no value is ever returned.
+        OPENVDB_THROW(NotImplementedError, "inverseMap() is not implemented for NonlinearFrustumMap");
+    }
+    static bool isRegistered()
+    {
+        // Ask the MapRegistry whether this map's type name is known.
+        const Name typeName = NonlinearFrustumMap::mapType();
+        return MapRegistry::isRegistered(typeName);
+    }
+
+    static void registerMap()
+    {
+        // Associate this map's type name with its factory function.
+        MapRegistry::registerMap(NonlinearFrustumMap::mapType(), NonlinearFrustumMap::create);
+    }
+    /// Return @c NonlinearFrustumMap
+    Name type() const { return mapType(); }
+    /// Return @c NonlinearFrustumMap
+    static Name mapType() { return Name("NonlinearFrustumMap"); }
+
+    /// Return @c false (a NonlinearFrustumMap is never linear).
+    bool isLinear() const { return false; }
+
+    /// Return @c false (by convention false)
+    bool hasUniformScale() const { return false; }
+
+    /// Return @c true if the map is equivalent to an identity
+    bool isIdentity() const
+    {
+        // The frustum can only be consistent with a linear map if the taper value is 1
+        if (!isApproxEqual(mTaper, double(1)) ) return false;
+
+        // There are various ways an identity can be decomposed between the two parts of
+        // the map, so test the composite instead.  With unit taper the composite is
+        // affine, and an affine map is the identity iff it fixes four affinely
+        // independent points; the origin is needed, since fixing e1, e2, e3 alone only
+        // pins the plane x+y+z=1.
+        const Vec3d o(0,0,0), e1(1,0,0), e2(0,1,0), e3(0,0,1);
+        if (!applyMap(o).eq(o)) return false;
+        if (!applyMap(e1).eq(e1)) return false;
+        if (!applyMap(e2).eq(e2)) return false;
+        if (!applyMap(e3).eq(e3)) return false;
+
+        return true;
+    }
+
+    virtual bool isEqual(const MapBase& other) const { return isEqualBase(*this, other); }
+
+    bool operator==(const NonlinearFrustumMap& other) const
+    {
+        // The frustum parameters must agree first.
+        if (mBBox!=other.mBBox) return false;
+        if (!isApproxEqual(mTaper, other.mTaper)) return false;
+        if (!isApproxEqual(mDepth, other.mDepth)) return false;
+
+        // Two affine transforms are equivalent iff they have the same translation
+        // and the same effect on an orthogonal spanning basis.  Probe the origin
+        // (translation) first, then each of the three unit axis vectors in turn.
+        const Vec3d probes[4] = {
+            Vec3d(0,0,0), Vec3d(1,0,0), Vec3d(0,1,0), Vec3d(0,0,1)
+        };
+        for (int n = 0; n < 4; ++n) {
+            const Vec3d lhs = mSecondMap.applyMap(probes[n]);
+            const Vec3d rhs = other.mSecondMap.applyMap(probes[n]);
+            if (!lhs.eq(rhs)) return false;
+        }
+
+        return true;
+    }
+
+    bool operator!=(const NonlinearFrustumMap& other) const { return !(*this == other); }
+
+    /// Return the image of @c in under the map
+    Vec3d applyMap(const Vec3d& in) const
+    {
+        return mSecondMap.applyMap(applyFrustumMap(in));
+    }
+
+    /// Return the pre-image of @c in under the map
+    Vec3d applyInverseMap(const Vec3d& in) const
+    {
+        return applyFrustumInverseMap(mSecondMap.applyInverseMap(in));
+    }
+    /// Return the Jacobian of the linear second map applied to @c in
+    Vec3d applyJacobian(const Vec3d& in) const { return mSecondMap.applyJacobian(in); }
+    /// Return the Jacobian defined at @c isloc applied to @c in
+    Vec3d applyJacobian(const Vec3d& in, const Vec3d& isloc) const
+    {
+        // Offset of isloc from the centre of the bbox's min-z face,
+        // measured in index space.
+        Vec3d rel = isloc - mBBox.min();
+        rel.x() -= mXo;
+        rel.y() -= mYo;
+
+        // z rescaled by depth / K count
+        const double zScaled = rel.z() * mDepthOnLz;
+
+        // xy scale at this depth, and its derivative w.r.t. index-space z
+        const double xyScale = (mGamma * zScaled + 1.) / mLx;
+        const double dScaleDz = mGamma * mDepthOnLz / mLx;
+
+        // Jacobian of the frustum part applied to in; the affine part follows.
+        const Vec3d frustumJ(xyScale * in.x() + dScaleDz * rel.x() * in.z(),
+                             xyScale * in.y() + dScaleDz * rel.y() * in.z(),
+                             mDepthOnLz * in.z());
+        return mSecondMap.applyJacobian(frustumJ);
+    }
+
+
+    /// Return the inverse Jacobian of the linear second map applied to @a in (i.e. the inverse map without translation)
+    Vec3d applyInverseJacobian(const Vec3d& in) const { return mSecondMap.applyInverseJacobian(in); }
+    /// Return the Inverse Jacobian defined at @c isloc of the map applied to @a in.
+    Vec3d applyInverseJacobian(const Vec3d& in, const Vec3d& isloc) const {
+        // Inverts, at isloc, the linearization computed by applyJacobian(in, isloc).
+        // Shift isloc so that the centre of the bbox's min-z face
+        // is the origin in index space.
+        Vec3d centered(isloc);
+        centered = centered - mBBox.min();
+        centered.x() -= mXo;
+        centered.y() -= mYo;
+
+        // scale the z-direction on depth / K count
+        const double zprime = centered.z()*mDepthOnLz;
+
+        const double scale = (mGamma * zprime + 1.) / mLx;
+        const double scale2 = mGamma * mDepthOnLz / mLx;
+
+        // undo the affine second map first, then the frustum part
+        Vec3d out = mSecondMap.applyInverseJacobian(in);
+        // x and y are solved before z is overwritten: both read the undivided out.z()
+        out.x() = (out.x() - scale2 * centered.x() * out.z() / mDepthOnLz) / scale;
+        out.y() = (out.y() - scale2 * centered.y() * out.z() / mDepthOnLz) / scale;
+        out.z() = out.z() / mDepthOnLz;
+
+        return out;
+    }
+
+
+
+    /// Return the Jacobian Transpose of the map applied to vector @c in at @c isloc.
+    /// This transforms range-space gradients to domain-space gradients.
+    ///
+    Vec3d applyJT(const Vec3d& in, const Vec3d& isloc) const {
+        // Offset of isloc from the centre of the bbox's min-z face (index space).
+        Vec3d rel = isloc - mBBox.min();
+        rel.x() -= mXo;
+        rel.y() -= mYo;
+
+        // z rescaled by depth / K count
+        const double zScaled = rel.z() * mDepthOnLz;
+
+        // xy scale at this depth, and its derivative w.r.t. index-space z
+        const double xyScale = (mGamma * zScaled + 1.) / mLx;
+        const double dScaleDz = mGamma * mDepthOnLz / mLx;
+
+        // Transpose of the affine part first, then transpose of the frustum part.
+        const Vec3d affineJT = mSecondMap.applyJT(in);
+        const double outZ = dScaleDz * rel.x() * affineJT.x() +
+                            dScaleDz * rel.y() * affineJT.y() +
+                            mDepthOnLz * affineJT.z();
+
+        return Vec3d(xyScale * affineJT.x(), xyScale * affineJT.y(), outZ);
+    }
+    /// Return the Jacobian Transpose of the second map applied to @c in.
+    Vec3d applyJT(const Vec3d& in) const {
+        return mSecondMap.applyJT(in);
+    }
+
+    /// Return the transpose of the inverse Jacobian of the linear second map applied to @c in
+    Vec3d applyIJT(const Vec3d& in) const { return mSecondMap.applyIJT(in); }
+
+    // the Jacobian of the nonlinear part of the transform is a sparse matrix
+    // Jacobian^(-T) =
+    //
+    //    (Lx)(  1/s               0              0 )
+    //        (  0                1/s             0 )
+    //        (  -(x-xo)g/(sLx)   -(y-yo)g/(sLx)  Lz/(Depth Lx)   )
+    /// Return the transpose of the inverse Jacobian, evaluated at @c ijk, applied to @c d1_is.
+    /// @c ijk is the location in the pre-image space (e.g. index space)
+    Vec3d applyIJT(const Vec3d& d1_is, const Vec3d& ijk) const
+    {
+        const Vec3d loc = applyFrustumMap(ijk);
+        const double s = mGamma * loc.z() + 1.;
+
+        // verify that we aren't at the singularity (s == 0, where the taper scale vanishes)
+        if (isApproxEqual(s, 0.)) {
+            OPENVDB_THROW(ArithmeticError, "Tried to evaluate the frustum transform"
+                " at the singular focal point (e.g. camera)");
+        }
+        // z in the formulas below is loc.z(), i.e. already scaled by depth / K count
+        const double sinv = 1.0/s;        // 1/(z*gamma + 1)
+        const double pt0 = mLx * sinv;    // Lx / (z*gamma +1)
+        const double pt1 = mGamma * pt0;  // gamma * Lx / ( z*gamma +1)
+        const double pt2 = pt1 * sinv;    // gamma * Lx / ( z*gamma +1)**2
+
+        const Mat3d& jacinv = mSecondMap.getConstJacobianInv();
+
+        // compute \frac{\partial E_i}{\partial x_j}
+        Mat3d gradE(Mat3d::zero());
+        for (int j = 0; j < 3; ++j ) {
+            gradE(0,j) =  pt0 * jacinv(0,j) -  pt2 * loc.x()*jacinv(2,j);
+            gradE(1,j) =  pt0 * jacinv(1,j) -  pt2 * loc.y()*jacinv(2,j);
+            gradE(2,j) = (1./mDepthOnLz) * jacinv(2,j);
+        }
+        // contract the index-space gradient with gradE: result(i) = sum_k d1_is(k) * gradE(k,i)
+        Vec3d result;
+        for (int i = 0; i < 3; ++i) {
+            result(i) = d1_is(0) * gradE(0,i) + d1_is(1) * gradE(1,i) + d1_is(2) * gradE(2,i);
+        }
+
+        return result;
+
+    }
+
+    /// Return the Jacobian Curvature for the linear second map
+    Mat3d applyIJC(const Mat3d& in) const { return mSecondMap.applyIJC(in); }
+    /// Return the Jacobian Curvature: all the second derivatives in range space
+    /// @param d2_is second derivative matrix computed in index space
+    /// @param d1_is gradient computed in index space
+    /// @param ijk  the index space location where the result is computed
+    Mat3d applyIJC(const Mat3d& d2_is, const Vec3d& d1_is, const Vec3d& ijk) const
+    {
+        const Vec3d loc = applyFrustumMap(ijk);
+        // loc is ijk after the nonlinear frustum step only; the second map is not applied
+        const double s =  mGamma * loc.z()  + 1.;
+
+        // verify that we aren't at the singularity
+        if (isApproxEqual(s, 0.)) {
+            OPENVDB_THROW(ArithmeticError, "Tried to evaluate the frustum transform"
+                " at the singular focal point (e.g. camera)");
+        }
+
+        // precompute (z below is loc.z())
+        const double sinv = 1.0/s;     // 1/(z*gamma + 1)
+        const double pt0 = mLx * sinv;   // Lx / (z*gamma +1)
+        const double pt1 = mGamma * pt0;   // gamma * Lx / ( z*gamma +1)
+        const double pt2 = pt1 * sinv;   // gamma * Lx / ( z*gamma +1)**2
+        const double pt3 = pt2 * sinv;   // gamma * Lx / ( z*gamma +1)**3
+
+        const Mat3d& jacinv = mSecondMap.getConstJacobianInv();
+
+        // compute \frac{\partial^2 E_i}{\partial x_j \partial x_k}
+        // (only E_0 and E_1 need a matrix: gradE(2,j) below is constant, so matE2 = 0)
+        Mat3d matE0(Mat3d::zero());
+        Mat3d matE1(Mat3d::zero()); // matE2 = 0
+        for(int j = 0; j < 3; j++) {
+            for (int k = 0; k < 3; k++) {
+
+                const double pt4 =  2. * jacinv(2,j) * jacinv(2,k) * pt3;
+
+                matE0(j,k) = -(jacinv(0,j) * jacinv(2,k) + jacinv(2,j) * jacinv(0,k)) * pt2 +
+                    pt4 * loc.x();
+
+                matE1(j,k) = -(jacinv(1,j) * jacinv(2,k) + jacinv(2,j) * jacinv(1,k)) * pt2 +
+                    pt4 * loc.y();
+            }
+        }
+
+        // compute \frac{\partial E_i}{\partial x_j}
+        Mat3d gradE(Mat3d::zero());
+        for (int j = 0; j < 3; ++j ) {
+            gradE(0,j) =  pt0 * jacinv(0,j) -  pt2 * loc.x()*jacinv(2,j);
+            gradE(1,j) =  pt0 * jacinv(1,j) -  pt2 * loc.y()*jacinv(2,j);
+            gradE(2,j) = (1./mDepthOnLz) * jacinv(2,j);
+        }
+
+        Mat3d result(Mat3d::zero());
+        // compute \frac{\partial E_j}{\partial x_m} \frac{\partial E_i}{\partial x_n}
+        // \frac{\partial^2 input}{\partial E_i \partial E_j}
+        for (int m = 0; m < 3; ++m ) {
+            for ( int n = 0; n < 3; ++n) {
+                for (int i = 0; i < 3; ++i ) {
+                    for (int j = 0; j < 3; ++j) {
+                        result(m, n) += gradE(j, m) * gradE(i, n) * d2_is(i, j);
+                    }
+                }
+            }
+        }
+        // add the first-derivative term: second derivatives of E_i weighted by d1_is(i)
+         for (int m = 0; m < 3; ++m ) {
+            for ( int n = 0; n < 3; ++n) {
+                result(m, n) +=
+                    matE0(m, n) * d1_is(0) + matE1(m, n) * d1_is(1);// + matE2(m, n) * d1_is(2);
+            }
+        }
+
+         return result;
+    }
+
+    /// Return the determinant of the Jacobian of linear second map
+    double determinant() const {return mSecondMap.determinant();} // no implementation
+
+    /// Return the determinant of the Jacobian evaluated at @c loc
+    /// @c loc is a location in the pre-image space (e.g., index space)
+    double determinant(const Vec3d& loc) const
+    {
+        double s = mGamma * loc.z() + 1.0; // NOTE(review): raw loc.z(), not offset/scaled as in applyFrustumMap() -- confirm
+        double frustum_determinant = s * s * mDepthOnLzLxLx; // frustum part: s^2 * depth / (Lz * Lx^2)
+        return mSecondMap.determinant() * frustum_determinant; // times the affine part's determinant
+    }
+
+    /// Return the size of a voxel at the center of the near plane
+    Vec3d voxelSize() const
+    {
+        // Sample at the midpoint of the bbox's min-z face.
+        const Vec3d& lo = mBBox.min();
+        const Vec3d& hi = mBBox.max();
+        const double midX = 0.5*(lo.x() + hi.x());
+        const double midY = 0.5*(lo.y() + hi.y());
+        return voxelSize(Vec3d(midX, midY, lo.z()));
+    }
+
+    /// @brief Returns the lengths of the images of the three segments
+    /// from @a loc to @a loc + (1,0,0), from @a loc to @a loc + (0,1,0)
+    /// and from @a loc to @a loc + (0,0,1)
+    /// @param loc  a location in the pre-image space (e.g., index space)
+    Vec3d voxelSize(const Vec3d& loc) const
+    {
+        // Image of loc, then the length of each mapped unit step away from it.
+        const Vec3d base = applyMap(loc);
+        const double dx = (applyMap(loc + Vec3d(1,0,0)) - base).length();
+        const double dy = (applyMap(loc + Vec3d(0,1,0)) - base).length();
+        return Vec3d(dx, dy, (applyMap(loc + Vec3d(0,0,1)) - base).length());
+    }
+
+    AffineMap::Ptr getAffineMap() const { return mSecondMap.getAffineMap(); }
+
+    /// set the taper value, the ratio of nearplane width / far plane width
+    void setTaper(double t) { mTaper = t; init();}
+    /// Return the taper value.
+    double getTaper() const { return mTaper; }
+    /// Set the frustum depth: distance between near and far plane = frustum depth * frustum x-width
+    void setDepth(double d) { mDepth = d; init();}
+    /// Return the unscaled frustum depth
+    double getDepth() const { return mDepth; }
+    /// Return gamma, a non-dimensional number: near-plane x-width / camera-to-near-plane distance
+    double getGamma() const { return mGamma; }
+
+    /// Return the bounding box that defines the frustum in pre-image space
+    const BBoxd& getBBox() const { return mBBox; }
+
+    /// Return a const reference to the second (affine) map
+    const AffineMap& secondMap() const { return mSecondMap; }
+    /// Return @c true if the bounding box in index space that defines the region that
+    /// is mapped into the frustum is non-empty, otherwise @c false
+    bool isValid() const { return !mBBox.empty();}
+
+    /// Return @c true if the second map is a uniform scale, Rotation and translation
+    bool hasSimpleAffine() const { return mHasSimpleAffine; }
+
+    /// read serialization
+    void read(std::istream& is)
+    {
+        // Backward compatibility: earlier file versions stored the bbox as a CoordBBox.
+        if (io::getFormatVersion(is) < OPENVDB_FILE_VERSION_FLOAT_FRUSTUM_BBOX ) {
+            CoordBBox bb;
+            bb.read(is);
+            mBBox = BBoxd(bb.min().asVec3d(), bb.max().asVec3d());
+        } else {
+            mBBox.read(is);
+        }
+        // taper and depth are stored as raw doubles, in that order
+        is.read(reinterpret_cast<char*>(&mTaper), sizeof(double));
+        is.read(reinterpret_cast<char*>(&mDepth), sizeof(double));
+
+        // Read the second map's type.
+        Name type = readString(is);
+
+        // Check if the map has been registered.
+        if(!MapRegistry::isRegistered(type)) {
+            OPENVDB_THROW(KeyError, "Map " << type << " is not registered");
+        }
+
+        // Create the second map of the type and then read it in.
+        MapBase::Ptr proxy =  math::MapRegistry::createMap(type);
+        proxy->read(is);
+        mSecondMap = *(proxy->getAffineMap()); // only the affine form of the stored map is kept
+        init(); // refresh the members derived from bbox, taper, depth and second map
+    }
+
+    /// write serialization
+    void write(std::ostream& os) const
+    {
+        mBBox.write(os);
+        os.write(reinterpret_cast<const char*>(&mTaper), sizeof(double));
+        os.write(reinterpret_cast<const char*>(&mDepth), sizeof(double));
+        // the type name precedes the second map's data; read() uses it for the registry lookup
+        writeString(os, mSecondMap.type());
+        mSecondMap.write(os);
+    }
+
+    /// string serialization, useful for debugging
+    std::string str() const
+    {
+        std::ostringstream buffer;
+        buffer << " - taper: " << mTaper << std::endl;
+        buffer << " - depth: " << mDepth << std::endl;
+        buffer << " SecondMap: "<< mSecondMap.type() << std::endl;
+        buffer << mSecondMap.str() << std::endl;
+        return buffer.str();
+    }
+
+    //@{
+    /// @brief Return a MapBase::Ptr to a new map that is the result
+    /// of prepending the appropriate operation to the linear part of this map
+    MapBase::Ptr preRotate(double radians, Axis axis = X_AXIS) const
+    {
+        return MapBase::Ptr(
+            new NonlinearFrustumMap(mBBox, mTaper, mDepth, mSecondMap.preRotate(radians, axis)));
+    }
+    MapBase::Ptr preTranslate(const Vec3d& t) const
+    {
+        return MapBase::Ptr(
+            new NonlinearFrustumMap(mBBox, mTaper, mDepth, mSecondMap.preTranslate(t)));
+    }
+    MapBase::Ptr preScale(const Vec3d& s) const
+    {
+        return MapBase::Ptr(
+            new NonlinearFrustumMap(mBBox, mTaper, mDepth, mSecondMap.preScale(s)));
+    }
+    MapBase::Ptr preShear(double shear, Axis axis0, Axis axis1) const
+    {
+        return MapBase::Ptr(new NonlinearFrustumMap(
+            mBBox, mTaper, mDepth, mSecondMap.preShear(shear, axis0, axis1)));
+    }
+    //@}
+
+    //@{
+    /// @brief Return a MapBase::Ptr to a new map that is the result
+    /// of postfixing the appropriate operation to the linear part of this map.
+    MapBase::Ptr postRotate(double radians, Axis axis = X_AXIS) const
+    {
+        return MapBase::Ptr(
+            new NonlinearFrustumMap(mBBox, mTaper, mDepth, mSecondMap.postRotate(radians, axis)));
+    }
+    MapBase::Ptr postTranslate(const Vec3d& t) const
+    {
+        return MapBase::Ptr(
+            new NonlinearFrustumMap(mBBox, mTaper, mDepth, mSecondMap.postTranslate(t)));
+    }
+    MapBase::Ptr postScale(const Vec3d& s) const
+    {
+        return MapBase::Ptr(
+            new NonlinearFrustumMap(mBBox, mTaper, mDepth, mSecondMap.postScale(s)));
+    }
+    MapBase::Ptr postShear(double shear, Axis axis0, Axis axis1) const
+    {
+        return MapBase::Ptr(new NonlinearFrustumMap(
+            mBBox, mTaper, mDepth, mSecondMap.postShear(shear, axis0, axis1)));
+    }
+    //@}
+
+private:
+    void init()
+    {
+        // Recompute every member derived from mBBox, mTaper, mDepth and mSecondMap.
+        mLx = mBBox.extents().x();
+        mLy = mBBox.extents().y();
+        mLz = mBBox.extents().z();
+        // a zero extent would make the divisions by mLx and mLz below singular
+        if (isApproxEqual(mLx,0.) || isApproxEqual(mLy,0.) || isApproxEqual(mLz,0.) ) {
+            OPENVDB_THROW(ArithmeticError, "The index space bounding box"
+                " must have at least two index points in each direction.");
+        }
+        // half extents in x and y: offset from the bbox min corner to the face centre
+        mXo = 0.5* mLx;
+        mYo = 0.5* mLy;
+
+        // gamma from taper and (non-dimensional) depth, i.e. taper = 1/(depth*gamma + 1)
+        mGamma = (1./mTaper - 1) / mDepth;
+        // cached factors reused by the map, its Jacobians and determinant(loc)
+        mDepthOnLz = mDepth/mLz;
+        mDepthOnLzLxLx = mDepthOnLz/(mLx * mLx);
+
+        // test for shear and non-uniform scale; any failed test clears the flag and returns
+        mHasSimpleAffine = true;
+        Vec3d tmp = mSecondMap.voxelSize();
+
+        // false if there is non-uniform scale
+        if (!isApproxEqual(tmp(0), tmp(1))) { mHasSimpleAffine = false; return; }
+        if (!isApproxEqual(tmp(0), tmp(2))) { mHasSimpleAffine = false; return; }
+
+        Vec3d trans = mSecondMap.applyMap(Vec3d(0,0,0));
+        // look for shear: images of the unit axes with the translation removed
+        Vec3d tmp1 = mSecondMap.applyMap(Vec3d(1,0,0)) - trans;
+        Vec3d tmp2 = mSecondMap.applyMap(Vec3d(0,1,0)) - trans;
+        Vec3d tmp3 = mSecondMap.applyMap(Vec3d(0,0,1)) - trans;
+
+        // false if there is shear (the axis images are not mutually orthogonal)
+        if (!isApproxEqual(tmp1.dot(tmp2), 0., 1.e-7)) { mHasSimpleAffine  = false; return; }
+        if (!isApproxEqual(tmp2.dot(tmp3), 0., 1.e-7)) { mHasSimpleAffine  = false; return; }
+        if (!isApproxEqual(tmp3.dot(tmp1), 0., 1.e-7)) { mHasSimpleAffine  = false; return; }
+    }
+
+    Vec3d applyFrustumMap(const Vec3d& in) const
+    {
+        // Nonlinear (tapered) part of the map, applied to an index-space point:
+        //   z' = (z - zmin) * depth / Lz
+        //   x' = (x - xmin - Lx/2) * (gamma * z' + 1) / Lx
+        //   y' = (y - ymin - Ly/2) * (gamma * z' + 1) / Lx
+        // (note that both x' and y' are normalized by Lx, not Ly)
+
+        // offset from the centre of the bbox's min-z face
+        const Vec3d rel = in - mBBox.min();
+        const double cx = rel.x() - mXo;
+        const double cy = rel.y() - mYo;
+
+        // z rescaled by depth / K count
+        const double z = rel.z() * mDepthOnLz;
+
+        // x and y share one scale: 1 / (I count), widened linearly in z by the taper
+        const double xyScale = (mGamma * z + 1.) / mLx;
+
+        return Vec3d(cx * xyScale, cy * xyScale, z);
+    }
+
+    Vec3d applyFrustumInverseMap(const Vec3d& in) const
+    {
+        // invert taper and resize:  invScale = Lx / (gamma * z + 1), z still in scaled units
+        Vec3d out(in);
+        double invScale = mLx / (mGamma * out.z() + 1.);
+        out.x() *= invScale;
+        out.y() *= invScale;
+        // restore the offset from the bbox min corner to the face centre
+        out.x() += mXo;
+        out.y() += mYo;
+        // undo the depth / K count scaling of z
+        out.z() /= mDepthOnLz;
+
+        // move back to the bbox's min corner
+        out = out +  mBBox.min();
+        return out;
+    }
+
+    // bounding box in index space used in Frustum transforms.
+    BBoxd   mBBox;
+
+    // taper value used in constructing Frustums.
+    double      mTaper;
+    double      mDepth;
+
+    // defines the second map
+    AffineMap mSecondMap;
+
+    // these are derived from the above.
+    double mLx, mLy, mLz;
+    double mXo, mYo, mGamma, mDepthOnLz, mDepthOnLzLxLx;
+
+    // true: if the mSecondMap is linear and has no shear, and has no non-uniform scale
+    bool mHasSimpleAffine;
+}; // class NonlinearFrustumMap
+
+
+////////////////////////////////////////
+
+
+///  @brief Creates the composition of two maps, each of which could be a composition.
+///  If every component of the composition is classified as linear, an
+///  AffineMap of the whole composition is stored for acceleration.
+template<typename FirstMapType, typename SecondMapType>
+class CompoundMap
+{
+public:
+    typedef CompoundMap<FirstMapType, SecondMapType>    MyType;
+
+    typedef boost::shared_ptr<MyType>       Ptr;
+    typedef boost::shared_ptr<const MyType> ConstPtr;
+
+    /// Default-construct both component maps and cache their composition if linear.
+    CompoundMap() { updateAffineMatrix(); }
+
+    CompoundMap(const FirstMapType& f, const SecondMapType& s): mFirstMap(f), mSecondMap(s)
+    {
+        updateAffineMatrix();
+    }
+
+    CompoundMap(const MyType& other):
+        mFirstMap(other.mFirstMap),
+        mSecondMap(other.mSecondMap),
+        mAffineMap(other.mAffineMap)
+    {}
+    /// Return the compound type name: the two component type names joined by ':'
+    Name type() const { return mapType(); }
+    static Name mapType()
+    {
+        return (FirstMapType::mapType() + Name(":") + SecondMapType::mapType());
+    }
+
+    bool operator==(const MyType& other) const
+    {
+        if (mFirstMap != other.mFirstMap)   return false;
+        if (mSecondMap != other.mSecondMap) return false;
+        if (mAffineMap != other.mAffineMap) return false;
+        return true;
+    }
+
+    bool operator!=(const MyType& other) const { return !(*this == other); }
+
+    MyType& operator=(const MyType& other)
+    {
+        mFirstMap = other.mFirstMap;
+        mSecondMap = other.mSecondMap;
+        mAffineMap = other.mAffineMap;
+        return *this;
+    }
+    /// A linear compound defers to the cached AffineMap; otherwise both parts must be identities.
+    bool isIdentity() const
+    {
+        if (is_linear<MyType>::value) {
+            return mAffineMap.isIdentity();
+        } else {
+            return mFirstMap.isIdentity()&&mSecondMap.isIdentity();
+        }
+    }
+
+    bool isDiagonal() const {
+        if (is_linear<MyType>::value) {
+            return mAffineMap.isDiagonal();
+        } else {
+            return mFirstMap.isDiagonal()&&mSecondMap.isDiagonal();
+        }
+    }
+    /// Return a new copy of the cached AffineMap; throws ArithmeticError for a nonlinear compound.
+    AffineMap::Ptr getAffineMap() const
+    {
+        if (is_linear<MyType>::value) {
+            AffineMap::Ptr affine(new AffineMap(mAffineMap));
+            return affine;
+        } else {
+            OPENVDB_THROW(ArithmeticError,
+                "Constant affine matrix representation not possible for this nonlinear map");
+        }
+    }
+
+    // direct decomposition
+    const FirstMapType& firstMap() const { return mFirstMap; }
+    const SecondMapType& secondMap() const {return mSecondMap; }
+    // replacing a component re-derives the cached affine composition
+    void setFirstMap(const FirstMapType& first) { mFirstMap = first; updateAffineMatrix(); }
+    void setSecondMap(const SecondMapType& second) { mSecondMap = second; updateAffineMatrix(); }
+    // serialization order: cached affine map, then first map, then second map
+    void read(std::istream& is)
+    {
+        mAffineMap.read(is);
+        mFirstMap.read(is);
+        mSecondMap.read(is);
+    }
+    void write(std::ostream& os) const
+    {
+        mAffineMap.write(os);
+        mFirstMap.write(os);
+        mSecondMap.write(os);
+    }
+
+private:
+    void updateAffineMatrix()
+    {
+        if (is_linear<MyType>::value) {
+            // both maps need to be linear, these methods are only defined for linear maps
+            AffineMap::Ptr first = mFirstMap.getAffineMap();
+            AffineMap::Ptr second= mSecondMap.getAffineMap();
+            mAffineMap = AffineMap(*first, *second);
+        }
+    }
+
+    FirstMapType   mFirstMap;
+    SecondMapType  mSecondMap;
+    // used for acceleration; only refreshed by updateAffineMatrix() when the compound is linear
+    AffineMap      mAffineMap;
+}; // class CompoundMap
+
+} // namespace math
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_MATH_MAPS_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/math/Mat.h b/nuparu/include/openvdb_new/math/Mat.h
new file mode 100644
index 00000000..d413e77d
--- /dev/null
+++ b/nuparu/include/openvdb_new/math/Mat.h
@@ -0,0 +1,1026 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file Mat.h
+/// @author Joshua Schpok
+
+#ifndef OPENVDB_MATH_MAT_HAS_BEEN_INCLUDED
+#define OPENVDB_MATH_MAT_HAS_BEEN_INCLUDED
+
+#include <math.h>
+#include <iostream>
+#include <boost/format.hpp>
+#include <openvdb/Exceptions.h>
+#include "Math.h"
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace math {
+
+/// @class Mat "Mat.h"
+/// A base class for square matrices.
+template<unsigned SIZE, typename T>
+class Mat
+{
+public:
+    typedef T value_type;
+    typedef T ValueType;
+    enum SIZE_ { size = SIZE };
+
+    // Number of cols, rows, elements
+    static unsigned numRows() { return SIZE; }
+    static unsigned numColumns() { return SIZE; }
+    static unsigned numElements() { return SIZE*SIZE; }
+
+    /// Default ctor.  Does nothing (the elements are not explicitly initialized).  Required
+    /// because declaring a copy (or other) constructor means the default constructor gets left out.
+    Mat() { }
+
+    /// Copy constructor.  Used when the class signature matches exactly.
+    Mat(Mat const &src) {
+        for (unsigned i(0); i < numElements(); ++i) {
+            mm[i] = src.mm[i];
+        }
+    }
+
+    /// @return string representation of matrix
+    /// Since output is multiline, optional indentation argument prefixes
+    /// each newline with that much white space (plus one, for the outer '[').
+    /// It does not indent the first line, since you might be calling this inline:
+    ///
+    /// cout << "matrix: " << mat.str(8)
+    ///
+    /// matrix: [[1, 2],
+    ///          [3, 4]]
+    std::string
+    str(unsigned indentation = 0) const {
+
+        std::string ret;
+        std::string indent;
+
+        // We add +1 since we're indenting one for the first '['
+        indent.append(indentation+1, ' ');
+
+        ret.append("[");
+
+        // For each row,
+        for (unsigned i(0); i < SIZE; i++) {
+
+            ret.append("[");
+
+            // For each column
+            for (unsigned j(0); j < SIZE; j++) {
+
+                // Put a comma before everything except the first
+                if (j) ret.append(", ");
+                ret.append((boost::format("%1%") % mm[(i*SIZE)+j]).str());
+            }
+
+            ret.append("]");
+
+            // At the end of every row (except the last)...
+            if (i < SIZE-1 )
+                // ...suffix the row bracket with a comma, newline, and
+                // advance indentation
+                ret.append((boost::format(",\n%1%") % indent).str());
+        }
+
+        ret.append("]");
+
+        return ret;
+    }
+
+    /// Write a Mat to an output stream
+    friend std::ostream& operator<<(
+        std::ostream& ostr,
+        const Mat<SIZE, T>& m)
+    {
+        ostr << m.str();
+        return ostr;
+    }
+    /// Write the SIZE*SIZE elements as raw bytes, in storage order
+    void write(std::ostream& os) const {
+        os.write(reinterpret_cast<const char*>(&mm), sizeof(T)*SIZE*SIZE);
+    }
+    /// Read SIZE*SIZE elements as raw bytes (the layout produced by write())
+    void read(std::istream& is) {
+        is.read(reinterpret_cast<char*>(&mm), sizeof(T)*SIZE*SIZE);
+    }
+
+
+protected:
+    T mm[SIZE*SIZE]; // row-major storage: element (i, j) is mm[i*SIZE + j], as indexed in str()
+};
+
+
+template<typename T> class Quat;
+template<typename T> class Vec3;
+
+/// @brief Return the rotation matrix specified by the given quaternion.
+/// @details The quaternion is normalized and used to construct the matrix.
+/// Note that the matrix is transposed to match post-multiplication semantics.
+template<class MatType>
+MatType
+rotation(const Quat<typename MatType::value_type> &q,
+    typename MatType::value_type eps = static_cast<typename MatType::value_type>(1.0e-8))
+{
+    typedef typename MatType::value_type T;
+    // qdot = |q|^2; using s = 2/|q|^2 below normalizes q implicitly
+    T qdot(q.dot(q));
+    T s(0);
+    // if |q|^2 is within eps of zero, s stays 0 and the 3x3 part becomes the identity
+    if (!isApproxEqual(qdot, T(0.0),eps)) {
+        s = T(2.0 / qdot);
+    }
+    // pairwise products of the quaternion components, each scaled by s
+    T x  = s*q.x();
+    T y  = s*q.y();
+    T z  = s*q.z();
+    T wx = x*q.w();
+    T wy = y*q.w();
+    T wz = z*q.w();
+    T xx = x*q.x();
+    T xy = y*q.x();
+    T xz = z*q.x();
+    T yy = y*q.y();
+    T yz = z*q.y();
+    T zz = z*q.z();
+    // fill the upper-left 3x3 block
+    MatType r;
+    r[0][0]=T(1) - (yy+zz); r[0][1]=xy + wz;        r[0][2]=xz - wy;
+    r[1][0]=xy - wz;        r[1][1]=T(1) - (xx+zz); r[1][2]=yz + wx;
+    r[2][0]=xz + wy;        r[2][1]=yz - wx;        r[2][2]=T(1) - (xx+yy);
+    // NOTE(review): padMat4 presumably fills the remaining row/column of a 4x4 -- confirm
+    if(MatType::numColumns() == 4) padMat4(r);
+    return r;
+}
+
+
+
+/// @brief Return a matrix for rotation by @a angle radians about the given @a axis.
+/// @param axis   The axis (one of X, Y, Z) to rotate about.
+/// @param angle  The rotation angle, in radians.
+template<class MatType>
+MatType
+rotation(Axis axis, typename MatType::value_type angle)
+{
+    typedef typename MatType::value_type T;
+    const T cosA = static_cast<T>(cos(angle));
+    const T sinA = static_cast<T>(sin(angle));
+
+    MatType result;
+    result.setIdentity();
+
+    // (a, b) are the two axes spanning the plane of rotation, ordered
+    // cyclically so that a single fill pattern serves all three cases.
+    int a = 0, b = 0;
+    switch (axis) {
+    case X_AXIS:
+        a = 1; b = 2;
+        break;
+    case Y_AXIS:
+        a = 2; b = 0;
+        break;
+    case Z_AXIS:
+        a = 0; b = 1;
+        break;
+    default:
+        throw ValueError("Unrecognized rotation axis");
+    }
+
+    result[a][a] = cosA;
+    result[a][b] = sinA;
+    result[b][a] = -sinA;
+    result[b][b] = cosA;
+    return result;
+}
+
+
+/// @brief Return a matrix for rotation by @a angle radians about the given @a axis.
+/// @note The axis is normalized internally (via Vec3::unit()), so it need not be unit length.
+template<class MatType>
+MatType
+rotation(const Vec3<typename MatType::value_type> &_axis, typename MatType::value_type angle)
+{
+    typedef typename MatType::value_type T;
+    T txy, txz, tyz, sx, sy, sz;
+    // work on a copy passed through Vec3::unit() (presumably normalized -- confirm)
+    Vec3<T> axis(_axis.unit());
+
+    // compute trig properties of angle:
+    T c(cos(double(angle)));
+    T s(sin(double(angle)));
+    T t(1 - c);
+
+    MatType result;
+    // handle diagonal elements
+    result[0][0] = axis[0]*axis[0] * t + c;
+    result[1][1] = axis[1]*axis[1] * t + c;
+    result[2][2] = axis[2]*axis[2] * t + c;
+    // off-diagonal building blocks: symmetric t-terms and antisymmetric s-terms
+    txy = axis[0]*axis[1] * t;
+    sz = axis[2] * s;
+
+    txz = axis[0]*axis[2] * t;
+    sy = axis[1] * s;
+
+    tyz = axis[1]*axis[2] * t;
+    sx = axis[0] * s;
+
+    // right handed space
+    // Contribution from rotation about 'z'
+    result[0][1] = txy + sz;
+    result[1][0] = txy - sz;
+    // Contribution from rotation about 'y'
+    result[0][2] = txz - sy;
+    result[2][0] = txz + sy;
+    // Contribution from rotation about 'x'
+    result[1][2] = tyz + sx;
+    result[2][1] = tyz - sx;
+    // only the 3x3 block is set above; a 4x4 result is completed by padMat4
+    if(MatType::numColumns() == 4) padMat4(result);
+    return MatType(result);
+}
+
+
+/// @brief Return the Euler angles composing the given rotation matrix.
+/// @details The @a rotationOrder argument describes in what order elementary rotations
+/// are applied. Note that in our convention, XYZ means Rz * Ry * Rx.
+/// Because we are using rows rather than columns to represent the
+/// local axes of a coordinate frame, the interpretation from a local
+/// reference point of view is to first rotate about the x axis, then
+/// about the newly rotated y axis, and finally by the new local z axis.
+/// From a fixed reference point of view, the interpretation is to
+/// rotate about the stationary world z, y, and x axes respectively.
+///
+/// Irrespective of the Euler angle convention, in the case of distinct
+/// axes, eulerAngles() returns the x, y, and z angles in the corresponding
+/// x, y, z components of the returned Vec3. For the XZX convention, the
+/// left X value is returned in Vec3.x, and the right X value in Vec3.y.
+/// For the ZXZ convention the left Z value is returned in Vec3.z and
+/// the right Z value in Vec3.y
+///
+/// Examples of reconstructing r from its Euler angle decomposition
+///
+/// v = eulerAngles(r, ZYX_ROTATION);
+/// rx.setToRotation(Vec3d(1,0,0), v[0]);
+/// ry.setToRotation(Vec3d(0,1,0), v[1]);
+/// rz.setToRotation(Vec3d(0,0,1), v[2]);
+/// r = rx * ry * rz;
+///
+/// v = eulerAngles(r, ZXZ_ROTATION);
+/// rz1.setToRotation(Vec3d(0,0,1), v[2]);
+/// rx.setToRotation (Vec3d(1,0,0), v[0]);
+/// rz2.setToRotation(Vec3d(0,0,1), v[1]);
+/// r = rz2 * rx * rz1;
+///
+/// v = eulerAngles(r, XZX_ROTATION);
+/// rx1.setToRotation (Vec3d(1,0,0), v[0]);
+/// rx2.setToRotation (Vec3d(1,0,0), v[1]);
+/// rz.setToRotation  (Vec3d(0,0,1), v[2]);
+/// r = rx2 * rz * rx1;
+/// @throw NotImplementedError if @a rotationOrder is none of the sequences handled below.
+template<class MatType>
+Vec3<typename MatType::value_type>
+eulerAngles(
+    const MatType& mat,
+    RotationOrder rotationOrder,
+    typename MatType::value_type eps = static_cast<typename MatType::value_type>(1.0e-8))
+{
+    typedef typename MatType::value_type ValueType;
+    typedef Vec3<ValueType> V;
+    ValueType phi, theta, psi;
+    // Each case first tests one matrix entry against +/-1 within eps (the degenerate configuration), then falls back to the general atan2 formulas.
+    switch(rotationOrder)
+    {
+    case XYZ_ROTATION:
+        if (isApproxEqual(mat[2][0], ValueType(1.0), eps)) {
+            theta = ValueType(M_PI_2);
+            phi = ValueType(0.5 * atan2(mat[1][2], mat[1][1]));
+            psi = phi;
+        } else if (isApproxEqual(mat[2][0], ValueType(-1.0), eps)) {
+            theta = ValueType(-M_PI_2);
+            phi = ValueType(0.5 * atan2(mat[1][2], mat[1][1]));
+            psi = -phi;
+        } else {
+            psi = ValueType(atan2(-mat[1][0],mat[0][0]));
+            phi = ValueType(atan2(-mat[2][1],mat[2][2]));
+            theta = ValueType(atan2(mat[2][0],
+                sqrt( mat[2][1]*mat[2][1] +
+                    mat[2][2]*mat[2][2])));
+        }
+        return V(phi, theta, psi);
+    case ZXY_ROTATION:
+        if (isApproxEqual(mat[1][2], ValueType(1.0), eps)) {
+            theta = ValueType(M_PI_2);
+            phi = ValueType(0.5 * atan2(mat[0][1], mat[0][0]));
+            psi = phi;
+        } else if (isApproxEqual(mat[1][2], ValueType(-1.0), eps)) {
+            theta = ValueType(-M_PI/2);
+            phi = ValueType(0.5 * atan2(mat[0][1],mat[2][1])); // NOTE(review): 2nd argument differs from the +1 branch (mat[0][0]) -- confirm intended
+            psi = -phi;
+        } else {
+            psi = ValueType(atan2(-mat[0][2], mat[2][2]));
+            phi = ValueType(atan2(-mat[1][0], mat[1][1]));
+            theta = ValueType(atan2(mat[1][2],
+                        sqrt(mat[0][2] * mat[0][2] +
+                                mat[2][2] * mat[2][2])));
+        }
+        return V(theta, psi, phi);
+
+    case YZX_ROTATION:
+        if (isApproxEqual(mat[0][1], ValueType(1.0), eps)) {
+            theta = ValueType(M_PI_2);
+            phi = ValueType(0.5 * atan2(mat[2][0], mat[2][2]));
+            psi = phi;
+        } else if (isApproxEqual(mat[0][1], ValueType(-1.0), eps)) {
+            theta = ValueType(-M_PI/2);
+            phi = ValueType(0.5 * atan2(mat[2][0], mat[1][0])); // NOTE(review): 2nd argument differs from the +1 branch (mat[2][2]) -- confirm intended
+            psi = -phi;
+        } else {
+            psi = ValueType(atan2(-mat[2][1], mat[1][1]));
+            phi = ValueType(atan2(-mat[0][2], mat[0][0]));
+            theta = ValueType(atan2(mat[0][1],
+                sqrt(mat[0][0] * mat[0][0] +
+                        mat[0][2] * mat[0][2])));
+        }
+        return V(psi, phi, theta);
+
+    case XZX_ROTATION:
+
+        if (isApproxEqual(mat[0][0], ValueType(1.0), eps)) {
+            theta = ValueType(0.0);
+            phi = ValueType(0.5 * atan2(mat[1][2], mat[1][1]));
+            psi = phi;
+        } else if (isApproxEqual(mat[0][0], ValueType(-1.0), eps)) {
+            theta = ValueType(M_PI);
+            psi = ValueType(0.5 * atan2(mat[2][1], -mat[1][1]));
+            phi = - psi;
+        } else {
+            psi = ValueType(atan2(mat[2][0], -mat[1][0]));
+            phi = ValueType(atan2(mat[0][2], mat[0][1]));
+            theta = ValueType(atan2(sqrt(mat[0][1] * mat[0][1] +
+                                mat[0][2] * mat[0][2]),
+                            mat[0][0]));
+        }
+        return V(phi, psi, theta);
+
+    case ZXZ_ROTATION:
+
+        if (isApproxEqual(mat[2][2], ValueType(1.0), eps)) {
+            theta = ValueType(0.0);
+            phi = ValueType(0.5 * atan2(mat[0][1], mat[0][0]));
+            psi = phi;
+        } else if (isApproxEqual(mat[2][2], ValueType(-1.0), eps)) {
+            theta = ValueType(M_PI);
+            phi = ValueType(0.5 * atan2(mat[0][1], mat[0][0]));
+            psi = -phi;
+        } else {
+            psi = ValueType(atan2(mat[0][2], mat[1][2]));
+            phi = ValueType(atan2(mat[2][0], -mat[2][1]));
+            theta = ValueType(atan2(sqrt(mat[0][2] * mat[0][2] +
+                                mat[1][2] * mat[1][2]),
+                            mat[2][2]));
+        }
+        return V(theta, psi, phi);
+
+    case YXZ_ROTATION:
+
+        if (isApproxEqual(mat[2][1], ValueType(1.0), eps)) {
+            theta = ValueType(-M_PI_2);
+            phi = ValueType(0.5 * atan2(-mat[1][0], mat[0][0]));
+            psi = phi;
+        } else if (isApproxEqual(mat[2][1], ValueType(-1.0), eps)) {
+            theta = ValueType(M_PI_2);
+            phi = ValueType(0.5 * atan2(mat[1][0], mat[0][0]));
+            psi = -phi;
+        } else {
+            psi = ValueType(atan2(mat[0][1], mat[1][1]));
+            phi = ValueType(atan2(mat[2][0], mat[2][2]));
+            theta = ValueType(atan2(-mat[2][1],
+                sqrt(mat[0][1] * mat[0][1] +
+                        mat[1][1] * mat[1][1])));
+        }
+        return V(theta, phi, psi);
+
+    case ZYX_ROTATION:
+
+        if (isApproxEqual(mat[0][2], ValueType(1.0), eps)) {
+            theta = ValueType(-M_PI_2);
+            phi = ValueType(0.5 * atan2(-mat[1][0], mat[1][1]));
+            psi = phi;
+        } else if (isApproxEqual(mat[0][2], ValueType(-1.0), eps)) {
+            theta = ValueType(M_PI_2);
+            phi = ValueType(0.5 * atan2(mat[2][1], mat[2][0]));
+            psi = -phi;
+        } else {
+            psi = ValueType(atan2(mat[1][2], mat[2][2]));
+            phi = ValueType(atan2(mat[0][1], mat[0][0]));
+            theta = ValueType(atan2(-mat[0][2],
+                sqrt(mat[0][1] * mat[0][1] +
+                        mat[0][0] * mat[0][0])));
+        }
+        return V(psi, theta, phi);
+
+    case XZY_ROTATION:
+
+        if (isApproxEqual(mat[1][0], ValueType(-1.0), eps)) {
+            theta = ValueType(M_PI_2);
+            psi = ValueType(0.5 * atan2(mat[2][1], mat[2][2]));
+            phi = -psi;
+        } else if (isApproxEqual(mat[1][0], ValueType(1.0), eps)) {
+            theta = ValueType(-M_PI_2);
+            psi = ValueType(0.5 * atan2(- mat[2][1], mat[2][2]));
+            phi = psi;
+        } else {
+            psi = ValueType(atan2(mat[2][0], mat[0][0]));
+            phi = ValueType(atan2(mat[1][2], mat[1][1]));
+            theta = ValueType(atan2(- mat[1][0],
+                            sqrt(mat[1][1] * mat[1][1] +
+                                    mat[1][2] * mat[1][2])));
+        }
+        return V(phi, psi, theta);
+    }
+    // Reached only when rotationOrder matched none of the cases above.
+    OPENVDB_THROW(NotImplementedError, "Euler extraction sequence not implemented");
+}
+
+
+/// @brief Return a rotation matrix that maps @a v1 onto @a v2
+/// about the cross product of @a v1 and @a v2.
+/// @param _v1  Source direction; normalized here if it is not unit length.
+/// @param _v2  Target direction; normalized here if it is not unit length.
+/// @param eps  Tolerance for the unit-length and near-parallel tests.
+template<class MatType>
+MatType
+rotation(
+    const Vec3<typename MatType::value_type>& _v1,
+    const Vec3<typename MatType::value_type>& _v2,
+    typename MatType::value_type eps=1.0e-8)
+{
+    typedef typename MatType::value_type T;
+    Vec3<T> v1(_v1);
+    Vec3<T> v2(_v2);
+
+    // Normalize v1 and v2 unless already unit length (constants are typed as T to match dot() and eps).
+    if (!isApproxEqual(T(1), v1.dot(v1), eps)) {
+        v1.normalize();
+    }
+    if (!isApproxEqual(T(1), v2.dot(v2), eps)) {
+        v2.normalize();
+    }
+
+    Vec3<T> cross;
+    cross.cross(v1, v2);
+
+    if (isApproxEqual(cross[0], T(0), eps) &&
+        isApproxEqual(cross[1], T(0), eps) &&
+        isApproxEqual(cross[2], T(0), eps)) {
+        // Given two unit vectors v1 and v2 that are nearly parallel, build a
+        // rotation matrix that maps v1 onto v2. First find which principal axis
+        // p is closest to perpendicular to v1. Find a reflection that exchanges
+        // v1 and p, and find a reflection that exchanges p and v2. The desired
+        // rotation matrix is the composition of these two reflections. See the
+        // paper "Efficiently Building a Matrix to Rotate One Vector to
+        // Another" by Tomas Moller and John Hughes in Journal of Graphics
+        // Tools Vol 4, No 4 for details.
+
+        Vec3<T> u, v, p(0.0, 0.0, 0.0);
+
+        double x = Abs(v1[0]);
+        double y = Abs(v1[1]);
+        double z = Abs(v1[2]);
+
+        if (x < y) {
+            if (z < x) {
+                p[2] = 1;
+            } else {
+                p[0] = 1;
+            }
+        } else {
+            if (z < y) {
+                p[2] = 1;
+            } else {
+                p[1] = 1;
+            }
+        }
+        u = p - v1;
+        v = p - v2;
+
+        double udot = u.dot(u);
+        double vdot = v.dot(v);
+
+        double a = -2 / udot;
+        double b = -2 / vdot;
+        double c = 4 * u.dot(v) / (udot * vdot);
+
+        MatType result;
+        result.setIdentity();
+        // Upper 3x3 block: I + a*u[i]*u[j] + b*v[i]*v[j] + c*u[i]*v[j].
+        for (int j = 0; j < 3; j++) {
+            for (int i = 0; i < 3; i++)
+                result[i][j] =
+                    a * u[i] * u[j] + b * v[i] * v[j] + c * v[j] * u[i];
+        }
+        result[0][0] += 1.0;
+        result[1][1] += 1.0;
+        result[2][2] += 1.0;
+
+        if(MatType::numColumns() == 4) padMat4(result);
+        return result;
+    } else {
+        // General case: c*I, plus/minus the cross components, plus (1 - c)/|cross|^2 * cross[i]*cross[j].
+        double c = v1.dot(v2);
+        double a = (1.0 - c) / cross.dot(cross);
+
+        double a0 = a * cross[0];
+        double a1 = a * cross[1];
+        double a2 = a * cross[2];
+
+        double a01 = a0 * cross[1];
+        double a02 = a0 * cross[2];
+        double a12 = a1 * cross[2];
+
+        MatType r;
+
+        r[0][0] = c + a0 * cross[0];
+        r[0][1] = a01 + cross[2];
+        r[0][2] = a02 - cross[1];
+        r[1][0] = a01 - cross[2];
+        r[1][1] = c + a1 * cross[1];
+        r[1][2] = a12 + cross[0];
+        r[2][0] = a02 + cross[1];
+        r[2][1] = a12 - cross[0];
+        r[2][2] = c + a2 * cross[2];
+
+        if(MatType::numColumns() == 4) padMat4(r);
+        return r;
+    }
+}
+
+
+/// @brief Build a scaling matrix from per-axis factors.
+/// @param s  Scale factors written to the first three diagonal entries of
+///           an otherwise identity matrix.
+template<class MatType>
+MatType
+scale(const Vec3<typename MatType::value_type>& s)
+{
+    MatType scaled;
+    // Start from identity so every entry off the scaled diagonal is defined.
+    scaled.setIdentity();
+    for (int axis = 0; axis < 3; ++axis) {
+        scaled[axis][axis] = s[axis];
+    }
+
+    return scaled;
+}
+
+
+/// Return a Vec3 holding the lengths of the three rows of @a mat's upper 3x3 block.
+template<class MatType>
+Vec3<typename MatType::value_type>
+getScale(const MatType &mat)
+{
+    typedef Vec3<typename MatType::value_type> Row;
+    Row row0(mat[0][0], mat[0][1], mat[0][2]);
+    Row row1(mat[1][0], mat[1][1], mat[1][2]);
+    Row row2(mat[2][0], mat[2][1], mat[2][2]);
+    return Row(row0.length(), row1.length(), row2.length());
+}
+
+
+/// @brief Normalize each row of the upper 3x3 block of @a mat and return the result.
+/// @details Convenience overload of the three-argument unit() for callers
+/// that do not need the row lengths; they are computed and then discarded.
+template<class MatType>
+MatType
+unit(const MatType &mat, typename MatType::value_type eps = 1.0e-8)
+{
+    Vec3<typename MatType::value_type> ignoredLengths;
+    return unit(mat, eps, ignoredLengths);
+}
+
+
+/// @brief Normalize each row of the upper 3x3 block of @a in, passing
+/// @a scaling[i] to Vec3::unit() as the output slot for row i.
+/// @details Entries outside the upper 3x3 block are copied from @a in unchanged.
+/// A row whose normalization throws ArithmeticError is replaced by zeros.
+template<class MatType>
+MatType
+unit(
+    const MatType &in,
+    typename MatType::value_type eps,
+    Vec3<typename MatType::value_type>& scaling)
+{
+    typedef typename MatType::value_type T;
+    MatType normalized(in);
+
+    for (int row = 0; row < 3; ++row) {
+        try {
+            Vec3<T> axis(in[row][0], in[row][1], in[row][2]);
+            const Vec3<T> dir(axis.unit(eps, scaling[row]));
+            for (int col = 0; col < 3; ++col) normalized[row][col] = dir[col];
+        } catch (ArithmeticError&) {
+            for (int col = 0; col < 3; ++col) normalized[row][col] = 0;
+        }
+    }
+    return normalized;
+}
+
+
+/// @brief Return an identity matrix with one entry replaced by a shear term.
+/// @param axis0 Selects the column of the modified entry.
+/// @param axis1 Selects the row of the modified entry.
+/// @param shear The shear factor written to that entry (when the two axes
+///              coincide the entry lies on the diagonal and receives shear + 1).
+template <class MatType>
+MatType
+shear(Axis axis0, Axis axis1, typename MatType::value_type shear)
+{
+    typedef typename MatType::value_type T;
+    const int col = static_cast<int>(axis0);
+    const int row = static_cast<int>(axis1);
+
+    MatType sheared;
+    sheared.setIdentity();
+    // On the diagonal the identity's 1 is kept and the factor is added to it.
+    const T entry = (axis0 == axis1) ? T(shear + 1) : shear;
+    sheared[row][col] = entry;
+
+    return sheared;
+}
+
+
+/// @brief Return the skew-symmetric ("cross product") matrix of the given vector.
+/// @details All nine entries of the upper 3x3 block are written; a 4x4 result is padded.
+template<class MatType>
+MatType
+skew(const Vec3<typename MatType::value_type> &skew)
+{
+    typedef typename MatType::value_type T;
+    MatType r;
+    r[0][0] = T(0);      r[0][1] = skew.z();  r[0][2] = -skew.y();
+    // Row 1 ends with r[1][2] (+x), the antisymmetric partner of r[2][1] (-x).
+    r[1][0] = -skew.z(); r[1][1] = T(0);      r[1][2] = skew.x();
+    r[2][0] = skew.y();  r[2][1] = -skew.x(); r[2][2] = T(0);
+    if(MatType::numColumns() == 4) padMat4(r);
+    return r;
+}
+
+
+/// @brief Build an orientation matrix whose third row is the unit @a direction.
+/// @details Rows are (horizontal, up, forward): horizontal is the normalized
+/// cross product of unit @a vertical with forward, and up is forward x horizontal.
+template<class MatType>
+MatType
+aim(const Vec3<typename MatType::value_type>& direction,
+    const Vec3<typename MatType::value_type>& vertical)
+{
+    typedef typename MatType::value_type T;
+    Vec3<T> zAxis(direction.unit());
+    Vec3<T> xAxis(vertical.unit().cross(zAxis).unit());
+    Vec3<T> yAxis(zAxis.cross(xAxis).unit());
+    Vec3<T> basis[3] = { xAxis, yAxis, zAxis };
+
+    MatType frame;
+    for (int i = 0; i < 3; ++i) {
+        frame[i][0] = basis[i].x(); frame[i][1] = basis[i].y(); frame[i][2] = basis[i].z();
+    }
+    if (MatType::numColumns() == 4) padMat4(frame);
+    return frame;
+}
+
+/// @brief    Snap the row of @a source selected by @a axis to @a direction,
+///           preserving scaling.
+/// @details  The result is @a source multiplied by a rotation about the cross
+///           product of the two unit vectors, by the angle between them.
+/// @note     @a direction need not be unit; it is normalized here.
+/// @throw    ValueError if the two unit vectors are (approximately) opposite.
+template<class MatType>
+inline MatType
+snapMatBasis(const MatType& source, Axis axis, const Vec3<typename MatType::value_type>& direction)
+{
+    typedef typename MatType::value_type T;
+
+    Vec3<T> target(direction.unit());
+    Vec3<T> current(source.row(axis).unit());
+
+    // Dot product of the two unit vectors: +1 when parallel, -1 when opposite.
+    const T alignment = target.dot(current);
+
+    if (isApproxEqual(alignment, T(1.0))) {
+        // Nothing to do: the basis vector already points along the direction.
+        return source;
+    } else if (isApproxEqual(alignment, T(-1.0))) {
+        // Opposite vectors are rejected rather than snapped.
+        OPENVDB_THROW(ValueError, "Cannot snap to inverse axis");
+    }
+
+    // Rotation about (target x current) by the angle between the two vectors.
+    T theta(angle(target, current));
+    Vec3<T> pivot = target.cross(current);
+    MatType correction;
+    correction.setToRotation(pivot, theta);
+
+    return source * correction;
+}
+
+/// @brief Zero the first three entries of the last row and last column of
+/// @a dest and set dest[3][3] to 1, leaving the upper 3x3 block untouched.
+/// @details Meant for matrices where only the 3x3 block has been assigned so far.
+template<class MatType>
+static MatType&
+padMat4(MatType& dest)
+{
+    // Row 3 and column 3 mirror each other, so clear both in one pass.
+    for (int k = 0; k < 3; ++k) dest[k][3] = dest[3][k] = 0;
+    dest[3][3] = 1;
+    return dest;
+}
+
+
+/// @brief Approximate the matrix square root: find B such that A = B*B.
+/// @details Runs the coupled Denman-Beavers iteration
+///     Y <- (Y + inverse(Z)) / 2,   Z <- (Z + inverse(Y)) / 2
+/// starting from Y = A and Z = identity, for a fixed number of steps,
+/// and hands back the final Y.
+/// @param aA    Matrix whose square root is sought.
+/// @param aB    Receives the final Y iterate.
+/// @param aTol  Only used to choose the step count, log(aTol) / log(0.5).
+/// @note inverse() is invoked on both iterates at every step.
+template <typename MatType>
+inline void
+sqrtSolve(const MatType &aA, MatType &aB, double aTol=0.01)
+{
+    // Number of halvings needed to get from 1 down to aTol.
+    const unsigned int steps = (unsigned int)(log(aTol)/log(0.5));
+
+    // Initial iterates. aA is copied before aB is written, so the two
+    // arguments may refer to the same matrix.
+    MatType y = aA;
+    MatType z = MatType::identity();
+
+    for (unsigned int step = 0; step < steps; ++step) {
+        // Both inverses must come from the previous iterates, so take
+        // them before either matrix is overwritten.
+        const MatType yInv = y.inverse();
+        const MatType zInv = z.inverse();
+
+        y = 0.5*(y + zInv);
+        z = 0.5*(z + yInv);
+    }
+
+    // Only the Y iterate is returned; Z is discarded.
+    aB = y;
+}
+
+
+/// @brief Approximate a real power of a matrix: aB = aA ^ aPower.
+/// @details The magnitude of the exponent is split into an integer part,
+/// applied by repeated squaring, and a fractional part, applied as a product
+/// of successive square roots (A^(1/2), A^(1/4), ...) computed by sqrtSolve().
+/// For a negative exponent the accumulated product is inverted at the end.
+/// @param aA      Base matrix.
+/// @param aB      Receives the result.
+/// @param aPower  Exponent; may be negative and/or fractional.
+/// @param aTol    Forwarded to sqrtSolve(); also fixes how many binary digits
+///                of the fractional part are used: log(aTol) / log(0.5).
+/// @note sqrtSolve() runs a fixed number of iterations, so the result is approximate.
+template <typename MatType>
+inline void
+powSolve(const MatType &aA, MatType &aB, double aPower, double aTol=0.01)
+{
+    const unsigned int rootCount = (unsigned int)(log(aTol)/log(0.5));
+
+    // Work with the magnitude of the exponent; the sign is applied last.
+    const bool negativePower = ( aPower < 0.0 );
+    if (negativePower) aPower = -aPower;
+
+    unsigned int intPart = (unsigned int)aPower;
+    double fracPart = aPower - intPart;
+
+    MatType accum;
+    accum = MatType::identity();
+
+    // Fractional part: root becomes A^(1/2), A^(1/4), ... in turn and is
+    // multiplied in whenever the matching binary digit of fracPart is set.
+    MatType root = aA;
+    double weight = 1.0;
+    for (unsigned int k = 0; k < rootCount; ++k) {
+        sqrtSolve(root, root, aTol);
+        weight *= 0.5;
+        if (!(fracPart >= weight)) continue;
+        accum *= root;
+        fracPart -= weight;
+    }
+
+    // Integer part: square-and-multiply over the bits of intPart.
+    MatType square = aA;
+    while (intPart != 0) {
+        if (intPart & 1) accum *= square;
+        intPart >>= 1;
+        // Skip the squaring after the last bit; its result would be unused.
+        if (intPart != 0) square *= square;
+    }
+
+    // A negative exponent yields the inverse of the positive power.
+    if (!negativePower) {
+        aB = accum;
+        return;
+    }
+    aB = accum.inverse();
+}
+
+
+/// @brief Return true when @a m compares equal, via eq(), to MatType::identity().
+template<typename MatType>
+inline bool isIdentity(const MatType& m)
+{
+    const MatType& identityMatrix = MatType::identity();
+    return m.eq(identityMatrix);
+}
+
+
+/// @brief Return true when the determinant of @a m is not approximately zero.
+template<typename MatType>
+inline bool
+isInvertible(const MatType& m)
+{
+    const typename MatType::ValueType zero(0);
+    return !isApproxEqual(m.det(), zero);
+}
+
+
+/// @brief Return true when @a m equals its transpose under eq() (documented as a math::isApproxEqual()-based test).
+template<typename MatType>
+inline bool
+isSymmetric(const MatType& m)
+{
+    const MatType flipped = m.transpose();
+    return m.eq(flipped);
+}
+
+
+/// @brief Return true when @a m is unitary (i.e., a rotation or a reflection).
+template<typename MatType>
+inline bool
+isUnitary(const MatType& m)
+{
+    typedef typename MatType::ValueType T;
+    // Two conditions, checked in order: |det(m)| is approximately 1, then
+    // m * transpose(m) equals identity (the transpose acts as the inverse).
+    return isApproxEqual(std::abs(m.det()), T(1.0))
+        && MatType(m * m.transpose()).eq(MatType::identity());
+}
+
+
+/// @brief Return true when the off-diagonal entries of @a mat sum, in absolute
+/// value, to approximately zero (as judged by isApproxEqual()).
+template<typename MatType>
+inline bool
+isDiagonal(const MatType& mat)
+{
+    const int dim = MatType::size;
+    typename MatType::ValueType offDiagonal(0);
+    for (int row = 0; row < dim; ++row) {
+        for (int col = 0; col < dim; ++col) {
+            if (row == col) continue;
+            offDiagonal += std::abs(mat(row, col));
+        }
+    }
+    return isApproxEqual(offDiagonal, typename MatType::ValueType(0.0));
+}
+
+
+/// @brief Return the @f$L_\infty@f$ norm of an N x N matrix.
+/// @details Computed here as the largest column sum of absolute values.
+template<typename MatType>
+typename MatType::ValueType
+lInfinityNorm(const MatType& matrix)
+{
+    typedef typename MatType::ValueType T;
+    const int dim = MatType::size;
+
+    T largest = 0;
+    for (int col = 0; col < dim; ++col) {
+        // Accumulate |entry| down this column.
+        T total = 0;
+        for (int row = 0; row < dim; ++row) total += fabs(matrix(row, col));
+        if (largest < total) largest = total;
+    }
+
+    return largest;
+}
+
+
+/// @brief Return the @f$L_1@f$ norm of an N x N matrix.
+/// @details Computed here as the largest row sum of absolute values.
+template<typename MatType>
+typename MatType::ValueType
+lOneNorm(const MatType& matrix)
+{
+    typedef typename MatType::ValueType T;
+    const int dim = MatType::size;
+
+    T largest = 0;
+    for (int row = 0; row < dim; ++row) {
+        // Accumulate |entry| along this row.
+        T total = 0;
+        for (int col = 0; col < dim; ++col) total += fabs(matrix(row, col));
+        if (largest < total) largest = total;
+    }
+
+    return largest;
+}
+
+
+/// @brief Decompose an invertible 3x3 matrix into a unitary matrix
+/// followed by a symmetric matrix (positive semi-definite Hermitian),
+/// i.e., M = U * S.
+/// @details If det(U) = 1 it is a rotation, otherwise det(U) = -1,
+/// meaning there is some part reflection.
+/// See "Computing the polar decomposition with applications"
+/// Higham, N.J. - SIAM J. Sc. Stat Comput 7(4):1160-1174
+/// @param unitary  Receives U; it is overwritten even when false is returned.
+/// @param positive_hermitian  Receives transpose(U) * input; written only on success.
+/// @return false if |det(input)| is below math::Tolerance or the iteration
+/// budget @a MAX_ITERATIONS is exceeded; true otherwise.
+template<typename MatType>
+bool
+polarDecomposition(const MatType& input, MatType& unitary,
+    MatType& positive_hermitian, unsigned int MAX_ITERATIONS=100)
+{
+    unitary = input;
+    MatType new_unitary(input);
+    MatType unitary_inv;
+    if (fabs(unitary.det()) < math::Tolerance<typename MatType::ValueType>::value()) return false;
+
+    unsigned int iteration(0);
+    typename MatType::ValueType linf_of_u;
+    typename MatType::ValueType l1nm_of_u;
+    typename MatType::ValueType linf_of_u_inv;
+    typename MatType::ValueType l1nm_of_u_inv;
+    typename MatType::ValueType l1_error = 100;
+    double gamma;
+
+    do {
+        unitary_inv = unitary.inverse();
+        linf_of_u = lInfinityNorm(unitary);
+        l1nm_of_u = lOneNorm(unitary);
+        linf_of_u_inv = lInfinityNorm(unitary_inv);
+        l1nm_of_u_inv = lOneNorm(unitary_inv);
+        // Scaling factor built from the norms of U and of its inverse.
+        gamma = sqrt( sqrt( (l1nm_of_u_inv * linf_of_u_inv ) / (l1nm_of_u * linf_of_u) ));
+        // Averaging step: U <- (gamma * U + transpose(inverse(U)) / gamma) / 2.
+        new_unitary = 0.5*(gamma * unitary + (1./gamma) * unitary_inv.transpose() );
+        // Change between successive iterates (measured with lInfinityNorm() despite the name).
+        l1_error = lInfinityNorm(unitary - new_unitary);
+        unitary = new_unitary;
+        /// this generally converges in less than ten iterations
+        if (iteration > MAX_ITERATIONS) return false; // NOTE(review): tested after the update, so this returns false even if the current pass converged -- confirm intended
+        iteration++;
+    } while (l1_error > math::Tolerance<typename MatType::ValueType>::value());
+
+    positive_hermitian = unitary.transpose() * input;
+    return true;
+}
+
+} // namespace math
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_MATH_MAT_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/math/Mat3.h b/nuparu/include/openvdb_new/math/Mat3.h
new file mode 100644
index 00000000..1b64e883
--- /dev/null
+++ b/nuparu/include/openvdb_new/math/Mat3.h
@@ -0,0 +1,867 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_MATH_MAT3_H_HAS_BEEN_INCLUDED
+#define OPENVDB_MATH_MAT3_H_HAS_BEEN_INCLUDED
+
+#include <iomanip>
+#include <assert.h>
+#include <math.h>
+#include <openvdb/Exceptions.h>
+#include "Vec3.h"
+#include "Mat.h"
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace math {
+
+template<typename T> class Vec3;
+template<typename T> class Mat4;
+template<typename T> class Quat;
+
+/// @class Mat3 Mat3.h
+/// @brief 3x3 matrix class.
+template<typename T>
+class Mat3: public Mat<3, T>
+{
+public:
+    /// Data type held by the matrix.
+    typedef T         value_type;
+    typedef T         ValueType;
+    typedef Mat<3, T> MyBase;
+    /// Trivial constructor, the matrix is NOT initialized
+    Mat3() {}
+
+    /// Constructor given the quaternion rotation, e.g.    Mat3f m(q);
+    /// The quaternion is normalized and used to construct the matrix
+    Mat3(const Quat<T> &q)
+    { setToRotation(q); }
+
+
+    /// Constructor given array of elements, the ordering is in row major form:
+    /** @verbatim
+        a b c
+        d e f
+        g h i
+        @endverbatim */
+    template<typename Source>
+    Mat3(Source a, Source b, Source c,
+         Source d, Source e, Source f,
+         Source g, Source h, Source i)
+    {
+        MyBase::mm[0] = static_cast<ValueType>(a);
+        MyBase::mm[1] = static_cast<ValueType>(b);
+        MyBase::mm[2] = static_cast<ValueType>(c);
+        MyBase::mm[3] = static_cast<ValueType>(d);
+        MyBase::mm[4] = static_cast<ValueType>(e);
+        MyBase::mm[5] = static_cast<ValueType>(f);
+        MyBase::mm[6] = static_cast<ValueType>(g);
+        MyBase::mm[7] = static_cast<ValueType>(h);
+        MyBase::mm[8] = static_cast<ValueType>(i);
+    } // constructor1Test
+
+    /// Construct matrix from rows or columns vectors (defaults to rows
+    /// for historical reasons)
+    template<typename Source>
+    Mat3(const Vec3<Source> &v1, const Vec3<Source> &v2, const Vec3<Source> &v3, bool rows = true)
+    {
+        if (rows) {
+            this->setRows(v1, v2, v3);
+        } else {
+            this->setColumns(v1, v2, v3);
+        }
+    }
+
+    /// Constructor given array of elements, the ordering is in row major form:\n
+    /// a[0] a[1] a[2]\n
+    /// a[3] a[4] a[5]\n
+    /// a[6] a[7] a[8]\n
+    template<typename Source>
+    Mat3(Source *a)
+    {
+        MyBase::mm[0] = a[0];
+        MyBase::mm[1] = a[1];
+        MyBase::mm[2] = a[2];
+        MyBase::mm[3] = a[3];
+        MyBase::mm[4] = a[4];
+        MyBase::mm[5] = a[5];
+        MyBase::mm[6] = a[6];
+        MyBase::mm[7] = a[7];
+        MyBase::mm[8] = a[8];
+    } // constructor1Test
+
+    /// Copy constructor
+    Mat3(const Mat<3, T> &m)
+    {
+        for (int i=0; i<3; ++i) {
+            for (int j=0; j<3; ++j) {
+                MyBase::mm[i*3 + j] = m[i][j];
+            }
+        }
+    }
+
+    /// Conversion constructor
+    template<typename Source>
+    explicit Mat3(const Mat3<Source> &m)
+    {
+        for (int i=0; i<3; ++i) {
+            for (int j=0; j<3; ++j) {
+                MyBase::mm[i*3 + j] = static_cast<T>(m[i][j]);
+            }
+        }
+    }
+
+    /// Conversion from Mat4 (copies top left)
+    explicit Mat3(const Mat4<T> &m)
+    {
+        for (int i=0; i<3; ++i) {
+            for (int j=0; j<3; ++j) {
+                MyBase::mm[i*3 + j] = m[i][j];
+            }
+        }
+    }
+
+    /// Predefined constant for identity matrix
+    static const Mat3<T>& identity() {
+        return sIdentity;
+    }
+
+    /// Predefined constant for zero matrix
+    static const Mat3<T>& zero() {
+        return sZero;
+    }
+
+    /// Set ith row to vector v
+    void setRow(int i, const Vec3<T> &v)
+    {
+        // assert(i>=0 && i<3);
+        int i3 = i * 3;
+
+        MyBase::mm[i3+0] = v[0];
+        MyBase::mm[i3+1] = v[1];
+        MyBase::mm[i3+2] = v[2];
+    } // rowColumnTest
+
+    /// Get ith row, e.g.    Vec3d v = m.row(1);
+    Vec3<T> row(int i) const
+    {
+        // assert(i>=0 && i<3);
+        return Vec3<T>((*this)(i,0), (*this)(i,1), (*this)(i,2));
+    } // rowColumnTest
+
+    /// Set jth column to vector v
+    void setCol(int j, const Vec3<T>& v)
+    {
+        // assert(j>=0 && j<3);
+        MyBase::mm[0+j] = v[0];
+        MyBase::mm[3+j] = v[1];
+        MyBase::mm[6+j] = v[2];
+    } // rowColumnTest
+
+    /// Get jth column, e.g.    Vec3d v = m.col(0);
+    Vec3<T> col(int j) const
+    {
+        // assert(j>=0 && j<3);
+        return Vec3<T>((*this)(0,j), (*this)(1,j), (*this)(2,j));
+    } // rowColumnTest
+
+    // NB: The following two methods were changed to
+    // work around a gccWS5 compiler issue related to strict
+    // aliasing (see FX-475).
+
+    //@{
+    /// Array style reference to ith row
+    /// e.g.    m[1][2] = 4;
+    T* operator[](int i) { return &(MyBase::mm[i*3]); }
+    const T* operator[](int i) const { return &(MyBase::mm[i*3]); }
+    //@}
+
+    T* asPointer() {return MyBase::mm;}
+    const T* asPointer() const {return MyBase::mm;}
+
+    /// Alternative indexed reference to the elements
+    /// Note that the indices are row first and column second.
+    /// e.g.    m(0,0) = 1;
+    T& operator()(int i, int j)
+    {
+        // assert(i>=0 && i<3);
+        // assert(j>=0 && j<3);
+        return MyBase::mm[3*i+j];
+    } // trivial
+
+    /// @brief Return a copy of the element at row @a i, column @a j,
+    /// e.g.    float f = m(1,0);
+    /// @note Neither index is range-checked; both are expected to lie in [0, 2].
+    T operator()(int i, int j) const
+    {
+        // Row-major layout: element (i, j) lives at flat offset 3*i + j.
+        const T* const rowStart = MyBase::mm + 3 * i;
+        return rowStart[j];
+    } // trivial
+
+    /// @brief Set the rows of "this" matrix, top to bottom, to the vectors @a v1, @a v2, @a v3
+    void setRows(const Vec3<T> &v1, const Vec3<T> &v2, const Vec3<T> &v3)
+    {
+        MyBase::mm[0] = v1[0]; // row 0 <- v1
+        MyBase::mm[1] = v1[1];
+        MyBase::mm[2] = v1[2];
+        MyBase::mm[3] = v2[0]; // row 1 <- v2
+        MyBase::mm[4] = v2[1];
+        MyBase::mm[5] = v2[2];
+        MyBase::mm[6] = v3[0]; // row 2 <- v3
+        MyBase::mm[7] = v3[1];
+        MyBase::mm[8] = v3[2];
+    } // setRows
+    
+    /// @brief Set the columns of "this" matrix, left to right, to the vectors @a v1, @a v2, @a v3
+    void setColumns(const Vec3<T> &v1, const Vec3<T> &v2, const Vec3<T> &v3)
+    {
+        MyBase::mm[0] = v1[0]; // row 0 gathers component 0 of each vector
+        MyBase::mm[1] = v2[0];
+        MyBase::mm[2] = v3[0];
+        MyBase::mm[3] = v1[1]; // row 1 gathers component 1 of each vector
+        MyBase::mm[4] = v2[1];
+        MyBase::mm[5] = v3[1];
+        MyBase::mm[6] = v1[2]; // row 2 gathers component 2 of each vector
+        MyBase::mm[7] = v2[2];
+        MyBase::mm[8] = v3[2];
+    } // setColumns
+
+    /// @deprecated Forwards to setRows(): sets the rows of "this" matrix to @a v1, @a v2, @a v3
+    OPENVDB_DEPRECATED void setBasis(const Vec3<T> &v1, const Vec3<T> &v2, const Vec3<T> &v3)
+    {
+        this->setRows(v1, v2, v3);
+    }
+
+    /// Set the diagonal to @a vdiag and the off-diagonal pairs (0,1), (0,2), (1,2) to @a vtri
+    void setSymmetric(const Vec3<T> &vdiag, const Vec3<T> &vtri)
+    {
+        MyBase::mm[0] = vdiag[0];
+        MyBase::mm[1] = vtri[0]; // (0,1), mirrored at (1,0)
+        MyBase::mm[2] = vtri[1]; // (0,2), mirrored at (2,0)
+        MyBase::mm[3] = vtri[0];
+        MyBase::mm[4] = vdiag[1];
+        MyBase::mm[5] = vtri[2]; // (1,2), mirrored at (2,1)
+        MyBase::mm[6] = vtri[1];
+        MyBase::mm[7] = vtri[2];
+        MyBase::mm[8] = vdiag[2];
+    } // setSymmetricTest
+
+    /// @brief Return the symmetric matrix with diagonal @a vdiag and off-diagonal
+    /// pairs (0,1), (0,2), (1,2) taken from vtri[0], vtri[1], vtri[2] respectively
+    static Mat3 symmetric(const Vec3<T> &vdiag, const Vec3<T> &vtri)
+    {
+        return Mat3(
+                    vdiag[0], vtri[0], vtri[1],
+                    vtri[0], vdiag[1], vtri[2],
+                    vtri[1], vtri[2], vdiag[2]
+                    );
+    }
+
+    /// Set this matrix to skew(v), the cross-product matrix of the given vector @a v
+    void setSkew(const Vec3<T> &v)
+    {*this = skew(v);}
+
+    /// @brief Set this matrix to the rotation matrix specified by the quaternion @a q
+    /// @details Delegates to rotation<Mat3<T> >(q): the quaternion is normalized and used to
+    /// construct the matrix, which is transposed to match post-multiplication semantics.
+    void setToRotation(const Quat<T> &q)
+    {*this = rotation<Mat3<T> >(q);}
+
+    /// @brief Set this matrix to the rotation specified by @a axis and @a angle
+    /// @details The axis must be a unit vector; delegates to rotation<Mat3<T> >(axis, angle).
+    void setToRotation(const Vec3<T> &axis, T angle)
+    {*this = rotation<Mat3<T> >(axis, angle);}
+
+    /// Set all nine elements of this matrix to zero
+    void setZero()
+    {
+        MyBase::mm[0] = 0;
+        MyBase::mm[1] = 0;
+        MyBase::mm[2] = 0;
+        MyBase::mm[3] = 0;
+        MyBase::mm[4] = 0;
+        MyBase::mm[5] = 0;
+        MyBase::mm[6] = 0;
+        MyBase::mm[7] = 0;
+        MyBase::mm[8] = 0;
+    } // trivial
+
+    /// Set "this" matrix to identity (ones on the diagonal, zeros elsewhere)
+    void setIdentity()
+    {
+        MyBase::mm[0] = 1; // (0,0)
+        MyBase::mm[1] = 0;
+        MyBase::mm[2] = 0;
+        MyBase::mm[3] = 0;
+        MyBase::mm[4] = 1; // (1,1)
+        MyBase::mm[5] = 0;
+        MyBase::mm[6] = 0;
+        MyBase::mm[7] = 0;
+        MyBase::mm[8] = 1; // (2,2)
+    } // trivial
+
+    /// @brief Assign the elements of @a m (possibly of another value type) to this matrix
+    template<typename Source>
+    const Mat3& operator=(const Mat3<Source> &m)
+    {
+        // std::copy converts element by element without casts, so the compiler's
+        // type conversion warnings are deliberately not suppressed.
+        const Source* const first = m.asPointer();
+        std::copy(first, first + this->numElements(), MyBase::mm);
+        return *this;
+    } // opEqualToTest
+
+    /// @brief Test whether "this" is equivalent to @a m.
+    /// @details Corresponding elements are compared with isApproxEqual() using
+    /// tolerance @a eps; the result is @c true only if all nine pairs match.
+    bool eq(const Mat3 &m, T eps=1.0e-8) const
+    {
+        // Elements are visited in storage order and the scan stops at the first mismatch.
+        for (int n = 0; n < 9; ++n) {
+            if (!isApproxEqual(MyBase::mm[n], m.mm[n], eps)) {
+                return false;
+            }
+        }
+        return true;
+    } // trivial
+
+    /// Negation operator: returns a new matrix with every element negated, e.g.   m1 = -m2;
+    Mat3<T> operator-() const
+    {
+        return Mat3<T>(
+                       -MyBase::mm[0], -MyBase::mm[1], -MyBase::mm[2],
+                       -MyBase::mm[3], -MyBase::mm[4], -MyBase::mm[5],
+                       -MyBase::mm[6], -MyBase::mm[7], -MyBase::mm[8]
+                       );
+    } // trivial
+
+    /// Multiplication operator, e.g.   M = scalar * M;
+    // friend Mat3 operator*(T scalar, const Mat3& m) {
+    //     return m*scalar;
+    // }
+
+    /// @brief Scales in place and returns m, where \f$m_{i,j} *= scalar\f$ for \f$i, j \in [0, 2]\f$
+    template <typename S>
+    const Mat3<T>& operator*=(S scalar)
+    {
+        MyBase::mm[0] *= scalar;
+        MyBase::mm[1] *= scalar;
+        MyBase::mm[2] *= scalar;
+        MyBase::mm[3] *= scalar;
+        MyBase::mm[4] *= scalar;
+        MyBase::mm[5] *= scalar;
+        MyBase::mm[6] *= scalar;
+        MyBase::mm[7] *= scalar;
+        MyBase::mm[8] *= scalar;
+        return *this;
+    }
+
+    /// @brief Element-wise in-place sum: returns m0 (this), where \f$m0_{i,j} += m1_{i,j}\f$ for \f$i, j \in [0, 2]\f$
+    template <typename S>
+    const Mat3<T> &operator+=(const Mat3<S> &m1)
+    {
+        const S *s = m1.asPointer();
+
+        MyBase::mm[0] += s[0];
+        MyBase::mm[1] += s[1];
+        MyBase::mm[2] += s[2];
+        MyBase::mm[3] += s[3];
+        MyBase::mm[4] += s[4];
+        MyBase::mm[5] += s[5];
+        MyBase::mm[6] += s[6];
+        MyBase::mm[7] += s[7];
+        MyBase::mm[8] += s[8];
+        return *this;
+    }
+
+    /// @brief Element-wise in-place difference: returns m0 (this), where \f$m0_{i,j} -= m1_{i,j}\f$ for \f$i, j \in [0, 2]\f$
+    template <typename S>
+    const Mat3<T> &operator-=(const Mat3<S> &m1)
+    {
+        const S *s = m1.asPointer();
+
+        MyBase::mm[0] -= s[0];
+        MyBase::mm[1] -= s[1];
+        MyBase::mm[2] -= s[2];
+        MyBase::mm[3] -= s[3];
+        MyBase::mm[4] -= s[4];
+        MyBase::mm[5] -= s[5];
+        MyBase::mm[6] -= s[6];
+        MyBase::mm[7] -= s[7];
+        MyBase::mm[8] -= s[8];
+        return *this;
+    }
+
+    /// @brief Matrix product in place: replaces this matrix (m0) by m0 * m1 and returns it, i.e.
+    /// \f$m0_{i,j} = \sum_{n=0}^2 m0_{i,n} * m1_{n,j}\f$ for \f$i, j \in [0, 2]\f$
+    template <typename S>
+    const Mat3<T> &operator*=(const Mat3<S> &m1)
+    {
+        // Snapshot both operands first: m1 may alias *this (as in m *= m), and reading
+        // it while mm is being overwritten would mix old and new elements.
+        const Mat3<T> m0(*this);
+        const Mat3<S> rhs(m1);
+        const T* s0 = m0.asPointer();
+        const S* s1 = rhs.asPointer();
+        MyBase::mm[0] = static_cast<T>(s0[0] * s1[0] +   // row 0
+                                       s0[1] * s1[3] +
+                                       s0[2] * s1[6]);
+        MyBase::mm[1] = static_cast<T>(s0[0] * s1[1] +
+                                       s0[1] * s1[4] +
+                                       s0[2] * s1[7]);
+        MyBase::mm[2] = static_cast<T>(s0[0] * s1[2] +
+                                       s0[1] * s1[5] +
+                                       s0[2] * s1[8]);
+        MyBase::mm[3] = static_cast<T>(s0[3] * s1[0] +   // row 1
+                                       s0[4] * s1[3] +
+                                       s0[5] * s1[6]);
+        MyBase::mm[4] = static_cast<T>(s0[3] * s1[1] +
+                                       s0[4] * s1[4] +
+                                       s0[5] * s1[7]);
+        MyBase::mm[5] = static_cast<T>(s0[3] * s1[2] +
+                                       s0[4] * s1[5] +
+                                       s0[5] * s1[8]);
+        MyBase::mm[6] = static_cast<T>(s0[6] * s1[0] +   // row 2
+                                       s0[7] * s1[3] +
+                                       s0[8] * s1[6]);
+        MyBase::mm[7] = static_cast<T>(s0[6] * s1[1] +
+                                       s0[7] * s1[4] +
+                                       s0[8] * s1[7]);
+        MyBase::mm[8] = static_cast<T>(s0[6] * s1[2] +
+                                       s0[7] * s1[5] +
+                                       s0[8] * s1[8]);
+        return *this;
+    }
+
+    /// @brief Return the cofactor matrix of "this" (entry (i,j) is the signed minor of element (i,j))
+    Mat3 cofactor() const
+    {
+        return Mat3<T>(
+          MyBase::mm[4] * MyBase::mm[8] - MyBase::mm[5] * MyBase::mm[7],
+          MyBase::mm[5] * MyBase::mm[6] - MyBase::mm[3] * MyBase::mm[8],
+          MyBase::mm[3] * MyBase::mm[7] - MyBase::mm[4] * MyBase::mm[6],
+          MyBase::mm[2] * MyBase::mm[7] - MyBase::mm[1] * MyBase::mm[8],
+          MyBase::mm[0] * MyBase::mm[8] - MyBase::mm[2] * MyBase::mm[6],
+          MyBase::mm[1] * MyBase::mm[6] - MyBase::mm[0] * MyBase::mm[7],
+          MyBase::mm[1] * MyBase::mm[5] - MyBase::mm[2] * MyBase::mm[4],
+          MyBase::mm[2] * MyBase::mm[3] - MyBase::mm[0] * MyBase::mm[5],
+          MyBase::mm[0] * MyBase::mm[4] - MyBase::mm[1] * MyBase::mm[3]);
+    }
+
+    /// @brief Return the adjoint of "this", i.e. the transpose of the cofactor matrix of "this"
+    Mat3 adjoint() const
+    {
+        return Mat3<T>(
+          MyBase::mm[4] * MyBase::mm[8] - MyBase::mm[5] * MyBase::mm[7],
+          MyBase::mm[2] * MyBase::mm[7] - MyBase::mm[1] * MyBase::mm[8],
+          MyBase::mm[1] * MyBase::mm[5] - MyBase::mm[2] * MyBase::mm[4],
+          MyBase::mm[5] * MyBase::mm[6] - MyBase::mm[3] * MyBase::mm[8],
+          MyBase::mm[0] * MyBase::mm[8] - MyBase::mm[2] * MyBase::mm[6],
+          MyBase::mm[2] * MyBase::mm[3] - MyBase::mm[0] * MyBase::mm[5],
+          MyBase::mm[3] * MyBase::mm[7] - MyBase::mm[4] * MyBase::mm[6],
+          MyBase::mm[1] * MyBase::mm[6] - MyBase::mm[0] * MyBase::mm[7],
+          MyBase::mm[0] * MyBase::mm[4] - MyBase::mm[1] * MyBase::mm[3]);
+        
+    } // adjointTest
+    
+    /// @brief Return the transpose of this matrix as a new matrix (this matrix is not modified)
+    Mat3 transpose() const
+    {
+        return Mat3<T>(
+          MyBase::mm[0], MyBase::mm[3], MyBase::mm[6],
+          MyBase::mm[1], MyBase::mm[4], MyBase::mm[7],
+          MyBase::mm[2], MyBase::mm[5], MyBase::mm[8]);
+
+    } // transposeTest
+
+    /// @brief Return the inverse of this matrix, computed as adjoint() scaled by 1/determinant
+    /// @throws ArithmeticError if the determinant is within @a tolerance of zero (singular)
+    Mat3 inverse(T tolerance = 0) const
+    {
+        Mat3<T> inv(this->adjoint());
+        // Determinant = (first row of the adjoint) dot (first column of this matrix).
+        const T det = inv.mm[0]*MyBase::mm[0] + inv.mm[1]*MyBase::mm[3] + inv.mm[2]*MyBase::mm[6];
+
+        // A (near-)zero determinant means this matrix is singular and has no inverse.
+        if (isApproxEqual(det,T(0.0),tolerance)) {
+            OPENVDB_THROW(ArithmeticError, "Inversion of singular 3x3 matrix");
+        }
+        return inv * (T(1)/det);
+    } // invertTest
+
+    /// Determinant of matrix, expanded along the first row (mm[0..2] times their cofactors)
+    T det() const
+    {
+        const T co00 = MyBase::mm[4]*MyBase::mm[8] - MyBase::mm[5]*MyBase::mm[7];
+        const T co10 = MyBase::mm[5]*MyBase::mm[6] - MyBase::mm[3]*MyBase::mm[8];
+        const T co20 = MyBase::mm[3]*MyBase::mm[7] - MyBase::mm[4]*MyBase::mm[6];
+        return MyBase::mm[0]*co00  + MyBase::mm[1]*co10 + MyBase::mm[2]*co20;
+    } // determinantTest
+
+    /// Trace of matrix, i.e. the sum of the three diagonal elements
+    T trace() const
+    {
+        return MyBase::mm[0]+MyBase::mm[4]+MyBase::mm[8];
+    }
+
+    /// This function snaps a specific axis to a specific direction,
+    /// preserving scaling. It does this using minimum energy, thus
+    /// posing a unique solution if basis & direction aren't parallel.
+    /// Direction need not be unit.  Delegates to snapMatBasis().
+    Mat3 snapBasis(Axis axis, const Vec3<T> &direction)
+    {
+        return snapMatBasis(*this, axis, direction);
+    }
+
+    /// Return the vector @a v transformed by "this" matrix, computed as v * (*this),
+    /// i.e. the matrix is post-multiplied onto @a v taken as a row vector.
+    template<typename T0>
+    Vec3<T0> transform(const Vec3<T0> &v) const
+    {
+        return static_cast< Vec3<T0> >(v * *this);
+    } // xformVectorTest
+
+    /// Return the vector @a v transformed by the transpose of "this" matrix, computed as
+    /// (*this) * v, i.e. the matrix is pre-multiplied onto @a v taken as a column vector.
+    template<typename T0>
+    Vec3<T0> pretransform(const Vec3<T0> &v) const
+    {
+        return static_cast< Vec3<T0> >(*this * v);
+    } // xformTVectorTest
+
+
+    /// Treats diag as a diagonal matrix and returns the multiplication of "this" with
+    /// diag (from the right): column j of the result is column j of "this" times diag(j).
+    Mat3 timesDiagonal(const Vec3<T>& diag) const
+    {
+        Mat3 ret(*this);
+
+        ret.mm[0] *= diag(0);
+        ret.mm[1] *= diag(1);
+        ret.mm[2] *= diag(2);
+        ret.mm[3] *= diag(0);
+        ret.mm[4] *= diag(1);
+        ret.mm[5] *= diag(2);
+        ret.mm[6] *= diag(0);
+        ret.mm[7] *= diag(1);
+        ret.mm[8] *= diag(2);
+        return ret;
+    }
+
+private:
+    static const Mat3<T> sIdentity;
+    static const Mat3<T> sZero;
+}; // class Mat3
+
+
+template <typename T> // out-of-class definition of the private static identity constant
+const Mat3<T> Mat3<T>::sIdentity = Mat3<T>(1, 0, 0,
+                                           0, 1, 0,
+                                           0, 0, 1);
+
+template <typename T> // out-of-class definition of the private static all-zero constant
+const Mat3<T> Mat3<T>::sZero = Mat3<T>(0, 0, 0,
+                                       0, 0, 0,
+                                       0, 0, 0);
+
+/// @relates Mat3
+/// @brief Equality operator: @c true if all nine element pairs of @a m0 and @a m1
+/// compare equal under isExactlyEqual() (exact floating point comparison, no tolerance)
+template <typename T0, typename T1>
+bool operator==(const Mat3<T0> &m0, const Mat3<T1> &m1)
+{
+    const T0* lhs = m0.asPointer();
+    const T1* rhs = m1.asPointer();
+    for (const T0* const lhsEnd = lhs + 9; lhs != lhsEnd; ++lhs, ++rhs) {
+        if (!isExactlyEqual(*lhs, *rhs)) return false;
+    }
+    return true;
+}
+
+/// @relates Mat3
+/// @brief Inequality operator: the negation of operator== (exact floating point comparisons)
+template <typename T0, typename T1>
+bool operator!=(const Mat3<T0> &m0, const Mat3<T1> &m1) { return !(m0 == m1); }
+
+/// @relates Mat3
+/// @brief Returns M = m * scalar, where \f$M_{i,j} = m_{i,j} * scalar\f$ for \f$i, j \in [0, 2]\f$
+template <typename S, typename T>
+Mat3<typename promote<S, T>::type> operator*(S scalar, const Mat3<T> &m)
+{ return m*scalar; }
+
+/// @relates Mat3
+/// @brief Returns M (of the promoted element type), where \f$M_{i,j} = m_{i,j} * scalar\f$ for \f$i, j \in [0, 2]\f$
+template <typename S, typename T>
+Mat3<typename promote<S, T>::type> operator*(const Mat3<T> &m, S scalar)
+{
+    Mat3<typename promote<S, T>::type> result(m);
+    result *= scalar;
+    return result;
+}
+
+/// @relates Mat3
+/// @brief Returns M (of the promoted element type), where  \f$M_{i,j} = m0_{i,j} + m1_{i,j}\f$ for \f$i, j \in [0, 2]\f$
+template <typename T0, typename T1>
+Mat3<typename promote<T0, T1>::type> operator+(const Mat3<T0> &m0, const Mat3<T1> &m1)
+{
+    Mat3<typename promote<T0, T1>::type> result(m0);
+    result += m1;
+    return result;
+}
+
+/// @relates Mat3
+/// @brief Returns M (of the promoted element type), where  \f$M_{i,j} = m0_{i,j} - m1_{i,j}\f$ for \f$i, j \in [0, 2]\f$
+template <typename T0, typename T1>
+Mat3<typename promote<T0, T1>::type> operator-(const Mat3<T0> &m0, const Mat3<T1> &m1)
+{
+    Mat3<typename promote<T0, T1>::type> result(m0);
+    result -= m1;
+    return result;
+}
+
+
+/// @relates Mat3
+/// @brief Matrix multiplication.
+/// @details Returns M = m0 * m1 (of the promoted element type), where
+///     \f$M_{ij} = \sum_{n=0}^2 m0_{in} * m1_{nj}\f$ for \f$i, j \in [0, 2]\f$
+template <typename T0, typename T1>
+Mat3<typename promote<T0, T1>::type>operator*(const Mat3<T0> &m0, const Mat3<T1> &m1)
+{
+    Mat3<typename promote<T0, T1>::type> result(m0);
+    result *= m1;
+    return result;
+}
+
+/// @relates Mat3
+/// @brief Returns v = m * v (column-vector form), where \f$v_{i} = \sum_{n=0}^2 m_{i,n} * v_n\f$ for \f$i \in [0, 2]\f$
+template<typename T, typename MT>
+inline Vec3<typename promote<T, MT>::type>
+operator*(const Mat3<MT> &_m, const Vec3<T> &_v)
+{
+    MT const *m = _m.asPointer();
+    return Vec3<typename promote<T, MT>::type>(
+        _v[0]*m[0] + _v[1]*m[1] + _v[2]*m[2],
+        _v[0]*m[3] + _v[1]*m[4] + _v[2]*m[5],
+        _v[0]*m[6] + _v[1]*m[7] + _v[2]*m[8]);
+}
+
+/// @relates Mat3
+/// @brief Returns v = v * m (row-vector form), where \f$v_{i} = \sum_{n=0}^2 m_{n,i} * v_n\f$ for \f$i \in [0, 2]\f$
+template<typename T, typename MT>
+inline Vec3<typename promote<T, MT>::type>
+operator*(const Vec3<T> &_v, const Mat3<MT> &_m)
+{
+    MT const *m = _m.asPointer();
+    return Vec3<typename promote<T, MT>::type>(
+        _v[0]*m[0] + _v[1]*m[3] + _v[2]*m[6],
+        _v[0]*m[1] + _v[1]*m[4] + _v[2]*m[7],
+        _v[0]*m[2] + _v[1]*m[5] + _v[2]*m[8]);
+}
+
+/// @relates Mat3
+/// @brief Replaces v by v * m and returns it, i.e. \f$v_{i} = \sum_{n=0}^2 m_{n,i} * v_n\f$ for \f$i \in [0, 2]\f$
+template<typename T, typename MT>
+inline Vec3<T> &operator *= (Vec3<T> &_v, const Mat3<MT> &_m)
+{
+    Vec3<T> mult = _v * _m;
+    _v = mult;
+    return _v;
+}
+
+/// Returns outer product of v1, v2, i.e. v1 v2^T if v1 and v2 are
+/// column vectors, e.g.   M = outerProduct(v1, v2);   so that M(i,j) = v1[i]*v2[j]
+template <typename T>
+Mat3<T> outerProduct(const Vec3<T>& v1, const Vec3<T>& v2)
+{
+    return Mat3<T>(v1[0]*v2[0], v1[0]*v2[1], v1[0]*v2[2], 
+                   v1[1]*v2[0], v1[1]*v2[1], v1[1]*v2[2], 
+                   v1[2]*v2[0], v1[2]*v2[1], v1[2]*v2[2]);
+}// outerProduct
+
+typedef Mat3<float>  Mat3s; ///< 3x3 matrix of floats
+typedef Mat3<double> Mat3d; ///< 3x3 matrix of doubles
+// Mat3f aliases Mat3d when DWREAL_IS_DOUBLE == 1 and Mat3s otherwise.
+#if DWREAL_IS_DOUBLE == 1
+typedef Mat3d    Mat3f;
+#else
+typedef Mat3s    Mat3f;
+#endif // DWREAL_IS_DOUBLE
+
+
+/// Interpolate the rotation between m1 and m2 using Mat::powSolve.
+/// Unlike slerp, translation is not treated independently.
+/// This results in smoother animation results.
+template<typename T, typename T0>
+Mat3<T> powLerp(const Mat3<T0> &m1, const Mat3<T0> &m2, T t)
+{
+    // Direct initialization, so an explicit Mat3 conversion is usable when T != T0.
+    Mat3<T> x(m1.inverse() * m2);
+    powSolve(x, x, t);  // x is passed as both input and output
+    return Mat3<T>(m1 * x);
+}
+
+
+namespace { // file-local helper used by diagonalizeSymmetricMatrix()
+    template<typename T> // one rotation step: zeroes S(i,j), updates D, accumulates the rotation in Q
+    void pivot(int i, int j, Mat3<T>& S, Vec3<T>& D, Mat3<T>& Q)
+    {
+        const int& n = Mat3<T>::size;  // should be 3
+        T temp;
+        /// scratch variables used in pivoting
+        double cotan_of_2_theta;
+        double tan_of_theta;
+        double cosin_of_theta;
+        double sin_of_theta;
+        double z;
+        // Sij is the off-diagonal entry to eliminate; the loops below index S as if i < j.
+        double Sij = S(i,j);
+        // D holds the current diagonal; the rotation angle depends on the gap D[j] - D[i].
+        double Sjj_minus_Sii = D[j] - D[i];
+        // Sij negligible relative to the diagonal gap: use tan(theta) = Sij / gap directly.
+        if (fabs(Sjj_minus_Sii) * (10*math::Tolerance<T>::value()) > fabs(Sij)) {
+            tan_of_theta = Sij / Sjj_minus_Sii;
+        } else {
+            /// pivot on Sij
+            cotan_of_2_theta = 0.5*Sjj_minus_Sii / Sij ;
+            // Both branches give |tan(theta)| <= 1; the sign follows cotan_of_2_theta.
+            if (cotan_of_2_theta < 0.) {
+                tan_of_theta =
+                    -1./(sqrt(1. + cotan_of_2_theta*cotan_of_2_theta) - cotan_of_2_theta);
+            } else {
+                tan_of_theta =
+                    1./(sqrt(1. + cotan_of_2_theta*cotan_of_2_theta) + cotan_of_2_theta);
+            }
+        }
+        // Recover cos/sin from tan, then move z from D[i] to D[j] and clear S(i,j).
+        cosin_of_theta = 1./sqrt( 1. + tan_of_theta * tan_of_theta);
+        sin_of_theta = cosin_of_theta * tan_of_theta;
+        z = tan_of_theta * Sij;
+        S(i,j) = 0;
+        D[i] -= z;
+        D[j] += z;
+        for (int k = 0; k < i; ++k) { // entries above row i: S(k,i) and S(k,j)
+            temp = S(k,i);
+            S(k,i) = cosin_of_theta * temp - sin_of_theta * S(k,j);
+            S(k,j)= sin_of_theta * temp + cosin_of_theta * S(k,j);
+        }
+        for (int k = i+1; k < j; ++k) { // indices strictly between i and j: S(i,k) and S(k,j)
+            temp = S(i,k);
+            S(i,k) = cosin_of_theta * temp - sin_of_theta * S(k,j);
+            S(k,j) = sin_of_theta * temp + cosin_of_theta * S(k,j);
+        }
+        for (int k = j+1; k < n; ++k) { // entries right of column j: S(i,k) and S(j,k)
+            temp = S(i,k);
+            S(i,k) = cosin_of_theta * temp - sin_of_theta * S(j,k);
+            S(j,k) = sin_of_theta * temp + cosin_of_theta * S(j,k);
+        }
+        for (int k = 0; k < n; ++k) // apply the same rotation to columns i and j of Q
+            {
+                temp = Q(k,i);
+                Q(k,i) = cosin_of_theta * temp - sin_of_theta*Q(k,j);
+                Q(k,j) = sin_of_theta * temp + cosin_of_theta*Q(k,j);
+            }
+    }
+}
+
+
+/// @brief Use Jacobi iterations to decompose a symmetric 3x3 matrix
+/// (diagonalize and compute eigenvectors)
+/// @details This is based on the "Efficient numerical diagonalization of Hermitian 3x3 matrices"
+/// Joachim Kopp.  arXiv.org preprint: physics/0610206
+/// with the addition of largest pivot
+/// @return @c true on convergence, @c false if MAX_ITERATIONS pivots were not enough
+template<typename T>
+bool diagonalizeSymmetricMatrix(const Mat3<T>& input, Mat3<T>& Q, Vec3<T>& D,
+    unsigned int MAX_ITERATIONS=250)
+{
+    /// use Givens rotation matrix to eliminate off-diagonal entries.
+    /// initialize the rotation matrix as identity
+    Q  = Mat3<T>::identity();
+    int n = Mat3<T>::size;  // should be 3
+    /// temp matrix.  Assumed to be symmetric
+    Mat3<T> S(input);
+    for (int i = 0; i < n; ++i) {
+        D[i] = S(i,i);
+    }
+
+    unsigned int iterations(0);
+    /// Just iterate over all the non-diagonal entries
+    /// using the largest as a pivot.
+    do {
+        /// check for absolute convergence
+        /// are symmetric off diagonals all zero
+        double er = 0;
+        for (int i = 0; i < n; ++i) {
+            for (int j = i+1; j < n; ++j) {
+                er += fabs(S(i,j));
+            }
+        }
+        if (std::abs(er) < math::Tolerance<T>::value()) {
+            return true;
+        }
+        iterations++;
+
+        T max_element = 0;
+        int ip = 0;
+        int jp = 0;
+        /// loop over all the off-diagonals above the diagonal
+        for (int i = 0; i < n; ++i) {
+            for (int j = i+1; j < n; ++j){
+                if ( fabs(D[i]) * (10*math::Tolerance<T>::value()) > fabs(S(i,j))) {
+                    /// value too small to pivot on
+                    S(i,j) = 0;
+                }
+                if (fabs(S(i,j)) > max_element) {
+                    max_element = fabs(S(i,j));
+                    ip = i;
+                    jp = j;
+                }
+            }
+        }
+        // No pivot candidate: every off-diagonal is now exactly zero (converged) unless NaN
+        // is present (er == er is false only for NaN); pivot(0,0,...) would just corrupt Q.
+        if (ip == jp) return er == er;
+        pivot(ip, jp, S, D, Q);
+    } while (iterations < MAX_ITERATIONS);
+    return false;
+}
+
+} // namespace math
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_MATH_MAT3_H_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/math/Mat4.h b/nuparu/include/openvdb_new/math/Mat4.h
new file mode 100644
index 00000000..a7c2adbc
--- /dev/null
+++ b/nuparu/include/openvdb_new/math/Mat4.h
@@ -0,0 +1,1398 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_MATH_MAT4_H_HAS_BEEN_INCLUDED
+#define OPENVDB_MATH_MAT4_H_HAS_BEEN_INCLUDED
+
+#include <openvdb/Exceptions.h>
+#include <openvdb/Platform.h>
+#include <iomanip>
+#include <assert.h>
+#include <math.h>
+#include <algorithm>
+#include "Math.h"
+#include "Mat3.h"
+#include "Vec3.h"
+#include "Vec4.h"
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace math {
+
+template<typename T> class Vec4;
+
+
+/// @class Mat4 Mat4.h
+/// @brief 4x4 -matrix class.
+template<typename T>
+class Mat4: public Mat<4, T>
+{
+public:
+    /// Data type held by the matrix.
+    typedef T                   value_type;
+    typedef T                   ValueType;
+    typedef Mat<4, T>           MyBase;
+
+    /// Trivial constructor, the matrix is NOT initialized (all 16 elements are left as is)
+    Mat4() {}
+
+    /// Constructor given array of elements, the ordering is in row major form:
+    /** @verbatim
+        a[ 0] a[1]  a[ 2] a[ 3]
+        a[ 4] a[5]  a[ 6] a[ 7]
+        a[ 8] a[9]  a[10] a[11]
+        a[12] a[13] a[14] a[15]
+        @endverbatim */
+    template<typename Source>
+    Mat4(Source *a)
+    {
+        // Exactly 16 elements are read from a, in order; each one is converted
+        // to T implicitly as it is assigned into the flat storage.
+        std::copy(a, a + 16, MyBase::mm);
+    }
+
+    /// Constructor given the 16 elements individually, the ordering is in row major form:
+    /** @verbatim
+        a b c d
+        e f g h
+        i j k l
+        m n o p
+        @endverbatim */
+    template<typename Source>
+    Mat4(Source a, Source b, Source c, Source d,
+         Source e, Source f, Source g, Source h,
+         Source i, Source j, Source k, Source l,
+         Source m, Source n, Source o, Source p)
+    {
+        MyBase::mm[ 0] = T(a);
+        MyBase::mm[ 1] = T(b);
+        MyBase::mm[ 2] = T(c);
+        MyBase::mm[ 3] = T(d);
+
+        MyBase::mm[ 4] = T(e);
+        MyBase::mm[ 5] = T(f);
+        MyBase::mm[ 6] = T(g);
+        MyBase::mm[ 7] = T(h);
+
+        MyBase::mm[ 8] = T(i);
+        MyBase::mm[ 9] = T(j);
+        MyBase::mm[10] = T(k);
+        MyBase::mm[11] = T(l);
+
+        MyBase::mm[12] = T(m);
+        MyBase::mm[13] = T(n);
+        MyBase::mm[14] = T(o);
+        MyBase::mm[15] = T(p);
+    }
+
+    /// Construct matrix from four vectors used as rows (@a rows true, the default,
+    /// kept for historical reasons) or as columns (@a rows false)
+    template<typename Source>
+    Mat4(const Vec4<Source> &v1, const Vec4<Source> &v2,
+         const Vec4<Source> &v3, const Vec4<Source> &v4, bool rows = true)
+    {
+        if (!rows) {
+            this->setColumns(v1, v2, v3, v4);
+            return;
+        }
+        this->setRows(v1, v2, v3, v4);
+    }
+
+    /// Copy constructor from the base class Mat<4, T>: copies all 16 elements, row by row
+    Mat4(const Mat<4, T> &m)
+    {
+        for (int i = 0; i < 4; ++i) {
+            for (int j = 0; j < 4; ++j) {
+                MyBase::mm[i*4 + j] = m[i][j];
+            }
+        }
+    }
+
+    /// Conversion constructor: static_casts each of the 16 elements of @a m to T
+    template<typename Source>
+    explicit Mat4(const Mat4<Source> &m)
+    {
+        const Source* src = m.asPointer();
+        T* dst = MyBase::mm;
+        for (T* const dstEnd = dst + 16; dst != dstEnd; ++dst, ++src) {
+            *dst = static_cast<T>(*src);
+        }
+    }
+
+    /// Predefined constant for identity matrix (returns a reference to the static sIdentity)
+    static const Mat4<T>& identity() {
+        return sIdentity;
+    }
+
+    /// Predefined constant for zero matrix (returns a reference to the static sZero)
+    static const Mat4<T>& zero() {
+        return sZero;
+    }
+
+    /// @brief Overwrite row @a i (expected to be in [0, 3]; not range-checked) with @a v.
+    void setRow(int i, const Vec4<T> &v)
+    {
+        // Row-major storage: row i occupies the four slots starting at flat offset 4*i.
+        T* const rowStart = MyBase::mm + 4 * i;
+        rowStart[0] = v[0];
+        rowStart[1] = v[1];
+        rowStart[2] = v[2];
+        rowStart[3] = v[3];
+    }
+
+    /// Get a copy of the ith row, e.g.    Vec4f v = m.row(1);
+    Vec4<T> row(int i) const
+    {
+        // assert(i>=0 && i<4);
+        return Vec4<T>((*this)(i,0), (*this)(i,1), (*this)(i,2), (*this)(i,3));
+    }
+
+    /// @brief Overwrite column @a j (expected to be in [0, 3]; not range-checked) with @a v.
+    void setCol(int j, const Vec4<T>& v)
+    {
+        T* const colStart = MyBase::mm + j; // consecutive entries of a column are 4 slots apart
+        colStart[ 0] = v[0];
+        colStart[ 4] = v[1];
+        colStart[ 8] = v[2];
+        colStart[12] = v[3];
+    }
+
+    /// @brief Return a copy of column @a j, e.g.    Vec4f v = m.col(0);
+    Vec4<T> col(int j) const
+    {
+        // No range check is performed: j is expected to be in [0, 3].
+        return Vec4<T>(MyBase::mm[j], MyBase::mm[4+j], MyBase::mm[8+j], MyBase::mm[12+j]);
+    }
+
+    //@{
+    /// Array-style access: returns a pointer to the start of row @a i (flat offset i<<2 == 4*i),
+    /// so that a second subscript selects the column, e.g.    m[1][3] = 4;
+    T* operator[](int i) { return &(MyBase::mm[i<<2]); }
+    const T* operator[](int i) const { return &(MyBase::mm[i<<2]); }
+    //@}
+
+    /// Direct access to the internal data: pointer to the flat array of 16 row-major elements
+    T* asPointer() {return MyBase::mm;}
+    const T* asPointer() const {return MyBase::mm;}
+
+    /// @brief Return a mutable reference to the element at row @a i, column @a j,
+    /// e.g.    m(0,0) = 1;
+    /// @note Neither index is range-checked; both are expected to lie in [0, 3].
+    T& operator()(int i, int j)
+    {
+        // Row-major layout: element (i, j) lives at flat offset 4*i + j.
+        T* const rowStart = MyBase::mm + 4 * i;
+        return rowStart[j];
+    }
+
+    /// @brief Return a copy of the element at row @a i, column @a j,
+    /// e.g.    float f = m(1,0);
+    /// @note Neither index is range-checked; both are expected to lie in [0, 3].
+    T operator()(int i, int j) const
+    {
+        // Row-major layout: element (i, j) lives at flat offset 4*i + j.
+        const T* const rowStart = MyBase::mm + 4 * i;
+        return rowStart[j];
+    }
+
+    /// Set the rows of "this" matrix, top to bottom, to the vectors v1, v2, v3, v4
+    void setRows(const Vec4<T> &v1, const Vec4<T> &v2,
+                 const Vec4<T> &v3, const Vec4<T> &v4)
+    {
+        MyBase::mm[ 0] = v1[0]; // row 0 <- v1
+        MyBase::mm[ 1] = v1[1];
+        MyBase::mm[ 2] = v1[2];
+        MyBase::mm[ 3] = v1[3];
+
+        MyBase::mm[ 4] = v2[0]; // row 1 <- v2
+        MyBase::mm[ 5] = v2[1];
+        MyBase::mm[ 6] = v2[2];
+        MyBase::mm[ 7] = v2[3];
+
+        MyBase::mm[ 8] = v3[0]; // row 2 <- v3
+        MyBase::mm[ 9] = v3[1];
+        MyBase::mm[10] = v3[2];
+        MyBase::mm[11] = v3[3];
+
+        MyBase::mm[12] = v4[0]; // row 3 <- v4
+        MyBase::mm[13] = v4[1];
+        MyBase::mm[14] = v4[2];
+        MyBase::mm[15] = v4[3];
+    }
+
+    /// Set the columns of "this" matrix, left to right, to the vectors v1, v2, v3, v4
+    void setColumns(const Vec4<T> &v1, const Vec4<T> &v2,
+                    const Vec4<T> &v3, const Vec4<T> &v4)
+    {
+        MyBase::mm[ 0] = v1[0]; // row 0 gathers component 0 of each vector
+        MyBase::mm[ 1] = v2[0];
+        MyBase::mm[ 2] = v3[0];
+        MyBase::mm[ 3] = v4[0];
+
+        MyBase::mm[ 4] = v1[1]; // row 1 gathers component 1 of each vector
+        MyBase::mm[ 5] = v2[1];
+        MyBase::mm[ 6] = v3[1];
+        MyBase::mm[ 7] = v4[1];
+
+        MyBase::mm[ 8] = v1[2]; // row 2 gathers component 2 of each vector
+        MyBase::mm[ 9] = v2[2];
+        MyBase::mm[10] = v3[2];
+        MyBase::mm[11] = v4[2];
+
+        MyBase::mm[12] = v1[3]; // row 3 gathers component 3 of each vector
+        MyBase::mm[13] = v2[3];
+        MyBase::mm[14] = v3[3];
+        MyBase::mm[15] = v4[3];
+    }
+
+    /// @deprecated Forwards to setRows(): sets the rows of "this" matrix to v1, v2, v3, v4
+    OPENVDB_DEPRECATED void setBasis(const Vec4<T> &v1, const Vec4<T> &v2,
+                                     const Vec4<T> &v3, const Vec4<T> &v4)
+    {
+        this->setRows(v1, v2, v3, v4);
+    }
+
+
+    /// Set "this" matrix to zero (all 16 elements)
+    void setZero()
+    {
+        MyBase::mm[ 0] = 0;
+        MyBase::mm[ 1] = 0;
+        MyBase::mm[ 2] = 0;
+        MyBase::mm[ 3] = 0;
+        MyBase::mm[ 4] = 0;
+        MyBase::mm[ 5] = 0;
+        MyBase::mm[ 6] = 0;
+        MyBase::mm[ 7] = 0;
+        MyBase::mm[ 8] = 0;
+        MyBase::mm[ 9] = 0;
+        MyBase::mm[10] = 0;
+        MyBase::mm[11] = 0;
+        MyBase::mm[12] = 0;
+        MyBase::mm[13] = 0;
+        MyBase::mm[14] = 0;
+        MyBase::mm[15] = 0;
+    }
+
+    /// Set "this" matrix to identity (ones on the diagonal, zeros elsewhere)
+    void setIdentity()
+    {
+        MyBase::mm[ 0] = 1; // (0,0)
+        MyBase::mm[ 1] = 0;
+        MyBase::mm[ 2] = 0;
+        MyBase::mm[ 3] = 0;
+
+        MyBase::mm[ 4] = 0;
+        MyBase::mm[ 5] = 1; // (1,1)
+        MyBase::mm[ 6] = 0;
+        MyBase::mm[ 7] = 0;
+
+        MyBase::mm[ 8] = 0;
+        MyBase::mm[ 9] = 0;
+        MyBase::mm[10] = 1; // (2,2)
+        MyBase::mm[11] = 0;
+
+        MyBase::mm[12] = 0;
+        MyBase::mm[13] = 0;
+        MyBase::mm[14] = 0;
+        MyBase::mm[15] = 1; // (3,3)
+    }
+
+
+    /// @brief Overwrite the upper-left 3x3 block with @a m.
+    /// The last row and the last column are left untouched.
+    void setMat3(const Mat3<T> &m)
+    {
+        for (int row = 0; row < 3; ++row) {
+            for (int col = 0; col < 3; ++col) {
+                MyBase::mm[4 * row + col] = m[row][col];
+            }
+        }
+    }
+
+    /// Return a copy of the upper-left 3x3 block of this matrix
+    Mat3<T> getMat3() const
+    {
+        Mat3<T> upperLeft;
+
+        for (int row = 0; row < 3; ++row) {
+            for (int col = 0; col < 3; ++col) {
+                upperLeft[row][col] = MyBase::mm[4 * row + col];
+            }
+        }
+
+        return upperLeft;
+    }
+
+    /// Return the translation component (the first three entries of the last row)
+    Vec3<T> getTranslation() const
+    {
+        const T* lastRow = &MyBase::mm[12];
+        return Vec3<T>(lastRow[0], lastRow[1], lastRow[2]);
+    }
+
+    /// @brief Store @a t as the translation component, i.e. in the first three
+    /// entries of the last row. No other entry is modified.
+    void setTranslation(const Vec3<T> &t)
+    {
+        for (int i = 0; i < 3; ++i) {
+            MyBase::mm[12 + i] = t[i];
+        }
+    }
+
+    /// @brief Converting assignment: copy all elements of a matrix that may have
+    /// a different element type.
+    template<typename Source>
+    const Mat4& operator=(const Mat4<Source> &m)
+    {
+        const Source *first = m.asPointer();
+        const Source *last = first + this->numElements();
+
+        // std::copy performs plain element conversions, so warnings about
+        // assigning from a different numerical type are deliberately not suppressed
+        std::copy(first, last, MyBase::mm);
+        return *this;
+    }
+
+    /// @brief Element-wise approximate comparison.
+    /// @return true if every entry of "this" is approximately equal to the
+    /// matching entry of @a m, as judged by isApproxEqual with tolerance @a eps
+    bool eq(const Mat4 &m, T eps=1.0e-8) const
+    {
+        int i = 0;
+        // advance while entries agree; stopping early means a mismatch was found
+        while (i < 16 && isApproxEqual(MyBase::mm[i], m.mm[i], eps)) {
+            ++i;
+        }
+        return i == 16;
+    }
+
+    /// Negation operator, e.g. m1 = -m2; returns a matrix with every entry negated
+    Mat4<T> operator-() const
+    {
+        Mat4<T> negated;
+        for (int i = 0; i < 16; ++i) {
+            negated.mm[i] = static_cast<T>(-MyBase::mm[i]);
+        }
+        return negated;
+    }
+
+    /// @brief Multiply every entry by @a scalar in place:
+    /// \f$m_{i,j} *= scalar\f$ for \f$i, j \in [0, 3]\f$
+    /// @return a reference to this matrix
+    template <typename S>
+    const Mat4<T>& operator*=(S scalar)
+    {
+        for (int i = 0; i < 16; ++i) {
+            MyBase::mm[i] *= scalar;
+        }
+        return *this;
+    }
+
+    /// @brief Add @a m1 to this matrix entry by entry:
+    /// \f$m0_{i,j} += m1_{i,j}\f$ for \f$i, j \in [0, 3]\f$
+    /// @return a reference to this matrix
+    template <typename S>
+    const Mat4<T> &operator+=(const Mat4<S> &m1)
+    {
+        const S* rhs = m1.asPointer();
+
+        for (int i = 0; i < 16; ++i) {
+            MyBase::mm[i] += rhs[i];
+        }
+
+        return *this;
+    }
+
+    /// @brief Subtract @a m1 from this matrix entry by entry:
+    /// \f$m0_{i,j} -= m1_{i,j}\f$ for \f$i, j \in [0, 3]\f$
+    /// @return a reference to this matrix
+    template <typename S>
+    const Mat4<T> &operator-=(const Mat4<S> &m1)
+    {
+        const S* rhs = m1.asPointer();
+
+        for (int i = 0; i < 16; ++i) {
+            MyBase::mm[i] -= rhs[i];
+        }
+
+        return *this;
+    }
+
+    /// @brief Multiply this matrix on the right by @a m1, i.e. (*this) = (*this) * m1:
+    /// \f$m_{i,j} = \sum_{k} m0_{i,k}*m1_{k,j}\f$ for \f$i, j \in [0, 3]\f$
+    /// @param m1 right-hand operand; its element type may differ from T
+    /// @return a reference to this matrix
+    /// @note The product is accumulated in a scratch buffer and stored only once
+    /// it is complete, so @a m1 may refer to this very matrix (e.g. m *= m).
+    /// Writing rows in place while @a m1 is still being read would corrupt
+    /// the later rows in that case.
+    template <typename S>
+    const Mat4<T> &operator*=(const Mat4<S> &m1)
+    {
+        const T* s0 = this->asPointer();
+        const S* s1 = m1.asPointer();
+
+        T product[16];
+        for (int i = 0; i < 4; i++) {
+            const int i4 = 4 * i;
+            for (int j = 0; j < 4; j++) {
+                // row i of the left operand times column j of the right operand
+                product[i4+j] = static_cast<T>(s0[i4+0] * s1[j     ] +
+                                               s0[i4+1] * s1[j +  4] +
+                                               s0[i4+2] * s1[j +  8] +
+                                               s0[i4+3] * s1[j + 12]);
+            }
+        }
+
+        for (int n = 0; n < 16; n++) {
+            MyBase::mm[n] = product[n];
+        }
+        return *this;
+    }
+
+    /// @return a new matrix that is the transpose of this one (this matrix is not modified)
+    Mat4 transpose() const
+    {
+        Mat4<T> flipped;
+        for (int row = 0; row < 4; ++row) {
+            for (int col = 0; col < 4; ++col) {
+                flipped.mm[4 * col + row] = MyBase::mm[4 * row + col];
+            }
+        }
+        return flipped;
+    }
+
+
+    /// @return inverse of this
+    /// @param tolerance threshold handed to isApproxEqual when the determinant
+    /// (and, in the perspective case, the scalar h) is compared against zero;
+    /// it is also forwarded to the Gauss-Jordan fallback invert()
+    /// @throw ArithmeticError if singular
+    Mat4 inverse(T tolerance = 0) const
+    {
+        //
+        // inv [ A  | b ]  =  [ E  | f ]    A: 3x3, b: 3x1, c': 1x3 d: 1x1
+        //     [ c' | d ]     [ g' | h ]
+        //
+        // If A is invertible use
+        //
+        //   E  = A^-1 + p*h*r
+        //   p  = A^-1 * b
+        //   f  = -p * h
+        //   g' = -h * r'
+        //   h  = 1 / (d - c'*p)
+        //   r' = c'*A^-1
+        //
+        // Otherwise use gauss-jordan elimination
+        //
+
+        //
+        // We create this alias to ourself so we can easily use own subscript
+        // operator.
+        const Mat4<T>& m(*this);
+
+        // Pairwise products that are reused by detA and by the A^-1 entries below
+        T m0011 = m[0][0] * m[1][1];
+        T m0012 = m[0][0] * m[1][2];
+        T m0110 = m[0][1] * m[1][0];
+        T m0210 = m[0][2] * m[1][0];
+        T m0120 = m[0][1] * m[2][0];
+        T m0220 = m[0][2] * m[2][0];
+
+        // Determinant of the upper-left 3x3 block A
+        T detA = m0011 * m[2][2] - m0012 * m[2][1] - m0110 * m[2][2]
+               + m0210 * m[2][1] + m0120 * m[1][2] - m0220 * m[1][1];
+
+        // The last column is tested exactly against (0, 0, 0, 1); anything else
+        // is treated as a matrix with a perspective part
+        bool hasPerspective =
+                (!isExactlyEqual(m[0][3], T(0.0)) ||
+                 !isExactlyEqual(m[1][3], T(0.0)) ||
+                 !isExactlyEqual(m[2][3], T(0.0)) ||
+                 !isExactlyEqual(m[3][3], T(1.0)));
+
+        T det;
+        if (hasPerspective) {
+            // Full 4x4 determinant, expanded along the last column; the row and
+            // column orderings passed to det3 carry the cofactor signs
+            det = m[0][3] * det3(m, 1,2,3, 0,2,1)
+                + m[1][3] * det3(m, 2,0,3, 0,2,1)
+                + m[2][3] * det3(m, 3,0,1, 0,2,1)
+                + m[3][3] * detA;
+        } else {
+            det = detA * m[3][3];
+        }
+
+        Mat4<T> inv;
+        bool invertible;
+
+        if (isApproxEqual(det,T(0.0),tolerance)) {
+            invertible = false;
+
+        } else if (isApproxEqual(detA,T(0.0),T(1e-8))) {
+            // det is too small to rely on inversion by subblocks
+            invertible = m.invert(inv, tolerance);
+
+        } else {
+            invertible = true;
+            // from here on detA holds the reciprocal of the 3x3 determinant
+            detA = 1.0 / detA;
+
+            //
+            // Calculate A^-1
+            //
+            inv[0][0] = detA * ( m[1][1] * m[2][2] - m[1][2] * m[2][1]);
+            inv[0][1] = detA * (-m[0][1] * m[2][2] + m[0][2] * m[2][1]);
+            inv[0][2] = detA * ( m[0][1] * m[1][2] - m[0][2] * m[1][1]);
+
+            inv[1][0] = detA * (-m[1][0] * m[2][2] + m[1][2] * m[2][0]);
+            inv[1][1] = detA * ( m[0][0] * m[2][2] - m0220);
+            inv[1][2] = detA * ( m0210   - m0012);
+
+            inv[2][0] = detA * ( m[1][0] * m[2][1] - m[1][1] * m[2][0]);
+            inv[2][1] = detA * ( m0120 - m[0][0] * m[2][1]);
+            inv[2][2] = detA * ( m0011 - m0110);
+
+            if (hasPerspective) {
+                //
+                // Calculate r, p, and h
+                //
+                Vec3<T> r;
+                r[0] = m[3][0] * inv[0][0] + m[3][1] * inv[1][0]
+                     + m[3][2] * inv[2][0];
+                r[1] = m[3][0] * inv[0][1] + m[3][1] * inv[1][1]
+                     + m[3][2] * inv[2][1];
+                r[2] = m[3][0] * inv[0][2] + m[3][1] * inv[1][2]
+                     + m[3][2] * inv[2][2];
+
+                Vec3<T> p;
+                p[0] = inv[0][0] * m[0][3] + inv[0][1] * m[1][3]
+                     + inv[0][2] * m[2][3];
+                p[1] = inv[1][0] * m[0][3] + inv[1][1] * m[1][3]
+                     + inv[1][2] * m[2][3];
+                p[2] = inv[2][0] * m[0][3] + inv[2][1] * m[1][3]
+                     + inv[2][2] * m[2][3];
+
+                // h temporarily holds d - c'*p; it is inverted below once it is
+                // known to be non-zero
+                T h = m[3][3] - p.dot(Vec3<T>(m[3][0],m[3][1],m[3][2]));
+                if (isApproxEqual(h,T(0.0),tolerance)) {
+                    invertible = false;
+
+                } else {
+                    h = 1.0 / h;
+
+                    //
+                    // Calculate h, g, and f
+                    //
+                    inv[3][3] = h;
+                    inv[3][0] = -h * r[0];
+                    inv[3][1] = -h * r[1];
+                    inv[3][2] = -h * r[2];
+
+                    inv[0][3] = -h * p[0];
+                    inv[1][3] = -h * p[1];
+                    inv[2][3] = -h * p[2];
+
+                    //
+                    // Calculate E
+                    //
+                    p *= h;
+                    inv[0][0] += p[0] * r[0];
+                    inv[0][1] += p[0] * r[1];
+                    inv[0][2] += p[0] * r[2];
+                    inv[1][0] += p[1] * r[0];
+                    inv[1][1] += p[1] * r[1];
+                    inv[1][2] += p[1] * r[2];
+                    inv[2][0] += p[2] * r[0];
+                    inv[2][1] += p[2] * r[1];
+                    inv[2][2] += p[2] * r[2];
+                }
+            } else {
+                // Equations are much simpler in the non-perspective case
+                inv[3][0] = - (m[3][0] * inv[0][0] + m[3][1] * inv[1][0]
+                                + m[3][2] * inv[2][0]);
+                inv[3][1] = - (m[3][0] * inv[0][1] + m[3][1] * inv[1][1]
+                                + m[3][2] * inv[2][1]);
+                inv[3][2] = - (m[3][0] * inv[0][2] + m[3][1] * inv[1][2]
+                                + m[3][2] * inv[2][2]);
+                inv[0][3] = 0.0;
+                inv[1][3] = 0.0;
+                inv[2][3] = 0.0;
+                inv[3][3] = 1.0;
+            }
+        }
+
+        if (!invertible) OPENVDB_THROW(ArithmeticError, "Inversion of singular 4x4 matrix");
+        return inv;
+    }
+
+
+    /// @brief Determinant of this matrix, computed by cofactor expansion along
+    /// the first row: each first-row entry multiplies the determinant of the 3x3
+    /// minor left after deleting row 0 and that entry's column, with alternating sign.
+    T det() const
+    {
+        T total = 0;
+        int sign = 1;
+
+        for (int skipCol = 0; skipCol < 4; ++skipCol) {
+            // Gather the minor: rows 1..3, every column except skipCol
+            Mat3<T> minor;
+            T* dst = minor.asPointer();
+            for (int row = 1; row < 4; ++row) {
+                for (int col = 0; col < 4; ++col) {
+                    if (col != skipCol) {
+                        *dst++ = MyBase::mm[4 * row + col];
+                    }
+                }
+            }
+
+            total += sign * MyBase::mm[skipCol] * minor.det();
+            sign = -sign;
+        }
+
+        return total;
+    }
+
+    /// @brief Return a new matrix that translates by @a v: identity everywhere
+    /// except the first three entries of the last row, which hold v converted to T.
+    static Mat4 translation(const Vec3d& v)
+    {
+        Mat4 xform;
+        xform.setIdentity();
+        xform.mm[12] = T(v.x());
+        xform.mm[13] = T(v.y());
+        xform.mm[14] = T(v.z());
+        return xform;
+    }
+
+    /// @brief Overwrite this matrix with a pure translation by @a v: identity
+    /// plus v stored in the first three entries of the last row.
+    template <typename T0>
+    void setToTranslation(const Vec3<T0>& v)
+    {
+        this->setIdentity();
+
+        MyBase::mm[12] = v.x();
+        MyBase::mm[13] = v.y();
+        MyBase::mm[14] = v.z();
+    }
+
+    /// Left multiplies by the specified translation, i.e. (*this) = Trans * (*this)
+    template <typename T0>
+    void preTranslate(const Vec3<T0>& tr)
+    {
+        // convert the offset to this matrix's element type before building Trans
+        const Vec3<T> offset(tr.x(), tr.y(), tr.z());
+        *this = Mat4<T>::translation(offset) * (*this);
+    }
+
+    /// Right multiplies by the specified translation matrix, i.e. (*this) = (*this) * Trans
+    template <typename T0>
+    void postTranslate(const Vec3<T0>& tr)
+    {
+        // convert the offset to this matrix's element type before building Trans
+        const Vec3<T> offset(tr.x(), tr.y(), tr.z());
+        *this = (*this) * Mat4<T>::translation(offset);
+    }
+
+
+    /// @brief Overwrite this matrix with a pure scale: identity whose first three
+    /// diagonal entries are replaced by the components of @a v.
+    template <typename T0>
+    void setToScale(const Vec3<T0>& v)
+    {
+        setIdentity();
+
+        T* diag = &MyBase::mm[0];
+        diag[ 0] = v.x();
+        diag[ 5] = v.y();
+        diag[10] = v.z();
+    }
+
+    /// @brief Left multiplies by the specified scale matrix, i.e. Sc * (*this).
+    /// Row 0 is scaled by v.x(), row 1 by v.y() and row 2 by v.z(); the last
+    /// row is left unchanged.
+    template <typename T0>
+    void preScale(const Vec3<T0>& v)
+    {
+        for (int col = 0; col < 4; ++col) {
+            MyBase::mm[     col] *= v.x();
+            MyBase::mm[ 4 + col] *= v.y();
+            MyBase::mm[ 8 + col] *= v.z();
+        }
+    }
+
+
+
+    /// @brief Right multiplies by the specified scale matrix, i.e. (*this) * Sc.
+    /// Column 0 is scaled by v.x(), column 1 by v.y() and column 2 by v.z();
+    /// the last column is left unchanged.
+    template <typename T0>
+    void postScale(const Vec3<T0>& v)
+    {
+        for (int row = 0; row < 4; ++row) {
+            T* entries = &MyBase::mm[4 * row];
+            entries[0] *= v.x();
+            entries[1] *= v.y();
+            entries[2] *= v.z();
+        }
+    }
+
+
+    /// @brief Sets the matrix to a rotation about the given axis.
+    /// @details Overwrites this matrix with the result of rotation<Mat4<T> >(axis, angle).
+    /// @param axis The axis (one of X, Y, Z) to rotate about.
+    /// @param angle The rotation angle, in radians.
+    void setToRotation(Axis axis, T angle) {*this = rotation<Mat4<T> >(axis, angle);}
+
+    /// @brief Sets the matrix to a rotation about an arbitrary axis
+    /// @details Overwrites this matrix with the result of rotation<Mat4<T> >(axis, angle).
+    /// @param axis The axis of rotation (cannot be zero-length)
+    /// @param angle The rotation angle, in radians.
+    void setToRotation(const Vec3<T>& axis, T angle) {*this = rotation<Mat4<T> >(axis, angle);}
+
+    /// @brief Sets the matrix to a rotation that maps v1 onto v2 about the cross
+    /// product of v1 and v2.
+    /// @details Overwrites this matrix with the result of rotation<Mat4<T> >(v1, v2).
+    void setToRotation(const Vec3<T>& v1, const Vec3<T>& v2) {*this = rotation<Mat4<T> >(v1, v2);}
+
+
+    /// @brief Left multiplies by a rotation clockwise about the given axis into this matrix.
+    /// @details Only the two rows other than the rotation axis's own row are
+    /// modified; each is replaced by a cos/sin combination of the pair.
+    /// @param axis The axis (one of X, Y, Z) of rotation.
+    /// @param angle The clock-wise rotation angle, in radians.
+    /// @note Any other @a axis value only trips the assert and leaves the matrix unchanged.
+    void preRotate(Axis axis, T angle)
+    {
+        T c = static_cast<T>(cos(angle));
+        T s = -static_cast<T>(sin(angle)); // the "-" makes it clockwise
+
+        switch (axis) {
+        case X_AXIS:
+            {
+                // Rows 1 and 2 are mixed. The new row 1 is buffered in a4..a7
+                // because the row 2 update still needs the old row 1.
+                T a4, a5, a6, a7;
+
+                a4 = c * MyBase::mm[ 4] - s * MyBase::mm[ 8];
+                a5 = c * MyBase::mm[ 5] - s * MyBase::mm[ 9];
+                a6 = c * MyBase::mm[ 6] - s * MyBase::mm[10];
+                a7 = c * MyBase::mm[ 7] - s * MyBase::mm[11];
+
+
+                MyBase::mm[ 8] = s * MyBase::mm[ 4] + c * MyBase::mm[ 8];
+                MyBase::mm[ 9] = s * MyBase::mm[ 5] + c * MyBase::mm[ 9];
+                MyBase::mm[10] = s * MyBase::mm[ 6] + c * MyBase::mm[10];
+                MyBase::mm[11] = s * MyBase::mm[ 7] + c * MyBase::mm[11];
+
+                MyBase::mm[ 4] = a4;
+                MyBase::mm[ 5] = a5;
+                MyBase::mm[ 6] = a6;
+                MyBase::mm[ 7] = a7;
+            }
+            break;
+
+        case Y_AXIS:
+            {
+                // Rows 0 and 2 are mixed; the new row 0 is buffered in a0..a3
+                // until row 2 has been updated from the old row 0.
+                T a0, a1, a2, a3;
+
+                a0 = c * MyBase::mm[ 0] + s * MyBase::mm[ 8];
+                a1 = c * MyBase::mm[ 1] + s * MyBase::mm[ 9];
+                a2 = c * MyBase::mm[ 2] + s * MyBase::mm[10];
+                a3 = c * MyBase::mm[ 3] + s * MyBase::mm[11];
+
+                MyBase::mm[ 8] = -s * MyBase::mm[ 0] + c * MyBase::mm[ 8];
+                MyBase::mm[ 9] = -s * MyBase::mm[ 1] + c * MyBase::mm[ 9];
+                MyBase::mm[10] = -s * MyBase::mm[ 2] + c * MyBase::mm[10];
+                MyBase::mm[11] = -s * MyBase::mm[ 3] + c * MyBase::mm[11];
+
+
+                MyBase::mm[ 0] = a0;
+                MyBase::mm[ 1] = a1;
+                MyBase::mm[ 2] = a2;
+                MyBase::mm[ 3] = a3;
+            }
+            break;
+
+        case Z_AXIS:
+            {
+                // Rows 0 and 1 are mixed; the new row 0 is buffered in a0..a3
+                // until row 1 has been updated from the old row 0.
+                T a0, a1, a2, a3;
+
+                a0 = c * MyBase::mm[ 0] - s * MyBase::mm[ 4];
+                a1 = c * MyBase::mm[ 1] - s * MyBase::mm[ 5];
+                a2 = c * MyBase::mm[ 2] - s * MyBase::mm[ 6];
+                a3 = c * MyBase::mm[ 3] - s * MyBase::mm[ 7];
+
+                MyBase::mm[ 4] = s * MyBase::mm[ 0] + c * MyBase::mm[ 4];
+                MyBase::mm[ 5] = s * MyBase::mm[ 1] + c * MyBase::mm[ 5];
+                MyBase::mm[ 6] = s * MyBase::mm[ 2] + c * MyBase::mm[ 6];
+                MyBase::mm[ 7] = s * MyBase::mm[ 3] + c * MyBase::mm[ 7];
+
+                MyBase::mm[ 0] = a0;
+                MyBase::mm[ 1] = a1;
+                MyBase::mm[ 2] = a2;
+                MyBase::mm[ 3] = a3;
+            }
+            break;
+
+        default:
+            assert(axis==X_AXIS || axis==Y_AXIS || axis==Z_AXIS);
+        }
+    }
+
+
+    /// @brief Right multiplies by a rotation clockwise about the given axis into this matrix.
+    /// @details Only the two columns other than the rotation axis's own column
+    /// are modified; each is replaced by a cos/sin combination of the pair.
+    /// @param axis The axis (one of X, Y, Z) of rotation.
+    /// @param angle The clock-wise rotation angle, in radians.
+    /// @note Any other @a axis value only trips the assert and leaves the matrix unchanged.
+    void postRotate(Axis axis, T angle)
+    {
+        T c = static_cast<T>(cos(angle));
+        T s = -static_cast<T>(sin(angle)); // the "-" makes it clockwise
+
+
+
+        switch (axis) {
+        case X_AXIS:
+            {
+                // Columns 1 and 2 are mixed. The new column 2 is buffered in
+                // a2..a14 because the column 1 update still needs the old column 2.
+                T a2, a6, a10, a14;
+
+                a2  = c * MyBase::mm[ 2] - s * MyBase::mm[ 1];
+                a6  = c * MyBase::mm[ 6] - s * MyBase::mm[ 5];
+                a10 = c * MyBase::mm[10] - s * MyBase::mm[ 9];
+                a14 = c * MyBase::mm[14] - s * MyBase::mm[13];
+
+
+                MyBase::mm[ 1] = c * MyBase::mm[ 1] + s * MyBase::mm[ 2];
+                MyBase::mm[ 5] = c * MyBase::mm[ 5] + s * MyBase::mm[ 6];
+                MyBase::mm[ 9] = c * MyBase::mm[ 9] + s * MyBase::mm[10];
+                MyBase::mm[13] = c * MyBase::mm[13] + s * MyBase::mm[14];
+
+                MyBase::mm[ 2] = a2;
+                MyBase::mm[ 6] = a6;
+                MyBase::mm[10] = a10;
+                MyBase::mm[14] = a14;
+            }
+            break;
+
+        case Y_AXIS:
+            {
+                // Columns 0 and 2 are mixed; the new column 2 is buffered until
+                // column 0 has been updated from the old column 2.
+                T a2, a6, a10, a14;
+
+                a2  = c * MyBase::mm[ 2] + s * MyBase::mm[ 0];
+                a6  = c * MyBase::mm[ 6] + s * MyBase::mm[ 4];
+                a10 = c * MyBase::mm[10] + s * MyBase::mm[ 8];
+                a14 = c * MyBase::mm[14] + s * MyBase::mm[12];
+
+                MyBase::mm[ 0] = c * MyBase::mm[ 0] - s * MyBase::mm[ 2];
+                MyBase::mm[ 4] = c * MyBase::mm[ 4] - s * MyBase::mm[ 6];
+                MyBase::mm[ 8] = c * MyBase::mm[ 8] - s * MyBase::mm[10];
+                MyBase::mm[12] = c * MyBase::mm[12] - s * MyBase::mm[14];
+
+                MyBase::mm[ 2] = a2;
+                MyBase::mm[ 6] = a6;
+                MyBase::mm[10] = a10;
+                MyBase::mm[14] = a14;
+            }
+            break;
+
+        case Z_AXIS:
+            {
+                // Columns 0 and 1 are mixed; the new column 1 is buffered until
+                // column 0 has been updated from the old column 1.
+                T a1, a5, a9, a13;
+
+                a1  = c * MyBase::mm[ 1] - s * MyBase::mm[ 0];
+                a5  = c * MyBase::mm[ 5] - s * MyBase::mm[ 4];
+                a9  = c * MyBase::mm[ 9] - s * MyBase::mm[ 8];
+                a13 = c * MyBase::mm[13] - s * MyBase::mm[12];
+
+                MyBase::mm[ 0] = c * MyBase::mm[ 0] + s * MyBase::mm[ 1];
+                MyBase::mm[ 4] = c * MyBase::mm[ 4] + s * MyBase::mm[ 5];
+                MyBase::mm[ 8] = c * MyBase::mm[ 8] + s * MyBase::mm[ 9];
+                MyBase::mm[12] = c * MyBase::mm[12] + s * MyBase::mm[13];
+
+                MyBase::mm[ 1] = a1;
+                MyBase::mm[ 5] = a5;
+                MyBase::mm[ 9] = a9;
+                MyBase::mm[13] = a13;
+
+            }
+            break;
+
+        default:
+            assert(axis==X_AXIS || axis==Y_AXIS || axis==Z_AXIS);
+        }
+    }
+
+    /// @brief Sets the matrix to a shear along axis0 by a fraction of axis1.
+    /// @details Overwrites this matrix with the result of
+    /// shear<Mat4<T> >(axis0, axis1, shearby).
+    /// @param axis0 The fixed axis of the shear.
+    /// @param axis1 The shear axis.
+    /// @param shearby The shear factor.
+    void setToShear(Axis axis0, Axis axis1, T shearby)
+    {
+        *this = shear<Mat4<T> >(axis0, axis1, shearby);
+    }
+
+
+    /// @brief Left multiplies a shearing transformation into the matrix:
+    /// row @a axis1 gains @a shear times row @a axis0.
+    /// @see setToShear
+    void preShear(Axis axis0, Axis axis1, T shear)
+    {
+        const int srcRow = static_cast<int>(axis0) * 4;
+        const int dstRow = static_cast<int>(axis1) * 4;
+
+        // add a multiple of the axis0 row to the axis1 row, entry by entry
+        for (int col = 0; col < 4; ++col) {
+            MyBase::mm[dstRow + col] += shear * MyBase::mm[srcRow + col];
+        }
+    }
+
+
+    /// @brief Right multiplies a shearing transformation into the matrix:
+    /// column @a axis0 gains @a shear times column @a axis1.
+    /// @see setToShear
+    void postShear(Axis axis0, Axis axis1, T shear)
+    {
+        const int dstCol = static_cast<int>(axis0);
+        const int srcCol = static_cast<int>(axis1);
+
+        // walk down both columns one row (four entries) at a time
+        for (int rowStart = 0; rowStart < 16; rowStart += 4) {
+            MyBase::mm[dstCol + rowStart] += shear * MyBase::mm[srcCol + rowStart];
+        }
+    }
+
+    /// @brief Transform a Vec4 by post-multiplication.
+    /// @details Evaluates v * (*this) and casts the result to Vec4<T0>.
+    template<typename T0>
+    Vec4<T0> transform(const Vec4<T0> &v) const
+    {
+        return static_cast< Vec4<T0> >(v * *this);
+    }
+
+    /// @brief Transform a Vec3 by post-multiplication, without homogenous division.
+    /// @details Evaluates v * (*this) and casts the result to Vec3<T0>.
+    template<typename T0>
+    Vec3<T0> transform(const Vec3<T0> &v) const
+    {
+        return static_cast< Vec3<T0> >(v * *this);
+    }
+
+    /// @brief Transform a Vec4 by pre-multiplication.
+    /// @details Evaluates (*this) * v and casts the result to Vec4<T0>.
+    template<typename T0>
+    Vec4<T0> pretransform(const Vec4<T0> &v) const
+    {
+        return static_cast< Vec4<T0> >(*this * v);
+    }
+
+    /// @brief Transform a Vec3 by pre-multiplication, without homogenous division.
+    /// @details Evaluates (*this) * v and casts the result to Vec3<T0>.
+    template<typename T0>
+    Vec3<T0> pretransform(const Vec3<T0> &v) const
+    {
+        return static_cast< Vec3<T0> >(*this * v);
+    }
+
+    /// @brief Transform a Vec3 by post-multiplication, doing homogenous division.
+    /// @details @a p is treated as (p, 1); the product's first three components
+    /// are divided by its fourth component w.
+    /// @return the divided point, or (0, 0, 0) when w is exactly zero
+    template<typename T0>
+    Vec3<T0> transformH(const Vec3<T0> &p) const
+    {
+        const T* m = MyBase::mm;
+
+        // w = (p, 1) * (*this).col(3);
+        const T0 w = static_cast<T0>(p[0] * m[ 3] + p[1] * m[ 7]
+            + p[2] * m[11] + m[15]);
+
+        // no finite projection exists when w vanishes
+        if (isExactlyEqual(w , 0.0)) {
+            return Vec3<T0>(0, 0, 0);
+        }
+
+        return Vec3<T0>(static_cast<T0>((p[0] * m[ 0] + p[1] * m[ 4] +
+                                         p[2] * m[ 8] + m[12]) / w),
+                        static_cast<T0>((p[0] * m[ 1] + p[1] * m[ 5] +
+                                         p[2] * m[ 9] + m[13]) / w),
+                        static_cast<T0>((p[0] * m[ 2] + p[1] * m[ 6] +
+                                         p[2] * m[10] + m[14]) / w));
+    }
+
+    /// @brief Transform a Vec3 by pre-multiplication, doing homogenous division.
+    /// @details @a p is treated as the column vector (p, 1); the first three
+    /// components of (*this) * (p, 1) are divided by the fourth component w.
+    /// @return the divided point, or (0, 0, 0) when w is exactly zero
+    template<typename T0>
+    Vec3<T0> pretransformH(const Vec3<T0> &p) const
+    {
+        T0  w;
+
+        // w = (*this).row(3) * (p, 1);
+        // the explicit cast mirrors transformH and avoids an implicit narrowing
+        // conversion when T is wider than T0
+        w = static_cast<T0>(p[0] * MyBase::mm[12] + p[1] * MyBase::mm[13]
+            + p[2] * MyBase::mm[14] + MyBase::mm[15]);
+
+        if ( !isExactlyEqual(w , 0.0) ) {
+            return Vec3<T0>(static_cast<T0>((p[0] * MyBase::mm[ 0] + p[1] * MyBase::mm[ 1] +
+                                             p[2] * MyBase::mm[ 2] + MyBase::mm[ 3]) / w),
+                            static_cast<T0>((p[0] * MyBase::mm[ 4] + p[1] * MyBase::mm[ 5] +
+                                             p[2] * MyBase::mm[ 6] + MyBase::mm[ 7]) / w),
+                            static_cast<T0>((p[0] * MyBase::mm[ 8]  + p[1] * MyBase::mm[ 9] +
+                                             p[2] * MyBase::mm[10] + MyBase::mm[11]) / w));
+        }
+
+        // no finite projection exists when w vanishes
+        return Vec3<T0>(0, 0, 0);
+    }
+
+    /// @brief Transform a Vec3 by post-multiplication, without translation.
+    /// @details Only the upper-left 3x3 block takes part: result = v * block.
+    template<typename T0>
+    Vec3<T0> transform3x3(const Vec3<T0> &v) const
+    {
+        const T* m = MyBase::mm;
+
+        const T0 x = static_cast<T0>(v[0] * m[ 0] + v[1] * m[ 4] + v[2] * m[ 8]);
+        const T0 y = static_cast<T0>(v[0] * m[ 1] + v[1] * m[ 5] + v[2] * m[ 9]);
+        const T0 z = static_cast<T0>(v[0] * m[ 2] + v[1] * m[ 6] + v[2] * m[10]);
+
+        return Vec3<T0>(x, y, z);
+    }
+
+
+private:
+    bool invert(Mat4<T> &inverse, T tolerance) const;
+
+    /// Determinant of the 2x2 sub-matrix of @a a picked out by rows (i0, i1)
+    /// and columns (j0, j1), in that order.
+    T det2(const Mat4<T> &a, int i0, int i1, int j0, int j1) const {
+        const T* upper = &a.mm[i0 * 4];
+        const T* lower = &a.mm[i1 * 4];
+        return upper[j0]*lower[j1] - upper[j1]*lower[j0];
+    }
+
+    /// Determinant of the 3x3 sub-matrix of @a a picked out by rows (i0, i1, i2)
+    /// and columns (j0, j1, j2), expanded along row i0. The cyclic column
+    /// order handed to det2 takes care of the cofactor signs.
+    T det3(const Mat4<T> &a, int i0, int i1, int i2,
+           int j0, int j1, int j2) const {
+        const T* top = &a.mm[i0 * 4];
+        return top[j0]*det2(a, i1,i2, j1,j2) +
+            top[j1]*det2(a, i1,i2, j2,j0) +
+            top[j2]*det2(a, i1,i2, j0,j1);
+    }
+
+    static const Mat4<T> sIdentity;
+    static const Mat4<T> sZero;
+}; // class Mat4
+
+
+/// Out-of-class definition of the static constant Mat4<T>::sIdentity:
+/// ones on the diagonal, zeros elsewhere.
+template <typename T>
+const Mat4<T> Mat4<T>::sIdentity = Mat4<T>(1, 0, 0, 0,
+                                           0, 1, 0, 0,
+                                           0, 0, 1, 0,
+                                           0, 0, 0, 1);
+
+/// Out-of-class definition of the static constant Mat4<T>::sZero:
+/// all sixteen entries are zero.
+template <typename T>
+const Mat4<T> Mat4<T>::sZero = Mat4<T>(0, 0, 0, 0,
+                                       0, 0, 0, 0,
+                                       0, 0, 0, 0,
+                                       0, 0, 0, 0);
+
+/// @relates Mat4
+/// @brief Equality operator, does exact floating point comparisons
+/// @return true only if all 16 entries compare exactly equal
+template <typename T0, typename T1>
+bool operator==(const Mat4<T0> &m0, const Mat4<T1> &m1)
+{
+    const T0 *lhs = m0.asPointer();
+    const T1 *rhs = m1.asPointer();
+
+    int i = 0;
+    // advance while entries agree; stopping early means a mismatch was found
+    while (i < 16 && isExactlyEqual(lhs[i], rhs[i])) {
+        ++i;
+    }
+    return i == 16;
+}
+
+/// @relates Mat4
+/// @brief Inequality operator, does exact floating point comparisons
+/// @details Defined as the negation of operator== for the same operands.
+template <typename T0, typename T1>
+bool operator!=(const Mat4<T0> &m0, const Mat4<T1> &m1) { return !(m0 == m1); }
+
+/// @relates Mat4
+/// @brief Returns M, where \f$M_{i,j} = m_{i,j} * scalar\f$ for \f$i, j \in [0, 3]\f$
+/// @details Scalar-on-the-left form; it forwards to the matrix-times-scalar operator.
+template <typename S, typename T>
+Mat4<typename promote<S, T>::type> operator*(S scalar, const Mat4<T> &m)
+{
+    return m*scalar;
+}
+
+/// @relates Mat4
+/// @brief Returns M, where \f$M_{i,j} = m_{i,j} * scalar\f$ for \f$i, j \in [0, 3]\f$
+/// @details @a m is first converted to the promoted element type, then scaled in place.
+template <typename S, typename T>
+Mat4<typename promote<S, T>::type> operator*(const Mat4<T> &m, S scalar)
+{
+    typedef Mat4<typename promote<S, T>::type> ResultMat;
+
+    ResultMat scaled(m);
+    scaled *= scalar;
+    return scaled;
+}
+
+/// @relates Mat4
+/// @brief Matrix times column vector: returns v, where
+///     \f$v_{i} = \sum_{n=0}^3 m_{i,n} * v_n \f$ for \f$i \in [0, 3]\f$
+template<typename T, typename MT>
+inline Vec4<typename promote<T, MT>::type>
+operator*(const Mat4<MT> &_m,
+          const Vec4<T> &_v)
+{
+    typedef typename promote<T, MT>::type ResultT;
+
+    // each output component is the dot product of one matrix row with _v
+    MT const *row0 = _m.asPointer();
+    MT const *row1 = row0 + 4;
+    MT const *row2 = row0 + 8;
+    MT const *row3 = row0 + 12;
+
+    const ResultT x = _v[0]*row0[0] + _v[1]*row0[1] + _v[2]*row0[2] + _v[3]*row0[3];
+    const ResultT y = _v[0]*row1[0] + _v[1]*row1[1] + _v[2]*row1[2] + _v[3]*row1[3];
+    const ResultT z = _v[0]*row2[0] + _v[1]*row2[1] + _v[2]*row2[2] + _v[3]*row2[3];
+    const ResultT w = _v[0]*row3[0] + _v[1]*row3[1] + _v[2]*row3[2] + _v[3]*row3[3];
+
+    return Vec4<ResultT>(x, y, z, w);
+}
+
+/// @relates Mat4
+/// @brief Row vector times matrix: returns v, where
+///     \f$v_{i} = \sum_{n=0}^3 m_{n,i} * v_n \f$ for \f$i \in [0, 3]\f$
+template<typename T, typename MT>
+inline Vec4<typename promote<T, MT>::type>
+operator*(const Vec4<T> &_v,
+          const Mat4<MT> &_m)
+{
+    typedef typename promote<T, MT>::type ResultT;
+
+    // each output component is the dot product of _v with one matrix column
+    MT const *m = _m.asPointer();
+
+    const ResultT x = _v[0]*m[0] + _v[1]*m[4] + _v[2]*m[8]  + _v[3]*m[12];
+    const ResultT y = _v[0]*m[1] + _v[1]*m[5] + _v[2]*m[9]  + _v[3]*m[13];
+    const ResultT z = _v[0]*m[2] + _v[1]*m[6] + _v[2]*m[10] + _v[3]*m[14];
+    const ResultT w = _v[0]*m[3] + _v[1]*m[7] + _v[2]*m[11] + _v[3]*m[15];
+
+    return Vec4<ResultT>(x, y, z, w);
+}
+
+/// @relates Mat4
+/// @brief Matrix times column vector, with the Vec3 extended by an implicit 1.
+///     Returns v, where
+///     \f$v_{i} = \sum_{n=0}^2 m_{i,n} * v_n + m_{i,3}\f$ for \f$i \in [0, 2]\f$
+/// @note The fourth component of the product is not computed and no
+///     homogeneous division takes place.
+template<typename T, typename MT>
+inline Vec3<typename promote<T, MT>::type>
+operator*(const Mat4<MT> &_m,
+          const Vec3<T> &_v)
+{
+    typedef typename promote<T, MT>::type ResultT;
+
+    MT const *row0 = _m.asPointer();
+    MT const *row1 = row0 + 4;
+    MT const *row2 = row0 + 8;
+
+    const ResultT x = _v[0]*row0[0] + _v[1]*row0[1] + _v[2]*row0[2] + row0[3];
+    const ResultT y = _v[0]*row1[0] + _v[1]*row1[1] + _v[2]*row1[2] + row1[3];
+    const ResultT z = _v[0]*row2[0] + _v[1]*row2[1] + _v[2]*row2[2] + row2[3];
+
+    return Vec3<ResultT>(x, y, z);
+}
+
+/// @relates Mat4
+/// @brief Row vector times matrix, with the Vec3 extended by an implicit 1.
+///     Returns v, where
+///     \f$v_{i} = \sum_{n=0}^2 m_{n,i} * v_n + m_{3,i}\f$ for \f$i \in [0, 2]\f$
+/// @note The fourth component of the product is not computed and no
+///     homogeneous division takes place.
+template<typename T, typename MT>
+inline Vec3<typename promote<T, MT>::type>
+operator*(const Vec3<T> &_v,
+          const Mat4<MT> &_m)
+{
+    typedef typename promote<T, MT>::type ResultT;
+
+    MT const *m = _m.asPointer();
+
+    const ResultT x = _v[0]*m[0] + _v[1]*m[4] + _v[2]*m[8]  + m[12];
+    const ResultT y = _v[0]*m[1] + _v[1]*m[5] + _v[2]*m[9]  + m[13];
+    const ResultT z = _v[0]*m[2] + _v[1]*m[6] + _v[2]*m[10] + m[14];
+
+    return Vec3<ResultT>(x, y, z);
+}
+
+/// @relates Mat4
+/// @brief Returns M, where  \f$M_{i,j} = m0_{i,j} + m1_{i,j}\f$ for \f$i, j \in [0, 3]\f$
+/// @details @a m0 is converted to the promoted element type, then @a m1 is added in place.
+template <typename T0, typename T1>
+Mat4<typename promote<T0, T1>::type>
+operator+(const Mat4<T0> &m0, const Mat4<T1> &m1)
+{
+    typedef Mat4<typename promote<T0, T1>::type> ResultMat;
+
+    ResultMat sum(m0);
+    sum += m1;
+    return sum;
+}
+
+/// @relates Mat4
+/// @brief Component-wise difference of two matrices.
+/// @return M, where \f$M_{i,j} = m0_{i,j} - m1_{i,j}\f$ for \f$i, j \in [0, 3]\f$
+/// @note The result uses the promoted type of @c T0 and @c T1.
+template <typename T0, typename T1>
+Mat4<typename promote<T0, T1>::type>
+operator-(const Mat4<T0> &m0, const Mat4<T1> &m1)
+{
+    typedef Mat4<typename promote<T0, T1>::type> ResultT;
+    // Start from a converted copy of m0, then subtract m1 in place.
+    ResultT difference(m0);
+    difference -= m1;
+    return difference;
+}
+
+/// @relates Mat4
+/// @brief Product of two matrices.
+/// @details Evaluated as a converted copy of @a m0 that is then multiplied
+/// in place by @a m1 through Mat4::operator*=, so the exact element formula
+/// is the one implemented by that operator.
+/// @note NOTE(review): presumably
+///     \f$M_{ij} = \sum_{n=0}^3\left(m0_{in} * m1_{nj}\right)\f$ for \f$i, j \in [0, 3]\f$;
+/// confirm the index order against Mat4::operator*=.
+template <typename T0, typename T1>
+Mat4<typename promote<T0, T1>::type>
+operator*(const Mat4<T0> &m0, const Mat4<T1> &m1)
+{
+    typedef Mat4<typename promote<T0, T1>::type> ResultT;
+    ResultT product(m0);
+    product *= m1;
+    return product;
+}
+
+
+/// Transform a Vec3 by pre-multiplication, without translation.
+/// Presumes this matrix is inverse of coordinate transform
+/// Synonymous to "pretransform3x3"
+///
+/// Only the upper-left 3x3 part of @a m is read: component @c i of the
+/// result is <tt>m[i][0]*n[0] + m[i][1]*n[1] + m[i][2]*n[2]</tt>, cast to @c T1.
+template<typename T0, typename T1>
+Vec3<T1> transformNormal(const Mat4<T0> &m, const Vec3<T1> &n)
+{
+    const T1 x = static_cast<T1>(m[0][0]*n[0] + m[0][1]*n[1] + m[0][2]*n[2]);
+    const T1 y = static_cast<T1>(m[1][0]*n[0] + m[1][1]*n[1] + m[1][2]*n[2]);
+    const T1 z = static_cast<T1>(m[2][0]*n[0] + m[2][1]*n[1] + m[2][2]*n[2]);
+    return Vec3<T1>(x, y, z);
+}
+
+
+/// Invert via gauss-jordan elimination. Modified from dreamworks internal mx library
+///
+/// @param inverse    receives the result; it is reset to the identity first
+///                   and is modified even when the function returns @c false
+/// @param tolerance  the matrix is reported as invertible only if
+///                   det*det >= tolerance*tolerance
+/// @return @c false as soon as a pivot column is exactly zero, otherwise
+/// whether the determinant accumulated during elimination passes the
+/// tolerance test above.
+/// @note This matrix itself is not modified; elimination runs on a copy.
+template<typename T>
+bool Mat4<T>::invert(Mat4<T> &inverse, T tolerance) const
+{
+    // Every row operation applied to temp is mirrored on inverse.
+    Mat4<T> temp(*this);
+    inverse.setIdentity();
+
+    // Forward elimination step
+    // det accumulates the product of the pivots; its sign flips on each row swap.
+    double det = 1.0;
+    for (int i = 0; i < 4; ++i) {
+        // Partial pivoting: find the row at or below i with the largest
+        // absolute value in column i.
+        int row = i;
+        double max = fabs(temp[i][i]);
+
+        for (int k = i+1; k < 4; ++k) {
+            if (fabs(temp[k][i]) > max) {
+                row = k;
+                max = fabs(temp[k][i]);
+            }
+        }
+
+        // No non-zero entry left in this column: the matrix is singular.
+        if (isExactlyEqual(max, 0.0)) return false;
+
+        // must move pivot to row i
+        if (row != i) {
+            det = -det;
+            for (int k = 0; k < 4; ++k) {
+                std::swap(temp[row][k], temp[i][k]);
+                std::swap(inverse[row][k], inverse[i][k]);
+            }
+        }
+
+        double pivot = temp[i][i];
+        det *= pivot;
+
+        // scale row i
+        for (int k = 0; k < 4; ++k) {
+            temp[i][k] /= pivot;
+            inverse[i][k] /= pivot;
+        }
+
+        // eliminate in rows below i
+        for (int j = i+1; j < 4; ++j) {
+            double t = temp[j][i];
+            if (!isExactlyEqual(t, 0.0)) {
+                // subtract scaled row i from row j
+                for (int k = 0; k < 4; ++k) {
+                    temp[j][k] -= temp[i][k] * t;
+                    inverse[j][k] -= inverse[i][k] * t;
+                }
+            }
+        }
+    }
+
+    // Backward elimination step
+    // Clears the entries above the diagonal, working from the last column
+    // back to the second. Only inverse is updated here; temp is just read.
+    for (int i = 3; i > 0; --i) {
+        for (int j = 0; j < i; ++j) {
+            double t = temp[j][i];
+
+            if (!isExactlyEqual(t, 0.0)) {
+                for (int k = 0; k < 4; ++k) {
+                    inverse[j][k] -= inverse[i][k]*t;
+                }
+            }
+        }
+    }
+    // Compare squares so that the signs of det and tolerance do not matter.
+    return det*det >= tolerance*tolerance;
+}
+
+/// Return @c true if column 3 of @a m is exactly (0, 0, 0, 1).
+template <typename T>
+inline bool isAffine(const Mat4<T>& m)
+{
+    const Vec4<T> affineColumn(0, 0, 0, 1);
+    return m.col(3) == affineColumn;
+}
+
+/// Return @c true if row 3 of @a m differs from (0, 0, 0, 1).
+template <typename T>
+inline bool hasTranslation(const Mat4<T>& m)
+{
+    const Vec4<T> untranslatedRow(0, 0, 0, 1);
+    return m.row(3) != untranslatedRow;
+}
+
+
+// Convenience names for the two floating-point instantiations of Mat4.
+typedef Mat4<float>  Mat4s;
+typedef Mat4<double> Mat4d;
+
+// Mat4f is the double-precision matrix when DWREAL_IS_DOUBLE is defined as 1,
+// and the single-precision matrix otherwise.
+#if DWREAL_IS_DOUBLE == 1
+typedef Mat4d    Mat4f;
+#else
+typedef Mat4s    Mat4f;
+#endif // DWREAL_IS_DOUBLE
+
+} // namespace math
+
+
+/// The value returned as "zero" for a single-precision Mat4 is the identity matrix.
+template<>
+inline math::Mat4s zeroVal<math::Mat4s>()
+{
+    return math::Mat4s::identity();
+}
+/// The value returned as "zero" for a double-precision Mat4 is the identity matrix.
+template<>
+inline math::Mat4d zeroVal<math::Mat4d>()
+{
+    return math::Mat4d::identity();
+}
+
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_UTIL_MAT4_H_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/math/Math.h b/nuparu/include/openvdb_new/math/Math.h
new file mode 100644
index 00000000..c2213295
--- /dev/null
+++ b/nuparu/include/openvdb_new/math/Math.h
@@ -0,0 +1,912 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file Math.h
+/// @brief General-purpose arithmetic and comparison routines, most of which
+/// accept arbitrary value types (or at least arbitrary numeric value types)
+
+#ifndef OPENVDB_MATH_HAS_BEEN_INCLUDED
+#define OPENVDB_MATH_HAS_BEEN_INCLUDED
+
+#include <assert.h>
+#include <algorithm> // for std::max()
+#include <cmath>     // for floor(), ceil() and sqrt()
+#include <math.h>    // for pow(), fabs() etc
+#include <cstdlib>   // for srand(), abs(int)
+#include <cstring>   // for std::memcpy()
+#include <limits>    // for std::numeric_limits<Type>::max()
+#include <string>
+#include <boost/numeric/conversion/conversion_traits.hpp>
+#include <boost/math/special_functions/cbrt.hpp>
+#include <boost/math/special_functions/fpclassify.hpp> // boost::math::isfinite
+#include <boost/random/mersenne_twister.hpp> // for boost::random::mt19937
+#include <boost/random/uniform_01.hpp>
+#include <boost/random/uniform_int.hpp>
+#include <boost/version.hpp> // for BOOST_VERSION
+#include <openvdb/Platform.h>
+#include <openvdb/version.h>
+
+
+// Compile pragmas
+
+// Stringize the argument and hand it to the _Pragma operator, so that a
+// pragma can be emitted from inside another macro definition.
+#define PRAGMA(x) _Pragma(#x)
+
+// OPENVDB_NO_FP_EQUALITY_WARNING_BEGIN and OPENVDB_NO_FP_EQUALITY_WARNING_END
+// bracket code that deliberately compares floating-point values with == or !=.
+// They silence the matching diagnostic on the Intel compiler and on clang,
+// and expand to nothing on every other compiler.
+//
+// Intel(r) compiler fires remark #1572: floating-point equality and inequality
+// comparisons are unreliable when == or != is used with floating point operands.
+#if defined(__INTEL_COMPILER)
+    #define OPENVDB_NO_FP_EQUALITY_WARNING_BEGIN \
+        _Pragma("warning (push)")    \
+        _Pragma("warning (disable:1572)")
+    #define OPENVDB_NO_FP_EQUALITY_WARNING_END \
+        _Pragma("warning (pop)")
+#elif defined(__clang__)
+    #define OPENVDB_NO_FP_EQUALITY_WARNING_BEGIN \
+        PRAGMA(clang diagnostic push) \
+        PRAGMA(clang diagnostic ignored "-Wfloat-equal")
+    #define OPENVDB_NO_FP_EQUALITY_WARNING_END \
+        PRAGMA(clang diagnostic pop)
+#else
+    // For GCC, #pragma GCC diagnostic ignored "-Wfloat-equal"
+    // does not work before gcc 4.2.
+    // Trying
+    // #pragma GCC system_header
+    // instead creates other problems, most notably "warning: will never be
+    // executed" reported from templates; it is unclear how to work around that.
+    // If necessary, could use integer based comparisons for equality
+    #define OPENVDB_NO_FP_EQUALITY_WARNING_BEGIN
+    #define OPENVDB_NO_FP_EQUALITY_WARNING_END
+#endif
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+
+/// @brief Return the value of type T that corresponds to zero.
+/// @details The generic version builds the value as @c T(0).
+/// @note A zeroVal<T>() specialization must be defined for each @c ValueType T
+/// that cannot be constructed using the form @c T(0).  For example, @c std::string(0)
+/// treats 0 as @c NULL and throws a @c std::logic_error.
+template<typename T>
+inline T zeroVal()
+{
+    return T(0);
+}
+/// Return the @c std::string value that corresponds to zero (the empty string).
+template<>
+inline std::string zeroVal<std::string>()
+{
+    return std::string();
+}
+/// Return the @c bool value that corresponds to zero (@c false).
+template<>
+inline bool zeroVal<bool>()
+{
+    return false;
+}
+
+/// @todo These won't be needed if we eliminate StringGrids.
+//@{
+/// @brief Needed to support the <tt>(zeroVal<ValueType>() + val)</tt> idiom
+/// when @c ValueType is @c std::string
+/// @details The numeric or boolean operand is ignored and the string is
+/// returned unchanged.
+inline std::string operator+(const std::string& s, bool)
+{
+    return s;
+}
+inline std::string operator+(const std::string& s, int)
+{
+    return s;
+}
+inline std::string operator+(const std::string& s, float)
+{
+    return s;
+}
+inline std::string operator+(const std::string& s, double)
+{
+    return s;
+}
+//@}
+
+
+namespace math {
+
+/// @brief Return the unary negation of the given value, converted back to @c T.
+/// @note A negative<T>() specialization must be defined for each ValueType T
+/// for which unary negation is not defined.
+template<typename T>
+inline T negative(const T& val)
+{
+    return T(-val);
+}
+/// Return the negation of the given boolean, i.e., its logical complement.
+template<>
+inline bool negative(const bool& val)
+{
+    return !val;
+}
+/// Return the "negation" of the given string, which is the string itself.
+template<>
+inline std::string negative(const std::string& val)
+{
+    return val;
+}
+
+
+//@{
+/// Tolerance for floating-point comparison.
+/// The generic version returns zeroVal<T>(); @c float uses 1e-8 and
+/// @c double uses 1e-15.
+template<typename T>
+struct Tolerance
+{
+    static T value() { return zeroVal<T>(); }
+};
+template<>
+struct Tolerance<float>
+{
+    static float value() { return 1e-8f; }
+};
+template<>
+struct Tolerance<double>
+{
+    static double value() { return 1e-15; }
+};
+//@}
+
+//@{
+/// Delta for small floating-point offsets.
+/// The generic version returns zeroVal<T>(); @c float uses 1e-5 and
+/// @c double uses 1e-9.
+template<typename T>
+struct Delta
+{
+    static T value() { return zeroVal<T>(); }
+};
+template<>
+struct Delta<float>
+{
+    static float value() { return  1e-5f; }
+};
+template<>
+struct Delta<double>
+{
+    static double value() { return 1e-9; }
+};
+//@}
+
+
+// ==========> Random Values <==================
+
+/// @brief Simple generator of random numbers over the range [0, 1)
+/// @details Thread-safe as long as each thread has its own Rand01 instance
+template<typename FloatType = double, typename EngineType = boost::mt19937>
+class Rand01
+{
+public:
+    typedef FloatType ValueType;
+
+    /// @brief Initialize the generator with a copy of an existing engine.
+    /// @param engine  random number generator
+    Rand01(const EngineType& engine): mEngine(engine) {}
+
+    /// @brief Initialize the generator from a seed.
+    /// @param seed  seed value for the random number generator
+    Rand01(unsigned int seed): mEngine(static_cast<SeedType>(seed)) {}
+
+    /// Re-seed the random number generator.
+    void setSeed(unsigned int seed)
+    {
+        mEngine.seed(static_cast<SeedType>(seed));
+    }
+
+    /// Return a const reference to the random number generator.
+    const EngineType& engine() const { return mEngine; }
+
+    /// Return a uniformly distributed random number in the range [0, 1).
+    FloatType operator()() { return mRand(mEngine); }
+
+private:
+    /// Integer type the engine accepts as a seed.
+    typedef typename EngineType::result_type SeedType;
+
+    EngineType mEngine;
+    boost::uniform_01<FloatType> mRand;
+};
+
+typedef Rand01<double, boost::mt19937> Random01;
+
+
+/// @brief Simple random integer generator
+/// @details Thread-safe as long as each thread has its own RandInt instance
+template<typename IntType = int, typename EngineType = boost::mt19937>
+class RandInt
+{
+private:
+    // Boost 1.47 and later provide uniform_int_distribution; older releases
+    // only have uniform_int.
+#if BOOST_VERSION >= 104700
+    typedef boost::random::uniform_int_distribution<IntType> Distr;
+#else
+    typedef boost::uniform_int<IntType> Distr;
+#endif
+    /// Integer type the engine accepts as a seed.
+    typedef typename EngineType::result_type SeedType;
+
+    EngineType mEngine;
+    Distr mRand;
+
+public:
+    /// @brief Initialize the generator with a copy of an existing engine.
+    /// @param engine     random number generator
+    /// @param imin,imax  generate integers that are uniformly distributed over [imin, imax]
+    /// @note The two bounds may be given in either order.
+    RandInt(const EngineType& engine, IntType imin, IntType imax):
+        mEngine(engine),
+        mRand(std::min(imin, imax), std::max(imin, imax))
+    {}
+
+    /// @brief Initialize the generator from a seed.
+    /// @param seed       seed value for the random number generator
+    /// @param imin,imax  generate integers that are uniformly distributed over [imin, imax]
+    /// @note The two bounds may be given in either order.
+    RandInt(unsigned int seed, IntType imin, IntType imax):
+        mEngine(static_cast<SeedType>(seed)),
+        mRand(std::min(imin, imax), std::max(imin, imax))
+    {}
+
+    /// Change the range over which integers are distributed to [imin, imax].
+    void setRange(IntType imin, IntType imax)
+    {
+        const IntType lo = std::min(imin, imax);
+        const IntType hi = std::max(imin, imax);
+        mRand = Distr(lo, hi);
+    }
+
+    /// Re-seed the random number generator.
+    void setSeed(unsigned int seed)
+    {
+        mEngine.seed(static_cast<SeedType>(seed));
+    }
+
+    /// Return a const reference to the random number generator.
+    const EngineType& engine() const { return mEngine; }
+
+    /// Return a randomly-generated integer in the current range.
+    IntType operator()() { return mRand(mEngine); }
+
+    /// @brief Return a randomly-generated integer in the new range [imin, imax],
+    /// without changing the current range.
+    IntType operator()(IntType imin, IntType imax)
+    {
+        const IntType lo = std::min(imin, imax);
+        const IntType hi = std::max(imin, imax);
+#if BOOST_VERSION >= 104700
+        return mRand(mEngine, typename Distr::param_type(lo, hi));
+#else
+        return Distr(lo, hi)(mEngine);
+#endif
+    }
+};
+
+typedef RandInt<int, boost::mt19937> RandomInt;
+
+
+// ==========> Clamp <==================
+
+/// Return @a x clamped to [@a min, @a max]
+/// @note Asserts that @a min is not greater than @a max.
+template<typename Type>
+inline Type
+Clamp(Type x, Type min, Type max)
+{
+    assert( !(min>max) );
+    // Comparisons are written in negated form so that a value which compares
+    // false against everything takes the same path as in a nested ?: chain.
+    if (!(x > min)) return min;
+    if (!(x < max)) return max;
+    return x;
+}
+
+
+/// Return @a x clamped to [0, 1]
+template<typename Type>
+inline Type
+Clamp01(Type x)
+{
+    if (!(x > Type(0))) return Type(0);
+    if (!(x < Type(1))) return Type(1);
+    return x;
+}
+
+
+/// @brief Clamp @a x to [0, 1] in place.
+/// @return @c true if @a x was outside [0,1] (and has therefore been
+/// modified), @c false if it was already inside.
+template<typename Type>
+inline bool
+ClampTest01(Type &x)
+{
+    const bool inside = (x >= Type(0)) && (x <= Type(1));
+    if (!inside) {
+        // Below the range snaps to 0; anything else outside snaps to 1.
+        x = (x < Type(0)) ? Type(0) : Type(1);
+    }
+    return !inside;
+}
+
+/// @brief Return 0 if @a x is not greater than 0, 1 if @a x is not less
+/// than 1, or else @f$(3-2x)x^2@f$.
+template<typename Type>
+inline Type
+SmoothUnitStep(Type x)
+{
+    if (!(x > 0)) return Type(0);
+    if (!(x < 1)) return Type(1);
+    return (3-2*x)*x*x;
+}
+
+/// @brief Return the one-argument SmoothUnitStep() of @f$t = (x-min)/(max-min)@f$,
+/// i.e., 0 below @a min, 1 above @a max and @f$(3-2t)t^2@f$ in between.
+/// @note Asserts that @a min is strictly less than @a max.
+template<typename Type>
+inline Type
+SmoothUnitStep(Type x, Type min, Type max)
+{
+    assert(min < max);
+    return SmoothUnitStep((x-min)/(max-min));
+}
+
+
+// ==========> Absolute Value <==================
+
+
+//@{
+/// Return the absolute value of the given quantity.
+/// Unsigned and boolean arguments are returned unchanged.
+inline int32_t Abs(int32_t i) { return abs(i); }
+inline int64_t Abs(int64_t i)
+{
+    // labs() is deliberately not used on any platform: it takes a long,
+    // which is only 32 bits wide on LLP64 (e.g. Windows) and ILP32 targets,
+    // so a 64-bit argument would be truncated there.
+    return (i < int64_t(0) ? -i : i);
+}
+inline float Abs(float x) { return fabsf(x); }
+inline double Abs(double x) { return fabs(x); }
+inline long double Abs(long double x) { return fabsl(x); }
+inline uint32_t Abs(uint32_t i) { return i; }
+inline uint64_t Abs(uint64_t i) { return i; }
+inline bool Abs(bool b) { return b; }
+// On OSX size_t and uint64_t are different types
+#if defined(__APPLE__) || defined(MACOSX)
+inline size_t Abs(size_t i) { return i; }
+#endif
+//@}
+
+
+////////////////////////////////////////
+
+
+// ==========> Value Comparison <==================
+
+
+/// Return @c true if @a x compares exactly equal to zeroVal<Type>().
+template<typename Type>
+inline bool
+isZero(const Type& x)
+{
+    // The exact comparison is intentional, so the floating-point equality
+    // warning is suppressed around it.
+    OPENVDB_NO_FP_EQUALITY_WARNING_BEGIN
+    const bool exactlyZero = (x == zeroVal<Type>());
+    OPENVDB_NO_FP_EQUALITY_WARNING_END
+    return exactlyZero;
+}
+
+
+/// @brief Return @c true if @a x is equal to zero to within
+/// the default floating-point comparison tolerance, Tolerance<Type>::value().
+template<typename Type>
+inline bool
+isApproxZero(const Type& x)
+{
+    const Type tolerance = Type(zeroVal<Type>() + Tolerance<Type>::value());
+    if (x > tolerance) return false;
+    if (x < -tolerance) return false;
+    return true;
+}
+
+/// Return @c true if @a x is equal to zero to within the given tolerance,
+/// i.e., if @a x is neither greater than @a tolerance nor less than -@a tolerance.
+template<typename Type>
+inline bool
+isApproxZero(const Type& x, const Type& tolerance)
+{
+    if (x > tolerance) return false;
+    if (x < -tolerance) return false;
+    return true;
+}
+
+
+/// Return @c true if @a x is less than zeroVal<Type>().
+template<typename Type>
+inline bool
+isNegative(const Type& x)
+{
+    const Type zero = zeroVal<Type>();
+    return x < zero;
+}
+
+/// Return @c false, since @c bool values are never less than zero.
+template<>
+inline bool isNegative<bool>(const bool&)
+{
+    return false;
+}
+
+
+/// Return @c true if @a x is finite, as reported by boost::math::isfinite().
+template<typename Type>
+inline bool
+isFinite(const Type& x)
+{
+    return boost::math::isfinite(x);
+}
+
+
+/// @brief Return @c true if @a a is equal to @a b to within
+/// the default floating-point comparison tolerance, Tolerance<Type>::value().
+template<typename Type>
+inline bool
+isApproxEqual(const Type& a, const Type& b)
+{
+    const Type tolerance = Type(zeroVal<Type>() + Tolerance<Type>::value());
+    if (Abs(a - b) > tolerance) return false;
+    return true;
+}
+
+
+/// Return @c true if @a a is equal to @a b to within the given tolerance,
+/// i.e., if the absolute difference does not exceed @a tolerance.
+template<typename Type>
+inline bool
+isApproxEqual(const Type& a, const Type& b, const Type& tolerance)
+{
+    if (Abs(a - b) > tolerance) return false;
+    return true;
+}
+
+// Define both isApproxEqual() overloads for type T as exact comparisons
+// (a == b); the tolerance argument of the three-argument form is ignored.
+// The trailing empty comment lets the last definition line end with a
+// line continuation.
+#define OPENVDB_EXACT_IS_APPROX_EQUAL(T) \
+    template<> inline bool isApproxEqual<T>(const T& a, const T& b) { return a == b; } \
+    template<> inline bool isApproxEqual<T>(const T& a, const T& b, const T&) { return a == b; } \
+    /**/
+
+// bool and std::string are compared exactly.
+OPENVDB_EXACT_IS_APPROX_EQUAL(bool)
+OPENVDB_EXACT_IS_APPROX_EQUAL(std::string)
+
+
+/// @brief Return @c true if @a a is larger than @a b to within
+/// the given tolerance, i.e., if @a b - @a a < @a tolerance.
+/// @note With a positive tolerance this is also true when @a a is smaller
+/// than @a b by less than @a tolerance.
+template<typename Type>
+inline bool
+isApproxLarger(const Type& a, const Type& b, const Type& tolerance)
+{
+    return ((b - a) < tolerance);
+}
+
+
+/// @brief Return @c true if @a a is exactly equal to @a b (@c a == b).
+/// @details The two arguments may have different types.
+template<typename T0, typename T1>
+inline bool
+isExactlyEqual(const T0& a, const T1& b)
+{
+    // The exact comparison is intentional, so the floating-point equality
+    // warning is suppressed around it.
+    OPENVDB_NO_FP_EQUALITY_WARNING_BEGIN
+    const bool identical = (a == b);
+    OPENVDB_NO_FP_EQUALITY_WARNING_END
+    return identical;
+}
+
+
+/// @brief Return @c true if @a a and @a b agree to within either the
+/// absolute tolerance @a absTol or the relative tolerance @a relTol.
+/// @details The relative error is the absolute difference divided by
+/// whichever of @a a and @a b has the larger magnitude.
+template<typename Type>
+inline bool
+isRelOrApproxEqual(const Type& a, const Type& b, const Type& absTol, const Type& relTol)
+{
+    // Absolute test first: this is what handles numbers close to 0.
+    if (!(Abs(a - b) > absTol)) return true;
+
+    // Relative test for large numbers that miss the absolute tolerance
+    // but could still be the closest floating point representation.
+    const Type& denominator = (Abs(b) > Abs(a)) ? b : a;
+    const double relError = Abs((a - b) / denominator);
+    return (relError <= relTol);
+}
+
+/// Booleans are compared exactly; both tolerances are ignored.
+template<>
+inline bool
+isRelOrApproxEqual(const bool& a, const bool& b, const bool&, const bool&)
+{
+    return (a == b);
+}
+
+
+// Return the bit pattern of a float as a signed 32-bit integer.
+// The bytes are copied with std::memcpy() rather than read through a
+// reinterpret_cast'ed pointer, which keeps the conversion well defined
+// under the strict aliasing rules:
+// http://cellperformance.beyond3d.com/articles/2006/06/understanding-strict-aliasing.html
+// NOTE(review): assumes sizeof(float) == sizeof(int32_t).
+inline int32_t
+floatToInt32(const float aFloatValue)
+{
+    int32_t bits = 0;
+    std::memcpy(&bits, &aFloatValue, sizeof(bits));
+    return bits;
+}
+
+
+// Return the bit pattern of a double as a signed 64-bit integer.
+// See floatToInt32() for why std::memcpy() is used.
+// NOTE(review): assumes sizeof(double) == sizeof(int64_t).
+inline int64_t
+doubleToInt64(const double aDoubleValue)
+{
+    int64_t bits = 0;
+    std::memcpy(&bits, &aDoubleValue, sizeof(bits));
+    return bits;
+}
+
+
+// aUnitsInLastPlace is the allowed difference between the least significant digits
+// of the numbers' floating point representation
+// Please read the reference paper before trying to use isUlpsEqual
+// http://www.cygnus-software.com/papers/comparingfloats/comparingfloats.htm
+//
+// Returns true if the two values are at most aUnitsInLastPlace representable
+// values apart. A negative aUnitsInLastPlace always yields false.
+inline bool
+isUlpsEqual(const double aLeft, const double aRight, const int64_t aUnitsInLastPlace)
+{
+    // The distance computed below is never negative.
+    if (aUnitsInLastPlace < 0) return false;
+
+    int64_t longLeft = doubleToInt64(aLeft);
+    // Because of 2's complement, must restore lexicographical order.
+    // (min - value) maps the negative patterns onto [min+1, 0] without overflow.
+    if (longLeft < 0) {
+        longLeft = std::numeric_limits<int64_t>::min() - longLeft;
+    }
+
+    int64_t longRight = doubleToInt64(aRight);
+    // Because of 2's complement, must restore lexicographical order
+    if (longRight < 0) {
+        longRight = std::numeric_limits<int64_t>::min() - longRight;
+    }
+
+    // The distance between two values of opposite sign can exceed the int64_t
+    // range, so it is computed with modular unsigned arithmetic instead of
+    // labs(longLeft - longRight), which could overflow (and truncates where
+    // long is 32 bits wide).
+    const uint64_t difference = (longLeft > longRight)
+        ? uint64_t(longLeft) - uint64_t(longRight)
+        : uint64_t(longRight) - uint64_t(longLeft);
+    return (difference <= uint64_t(aUnitsInLastPlace));
+}
+
+// Single-precision version of the test above.
+inline bool
+isUlpsEqual(const float aLeft, const float aRight, const int32_t aUnitsInLastPlace)
+{
+    int32_t intLeft = floatToInt32(aLeft);
+    // Because of 2's complement, must restore lexicographical order.
+    // (min - value) maps the negative patterns onto [min+1, 0] without overflow.
+    if (intLeft < 0) {
+        intLeft = std::numeric_limits<int32_t>::min() - intLeft;
+    }
+
+    int32_t intRight = floatToInt32(aRight);
+    // Because of 2's complement, must restore lexicographical order
+    if (intRight < 0) {
+        intRight = std::numeric_limits<int32_t>::min() - intRight;
+    }
+
+    // Subtract in 64 bits: the distance between two values of opposite sign
+    // does not always fit in an int32_t.
+    int64_t difference = int64_t(intLeft) - int64_t(intRight);
+    if (difference < 0) difference = -difference;
+    return (difference <= int64_t(aUnitsInLastPlace));
+}
+
+
+////////////////////////////////////////
+
+
+// ==========> Pow <==================
+
+/// Return @f$ x^2 @f$.
+template<typename Type>
+inline Type Pow2(Type x)
+{
+    return x*x;
+}
+
+/// Return @f$ x^3 @f$.
+template<typename Type>
+inline Type Pow3(Type x)
+{
+    return x*x*x;
+}
+
+/// Return @f$ x^4 @f$, computed as the square of the square of @a x.
+template<typename Type>
+inline Type Pow4(Type x)
+{
+    const Type squared = x*x;
+    return squared*squared;
+}
+
+/// Return @f$ x^n @f$.
+/// @details Computed by multiplying @a x together |n| times; a negative
+/// exponent first replaces @a x by <tt>Type(1)/x</tt>.
+template<typename Type>
+Type
+Pow(Type x, int n)
+{
+    if (n < 0) {
+        x = Type(1)/x;
+        n = -n;
+    }
+    Type ans = 1;
+    for (; n != 0; --n) {
+        ans *= x;
+    }
+    return ans;
+}
+
+//@{
+/// Return @f$ b^e @f$.
+/// @note Asserts that the base @a b is not negative.
+inline float
+Pow(float b, float e)
+{
+    assert( b >= 0.0f && "Pow(float,float): base is negative" );
+    return std::pow(b, e);
+}
+
+inline double
+Pow(double b, double e)
+{
+    assert( b >= 0.0 && "Pow(double,double): base is negative" );
+    return std::pow(b, e);
+}
+//@}
+
+
+// ==========> Max <==================
+
+/// Return the maximum of two values
+/// @note A reference to one of the arguments is returned; when the two
+/// compare equal it is the first one.
+template<typename Type>
+inline const Type&
+Max(const Type& a, const Type& b)
+{
+    return (a < b) ? b : a;
+}
+
+/// Return the maximum of three values
+template<typename Type>
+inline const Type&
+Max(const Type& a, const Type& b, const Type& c)
+{
+    return Max(Max(a, b), c);
+}
+
+/// Return the maximum of four values
+template<typename Type>
+inline const Type&
+Max(const Type& a, const Type& b, const Type& c, const Type& d)
+{
+    return Max(Max(a, b), Max(c, d));
+}
+
+/// Return the maximum of five values
+template<typename Type>
+inline const Type&
+Max(const Type& a, const Type& b, const Type& c, const Type& d, const Type& e)
+{
+    return Max(Max(a, b), Max(c, d, e));
+}
+
+/// Return the maximum of six values
+template<typename Type>
+inline const Type&
+Max(const Type& a, const Type& b, const Type& c, const Type& d, const Type& e, const Type& f)
+{
+    return Max(Max(a, b, c), Max(d, e, f));
+}
+
+/// Return the maximum of seven values
+template<typename Type>
+inline const Type&
+Max(const Type& a, const Type& b, const Type& c, const Type& d,
+    const Type& e, const Type& f, const Type& g)
+{
+    return Max(Max(a, b, c, d), Max(e, f, g));
+}
+
+/// Return the maximum of eight values
+template<typename Type>
+inline const Type&
+Max(const Type& a, const Type& b, const Type& c, const Type& d,
+    const Type& e, const Type& f, const Type& g, const Type& h)
+{
+    return Max(Max(a, b, c, d), Max(e, f, g, h));
+}
+
+
+// ==========> Min <==================
+
+/// Return the minimum of two values
+/// @note A reference to one of the arguments is returned; when the two
+/// compare equal it is the first one.
+template<typename Type>
+inline const Type&
+Min(const Type& a, const Type& b)
+{
+    return (b < a) ? b : a;
+}
+
+/// Return the minimum of three values
+template<typename Type>
+inline const Type&
+Min(const Type& a, const Type& b, const Type& c)
+{
+    return Min(Min(a, b), c);
+}
+
+/// Return the minimum of four values
+template<typename Type>
+inline const Type&
+Min(const Type& a, const Type& b, const Type& c, const Type& d)
+{
+    return Min(Min(a, b), Min(c, d));
+}
+
+/// Return the minimum of five values
+template<typename Type>
+inline const Type&
+Min(const Type& a, const Type& b, const Type& c, const Type& d, const Type& e)
+{
+    return Min(Min(a, b), Min(c, d, e));
+}
+
+/// Return the minimum of six values
+template<typename Type>
+inline const Type&
+Min(const Type& a, const Type& b, const Type& c, const Type& d, const Type& e, const Type& f)
+{
+    return Min(Min(a, b, c), Min(d, e, f));
+}
+
+/// Return the minimum of seven values
+template<typename Type>
+inline const Type&
+Min(const Type& a, const Type& b, const Type& c, const Type& d,
+    const Type& e, const Type& f, const Type& g)
+{
+    return Min(Min(a, b, c, d), Min(e, f, g));
+}
+
+/// Return the minimum of eight values
+template<typename Type>
+inline const Type&
+Min(const Type& a, const Type& b, const Type& c, const Type& d,
+    const Type& e, const Type& f, const Type& g, const Type& h)
+{
+    return Min(Min(a, b, c, d), Min(e, f, g, h));
+}
+
+
+// ============> Exp <==================
+
+/// Return @f$ e^x @f$, computed with std::exp() and converted to @c Type.
+template<typename Type>
+inline Type Exp(const Type& x)
+{
+    return std::exp(x);
+}
+
+
+////////////////////////////////////////
+
+
+/// Return the sign of the given value as an integer (either -1, 0 or 1).
+template <typename Type>
+inline int Sign(const Type &x)
+{
+    const Type zero = zeroVal<Type>();
+    const int isPositive = (zero < x) ? 1 : 0;
+    const int isNegative = (x < zero) ? 1 : 0;
+    return isPositive - isNegative;
+}
+
+
+/// @brief Return @c true if @a a and @a b have different signs.
+/// @note Zero is considered a positive number.
+template <typename Type>
+inline bool
+SignChange(const Type& a, const Type& b)
+{
+    const bool aIsNegative = (a < zeroVal<Type>());
+    const bool bIsNegative = (b < zeroVal<Type>());
+    return aIsNegative != bIsNegative;
+}
+
+
+/// @brief Return @c true if the interval [@a a, @a b] includes zero,
+/// i.e., if either @a a or @a b is zero or if they have different signs.
+/// @details Implemented as a test on the sign of the product @a a * @a b.
+template <typename Type>
+inline bool
+ZeroCrossing(const Type& a, const Type& b)
+{
+    const Type zero = zeroVal<Type>();
+    return (a * b) <= zero;
+}
+
+
+//@{
+/// Return the square root of a floating-point value.
+inline float Sqrt(float x)
+{
+    return std::sqrt(x);
+}
+inline double Sqrt(double x)
+{
+    return std::sqrt(x);
+}
+inline long double Sqrt(long double x)
+{
+    return std::sqrt(x);
+}
+//@}
+
+
+//@{
+/// Return the cube root of a floating-point value, computed with boost::math::cbrt().
+inline float Cbrt(float x)
+{
+    return boost::math::cbrt(x);
+}
+inline double Cbrt(double x)
+{
+    return boost::math::cbrt(x);
+}
+inline long double Cbrt(long double x)
+{
+    return boost::math::cbrt(x);
+}
+//@}
+
+
+//@{
+/// Return the remainder of @a x / @a y.
+/// @note The floating-point overloads use fmod(), whose result has the
+/// same sign as @a x.
+inline int Mod(int x, int y) { return (x % y); }
+inline float Mod(float x, float y) { return fmodf(x,y); }
+inline double Mod(double x, double y) { return fmod(x,y); }
+inline long double Mod(long double x, long double y) { return fmodl(x,y); }
+template<typename Type> inline Type Remainder(Type x, Type y) { return Mod(x,y); }
+//@}
+
+
+//@{
+/// Return @a x rounded up to the nearest integer.
+inline float RoundUp(float x) { return ceilf(x); }
+inline double RoundUp(double x) { return ceil(x); }
+inline long double RoundUp(long double x) { return ceill(x); }
+//@}
+/// Return @a x rounded up to the nearest multiple of @a base.
+/// @note @a base is expected to be positive.
+template<typename Type>
+inline Type
+RoundUp(Type x, Type base)
+{
+    const Type remainder = Remainder(x, base);
+    if (!remainder) return x; // already a multiple of base
+    // A negative remainder (negative x) means x - remainder is the multiple
+    // just above x; a positive one means it is the multiple just below x.
+    return (remainder < Type(0)) ? x-remainder : x-remainder+base;
+}
+
+
+//@{
+/// Return @a x rounded down to the nearest integer.
+inline float RoundDown(float x) { return floorf(x); }
+inline double RoundDown(double x) { return floor(x); }
+inline long double RoundDown(long double x) { return floorl(x); }
+//@}
+/// Return @a x rounded down to the nearest multiple of @a base.
+/// @note @a base is expected to be positive.
+template<typename Type>
+inline Type
+RoundDown(Type x, Type base)
+{
+    const Type remainder = Remainder(x, base);
+    if (!remainder) return x; // already a multiple of base
+    // A negative remainder (negative x) means x - remainder is the multiple
+    // just above x, so one more base has to be subtracted.
+    return (remainder < Type(0)) ? x-remainder-base : x-remainder;
+}
+
+
+//@{
+/// Return @a x rounded to the nearest integer.
+/// @details Computed as the floor of <tt>x + 0.5</tt>, so values exactly
+/// halfway between two integers are rounded up.
+inline float Round(float x)
+{
+    return std::floor(x + 0.5f);
+}
+inline double Round(double x)
+{
+    return std::floor(x + 0.5);
+}
+inline long double Round(long double x)
+{
+    return std::floor(x + 0.5l);
+}
+//@}
+
+
+/// Return the euclidean remainder of @a x, i.e., @a x minus @a x rounded down.
+/// Note unlike % operator this will always return a positive result
+template<typename Type>
+inline Type
+EuclideanRemainder(Type x)
+{
+    const Type roundedDown = RoundDown(x);
+    return x - roundedDown;
+}
+
+
+/// Return the integer part of @a x.
+/// @details Positive values are rounded down and all other values are
+/// rounded up, i.e., the value is rounded towards zero.
+template<typename Type>
+inline Type
+IntegerPart(Type x)
+{
+    if (x > 0) {
+        return RoundDown(x);
+    }
+    return RoundUp(x);
+}
+
+/// Return the fractional part of @a x, computed as the remainder of @a x / 1.
+template<typename Type>
+inline Type
+FractionalPart(Type x)
+{
+    const Type one(1);
+    return Mod(x, one);
+}
+
+
+//@{
+/// Return the floor of @a x, converted to @c int.
+inline int Floor(float x)
+{
+    return static_cast<int>(std::floor(x));
+}
+inline int Floor(double x)
+{
+    return static_cast<int>(std::floor(x));
+}
+inline int Floor(long double x)
+{
+    return static_cast<int>(std::floor(x));
+}
+//@}
+
+
+//@{
+/// Return the ceiling of @a x, converted to @c int.
+inline int Ceil(float x)
+{
+    return static_cast<int>(std::ceil(x));
+}
+inline int Ceil(double x)
+{
+    return static_cast<int>(std::ceil(x));
+}
+inline int Ceil(long double x)
+{
+    return static_cast<int>(std::ceil(x));
+}
+//@}
+
+
+/// Return @a x if it is greater or equal in magnitude than @a delta.  Otherwise, return zero.
+template<typename Type>
+inline Type Chop(Type x, Type delta)
+{
+    if (Abs(x) < delta) {
+        return zeroVal<Type>();
+    }
+    return x;
+}
+
+
+/// Return @a x reduced to the given number of decimal digits.
+/// @details The value is scaled by 10^digits, 0.5 is added, the result is
+/// rounded down and then scaled back, so despite the function's name the
+/// last retained digit is rounded to nearest rather than truncated.
+/// @param x       value to shorten
+/// @param digits  number of decimal digits to keep
+template<typename Type>
+inline Type
+Truncate(Type x, unsigned int digits)
+{
+    // Build the scale factor in Type rather than in int: Pow(10, digits)
+    // would be evaluated with int arithmetic and overflow for digits >= 10.
+    const Type tenth = Pow(Type(10), static_cast<int>(digits));
+    return RoundDown(x*tenth+0.5)/tenth;
+}
+
+
+////////////////////////////////////////
+
+
+/// Return the inverse of @a x, i.e., <tt>Type(1)/x</tt>.
+/// @note Asserts that @a x is non-zero.
+template<typename Type>
+inline Type
+Inv(Type x)
+{
+    assert(x);
+    const Type one(1);
+    return one/x;
+}
+
+
+/// Coordinate axis identifiers.
+enum Axis {
+    X_AXIS = 0,
+    Y_AXIS = 1,
+    Z_AXIS = 2
+};
+
+// enum values are consistent with their historical mx analogs.
+/// Rotation order identifiers, numbered consecutively from zero.
+enum RotationOrder {
+    XYZ_ROTATION = 0,
+    XZY_ROTATION = 1,
+    YXZ_ROTATION = 2,
+    YZX_ROTATION = 3,
+    ZXY_ROTATION = 4,
+    ZYX_ROTATION = 5,
+    XZX_ROTATION = 6,
+    ZXZ_ROTATION = 7
+};
+
+
+/// @brief Select the result type for arithmetic that mixes the types @c S and @c T.
+/// @details @c type is the @c supertype reported by
+/// boost::numeric::conversion_traits<S, T>.
+template <typename S, typename T>
+struct promote {
+    typedef typename boost::numeric::conversion_traits<S, T>::supertype type;
+};
+
+
+/// @brief Return the index [0,1,2] of the smallest value in a 3D vector.
+/// @note This method assumes operator[] exists and avoids branching.
+/// @details If two components of the input vector are equal and smaller than the
+/// third component, the largest index of the two is always returned.
+/// If all three vector components are equal the largest index, i.e. 2, is
+/// returned. In other words the return value corresponds to the largest index
+/// of the smallest vector components.
+template<typename Vec3T>
+size_t
+MinIndex(const Vec3T& v)
+{
+    // The three pairwise comparisons form a 3-bit key into a lookup table.
+    // Keys 2 and 5 describe contradictory orderings and cannot occur.
+#ifndef _MSC_VER // Visual C++ doesn't guarantee thread-safe initialization of local statics
+    static
+#endif
+    const size_t lookup[8] = { 2, 1, 9, 1, 2, 9, 0, 0 };//9 is a dummy value
+    const size_t bit2 = (v[0] < v[1]) ? 4 : 0;
+    const size_t bit1 = (v[0] < v[2]) ? 2 : 0;
+    const size_t bit0 = (v[1] < v[2]) ? 1 : 0;
+    return lookup[bit2 | bit1 | bit0];
+}
+
+
+/// @brief Return the index [0,1,2] of the largest value in a 3D vector.
+/// @note This method assumes operator[] exists and avoids branching.
+/// @details If two components of the input vector are equal and larger than the
+/// third component, the largest index of the two is always returned.
+/// If all three vector components are equal the largest index, i.e. 2, is
+/// returned. In other words the return value corresponds to the largest index
+/// of the largest vector components.
+template<typename Vec3T>
+size_t
+MaxIndex(const Vec3T& v)
+{
+    // The three pairwise comparisons form a 3-bit key into a lookup table.
+    // Keys 2 and 5 describe contradictory orderings and cannot occur.
+#ifndef _MSC_VER // Visual C++ doesn't guarantee thread-safe initialization of local statics
+    static
+#endif
+    const size_t lookup[8] = { 2, 1, 9, 1, 2, 9, 0, 0 };//9 is a dummy value
+    const size_t bit2 = (v[0] > v[1]) ? 4 : 0;
+    const size_t bit1 = (v[0] > v[2]) ? 2 : 0;
+    const size_t bit0 = (v[1] > v[2]) ? 1 : 0;
+    return lookup[bit2 | bit1 | bit0];
+}
+
+} // namespace math
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_MATH_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/math/Operators.h b/nuparu/include/openvdb_new/math/Operators.h
new file mode 100644
index 00000000..61aa4bc0
--- /dev/null
+++ b/nuparu/include/openvdb_new/math/Operators.h
@@ -0,0 +1,2123 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file Operators.h
+
+#ifndef OPENVDB_MATH_OPERATORS_HAS_BEEN_INCLUDED
+#define OPENVDB_MATH_OPERATORS_HAS_BEEN_INCLUDED
+
+#include "FiniteDifference.h"
+#include "Stencils.h"
+#include "Maps.h"
+#include "Transform.h"
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace math {
+
+// Simple compile-time predicates to help determine when type conversions are needed
+template<typename Vec3T> struct is_vec3d { static const bool value = false; };
+template<> struct is_vec3d<Vec3d>        { static const bool value = true; };
+// is_vec3d<T>::value is true only for T = Vec3d; is_double<T>::value only for T = double.
+template<typename T> struct is_double    { static const bool value = false; };
+template<> struct is_double<double>      { static const bool value = true; };
+
+
+/// @brief Adapter to associate a map with a world-space operator,
+/// giving it the same call signature as an index-space operator
+/// @details Both result() overloads only read the stored map, so they are
+/// const-qualified and can be invoked through a const MapAdapter.
+/// @todo Derive ResultType from OpType via traits instead of passing it explicitly.
+template<typename MapType, typename OpType, typename ResultType>
+struct MapAdapter {
+    MapAdapter(const MapType& m): map(m) {}
+    /// Random access version: forwards to OpType::result(map, grid, ijk).
+    template<typename AccessorType>
+    inline ResultType
+    result(const AccessorType& grid, const Coord& ijk) const { return OpType::result(map, grid, ijk); }
+    /// Stencil access version: forwards to OpType::result(map, stencil).
+    template<typename StencilType>
+    inline ResultType
+    result(const StencilType& stencil) const { return OpType::result(map, stencil); }
+    /// Copy of the map taken at construction.
+    const MapType map;
+};
+
+
+/// Adapter that turns a vector-valued index-space operator into one returning the vector's length
+template<typename OpType>
+struct ISOpMagnitude {
+    template<typename AccessorType>
+    static inline double result(const AccessorType& grid, const Coord& ijk) {
+        return static_cast<double>(OpType::result(grid, ijk).length());
+    }
+    // Stencil overload: same reduction, with the operator evaluated on a stencil.
+    template<typename StencilType>
+    static inline double result(const StencilType& stencil) {
+        return static_cast<double>(OpType::result(stencil).length());
+    }
+};
+
+/// Adapter that turns a vector-valued world-space operator into one returning the vector's length
+template<typename OpType, typename MapT>
+struct OpMagnitude {
+    template<typename AccessorType>
+    static inline double result(const MapT& map, const AccessorType& grid, const Coord& ijk) {
+        return static_cast<double>(OpType::result(map, grid, ijk).length());
+    }
+    // Stencil overload: same reduction, with the operator evaluated on a stencil.
+    template<typename StencilType>
+    static inline double result(const MapT& map, const StencilType& stencil) {
+        return static_cast<double>(OpType::result(map, stencil).length());
+    }
+};
+
+
+namespace internal {
+// Traits helper: names T::ValueType and the math::Vec3 built from that value type.
+// This additional layer is necessary for Visual C++ to compile.
+template<typename T>
+struct ReturnValue {
+    typedef typename T::ValueType ValueType;
+    typedef math::Vec3<ValueType> Vec3Type;
+};
+
+} // namespace internal
+
+// ---- Operators defined in index space
+
+
+//@{
+/// @brief Index-space gradient operators, parameterized on the first-derivative scheme
+template<DScheme DiffScheme>
+struct ISGradient
+{
+    // Random access version: differentiates through a value accessor at coordinate ijk.
+    template<typename Accessor> static Vec3<typename Accessor::ValueType>
+    result(const Accessor& grid, const Coord& ijk)
+    {
+        typedef D1<DiffScheme>                     Diff;
+        typedef Vec3<typename Accessor::ValueType> ResultT;
+        return ResultT(Diff::inX(grid, ijk),
+                       Diff::inY(grid, ijk),
+                       Diff::inZ(grid, ijk));
+    }
+
+    // Stencil access version: differentiates using the values held by the stencil.
+    template<typename StencilT> static Vec3<typename StencilT::ValueType>
+    result(const StencilT& stencil)
+    {
+        typedef D1<DiffScheme>                     Diff;
+        typedef Vec3<typename StencilT::ValueType> ResultT;
+        return ResultT(Diff::inX(stencil),
+                       Diff::inY(stencil),
+                       Diff::inZ(stencil));
+    }
+};
+//@}
+
+/// @brief Traits struct that relates a BiasedGradientScheme to the forward (FD)
+/// and backward (BD) difference schemes used, as well as to the correct stencil
+/// type for index space use. This primary template supplies first-order defaults.
+template<BiasedGradientScheme bgs>
+struct BIAS_SCHEME {
+    static const DScheme FD = FD_1ST;
+    static const DScheme BD = BD_1ST;
+    // Index-space stencil type paired with this scheme.
+    template<typename GridType, bool IsSafe = true>
+    struct ISStencil {
+        typedef SevenPointStencil<GridType, IsSafe>  StencilType;
+    };
+};
+
+template<> struct BIAS_SCHEME<FIRST_BIAS>
+{
+    static const DScheme FD = FD_1ST; // forward difference scheme
+    static const DScheme BD = BD_1ST; // backward difference scheme
+    // Index-space stencil type paired with FIRST_BIAS.
+    template<typename GridType, bool IsSafe = true>
+    struct ISStencil {
+        typedef SevenPointStencil<GridType, IsSafe>  StencilType;
+    };
+};
+
+template<> struct BIAS_SCHEME<SECOND_BIAS>
+{
+    static const DScheme FD = FD_2ND; // forward difference scheme
+    static const DScheme BD = BD_2ND; // backward difference scheme
+    // Index-space stencil type paired with SECOND_BIAS.
+    template<typename GridType, bool IsSafe = true>
+    struct ISStencil {
+        typedef ThirteenPointStencil<GridType, IsSafe>  StencilType;
+      };
+};
+template<> struct BIAS_SCHEME<THIRD_BIAS>
+{
+    static const DScheme FD = FD_3RD; // forward difference scheme
+    static const DScheme BD = BD_3RD; // backward difference scheme
+    // Index-space stencil type paired with THIRD_BIAS.
+    template<typename GridType, bool IsSafe = true>
+    struct ISStencil {
+        typedef NineteenPointStencil<GridType, IsSafe>  StencilType;
+    };
+};
+template<> struct BIAS_SCHEME<WENO5_BIAS>
+{
+    static const DScheme FD = FD_WENO5; // forward difference scheme
+    static const DScheme BD = BD_WENO5; // backward difference scheme
+    // Index-space stencil type paired with WENO5_BIAS.
+    template<typename GridType, bool IsSafe = true>
+    struct ISStencil {
+        typedef NineteenPointStencil<GridType, IsSafe>  StencilType;
+    };
+};
+template<> struct BIAS_SCHEME<HJWENO5_BIAS>
+{
+    static const DScheme FD = FD_HJWENO5; // forward difference scheme
+    static const DScheme BD = BD_HJWENO5; // backward difference scheme
+    // Index-space stencil type paired with HJWENO5_BIAS.
+    template<typename GridType, bool IsSafe = true>
+    struct ISStencil {
+        typedef NineteenPointStencil<GridType, IsSafe>  StencilType;
+    };
+};
+
+
+//@{
+/// @brief Biased gradient operators: per axis, the sign of the matching component of the
+/// @c Vec3Bias input picks the forward (negative) or backward (otherwise) difference.
+template<BiasedGradientScheme GradScheme, typename Vec3Bias>
+struct ISGradientBiased
+{
+    static const DScheme FD = BIAS_SCHEME<GradScheme>::FD;
+    static const DScheme BD = BIAS_SCHEME<GradScheme>::BD;
+
+    // Random access version: evaluated through a value accessor at coordinate ijk.
+    template<typename Accessor>
+    static Vec3<typename Accessor::ValueType>
+    result(const Accessor& grid, const Coord& ijk, const Vec3Bias& V)
+    {
+        typedef D1<FD>                             Fwd;
+        typedef D1<BD>                             Bwd;
+        typedef Vec3<typename Accessor::ValueType> ResultT;
+        return ResultT(V[0] < 0 ? Fwd::inX(grid, ijk) : Bwd::inX(grid, ijk),
+                       V[1] < 0 ? Fwd::inY(grid, ijk) : Bwd::inY(grid, ijk),
+                       V[2] < 0 ? Fwd::inZ(grid, ijk) : Bwd::inZ(grid, ijk));
+    }
+
+    // Stencil access version: evaluated on the values held by the stencil.
+    template<typename StencilT>
+    static Vec3<typename StencilT::ValueType>
+    result(const StencilT& stencil, const Vec3Bias& V)
+    {
+        typedef D1<FD>                             Fwd;
+        typedef D1<BD>                             Bwd;
+        typedef Vec3<typename StencilT::ValueType> ResultT;
+        return ResultT(V[0] < 0 ? Fwd::inX(stencil) : Bwd::inX(stencil),
+                       V[1] < 0 ? Fwd::inY(stencil) : Bwd::inY(stencil),
+                       V[2] < 0 ? Fwd::inZ(stencil) : Bwd::inZ(stencil));
+    }
+};
+
+
+template<BiasedGradientScheme GradScheme>
+struct ISGradientNormSqrd
+{
+    static const DScheme FD = BIAS_SCHEME<GradScheme>::FD;
+    static const DScheme BD = BIAS_SCHEME<GradScheme>::BD;
+
+
+    // Random access version: passes the sign test at ijk and both one-sided gradients on.
+    template<typename Accessor>
+    static typename Accessor::ValueType
+    result(const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType     ScalarT;
+        typedef math::Vec3<ScalarT>              GradT;
+        // One-sided gradients: forward differences (FD) and backward differences (BD).
+        GradT fwdGrad = ISGradient<FD>::result(grid, ijk);
+        GradT bwdGrad = ISGradient<BD>::result(grid, ijk);
+        return math::GodunovsNormSqrd(grid.getValue(ijk) > 0, bwdGrad, fwdGrad);
+    }
+
+    // Stencil access version: same computation using the stencil's center value.
+    template<typename StencilT>
+    static typename StencilT::ValueType
+    result(const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType     ScalarT;
+        typedef math::Vec3<ScalarT>              GradT;
+        // One-sided gradients: forward differences (FD) and backward differences (BD).
+        GradT fwdGrad = ISGradient<FD>::result(stencil);
+        GradT bwdGrad = ISGradient<BD>::result(stencil);
+        return math::GodunovsNormSqrd(stencil.template getValue<0, 0, 0>() > 0, bwdGrad, fwdGrad);
+    }
+};
+
+#ifdef DWA_OPENVDB  // for SIMD - note will do the computations in float
+template<>
+struct ISGradientNormSqrd<HJWENO5_BIAS>
+{
+    // random access version: reads grid values at offsets -3..+3 along each axis around ijk
+    template<typename Accessor>
+    static typename Accessor::ValueType result(const Accessor& grid, const Coord& ijk)
+    {
+        struct GetValue
+        {
+            const Accessor& acc;
+            GetValue(const Accessor& acc_): acc(acc_) {}
+            // Return the grid value at ijk converted to simd::Float4::value_type (= float).
+            inline simd::Float4::value_type operator()(const Coord& ijk_) {
+                return static_cast<simd::Float4::value_type>(acc.getValue(ijk_));
+            }
+        }
+        valueAt(grid);
+
+        // SSE optimized: each Float4 holds the x, y and z differences of adjacent samples (4th lane 0)
+        const simd::Float4
+            v1(valueAt(ijk.offsetBy(-2, 0, 0)) - valueAt(ijk.offsetBy(-3, 0, 0)),
+               valueAt(ijk.offsetBy( 0,-2, 0)) - valueAt(ijk.offsetBy( 0,-3, 0)),
+               valueAt(ijk.offsetBy( 0, 0,-2)) - valueAt(ijk.offsetBy( 0, 0,-3)), 0),
+            v2(valueAt(ijk.offsetBy(-1, 0, 0)) - valueAt(ijk.offsetBy(-2, 0, 0)),
+               valueAt(ijk.offsetBy( 0,-1, 0)) - valueAt(ijk.offsetBy( 0,-2, 0)),
+               valueAt(ijk.offsetBy( 0, 0,-1)) - valueAt(ijk.offsetBy( 0, 0,-2)), 0),
+            v3(valueAt(ijk                   ) - valueAt(ijk.offsetBy(-1, 0, 0)),
+               valueAt(ijk                   ) - valueAt(ijk.offsetBy( 0,-1, 0)),
+               valueAt(ijk                   ) - valueAt(ijk.offsetBy( 0, 0,-1)), 0),
+            v4(valueAt(ijk.offsetBy( 1, 0, 0)) - valueAt(ijk                   ),
+               valueAt(ijk.offsetBy( 0, 1, 0)) - valueAt(ijk                   ),
+               valueAt(ijk.offsetBy( 0, 0, 1)) - valueAt(ijk                   ), 0),
+            v5(valueAt(ijk.offsetBy( 2, 0, 0)) - valueAt(ijk.offsetBy( 1, 0, 0)),
+               valueAt(ijk.offsetBy( 0, 2, 0)) - valueAt(ijk.offsetBy( 0, 1, 0)),
+               valueAt(ijk.offsetBy( 0, 0, 2)) - valueAt(ijk.offsetBy( 0, 0, 1)), 0),
+            v6(valueAt(ijk.offsetBy( 3, 0, 0)) - valueAt(ijk.offsetBy( 2, 0, 0)),
+               valueAt(ijk.offsetBy( 0, 3, 0)) - valueAt(ijk.offsetBy( 0, 2, 0)),
+               valueAt(ijk.offsetBy( 0, 0, 3)) - valueAt(ijk.offsetBy( 0, 0, 2)), 0),
+            down = math::WENO5(v1, v2, v3, v4, v5),
+            up   = math::WENO5(v6, v5, v4, v3, v2);
+        // "down" is built from v1..v5 and "up" from the mirrored sequence v6..v2.
+        return math::GodunovsNormSqrd(grid.getValue(ijk)>0, down, up);
+    }
+
+    // stencil access version: same differences, read from the stencil and converted to F4Val
+    template<typename StencilT>
+    static typename StencilT::ValueType result(const StencilT& s)
+    {
+        typedef simd::Float4::value_type F4Val;
+
+        // SSE optimized: each Float4 holds the x, y and z differences of adjacent samples (4th lane 0)
+        const simd::Float4
+            v1(F4Val(s.template getValue<-2, 0, 0>()) - F4Val(s.template getValue<-3, 0, 0>()),
+               F4Val(s.template getValue< 0,-2, 0>()) - F4Val(s.template getValue< 0,-3, 0>()),
+               F4Val(s.template getValue< 0, 0,-2>()) - F4Val(s.template getValue< 0, 0,-3>()), 0),
+            v2(F4Val(s.template getValue<-1, 0, 0>()) - F4Val(s.template getValue<-2, 0, 0>()),
+               F4Val(s.template getValue< 0,-1, 0>()) - F4Val(s.template getValue< 0,-2, 0>()),
+               F4Val(s.template getValue< 0, 0,-1>()) - F4Val(s.template getValue< 0, 0,-2>()), 0),
+            v3(F4Val(s.template getValue< 0, 0, 0>()) - F4Val(s.template getValue<-1, 0, 0>()),
+               F4Val(s.template getValue< 0, 0, 0>()) - F4Val(s.template getValue< 0,-1, 0>()),
+               F4Val(s.template getValue< 0, 0, 0>()) - F4Val(s.template getValue< 0, 0,-1>()), 0),
+            v4(F4Val(s.template getValue< 1, 0, 0>()) - F4Val(s.template getValue< 0, 0, 0>()),
+               F4Val(s.template getValue< 0, 1, 0>()) - F4Val(s.template getValue< 0, 0, 0>()),
+               F4Val(s.template getValue< 0, 0, 1>()) - F4Val(s.template getValue< 0, 0, 0>()), 0),
+            v5(F4Val(s.template getValue< 2, 0, 0>()) - F4Val(s.template getValue< 1, 0, 0>()),
+               F4Val(s.template getValue< 0, 2, 0>()) - F4Val(s.template getValue< 0, 1, 0>()),
+               F4Val(s.template getValue< 0, 0, 2>()) - F4Val(s.template getValue< 0, 0, 1>()), 0),
+            v6(F4Val(s.template getValue< 3, 0, 0>()) - F4Val(s.template getValue< 2, 0, 0>()),
+               F4Val(s.template getValue< 0, 3, 0>()) - F4Val(s.template getValue< 0, 2, 0>()),
+               F4Val(s.template getValue< 0, 0, 3>()) - F4Val(s.template getValue< 0, 0, 2>()), 0),
+            down = math::WENO5(v1, v2, v3, v4, v5),
+            up   = math::WENO5(v6, v5, v4, v3, v2);
+        // "down" is built from v1..v5 and "up" from the mirrored sequence v6..v2.
+        return math::GodunovsNormSqrd(s.template getValue<0, 0, 0>()>0, down, up);
+    }
+};
+#endif //DWA_OPENVDB  // for SIMD - note will do the computations in float
+//@}
+
+
+//@{
+/// @brief Laplacian defined in index space, using various center-difference stencils
+template<DDScheme DiffScheme>
+struct ISLaplacian
+{
+    // random access version (declared only here; the specializations provide the bodies)
+    template<typename Accessor>
+    static typename Accessor::ValueType result(const Accessor& grid, const Coord& ijk);
+
+    // stencil access version (declared only here; the specializations provide the bodies)
+    template<typename StencilT>
+    static typename StencilT::ValueType result(const StencilT& stencil);
+};
+
+
+template<>
+struct ISLaplacian<CD_SECOND>
+{
+    // random access version: sum of the six face neighbors minus six times the center value
+    template<typename Accessor>
+    static typename Accessor::ValueType result(const Accessor& grid, const Coord& ijk)
+    {
+        return  grid.getValue(ijk.offsetBy(1,0,0)) + grid.getValue(ijk.offsetBy(-1, 0, 0)) +
+                grid.getValue(ijk.offsetBy(0,1,0)) + grid.getValue(ijk.offsetBy(0, -1, 0)) +
+                grid.getValue(ijk.offsetBy(0,0,1)) + grid.getValue(ijk.offsetBy(0,  0,-1))
+                                                   - 6*grid.getValue(ijk);
+    }
+
+    // stencil access version: same formula, reading the values from the stencil
+    template<typename StencilT>
+    static typename StencilT::ValueType result(const StencilT& stencil)
+    {
+        return  stencil.template getValue< 1, 0, 0>() + stencil.template getValue<-1, 0, 0>() +
+                stencil.template getValue< 0, 1, 0>() + stencil.template getValue< 0,-1, 0>() +
+                stencil.template getValue< 0, 0, 1>() + stencil.template getValue< 0, 0,-1>()
+                                                   - 6*stencil.template getValue< 0, 0, 0>();
+    }
+};
+
+template<>
+struct ISLaplacian<CD_FOURTH>
+{
+    // random access version: weights -1/12 (offset 2), 4/3 (offset 1) and -7.5 (center)
+    template<typename Accessor>
+    static typename Accessor::ValueType result(const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueT;
+        return static_cast<ValueT>(
+            (-1./12.)*(
+                grid.getValue(ijk.offsetBy(2,0,0)) + grid.getValue(ijk.offsetBy(-2, 0, 0)) +
+                grid.getValue(ijk.offsetBy(0,2,0)) + grid.getValue(ijk.offsetBy( 0,-2, 0)) +
+                grid.getValue(ijk.offsetBy(0,0,2)) + grid.getValue(ijk.offsetBy( 0, 0,-2)) )
+            + (4./3.)*(
+                grid.getValue(ijk.offsetBy(1,0,0)) + grid.getValue(ijk.offsetBy(-1, 0, 0)) +
+                grid.getValue(ijk.offsetBy(0,1,0)) + grid.getValue(ijk.offsetBy( 0,-1, 0)) +
+                grid.getValue(ijk.offsetBy(0,0,1)) + grid.getValue(ijk.offsetBy( 0, 0,-1)) )
+            - 7.5*grid.getValue(ijk));
+    }
+
+    // stencil access version: same weights, reading the values from the stencil
+    template<typename StencilT>
+    static typename StencilT::ValueType result(const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType ValueT;
+        return static_cast<ValueT>(
+            (-1./12.)*(
+                stencil.template getValue< 2, 0, 0>() + stencil.template getValue<-2, 0, 0>() +
+                stencil.template getValue< 0, 2, 0>() + stencil.template getValue< 0,-2, 0>() +
+                stencil.template getValue< 0, 0, 2>() + stencil.template getValue< 0, 0,-2>() )
+            + (4./3.)*(
+                stencil.template getValue< 1, 0, 0>() + stencil.template getValue<-1, 0, 0>() +
+                stencil.template getValue< 0, 1, 0>() + stencil.template getValue< 0,-1, 0>() +
+                stencil.template getValue< 0, 0, 1>() + stencil.template getValue< 0, 0,-1>() )
+            - 7.5*stencil.template getValue< 0, 0, 0>());
+    }
+};
+
+template<>
+struct ISLaplacian<CD_SIXTH>
+{
+    // random access version: weights 1/90 (offset 3), -3/20 (offset 2), 1.5 (offset 1), -3*49/18 (center)
+    template<typename Accessor>
+    static typename Accessor::ValueType result(const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueT;
+        return static_cast<ValueT>(
+            (1./90.)*(
+                grid.getValue(ijk.offsetBy(3,0,0)) + grid.getValue(ijk.offsetBy(-3, 0, 0)) +
+                grid.getValue(ijk.offsetBy(0,3,0)) + grid.getValue(ijk.offsetBy( 0,-3, 0)) +
+                grid.getValue(ijk.offsetBy(0,0,3)) + grid.getValue(ijk.offsetBy( 0, 0,-3)) )
+            - (3./20.)*(
+                grid.getValue(ijk.offsetBy(2,0,0)) + grid.getValue(ijk.offsetBy(-2, 0, 0)) +
+                grid.getValue(ijk.offsetBy(0,2,0)) + grid.getValue(ijk.offsetBy( 0,-2, 0)) +
+                grid.getValue(ijk.offsetBy(0,0,2)) + grid.getValue(ijk.offsetBy( 0, 0,-2)) )
+            + 1.5 *(
+                grid.getValue(ijk.offsetBy(1,0,0)) + grid.getValue(ijk.offsetBy(-1, 0, 0)) +
+                grid.getValue(ijk.offsetBy(0,1,0)) + grid.getValue(ijk.offsetBy( 0,-1, 0)) +
+                grid.getValue(ijk.offsetBy(0,0,1)) + grid.getValue(ijk.offsetBy( 0, 0,-1)) )
+            - (3*49/18.)*grid.getValue(ijk));
+    }
+
+    // stencil access version: same weights, reading the values from the stencil
+    template<typename StencilT>
+    static typename StencilT::ValueType result(const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType ValueT;
+        return static_cast<ValueT>(
+            (1./90.)*(
+                stencil.template getValue< 3, 0, 0>() + stencil.template getValue<-3, 0, 0>() +
+                stencil.template getValue< 0, 3, 0>() + stencil.template getValue< 0,-3, 0>() +
+                stencil.template getValue< 0, 0, 3>() + stencil.template getValue< 0, 0,-3>() )
+            - (3./20.)*(
+                stencil.template getValue< 2, 0, 0>() + stencil.template getValue<-2, 0, 0>() +
+                stencil.template getValue< 0, 2, 0>() + stencil.template getValue< 0,-2, 0>() +
+                stencil.template getValue< 0, 0, 2>() + stencil.template getValue< 0, 0,-2>() )
+            + 1.5 *(
+                stencil.template getValue< 1, 0, 0>() + stencil.template getValue<-1, 0, 0>() +
+                stencil.template getValue< 0, 1, 0>() + stencil.template getValue< 0,-1, 0>() +
+                stencil.template getValue< 0, 0, 1>() + stencil.template getValue< 0, 0,-1>() )
+            - (3*49/18.)*stencil.template getValue< 0, 0, 0>());
+    }
+};
+//@}
+
+
+//@{
+/// Index-space divergence operator, parameterized on the first-derivative scheme
+template<DScheme DiffScheme>
+struct ISDivergence
+{
+    // Random access version: d/dx of component 0 + d/dy of component 1 + d/dz of component 2.
+    template<typename Accessor> static typename Accessor::ValueType::value_type
+    result(const Accessor& grid, const Coord& ijk)
+    {
+        typedef D1Vec<DiffScheme> Diff;
+        return Diff::inX(grid, ijk, 0) + Diff::inY(grid, ijk, 1)
+             + Diff::inZ(grid, ijk, 2);
+    }
+
+    // Stencil access version: same sum, evaluated on the stencil's values.
+    template<typename StencilT> static typename StencilT::ValueType::value_type
+    result(const StencilT& stencil)
+    {
+        typedef D1Vec<DiffScheme> Diff;
+        return Diff::inX(stencil, 0) + Diff::inY(stencil, 1)
+             + Diff::inZ(stencil, 2);
+    }
+};
+//@}
+
+
+//@{
+/// Index-space curl operator, parameterized on the first-derivative scheme
+template<DScheme DiffScheme>
+struct ISCurl
+{
+    // Random access version: evaluated through a value accessor at coordinate ijk.
+    template<typename Accessor>
+    static typename Accessor::ValueType result(const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType Vec3Type;
+        typedef D1Vec<DiffScheme>            Diff;
+        // Components, for a field (u,v,w): (dw/dy - dv/dz, du/dz - dw/dx, dv/dx - du/dy).
+        // The trailing integer argument is the index of the component being differentiated.
+        return Vec3Type(Diff::inY(grid, ijk, 2) - Diff::inZ(grid, ijk, 1),
+                        Diff::inZ(grid, ijk, 0) - Diff::inX(grid, ijk, 2),
+                        Diff::inX(grid, ijk, 1) - Diff::inY(grid, ijk, 0));
+    }
+
+    // Stencil access version: evaluated on the values held by the stencil.
+    template<typename StencilT>
+    static typename StencilT::ValueType result(const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType Vec3Type;
+        typedef D1Vec<DiffScheme>            Diff;
+        // Components, for a field (u,v,w): (dw/dy - dv/dz, du/dz - dw/dx, dv/dx - du/dy).
+        // The trailing integer argument is the index of the component being differentiated.
+        return Vec3Type(Diff::inY(stencil, 2) - Diff::inZ(stencil, 1),
+                        Diff::inZ(stencil, 0) - Diff::inX(stencil, 2),
+                        Diff::inX(stencil, 1) - Diff::inY(stencil, 0));
+    }
+};
+//@}
+
+
+//@{
+/// Compute the mean curvature in index space
+template<DDScheme DiffScheme2, DScheme DiffScheme1>
+struct ISMeanCurvature
+{
+    /// @brief random access version
+    /// @return true if the gradient is nonzero (its squared norm exceeds the tolerance), in which
+    /// case the mean curvature is computed as two parts: @c alpha is the numerator in
+    /// @f$\nabla \cdot (\nabla \phi / |\nabla \phi|)@f$, and @c beta is @f$|\nabla \phi|@f$.
+    template<typename Accessor>
+    static bool result(const Accessor& grid, const Coord& ijk,
+                       typename Accessor::ValueType& alpha,
+                       typename Accessor::ValueType& beta)
+    {
+        typedef typename Accessor::ValueType ValueType;
+        // First derivatives along x, y and z.
+        const ValueType Dx = D1<DiffScheme1>::inX(grid, ijk);
+        const ValueType Dy = D1<DiffScheme1>::inY(grid, ijk);
+        const ValueType Dz = D1<DiffScheme1>::inZ(grid, ijk);
+        // normGrad holds the squared gradient magnitude (the square root is taken for beta below).
+        const ValueType Dx2 = Dx*Dx;
+        const ValueType Dy2 = Dy*Dy;
+        const ValueType Dz2 = Dz*Dz;
+        const ValueType normGrad = Dx2 + Dy2 + Dz2;
+        if (normGrad <= math::Tolerance<ValueType>::value()) {
+            alpha = beta = 0;
+            return false;
+        }
+        // Pure second derivatives.
+        const ValueType Dxx = D2<DiffScheme2>::inX(grid, ijk);
+        const ValueType Dyy = D2<DiffScheme2>::inY(grid, ijk);
+        const ValueType Dzz = D2<DiffScheme2>::inZ(grid, ijk);
+        // Mixed second derivatives.
+        const ValueType Dxy = D2<DiffScheme2>::inXandY(grid, ijk);
+        const ValueType Dyz = D2<DiffScheme2>::inYandZ(grid, ijk);
+        const ValueType Dxz = D2<DiffScheme2>::inXandZ(grid, ijk);
+
+        // for return
+        alpha = (Dx2*(Dyy+Dzz)+Dy2*(Dxx+Dzz)+Dz2*(Dxx+Dyy)-2*(Dx*(Dy*Dxy+Dz*Dxz)+Dy*Dz*Dyz));
+        beta  = ValueType(std::sqrt(double(normGrad))); // * 1/dx
+        return true;
+    }
+
+    /// @brief stencil access version
+    /// @return true if the gradient is nonzero (its squared norm exceeds the tolerance), in which
+    /// case the mean curvature is computed as two parts: @c alpha is the numerator in
+    /// @f$\nabla \cdot (\nabla \phi / |\nabla \phi|)@f$, and @c beta is @f$|\nabla \phi|@f$.
+    template<typename StencilT>
+    static bool result(const StencilT& stencil,
+                       typename StencilT::ValueType& alpha,
+                       typename StencilT::ValueType& beta)
+    {
+        typedef typename StencilT::ValueType   ValueType;
+        const ValueType Dx = D1<DiffScheme1>::inX(stencil);
+        const ValueType Dy = D1<DiffScheme1>::inY(stencil);
+        const ValueType Dz = D1<DiffScheme1>::inZ(stencil);
+        // normGrad holds the squared gradient magnitude (the square root is taken for beta below).
+        const ValueType Dx2 = Dx*Dx;
+        const ValueType Dy2 = Dy*Dy;
+        const ValueType Dz2 = Dz*Dz;
+        const ValueType normGrad = Dx2 + Dy2 + Dz2;
+        if (normGrad <= math::Tolerance<ValueType>::value()) {
+            alpha = beta = 0;
+            return false;
+        }
+        // Pure second derivatives.
+        const ValueType Dxx = D2<DiffScheme2>::inX(stencil);
+        const ValueType Dyy = D2<DiffScheme2>::inY(stencil);
+        const ValueType Dzz = D2<DiffScheme2>::inZ(stencil);
+        // Mixed second derivatives.
+        const ValueType Dxy = D2<DiffScheme2>::inXandY(stencil);
+        const ValueType Dyz = D2<DiffScheme2>::inYandZ(stencil);
+        const ValueType Dxz = D2<DiffScheme2>::inXandZ(stencil);
+
+        // for return
+        alpha = (Dx2*(Dyy+Dzz)+Dy2*(Dxx+Dzz)+Dz2*(Dxx+Dyy)-2*(Dx*(Dy*Dxy+Dz*Dxz)+Dy*Dz*Dyz));
+        beta = ValueType(std::sqrt(double(normGrad))); // * 1/dx
+        return true;
+    }
+};
+
+////////////////////////////////////////////////////////
+
+// --- Operators defined in the Range of a given map
+
+//@{
+/// @brief Center difference gradient operators, defined with respect to
+/// the range-space of the @c map
+/// @note This will need to be divided by two in the case of CD_2NDT
+template<typename MapType, DScheme DiffScheme>
+struct Gradient
+{
+    // Random access version: index-space gradient at ijk, passed through map.applyIJT().
+    template<typename Accessor>
+    static typename internal::ReturnValue<Accessor>::Vec3Type
+    result(const MapType& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename internal::ReturnValue<Accessor>::Vec3Type ResultT;
+        // The index-space gradient is converted to Vec3d before the map is applied.
+        Vec3d indexGrad(ISGradient<DiffScheme>::result(grid, ijk));
+        return ResultT(map.applyIJT(indexGrad, ijk.asVec3d()));
+    }
+
+    // Stencil access version: same, evaluated at the stencil's center coordinate.
+    template<typename StencilT>
+    static typename internal::ReturnValue<StencilT>::Vec3Type
+    result(const MapType& map, const StencilT& stencil)
+    {
+        typedef typename internal::ReturnValue<StencilT>::Vec3Type ResultT;
+        // The index-space gradient is converted to Vec3d before the map is applied.
+        Vec3d indexGrad(ISGradient<DiffScheme>::result(stencil));
+        return ResultT(map.applyIJT(indexGrad, stencil.getCenterCoord().asVec3d()));
+    }
+};
+
+// Partial template specialization of Gradient for TranslationMap, any difference scheme:
+// the map argument is unnamed and unused, and the index-space gradient is returned directly
+template<DScheme DiffScheme>
+struct Gradient<TranslationMap, DiffScheme>
+{
+    // random access version: forwards to ISGradient<DiffScheme>::result(grid, ijk)
+    template<typename Accessor>
+    static typename internal::ReturnValue<Accessor>::Vec3Type
+    result(const TranslationMap&, const Accessor& grid, const Coord& ijk)
+    {
+        return ISGradient<DiffScheme>::result(grid, ijk);
+    }
+
+    // stencil access version: forwards to ISGradient<DiffScheme>::result(stencil)
+    template<typename StencilT>
+    static typename internal::ReturnValue<StencilT>::Vec3Type
+    result(const TranslationMap&, const StencilT& stencil)
+    {
+        return ISGradient<DiffScheme>::result(stencil);
+    }
+};
+
+/// Full template specialization of Gradient for a uniform scale map
+/// and the 2nd-order center difference scheme
+template<>
+struct Gradient<UniformScaleMap, CD_2ND>
+{
+    // Random access version: CD_2NDT gradient times component 0 of map.getInvTwiceScale().
+    template<typename Accessor>
+    static typename internal::ReturnValue<Accessor>::Vec3Type
+    result(const UniformScaleMap& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef internal::ReturnValue<Accessor> Traits;
+        typedef typename Traits::ValueType      ScalarT;
+        typedef typename Traits::Vec3Type       VectorT;
+        VectorT unscaled(ISGradient<CD_2NDT>::result(grid, ijk));
+        ScalarT invTwoDx = static_cast<ScalarT>(map.getInvTwiceScale()[0]);
+        return unscaled * invTwoDx;
+    }
+
+    // Stencil access version: same scaling applied to the stencil-based CD_2NDT gradient.
+    template<typename StencilT>
+    static typename internal::ReturnValue<StencilT>::Vec3Type
+    result(const UniformScaleMap& map, const StencilT& stencil)
+    {
+        typedef internal::ReturnValue<StencilT> Traits;
+        typedef typename Traits::ValueType      ScalarT;
+        typedef typename Traits::Vec3Type       VectorT;
+        VectorT unscaled(ISGradient<CD_2NDT>::result(stencil));
+        ScalarT invTwoDx = static_cast<ScalarT>(map.getInvTwiceScale()[0]);
+        return unscaled * invTwoDx;
+    }
+};
+
+/// Full template specialization of Gradient for a uniform scale-translate map
+/// and the 2nd-order center difference scheme
+template<>
+struct Gradient<UniformScaleTranslateMap, CD_2ND>
+{
+    // Random access version: CD_2NDT gradient times component 0 of map.getInvTwiceScale().
+    template<typename Accessor>
+    static typename internal::ReturnValue<Accessor>::Vec3Type
+    result(const UniformScaleTranslateMap& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef internal::ReturnValue<Accessor> Traits;
+        typedef typename Traits::ValueType      ScalarT;
+        typedef typename Traits::Vec3Type       VectorT;
+        VectorT unscaled(ISGradient<CD_2NDT>::result(grid, ijk));
+        ScalarT invTwoDx = static_cast<ScalarT>(map.getInvTwiceScale()[0]);
+        return unscaled * invTwoDx;
+    }
+
+    // Stencil access version: same scaling applied to the stencil-based CD_2NDT gradient.
+    template<typename StencilT>
+    static typename internal::ReturnValue<StencilT>::Vec3Type
+    result(const UniformScaleTranslateMap& map, const StencilT& stencil)
+    {
+        typedef internal::ReturnValue<StencilT> Traits;
+        typedef typename Traits::ValueType      ScalarT;
+        typedef typename Traits::Vec3Type       VectorT;
+        VectorT unscaled(ISGradient<CD_2NDT>::result(stencil));
+        ScalarT invTwoDx = static_cast<ScalarT>(map.getInvTwiceScale()[0]);
+        return unscaled * invTwoDx;
+    }
+};
+
+/// Full template specialization of Gradient for a (per-axis) scale map
+/// and the 2nd-order center difference scheme
+template<>
+struct Gradient<ScaleMap, CD_2ND>
+{
+    // Random access version: each CD_2NDT component times the matching getInvTwiceScale() entry.
+    template<typename Accessor>
+    static typename internal::ReturnValue<Accessor>::Vec3Type
+    result(const ScaleMap& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef internal::ReturnValue<Accessor> Traits;
+        typedef typename Traits::ValueType      ScalarT;
+        typedef typename Traits::Vec3Type       VectorT;
+        VectorT unscaled(ISGradient<CD_2NDT>::result(grid, ijk));
+        return VectorT(static_cast<ScalarT>(unscaled[0] * map.getInvTwiceScale()[0]),
+                       static_cast<ScalarT>(unscaled[1] * map.getInvTwiceScale()[1]),
+                       static_cast<ScalarT>(unscaled[2] * map.getInvTwiceScale()[2]));
+    }
+
+    // Stencil access version: same per-axis scaling of the stencil-based CD_2NDT gradient.
+    template<typename StencilT>
+    static typename internal::ReturnValue<StencilT>::Vec3Type
+    result(const ScaleMap& map, const StencilT& stencil)
+    {
+        typedef internal::ReturnValue<StencilT> Traits;
+        typedef typename Traits::ValueType      ScalarT;
+        typedef typename Traits::Vec3Type       VectorT;
+        VectorT unscaled(ISGradient<CD_2NDT>::result(stencil));
+        return VectorT(static_cast<ScalarT>(unscaled[0] * map.getInvTwiceScale()[0]),
+                       static_cast<ScalarT>(unscaled[1] * map.getInvTwiceScale()[1]),
+                       static_cast<ScalarT>(unscaled[2] * map.getInvTwiceScale()[2]));
+    }
+};
+
+/// Full specialization of Gradient for ScaleTranslateMap with the CD_2ND scheme.
+/// Each index-space component is multiplied by the matching inverse twice-scale entry.
+template<>
+struct Gradient<ScaleTranslateMap, CD_2ND>
+{
+    /// Random-access version: values are read from @a grid around @a ijk.
+    template<typename Accessor>
+    static typename internal::ReturnValue<Accessor>::Vec3Type
+    result(const ScaleTranslateMap& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename internal::ReturnValue<Accessor>::ValueType ValueType;
+        typedef typename internal::ReturnValue<Accessor>::Vec3Type Vec3Type;
+        Vec3Type isGrad(ISGradient<CD_2NDT>::result(grid, ijk));
+        const Vec3d& inv2dx = map.getInvTwiceScale();
+        return Vec3Type(ValueType(isGrad[0] * inv2dx[0]),
+                        ValueType(isGrad[1] * inv2dx[1]),
+                        ValueType(isGrad[2] * inv2dx[2]));
+    }
+
+    /// Stencil version: same computation with values read from @a stencil.
+    template<typename StencilT>
+    static typename internal::ReturnValue<StencilT>::Vec3Type
+    result(const ScaleTranslateMap& map, const StencilT& stencil)
+    {
+        typedef typename internal::ReturnValue<StencilT>::ValueType ValueType;
+        typedef typename internal::ReturnValue<StencilT>::Vec3Type Vec3Type;
+        Vec3Type isGrad(ISGradient<CD_2NDT>::result(stencil));
+        const Vec3d& inv2dx = map.getInvTwiceScale();
+        return Vec3Type(ValueType(isGrad[0] * inv2dx[0]),
+                        ValueType(isGrad[1] * inv2dx[1]),
+                        ValueType(isGrad[2] * inv2dx[2]));
+    }
+};
+//@}
+
+
+//@{
+/// @brief Biased gradient operators, defined with respect to the range-space of the map
+/// @note This will need to be divided by two in the case of CD_2NDT
+template<typename MapType, BiasedGradientScheme GradScheme>
+struct GradientBiased
+{
+    /// Random-access version; @a V is forwarded to the index-space biased operator.
+    template<typename Accessor> static math::Vec3<typename Accessor::ValueType>
+    result(const MapType& map, const Accessor& grid, const Coord& ijk,
+           const Vec3<typename Accessor::ValueType>& V)
+    {
+        typedef typename Accessor::ValueType ValueType;
+        typedef math::Vec3<ValueType> Vec3Type;
+        // the index-space result is held as a Vec3d before the map is applied
+        Vec3d isGrad(ISGradientBiased<GradScheme, Vec3Type>::result(grid, ijk, V));
+        return Vec3Type(map.applyIJT(isGrad, ijk.asVec3d()));
+    }
+
+    /// Stencil version; the map is applied at the stencil's center coordinate.
+    template<typename StencilT> static math::Vec3<typename StencilT::ValueType>
+    result(const MapType& map, const StencilT& stencil,
+           const Vec3<typename StencilT::ValueType>& V)
+    {
+        typedef typename StencilT::ValueType ValueType;
+        typedef math::Vec3<ValueType> Vec3Type;
+        // the index-space result is held as a Vec3d before the map is applied
+        Vec3d isGrad(ISGradientBiased<GradScheme, Vec3Type>::result(stencil, V));
+        return Vec3Type(map.applyIJT(isGrad, stencil.getCenterCoord().asVec3d()));
+    }
+};
+//@}
+
+
+////////////////////////////////////////////////////////
+
+// Computes |Grad[Phi]|^2 (the squared norm, as the struct name says) using upwinding
+template<typename MapType, BiasedGradientScheme GradScheme>
+struct GradientNormSqrd
+{
+    static const DScheme FD = BIAS_SCHEME<GradScheme>::FD;
+    static const DScheme BD = BIAS_SCHEME<GradScheme>::BD;
+
+
+    /// Random-access version: feeds the FD and BD gradients to GodunovsNormSqrd.
+    template<typename Accessor>
+    static typename Accessor::ValueType
+    result(const MapType& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueType;
+        typedef math::Vec3<ValueType> Vec3Type;
+        Vec3Type fdGrad = Gradient<MapType, FD>::result(map, grid, ijk);
+        Vec3Type bdGrad = Gradient<MapType, BD>::result(map, grid, ijk);
+        const bool positive = grid.getValue(ijk) > 0;
+        return math::GodunovsNormSqrd(positive, bdGrad, fdGrad);
+    }
+
+    /// Stencil version: the sign test uses the stencil's <0, 0, 0> value.
+    template<typename StencilT>
+    static typename StencilT::ValueType
+    result(const MapType& map, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType ValueType;
+        typedef math::Vec3<ValueType> Vec3Type;
+        Vec3Type fdGrad = Gradient<MapType, FD>::result(map, stencil);
+        Vec3Type bdGrad = Gradient<MapType, BD>::result(map, stencil);
+        const bool positive = stencil.template getValue<0, 0, 0>() > 0;
+        return math::GodunovsNormSqrd(positive, bdGrad, fdGrad);
+    }
+};
+
+/// Partial specialization of GradientNormSqrd for UniformScaleMap, any bias scheme
+template<BiasedGradientScheme GradScheme>
+struct GradientNormSqrd<UniformScaleMap, GradScheme>
+{
+    /// Random-access version: index-space squared norm times the inverse squared scale.
+    template<typename Accessor>
+    static typename Accessor::ValueType
+    result(const UniformScaleMap& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueType;
+        // only component 0 of the inverse squared scale is read
+        const ValueType invDx2(map.getInvScaleSqr()[0]);
+        return invDx2 * ISGradientNormSqrd<GradScheme>::result(grid, ijk);
+    }
+
+    /// Stencil version: same scaling, values read from @a stencil.
+    template<typename StencilT>
+    static typename StencilT::ValueType
+    result(const UniformScaleMap& map, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType ValueType;
+        // only component 0 of the inverse squared scale is read
+        const ValueType invDx2(map.getInvScaleSqr()[0]);
+        return invDx2 * ISGradientNormSqrd<GradScheme>::result(stencil);
+    }
+};
+
+/// Partial specialization of GradientNormSqrd for UniformScaleTranslateMap, any bias scheme
+template<BiasedGradientScheme GradScheme>
+struct GradientNormSqrd<UniformScaleTranslateMap, GradScheme>
+{
+    /// Random-access version: index-space squared norm times the inverse squared scale.
+    template<typename Accessor>
+    static typename Accessor::ValueType
+    result(const UniformScaleTranslateMap& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueType;
+        // only component 0 of the inverse squared scale is read
+        const ValueType invDx2(map.getInvScaleSqr()[0]);
+        return invDx2 * ISGradientNormSqrd<GradScheme>::result(grid, ijk);
+    }
+
+    /// Stencil version: same scaling, values read from @a stencil.
+    template<typename StencilT>
+    static typename StencilT::ValueType
+    result(const UniformScaleTranslateMap& map, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType ValueType;
+        // only component 0 of the inverse squared scale is read
+        const ValueType invDx2(map.getInvScaleSqr()[0]);
+        return invDx2 * ISGradientNormSqrd<GradScheme>::result(stencil);
+    }
+};
+
+
+//@{
+/// @brief Compute the divergence of a vector-valued grid using differencing
+/// of various orders, the result defined with respect to the range-space of the map.
+template<typename MapType, DScheme DiffScheme>
+struct Divergence
+{
+    /// Random-access version: sums, per component, the matching entry of its mapped gradient.
+    template<typename Accessor> static typename Accessor::ValueType::value_type
+    result(const MapType& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType::value_type ValueType;
+
+        ValueType sum(0);
+        for (int axis = 0; axis != 3; ++axis) {
+            Vec3d isGrad(D1Vec<DiffScheme>::inX(grid, ijk, axis),
+                         D1Vec<DiffScheme>::inY(grid, ijk, axis),
+                         D1Vec<DiffScheme>::inZ(grid, ijk, axis));
+            sum += ValueType(map.applyIJT(isGrad, ijk.asVec3d())[axis]);
+        }
+        return sum;
+    }
+
+    /// Stencil version: the map is applied at the stencil's center coordinate.
+    template<typename StencilT> static typename StencilT::ValueType::value_type
+    result(const MapType& map, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType::value_type ValueType;
+
+        ValueType sum(0);
+        for (int axis = 0; axis != 3; ++axis) {
+            Vec3d isGrad(D1Vec<DiffScheme>::inX(stencil, axis),
+                         D1Vec<DiffScheme>::inY(stencil, axis),
+                         D1Vec<DiffScheme>::inZ(stencil, axis));
+            sum += ValueType(map.applyIJT(isGrad, stencil.getCenterCoord().asVec3d())[axis]);
+        }
+        return sum;
+    }
+};
+
+/// Partial specialization of Divergence for TranslationMap, any scheme.
+/// The map parameter is unnamed and unused in both overloads.
+template<DScheme DiffScheme>
+struct Divergence<TranslationMap, DiffScheme>
+{
+    /// Random-access version
+    template<typename Accessor> static typename Accessor::ValueType::value_type
+    result(const TranslationMap&, const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType::value_type ValueType;
+
+        // the index-space divergence is returned without any scaling
+        const ValueType isDiv = ISDivergence<DiffScheme>::result(grid, ijk);
+        return isDiv;
+    }
+
+    /// Stencil version
+    template<typename StencilT> static typename StencilT::ValueType::value_type
+    result(const TranslationMap&, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType::value_type ValueType;
+
+        // the index-space divergence is returned without any scaling
+        const ValueType isDiv = ISDivergence<DiffScheme>::result(stencil);
+        return isDiv;
+    }
+};
+
+/// Partial specialization of Divergence for UniformScaleMap, any scheme.
+/// The index-space divergence is multiplied by the inverse scale.
+template<DScheme DiffScheme>
+struct Divergence<UniformScaleMap, DiffScheme>
+{
+    /// Random-access version
+    template<typename Accessor> static typename Accessor::ValueType::value_type
+    result(const UniformScaleMap& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType::value_type ValueType;
+
+        // Only component 0 of the inverse scale is read; the scale is
+        // uniform, so one factor serves every axis.
+        const ValueType isDiv = ISDivergence<DiffScheme>::result(grid, ijk);
+        const ValueType invDx(map.getInvScale()[0]);
+        return isDiv * invDx;
+    }
+
+    /// Stencil version
+    template<typename StencilT> static typename StencilT::ValueType::value_type
+    result(const UniformScaleMap& map, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType::value_type ValueType;
+
+        // Only component 0 of the inverse scale is read; the scale is
+        // uniform, so one factor serves every axis.
+        const ValueType isDiv = ISDivergence<DiffScheme>::result(stencil);
+        const ValueType invDx(map.getInvScale()[0]);
+        return isDiv * invDx;
+    }
+};
+
+/// Partial specialization of Divergence for UniformScaleTranslateMap, any scheme.
+/// The index-space divergence is multiplied by the inverse scale.
+template<DScheme DiffScheme>
+struct Divergence<UniformScaleTranslateMap, DiffScheme>
+{
+    /// Random-access version
+    template<typename Accessor> static typename Accessor::ValueType::value_type
+    result(const UniformScaleTranslateMap& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType::value_type ValueType;
+
+        // Only component 0 of the inverse scale is read; the scale is
+        // uniform, so one factor serves every axis.
+        const ValueType isDiv = ISDivergence<DiffScheme>::result(grid, ijk);
+        const ValueType invDx(map.getInvScale()[0]);
+        return isDiv * invDx;
+    }
+
+    /// Stencil version
+    template<typename StencilT> static typename StencilT::ValueType::value_type
+    result(const UniformScaleTranslateMap& map, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType::value_type ValueType;
+
+        // Only component 0 of the inverse scale is read; the scale is
+        // uniform, so one factor serves every axis.
+        const ValueType isDiv = ISDivergence<DiffScheme>::result(stencil);
+        const ValueType invDx(map.getInvScale()[0]);
+        return isDiv * invDx;
+    }
+};
+
+/// Full specialization of Divergence for UniformScaleMap with the CD_2ND scheme.
+/// Uses the CD_2NDT index-space operator together with the inverse twice-scale.
+template<>
+struct Divergence<UniformScaleMap, CD_2ND>
+{
+    /// Random-access version
+    template<typename Accessor> static typename Accessor::ValueType::value_type
+    result(const UniformScaleMap& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType::value_type ValueType;
+
+        // only component 0 of the inverse twice-scale is read
+        const ValueType isDiv = ISDivergence<CD_2NDT>::result(grid, ijk);
+        const ValueType inv2dx(map.getInvTwiceScale()[0]);
+        return isDiv * inv2dx;
+    }
+
+    /// Stencil version
+    template<typename StencilT> static typename StencilT::ValueType::value_type
+    result(const UniformScaleMap& map, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType::value_type ValueType;
+
+        // only component 0 of the inverse twice-scale is read
+        const ValueType isDiv = ISDivergence<CD_2NDT>::result(stencil);
+        const ValueType inv2dx(map.getInvTwiceScale()[0]);
+        return isDiv * inv2dx;
+    }
+};
+
+/// Full specialization of Divergence for UniformScaleTranslateMap with the CD_2ND scheme.
+/// Uses the CD_2NDT index-space operator together with the inverse twice-scale.
+template<>
+struct Divergence<UniformScaleTranslateMap, CD_2ND>
+{
+    /// Random-access version
+    template<typename Accessor> static typename Accessor::ValueType::value_type
+    result(const UniformScaleTranslateMap& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType::value_type ValueType;
+
+        // Only component 0 of the inverse twice-scale is read; the scale
+        // is uniform, so one factor serves every axis.
+        const ValueType isDiv = ISDivergence<CD_2NDT>::result(grid, ijk);
+        const ValueType inv2dx(map.getInvTwiceScale()[0]);
+        return isDiv * inv2dx;
+    }
+
+    /// Stencil version
+    template<typename StencilT> static typename StencilT::ValueType::value_type
+    result(const UniformScaleTranslateMap& map, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType::value_type ValueType;
+
+        // Only component 0 of the inverse twice-scale is read; the scale
+        // is uniform, so one factor serves every axis.
+        const ValueType isDiv = ISDivergence<CD_2NDT>::result(stencil);
+        const ValueType inv2dx(map.getInvTwiceScale()[0]);
+        return isDiv * inv2dx;
+    }
+};
+
+/// Partial specialization of Divergence for ScaleMap, any scheme.
+/// Each axis derivative is multiplied by that axis' inverse scale before summing.
+template<DScheme DiffScheme>
+struct Divergence<ScaleMap, DiffScheme>
+{
+    /// Random-access version
+    template<typename Accessor> static typename Accessor::ValueType::value_type
+    result(const ScaleMap& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType::value_type ValueType;
+        // dFx/dx, dFy/dy and dFz/dz, each with its own inverse scale
+        const Vec3d& invScale = map.getInvScale();
+        return ValueType(
+            D1Vec<DiffScheme>::inX(grid, ijk, 0) * invScale[0] +
+            D1Vec<DiffScheme>::inY(grid, ijk, 1) * invScale[1] +
+            D1Vec<DiffScheme>::inZ(grid, ijk, 2) * invScale[2]);
+    }
+
+    /// Stencil version
+    template<typename StencilT> static typename StencilT::ValueType::value_type
+    result(const ScaleMap& map, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType::value_type ValueType;
+
+        // dFx/dx, dFy/dy and dFz/dz, each with its own inverse scale
+        const Vec3d& invScale = map.getInvScale();
+        return ValueType(
+            D1Vec<DiffScheme>::inX(stencil, 0) * invScale[0] +
+            D1Vec<DiffScheme>::inY(stencil, 1) * invScale[1] +
+            D1Vec<DiffScheme>::inZ(stencil, 2) * invScale[2]);
+    }
+};
+
+/// Partial specialization of Divergence for ScaleTranslateMap, any scheme.
+/// Each axis derivative is multiplied by that axis' inverse scale before summing.
+template<DScheme DiffScheme>
+struct Divergence<ScaleTranslateMap, DiffScheme>
+{
+    /// Random-access version
+    template<typename Accessor> static typename Accessor::ValueType::value_type
+    result(const ScaleTranslateMap& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType::value_type ValueType;
+        // dFx/dx, dFy/dy and dFz/dz, each with its own inverse scale
+        const Vec3d& invScale = map.getInvScale();
+        return ValueType(
+            D1Vec<DiffScheme>::inX(grid, ijk, 0) * invScale[0] +
+            D1Vec<DiffScheme>::inY(grid, ijk, 1) * invScale[1] +
+            D1Vec<DiffScheme>::inZ(grid, ijk, 2) * invScale[2]);
+    }
+
+    /// Stencil version
+    template<typename StencilT> static typename StencilT::ValueType::value_type
+    result(const ScaleTranslateMap& map, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType::value_type ValueType;
+
+        // dFx/dx, dFy/dy and dFz/dz, each with its own inverse scale
+        const Vec3d& invScale = map.getInvScale();
+        return ValueType(
+            D1Vec<DiffScheme>::inX(stencil, 0) * invScale[0] +
+            D1Vec<DiffScheme>::inY(stencil, 1) * invScale[1] +
+            D1Vec<DiffScheme>::inZ(stencil, 2) * invScale[2]);
+    }
+};
+
+/// Full specialization of Divergence for ScaleMap with the CD_2ND scheme.
+/// Uses the CD_2NDT differences together with the per-axis inverse twice-scale.
+template<>
+struct Divergence<ScaleMap, CD_2ND>
+{
+    /// Random-access version
+    template<typename Accessor> static typename Accessor::ValueType::value_type
+    result(const ScaleMap& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType::value_type ValueType;
+        // dFx/dx, dFy/dy and dFz/dz, each with its own inverse twice-scale
+        const Vec3d& inv2dx = map.getInvTwiceScale();
+        return ValueType(
+            D1Vec<CD_2NDT>::inX(grid, ijk, 0) * inv2dx[0] +
+            D1Vec<CD_2NDT>::inY(grid, ijk, 1) * inv2dx[1] +
+            D1Vec<CD_2NDT>::inZ(grid, ijk, 2) * inv2dx[2]);
+    }
+
+    /// Stencil version
+    template<typename StencilT> static typename StencilT::ValueType::value_type
+    result(const ScaleMap& map, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType::value_type ValueType;
+        // dFx/dx, dFy/dy and dFz/dz, each with its own inverse twice-scale
+        const Vec3d& inv2dx = map.getInvTwiceScale();
+        return ValueType(
+            D1Vec<CD_2NDT>::inX(stencil, 0) * inv2dx[0] +
+            D1Vec<CD_2NDT>::inY(stencil, 1) * inv2dx[1] +
+            D1Vec<CD_2NDT>::inZ(stencil, 2) * inv2dx[2]);
+    }
+};
+
+/// Full specialization of Divergence for ScaleTranslateMap with the CD_2ND scheme.
+/// Uses the CD_2NDT differences together with the per-axis inverse twice-scale.
+template<>
+struct Divergence<ScaleTranslateMap, CD_2ND>
+{
+    /// Random-access version
+    template<typename Accessor> static typename Accessor::ValueType::value_type
+    result(const ScaleTranslateMap& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType::value_type ValueType;
+        // dFx/dx, dFy/dy and dFz/dz, each with its own inverse twice-scale
+        const Vec3d& inv2dx = map.getInvTwiceScale();
+        return ValueType(
+            D1Vec<CD_2NDT>::inX(grid, ijk, 0) * inv2dx[0] +
+            D1Vec<CD_2NDT>::inY(grid, ijk, 1) * inv2dx[1] +
+            D1Vec<CD_2NDT>::inZ(grid, ijk, 2) * inv2dx[2]);
+    }
+
+    /// Stencil version
+    template<typename StencilT> static typename StencilT::ValueType::value_type
+    result(const ScaleTranslateMap& map, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType::value_type ValueType;
+        // dFx/dx, dFy/dy and dFz/dz, each with its own inverse twice-scale
+        const Vec3d& inv2dx = map.getInvTwiceScale();
+        return ValueType(
+            D1Vec<CD_2NDT>::inX(stencil, 0) * inv2dx[0] +
+            D1Vec<CD_2NDT>::inY(stencil, 1) * inv2dx[1] +
+            D1Vec<CD_2NDT>::inZ(stencil, 2) * inv2dx[2]);
+    }
+};
+//@}
+
+
+//@{
+/// @brief Compute the curl of a vector-valued grid using differencing
+/// of various orders in the space defined by the range of the map.
+template<typename MapType, DScheme DiffScheme>
+struct Curl
+{
+    /// Random-access version: builds the three mapped component gradients, then combines them.
+    template<typename Accessor> static typename Accessor::ValueType
+    result(const MapType& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType Vec3Type;
+        Vec3Type jac[3];
+        for (int row = 0; row != 3; ++row) {
+            Vec3d isGrad(
+                D1Vec<DiffScheme>::inX(grid, ijk, row),
+                D1Vec<DiffScheme>::inY(grid, ijk, row),
+                D1Vec<DiffScheme>::inZ(grid, ijk, row));
+            // jac[row][col] holds dF_row/dx_col after the map is applied
+            jac[row] = Vec3Type(map.applyIJT(isGrad, ijk.asVec3d()));
+        }
+        return Vec3Type(jac[2][1] - jac[1][2], // x: dF_z/dy - dF_y/dz
+                        jac[0][2] - jac[2][0], // y: dF_x/dz - dF_z/dx
+                        jac[1][0] - jac[0][1]); // z: dF_y/dx - dF_x/dy
+    }
+
+    /// Stencil version: the map is applied at the stencil's center coordinate.
+    template<typename StencilT> static typename StencilT::ValueType
+    result(const MapType& map, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType Vec3Type;
+        Vec3Type jac[3];
+        for (int row = 0; row != 3; ++row) {
+            Vec3d isGrad(
+                D1Vec<DiffScheme>::inX(stencil, row),
+                D1Vec<DiffScheme>::inY(stencil, row),
+                D1Vec<DiffScheme>::inZ(stencil, row));
+            // jac[row][col] holds dF_row/dx_col after the map is applied
+            jac[row] = Vec3Type(map.applyIJT(isGrad, stencil.getCenterCoord().asVec3d()));
+        }
+        return Vec3Type(jac[2][1] - jac[1][2], // x: dF_z/dy - dF_y/dz
+                        jac[0][2] - jac[2][0], // y: dF_x/dz - dF_z/dx
+                        jac[1][0] - jac[0][1]); // z: dF_y/dx - dF_x/dy
+    }
+};
+
+/// Partial specialization of Curl for UniformScaleMap, any scheme
+template<DScheme DiffScheme>
+struct Curl<UniformScaleMap, DiffScheme>
+{
+    /// Random-access version: index-space curl times component 0 of the inverse scale.
+    template<typename Accessor> static typename Accessor::ValueType
+    result(const UniformScaleMap& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType::value_type ValueType;
+        const ValueType invDx(map.getInvScale()[0]);
+        return ISCurl<DiffScheme>::result(grid, ijk) * invDx;
+    }
+
+    /// Stencil version: same scaling, values read from @a stencil.
+    template<typename StencilT> static typename StencilT::ValueType
+    result(const UniformScaleMap& map, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType::value_type ValueType;
+        const ValueType invDx(map.getInvScale()[0]);
+        return ISCurl<DiffScheme>::result(stencil) * invDx;
+    }
+};
+
+/// Partial specialization of Curl for UniformScaleTranslateMap, any scheme
+template<DScheme DiffScheme>
+struct Curl<UniformScaleTranslateMap, DiffScheme>
+{
+    /// Random-access version: index-space curl times component 0 of the inverse scale.
+    template<typename Accessor> static typename Accessor::ValueType
+    result(const UniformScaleTranslateMap& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType::value_type ValueType;
+
+        const ValueType invDx(map.getInvScale()[0]);
+        return ISCurl<DiffScheme>::result(grid, ijk) * invDx;
+    }
+
+    /// Stencil version: same scaling, values read from @a stencil.
+    template<typename StencilT> static typename StencilT::ValueType
+    result(const UniformScaleTranslateMap& map, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType::value_type ValueType;
+
+        const ValueType invDx(map.getInvScale()[0]);
+        return ISCurl<DiffScheme>::result(stencil) * invDx;
+    }
+};
+
+/// Full specialization of Curl for UniformScaleMap with the CD_2ND scheme
+template<>
+struct Curl<UniformScaleMap, CD_2ND>
+{
+    /// Random-access version: CD_2NDT curl times component 0 of the inverse twice-scale.
+    template<typename Accessor> static typename Accessor::ValueType
+    result(const UniformScaleMap& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType::value_type ValueType;
+
+        const ValueType inv2dx(map.getInvTwiceScale()[0]);
+        return ISCurl<CD_2NDT>::result(grid, ijk) * inv2dx;
+    }
+
+    /// Stencil version: same scaling, values read from @a stencil.
+    template<typename StencilT> static typename StencilT::ValueType
+    result(const UniformScaleMap& map, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType::value_type ValueType;
+
+        const ValueType inv2dx(map.getInvTwiceScale()[0]);
+        return ISCurl<CD_2NDT>::result(stencil) * inv2dx;
+    }
+};
+
+/// Full specialization of Curl for UniformScaleTranslateMap with the CD_2ND scheme
+template<>
+struct Curl<UniformScaleTranslateMap, CD_2ND>
+{
+    /// Random-access version: CD_2NDT curl times component 0 of the inverse twice-scale.
+    template<typename Accessor> static typename Accessor::ValueType
+    result(const UniformScaleTranslateMap& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType::value_type ValueType;
+
+        const ValueType inv2dx(map.getInvTwiceScale()[0]);
+        return ISCurl<CD_2NDT>::result(grid, ijk) * inv2dx;
+    }
+
+    /// Stencil version: same scaling, values read from @a stencil.
+    template<typename StencilT> static typename StencilT::ValueType
+    result(const UniformScaleTranslateMap& map, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType::value_type ValueType;
+
+        const ValueType inv2dx(map.getInvTwiceScale()[0]);
+        return ISCurl<CD_2NDT>::result(stencil) * inv2dx;
+    }
+};
+//@}
+
+
+//@{
+/// @brief Compute the Laplacian at a given location in a grid using finite differencing
+/// of various orders.  The result is defined in the range of the map.
+template<typename MapType, DDScheme DiffScheme>
+struct Laplacian
+{
+    /// Random-access version: trace of the second-derivative matrix after the map is applied.
+    template<typename Accessor>
+    static typename Accessor::ValueType result(const MapType& map,
+        const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueType;
+        // pure second derivatives in index space
+        ValueType dxx = D2<DiffScheme>::inX(grid, ijk);
+        ValueType dyy = D2<DiffScheme>::inY(grid, ijk);
+        ValueType dzz = D2<DiffScheme>::inZ(grid, ijk);
+
+        ValueType dxy = D2<DiffScheme>::inXandY(grid, ijk);
+        ValueType dyz = D2<DiffScheme>::inYandZ(grid, ijk);
+        ValueType dxz = D2<DiffScheme>::inXandZ(grid, ijk);
+
+        // symmetric matrix of second derivatives in index space
+        Mat3d isHessian(dxx, dxy, dxz,
+                        dxy, dyy, dyz,
+                        dxz, dyz, dzz);
+
+        Mat3d rsHessian;  // receives the second-derivative matrix in range space
+        if (!is_linear<MapType>::value) {
+            // non-linear maps also take the first derivatives (CD_2ND) and the position
+            Vec3d isGrad(static_cast<double>(D1<CD_2ND>::inX(grid, ijk)),
+                         static_cast<double>(D1<CD_2ND>::inY(grid, ijk)),
+                         static_cast<double>(D1<CD_2ND>::inZ(grid, ijk)));
+
+            rsHessian = map.applyIJC(isHessian, isGrad, ijk.asVec3d());
+        } else {
+            rsHessian = map.applyIJC(isHessian);
+        }
+
+        // the Laplacian is the trace of the range-space matrix
+        return ValueType(rsHessian(0,0) + rsHessian(1,1) + rsHessian(2,2));
+    }
+
+    /// Stencil version: positions come from the stencil's center coordinate.
+    template<typename StencilT>
+    static typename StencilT::ValueType result(const MapType& map, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType ValueType;
+        // pure second derivatives in index space
+        ValueType dxx = D2<DiffScheme>::inX(stencil);
+        ValueType dyy = D2<DiffScheme>::inY(stencil);
+        ValueType dzz = D2<DiffScheme>::inZ(stencil);
+
+        ValueType dxy = D2<DiffScheme>::inXandY(stencil);
+        ValueType dyz = D2<DiffScheme>::inYandZ(stencil);
+        ValueType dxz = D2<DiffScheme>::inXandZ(stencil);
+
+        // symmetric matrix of second derivatives in index space
+        Mat3d isHessian(dxx, dxy, dxz,
+                        dxy, dyy, dyz,
+                        dxz, dyz, dzz);
+
+        Mat3d rsHessian;  // receives the second-derivative matrix in range space
+        if (!is_linear<MapType>::value) {
+            // non-linear maps also take the first derivatives (CD_2ND) and the position
+            Vec3d isGrad(D1<CD_2ND>::inX(stencil),
+                         D1<CD_2ND>::inY(stencil),
+                         D1<CD_2ND>::inZ(stencil) );
+
+            rsHessian = map.applyIJC(isHessian, isGrad, stencil.getCenterCoord().asVec3d());
+        } else {
+            rsHessian = map.applyIJC(isHessian);
+        }
+
+        // the Laplacian is the trace of the range-space matrix
+        return ValueType(rsHessian(0,0) + rsHessian(1,1) + rsHessian(2,2));
+    }
+};
+
+
+template<DDScheme DiffScheme>
+struct Laplacian<TranslationMap, DiffScheme>
+{
+    /// Random-access version.  The map parameter is unnamed and unused:
+    /// the index-space Laplacian is returned as is.
+    template<typename Accessor> static typename Accessor::ValueType
+    result(const TranslationMap&, const Accessor& grid, const Coord& ijk)
+    {
+        return ISLaplacian<DiffScheme>::result(grid, ijk);
+    }
+
+    /// Stencil version; the map parameter is likewise unused.
+    template<typename StencilT> static typename StencilT::ValueType
+    result(const TranslationMap&, const StencilT& stencil)
+    {
+        return ISLaplacian<DiffScheme>::result(stencil);
+    }
+};
+
+
+// The Laplacian is invariant to rotation or reflection, so the map is ignored.
+template<DDScheme DiffScheme>
+struct Laplacian<UnitaryMap, DiffScheme>
+{
+    /// Random-access version.  The map parameter is unnamed and unused:
+    /// the index-space Laplacian is returned as is.
+    template<typename Accessor> static typename Accessor::ValueType
+    result(const UnitaryMap&, const Accessor& grid, const Coord& ijk)
+    {
+        return ISLaplacian<DiffScheme>::result(grid, ijk);
+    }
+
+    /// Stencil version; the map parameter is likewise unused.
+    template<typename StencilT> static typename StencilT::ValueType
+    result(const UnitaryMap&, const StencilT& stencil)
+    {
+        return ISLaplacian<DiffScheme>::result(stencil);
+    }
+};
+
+
+template<DDScheme DiffScheme>
+struct Laplacian<UniformScaleMap, DiffScheme>
+{
+    /// Random-access version: index-space Laplacian times component 0 of the inverse squared scale.
+    template<typename Accessor> static typename Accessor::ValueType
+    result(const UniformScaleMap& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueType;
+        const ValueType invDx2(map.getInvScaleSqr()[0]);
+        return ISLaplacian<DiffScheme>::result(grid, ijk) * invDx2;
+    }
+
+    /// Stencil version: same scaling, values read from @a stencil.
+    template<typename StencilT> static typename StencilT::ValueType
+    result(const UniformScaleMap& map, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType ValueType;
+        const ValueType invDx2(map.getInvScaleSqr()[0]);
+        return ISLaplacian<DiffScheme>::result(stencil) * invDx2;
+    }
+};
+
+
+template<DDScheme DiffScheme>
+struct Laplacian<UniformScaleTranslateMap, DiffScheme>
+{
+    /// Random-access version: index-space Laplacian times component 0 of the inverse squared scale.
+    template<typename Accessor> static typename Accessor::ValueType
+    result(const UniformScaleTranslateMap& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueType;
+        const ValueType invDx2(map.getInvScaleSqr()[0]);
+        return ISLaplacian<DiffScheme>::result(grid, ijk) * invDx2;
+    }
+
+    /// Stencil version: same scaling, values read from @a stencil.
+    template<typename StencilT> static typename StencilT::ValueType
+    result(const UniformScaleTranslateMap& map, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType ValueType;
+        const ValueType invDx2(map.getInvScaleSqr()[0]);
+        return ISLaplacian<DiffScheme>::result(stencil) * invDx2;
+    }
+};
+
+
+template<DDScheme DiffScheme>
+struct Laplacian<ScaleMap, DiffScheme>
+{
+    /// Random-access version: sum of the per-axis second derivatives, each scaled separately.
+    template<typename Accessor> static typename Accessor::ValueType
+    result(const ScaleMap& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueType;
+
+        // per-axis factors 1/dx^2, 1/dy^2, 1/dz^2
+        const Vec3d& invSqr = map.getInvScaleSqr();
+        // pure second derivatives in index space
+        const ValueType dxx = D2<DiffScheme>::inX(grid, ijk);
+        const ValueType dyy = D2<DiffScheme>::inY(grid, ijk);
+        const ValueType dzz = D2<DiffScheme>::inZ(grid, ijk);
+        return ValueType(dxx * invSqr[0] + dyy * invSqr[1] + dzz * invSqr[2]);
+    }
+
+    /// Stencil version: same computation with values read from @a stencil.
+    template<typename StencilT> static typename StencilT::ValueType
+    result(const ScaleMap& map, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType ValueType;
+
+        // per-axis factors 1/dx^2, 1/dy^2, 1/dz^2
+        const Vec3d& invSqr = map.getInvScaleSqr();
+        // pure second derivatives in index space
+        const ValueType dxx = D2<DiffScheme>::inX(stencil);
+        const ValueType dyy = D2<DiffScheme>::inY(stencil);
+        const ValueType dzz = D2<DiffScheme>::inZ(stencil);
+        return ValueType(dxx * invSqr[0] + dyy * invSqr[1] + dzz * invSqr[2]);
+    }
+};
+
+
+template<DDScheme DiffScheme>
+struct Laplacian<ScaleTranslateMap, DiffScheme>
+{
+    /// Random-access version: sum of the per-axis second derivatives, each scaled separately.
+    template<typename Accessor> static typename Accessor::ValueType
+    result(const ScaleTranslateMap& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueType;
+        // per-axis factors 1/dx^2, 1/dy^2, 1/dz^2
+        const Vec3d& invSqr = map.getInvScaleSqr();
+        // pure second derivatives in index space
+        const ValueType dxx = D2<DiffScheme>::inX(grid, ijk);
+        const ValueType dyy = D2<DiffScheme>::inY(grid, ijk);
+        const ValueType dzz = D2<DiffScheme>::inZ(grid, ijk);
+        return ValueType(dxx * invSqr[0] + dyy * invSqr[1] + dzz * invSqr[2]);
+    }
+
+    /// Stencil version: same computation with values read from @a stencil.
+    template<typename StencilT> static typename StencilT::ValueType
+    result(const ScaleTranslateMap& map, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType ValueType;
+        // per-axis factors 1/dx^2, 1/dy^2, 1/dz^2
+        const Vec3d& invSqr = map.getInvScaleSqr();
+        // pure second derivatives in index space
+        const ValueType dxx = D2<DiffScheme>::inX(stencil);
+        const ValueType dyy = D2<DiffScheme>::inY(stencil);
+        const ValueType dzz = D2<DiffScheme>::inZ(stencil);
+        return ValueType(dxx * invSqr[0] + dyy * invSqr[1] + dzz * invSqr[2]);
+    }
+};
+
+
+/// @brief Closest-point transform of a level set.
+/// @return the point closest to the surface the level set was derived from,
+/// expressed in the domain space of the map (e.g., voxel space).
+template<typename MapType, DScheme DiffScheme>
+struct CPT
+{
+    /// Accessor overload: evaluates the transform at voxel @a ijk.
+    template<typename Accessor> static math::Vec3<typename Accessor::ValueType>
+    result(const MapType& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueType;
+        typedef Vec3<ValueType>              Vec3Type;
+        // Value stored at this voxel, taken to be the distance to the surface.
+        ValueType dist = grid.getValue(ijk);
+        // The physical-space gradient of a distance field is a unit normal,
+        // so scaling it by the distance gives the offset from the surface.
+        Vec3d offset(dist*Gradient<MapType,DiffScheme>::result(map, grid, ijk));
+        if (is_linear<MapType>::value) {
+            // Linear map: inverse-map the offset and subtract it from ijk.
+            Vec3d closest = ijk.asVec3d() - map.applyInverseMap(offset);
+            return Vec3Type(closest);
+        }
+        // Otherwise subtract in the map's range, then inverse-map the point.
+        Vec3d location = map.applyMap(ijk.asVec3d());
+        Vec3d closest = map.applyInverseMap(location - offset);
+        return Vec3Type(closest);
+    }
+
+    /// Stencil overload: evaluates the transform at the stencil's center.
+    template<typename StencilT> static math::Vec3<typename StencilT::ValueType>
+    result(const MapType& map, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType ValueType;
+        typedef Vec3<ValueType>              Vec3Type;
+        // Value at the stencil center, taken to be the distance to the surface.
+        ValueType dist = stencil.template getValue<0, 0, 0>();
+        // The physical-space gradient of a distance field is a unit normal,
+        // so scaling it by the distance gives the offset from the surface.
+        Vec3d offset(dist*Gradient<MapType, DiffScheme>::result(map, stencil));
+        if (is_linear<MapType>::value) {
+            // Linear map: inverse-map the offset and subtract it from the center.
+            Vec3d closest = stencil.getCenterCoord().asVec3d()
+                - map.applyInverseMap(offset);
+            return Vec3Type(closest);
+        }
+        // Otherwise subtract in the map's range, then inverse-map the point.
+        Vec3d location = map.applyMap(stencil.getCenterCoord().asVec3d());
+        Vec3d closest = map.applyInverseMap(location - offset);
+        return Vec3Type(closest);
+    }
+};
+
+
+/// @brief Closest-point transform of a level set.
+/// @return the point closest to the surface the level set was derived from,
+/// expressed in the range space of the map (e.g., in world space)
+template<typename MapType, DScheme DiffScheme>
+struct CPT_RANGE
+{
+    /// Accessor overload: evaluates the transform at voxel @a ijk.
+    template<typename Accessor> static Vec3<typename Accessor::ValueType>
+    result(const MapType& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueType;
+        typedef Vec3<ValueType>              Vec3Type;
+        // Value stored at this voxel, taken to be the distance to the surface.
+        ValueType dist = grid.getValue(ijk);
+        // The physical-space gradient of a distance field is a unit normal,
+        // so scaling it by the distance gives the offset from the surface.
+        Vec3Type offset =
+            dist*Gradient<MapType,DiffScheme>::result(map, grid, ijk);
+        // Step back from the voxel's mapped position by that offset.
+        Vec3d closest = map.applyMap(ijk.asVec3d()) - offset;
+        return Vec3Type(closest);
+    }
+
+    /// Stencil overload: evaluates the transform at the stencil's center.
+    template<typename StencilT> static Vec3<typename StencilT::ValueType>
+    result(const MapType& map, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType ValueType;
+        typedef Vec3<ValueType>              Vec3Type;
+        // Value at the stencil center, taken to be the distance to the surface.
+        ValueType dist = stencil.template getValue<0, 0, 0>();
+        // The physical-space gradient of a distance field is a unit normal,
+        // so scaling it by the distance gives the offset from the surface.
+        Vec3Type offset =
+            dist*Gradient<MapType, DiffScheme>::result(map, stencil);
+        // Step back from the center's mapped position by that offset.
+        Vec3d closest = map.applyMap(stencil.getCenterCoord().asVec3d()) - offset;
+        return Vec3Type(closest);
+    }
+};
+
+
+/// @brief Compute the mean curvature.
+/// @return the mean curvature in two parts: @c alpha is the numerator in
+/// @f$\nabla \cdot (\nabla \phi / |\nabla \phi|)@f$, and @c beta is @f$|\nabla \phi|@f$.
+template<typename MapType, DDScheme DiffScheme2, DScheme DiffScheme1>
+struct MeanCurvature
+{
+    /// @brief random access version
+    /// @return true if the gradient is non-zero (to within tolerance), in which case the
+    /// mean curvature is computed as two parts: @c alpha is the numerator in
+    /// @f$\nabla \cdot (\nabla \phi / |\nabla \phi|)@f$, and @c beta is @f$|\nabla \phi|@f$.
+    template<typename Accessor>
+    static bool compute(const MapType& map, const Accessor& grid, const Coord& ijk,
+                        double& alpha, double& beta)
+    {
+        typedef typename Accessor::ValueType ValueType;
+
+         // compute the gradient in index and world space
+         Vec3d d1_is(static_cast<double>(D1<DiffScheme1>::inX(grid, ijk)),
+                     static_cast<double>(D1<DiffScheme1>::inY(grid, ijk)),
+                     static_cast<double>(D1<DiffScheme1>::inZ(grid, ijk))), d1_ws;
+         if (is_linear<MapType>::value) {//resolved at compiletime
+             d1_ws = map.applyIJT(d1_is);
+         } else {
+             d1_ws = map.applyIJT(d1_is, ijk.asVec3d());
+         }
+         const double Dx2 = d1_ws(0)*d1_ws(0);
+         const double Dy2 = d1_ws(1)*d1_ws(1);
+         const double Dz2 = d1_ws(2)*d1_ws(2);
+         const double normGrad = Dx2 + Dy2 + Dz2; // squared gradient magnitude
+         if (normGrad <= math::Tolerance<double>::value()) {
+             alpha = beta = 0; // both outputs are zeroed when no curvature is computed
+             return false;
+         }
+
+         // all the second derivatives in index space
+         ValueType iddx  = D2<DiffScheme2>::inX(grid, ijk);
+         ValueType iddy  = D2<DiffScheme2>::inY(grid, ijk);
+         ValueType iddz  = D2<DiffScheme2>::inZ(grid, ijk);
+
+         ValueType iddxy = D2<DiffScheme2>::inXandY(grid, ijk);
+         ValueType iddyz = D2<DiffScheme2>::inYandZ(grid, ijk);
+         ValueType iddxz = D2<DiffScheme2>::inXandZ(grid, ijk);
+
+         // second derivatives in index space, as a symmetric matrix
+         Mat3d  d2_is(iddx,  iddxy, iddxz,
+                      iddxy, iddy,  iddyz,
+                      iddxz, iddyz, iddz);
+
+         // convert second derivatives to world space
+         Mat3d d2_ws;
+         if (is_linear<MapType>::value) {//resolved at compiletime
+             d2_ws = map.applyIJC(d2_is);
+         } else {
+             d2_ws = map.applyIJC(d2_is, d1_is, ijk.asVec3d());
+         }
+
+         // assemble the numerator and denominator for mean curvature
+         alpha = (Dx2*(d2_ws(1,1)+d2_ws(2,2))+Dy2*(d2_ws(0,0)+d2_ws(2,2))
+                  +Dz2*(d2_ws(0,0)+d2_ws(1,1))
+                  -2*(d1_ws(0)*(d1_ws(1)*d2_ws(0,1)+d1_ws(2)*d2_ws(0,2))
+                      +d1_ws(1)*d1_ws(2)*d2_ws(1,2)));
+         beta = std::sqrt(normGrad); // * 1/dx
+         return true;
+    }
+
+    /// @return the mean curvature @c alpha/(2*beta^3), or zero if compute() returns false
+    template<typename Accessor> static typename Accessor::ValueType
+    result(const MapType& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueType;
+        double alpha, beta;
+        return compute(map, grid, ijk, alpha, beta) ?
+               ValueType(alpha/(2. *math::Pow3(beta))) : 0;
+    }
+
+    /// @return @c alpha/(2*beta^2), or zero if compute() returns false
+    template<typename Accessor> static typename Accessor::ValueType
+    normGrad(const MapType& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType ValueType;
+        double alpha, beta;
+        return compute(map, grid, ijk, alpha, beta) ?
+               ValueType(alpha/(2. *math::Pow2(beta))) : 0;
+    }
+
+    /// @brief stencil access version
+    /// @return true if the gradient is non-zero (to within tolerance), in which case the
+    /// mean curvature is computed as two parts: @c alpha is the numerator in
+    /// @f$\nabla \cdot (\nabla \phi / |\nabla \phi|)@f$, and @c beta is @f$|\nabla \phi|@f$.
+    template<typename StencilT>
+    static bool compute(const MapType& map, const StencilT& stencil,
+                        double& alpha, double& beta)
+    {
+        typedef typename StencilT::ValueType  ValueType;
+
+         // compute the gradient in index and world space
+         Vec3d d1_is(D1<DiffScheme1>::inX(stencil),
+                     D1<DiffScheme1>::inY(stencil),
+                     D1<DiffScheme1>::inZ(stencil) ), d1_ws;
+         if (is_linear<MapType>::value) {//resolved at compiletime
+             d1_ws = map.applyIJT(d1_is);
+         } else {
+             d1_ws = map.applyIJT(d1_is, stencil.getCenterCoord().asVec3d());
+         }
+         const double Dx2 = d1_ws(0)*d1_ws(0);
+         const double Dy2 = d1_ws(1)*d1_ws(1);
+         const double Dz2 = d1_ws(2)*d1_ws(2);
+         const double normGrad = Dx2 + Dy2 + Dz2; // squared gradient magnitude
+         if (normGrad <= math::Tolerance<double>::value()) {
+             alpha = beta = 0; // both outputs are zeroed when no curvature is computed
+             return false;
+         }
+
+         // all the second derivatives in index space
+         ValueType iddx  = D2<DiffScheme2>::inX(stencil);
+         ValueType iddy  = D2<DiffScheme2>::inY(stencil);
+         ValueType iddz  = D2<DiffScheme2>::inZ(stencil);
+
+         ValueType iddxy = D2<DiffScheme2>::inXandY(stencil);
+         ValueType iddyz = D2<DiffScheme2>::inYandZ(stencil);
+         ValueType iddxz = D2<DiffScheme2>::inXandZ(stencil);
+
+         // second derivatives in index space, as a symmetric matrix
+         Mat3d  d2_is(iddx,  iddxy, iddxz,
+                      iddxy, iddy,  iddyz,
+                      iddxz, iddyz, iddz);
+
+         // convert second derivatives to world space
+         Mat3d d2_ws;
+         if (is_linear<MapType>::value) {//resolved at compiletime
+             d2_ws = map.applyIJC(d2_is);
+         } else {
+             d2_ws = map.applyIJC(d2_is, d1_is, stencil.getCenterCoord().asVec3d());
+         }
+
+         // assemble the numerator and denominator for mean curvature
+         alpha = (Dx2*(d2_ws(1,1)+d2_ws(2,2))+Dy2*(d2_ws(0,0)+d2_ws(2,2))
+                  +Dz2*(d2_ws(0,0)+d2_ws(1,1))
+                  -2*(d1_ws(0)*(d1_ws(1)*d2_ws(0,1)+d1_ws(2)*d2_ws(0,2))
+                      +d1_ws(1)*d1_ws(2)*d2_ws(1,2)));
+         beta  = std::sqrt(normGrad); // * 1/dx
+         return true;
+    }
+
+    /// @return the mean curvature @c alpha/(2*beta^3), or zero if compute() returns false
+    template<typename StencilT> static typename StencilT::ValueType
+    result(const MapType& map, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType ValueType;
+        double alpha, beta;
+        return compute(map, stencil, alpha, beta) ?
+               ValueType(alpha/(2*math::Pow3(beta))) : 0;
+    }
+
+    /// @return @c alpha/(2*beta^2), or zero if compute() returns false
+    template<typename StencilT> static typename StencilT::ValueType
+    normGrad(const MapType& map, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType ValueType;
+        double alpha, beta;
+        return compute(map, stencil, alpha, beta) ? ValueType(alpha/(2*math::Pow2(beta))) : 0;
+    }
+};
+
+
+template<DDScheme DiffScheme2, DScheme DiffScheme1>
+struct MeanCurvature<TranslationMap, DiffScheme2, DiffScheme1>
+{
+    /// Accessor overload; the map argument is not used.
+    template<typename Accessor>
+    static typename Accessor::ValueType result(const TranslationMap&,
+        const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType              ValueType;
+        typedef ISMeanCurvature<DiffScheme2, DiffScheme1> IndexSpaceOp;
+        ValueType alpha, beta;
+        if (!IndexSpaceOp::result(grid, ijk, alpha, beta)) return 0;
+        return ValueType(alpha /(2*math::Pow3(beta)));
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType normGrad(const TranslationMap&,
+        const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType              ValueType;
+        typedef ISMeanCurvature<DiffScheme2, DiffScheme1> IndexSpaceOp;
+        ValueType alpha, beta;
+        if (!IndexSpaceOp::result(grid, ijk, alpha, beta)) return 0;
+        return ValueType(alpha/(2*math::Pow2(beta)));
+    }
+
+    /// Stencil overload; the map argument is not used.
+    template<typename StencilT>
+    static typename StencilT::ValueType result(const TranslationMap&, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType              ValueType;
+        typedef ISMeanCurvature<DiffScheme2, DiffScheme1> IndexSpaceOp;
+        ValueType alpha, beta;
+        if (!IndexSpaceOp::result(stencil, alpha, beta)) return 0;
+        return ValueType(alpha /(2*math::Pow3(beta)));
+    }
+
+    template<typename StencilT>
+    static typename StencilT::ValueType normGrad(const TranslationMap&, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType              ValueType;
+        typedef ISMeanCurvature<DiffScheme2, DiffScheme1> IndexSpaceOp;
+        ValueType alpha, beta;
+        if (!IndexSpaceOp::result(stencil, alpha, beta)) return 0;
+        return ValueType(alpha/(2*math::Pow2(beta)));
+    }
+};
+
+
+template<DDScheme DiffScheme2, DScheme DiffScheme1>
+struct MeanCurvature<UniformScaleMap, DiffScheme2, DiffScheme1>
+{
+    /// Accessor overload: mean curvature at voxel @a ijk.
+    template<typename Accessor>
+    static typename Accessor::ValueType result(const UniformScaleMap& map,
+        const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType              ValueType;
+        typedef ISMeanCurvature<DiffScheme2, DiffScheme1> IndexSpaceOp;
+
+        ValueType alpha, beta;
+        if (!IndexSpaceOp::result(grid, ijk, alpha, beta)) return 0;
+        // Rescale the index-space terms by the map's 1/(2*dx); only component 0 is read.
+        ValueType inv2dx = ValueType(map.getInvTwiceScale()[0]);
+        return ValueType(alpha*inv2dx/math::Pow3(beta));
+    }
+
+    template<typename Accessor>
+    static typename Accessor::ValueType normGrad(const UniformScaleMap& map,
+        const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType              ValueType;
+        typedef ISMeanCurvature<DiffScheme2, DiffScheme1> IndexSpaceOp;
+
+        ValueType alpha, beta;
+        if (!IndexSpaceOp::result(grid, ijk, alpha, beta)) return 0;
+        // Rescale the index-space terms by the map's 1/dx^2; only component 0 is read.
+        ValueType invdxdx = ValueType(map.getInvScaleSqr()[0]);
+        return ValueType(alpha*invdxdx/(2*math::Pow2(beta)));
+    }
+
+    /// Stencil overload: mean curvature from the values held by @a stencil.
+    template<typename StencilT>
+    static typename StencilT::ValueType result(const UniformScaleMap& map, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType              ValueType;
+        typedef ISMeanCurvature<DiffScheme2, DiffScheme1> IndexSpaceOp;
+
+        ValueType alpha, beta;
+        if (!IndexSpaceOp::result(stencil, alpha, beta)) return 0;
+        // Rescale the index-space terms by the map's 1/(2*dx); only component 0 is read.
+        ValueType inv2dx = ValueType(map.getInvTwiceScale()[0]);
+        return ValueType(alpha*inv2dx/math::Pow3(beta));
+    }
+
+    template<typename StencilT>
+    static typename StencilT::ValueType normGrad(const UniformScaleMap& map, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType              ValueType;
+        typedef ISMeanCurvature<DiffScheme2, DiffScheme1> IndexSpaceOp;
+
+        ValueType alpha, beta;
+        if (!IndexSpaceOp::result(stencil, alpha, beta)) return 0;
+        // Rescale the index-space terms by the map's 1/dx^2; only component 0 is read.
+        ValueType invdxdx = ValueType(map.getInvScaleSqr()[0]);
+        return ValueType(alpha*invdxdx/(2*math::Pow2(beta)));
+    }
+};
+
+
+template<DDScheme DiffScheme2, DScheme DiffScheme1>
+struct MeanCurvature<UniformScaleTranslateMap, DiffScheme2, DiffScheme1>
+{
+    /// Accessor overload: mean curvature at voxel @a ijk.
+    template<typename Accessor> static typename Accessor::ValueType
+    result(const UniformScaleTranslateMap& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType              ValueType;
+        typedef ISMeanCurvature<DiffScheme2, DiffScheme1> IndexSpaceOp;
+
+        ValueType alpha, beta;
+        if (!IndexSpaceOp::result(grid, ijk, alpha, beta)) return 0;
+        // Rescale the index-space terms by the map's 1/(2*dx); only component 0 is read.
+        ValueType inv2dx = ValueType(map.getInvTwiceScale()[0]);
+        return ValueType(alpha*inv2dx/math::Pow3(beta));
+    }
+
+    template<typename Accessor> static typename Accessor::ValueType
+    normGrad(const UniformScaleTranslateMap& map, const Accessor& grid, const Coord& ijk)
+    {
+        typedef typename Accessor::ValueType              ValueType;
+        typedef ISMeanCurvature<DiffScheme2, DiffScheme1> IndexSpaceOp;
+
+        ValueType alpha, beta;
+        if (!IndexSpaceOp::result(grid, ijk, alpha, beta)) return 0;
+        // Rescale the index-space terms by the map's 1/dx^2; only component 0 is read.
+        ValueType invdxdx = ValueType(map.getInvScaleSqr()[0]);
+        return ValueType(alpha*invdxdx/(2*math::Pow2(beta)));
+    }
+
+    /// Stencil overload: mean curvature from the values held by @a stencil.
+    template<typename StencilT> static typename StencilT::ValueType
+    result(const UniformScaleTranslateMap& map, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType              ValueType;
+        typedef ISMeanCurvature<DiffScheme2, DiffScheme1> IndexSpaceOp;
+
+        ValueType alpha, beta;
+        if (!IndexSpaceOp::result(stencil, alpha, beta)) return 0;
+        // Rescale the index-space terms by the map's 1/(2*dx); only component 0 is read.
+        ValueType inv2dx = ValueType(map.getInvTwiceScale()[0]);
+        return ValueType(alpha*inv2dx/math::Pow3(beta));
+    }
+
+    template<typename StencilT> static typename StencilT::ValueType
+    normGrad(const UniformScaleTranslateMap& map, const StencilT& stencil)
+    {
+        typedef typename StencilT::ValueType              ValueType;
+        typedef ISMeanCurvature<DiffScheme2, DiffScheme1> IndexSpaceOp;
+
+        ValueType alpha, beta;
+        if (!IndexSpaceOp::result(stencil, alpha, beta)) return 0;
+        // Rescale the index-space terms by the map's 1/dx^2; only component 0 is read.
+        ValueType invdxdx = ValueType(map.getInvScaleSqr()[0]);
+        return ValueType(alpha*invdxdx/(2*math::Pow2(beta)));
+    }
+};
+
+
+/// @brief A wrapper that holds a MapBase::ConstPtr and exposes a reduced set
+/// of functionality needed by the mathematical operators.
+/// @details This may be used in some <tt>Map</tt>-templated code, when the overhead of
+/// actually resolving the @c Map type is large compared to the map work to be done.
+class GenericMap
+{
+public:
+    template<typename GridType>
+    GenericMap(const GridType& g): mMap(g.transform().baseMap()) {}
+    // The other constructors take the map from a Transform or directly as a pointer.
+    GenericMap(const Transform& t): mMap(t.baseMap()) {}
+    GenericMap(MapBase::Ptr map): mMap(boost::const_pointer_cast<const MapBase>(map)) {}
+    GenericMap(MapBase::ConstPtr map): mMap(map) {}
+    ~GenericMap() {}
+    // Every method below forwards to the identically named method of the held map.
+    Vec3d applyMap(const Vec3d& in) const { return mMap->applyMap(in); }
+    Vec3d applyInverseMap(const Vec3d& in) const { return mMap->applyInverseMap(in); }
+
+    Vec3d applyIJT(const Vec3d& in) const { return mMap->applyIJT(in); }
+    Vec3d applyIJT(const Vec3d& in, const Vec3d& pos) const { return mMap->applyIJT(in, pos); }
+    Mat3d applyIJC(const Mat3d& m) const { return mMap->applyIJC(m); }
+    Mat3d applyIJC(const Mat3d& m, const Vec3d& v, const Vec3d& pos) const
+        { return mMap->applyIJC(m,v,pos); }
+
+    double determinant() const { return mMap->determinant(); }
+    double determinant(const Vec3d& in) const { return mMap->determinant(in); }
+
+    Vec3d voxelSize() const { return mMap->voxelSize(); }
+    Vec3d voxelSize(const Vec3d&v) const { return mMap->voxelSize(v); }
+
+private:
+    MapBase::ConstPtr mMap; // the wrapped map; all calls above go through this pointer
+};
+
+} // end math namespace
+} // namespace OPENVDB_VERSION_NAME
+} // end openvdb namespace
+
+#endif // OPENVDB_MATH_OPERATORS_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/math/Proximity.h b/nuparu/include/openvdb_new/math/Proximity.h
new file mode 100644
index 00000000..37110936
--- /dev/null
+++ b/nuparu/include/openvdb_new/math/Proximity.h
@@ -0,0 +1,79 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_MATH_PROXIMITY_HAS_BEEN_INCLUDED
+#define OPENVDB_MATH_PROXIMITY_HAS_BEEN_INCLUDED
+
+#include <openvdb/Types.h>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace math {
+
+/// @brief Closest Point on Triangle to Point. Given a triangle @c abc and a point @c p,
+/// return the point on @c abc closest to @c p and the corresponding barycentric coordinates.
+///
+/// @details Algorithms from "Real-Time Collision Detection" pg 136 to 142 by Christer Ericson.
+/// The closest point is obtained by first determining which of the triangle's
+/// Voronoi feature regions @c p is in and then computing the orthogonal projection
+/// of @c p onto the corresponding feature.
+///
+/// @param a    The triangle's first vertex point.
+/// @param b    The triangle's second vertex point.
+/// @param c    The triangle's third vertex point.
+/// @param p    Point to compute the closest point on @c abc for.
+/// @param uvw  Output: barycentric coordinates of the returned point, computed by the call.
+OPENVDB_API Vec3d
+closestPointOnTriangleToPoint(
+    const Vec3d& a, const Vec3d& b, const Vec3d& c, const Vec3d& p, Vec3d& uvw);
+
+
+/// @brief  Closest Point on Line Segment to Point. Given segment @c ab and point @c p,
+/// return the point on @c ab closest to @c p, and set @c t to the parametric distance to @c b.
+///
+/// @param a    The segment's first vertex point.
+/// @param b    The segment's second vertex point.
+/// @param p    Point to compute the closest point on @c ab for.
+/// @param t    Output: parametric distance to @c b, computed by the call.
+OPENVDB_API Vec3d
+closestPointOnSegmentToPoint(
+    const Vec3d& a, const Vec3d& b, const Vec3d& p, double& t);
+
+} // namespace math
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_MATH_PROXIMITY_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/math/QuantizedUnitVec.h b/nuparu/include/openvdb_new/math/QuantizedUnitVec.h
new file mode 100644
index 00000000..09a81ad8
--- /dev/null
+++ b/nuparu/include/openvdb_new/math/QuantizedUnitVec.h
@@ -0,0 +1,166 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_MATH_QUANTIZED_UNIT_VEC_HAS_BEEN_INCLUDED
+#define OPENVDB_MATH_QUANTIZED_UNIT_VEC_HAS_BEEN_INCLUDED
+
+#include <openvdb/Platform.h>
+#include <openvdb/version.h>
+#include "Vec3.h"
+#include <tbb/atomic.h>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace math {
+
+
+// Bit compression method that efficiently represents a unit vector using
+// 2 bytes i.e. 16 bits of data by only storing two quantized components.
+// Based on "Higher Accuracy Quantized Normals" article from GameDev.Net LLC, 2000
+
+class OPENVDB_API QuantizedUnitVec
+{
+public:
+    // pack() encodes a vector into a 16-bit word; unpack() decodes such a word.
+    template <typename T>
+    static uint16_t pack(const Vec3<T>& vec);
+    static Vec3s unpack(const uint16_t data);
+    // Inverts the three sign bits of a packed word in place.
+    static void flipSignBits(uint16_t&);
+
+private:
+    QuantizedUnitVec() {} // private constructor; the public interface is all static
+
+    // threadsafe initialization function for the normalization weights.
+    static void init();
+
+    // bit masks: three sign bits on top, then a 6-bit x-slot and a 7-bit y-slot
+    static const uint16_t MASK_SLOTS = 0x1FFF; // 0001111111111111
+    static const uint16_t MASK_XSLOT = 0x1F80; // 0001111110000000
+    static const uint16_t MASK_YSLOT = 0x007F; // 0000000001111111
+    static const uint16_t MASK_XSIGN = 0x8000; // 1000000000000000
+    static const uint16_t MASK_YSIGN = 0x4000; // 0100000000000000
+    static const uint16_t MASK_ZSIGN = 0x2000; // 0010000000000000
+
+    // initialization flag, tested by unpack() before it calls init().
+    static bool sInitialized;
+
+    // normalization weights, 32 kilobytes; indexed by the 13 slot bits of a packed word.
+    static float sNormalizationWeights[MASK_SLOTS + 1];
+}; // class QuantizedUnitVec
+
+
+////////////////////////////////////////
+
+
+template <typename T>
+inline uint16_t
+QuantizedUnitVec::pack(const Vec3<T>& vec)
+{
+    // A zero vector packs to 0.
+    if (math::isZero(vec)) return 0;
+
+    uint16_t signBits = 0;
+    T ax(vec[0]), ay(vec[1]), az(vec[2]);
+
+    // Record each negative component in its own sign bit and carry on
+    // with the magnitudes only.
+    if (ax < T(0.0)) { signBits |= MASK_XSIGN; ax = -ax; }
+    if (ay < T(0.0)) { signBits |= MASK_YSIGN; ay = -ay; }
+    if (az < T(0.0)) { signBits |= MASK_ZSIGN; az = -az; }
+
+    // Quantize x and y into the 0 to 126 range; the z component
+    // is not stored.
+    T scale = T(126.0) / (ax + ay + az);
+    uint16_t qx = static_cast<uint16_t>((ax * scale));
+    uint16_t qy = static_cast<uint16_t>((ay * scale));
+
+    // Thirteen bits remain in the 16-bit word: a 6-bit x-slot followed by
+    // a 7-bit y-slot. Both values can still be represented using
+    // (2^7 - 1) quantization levels.
+
+    // When qx needs more than 6 bits, store the complement of both values.
+    // (qx + qy < 127, thus if qx > 63 => qy <= 63)
+    if (qx > 63) {
+        qx = static_cast<uint16_t>(127 - qx);
+        qy = static_cast<uint16_t>(127 - qy);
+    }
+
+    // Merge the sign bits with the x-slot (shifted up by 7 bits) and the
+    // y-slot; the cast keeps only the low 16 bits of the promoted result.
+    return static_cast<uint16_t>(signBits | (qx << 7) | qy);
+}
+
+
+inline Vec3s
+QuantizedUnitVec::unpack(const uint16_t data)
+{
+    // Run init() if the initialization flag is not yet set.
+    if (!sInitialized) init();
+    // Split the 13 slot bits into the stored x and y values.
+    uint16_t qx = static_cast<uint16_t>((data & MASK_XSLOT) >> 7);
+    uint16_t qy = static_cast<uint16_t>(data & MASK_YSLOT);
+    // The same 13 bits index the table of normalization weights.
+    const float weight = sNormalizationWeights[data & MASK_SLOTS];
+
+    // A sum above 126 means the complements were stored; revert them.
+    if ((qx + qy) > 126) {
+        qx = static_cast<uint16_t>(127 - qx);
+        qy = static_cast<uint16_t>(127 - qy);
+    }
+
+    Vec3s vec(float(qx) * weight, float(qy) * weight, float(126 - qx - qy) * weight);
+    if (data & MASK_XSIGN) vec[0] = -vec[0];
+    if (data & MASK_YSIGN) vec[1] = -vec[1];
+    if (data & MASK_ZSIGN) vec[2] = -vec[2];
+    return vec;
+}
+
+
+////////////////////////////////////////
+
+
+inline void
+QuantizedUnitVec::flipSignBits(uint16_t& v)
+{
+    v = static_cast<uint16_t>(v ^ ~MASK_SLOTS); // invert only the bits outside MASK_SLOTS
+}
+
+
+} // namespace math
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_MATH_QUANTIZED_UNIT_VEC_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/math/Quat.h b/nuparu/include/openvdb_new/math/Quat.h
new file mode 100644
index 00000000..93248f1d
--- /dev/null
+++ b/nuparu/include/openvdb_new/math/Quat.h
@@ -0,0 +1,658 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_MATH_QUAT_H_HAS_BEEN_INCLUDED
+#define OPENVDB_MATH_QUAT_H_HAS_BEEN_INCLUDED
+
+#include <iostream>
+#include <cmath>
+
+#include "Mat.h"
+#include "Mat3.h"
+#include "Math.h"
+#include "Vec3.h"
+#include <openvdb/Exceptions.h>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace math {
+
+template<typename T> class Quat;
+
+/// Spherical linear interpolation between the two quaternions (plain lerp when sin(angle) <= tolerance)
+template <typename T>
+Quat<T> slerp(const Quat<T> &q1, const Quat<T> &q2, T t, T tolerance=0.00001)
+{
+    T qdot, angle, sineAngle;
+
+    qdot = q1.dot(q2);
+
+    if (fabs(qdot) >= 1.0) {
+        angle     = 0; // not necessary but suppresses compiler warning
+        sineAngle = 0;
+    } else {
+        angle     = acos(qdot);
+        sineAngle = sin(angle);
+    }
+
+    //
+    // Denominator close to 0 corresponds to the case where the
+    // two quaternions are close to the same rotation. In this
+    // case linear interpolation is used but we normalize to
+    // guarantee unit length
+    //
+    if (sineAngle <= tolerance) {
+        T s = 1.0 - t;
+
+        Quat<T> qtemp(s * q1[0] + t * q2[0], s * q1[1] + t * q2[1],
+                      s * q1[2] + t * q2[2], s * q1[3] + t * q2[3]);
+        //
+        // Check the case where two close to antipodal quaternions were
+        // blended resulting in a nearly zero result which can happen,
+        // for example, if t is close to 0.5. In this case it is not safe
+        // to project back onto the sphere.
+        //
+        double lengthSquared = qtemp.dot(qtemp);
+
+        if (lengthSquared <= tolerance * tolerance) {
+            qtemp = (t < 0.5) ? q1 : q2;
+        } else {
+            qtemp *= 1.0 / sqrt(lengthSquared);
+        }
+        return qtemp;
+    } else {
+
+        T sine  = 1.0 / sineAngle;
+        T a = sin((1.0 - t) * angle) * sine;
+        T b = sin(t * angle) * sine;
+        return Quat<T>(a * q1[0] + b * q2[0], a * q1[1] + b * q2[1],
+                       a * q1[2] + b * q2[2], a * q1[3] + b * q2[3]);
+    }
+
+}
+
+template<typename T>
+class Quat
+{
+public:
+    /// Trivial constructor, the quaternion is NOT initialized
+    Quat() {}
+
+    /// Constructor with four arguments, e.g.   Quatf q(1,2,3,4);
+    Quat(T x, T y, T z, T w)
+    {
+        mm[0] = x;
+        mm[1] = y;
+        mm[2] = z;
+        mm[3] = w;
+
+    }
+
+    /// Constructor with array argument, e.g.   float a[4]; Quatf q(a);
+    Quat(T *a)
+    {
+        mm[0] = a[0];
+        mm[1] = a[1];
+        mm[2] = a[2];
+        mm[3] = a[3];
+
+    }
+
+    /// Constructor given rotation as axis and angle, the axis must be
+    /// unit vector
+    Quat(const Vec3<T> &axis, T angle)
+    {
+        // assert( REL_EQ(axis.length(), 1.) );
+
+        T s = T(sin(angle*T(0.5)));
+
+        mm[0] = axis.x() * s;
+        mm[1] = axis.y() * s;
+        mm[2] = axis.z() * s;
+
+        mm[3] = T(cos(angle*T(0.5)));
+
+    }
+
+    /// Constructor given rotation as axis and angle
+    Quat(math::Axis axis, T angle)
+    {
+        T s = T(sin(angle*T(0.5)));
+
+        mm[0] = (axis==math::X_AXIS) * s;
+        mm[1] = (axis==math::Y_AXIS) * s;
+        mm[2] = (axis==math::Z_AXIS) * s;
+
+        mm[3] = T(cos(angle*T(0.5)));
+    }
+
+    /// Constructor given a rotation matrix
+    template<typename T1>
+    Quat(const Mat3<T1> &rot) {
+
+        // verify that the matrix is really a rotation
+        if(!isUnitary(rot)) {  // unitary is reflection or rotation
+             OPENVDB_THROW(ArithmeticError,
+                "A non-rotation matrix can not be used to construct a quaternion");
+        }
+        if (!isApproxEqual(rot.det(), (T1)1)) { // rule out reflection
+             OPENVDB_THROW(ArithmeticError,
+                "A reflection matrix can not be used to construct a quaternion");
+        }
+
+        T trace = (T)rot.trace();
+        if (trace > 0) {
+
+            T q_w = 0.5 * std::sqrt(trace+1);
+            T factor = 0.25 / q_w;
+
+            mm[0] = factor * (rot(1,2) - rot(2,1));
+            mm[1] = factor * (rot(2,0) - rot(0,2));
+            mm[2] = factor * (rot(0,1) - rot(1,0));
+            mm[3] = q_w;
+        }  else if (rot(0,0) > rot(1,1) && rot(0,0) > rot(2,2)) {
+
+            T q_x = 0.5 * sqrt(rot(0,0)- rot(1,1)-rot(2,2)+1);
+            T factor = 0.25 / q_x;
+
+            mm[0] = q_x;
+            mm[1] = factor * (rot(0,1) + rot(1,0));
+            mm[2] = factor * (rot(2,0) + rot(0,2));
+            mm[3] = factor * (rot(1,2) - rot(2,1));
+        } else if (rot(1,1) > rot(2,2)) {
+
+            T q_y = 0.5 * sqrt(rot(1,1)-rot(0,0)-rot(2,2)+1);
+            T factor = 0.25 / q_y;
+
+            mm[0] =  factor * (rot(0,1) + rot(1,0));
+            mm[1] = q_y;
+            mm[2] = factor * (rot(1,2) + rot(2,1));
+            mm[3] = factor * (rot(2,0) - rot(0,2));
+        } else {
+
+            T q_z = 0.5 * sqrt(rot(2,2)-rot(0,0)-rot(1,1)+1);
+            T factor = 0.25 / q_z;
+
+            mm[0] = factor * (rot(2,0) + rot(0,2));
+            mm[1] = factor * (rot(1,2) + rot(2,1));
+            mm[2] = q_z;
+            mm[3] = factor * (rot(0,1) - rot(1,0));
+        }
+    }
+
+    /// Copy constructor
+    Quat(const Quat &q)
+    {
+        mm[0] = q.mm[0];
+        mm[1] = q.mm[1];
+        mm[2] = q.mm[2];
+        mm[3] = q.mm[3];
+
+    }
+
+    /// Reference to the component, e.g.   q.x() = 4.5f;
+    T& x() { return mm[0]; }
+    T& y() { return mm[1]; }
+    T& z() { return mm[2]; }
+    T& w() { return mm[3]; }
+
+    /// Get the component, e.g.   float f = q.w();
+    T x() const { return mm[0]; }
+    T y() const { return mm[1]; }
+    T z() const { return mm[2]; }
+    T w() const { return mm[3]; }
+
+    // Number of elements
+    static unsigned numElements() { return 4; }
+
+    /// Array style reference to the components, e.g.   q[3] = 1.34f;
+    T& operator[](int i) { return mm[i]; }
+
+    /// Array style constant reference to the components, e.g.  float f = q[1];
+    T operator[](int i) const { return mm[i]; }
+
+    /// Cast to T*
+    operator T*() { return mm; }
+    operator const T*() const { return mm; }
+
+    /// Alternative indexed reference to the elements
+    T& operator()(int i) { return mm[i]; }
+
+    /// Alternative indexed constant reference to the elements,
+    T operator()(int i) const { return mm[i]; }
+
+    /// Rotation angle encoded by this quaternion, computed as 2*acos(w),
+    /// or zero when the (x, y, z) part is negligibly small.
+    T angle() const
+    {
+        const T imagSqr = mm[0]*mm[0] + mm[1]*mm[1] + mm[2]*mm[2];
+
+        // Vanishing vector part: report no rotation.
+        // (Negated test so a NaN takes this path too, as with the former else-branch.)
+        if ( !(imagSqr > 1.0e-8) ) {
+            return T(0.0);
+        }
+
+        return T(T(2.0) * acos(mm[3]));
+    }
+
+    /// Rotation axis: the (x, y, z) part scaled by 1/length.
+    /// Falls back to (1,0,0) when that part is negligibly small.
+    Vec3<T> axis() const
+    {
+        const T imagSqr = mm[0]*mm[0] + mm[1]*mm[1] + mm[2]*mm[2];
+
+        // Too short to normalize (or NaN): return the fixed default axis.
+        if ( !(imagSqr > 1.0e-8) ) {
+            return Vec3<T>(1,0,0);
+        }
+
+        const T scale = T(T(1)/sqrt(imagSqr));
+
+        return Vec3<T>( mm[0]*scale, mm[1]*scale, mm[2]*scale );
+    }
+
+
+    /// "this" quaternion gets initialized to [x, y, z, w]
+    Quat& init(T x, T y, T z, T w)
+    {
+        mm[0] = x; mm[1] = y; mm[2] = z; mm[3] = w;
+        return *this;
+    }
+
+    /// "this" quaternion gets initialized to identity, same as setIdentity()
+    Quat& init() { return setIdentity(); }
+
+    /// Set "this" quaternion to rotation specified by axis and angle,
+    /// the axis must be unit vector
+    Quat& setAxisAngle(const Vec3<T>& axis, T angle)
+    {
+
+        T s = T(sin(angle*T(0.5)));
+
+        mm[0] = axis.x() * s;
+        mm[1] = axis.y() * s;
+        mm[2] = axis.z() * s;
+
+        mm[3] = T(cos(angle*T(0.5)));
+
+        return *this;
+    } // axisAngleTest
+
+    /// Set "this" quaternion to zero
+    Quat& setZero()
+    {
+        mm[0] = mm[1] = mm[2] = mm[3] = 0;
+        return *this;
+    }
+
+    /// Set "this" quaternion to identity
+    Quat& setIdentity()
+    {
+        mm[0] = mm[1] = mm[2] = 0;
+        mm[3] = 1;
+        return *this;
+    }
+
+    /// Returns vector of x,y,z rotational components
+    Vec3<T> eulerAngles(RotationOrder rotationOrder) const
+    { return math::eulerAngles(Mat3<T>(*this), rotationOrder); }
+
+    /// Assignment operator
+    Quat& operator=(const Quat &q)
+    {
+        mm[0] = q.mm[0];
+        mm[1] = q.mm[1];
+        mm[2] = q.mm[2];
+        mm[3] = q.mm[3];
+
+        return *this;
+    }
+
+    /// Equality operator, does exact floating point comparisons
+    bool operator==(const Quat &q) const
+    {
+        return (isExactlyEqual(mm[0],q.mm[0]) &&
+                isExactlyEqual(mm[1],q.mm[1]) &&
+                isExactlyEqual(mm[2],q.mm[2]) &&
+                isExactlyEqual(mm[3],q.mm[3]) );
+    }
+
+    /// Test if "this" is equivalent to q with tolerance of eps value
+    bool eq(const Quat &q, T eps=1.0e-7) const
+    {
+        return isApproxEqual(mm[0],q.mm[0],eps) && isApproxEqual(mm[1],q.mm[1],eps) &&
+            isApproxEqual(mm[2],q.mm[2],eps) && isApproxEqual(mm[3],q.mm[3],eps) ;
+    } // trivial
+
+    /// Add quaternion q to "this" quaternion, e.g.   q += q1;
+    Quat& operator+=(const Quat &q)
+    {
+        mm[0] += q.mm[0];
+        mm[1] += q.mm[1];
+        mm[2] += q.mm[2];
+        mm[3] += q.mm[3];
+
+        return *this;
+    }
+
+    /// Subtract quaternion q from "this" quaternion, e.g.   q -= q1;
+    Quat& operator-=(const Quat &q)
+    {
+        mm[0] -= q.mm[0];
+        mm[1] -= q.mm[1];
+        mm[2] -= q.mm[2];
+        mm[3] -= q.mm[3];
+
+        return *this;
+    }
+
+    /// Scale "this" quaternion by scalar, e.g.   q *= scalar;
+    Quat& operator*=(T scalar)
+    {
+        mm[0] *= scalar;
+        mm[1] *= scalar;
+        mm[2] *= scalar;
+        mm[3] *= scalar;
+
+        return *this;
+    }
+
+    /// Return (this+q), e.g.   q = q1 + q2;
+    Quat operator+(const Quat &q) const
+    {
+        return Quat<T>(mm[0]+q.mm[0], mm[1]+q.mm[1], mm[2]+q.mm[2], mm[3]+q.mm[3]);
+    }
+
+    /// Return (this-q), e.g.   q = q1 - q2;
+    Quat operator-(const Quat &q) const
+    {
+        return Quat<T>(mm[0]-q.mm[0], mm[1]-q.mm[1], mm[2]-q.mm[2], mm[3]-q.mm[3]);
+    }
+
+    /// Return (this*q), e.g.   q = q1 * q2;
+    Quat operator*(const Quat &q) const
+    {
+        Quat<T> prod;
+
+        prod.mm[0] = mm[3]*q.mm[0] + mm[0]*q.mm[3] + mm[1]*q.mm[2] - mm[2]*q.mm[1];
+        prod.mm[1] = mm[3]*q.mm[1] + mm[1]*q.mm[3] + mm[2]*q.mm[0] - mm[0]*q.mm[2];
+        prod.mm[2] = mm[3]*q.mm[2] + mm[2]*q.mm[3] + mm[0]*q.mm[1] - mm[1]*q.mm[0];
+        prod.mm[3] = mm[3]*q.mm[3] - mm[0]*q.mm[0] - mm[1]*q.mm[1] - mm[2]*q.mm[2];
+
+        return prod;
+
+    }
+
+    /// Assigns this to (this*q), e.g.   q *= q1;
+    Quat operator*=(const Quat &q)
+    {
+        *this = *this * q;
+        return *this;
+    }
+
+    /// Return (this*scalar), e.g.   q = q1 * scalar;
+    Quat operator*(T scalar) const
+    {
+        return Quat<T>(mm[0]*scalar, mm[1]*scalar, mm[2]*scalar, mm[3]*scalar);
+    }
+
+    /// Return (this/scalar), e.g.   q = q1 / scalar;
+    Quat operator/(T scalar) const
+    {
+        return Quat<T>(mm[0]/scalar, mm[1]/scalar, mm[2]/scalar, mm[3]/scalar);
+    }
+
+    /// Negation operator, e.g.   q = -q;
+    Quat operator-() const
+    { return Quat<T>(-mm[0], -mm[1], -mm[2], -mm[3]); }
+
+    /// this = q1 + q2
+    /// "this", q1 and q2 need not be distinct objects, e.g. q.add(q1,q);
+    Quat& add(const Quat &q1, const Quat &q2)
+    {
+        mm[0] = q1.mm[0] + q2.mm[0];
+        mm[1] = q1.mm[1] + q2.mm[1];
+        mm[2] = q1.mm[2] + q2.mm[2];
+        mm[3] = q1.mm[3] + q2.mm[3];
+
+        return *this;
+    }
+
+    /// this = q1 - q2
+    /// "this", q1 and q2 need not be distinct objects, e.g. q.sub(q1,q);
+    Quat& sub(const Quat &q1, const Quat &q2)
+    {
+        mm[0] = q1.mm[0] - q2.mm[0];
+        mm[1] = q1.mm[1] - q2.mm[1];
+        mm[2] = q1.mm[2] - q2.mm[2];
+        mm[3] = q1.mm[3] - q2.mm[3];
+
+        return *this;
+    }
+
+    /// this = q1 * q2
+    /// q1 and q2 must be distinct objects from "this", e.g.  q.mult(q1,q2);
+    Quat& mult(const Quat &q1, const Quat &q2)
+    {
+        mm[0] = q1.mm[3]*q2.mm[0] + q1.mm[0]*q2.mm[3] +
+                q1.mm[1]*q2.mm[2] - q1.mm[2]*q2.mm[1];
+        mm[1] = q1.mm[3]*q2.mm[1] + q1.mm[1]*q2.mm[3] +
+                q1.mm[2]*q2.mm[0] - q1.mm[0]*q2.mm[2];
+        mm[2] = q1.mm[3]*q2.mm[2] + q1.mm[2]*q2.mm[3] +
+                q1.mm[0]*q2.mm[1] - q1.mm[1]*q2.mm[0];
+        mm[3] = q1.mm[3]*q2.mm[3] - q1.mm[0]*q2.mm[0] -
+                q1.mm[1]*q2.mm[1] - q1.mm[2]*q2.mm[2];
+
+        return *this;
+    }
+
+    /// this =  scalar*q, q need not be distinct object than "this",
+    /// e.g. q.scale(1.5,q1);
+    Quat& scale(T scale, const Quat &q)
+    {
+        mm[0] = scale * q.mm[0];
+        mm[1] = scale * q.mm[1];
+        mm[2] = scale * q.mm[2];
+        mm[3] = scale * q.mm[3];
+
+        return *this;
+    }
+
+    /// Dot product
+    T dot(const Quat &q) const
+    {
+        return (mm[0]*q.mm[0] + mm[1]*q.mm[1] + mm[2]*q.mm[2] + mm[3]*q.mm[3]);
+    }
+
+    /// Return the quaternion rate corresponding to the angular velocity omega
+    /// and "this" current rotation
+    Quat derivative(const Vec3<T>& omega) const
+    {
+        return Quat<T>( +w()*omega.x() -z()*omega.y() +y()*omega.z() ,
+                        +z()*omega.x() +w()*omega.y() -x()*omega.z() ,
+                        -y()*omega.x() +x()*omega.y() +w()*omega.z() ,
+                        -x()*omega.x() -y()*omega.y() -z()*omega.z() );
+    }
+
+    /// Normalize "this" in place; returns false and leaves it unchanged if its length is ~0 (within eps)
+    bool normalize(T eps = T(1.0e-8))
+    {
+        T d = T(sqrt(mm[0]*mm[0] + mm[1]*mm[1] + mm[2]*mm[2] + mm[3]*mm[3]));
+        if( isApproxEqual(d, T(0.0), eps) ) return false;
+        *this *= ( T(1)/d );
+        return true;
+    }
+
+    /// Return a normalized copy of "this" (throws ArithmeticError if the length is exactly zero)
+    Quat unit() const
+    {
+        T d = sqrt(mm[0]*mm[0] + mm[1]*mm[1] + mm[2]*mm[2] + mm[3]*mm[3]);
+        if( isExactlyEqual(d , T(0.0) ) )
+            OPENVDB_THROW(ArithmeticError,
+                "Normalizing degenerate quaternion");
+        return *this / d;
+    }
+
+    /// Return the inverse of "this" (conjugate divided by squared norm); "this" is unchanged.
+    /// Throws ArithmeticError when isApproxEqual(squared norm, 0, tolerance) holds.
+    Quat inverse(T tolerance = T(0)) const
+    {
+        const T d = mm[0]*mm[0] + mm[1]*mm[1] + mm[2]*mm[2] + mm[3]*mm[3];
+        if( isApproxEqual(d, T(0.0), tolerance) )
+            OPENVDB_THROW(ArithmeticError,
+                "Cannot invert degenerate quaternion");
+        // Negate x, y, z and keep w, each divided by the squared norm.
+        return Quat(-mm[0]/d, -mm[1]/d, -mm[2]/d, mm[3]/d);
+    }
+
+
+    /// Return the conjugate of "this", same as invert without
+    /// unit quaternion test
+    Quat conjugate() const
+    {
+        return Quat<T>(-mm[0], -mm[1], -mm[2], mm[3]);
+    }
+
+    /// Return rotated vector by "this" quaternion
+    Vec3<T> rotateVector(const Vec3<T> &v) const
+    {
+        Mat3<T> m(*this);
+        return m.transform(v);
+    }
+
+    /// Predefined constants, e.g.   Quat q = Quat::identity();
+    static Quat zero() { return Quat<T>(0,0,0,0); }
+    static Quat identity() { return Quat<T>(0,0,0,1); }
+
+    /// @return the four components formatted as "[x, y, z, w]"
+    std::string
+    str() const {
+        std::ostringstream out;
+
+        // Opening bracket and first component, then the rest separated by ", ".
+        out << "[" << mm[0];
+
+        for (int idx = 1; idx < 4; ++idx) {
+            out << ", " << mm[idx];
+        }
+
+        out << "]";
+
+        // Hand back the accumulated text.
+        return out.str();
+    }
+
+    /// Output to the stream, e.g.   std::cout << q << std::endl;
+    friend std::ostream& operator<<(std::ostream &stream, const Quat &q)
+    {
+        stream << q.str();
+        return stream;
+    }
+
+    friend Quat slerp<>(const Quat &q1, const Quat &q2, T t, T tolerance);
+
+
+    void write(std::ostream& os) const {
+        os.write(reinterpret_cast<const char*>(mm), sizeof(mm)); // raw bytes of all four components
+    }
+    void read(std::istream& is) {
+        is.read(reinterpret_cast<char*>(mm), sizeof(mm)); // raw bytes into all four components
+    }
+
+protected:
+    T mm[4];
+};
+
+/// Returns V, where \f$V_i = v_i * scalar\f$ for \f$i \in [0, 3]\f$
+template <typename S, typename T>
+Quat<T> operator*(S scalar, const Quat<T> &q) { return q*scalar; }
+
+
+/// @brief Interpolate between m1 and m2.
+/// Converts to quaternion  form and uses slerp
+/// m1 and m2 must be rotation matrices!
+template <typename T, typename T0>
+Mat3<T> slerp(const Mat3<T0> &m1, const Mat3<T0> &m2, T t)
+{
+    typedef Mat3<T> MatType;
+
+    Quat<T> q1(m1);
+    Quat<T> q2(m2);
+
+    if (q1.dot(q2) < 0) q2 *= -1;
+
+    Quat<T> qslerp = slerp<T>(q1, q2, static_cast<T>(t));
+    MatType m = rotation<MatType>(qslerp);
+    return m;
+}
+
+
+
+/// Interpolate between m1 and m4 by converting m1 ... m4  into
+/// quaternions and treating them as control points of a Bezier
+/// curve using slerp in place of lerp in the De Casteljau evaluation
+/// algorithm. Just like a cubic Bezier curve, this will interpolate
+/// m1 at t = 0 and m4 at t = 1 but in general will not pass through
+/// m2 and m3.  Unlike a standard Bezier curve this curve will not have
+/// the convex hull property.
+/// m1 ... m4 must be rotation matrices!
+template <typename T, typename T0>
+Mat3<T> bezLerp(const Mat3<T0> &m1, const Mat3<T0> &m2,
+                const Mat3<T0> &m3, const Mat3<T0> &m4,
+                T t)
+{
+    Mat3<T> m00, m01, m02, m10, m11;
+
+    m00 = slerp(m1, m2, t);
+    m01 = slerp(m2, m3, t);
+    m02 = slerp(m3, m4, t);
+
+    m10 = slerp(m00, m01, t);
+    m11 = slerp(m01, m02, t);
+
+    return slerp(m10, m11, t);
+}
+
+typedef Quat<float> Quats;
+typedef Quat<double> Quatd;
+
+} // namespace math
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif //OPENVDB_MATH_QUAT_H_HAS_BEEN_INCLUDED
+
+// ---------------------------------------------------------------------------
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/math/Ray.h b/nuparu/include/openvdb_new/math/Ray.h
new file mode 100644
index 00000000..00bc0e27
--- /dev/null
+++ b/nuparu/include/openvdb_new/math/Ray.h
@@ -0,0 +1,342 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file Ray.h
+///
+/// @author Ken Museth
+///
+/// @brief A Ray class.
+
+#ifndef OPENVDB_MATH_RAY_HAS_BEEN_INCLUDED
+#define OPENVDB_MATH_RAY_HAS_BEEN_INCLUDED
+
+#include "Math.h"
+#include "Vec3.h"
+#include "Transform.h"
+#include <iostream> // for std::ostream
+#include <boost/type_traits/is_floating_point.hpp>
+#include <limits>// for std::numeric_limits<Type>::max()
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace math {
+
+template<typename RealT = double>
+class Ray
+{
+public:
+    BOOST_STATIC_ASSERT(boost::is_floating_point<RealT>::value);
+    typedef RealT       RealType;
+    typedef Vec3<RealT> Vec3Type;
+    typedef Vec3Type    Vec3T;
+    struct TimeSpan {
+        RealT t0, t1;
+        /// @brief Default constructor
+        TimeSpan() {}
+        /// @brief Constructor
+        TimeSpan(RealT _t0, RealT _t1) : t0(_t0), t1(_t1) {}
+        /// @brief Set both times
+        inline void set(RealT _t0, RealT _t1) { t0=_t0; t1=_t1; }
+        /// @brief Get both times
+        inline void get(RealT& _t0, RealT& _t1) const { _t0=t0; _t1=t1; }
+        /// @brief Return @c true if t1 is larger than t0 by at least eps.
+        inline bool valid(RealT eps=math::Delta<RealT>::value()) const { return (t1-t0)>eps; }
+        /// @brief Return the midpoint of the ray.
+        inline RealT mid() const { return 0.5*(t0 + t1); }
+        /// @brief Multiplies both times
+        inline void scale(RealT s) {assert(s>0); t0*=s; t1*=s; }
+        /// @brief Return @c true if time is inclusive
+        inline bool test(RealT t) const { return (t>=t0 && t<=t1); }
+    };
+
+    Ray(const Vec3Type& eye = Vec3Type(0,0,0),
+        const Vec3Type& direction = Vec3Type(1,0,0),
+        RealT t0 = math::Delta<RealT>::value(),
+        RealT t1 = std::numeric_limits<RealT>::max())
+        : mEye(eye), mDir(direction), mInvDir(1/mDir), mTimeSpan(t0, t1)
+    {
+    }
+
+    inline void setEye(const Vec3Type& eye) { mEye = eye; }
+
+    inline void setDir(const Vec3Type& dir)
+      {
+          mDir = dir;
+          mInvDir = 1/mDir;
+      }
+
+    inline void setMinTime(RealT t0) { assert(t0>0); mTimeSpan.t0 = t0; }
+
+    inline void setMaxTime(RealT t1) { assert(t1>0); mTimeSpan.t1 = t1; }
+
+    inline void setTimes(RealT t0 = math::Delta<RealT>::value(),
+                         RealT t1 = std::numeric_limits<RealT>::max())
+    {
+        assert(t0>0 && t1>0);
+        mTimeSpan.set(t0, t1);
+    }
+
+    inline void scaleTimes(RealT scale) { mTimeSpan.scale(scale); }
+
+    inline void reset(const Vec3Type& eye,
+                      const Vec3Type& direction,
+                      RealT t0 = math::Delta<RealT>::value(),
+                      RealT t1 = std::numeric_limits<RealT>::max())
+    {
+        this->setEye(eye);
+        this->setDir(direction);
+        this->setTimes(t0, t1);
+    }
+
+    inline const Vec3T& eye() const {return mEye;}
+
+    inline const Vec3T& dir() const {return mDir;}
+
+    inline const Vec3T& invDir() const {return mInvDir;}
+
+    inline RealT t0() const {return mTimeSpan.t0;}
+
+    inline RealT t1() const {return mTimeSpan.t1;}
+
+    /// @brief Return the position along the ray at the specified time.
+    inline Vec3R operator()(RealT time) const { return mEye + mDir * time; }
+
+    /// @brief Return the starting point of the ray.
+    inline Vec3R start() const { return (*this)(mTimeSpan.t0); }
+
+    /// @brief Return the endpoint of the ray.
+    inline Vec3R end() const { return (*this)(mTimeSpan.t1); }
+
+    /// @brief Return the midpoint of the ray.
+    inline Vec3R mid() const { return (*this)(mTimeSpan.mid()); }
+
+    /// @brief Return @c true if t0 is strictly less than t1.
+    OPENVDB_DEPRECATED inline bool test() const { return mTimeSpan.valid(RealT(0)); }
+
+    /// @brief Return @c true if t1 is larger than t0 by at least eps.
+    inline bool valid(RealT eps=math::Delta<float>::value()) const
+      {
+          return mTimeSpan.valid(eps);
+      }
+
+    /// @brief Return @c true if @a time is within t0 and t1, both inclusive.
+    inline bool test(RealT time) const { return mTimeSpan.test(time); }
+
+    /// @brief Return a new Ray that is transformed with the specified map.
+    /// @param map  the map from which to construct the new Ray.
+    /// @warning Assumes a linear map and a normalized direction.
+    /// @details The requirement that the direction is normalized
+    /// follows from the transformation of t0 and t1 - and the fact that
+    /// we want applyMap and applyInverseMap to be inverse operations.
+    template<typename MapType>
+    inline Ray applyMap(const MapType& map) const
+    {
+        assert(map.isLinear());
+        assert(math::isRelOrApproxEqual(mDir.length(), RealT(1), Tolerance<RealT>::value(), Delta<RealT>::value()));
+        const Vec3T eye = map.applyMap(mEye);
+        const Vec3T dir = map.applyJacobian(mDir);
+        const RealT length = dir.length();
+        return Ray(eye, dir/length, length*mTimeSpan.t0, length*mTimeSpan.t1);
+    }
+
+    /// @brief Return a new Ray that is transformed with the inverse of the specified map.
+    /// @param map  the map from which to construct the new Ray by inverse mapping.
+    /// @warning Assumes a linear map and a normalized direction.
+    /// @details The requirement that the direction is normalized
+    /// follows from the transformation of t0 and t1 - and the fact that
+    /// we want applyMap and applyInverseMap to be inverse operations.
+    template<typename MapType>
+    inline Ray applyInverseMap(const MapType& map) const
+    {
+        assert(map.isLinear());
+        assert(math::isRelOrApproxEqual(mDir.length(), RealT(1), Tolerance<RealT>::value(), Delta<RealT>::value()));
+        const Vec3T eye = map.applyInverseMap(mEye);
+        const Vec3T dir = map.applyInverseJacobian(mDir);
+        const RealT length = dir.length();
+        return Ray(eye, dir/length, length*mTimeSpan.t0, length*mTimeSpan.t1);
+    }
+
+    /// @brief Return a new ray in world space, assuming the existing
+    /// ray is represented in the index space of the specified grid.
+    template<typename GridType>
+    inline Ray indexToWorld(const GridType& grid) const
+    {
+        return this->applyMap(*(grid.transform().baseMap()));
+    }
+
+    /// @brief Return a new ray in the index space of the specified
+    /// grid, assuming the existing ray is represented in world space.
+    template<typename GridType>
+    inline Ray worldToIndex(const GridType& grid) const
+    {
+        return this->applyInverseMap(*(grid.transform().baseMap()));
+    }
+
+    /// @brief Return true if this ray intersects the specified sphere.
+    /// @param center The center of the sphere in the same space as this ray.
+    /// @param radius The radius of the sphere in the same units as this ray.
+    /// @param t0     The first intersection time if an intersection exists.
+    /// @param t1     The second intersection time if an intersection exists.
+    /// @note If the return value is true, i.e. a hit, and t0 ==
+    /// this->t0() or t1 == this->t1() only one true intersection exists.
+    inline bool intersects(const Vec3T& center, RealT radius, RealT& t0, RealT& t1) const
+    {
+        const Vec3T origin = mEye - center;
+        const RealT A = mDir.lengthSqr();
+        const RealT B = 2 * mDir.dot(origin);
+        const RealT C = origin.lengthSqr() - radius * radius;
+        const RealT D = B * B - 4 * A * C;
+
+        if (D < 0) return false;
+        // Stable root: the radical takes the sign of B, so B and Sqrt(D) never cancel.
+        const RealT Q = RealT(-0.5)*(B<0 ? (B - Sqrt(D)) : (B + Sqrt(D)));
+        // Roots are Q/A and C/Q; Q is zero only for a double root at t = 0 (avoid 0/0).
+        t0 = Q / A;
+        t1 = (Q != RealT(0)) ? C / Q : t0;
+
+        if (t0 > t1) std::swap(t0, t1);
+        if (t0 < mTimeSpan.t0) t0 = mTimeSpan.t0;
+        if (t1 > mTimeSpan.t1) t1 = mTimeSpan.t1;
+        return t0 <= t1;
+    }
+
+    /// @brief Test for a hit against a sphere without reporting the hit times.
+    /// @param center The center of the sphere in the same space as this ray.
+    /// @param radius The radius of the sphere in the same units as this ray.
+    inline bool intersects(const Vec3T& center, RealT radius) const
+    {
+        RealT tNear, tFar; // outputs required by the four-argument overload; discarded here
+        return this->intersects(center, radius, tNear, tFar);
+    }
+
+    /// @brief Return true if this ray intersects the specified sphere.
+    /// @note For intersection this ray is clipped to the two intersection points.
+    /// @param center The center of the sphere in the same space as this ray.
+    /// @param radius The radius of the sphere in the same units as this ray.
+    inline bool clip(const Vec3T& center, RealT radius)
+    {
+        RealT t0, t1;
+        const bool hit = this->intersects(center, radius, t0, t1);
+        if (hit) mTimeSpan.set(t0, t1);
+        return hit;
+    }
+
+    /// @brief Return true if the Ray intersects the specified
+    /// axis-aligned bounding box.
+    /// @param bbox Axis-aligned bounding box in the same space as the Ray.
+    /// @param t0   If an intersection is detected this is assigned
+    ///             the time for the first intersection point.
+    /// @param t1   If an intersection is detected this is assigned
+    ///             the time for the second intersection point.
+    template<typename BBoxT>
+    inline bool intersects(const BBoxT& bbox, RealT& t0, RealT& t1) const
+    {
+        mTimeSpan.get(t0, t1);
+        for (int i = 0; i < 3; ++i) {
+            RealT a = (bbox.min()[i] - mEye[i]) * mInvDir[i];
+            RealT b = (bbox.max()[i] - mEye[i]) * mInvDir[i];
+            if (a > b) std::swap(a, b);
+            if (a > t0) t0 = a;
+            if (b < t1) t1 = b;
+            if (t0 > t1) return false;
+        }
+        return true;
+    }
+
+    /// @brief Return true if this ray intersects the specified bounding box.
+    /// @param bbox Axis-aligned bounding box in the same space as this ray.
+    template<typename BBoxT>
+    inline bool intersects(const BBoxT& bbox) const
+    {
+        RealT t0, t1;
+        return this->intersects(bbox, t0, t1);
+    }
+
+    /// @brief Return true if this ray intersects the specified bounding box.
+    /// @note For intersection this ray is clipped to the two intersection points.
+    /// @param bbox Axis-aligned bounding box in the same space as this ray.
+    template<typename BBoxT>
+    inline bool clip(const BBoxT& bbox)
+    {
+        RealT t0, t1;
+        const bool hit = this->intersects(bbox, t0, t1);
+        if (hit) mTimeSpan.set(t0, t1);
+        return hit;
+    }
+
+    /// @brief Return true if the Ray intersects the plane specified
+    /// by a normal and distance from the origin.
+    /// @param normal   Normal of the plane.
+    /// @param distance Distance of the plane to the origin.
+    /// @param t        Time of intersection, if one exists.
+    inline bool intersects(const Vec3T& normal, RealT distance, RealT& t) const
+      {
+          const RealT cosAngle = mDir.dot(normal);
+          if (math::isApproxZero(cosAngle)) return false;//parallel
+          t = (distance - mEye.dot(normal))/cosAngle;
+          return this->test(t);
+      }
+
+    /// @brief Return true if the Ray intersects the plane specified
+    /// by a normal and point.
+    /// @param normal   Normal of the plane.
+    /// @param point    Point in the plane.
+    /// @param t        Time of intersection, if one exists.
+    inline bool intersects(const Vec3T& normal, const Vec3T& point, RealT& t) const
+      {
+          return this->intersects(normal, point.dot(normal), t);
+      }
+
+private:
+    Vec3T mEye, mDir, mInvDir;
+    TimeSpan mTimeSpan;
+}; // end of Ray class
+
+/// @brief Output streaming of the Ray class.
+/// @note Primarily intended for debugging.
+template<typename RealT>
+inline std::ostream& operator<<(std::ostream& os, const Ray<RealT>& r)
+{
+    os << "eye=" << r.eye() << " dir=" << r.dir() << " 1/dir="<<r.invDir()
+       << " t0=" << r.t0()  << " t1="  << r.t1();
+    return os;
+}
+
+
+} // namespace math
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_MATH_RAY_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/math/Stats.h b/nuparu/include/openvdb_new/math/Stats.h
new file mode 100644
index 00000000..004fa5df
--- /dev/null
+++ b/nuparu/include/openvdb_new/math/Stats.h
@@ -0,0 +1,399 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file Stats.h
+///
+/// @author Ken Museth
+///
+/// @brief Classes to compute statistics and histograms
+
+#ifndef OPENVDB_MATH_STATS_HAS_BEEN_INCLUDED
+#define OPENVDB_MATH_STATS_HAS_BEEN_INCLUDED
+
+#include <iosfwd> // for ostringstream
+#include <openvdb/version.h>
+#include <iostream>
+#include <iomanip>
+#include <sstream>
+#include <vector>
+#include <functional>// for std::less
+#include "Math.h"
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace math {
+
+/// @brief Templated class to compute the minimum and maximum values.
+template <typename ValueType, typename Less = std::less<ValueType> >
+class MinMax
+{
+public:
+    /// @brief Start from the given bounds (their mutual order is not checked).
+    MinMax(const ValueType &min, const ValueType &max) : mMin(min), mMax(max) {}
+
+    /// @brief Fold a single sample into the bounds, comparing with @a less.
+    inline void add(const ValueType &val, const Less &less = Less())
+    {
+        const bool belowMin = less(val, mMin);
+        if (belowMin) mMin = val;
+        const bool aboveMax = less(mMax, val);// evaluated after mMin may have changed
+        if (aboveMax) mMax = val;
+    }
+
+    /// Return the current lower bound.
+    inline const ValueType& min() const { return mMin; }
+
+    /// Return the current upper bound.
+    inline const ValueType& max() const { return mMax; }
+
+    /// @brief Merge the bounds of another MinMax instance into this one.
+    /// @note Minimum is compared against minimum and maximum against maximum only.
+    inline void add(const MinMax& other, const Less &less = Less())
+    {
+        const ValueType &otherMin = other.mMin, &otherMax = other.mMax;
+        if (less(otherMin, mMin)) mMin = otherMin;
+        if (less(mMax, otherMax)) mMax = otherMax;
+    }
+
+    /// @brief Write the bounds, optionally labelled with @a name, as one line to @a strm.
+    void print(const std::string &name= "", std::ostream &strm=std::cout, int precision=3) const
+    {
+        // Format into a scratch stream so the caller's precision and flags are never modified.
+        std::ostringstream buffer;
+        buffer.precision(precision);
+        buffer.setf(std::ios::fixed);
+        buffer << "MinMax ";
+        if (!name.empty()) buffer << "for \"" << name << "\" ";
+        buffer << "  Min="  << mMin << ", Max="  << mMax << std::endl;
+        strm << buffer.str();
+    }
+
+protected:
+    ValueType mMin, mMax;
+};//end MinMax
+    
+/// @brief This class computes the minimum and maximum values of a population
+/// of floating-point values.
+class Extrema
+{
+public:
+    /// @brief Construct an empty population.
+    /// @warning Until a sample is added, min() is the largest finite double
+    /// and max() is its negation.
+    Extrema()
+        : mSize(0)
+        , mMin(std::numeric_limits<double>::max())
+        , mMax(-std::numeric_limits<double>::max())
+    {}
+
+    /// Add a single sample.
+    void add(double val)
+    {
+        mMin = std::min<double>(val, mMin);
+        mMax = std::max<double>(val, mMax);
+        ++mSize;
+    }
+
+    /// Add @a n samples that all share the value @a val.
+    void add(double val, uint64_t n)
+    {
+        mMin  = std::min<double>(val, mMin);
+        mMax  = std::max<double>(val, mMax);
+        mSize += n;
+    }
+
+    /// Return the total number of samples added so far.
+    inline uint64_t size() const { return mSize; }
+
+    /// Return the smallest sample value.
+    inline double min() const { return mMin; }
+
+    /// Return the largest sample value.
+    inline double max() const { return mMax; }
+
+    /// Return the spread of the population, i.e., max() minus min().
+    inline double range() const { return mMax - mMin; }
+
+    /// Merge in the samples of another Extrema; an empty @a other is ignored.
+    void add(const Extrema& other)
+    {
+        if (other.mSize == 0) return;
+        this->join(other);
+    }
+
+    /// @brief Print the sample count, minimum, maximum and range to @a strm.
+    void print(const std::string &name= "", std::ostream &strm=std::cout, int precision=3) const
+    {
+        // Compose the text in a private stream; strm's formatting state is untouched.
+        std::ostringstream text;
+        text.precision(precision);
+        text.setf(std::ios::fixed);
+        text << "Extrema ";
+        if (!name.empty()) text << "for \"" << name << "\" ";
+        if (mSize == 0) {
+            text << ": no samples were added." << std::endl;
+        } else {
+            text << "with "   << mSize << " samples:\n"
+                 << "  Min="  << mMin
+                 << ", Max="  << mMax
+                 << ", Range="<< this->range() << std::endl;
+        }
+        strm << text.str();
+    }
+
+protected:
+    /// Merge @a other unconditionally; it must be non-empty (asserted).
+    inline void join(const Extrema& other)
+    {
+        assert(other.mSize > 0);
+        mMin   = std::min<double>(mMin, other.mMin);
+        mMax   = std::max<double>(mMax, other.mMax);
+        mSize += other.mSize;
+    }
+
+    uint64_t mSize;
+    double mMin, mMax;
+};//end Extrema
+
+
+/// @brief This class computes statistics (minimum value, maximum
+/// value, mean, variance and standard deviation) of a population
+/// of floating-point values.
+///
+/// @details variance = Mean[ (X-Mean[X])^2 ] = Mean[X^2] - Mean[X]^2,
+///          standard deviation = sqrt(variance)
+///
+/// @note This class employs incremental computation and double precision.
+class Stats : public Extrema
+{
+public:
+    /// Construct an empty population (no samples; mean and variance are zero).
+    Stats(): Extrema(), mAvg(0.0), mAux(0.0) {}
+
+    /// Add a single sample, updating the running mean and squared deviations.
+    void add(double val)
+    {
+        Extrema::add(val);// increments mSize first, so the division below is safe
+        const double delta = val - mAvg;
+        mAvg += delta/double(mSize);
+        mAux += delta*(val - mAvg);
+    }
+
+    /// @brief Add @a n samples with constant value @a val.
+    /// @note For @a n == 0 the mean and variance are left untouched; on an empty
+    /// population the update would otherwise evaluate 1/0 and yield NaN.
+    void add(double val, uint64_t n)
+    {
+        if (n > 0) {
+            const double denom = 1.0/double(mSize + n);
+            const double delta = val - mAvg;
+            mAvg += denom * delta * double(n);
+            mAux += denom * delta * delta * double(mSize) * double(n);
+        }
+        Extrema::add(val, n);// extrema bookkeeping is forwarded unchanged
+    }
+
+    /// Add the samples from the other Stats instance (an empty one is ignored).
+    void add(const Stats& other)
+    {
+        if (other.mSize > 0) {
+            const double denom = 1.0/double(mSize + other.mSize);
+            const double delta = other.mAvg - mAvg;
+            mAvg += denom * delta * double(other.mSize);
+            mAux += other.mAux + denom * delta * delta * double(mSize) * double(other.mSize);
+            Extrema::join(other);
+        }
+    }
+
+    //@{
+    /// Return the arithmetic mean, i.e. average, value.
+    inline double avg()  const { return mAvg; }
+    inline double mean() const { return mAvg; }
+    //@}
+
+    //@{
+    /// @brief Return the population variance (zero for fewer than two samples).
+    /// @note The unbiased sample variance = population variance *
+    /// num/(num-1)
+    inline double var()      const { return mSize<2 ? 0.0 : mAux/double(mSize); }
+    inline double variance() const { return this->var(); }
+    //@}
+
+    //@{
+    /// @brief Return the standard deviation (=Sqrt(variance)) as
+    /// defined from the (biased) population variance.
+    inline double std()    const { return sqrt(this->var()); }
+    inline double stdDev() const { return this->std(); }
+    //@}
+
+    /// @brief Print statistics to the specified output stream.
+    void print(const std::string &name= "", std::ostream &strm=std::cout, int precision=3) const
+    {
+        // Write to a temporary string stream so as not to affect the state
+        // (precision, field width, etc.) of the output stream.
+        std::ostringstream os;
+        os << std::setprecision(precision) << std::setiosflags(std::ios::fixed);
+        os << "Statistics ";
+        if (!name.empty()) os << "for \"" << name << "\" ";
+        if (mSize>0) {
+            os << "with " << mSize << " samples:\n"
+               << "  Min=" << mMin
+               << ", Max=" << mMax
+               << ", Ave=" << mAvg
+               << ", Std=" << this->stdDev()
+               << ", Var=" << this->variance() << std::endl;
+        } else {
+            os << ": no samples were added." << std::endl;
+        }
+        strm << os.str();
+    }
+
+protected:
+    using Extrema::mSize;
+    using Extrema::mMin;
+    using Extrema::mMax;
+    double mAvg, mAux;// running mean and accumulated sum of squared deviations
+}; // end Stats
+
+
+////////////////////////////////////////
+
+
+/// @brief This class computes a histogram, with a fixed interval width,
+/// of a population of floating-point values.
+class Histogram
+{
+public:
+    /// @brief Construct with given minimum and maximum values and the given bin count.
+    /// @note The bin width is derived from the padded range [min, max + 1e-10].
+    Histogram(double min, double max, size_t numBins = 10)
+        : mSize(0), mMin(min), mMax(max + 1e-10),
+          mDelta(double(numBins)/(mMax-mMin)), mBins(numBins, 0)
+    {
+        if ( mMax <= mMin ) {
+            OPENVDB_THROW(ValueError, "Histogram: expected min < max");
+        } else if ( numBins == 0 ) {
+            OPENVDB_THROW(ValueError, "Histogram: expected at least one bin");
+        }
+    }
+
+    /// @brief Construct with the given bin count and with minimum and maximum values
+    /// taken from a Stats object.
+    Histogram(const Stats& s, size_t numBins = 10):
+        mSize(0), mMin(s.min()), mMax(s.max()+1e-10),
+        mDelta(double(numBins)/(mMax-mMin)), mBins(numBins, 0)
+    {
+        if ( mMax <= mMin ) {
+            OPENVDB_THROW(ValueError, "Histogram: expected min < max");
+        } else if ( numBins == 0 ) {
+            OPENVDB_THROW(ValueError, "Histogram: expected at least one bin");
+        }
+    }
+
+    /// @brief Add @a n samples with constant value @a val, provided that the
+    /// @a val falls within this histogram's value range.
+    /// @return @c true if the sample value falls within this histogram's value range.
+    inline bool add(double val, uint64_t n = 1)
+    {
+        if (!(val>=mMin && val<=mMax)) return false;// also rejects NaN
+        // A value at the upper bound can map to index numBins; clamp it into the last bin.
+        const size_t bin = size_t(mDelta*(val-mMin)), last = mBins.size()-1;
+        mBins[bin < last ? bin : last] += n;
+        mSize += n;
+        return true;
+    }
+
+    /// @brief Add all the contributions from the other histogram, provided that
+    /// it has the same configuration as this histogram.
+    bool add(const Histogram& other)
+    {
+        if (!isApproxEqual(mMin, other.mMin) || !isApproxEqual(mMax, other.mMax) ||
+            mBins.size() != other.mBins.size()) return false;
+        for (size_t i=0, e=mBins.size(); i!=e; ++i) mBins[i] += other.mBins[i];
+        mSize += other.mSize;
+        return true;
+    }
+
+    /// Return the number of bins in this histogram.
+    inline size_t numBins() const { return mBins.size(); }
+    /// Return the lower bound of this histogram's value range.
+    inline double min() const { return mMin; }
+    /// Return the upper bound of this histogram's value range.
+    inline double max() const { return mMax; }
+    /// Return the minimum value in the <i>n</i>th bin.
+    inline double min(int n) const { return mMin+n/mDelta; }
+    /// Return the maximum value in the <i>n</i>th bin.
+    inline double max(int n) const { return mMin+(n+1)/mDelta; }
+    /// Return the number of samples in the <i>n</i>th bin.
+    inline uint64_t count(int n) const { return mBins[n]; }
+    /// Return the population size, i.e., the total number of samples.
+    inline uint64_t size() const { return mSize; }
+
+    /// Print the histogram to the specified output stream.
+    void print(const std::string& name = "", std::ostream& strm = std::cout) const
+    {
+        // Write to a temporary string stream so as not to affect the state of the output stream.
+        std::ostringstream os;
+        os << std::setprecision(6) << std::setiosflags(std::ios::fixed) << std::endl;
+        os << "Histogram ";
+        if (!name.empty()) os << "for \"" << name << "\" ";
+        if (mSize > 0) {
+            os << "with " << mSize << " samples:\n";
+            os << "==============================================================\n";
+            os << "||  #   |       Min      |       Max      | Frequency |  %  ||\n";
+            os << "==============================================================\n";
+            for (int i = 0, e = int(mBins.size()); i != e; ++i) {
+                os << "|| " << std::setw(4) << i << " | " << std::setw(14) << this->min(i) << " | "
+                   << std::setw(14) << this->max(i) << " | " << std::setw(9) << mBins[i] << " | "
+                   << std::setw(3) << (100*mBins[i]/mSize) << " ||\n";
+            }
+            os << "==============================================================\n";
+        } else {
+            os << ": no samples were added." << std::endl;
+        }
+        strm << os.str();
+    }
+
+private:
+    uint64_t mSize;
+    double mMin, mMax, mDelta;// mDelta is the number of bins per unit of value
+    std::vector<uint64_t> mBins;
+};
+
+} // namespace math
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_MATH_STATS_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/math/Stencils.h b/nuparu/include/openvdb_new/math/Stencils.h
new file mode 100644
index 00000000..dcdc7508
--- /dev/null
+++ b/nuparu/include/openvdb_new/math/Stencils.h
@@ -0,0 +1,1680 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @author Ken Museth
+///
+/// @file Stencils.h
+///
+/// @brief Defines various finite difference stencils by means of the
+///        "curiously recurring template pattern" on a BaseStencil
+///        that caches stencil values and stores a ValueAccessor for
+///        fast lookup.
+
+#ifndef OPENVDB_MATH_STENCILS_HAS_BEEN_INCLUDED
+#define OPENVDB_MATH_STENCILS_HAS_BEEN_INCLUDED
+
+#include <algorithm>
+#include <vector>
+#include <openvdb/math/Math.h>             // for Pow2, needed by WENO and Godunov
+#include <openvdb/Types.h>                 // for Real
+#include <openvdb/math/Coord.h>            // for Coord
+#include <openvdb/math/FiniteDifference.h> // for WENO5 and GodunovsNormSqrd
+#include <openvdb/tree/ValueAccessor.h>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace math {
+
+
+////////////////////////////////////////
+    
+/// @brief CRTP base class for finite-difference stencils: it owns the buffer of
+/// cached values, the grid pointer and the value accessor, while DerivedType
+/// supplies init() and pos<i,j,k>() to define the stencil layout.
+template<typename DerivedType, typename GridT, bool IsSafe>
+class BaseStencil
+{
+public:
+    typedef GridT                                       GridType;
+    typedef typename GridT::TreeType                    TreeType;
+    typedef typename GridT::ValueType                   ValueType;
+    typedef tree::ValueAccessor<const TreeType, IsSafe> AccessorType;
+    typedef std::vector<ValueType>                      BufferType;
+    typedef typename BufferType::iterator               IterType;
+
+    /// @brief Initialize the stencil buffer with the values of voxel (i, j, k)
+    /// and its neighbors.
+    /// @param ijk Index coordinates of stencil center
+    inline void moveTo(const Coord& ijk)
+    {
+        mCenter = ijk;
+        mStencil[0] = mCache.getValue(ijk);
+        static_cast<DerivedType&>(*this).init(mCenter);
+    }
+
+    /// @brief Initialize the stencil buffer with the values of voxel (i, j, k)
+    /// and its neighbors. The method also takes a value of the center
+    /// element of the stencil, assuming it is already known.
+    /// @param ijk Index coordinates of stencil center
+    /// @param centerValue Value of the center element of the stencil
+    inline void moveTo(const Coord& ijk, const ValueType& centerValue)
+    {
+        mCenter = ijk;
+        mStencil[0] = centerValue;
+        static_cast<DerivedType&>(*this).init(mCenter);
+    }
+
+    /// @brief Initialize the stencil buffer with the values of voxel
+    /// (x, y, z) and its neighbors.
+    ///
+    /// @note This version is slightly faster than the one above, since
+    /// the center voxel's value is read directly from the iterator.
+    template<typename IterType>
+    inline void moveTo(const IterType& iter)
+    {
+        mCenter = iter.getCoord();
+        mStencil[0] = *iter;
+        static_cast<DerivedType&>(*this).init(mCenter);
+    }
+
+    /// @brief Initialize the stencil buffer with the values of voxel (x, y, z)
+    /// and its neighbors.
+    /// @param xyz Floating point voxel coordinates of stencil center
+    /// @details This method will check to see if it is necessary to
+    /// update the stencil based on the cached index coordinates of
+    /// the center point.
+    inline void moveTo(const Vec3R& xyz)
+    {
+        Coord ijk = openvdb::Coord::floor(xyz);
+        if (ijk != mCenter) this->moveTo(ijk);
+    }
+
+    /// @brief Return the value from the stencil buffer with linear
+    /// offset pos.
+    ///
+    /// @note The default (@a pos = 0) corresponds to the first element
+    /// which is typically the center point of the stencil.
+    inline const ValueType& getValue(unsigned int pos = 0) const
+    {
+        assert(pos < mStencil.size());
+        return mStencil[pos];
+    }
+
+    /// @brief Return the value at the specified location relative to the center of the stencil
+    template<int i, int j, int k>
+    inline const ValueType& getValue() const
+    {
+        return mStencil[static_cast<const DerivedType&>(*this).template pos<i,j,k>()];
+    }
+
+    /// @brief Set the value at the specified location relative to the center of the stencil
+    template<int i, int j, int k>
+    inline void setValue(const ValueType& value)
+    {
+        mStencil[static_cast<const DerivedType&>(*this).template pos<i,j,k>()] = value;
+    }
+
+    /// @brief Return the size of the stencil buffer (callable on const stencils).
+    inline int size() const { return int(mStencil.size()); }
+
+    /// @brief Return the median value of the current stencil.
+    inline ValueType median() const
+    {
+        BufferType tmp(mStencil);//local copy
+        assert(!tmp.empty());
+        size_t midpoint = (tmp.size() - 1) >> 1;
+        // Partially sort the vector until the median value is at the midpoint.
+        std::nth_element(tmp.begin(), tmp.begin() + midpoint, tmp.end());
+        return tmp[midpoint];
+    }
+
+    /// @brief Return the mean value of the current stencil.
+    inline ValueType mean() const
+    {
+        ValueType sum = 0.0;
+        for (int n = 0, s = int(mStencil.size()); n < s; ++n) sum += mStencil[n];
+        return sum / mStencil.size();
+    }
+
+    /// @brief Return the smallest value in the stencil buffer.
+    inline ValueType min() const
+    {
+        // mStencil is const here, so the result is a const_iterator, not an IterType.
+        return *std::min_element(mStencil.begin(), mStencil.end());
+    }
+
+    /// @brief Return the largest value in the stencil buffer.
+    inline ValueType max() const
+    {
+        return *std::max_element(mStencil.begin(), mStencil.end());// const_iterator, as in min()
+    }
+
+    /// @brief Return the coordinates of the center point of the stencil.
+    inline const Coord& getCenterCoord() const { return mCenter; }
+
+    /// @brief Return the value at the center of the stencil
+    inline const ValueType& getCenterValue() const { return mStencil[0]; }
+
+    /// @brief Return true if the center of the stencil intersects the
+    /// iso-contour specified by the isoValue
+    inline bool intersects(const ValueType &isoValue = zeroVal<ValueType>()) const
+    {
+        const bool less = this->getValue< 0, 0, 0>() < isoValue;
+        return (less  ^  (this->getValue<-1, 0, 0>() < isoValue)) ||
+               (less  ^  (this->getValue< 1, 0, 0>() < isoValue)) ||
+               (less  ^  (this->getValue< 0,-1, 0>() < isoValue)) ||
+               (less  ^  (this->getValue< 0, 1, 0>() < isoValue)) ||
+               (less  ^  (this->getValue< 0, 0,-1>() < isoValue)) ||
+               (less  ^  (this->getValue< 0, 0, 1>() < isoValue))  ;
+    }
+
+    /// @brief Return a const reference to the grid from which this
+    /// stencil was constructed.
+    inline const GridType& grid() const { return *mGrid; }
+
+    /// @brief Return a const reference to the ValueAccessor
+    /// associated with this Stencil.
+    inline const AccessorType& accessor() const { return mCache; }
+
+protected:
+    // Constructor is protected to prevent direct instantiation.
+    BaseStencil(const GridType& grid, int size)
+        : mGrid(&grid)
+        , mCache(grid.tree())
+        , mStencil(size)
+        , mCenter(Coord::max())
+    {}
+
+    const GridType* mGrid;
+    AccessorType    mCache;
+    BufferType      mStencil;
+    Coord           mCenter;
+}; // BaseStencil class
+
+
+////////////////////////////////////////
+
+
+namespace { // anonymous namespace for stencil-layout map
+
+    // the seven point stencil: maps an offset (i,j,k) from the center to a buffer index
+    template<int i, int j, int k> struct SevenPt {};// no idx, so unlisted offsets cannot be looked up
+    template<> struct SevenPt< 0, 0, 0> { enum { idx = 0 }; };// center
+    template<> struct SevenPt< 1, 0, 0> { enum { idx = 1 }; };// +i
+    template<> struct SevenPt< 0, 1, 0> { enum { idx = 2 }; };// +j
+    template<> struct SevenPt< 0, 0, 1> { enum { idx = 3 }; };// +k
+    template<> struct SevenPt<-1, 0, 0> { enum { idx = 4 }; };// -i
+    template<> struct SevenPt< 0,-1, 0> { enum { idx = 5 }; };// -j
+    template<> struct SevenPt< 0, 0,-1> { enum { idx = 6 }; };// -k
+
+}
+
+
+/// @brief Stencil holding a center voxel and its six face neighbors.
+template<typename GridT, bool IsSafe = true>
+class SevenPointStencil: public BaseStencil<SevenPointStencil<GridT, IsSafe>, GridT, IsSafe>
+{
+    typedef SevenPointStencil<GridT, IsSafe>  SelfT;
+    typedef BaseStencil<SelfT, GridT, IsSafe> BaseType;
+public:
+    typedef GridT                             GridType;
+    typedef typename GridT::TreeType          TreeType;
+    typedef typename GridT::ValueType         ValueType;
+
+    static const int SIZE = 7;
+
+    SevenPointStencil(const GridT& grid): BaseType(grid, SIZE) {}
+
+    /// Translate the compile-time offset (i,j,k) into an index of the stencil buffer.
+    template<int i, int j, int k>
+    unsigned int pos() const { return SevenPt<i,j,k>::idx; }
+
+private:
+    /// Load the six neighbors of @a ijk; slot 0 (the center) is not written here.
+    inline void init(const Coord& ijk)
+    {
+        mStencil[SevenPt<-1, 0, 0>::idx] = mCache.getValue(ijk.offsetBy(-1, 0, 0));
+        mStencil[SevenPt< 1, 0, 0>::idx] = mCache.getValue(ijk.offsetBy( 1, 0, 0));
+        mStencil[SevenPt< 0,-1, 0>::idx] = mCache.getValue(ijk.offsetBy( 0,-1, 0));
+        mStencil[SevenPt< 0, 1, 0>::idx] = mCache.getValue(ijk.offsetBy( 0, 1, 0));
+        mStencil[SevenPt< 0, 0,-1>::idx] = mCache.getValue(ijk.offsetBy( 0, 0,-1));
+        mStencil[SevenPt< 0, 0, 1>::idx] = mCache.getValue(ijk.offsetBy( 0, 0, 1));
+    }
+
+    template<typename, typename, bool> friend class BaseStencil; // allow base class to call init()
+    using BaseType::mCache;
+    using BaseType::mStencil;
+};// SevenPointStencil class
+
+
+////////////////////////////////////////
+
+
+namespace { // anonymous namespace for stencil-layout map
+
+    // the eight point box stencil: maps a corner offset (i,j,k), each 0 or 1, to a buffer index
+    template<int i, int j, int k> struct BoxPt {};// no idx, so unlisted offsets cannot be looked up
+    template<> struct BoxPt< 0, 0, 0> { enum { idx = 0 }; };// i = 0 face, indices 0-3
+    template<> struct BoxPt< 0, 0, 1> { enum { idx = 1 }; };
+    template<> struct BoxPt< 0, 1, 1> { enum { idx = 2 }; };
+    template<> struct BoxPt< 0, 1, 0> { enum { idx = 3 }; };
+    template<> struct BoxPt< 1, 0, 0> { enum { idx = 4 }; };// i = 1 face, indices 4-7
+    template<> struct BoxPt< 1, 0, 1> { enum { idx = 5 }; };
+    template<> struct BoxPt< 1, 1, 1> { enum { idx = 6 }; };
+    template<> struct BoxPt< 1, 1, 0> { enum { idx = 7 }; };
+}
+/// @brief Stencil over the eight corners (offsets 0 or 1 per axis) of one cell.
+template<typename GridT, bool IsSafe = true>
+class BoxStencil: public BaseStencil<BoxStencil<GridT, IsSafe>, GridT, IsSafe>
+{
+    typedef BoxStencil<GridT, IsSafe>         SelfT;
+    typedef BaseStencil<SelfT, GridT, IsSafe> BaseType;
+public:
+    typedef GridT                             GridType;
+    typedef typename GridT::TreeType          TreeType;
+    typedef typename GridT::ValueType         ValueType;
+    /// Number of values held in the stencil buffer.
+    static const int SIZE = 8;
+
+    BoxStencil(const GridType& grid): BaseType(grid, SIZE) {}
+
+    /// Return linear offset for the specified stencil point relative to its center
+    template<int i, int j, int k>
+    unsigned int pos() const { return BoxPt<i,j,k>::idx; }
+
+    /// @brief Return true if the value at buffer index 0 lies on the other side of
+    /// the iso-contour specified by the isoValue than any of the remaining seven values
+    inline bool intersects(const ValueType &isoValue = zeroVal<ValueType>()) const
+    {
+        const bool less = mStencil[0] < isoValue;
+        return (less  ^  (mStencil[1] < isoValue)) ||
+               (less  ^  (mStencil[2] < isoValue)) ||
+               (less  ^  (mStencil[3] < isoValue)) ||
+               (less  ^  (mStencil[4] < isoValue)) ||
+               (less  ^  (mStencil[5] < isoValue)) ||
+               (less  ^  (mStencil[6] < isoValue)) ||
+               (less  ^  (mStencil[7] < isoValue))  ;
+    }
+
+    /// @brief Return the trilinear interpolation at the normalized position.
+    /// @param xyz Floating point coordinate position.
+    /// @warning It is assumed that the stencil has already been moved
+    /// to the relevant voxel position, e.g. using moveTo(xyz).
+    /// @note Trilinear interpolation kernel reads as:
+    ///       v000 (1-u)(1-v)(1-w) + v001 (1-u)(1-v)w + v010 (1-u)v(1-w) + v011 (1-u)vw
+    ///     + v100 u(1-v)(1-w)     + v101 u(1-v)w     + v110 uv(1-w)     + v111 uvw
+    inline ValueType interpolation(const math::Vec3<ValueType>& xyz) const
+    {
+        const Real u = xyz[0] - BaseType::mCenter[0]; assert(u>=0 && u<=1);
+        const Real v = xyz[1] - BaseType::mCenter[1]; assert(v>=0 && v<=1);
+        const Real w = xyz[2] - BaseType::mCenter[2]; assert(w>=0 && w<=1);
+
+        ValueType V = BaseType::template getValue<0,0,0>();
+        ValueType A = static_cast<ValueType>(V + (BaseType::template getValue<0,0,1>() - V) * w);
+        V = BaseType::template getValue< 0, 1, 0>();
+        ValueType B = static_cast<ValueType>(V + (BaseType::template getValue<0,1,1>() - V) * w);
+        ValueType C = static_cast<ValueType>(A + (B - A) * v);// blend of the i = 0 face
+
+        V = BaseType::template getValue<1,0,0>();
+        A = static_cast<ValueType>(V + (BaseType::template getValue<1,0,1>() - V) * w);
+        V = BaseType::template getValue<1,1,0>();
+        B = static_cast<ValueType>(V + (BaseType::template getValue<1,1,1>() - V) * w);
+        ValueType D = static_cast<ValueType>(A + (B - A) * v);// blend of the i = 1 face
+
+        return static_cast<ValueType>(C + (D - C) * u);
+    }
+
+    /// @brief Return the gradient in world space of the trilinear interpolation kernel.
+    /// @param xyz Floating point coordinate position.
+    /// @warning It is assumed that the stencil has already been moved
+    /// to the relevant voxel position, e.g. using moveTo(xyz).
+    /// @note Computed as partial derivatives of the trilinear interpolation kernel:
+    ///       v000 (1-u)(1-v)(1-w) + v001 (1-u)(1-v)w + v010 (1-u)v(1-w) + v011 (1-u)vw
+    ///     + v100 u(1-v)(1-w)     + v101 u(1-v)w     + v110 uv(1-w)     + v111 uvw
+    inline math::Vec3<ValueType> gradient(const math::Vec3<ValueType>& xyz) const
+    {
+        const Real u = xyz[0] - BaseType::mCenter[0]; assert(u>=0 && u<=1);
+        const Real v = xyz[1] - BaseType::mCenter[1]; assert(v>=0 && v<=1);
+        const Real w = xyz[2] - BaseType::mCenter[2]; assert(w>=0 && w<=1);
+
+        // D[n] starts as the difference along k for each of the four (i,j) columns.
+        ValueType D[4]={BaseType::template getValue<0,0,1>()-BaseType::template getValue<0,0,0>(),
+                        BaseType::template getValue<0,1,1>()-BaseType::template getValue<0,1,0>(),
+                        BaseType::template getValue<1,0,1>()-BaseType::template getValue<1,0,0>(),
+                        BaseType::template getValue<1,1,1>()-BaseType::template getValue<1,1,0>()};
+
+        // Z component
+        ValueType A = static_cast<ValueType>(D[0] + (D[1]- D[0]) * v);
+        ValueType B = static_cast<ValueType>(D[2] + (D[3]- D[2]) * v);
+        math::Vec3<ValueType> grad(zeroVal<ValueType>(),
+                                   zeroVal<ValueType>(),
+                                   static_cast<ValueType>(A + (B - A) * u));
+
+        // D[n] is now overwritten in place with each column's value blended along k by w.
+        D[0] = static_cast<ValueType>(BaseType::template getValue<0,0,0>() + D[0] * w);
+        D[1] = static_cast<ValueType>(BaseType::template getValue<0,1,0>() + D[1] * w);
+        D[2] = static_cast<ValueType>(BaseType::template getValue<1,0,0>() + D[2] * w);
+        D[3] = static_cast<ValueType>(BaseType::template getValue<1,1,0>() + D[3] * w);
+
+        // X component
+        A = static_cast<ValueType>(D[0] + (D[1] - D[0]) * v);
+        B = static_cast<ValueType>(D[2] + (D[3] - D[2]) * v);
+
+        grad[0] = B - A;
+
+        // Y component
+        A = D[1] - D[0];
+        B = D[3] - D[2];
+
+        grad[1] = static_cast<ValueType>(A + (B - A) * u);
+
+        return BaseType::mGrid->transform().baseMap()->applyIJT(grad, xyz);
+    }
+
+private:
+    // Fills the seven non-origin corners; slot <0,0,0> is not written here.
+    inline void init(const Coord& ijk)
+    {
+        BaseType::template setValue< 0, 0, 1>(mCache.getValue(ijk.offsetBy( 0, 0, 1)));
+        BaseType::template setValue< 0, 1, 1>(mCache.getValue(ijk.offsetBy( 0, 1, 1)));
+        BaseType::template setValue< 0, 1, 0>(mCache.getValue(ijk.offsetBy( 0, 1, 0)));
+        BaseType::template setValue< 1, 0, 0>(mCache.getValue(ijk.offsetBy( 1, 0, 0)));
+        BaseType::template setValue< 1, 0, 1>(mCache.getValue(ijk.offsetBy( 1, 0, 1)));
+        BaseType::template setValue< 1, 1, 1>(mCache.getValue(ijk.offsetBy( 1, 1, 1)));
+        BaseType::template setValue< 1, 1, 0>(mCache.getValue(ijk.offsetBy( 1, 1, 0)));
+    }
+    template<typename, typename, bool> friend class BaseStencil; // allow base class to call init()
+    using BaseType::mCache;
+    using BaseType::mStencil;
+};// BoxStencil class
+
+
+////////////////////////////////////////
+
+
+namespace { // anonymous namespace for stencil-layout map
+    // Compile-time map from a stencil offset (i,j,k) to its linear slot idx (0..18).
+    // the dense point stencil; the primary template has no idx, so only the offsets below resolve
+    template<int i, int j, int k> struct DensePt {};
+    template<> struct DensePt< 0, 0, 0> { enum { idx = 0 }; }; // center
+    // +1 along a single axis
+    template<> struct DensePt< 1, 0, 0> { enum { idx = 1 }; };
+    template<> struct DensePt< 0, 1, 0> { enum { idx = 2 }; };
+    template<> struct DensePt< 0, 0, 1> { enum { idx = 3 }; };
+    // -1 along a single axis
+    template<> struct DensePt<-1, 0, 0> { enum { idx = 4 }; };
+    template<> struct DensePt< 0,-1, 0> { enum { idx = 5 }; };
+    template<> struct DensePt< 0, 0,-1> { enum { idx = 6 }; };
+    // two non-zero offsets, both -1
+    template<> struct DensePt<-1,-1, 0> { enum { idx = 7 }; };
+    template<> struct DensePt< 0,-1,-1> { enum { idx = 8 }; };
+    template<> struct DensePt<-1, 0,-1> { enum { idx = 9 }; };
+    // two non-zero offsets: +1 on one axis, -1 on the next one (cyclic order i -> j -> k -> i)
+    template<> struct DensePt< 1,-1, 0> { enum { idx = 10 }; };
+    template<> struct DensePt< 0, 1,-1> { enum { idx = 11 }; };
+    template<> struct DensePt<-1, 0, 1> { enum { idx = 12 }; };
+    // two non-zero offsets: -1 on one axis, +1 on the next one (cyclic order i -> j -> k -> i)
+    template<> struct DensePt<-1, 1, 0> { enum { idx = 13 }; };
+    template<> struct DensePt< 0,-1, 1> { enum { idx = 14 }; };
+    template<> struct DensePt< 1, 0,-1> { enum { idx = 15 }; };
+    // two non-zero offsets, both +1
+    template<> struct DensePt< 1, 1, 0> { enum { idx = 16 }; };
+    template<> struct DensePt< 0, 1, 1> { enum { idx = 17 }; };
+    template<> struct DensePt< 1, 0, 1> { enum { idx = 18 }; };
+
+}
+
+/// Stencil of 19 values: the center plus the 18 offsets in {-1,0,1}^3 with one or two non-zero components.
+template<typename GridT, bool IsSafe = true>
+class SecondOrderDenseStencil
+    : public BaseStencil<SecondOrderDenseStencil<GridT, IsSafe>, GridT, IsSafe >
+{
+    typedef SecondOrderDenseStencil<GridT, IsSafe> SelfT;
+    typedef BaseStencil<SelfT, GridT, IsSafe >     BaseType;
+public:
+    typedef GridT                                  GridType;
+    typedef typename GridT::TreeType               TreeType;
+    typedef typename GridType::ValueType           ValueType;
+    // SIZE is handed to the BaseStencil constructor; it equals the number of DensePt slots (idx 0..18)
+    static const int SIZE = 19;
+
+    SecondOrderDenseStencil(const GridType& grid): BaseType(grid, SIZE) {}
+
+    /// Return linear offset for the specified stencil point relative to its center
+    template<int i, int j, int k>
+    unsigned int pos() const { return DensePt<i,j,k>::idx; }
+
+private:
+    inline void init(const Coord& ijk)
+    { // stores the 18 non-center entries, read through mCache; slot 0 (center) is not written here
+        mStencil[DensePt< 1, 0, 0>::idx] = mCache.getValue(ijk.offsetBy( 1,  0,  0));
+        mStencil[DensePt< 0, 1, 0>::idx] = mCache.getValue(ijk.offsetBy( 0,  1,  0));
+        mStencil[DensePt< 0, 0, 1>::idx] = mCache.getValue(ijk.offsetBy( 0,  0,  1));
+        // -1 along a single axis
+        mStencil[DensePt<-1, 0, 0>::idx] = mCache.getValue(ijk.offsetBy(-1,  0,  0));
+        mStencil[DensePt< 0,-1, 0>::idx] = mCache.getValue(ijk.offsetBy( 0, -1,  0));
+        mStencil[DensePt< 0, 0,-1>::idx] = mCache.getValue(ijk.offsetBy( 0,  0, -1));
+        // diagonals with k = 0
+        mStencil[DensePt<-1,-1, 0>::idx] = mCache.getValue(ijk.offsetBy(-1, -1,  0));
+        mStencil[DensePt< 1,-1, 0>::idx] = mCache.getValue(ijk.offsetBy( 1, -1,  0));
+        mStencil[DensePt<-1, 1, 0>::idx] = mCache.getValue(ijk.offsetBy(-1,  1,  0));
+        mStencil[DensePt< 1, 1, 0>::idx] = mCache.getValue(ijk.offsetBy( 1,  1,  0));
+        // diagonals with j = 0
+        mStencil[DensePt<-1, 0,-1>::idx] = mCache.getValue(ijk.offsetBy(-1,  0, -1));
+        mStencil[DensePt< 1, 0,-1>::idx] = mCache.getValue(ijk.offsetBy( 1,  0, -1));
+        mStencil[DensePt<-1, 0, 1>::idx] = mCache.getValue(ijk.offsetBy(-1,  0,  1));
+        mStencil[DensePt< 1, 0, 1>::idx] = mCache.getValue(ijk.offsetBy( 1,  0,  1));
+        // diagonals with i = 0
+        mStencil[DensePt< 0,-1,-1>::idx] = mCache.getValue(ijk.offsetBy( 0, -1, -1));
+        mStencil[DensePt< 0, 1,-1>::idx] = mCache.getValue(ijk.offsetBy( 0,  1, -1));
+        mStencil[DensePt< 0,-1, 1>::idx] = mCache.getValue(ijk.offsetBy( 0, -1,  1));
+        mStencil[DensePt< 0, 1, 1>::idx] = mCache.getValue(ijk.offsetBy( 0,  1,  1));
+    }
+
+    template<typename, typename, bool> friend class BaseStencil; // allow base class to call init()
+    using BaseType::mCache;
+    using BaseType::mStencil;
+};// SecondOrderDenseStencil class
+
+
+////////////////////////////////////////
+
+
+namespace { // anonymous namespace for stencil-layout map
+    // Compile-time map from a stencil offset (i,j,k) to its linear slot idx (0..12).
+    // the thirteen point stencil: center plus offsets of +/-1 and +/-2 along each single axis
+    template<int i, int j, int k> struct ThirteenPt {};
+    template<> struct ThirteenPt< 0, 0, 0> { enum { idx = 0 }; }; // center
+    // +1 along a single axis
+    template<> struct ThirteenPt< 1, 0, 0> { enum { idx = 1 }; };
+    template<> struct ThirteenPt< 0, 1, 0> { enum { idx = 2 }; };
+    template<> struct ThirteenPt< 0, 0, 1> { enum { idx = 3 }; };
+    // -1 along a single axis
+    template<> struct ThirteenPt<-1, 0, 0> { enum { idx = 4 }; };
+    template<> struct ThirteenPt< 0,-1, 0> { enum { idx = 5 }; };
+    template<> struct ThirteenPt< 0, 0,-1> { enum { idx = 6 }; };
+    // +2 along a single axis
+    template<> struct ThirteenPt< 2, 0, 0> { enum { idx = 7 }; };
+    template<> struct ThirteenPt< 0, 2, 0> { enum { idx = 8 }; };
+    template<> struct ThirteenPt< 0, 0, 2> { enum { idx = 9 }; };
+    // -2 along a single axis
+    template<> struct ThirteenPt<-2, 0, 0> { enum { idx = 10 }; };
+    template<> struct ThirteenPt< 0,-2, 0> { enum { idx = 11 }; };
+    template<> struct ThirteenPt< 0, 0,-2> { enum { idx = 12 }; };
+
+}
+
+/// Stencil of 13 values: the center plus the offsets +/-1 and +/-2 along each of the three axes.
+template<typename GridT, bool IsSafe = true>
+class ThirteenPointStencil
+    : public BaseStencil<ThirteenPointStencil<GridT, IsSafe>, GridT, IsSafe>
+{
+    typedef ThirteenPointStencil<GridT, IsSafe> SelfT;
+    typedef BaseStencil<SelfT, GridT, IsSafe >  BaseType;
+public:
+    typedef GridT                               GridType;
+    typedef typename GridT::TreeType            TreeType;
+    typedef typename GridType::ValueType        ValueType;
+    // SIZE is handed to the BaseStencil constructor; it equals the number of ThirteenPt slots (idx 0..12)
+    static const int SIZE = 13;
+
+    ThirteenPointStencil(const GridType& grid): BaseType(grid, SIZE) {}
+
+    /// Return linear offset for the specified stencil point relative to its center
+    template<int i, int j, int k>
+    unsigned int pos() const { return ThirteenPt<i,j,k>::idx; }
+
+private:
+    inline void init(const Coord& ijk)
+    { // stores the 12 non-center entries, read through mCache (slot 0 is not written here); i axis first
+        mStencil[ThirteenPt< 2, 0, 0>::idx] = mCache.getValue(ijk.offsetBy( 2,  0,  0));
+        mStencil[ThirteenPt< 1, 0, 0>::idx] = mCache.getValue(ijk.offsetBy( 1,  0,  0));
+        mStencil[ThirteenPt<-1, 0, 0>::idx] = mCache.getValue(ijk.offsetBy(-1,  0,  0));
+        mStencil[ThirteenPt<-2, 0, 0>::idx] = mCache.getValue(ijk.offsetBy(-2,  0,  0));
+        // j axis
+        mStencil[ThirteenPt< 0, 2, 0>::idx] = mCache.getValue(ijk.offsetBy( 0,  2,  0));
+        mStencil[ThirteenPt< 0, 1, 0>::idx] = mCache.getValue(ijk.offsetBy( 0,  1,  0));
+        mStencil[ThirteenPt< 0,-1, 0>::idx] = mCache.getValue(ijk.offsetBy( 0, -1,  0));
+        mStencil[ThirteenPt< 0,-2, 0>::idx] = mCache.getValue(ijk.offsetBy( 0, -2,  0));
+        // k axis
+        mStencil[ThirteenPt< 0, 0, 2>::idx] = mCache.getValue(ijk.offsetBy( 0,  0,  2));
+        mStencil[ThirteenPt< 0, 0, 1>::idx] = mCache.getValue(ijk.offsetBy( 0,  0,  1));
+        mStencil[ThirteenPt< 0, 0,-1>::idx] = mCache.getValue(ijk.offsetBy( 0,  0, -1));
+        mStencil[ThirteenPt< 0, 0,-2>::idx] = mCache.getValue(ijk.offsetBy( 0,  0, -2));
+    }
+
+    template<typename, typename, bool> friend class BaseStencil; // allow base class to call init()
+    using BaseType::mCache;
+    using BaseType::mStencil;
+};// ThirteenPointStencil class
+
+
+////////////////////////////////////////
+
+
+namespace { // anonymous namespace for stencil-layout map
+    // Compile-time map from a stencil offset (i,j,k) to its linear slot idx (0..60).
+    // the 4th-order dense point stencil: all offsets in [-2,2]^3 with at least one zero component
+    template<int i, int j, int k> struct FourthDensePt {};
+    template<> struct FourthDensePt< 0, 0, 0> { enum { idx = 0 }; }; // center
+    // k = 0 plane, rows j = 2 ... -2 (idx 1-24; the center row skips i = 0, which is idx 0)
+    template<> struct FourthDensePt<-2, 2, 0> { enum { idx = 1 }; };
+    template<> struct FourthDensePt<-1, 2, 0> { enum { idx = 2 }; };
+    template<> struct FourthDensePt< 0, 2, 0> { enum { idx = 3 }; };
+    template<> struct FourthDensePt< 1, 2, 0> { enum { idx = 4 }; };
+    template<> struct FourthDensePt< 2, 2, 0> { enum { idx = 5 }; };
+
+    template<> struct FourthDensePt<-2, 1, 0> { enum { idx = 6 }; };
+    template<> struct FourthDensePt<-1, 1, 0> { enum { idx = 7 }; };
+    template<> struct FourthDensePt< 0, 1, 0> { enum { idx = 8 }; };
+    template<> struct FourthDensePt< 1, 1, 0> { enum { idx = 9 }; };
+    template<> struct FourthDensePt< 2, 1, 0> { enum { idx = 10 }; };
+
+    template<> struct FourthDensePt<-2, 0, 0> { enum { idx = 11 }; };
+    template<> struct FourthDensePt<-1, 0, 0> { enum { idx = 12 }; };
+    template<> struct FourthDensePt< 1, 0, 0> { enum { idx = 13 }; };
+    template<> struct FourthDensePt< 2, 0, 0> { enum { idx = 14 }; };
+
+    template<> struct FourthDensePt<-2,-1, 0> { enum { idx = 15 }; };
+    template<> struct FourthDensePt<-1,-1, 0> { enum { idx = 16 }; };
+    template<> struct FourthDensePt< 0,-1, 0> { enum { idx = 17 }; };
+    template<> struct FourthDensePt< 1,-1, 0> { enum { idx = 18 }; };
+    template<> struct FourthDensePt< 2,-1, 0> { enum { idx = 19 }; };
+
+    template<> struct FourthDensePt<-2,-2, 0> { enum { idx = 20 }; };
+    template<> struct FourthDensePt<-1,-2, 0> { enum { idx = 21 }; };
+    template<> struct FourthDensePt< 0,-2, 0> { enum { idx = 22 }; };
+    template<> struct FourthDensePt< 1,-2, 0> { enum { idx = 23 }; };
+    template<> struct FourthDensePt< 2,-2, 0> { enum { idx = 24 }; };
+
+    // j = 0 plane, rows k = 2, 1, -1, -2 (idx 25-44; the k = 0 row is already listed above)
+    template<> struct FourthDensePt<-2, 0, 2> { enum { idx = 25 }; };
+    template<> struct FourthDensePt<-1, 0, 2> { enum { idx = 26 }; };
+    template<> struct FourthDensePt< 0, 0, 2> { enum { idx = 27 }; };
+    template<> struct FourthDensePt< 1, 0, 2> { enum { idx = 28 }; };
+    template<> struct FourthDensePt< 2, 0, 2> { enum { idx = 29 }; };
+
+    template<> struct FourthDensePt<-2, 0, 1> { enum { idx = 30 }; };
+    template<> struct FourthDensePt<-1, 0, 1> { enum { idx = 31 }; };
+    template<> struct FourthDensePt< 0, 0, 1> { enum { idx = 32 }; };
+    template<> struct FourthDensePt< 1, 0, 1> { enum { idx = 33 }; };
+    template<> struct FourthDensePt< 2, 0, 1> { enum { idx = 34 }; };
+
+    template<> struct FourthDensePt<-2, 0,-1> { enum { idx = 35 }; };
+    template<> struct FourthDensePt<-1, 0,-1> { enum { idx = 36 }; };
+    template<> struct FourthDensePt< 0, 0,-1> { enum { idx = 37 }; };
+    template<> struct FourthDensePt< 1, 0,-1> { enum { idx = 38 }; };
+    template<> struct FourthDensePt< 2, 0,-1> { enum { idx = 39 }; };
+
+    template<> struct FourthDensePt<-2, 0,-2> { enum { idx = 40 }; };
+    template<> struct FourthDensePt<-1, 0,-2> { enum { idx = 41 }; };
+    template<> struct FourthDensePt< 0, 0,-2> { enum { idx = 42 }; };
+    template<> struct FourthDensePt< 1, 0,-2> { enum { idx = 43 }; };
+    template<> struct FourthDensePt< 2, 0,-2> { enum { idx = 44 }; };
+
+    // i = 0 plane, only j != 0 and k != 0 (idx 45-60; the remaining points are already listed above)
+    template<> struct FourthDensePt< 0,-2, 2> { enum { idx = 45 }; };
+    template<> struct FourthDensePt< 0,-1, 2> { enum { idx = 46 }; };
+    template<> struct FourthDensePt< 0, 1, 2> { enum { idx = 47 }; };
+    template<> struct FourthDensePt< 0, 2, 2> { enum { idx = 48 }; };
+
+    template<> struct FourthDensePt< 0,-2, 1> { enum { idx = 49 }; };
+    template<> struct FourthDensePt< 0,-1, 1> { enum { idx = 50 }; };
+    template<> struct FourthDensePt< 0, 1, 1> { enum { idx = 51 }; };
+    template<> struct FourthDensePt< 0, 2, 1> { enum { idx = 52 }; };
+
+    template<> struct FourthDensePt< 0,-2,-1> { enum { idx = 53 }; };
+    template<> struct FourthDensePt< 0,-1,-1> { enum { idx = 54 }; };
+    template<> struct FourthDensePt< 0, 1,-1> { enum { idx = 55 }; };
+    template<> struct FourthDensePt< 0, 2,-1> { enum { idx = 56 }; };
+
+    template<> struct FourthDensePt< 0,-2,-2> { enum { idx = 57 }; };
+    template<> struct FourthDensePt< 0,-1,-2> { enum { idx = 58 }; };
+    template<> struct FourthDensePt< 0, 1,-2> { enum { idx = 59 }; };
+    template<> struct FourthDensePt< 0, 2,-2> { enum { idx = 60 }; };
+
+}
+
+/// Stencil of 61 values: all offsets in [-2,2]^3 that have at least one zero component.
+template<typename GridT, bool IsSafe = true>
+class FourthOrderDenseStencil
+    : public BaseStencil<FourthOrderDenseStencil<GridT, IsSafe>, GridT, IsSafe>
+{
+    typedef FourthOrderDenseStencil<GridT, IsSafe> SelfT;
+    typedef BaseStencil<SelfT, GridT, IsSafe >     BaseType;
+public:
+    typedef GridT                                  GridType;
+    typedef typename GridT::TreeType               TreeType;
+    typedef typename GridType::ValueType           ValueType;
+    // SIZE is handed to the BaseStencil constructor; it equals the number of FourthDensePt slots (idx 0..60)
+    static const int SIZE = 61;
+
+    FourthOrderDenseStencil(const GridType& grid): BaseType(grid, SIZE) {}
+
+    /// Return linear offset for the specified stencil point relative to its center
+    template<int i, int j, int k>
+    unsigned int pos() const { return FourthDensePt<i,j,k>::idx; }
+
+private:
+    inline void init(const Coord& ijk)
+    { // stores the 60 non-center entries, read through mCache (slot 0 is not written here); k = 0 plane first
+        mStencil[FourthDensePt<-2, 2, 0>::idx] = mCache.getValue(ijk.offsetBy(-2, 2, 0));
+        mStencil[FourthDensePt<-1, 2, 0>::idx] = mCache.getValue(ijk.offsetBy(-1, 2, 0));
+        mStencil[FourthDensePt< 0, 2, 0>::idx] = mCache.getValue(ijk.offsetBy( 0, 2, 0));
+        mStencil[FourthDensePt< 1, 2, 0>::idx] = mCache.getValue(ijk.offsetBy( 1, 2, 0));
+        mStencil[FourthDensePt< 2, 2, 0>::idx] = mCache.getValue(ijk.offsetBy( 2, 2, 0));
+
+        mStencil[FourthDensePt<-2, 1, 0>::idx] = mCache.getValue(ijk.offsetBy(-2, 1, 0));
+        mStencil[FourthDensePt<-1, 1, 0>::idx] = mCache.getValue(ijk.offsetBy(-1, 1, 0));
+        mStencil[FourthDensePt< 0, 1, 0>::idx] = mCache.getValue(ijk.offsetBy( 0, 1, 0));
+        mStencil[FourthDensePt< 1, 1, 0>::idx] = mCache.getValue(ijk.offsetBy( 1, 1, 0));
+        mStencil[FourthDensePt< 2, 1, 0>::idx] = mCache.getValue(ijk.offsetBy( 2, 1, 0));
+
+        mStencil[FourthDensePt<-2, 0, 0>::idx] = mCache.getValue(ijk.offsetBy(-2, 0, 0));
+        mStencil[FourthDensePt<-1, 0, 0>::idx] = mCache.getValue(ijk.offsetBy(-1, 0, 0));
+        mStencil[FourthDensePt< 1, 0, 0>::idx] = mCache.getValue(ijk.offsetBy( 1, 0, 0));
+        mStencil[FourthDensePt< 2, 0, 0>::idx] = mCache.getValue(ijk.offsetBy( 2, 0, 0));
+
+        mStencil[FourthDensePt<-2,-1, 0>::idx] = mCache.getValue(ijk.offsetBy(-2,-1, 0));
+        mStencil[FourthDensePt<-1,-1, 0>::idx] = mCache.getValue(ijk.offsetBy(-1,-1, 0));
+        mStencil[FourthDensePt< 0,-1, 0>::idx] = mCache.getValue(ijk.offsetBy( 0,-1, 0));
+        mStencil[FourthDensePt< 1,-1, 0>::idx] = mCache.getValue(ijk.offsetBy( 1,-1, 0));
+        mStencil[FourthDensePt< 2,-1, 0>::idx] = mCache.getValue(ijk.offsetBy( 2,-1, 0));
+
+        mStencil[FourthDensePt<-2,-2, 0>::idx] = mCache.getValue(ijk.offsetBy(-2,-2, 0));
+        mStencil[FourthDensePt<-1,-2, 0>::idx] = mCache.getValue(ijk.offsetBy(-1,-2, 0));
+        mStencil[FourthDensePt< 0,-2, 0>::idx] = mCache.getValue(ijk.offsetBy( 0,-2, 0));
+        mStencil[FourthDensePt< 1,-2, 0>::idx] = mCache.getValue(ijk.offsetBy( 1,-2, 0));
+        mStencil[FourthDensePt< 2,-2, 0>::idx] = mCache.getValue(ijk.offsetBy( 2,-2, 0));
+        // j = 0 plane, rows with k != 0
+        mStencil[FourthDensePt<-2, 0, 2>::idx] = mCache.getValue(ijk.offsetBy(-2, 0, 2));
+        mStencil[FourthDensePt<-1, 0, 2>::idx] = mCache.getValue(ijk.offsetBy(-1, 0, 2));
+        mStencil[FourthDensePt< 0, 0, 2>::idx] = mCache.getValue(ijk.offsetBy( 0, 0, 2));
+        mStencil[FourthDensePt< 1, 0, 2>::idx] = mCache.getValue(ijk.offsetBy( 1, 0, 2));
+        mStencil[FourthDensePt< 2, 0, 2>::idx] = mCache.getValue(ijk.offsetBy( 2, 0, 2));
+
+        mStencil[FourthDensePt<-2, 0, 1>::idx] = mCache.getValue(ijk.offsetBy(-2, 0, 1));
+        mStencil[FourthDensePt<-1, 0, 1>::idx] = mCache.getValue(ijk.offsetBy(-1, 0, 1));
+        mStencil[FourthDensePt< 0, 0, 1>::idx] = mCache.getValue(ijk.offsetBy( 0, 0, 1));
+        mStencil[FourthDensePt< 1, 0, 1>::idx] = mCache.getValue(ijk.offsetBy( 1, 0, 1));
+        mStencil[FourthDensePt< 2, 0, 1>::idx] = mCache.getValue(ijk.offsetBy( 2, 0, 1));
+
+        mStencil[FourthDensePt<-2, 0,-1>::idx] = mCache.getValue(ijk.offsetBy(-2, 0,-1));
+        mStencil[FourthDensePt<-1, 0,-1>::idx] = mCache.getValue(ijk.offsetBy(-1, 0,-1));
+        mStencil[FourthDensePt< 0, 0,-1>::idx] = mCache.getValue(ijk.offsetBy( 0, 0,-1));
+        mStencil[FourthDensePt< 1, 0,-1>::idx] = mCache.getValue(ijk.offsetBy( 1, 0,-1));
+        mStencil[FourthDensePt< 2, 0,-1>::idx] = mCache.getValue(ijk.offsetBy( 2, 0,-1));
+
+        mStencil[FourthDensePt<-2, 0,-2>::idx] = mCache.getValue(ijk.offsetBy(-2, 0,-2));
+        mStencil[FourthDensePt<-1, 0,-2>::idx] = mCache.getValue(ijk.offsetBy(-1, 0,-2));
+        mStencil[FourthDensePt< 0, 0,-2>::idx] = mCache.getValue(ijk.offsetBy( 0, 0,-2));
+        mStencil[FourthDensePt< 1, 0,-2>::idx] = mCache.getValue(ijk.offsetBy( 1, 0,-2));
+        mStencil[FourthDensePt< 2, 0,-2>::idx] = mCache.getValue(ijk.offsetBy( 2, 0,-2));
+
+        // i = 0 plane, only j != 0 and k != 0
+        mStencil[FourthDensePt< 0,-2, 2>::idx] = mCache.getValue(ijk.offsetBy( 0,-2, 2));
+        mStencil[FourthDensePt< 0,-1, 2>::idx] = mCache.getValue(ijk.offsetBy( 0,-1, 2));
+        mStencil[FourthDensePt< 0, 1, 2>::idx] = mCache.getValue(ijk.offsetBy( 0, 1, 2));
+        mStencil[FourthDensePt< 0, 2, 2>::idx] = mCache.getValue(ijk.offsetBy( 0, 2, 2));
+
+        mStencil[FourthDensePt< 0,-2, 1>::idx] = mCache.getValue(ijk.offsetBy( 0,-2, 1));
+        mStencil[FourthDensePt< 0,-1, 1>::idx] = mCache.getValue(ijk.offsetBy( 0,-1, 1));
+        mStencil[FourthDensePt< 0, 1, 1>::idx] = mCache.getValue(ijk.offsetBy( 0, 1, 1));
+        mStencil[FourthDensePt< 0, 2, 1>::idx] = mCache.getValue(ijk.offsetBy( 0, 2, 1));
+
+        mStencil[FourthDensePt< 0,-2,-1>::idx] = mCache.getValue(ijk.offsetBy( 0,-2,-1));
+        mStencil[FourthDensePt< 0,-1,-1>::idx] = mCache.getValue(ijk.offsetBy( 0,-1,-1));
+        mStencil[FourthDensePt< 0, 1,-1>::idx] = mCache.getValue(ijk.offsetBy( 0, 1,-1));
+        mStencil[FourthDensePt< 0, 2,-1>::idx] = mCache.getValue(ijk.offsetBy( 0, 2,-1));
+
+        mStencil[FourthDensePt< 0,-2,-2>::idx] = mCache.getValue(ijk.offsetBy( 0,-2,-2));
+        mStencil[FourthDensePt< 0,-1,-2>::idx] = mCache.getValue(ijk.offsetBy( 0,-1,-2));
+        mStencil[FourthDensePt< 0, 1,-2>::idx] = mCache.getValue(ijk.offsetBy( 0, 1,-2));
+        mStencil[FourthDensePt< 0, 2,-2>::idx] = mCache.getValue(ijk.offsetBy( 0, 2,-2));
+    }
+
+    template<typename, typename, bool> friend class BaseStencil; // allow base class to call init()
+    using BaseType::mCache;
+    using BaseType::mStencil;
+};// FourthOrderDenseStencil class
+
+
+////////////////////////////////////////
+
+
+namespace { // anonymous namespace for stencil-layout map
+    // Compile-time map from a stencil offset (i,j,k) to its linear slot idx (0..18).
+    // the nineteen point stencil: center plus offsets of +/-1, +/-2 and +/-3 along each single axis
+    template<int i, int j, int k> struct NineteenPt {};
+    template<> struct NineteenPt< 0, 0, 0> { enum { idx = 0 }; }; // center
+    // +1 along a single axis
+    template<> struct NineteenPt< 1, 0, 0> { enum { idx = 1 }; };
+    template<> struct NineteenPt< 0, 1, 0> { enum { idx = 2 }; };
+    template<> struct NineteenPt< 0, 0, 1> { enum { idx = 3 }; };
+    // -1 along a single axis
+    template<> struct NineteenPt<-1, 0, 0> { enum { idx = 4 }; };
+    template<> struct NineteenPt< 0,-1, 0> { enum { idx = 5 }; };
+    template<> struct NineteenPt< 0, 0,-1> { enum { idx = 6 }; };
+    // +2 along a single axis
+    template<> struct NineteenPt< 2, 0, 0> { enum { idx = 7 }; };
+    template<> struct NineteenPt< 0, 2, 0> { enum { idx = 8 }; };
+    template<> struct NineteenPt< 0, 0, 2> { enum { idx = 9 }; };
+    // -2 along a single axis
+    template<> struct NineteenPt<-2, 0, 0> { enum { idx = 10 }; };
+    template<> struct NineteenPt< 0,-2, 0> { enum { idx = 11 }; };
+    template<> struct NineteenPt< 0, 0,-2> { enum { idx = 12 }; };
+    // +3 along a single axis
+    template<> struct NineteenPt< 3, 0, 0> { enum { idx = 13 }; };
+    template<> struct NineteenPt< 0, 3, 0> { enum { idx = 14 }; };
+    template<> struct NineteenPt< 0, 0, 3> { enum { idx = 15 }; };
+    // -3 along a single axis
+    template<> struct NineteenPt<-3, 0, 0> { enum { idx = 16 }; };
+    template<> struct NineteenPt< 0,-3, 0> { enum { idx = 17 }; };
+    template<> struct NineteenPt< 0, 0,-3> { enum { idx = 18 }; };
+
+}
+
+/// Stencil of 19 values: the center plus the offsets +/-1, +/-2 and +/-3 along each of the three axes.
+template<typename GridT, bool IsSafe = true>
+class NineteenPointStencil
+    : public BaseStencil<NineteenPointStencil<GridT, IsSafe>, GridT, IsSafe>
+{
+    typedef NineteenPointStencil<GridT, IsSafe> SelfT;
+    typedef BaseStencil<SelfT, GridT, IsSafe >  BaseType;
+public:
+    typedef GridT                               GridType;
+    typedef typename GridT::TreeType            TreeType;
+    typedef typename GridType::ValueType        ValueType;
+    // SIZE is handed to the BaseStencil constructor; it equals the number of NineteenPt slots (idx 0..18)
+    static const int SIZE = 19;
+
+    NineteenPointStencil(const GridType& grid): BaseType(grid, SIZE) {}
+
+    /// Return linear offset for the specified stencil point relative to its center
+    template<int i, int j, int k>
+    unsigned int pos() const { return NineteenPt<i,j,k>::idx; }
+
+private:
+    inline void init(const Coord& ijk)
+    { // stores the 18 non-center entries, read through mCache (slot 0 is not written here); i axis first
+        mStencil[NineteenPt< 3, 0, 0>::idx] = mCache.getValue(ijk.offsetBy( 3,  0,  0));
+        mStencil[NineteenPt< 2, 0, 0>::idx] = mCache.getValue(ijk.offsetBy( 2,  0,  0));
+        mStencil[NineteenPt< 1, 0, 0>::idx] = mCache.getValue(ijk.offsetBy( 1,  0,  0));
+        mStencil[NineteenPt<-1, 0, 0>::idx] = mCache.getValue(ijk.offsetBy(-1,  0,  0));
+        mStencil[NineteenPt<-2, 0, 0>::idx] = mCache.getValue(ijk.offsetBy(-2,  0,  0));
+        mStencil[NineteenPt<-3, 0, 0>::idx] = mCache.getValue(ijk.offsetBy(-3,  0,  0));
+        // j axis
+        mStencil[NineteenPt< 0, 3, 0>::idx] = mCache.getValue(ijk.offsetBy( 0,  3,  0));
+        mStencil[NineteenPt< 0, 2, 0>::idx] = mCache.getValue(ijk.offsetBy( 0,  2,  0));
+        mStencil[NineteenPt< 0, 1, 0>::idx] = mCache.getValue(ijk.offsetBy( 0,  1,  0));
+        mStencil[NineteenPt< 0,-1, 0>::idx] = mCache.getValue(ijk.offsetBy( 0, -1,  0));
+        mStencil[NineteenPt< 0,-2, 0>::idx] = mCache.getValue(ijk.offsetBy( 0, -2,  0));
+        mStencil[NineteenPt< 0,-3, 0>::idx] = mCache.getValue(ijk.offsetBy( 0, -3,  0));
+        // k axis
+        mStencil[NineteenPt< 0, 0, 3>::idx] = mCache.getValue(ijk.offsetBy( 0,  0,  3));
+        mStencil[NineteenPt< 0, 0, 2>::idx] = mCache.getValue(ijk.offsetBy( 0,  0,  2));
+        mStencil[NineteenPt< 0, 0, 1>::idx] = mCache.getValue(ijk.offsetBy( 0,  0,  1));
+        mStencil[NineteenPt< 0, 0,-1>::idx] = mCache.getValue(ijk.offsetBy( 0,  0, -1));
+        mStencil[NineteenPt< 0, 0,-2>::idx] = mCache.getValue(ijk.offsetBy( 0,  0, -2));
+        mStencil[NineteenPt< 0, 0,-3>::idx] = mCache.getValue(ijk.offsetBy( 0,  0, -3));
+    }
+
+    template<typename, typename, bool> friend class BaseStencil; // allow base class to call init()
+    using BaseType::mCache;
+    using BaseType::mStencil;
+};// NineteenPointStencil class
+
+
+////////////////////////////////////////
+
+
+namespace { // anonymous namespace for stencil-layout map
+    // Compile-time map from a stencil offset (i,j,k) to its linear slot idx (0..126).
+    // the 6th-order dense point stencil: all offsets in [-3,3]^3 with at least one zero component
+    template<int i, int j, int k> struct SixthDensePt { };
+    template<> struct SixthDensePt< 0, 0, 0> { enum { idx = 0 }; }; // center
+    // k = 0 plane, rows j = 3 ... -3 (idx 1-48; the center row skips i = 0, which is idx 0)
+    template<> struct SixthDensePt<-3, 3, 0> { enum { idx = 1 }; };
+    template<> struct SixthDensePt<-2, 3, 0> { enum { idx = 2 }; };
+    template<> struct SixthDensePt<-1, 3, 0> { enum { idx = 3 }; };
+    template<> struct SixthDensePt< 0, 3, 0> { enum { idx = 4 }; };
+    template<> struct SixthDensePt< 1, 3, 0> { enum { idx = 5 }; };
+    template<> struct SixthDensePt< 2, 3, 0> { enum { idx = 6 }; };
+    template<> struct SixthDensePt< 3, 3, 0> { enum { idx = 7 }; };
+
+    template<> struct SixthDensePt<-3, 2, 0> { enum { idx = 8 }; };
+    template<> struct SixthDensePt<-2, 2, 0> { enum { idx = 9 }; };
+    template<> struct SixthDensePt<-1, 2, 0> { enum { idx = 10 }; };
+    template<> struct SixthDensePt< 0, 2, 0> { enum { idx = 11 }; };
+    template<> struct SixthDensePt< 1, 2, 0> { enum { idx = 12 }; };
+    template<> struct SixthDensePt< 2, 2, 0> { enum { idx = 13 }; };
+    template<> struct SixthDensePt< 3, 2, 0> { enum { idx = 14 }; };
+
+    template<> struct SixthDensePt<-3, 1, 0> { enum { idx = 15 }; };
+    template<> struct SixthDensePt<-2, 1, 0> { enum { idx = 16 }; };
+    template<> struct SixthDensePt<-1, 1, 0> { enum { idx = 17 }; };
+    template<> struct SixthDensePt< 0, 1, 0> { enum { idx = 18 }; };
+    template<> struct SixthDensePt< 1, 1, 0> { enum { idx = 19 }; };
+    template<> struct SixthDensePt< 2, 1, 0> { enum { idx = 20 }; };
+    template<> struct SixthDensePt< 3, 1, 0> { enum { idx = 21 }; };
+
+    template<> struct SixthDensePt<-3, 0, 0> { enum { idx = 22 }; };
+    template<> struct SixthDensePt<-2, 0, 0> { enum { idx = 23 }; };
+    template<> struct SixthDensePt<-1, 0, 0> { enum { idx = 24 }; };
+    template<> struct SixthDensePt< 1, 0, 0> { enum { idx = 25 }; };
+    template<> struct SixthDensePt< 2, 0, 0> { enum { idx = 26 }; };
+    template<> struct SixthDensePt< 3, 0, 0> { enum { idx = 27 }; };
+
+
+    template<> struct SixthDensePt<-3,-1, 0> { enum { idx = 28 }; };
+    template<> struct SixthDensePt<-2,-1, 0> { enum { idx = 29 }; };
+    template<> struct SixthDensePt<-1,-1, 0> { enum { idx = 30 }; };
+    template<> struct SixthDensePt< 0,-1, 0> { enum { idx = 31 }; };
+    template<> struct SixthDensePt< 1,-1, 0> { enum { idx = 32 }; };
+    template<> struct SixthDensePt< 2,-1, 0> { enum { idx = 33 }; };
+    template<> struct SixthDensePt< 3,-1, 0> { enum { idx = 34 }; };
+
+
+    template<> struct SixthDensePt<-3,-2, 0> { enum { idx = 35 }; };
+    template<> struct SixthDensePt<-2,-2, 0> { enum { idx = 36 }; };
+    template<> struct SixthDensePt<-1,-2, 0> { enum { idx = 37 }; };
+    template<> struct SixthDensePt< 0,-2, 0> { enum { idx = 38 }; };
+    template<> struct SixthDensePt< 1,-2, 0> { enum { idx = 39 }; };
+    template<> struct SixthDensePt< 2,-2, 0> { enum { idx = 40 }; };
+    template<> struct SixthDensePt< 3,-2, 0> { enum { idx = 41 }; };
+
+
+    template<> struct SixthDensePt<-3,-3, 0> { enum { idx = 42 }; };
+    template<> struct SixthDensePt<-2,-3, 0> { enum { idx = 43 }; };
+    template<> struct SixthDensePt<-1,-3, 0> { enum { idx = 44 }; };
+    template<> struct SixthDensePt< 0,-3, 0> { enum { idx = 45 }; };
+    template<> struct SixthDensePt< 1,-3, 0> { enum { idx = 46 }; };
+    template<> struct SixthDensePt< 2,-3, 0> { enum { idx = 47 }; };
+    template<> struct SixthDensePt< 3,-3, 0> { enum { idx = 48 }; };
+
+    // j = 0 plane, rows k = 3, 2, 1, -1, -2, -3 (idx 49-90; the k = 0 row is already listed above)
+    template<> struct SixthDensePt<-3, 0, 3> { enum { idx = 49 }; };
+    template<> struct SixthDensePt<-2, 0, 3> { enum { idx = 50 }; };
+    template<> struct SixthDensePt<-1, 0, 3> { enum { idx = 51 }; };
+    template<> struct SixthDensePt< 0, 0, 3> { enum { idx = 52 }; };
+    template<> struct SixthDensePt< 1, 0, 3> { enum { idx = 53 }; };
+    template<> struct SixthDensePt< 2, 0, 3> { enum { idx = 54 }; };
+    template<> struct SixthDensePt< 3, 0, 3> { enum { idx = 55 }; };
+
+
+    template<> struct SixthDensePt<-3, 0, 2> { enum { idx = 56 }; };
+    template<> struct SixthDensePt<-2, 0, 2> { enum { idx = 57 }; };
+    template<> struct SixthDensePt<-1, 0, 2> { enum { idx = 58 }; };
+    template<> struct SixthDensePt< 0, 0, 2> { enum { idx = 59 }; };
+    template<> struct SixthDensePt< 1, 0, 2> { enum { idx = 60 }; };
+    template<> struct SixthDensePt< 2, 0, 2> { enum { idx = 61 }; };
+    template<> struct SixthDensePt< 3, 0, 2> { enum { idx = 62 }; };
+
+    template<> struct SixthDensePt<-3, 0, 1> { enum { idx = 63 }; };
+    template<> struct SixthDensePt<-2, 0, 1> { enum { idx = 64 }; };
+    template<> struct SixthDensePt<-1, 0, 1> { enum { idx = 65 }; };
+    template<> struct SixthDensePt< 0, 0, 1> { enum { idx = 66 }; };
+    template<> struct SixthDensePt< 1, 0, 1> { enum { idx = 67 }; };
+    template<> struct SixthDensePt< 2, 0, 1> { enum { idx = 68 }; };
+    template<> struct SixthDensePt< 3, 0, 1> { enum { idx = 69 }; };
+
+
+    template<> struct SixthDensePt<-3, 0,-1> { enum { idx = 70 }; };
+    template<> struct SixthDensePt<-2, 0,-1> { enum { idx = 71 }; };
+    template<> struct SixthDensePt<-1, 0,-1> { enum { idx = 72 }; };
+    template<> struct SixthDensePt< 0, 0,-1> { enum { idx = 73 }; };
+    template<> struct SixthDensePt< 1, 0,-1> { enum { idx = 74 }; };
+    template<> struct SixthDensePt< 2, 0,-1> { enum { idx = 75 }; };
+    template<> struct SixthDensePt< 3, 0,-1> { enum { idx = 76 }; };
+
+
+    template<> struct SixthDensePt<-3, 0,-2> { enum { idx = 77 }; };
+    template<> struct SixthDensePt<-2, 0,-2> { enum { idx = 78 }; };
+    template<> struct SixthDensePt<-1, 0,-2> { enum { idx = 79 }; };
+    template<> struct SixthDensePt< 0, 0,-2> { enum { idx = 80 }; };
+    template<> struct SixthDensePt< 1, 0,-2> { enum { idx = 81 }; };
+    template<> struct SixthDensePt< 2, 0,-2> { enum { idx = 82 }; };
+    template<> struct SixthDensePt< 3, 0,-2> { enum { idx = 83 }; };
+
+
+    template<> struct SixthDensePt<-3, 0,-3> { enum { idx = 84 }; };
+    template<> struct SixthDensePt<-2, 0,-3> { enum { idx = 85 }; };
+    template<> struct SixthDensePt<-1, 0,-3> { enum { idx = 86 }; };
+    template<> struct SixthDensePt< 0, 0,-3> { enum { idx = 87 }; };
+    template<> struct SixthDensePt< 1, 0,-3> { enum { idx = 88 }; };
+    template<> struct SixthDensePt< 2, 0,-3> { enum { idx = 89 }; };
+    template<> struct SixthDensePt< 3, 0,-3> { enum { idx = 90 }; };
+
+    // i = 0 plane, only j != 0 and k != 0 (idx 91-126; the remaining points are already listed above)
+    template<> struct SixthDensePt< 0,-3, 3> { enum { idx = 91 }; };
+    template<> struct SixthDensePt< 0,-2, 3> { enum { idx = 92 }; };
+    template<> struct SixthDensePt< 0,-1, 3> { enum { idx = 93 }; };
+    template<> struct SixthDensePt< 0, 1, 3> { enum { idx = 94 }; };
+    template<> struct SixthDensePt< 0, 2, 3> { enum { idx = 95 }; };
+    template<> struct SixthDensePt< 0, 3, 3> { enum { idx = 96 }; };
+
+    template<> struct SixthDensePt< 0,-3, 2> { enum { idx = 97 }; };
+    template<> struct SixthDensePt< 0,-2, 2> { enum { idx = 98 }; };
+    template<> struct SixthDensePt< 0,-1, 2> { enum { idx = 99 }; };
+    template<> struct SixthDensePt< 0, 1, 2> { enum { idx = 100 }; };
+    template<> struct SixthDensePt< 0, 2, 2> { enum { idx = 101 }; };
+    template<> struct SixthDensePt< 0, 3, 2> { enum { idx = 102 }; };
+
+    template<> struct SixthDensePt< 0,-3, 1> { enum { idx = 103 }; };
+    template<> struct SixthDensePt< 0,-2, 1> { enum { idx = 104 }; };
+    template<> struct SixthDensePt< 0,-1, 1> { enum { idx = 105 }; };
+    template<> struct SixthDensePt< 0, 1, 1> { enum { idx = 106 }; };
+    template<> struct SixthDensePt< 0, 2, 1> { enum { idx = 107 }; };
+    template<> struct SixthDensePt< 0, 3, 1> { enum { idx = 108 }; };
+
+    template<> struct SixthDensePt< 0,-3,-1> { enum { idx = 109 }; };
+    template<> struct SixthDensePt< 0,-2,-1> { enum { idx = 110 }; };
+    template<> struct SixthDensePt< 0,-1,-1> { enum { idx = 111 }; };
+    template<> struct SixthDensePt< 0, 1,-1> { enum { idx = 112 }; };
+    template<> struct SixthDensePt< 0, 2,-1> { enum { idx = 113 }; };
+    template<> struct SixthDensePt< 0, 3,-1> { enum { idx = 114 }; };
+
+    template<> struct SixthDensePt< 0,-3,-2> { enum { idx = 115 }; };
+    template<> struct SixthDensePt< 0,-2,-2> { enum { idx = 116 }; };
+    template<> struct SixthDensePt< 0,-1,-2> { enum { idx = 117 }; };
+    template<> struct SixthDensePt< 0, 1,-2> { enum { idx = 118 }; };
+    template<> struct SixthDensePt< 0, 2,-2> { enum { idx = 119 }; };
+    template<> struct SixthDensePt< 0, 3,-2> { enum { idx = 120 }; };
+
+    template<> struct SixthDensePt< 0,-3,-3> { enum { idx = 121 }; };
+    template<> struct SixthDensePt< 0,-2,-3> { enum { idx = 122 }; };
+    template<> struct SixthDensePt< 0,-1,-3> { enum { idx = 123 }; };
+    template<> struct SixthDensePt< 0, 1,-3> { enum { idx = 124 }; };
+    template<> struct SixthDensePt< 0, 2,-3> { enum { idx = 125 }; };
+    template<> struct SixthDensePt< 0, 3,-3> { enum { idx = 126 }; };
+
+}
+
+
+template<typename GridT, bool IsSafe = true>
+class SixthOrderDenseStencil // samples the three axis-aligned planes through the center, +/-3 voxels
+    : public BaseStencil<SixthOrderDenseStencil<GridT, IsSafe>, GridT, IsSafe>
+{
+    typedef SixthOrderDenseStencil<GridT, IsSafe> SelfT;
+    typedef BaseStencil<SelfT, GridT, IsSafe >    BaseType;
+public:
+    typedef GridT                                 GridType;
+    typedef typename GridT::TreeType              TreeType;
+    typedef typename GridType::ValueType          ValueType;
+    /// Buffer length: the 48 + 42 + 36 = 126 points loaded by init() plus one slot for the center.
+    static const int SIZE = 127;
+    /// Construct a stencil over @a grid whose buffer holds SIZE values.
+    SixthOrderDenseStencil(const GridType& grid): BaseType(grid, SIZE) {}
+
+    /// Return the buffer index, SixthDensePt<i,j,k>::idx, of the point at offset (i,j,k) from the center
+    template<int i, int j, int k>
+    unsigned int pos() const { return SixthDensePt<i,j,k>::idx; }
+
+private:
+    inline void init(const Coord& ijk) // load every off-center point around ijk through mCache
+    { // xy-plane (k = 0), rows j = 3 down to -3; the center (0,0,0) itself is not written here
+        mStencil[SixthDensePt<-3, 3, 0>::idx] = mCache.getValue(ijk.offsetBy(-3, 3, 0));
+        mStencil[SixthDensePt<-2, 3, 0>::idx] = mCache.getValue(ijk.offsetBy(-2, 3, 0));
+        mStencil[SixthDensePt<-1, 3, 0>::idx] = mCache.getValue(ijk.offsetBy(-1, 3, 0));
+        mStencil[SixthDensePt< 0, 3, 0>::idx] = mCache.getValue(ijk.offsetBy( 0, 3, 0));
+        mStencil[SixthDensePt< 1, 3, 0>::idx] = mCache.getValue(ijk.offsetBy( 1, 3, 0));
+        mStencil[SixthDensePt< 2, 3, 0>::idx] = mCache.getValue(ijk.offsetBy( 2, 3, 0));
+        mStencil[SixthDensePt< 3, 3, 0>::idx] = mCache.getValue(ijk.offsetBy( 3, 3, 0));
+
+        mStencil[SixthDensePt<-3, 2, 0>::idx] = mCache.getValue(ijk.offsetBy(-3, 2, 0));
+        mStencil[SixthDensePt<-2, 2, 0>::idx] = mCache.getValue(ijk.offsetBy(-2, 2, 0));
+        mStencil[SixthDensePt<-1, 2, 0>::idx] = mCache.getValue(ijk.offsetBy(-1, 2, 0));
+        mStencil[SixthDensePt< 0, 2, 0>::idx] = mCache.getValue(ijk.offsetBy( 0, 2, 0));
+        mStencil[SixthDensePt< 1, 2, 0>::idx] = mCache.getValue(ijk.offsetBy( 1, 2, 0));
+        mStencil[SixthDensePt< 2, 2, 0>::idx] = mCache.getValue(ijk.offsetBy( 2, 2, 0));
+        mStencil[SixthDensePt< 3, 2, 0>::idx] = mCache.getValue(ijk.offsetBy( 3, 2, 0));
+
+        mStencil[SixthDensePt<-3, 1, 0>::idx] = mCache.getValue(ijk.offsetBy(-3, 1, 0));
+        mStencil[SixthDensePt<-2, 1, 0>::idx] = mCache.getValue(ijk.offsetBy(-2, 1, 0));
+        mStencil[SixthDensePt<-1, 1, 0>::idx] = mCache.getValue(ijk.offsetBy(-1, 1, 0));
+        mStencil[SixthDensePt< 0, 1, 0>::idx] = mCache.getValue(ijk.offsetBy( 0, 1, 0));
+        mStencil[SixthDensePt< 1, 1, 0>::idx] = mCache.getValue(ijk.offsetBy( 1, 1, 0));
+        mStencil[SixthDensePt< 2, 1, 0>::idx] = mCache.getValue(ijk.offsetBy( 2, 1, 0));
+        mStencil[SixthDensePt< 3, 1, 0>::idx] = mCache.getValue(ijk.offsetBy( 3, 1, 0));
+
+        mStencil[SixthDensePt<-3, 0, 0>::idx] = mCache.getValue(ijk.offsetBy(-3, 0, 0));
+        mStencil[SixthDensePt<-2, 0, 0>::idx] = mCache.getValue(ijk.offsetBy(-2, 0, 0));
+        mStencil[SixthDensePt<-1, 0, 0>::idx] = mCache.getValue(ijk.offsetBy(-1, 0, 0));
+        mStencil[SixthDensePt< 1, 0, 0>::idx] = mCache.getValue(ijk.offsetBy( 1, 0, 0));
+        mStencil[SixthDensePt< 2, 0, 0>::idx] = mCache.getValue(ijk.offsetBy( 2, 0, 0));
+        mStencil[SixthDensePt< 3, 0, 0>::idx] = mCache.getValue(ijk.offsetBy( 3, 0, 0));
+
+        mStencil[SixthDensePt<-3,-1, 0>::idx] = mCache.getValue(ijk.offsetBy(-3,-1, 0));
+        mStencil[SixthDensePt<-2,-1, 0>::idx] = mCache.getValue(ijk.offsetBy(-2,-1, 0));
+        mStencil[SixthDensePt<-1,-1, 0>::idx] = mCache.getValue(ijk.offsetBy(-1,-1, 0));
+        mStencil[SixthDensePt< 0,-1, 0>::idx] = mCache.getValue(ijk.offsetBy( 0,-1, 0));
+        mStencil[SixthDensePt< 1,-1, 0>::idx] = mCache.getValue(ijk.offsetBy( 1,-1, 0));
+        mStencil[SixthDensePt< 2,-1, 0>::idx] = mCache.getValue(ijk.offsetBy( 2,-1, 0));
+        mStencil[SixthDensePt< 3,-1, 0>::idx] = mCache.getValue(ijk.offsetBy( 3,-1, 0));
+
+        mStencil[SixthDensePt<-3,-2, 0>::idx] = mCache.getValue(ijk.offsetBy(-3,-2, 0));
+        mStencil[SixthDensePt<-2,-2, 0>::idx] = mCache.getValue(ijk.offsetBy(-2,-2, 0));
+        mStencil[SixthDensePt<-1,-2, 0>::idx] = mCache.getValue(ijk.offsetBy(-1,-2, 0));
+        mStencil[SixthDensePt< 0,-2, 0>::idx] = mCache.getValue(ijk.offsetBy( 0,-2, 0));
+        mStencil[SixthDensePt< 1,-2, 0>::idx] = mCache.getValue(ijk.offsetBy( 1,-2, 0));
+        mStencil[SixthDensePt< 2,-2, 0>::idx] = mCache.getValue(ijk.offsetBy( 2,-2, 0));
+        mStencil[SixthDensePt< 3,-2, 0>::idx] = mCache.getValue(ijk.offsetBy( 3,-2, 0));
+
+        mStencil[SixthDensePt<-3,-3, 0>::idx] = mCache.getValue(ijk.offsetBy(-3,-3, 0));
+        mStencil[SixthDensePt<-2,-3, 0>::idx] = mCache.getValue(ijk.offsetBy(-2,-3, 0));
+        mStencil[SixthDensePt<-1,-3, 0>::idx] = mCache.getValue(ijk.offsetBy(-1,-3, 0));
+        mStencil[SixthDensePt< 0,-3, 0>::idx] = mCache.getValue(ijk.offsetBy( 0,-3, 0));
+        mStencil[SixthDensePt< 1,-3, 0>::idx] = mCache.getValue(ijk.offsetBy( 1,-3, 0));
+        mStencil[SixthDensePt< 2,-3, 0>::idx] = mCache.getValue(ijk.offsetBy( 2,-3, 0));
+        mStencil[SixthDensePt< 3,-3, 0>::idx] = mCache.getValue(ijk.offsetBy( 3,-3, 0));
+        // xz-plane (j = 0): rows k = 3..1 then -1..-3; the k = 0 row was loaded with the xy-plane
+        mStencil[SixthDensePt<-3, 0, 3>::idx] = mCache.getValue(ijk.offsetBy(-3, 0, 3));
+        mStencil[SixthDensePt<-2, 0, 3>::idx] = mCache.getValue(ijk.offsetBy(-2, 0, 3));
+        mStencil[SixthDensePt<-1, 0, 3>::idx] = mCache.getValue(ijk.offsetBy(-1, 0, 3));
+        mStencil[SixthDensePt< 0, 0, 3>::idx] = mCache.getValue(ijk.offsetBy( 0, 0, 3));
+        mStencil[SixthDensePt< 1, 0, 3>::idx] = mCache.getValue(ijk.offsetBy( 1, 0, 3));
+        mStencil[SixthDensePt< 2, 0, 3>::idx] = mCache.getValue(ijk.offsetBy( 2, 0, 3));
+        mStencil[SixthDensePt< 3, 0, 3>::idx] = mCache.getValue(ijk.offsetBy( 3, 0, 3));
+
+        mStencil[SixthDensePt<-3, 0, 2>::idx] = mCache.getValue(ijk.offsetBy(-3, 0, 2));
+        mStencil[SixthDensePt<-2, 0, 2>::idx] = mCache.getValue(ijk.offsetBy(-2, 0, 2));
+        mStencil[SixthDensePt<-1, 0, 2>::idx] = mCache.getValue(ijk.offsetBy(-1, 0, 2));
+        mStencil[SixthDensePt< 0, 0, 2>::idx] = mCache.getValue(ijk.offsetBy( 0, 0, 2));
+        mStencil[SixthDensePt< 1, 0, 2>::idx] = mCache.getValue(ijk.offsetBy( 1, 0, 2));
+        mStencil[SixthDensePt< 2, 0, 2>::idx] = mCache.getValue(ijk.offsetBy( 2, 0, 2));
+        mStencil[SixthDensePt< 3, 0, 2>::idx] = mCache.getValue(ijk.offsetBy( 3, 0, 2));
+
+        mStencil[SixthDensePt<-3, 0, 1>::idx] = mCache.getValue(ijk.offsetBy(-3, 0, 1));
+        mStencil[SixthDensePt<-2, 0, 1>::idx] = mCache.getValue(ijk.offsetBy(-2, 0, 1));
+        mStencil[SixthDensePt<-1, 0, 1>::idx] = mCache.getValue(ijk.offsetBy(-1, 0, 1));
+        mStencil[SixthDensePt< 0, 0, 1>::idx] = mCache.getValue(ijk.offsetBy( 0, 0, 1));
+        mStencil[SixthDensePt< 1, 0, 1>::idx] = mCache.getValue(ijk.offsetBy( 1, 0, 1));
+        mStencil[SixthDensePt< 2, 0, 1>::idx] = mCache.getValue(ijk.offsetBy( 2, 0, 1));
+        mStencil[SixthDensePt< 3, 0, 1>::idx] = mCache.getValue(ijk.offsetBy( 3, 0, 1));
+
+        mStencil[SixthDensePt<-3, 0,-1>::idx] = mCache.getValue(ijk.offsetBy(-3, 0,-1));
+        mStencil[SixthDensePt<-2, 0,-1>::idx] = mCache.getValue(ijk.offsetBy(-2, 0,-1));
+        mStencil[SixthDensePt<-1, 0,-1>::idx] = mCache.getValue(ijk.offsetBy(-1, 0,-1));
+        mStencil[SixthDensePt< 0, 0,-1>::idx] = mCache.getValue(ijk.offsetBy( 0, 0,-1));
+        mStencil[SixthDensePt< 1, 0,-1>::idx] = mCache.getValue(ijk.offsetBy( 1, 0,-1));
+        mStencil[SixthDensePt< 2, 0,-1>::idx] = mCache.getValue(ijk.offsetBy( 2, 0,-1));
+        mStencil[SixthDensePt< 3, 0,-1>::idx] = mCache.getValue(ijk.offsetBy( 3, 0,-1));
+
+        mStencil[SixthDensePt<-3, 0,-2>::idx] = mCache.getValue(ijk.offsetBy(-3, 0,-2));
+        mStencil[SixthDensePt<-2, 0,-2>::idx] = mCache.getValue(ijk.offsetBy(-2, 0,-2));
+        mStencil[SixthDensePt<-1, 0,-2>::idx] = mCache.getValue(ijk.offsetBy(-1, 0,-2));
+        mStencil[SixthDensePt< 0, 0,-2>::idx] = mCache.getValue(ijk.offsetBy( 0, 0,-2));
+        mStencil[SixthDensePt< 1, 0,-2>::idx] = mCache.getValue(ijk.offsetBy( 1, 0,-2));
+        mStencil[SixthDensePt< 2, 0,-2>::idx] = mCache.getValue(ijk.offsetBy( 2, 0,-2));
+        mStencil[SixthDensePt< 3, 0,-2>::idx] = mCache.getValue(ijk.offsetBy( 3, 0,-2));
+
+        mStencil[SixthDensePt<-3, 0,-3>::idx] = mCache.getValue(ijk.offsetBy(-3, 0,-3));
+        mStencil[SixthDensePt<-2, 0,-3>::idx] = mCache.getValue(ijk.offsetBy(-2, 0,-3));
+        mStencil[SixthDensePt<-1, 0,-3>::idx] = mCache.getValue(ijk.offsetBy(-1, 0,-3));
+        mStencil[SixthDensePt< 0, 0,-3>::idx] = mCache.getValue(ijk.offsetBy( 0, 0,-3));
+        mStencil[SixthDensePt< 1, 0,-3>::idx] = mCache.getValue(ijk.offsetBy( 1, 0,-3));
+        mStencil[SixthDensePt< 2, 0,-3>::idx] = mCache.getValue(ijk.offsetBy( 2, 0,-3));
+        mStencil[SixthDensePt< 3, 0,-3>::idx] = mCache.getValue(ijk.offsetBy( 3, 0,-3));
+        // yz-plane (i = 0): only j != 0 and k != 0, since both axes were already loaded above
+        mStencil[SixthDensePt< 0,-3, 3>::idx] = mCache.getValue(ijk.offsetBy( 0,-3, 3));
+        mStencil[SixthDensePt< 0,-2, 3>::idx] = mCache.getValue(ijk.offsetBy( 0,-2, 3));
+        mStencil[SixthDensePt< 0,-1, 3>::idx] = mCache.getValue(ijk.offsetBy( 0,-1, 3));
+        mStencil[SixthDensePt< 0, 1, 3>::idx] = mCache.getValue(ijk.offsetBy( 0, 1, 3));
+        mStencil[SixthDensePt< 0, 2, 3>::idx] = mCache.getValue(ijk.offsetBy( 0, 2, 3));
+        mStencil[SixthDensePt< 0, 3, 3>::idx] = mCache.getValue(ijk.offsetBy( 0, 3, 3));
+
+        mStencil[SixthDensePt< 0,-3, 2>::idx] = mCache.getValue(ijk.offsetBy( 0,-3, 2));
+        mStencil[SixthDensePt< 0,-2, 2>::idx] = mCache.getValue(ijk.offsetBy( 0,-2, 2));
+        mStencil[SixthDensePt< 0,-1, 2>::idx] = mCache.getValue(ijk.offsetBy( 0,-1, 2));
+        mStencil[SixthDensePt< 0, 1, 2>::idx] = mCache.getValue(ijk.offsetBy( 0, 1, 2));
+        mStencil[SixthDensePt< 0, 2, 2>::idx] = mCache.getValue(ijk.offsetBy( 0, 2, 2));
+        mStencil[SixthDensePt< 0, 3, 2>::idx] = mCache.getValue(ijk.offsetBy( 0, 3, 2));
+
+        mStencil[SixthDensePt< 0,-3, 1>::idx] = mCache.getValue(ijk.offsetBy( 0,-3, 1));
+        mStencil[SixthDensePt< 0,-2, 1>::idx] = mCache.getValue(ijk.offsetBy( 0,-2, 1));
+        mStencil[SixthDensePt< 0,-1, 1>::idx] = mCache.getValue(ijk.offsetBy( 0,-1, 1));
+        mStencil[SixthDensePt< 0, 1, 1>::idx] = mCache.getValue(ijk.offsetBy( 0, 1, 1));
+        mStencil[SixthDensePt< 0, 2, 1>::idx] = mCache.getValue(ijk.offsetBy( 0, 2, 1));
+        mStencil[SixthDensePt< 0, 3, 1>::idx] = mCache.getValue(ijk.offsetBy( 0, 3, 1));
+
+        mStencil[SixthDensePt< 0,-3,-1>::idx] = mCache.getValue(ijk.offsetBy( 0,-3,-1));
+        mStencil[SixthDensePt< 0,-2,-1>::idx] = mCache.getValue(ijk.offsetBy( 0,-2,-1));
+        mStencil[SixthDensePt< 0,-1,-1>::idx] = mCache.getValue(ijk.offsetBy( 0,-1,-1));
+        mStencil[SixthDensePt< 0, 1,-1>::idx] = mCache.getValue(ijk.offsetBy( 0, 1,-1));
+        mStencil[SixthDensePt< 0, 2,-1>::idx] = mCache.getValue(ijk.offsetBy( 0, 2,-1));
+        mStencil[SixthDensePt< 0, 3,-1>::idx] = mCache.getValue(ijk.offsetBy( 0, 3,-1));
+
+        mStencil[SixthDensePt< 0,-3,-2>::idx] = mCache.getValue(ijk.offsetBy( 0,-3,-2));
+        mStencil[SixthDensePt< 0,-2,-2>::idx] = mCache.getValue(ijk.offsetBy( 0,-2,-2));
+        mStencil[SixthDensePt< 0,-1,-2>::idx] = mCache.getValue(ijk.offsetBy( 0,-1,-2));
+        mStencil[SixthDensePt< 0, 1,-2>::idx] = mCache.getValue(ijk.offsetBy( 0, 1,-2));
+        mStencil[SixthDensePt< 0, 2,-2>::idx] = mCache.getValue(ijk.offsetBy( 0, 2,-2));
+        mStencil[SixthDensePt< 0, 3,-2>::idx] = mCache.getValue(ijk.offsetBy( 0, 3,-2));
+
+        mStencil[SixthDensePt< 0,-3,-3>::idx] = mCache.getValue(ijk.offsetBy( 0,-3,-3));
+        mStencil[SixthDensePt< 0,-2,-3>::idx] = mCache.getValue(ijk.offsetBy( 0,-2,-3));
+        mStencil[SixthDensePt< 0,-1,-3>::idx] = mCache.getValue(ijk.offsetBy( 0,-1,-3));
+        mStencil[SixthDensePt< 0, 1,-3>::idx] = mCache.getValue(ijk.offsetBy( 0, 1,-3));
+        mStencil[SixthDensePt< 0, 2,-3>::idx] = mCache.getValue(ijk.offsetBy( 0, 2,-3));
+        mStencil[SixthDensePt< 0, 3,-3>::idx] = mCache.getValue(ijk.offsetBy( 0, 3,-3));
+    }
+
+    template<typename, typename, bool> friend class BaseStencil; // allow base class to call init()
+    using BaseType::mCache;
+    using BaseType::mStencil;
+};// SixthOrderDenseStencil class
+
+
+//////////////////////////////////////////////////////////////////////
+
+
+/// This is a simple 7-point nearest neighbor stencil that supports
+/// gradient by second-order central differencing, first-order upwinding,
+/// Laplacian, closest-point transform and zero-crossing test.
+///
+/// @note For optimal random access performance this class
+/// includes its own grid accessor.
+template<typename GridT, bool IsSafe = true>
+class GradStencil : public BaseStencil<GradStencil<GridT, IsSafe>, GridT, IsSafe>
+{
+    typedef GradStencil<GridT, IsSafe>         SelfT;
+    typedef BaseStencil<SelfT, GridT, IsSafe > BaseType;
+public:
+    typedef GridT                              GridType;
+    typedef typename GridT::TreeType           TreeType;
+    typedef typename GridType::ValueType       ValueType;
+    /// Buffer length: the center point plus its six face neighbors.
+    static const int SIZE = 7;
+    /// Construct from @a grid, taking the uniform spacing dx from component 0 of grid.voxelSize().
+    GradStencil(const GridType& grid)
+        : BaseType(grid, SIZE)
+        , mInv2Dx(ValueType(0.5 / grid.voxelSize()[0]))
+        , mInvDx2(ValueType(4.0 * mInv2Dx * mInv2Dx))
+    {
+    }
+    /// Construct from @a grid with an explicitly supplied grid spacing @a dx.
+    GradStencil(const GridType& grid, Real dx)
+        : BaseType(grid, SIZE)
+        , mInv2Dx(ValueType(0.5 / dx))
+        , mInvDx2(ValueType(4.0 * mInv2Dx * mInv2Dx))
+    {
+    }
+
+    /// @brief Return the norm square of the single-sided upwind gradient
+    /// (computed via Godunov's scheme) at the previously buffered location.
+    ///
+    /// @note This method should not be called until the stencil
+    /// buffer has been populated via a call to moveTo(ijk).
+    inline ValueType normSqGrad() const
+    {
+        return mInvDx2 * math::GodunovsNormSqrd(mStencil[0] > 0,
+                                                mStencil[0] - mStencil[1],
+                                                mStencil[2] - mStencil[0],
+                                                mStencil[0] - mStencil[3],
+                                                mStencil[4] - mStencil[0],
+                                                mStencil[0] - mStencil[5],
+                                                mStencil[6] - mStencil[0]);
+    }
+
+    /// @brief Return the gradient computed at the previously buffered
+    /// location by second order central differencing.
+    ///
+    /// @note This method should not be called until the stencil
+    /// buffer has been populated via a call to moveTo(ijk).
+    inline math::Vec3<ValueType> gradient() const
+    {
+        return math::Vec3<ValueType>(mStencil[2] - mStencil[1],
+                                     mStencil[4] - mStencil[3],
+                                     mStencil[6] - mStencil[5])*mInv2Dx;
+    }
+    /// @brief Return the first-order upwind gradient corresponding to the direction V.
+    ///
+    /// @note This method should not be called until the stencil
+    /// buffer has been populated via a call to moveTo(ijk).
+    inline math::Vec3<ValueType> gradient(const math::Vec3<ValueType>& V) const
+    {
+        return math::Vec3<ValueType>(
+               V[0]>0 ? mStencil[0] - mStencil[1] : mStencil[2] - mStencil[0],
+               V[1]>0 ? mStencil[0] - mStencil[3] : mStencil[4] - mStencil[0],
+               V[2]>0 ? mStencil[0] - mStencil[5] : mStencil[6] - mStencil[0])*2*mInv2Dx;
+    }
+
+    /// Return the Laplacian computed at the previously buffered
+    /// location by second-order central differencing.
+    inline ValueType laplacian() const
+    {
+        return mInvDx2 * (mStencil[1] + mStencil[2] +
+                          mStencil[3] + mStencil[4] +
+                          mStencil[5] + mStencil[6] - 6*mStencil[0]);
+    }
+
+    /// Return @c true if the sign of the value at the center point of the stencil
+    /// is different from the signs of any of its six nearest neighbors.
+    inline bool zeroCrossing() const
+    {
+        const typename BaseType::BufferType& v = mStencil;
+        return (v[0]>0 ? (v[1]<0 || v[2]<0 || v[3]<0 || v[4]<0 || v[5]<0 || v[6]<0)
+                       : (v[1]>0 || v[2]>0 || v[3]>0 || v[4]>0 || v[5]>0 || v[6]>0));
+    }
+
+    /// @brief Compute the closest-point transform to a level set.
+    /// @return the closest point in index space to the surface
+    /// from which the level set was derived.
+    ///
+    /// @note This method assumes that the grid represents a level set
+    /// with distances in world units and a simple affine transform
+    /// with uniform scaling.
+    inline math::Vec3<ValueType> cpt()
+    {
+        const Coord& ijk = BaseType::getCenterCoord();
+        const ValueType d = ValueType(mStencil[0] * 0.5 * mInvDx2); // distance in voxels / (2dx^2)
+        return math::Vec3<ValueType>(ijk[0] - d*(mStencil[2] - mStencil[1]),
+                                     ijk[1] - d*(mStencil[4] - mStencil[3]),
+                                     ijk[2] - d*(mStencil[6] - mStencil[5]));
+    }
+
+private:
+    /// Load the six face neighbors of @a ijk: slots 1,2 = -x,+x; 3,4 = -y,+y; 5,6 = -z,+z.
+    inline void init(const Coord& ijk)
+    {
+        mStencil[1] = mCache.getValue(ijk.offsetBy(-1,  0,  0));
+        mStencil[2] = mCache.getValue(ijk.offsetBy( 1,  0,  0));
+
+        mStencil[3] = mCache.getValue(ijk.offsetBy( 0, -1,  0));
+        mStencil[4] = mCache.getValue(ijk.offsetBy( 0,  1,  0));
+
+        mStencil[5] = mCache.getValue(ijk.offsetBy( 0,  0, -1));
+        mStencil[6] = mCache.getValue(ijk.offsetBy( 0,  0,  1));
+    }
+
+    template<typename, typename, bool> friend class BaseStencil; // allow base class to call init()
+    using BaseType::mCache;
+    using BaseType::mStencil;
+    const ValueType mInv2Dx, mInvDx2; // 1/(2*dx) and 1/dx^2, fixed at construction
+}; // GradStencil class
+
+////////////////////////////////////////
+
+
+/// @brief This is a special 19-point stencil that supports optimal fifth-order WENO
+/// upwinding, second-order central differencing, Laplacian, and zero-crossing test.
+///
+/// @note For optimal random access performance this class
+/// includes its own grid accessor.
+template<typename GridT, bool IsSafe = true>
+class WenoStencil: public BaseStencil<WenoStencil<GridT, IsSafe>, GridT, IsSafe>
+{
+    typedef WenoStencil<GridT, IsSafe>         SelfT;
+    typedef BaseStencil<SelfT, GridT, IsSafe > BaseType;
+public:
+    typedef GridT                              GridType;
+    typedef typename GridT::TreeType           TreeType;
+    typedef typename GridType::ValueType       ValueType;
+    /// Buffer length: the center point plus three neighbors on each side along x, y and z.
+    static const int SIZE = 19;
+    /// Construct from @a grid, taking the uniform spacing dx from component 0 of grid.voxelSize().
+    WenoStencil(const GridType& grid)
+        : BaseType(grid, SIZE)
+        , mDx2(ValueType(math::Pow2(grid.voxelSize()[0])))
+        , mInv2Dx(ValueType(0.5 / grid.voxelSize()[0]))
+        , mInvDx2(ValueType(1.0 / mDx2))
+    {
+    }
+    /// Construct from @a grid with an explicitly supplied grid spacing @a dx.
+    WenoStencil(const GridType& grid, Real dx)
+        : BaseType(grid, SIZE)
+        , mDx2(ValueType(dx * dx))
+        , mInv2Dx(ValueType(0.5 / dx))
+        , mInvDx2(ValueType(1.0 / mDx2))
+    {
+    }
+
+    /// @brief Return the norm-square of the WENO upwind gradient (computed via
+    /// WENO upwinding and Godunov's scheme) at the previously buffered location.
+    ///
+    /// @note This method should not be called until the stencil
+    /// buffer has been populated via a call to moveTo(ijk).
+    inline ValueType normSqGrad() const
+    {
+        const typename BaseType::BufferType& v = mStencil;
+#ifdef DWA_OPENVDB
+        // SSE optimized: each Float4 packs the x, y and z differences; the fourth lane is unused
+        const simd::Float4
+            v1(v[2]-v[1], v[ 8]-v[ 7], v[14]-v[13], 0),
+            v2(v[3]-v[2], v[ 9]-v[ 8], v[15]-v[14], 0),
+            v3(v[0]-v[3], v[ 0]-v[ 9], v[ 0]-v[15], 0),
+            v4(v[4]-v[0], v[10]-v[ 0], v[16]-v[ 0], 0),
+            v5(v[5]-v[4], v[11]-v[10], v[17]-v[16], 0),
+            v6(v[6]-v[5], v[12]-v[11], v[18]-v[17], 0),
+            dP_m = math::WENO5(v1, v2, v3, v4, v5, mDx2),
+            dP_p = math::WENO5(v6, v5, v4, v3, v2, mDx2);
+
+        return mInvDx2 * math::GodunovsNormSqrd(mStencil[0] > 0, dP_m, dP_p);
+#else
+        const Real
+            dP_xm = math::WENO5(v[ 2]-v[ 1],v[ 3]-v[ 2],v[ 0]-v[ 3],v[ 4]-v[ 0],v[ 5]-v[ 4],mDx2),
+            dP_xp = math::WENO5(v[ 6]-v[ 5],v[ 5]-v[ 4],v[ 4]-v[ 0],v[ 0]-v[ 3],v[ 3]-v[ 2],mDx2),
+            dP_ym = math::WENO5(v[ 8]-v[ 7],v[ 9]-v[ 8],v[ 0]-v[ 9],v[10]-v[ 0],v[11]-v[10],mDx2),
+            dP_yp = math::WENO5(v[12]-v[11],v[11]-v[10],v[10]-v[ 0],v[ 0]-v[ 9],v[ 9]-v[ 8],mDx2),
+            dP_zm = math::WENO5(v[14]-v[13],v[15]-v[14],v[ 0]-v[15],v[16]-v[ 0],v[17]-v[16],mDx2),
+            dP_zp = math::WENO5(v[18]-v[17],v[17]-v[16],v[16]-v[ 0],v[ 0]-v[15],v[15]-v[14],mDx2);
+        return static_cast<ValueType>(
+            mInvDx2*math::GodunovsNormSqrd(v[0]>0,dP_xm,dP_xp,dP_ym,dP_yp,dP_zm,dP_zp));
+#endif
+    }
+
+    /// Return the optimal fifth-order upwind gradient corresponding to the
+    /// direction V.
+    ///
+    /// @note This method should not be called until the stencil
+    /// buffer has been populated via a call to moveTo(ijk).
+    inline math::Vec3<ValueType> gradient(const math::Vec3<ValueType>& V) const
+    {
+        const typename BaseType::BufferType& v = mStencil;
+        return 2*mInv2Dx * math::Vec3<ValueType>(
+            V[0]>0 ? math::WENO5(v[ 2]-v[ 1],v[ 3]-v[ 2],v[ 0]-v[ 3], v[ 4]-v[ 0],v[ 5]-v[ 4],mDx2)
+                : math::WENO5(v[ 6]-v[ 5],v[ 5]-v[ 4],v[ 4]-v[ 0], v[ 0]-v[ 3],v[ 3]-v[ 2],mDx2),
+            V[1]>0 ? math::WENO5(v[ 8]-v[ 7],v[ 9]-v[ 8],v[ 0]-v[ 9], v[10]-v[ 0],v[11]-v[10],mDx2)
+                : math::WENO5(v[12]-v[11],v[11]-v[10],v[10]-v[ 0], v[ 0]-v[ 9],v[ 9]-v[ 8],mDx2),
+            V[2]>0 ? math::WENO5(v[14]-v[13],v[15]-v[14],v[ 0]-v[15], v[16]-v[ 0],v[17]-v[16],mDx2)
+                : math::WENO5(v[18]-v[17],v[17]-v[16],v[16]-v[ 0], v[ 0]-v[15],v[15]-v[14],mDx2));
+    }
+    /// Return the gradient computed at the previously buffered
+    /// location by second-order central differencing.
+    ///
+    /// @note This method should not be called until the stencil
+    /// buffer has been populated via a call to moveTo(ijk).
+    inline math::Vec3<ValueType> gradient() const
+    {
+        return mInv2Dx * math::Vec3<ValueType>(mStencil[ 4] - mStencil[ 3],
+                                               mStencil[10] - mStencil[ 9],
+                                               mStencil[16] - mStencil[15]);
+    }
+
+    /// Return the Laplacian computed at the previously buffered
+    /// location by second-order central differencing.
+    ///
+    /// @note This method should not be called until the stencil
+    /// buffer has been populated via a call to moveTo(ijk).
+    inline ValueType laplacian() const
+    {
+        return mInvDx2 * (
+            mStencil[ 3] + mStencil[ 4] +
+            mStencil[ 9] + mStencil[10] +
+            mStencil[15] + mStencil[16] - 6*mStencil[0]);
+    }
+
+    /// Return @c true if the sign of the value at the center point of the stencil
+    /// differs from the sign of any of its six nearest neighbors
+    inline bool zeroCrossing() const
+    {
+        const typename BaseType::BufferType& v = mStencil;
+        return (v[ 0]>0 ? (v[ 3]<0 || v[ 4]<0 || v[ 9]<0 || v[10]<0 || v[15]<0 || v[16]<0)
+                        : (v[ 3]>0 || v[ 4]>0 || v[ 9]>0 || v[10]>0 || v[15]>0 || v[16]>0));
+    }
+
+private:
+    inline void init(const Coord& ijk) // load the 18 off-center points around ijk through mCache
+    { // slots 1-6: x offsets -3,-2,-1,1,2,3; slots 7-12: same along y; slots 13-18: same along z
+        mStencil[ 1] = mCache.getValue(ijk.offsetBy(-3,  0,  0));
+        mStencil[ 2] = mCache.getValue(ijk.offsetBy(-2,  0,  0));
+        mStencil[ 3] = mCache.getValue(ijk.offsetBy(-1,  0,  0));
+        mStencil[ 4] = mCache.getValue(ijk.offsetBy( 1,  0,  0));
+        mStencil[ 5] = mCache.getValue(ijk.offsetBy( 2,  0,  0));
+        mStencil[ 6] = mCache.getValue(ijk.offsetBy( 3,  0,  0));
+
+        mStencil[ 7] = mCache.getValue(ijk.offsetBy( 0, -3,  0));
+        mStencil[ 8] = mCache.getValue(ijk.offsetBy( 0, -2,  0));
+        mStencil[ 9] = mCache.getValue(ijk.offsetBy( 0, -1,  0));
+        mStencil[10] = mCache.getValue(ijk.offsetBy( 0,  1,  0));
+        mStencil[11] = mCache.getValue(ijk.offsetBy( 0,  2,  0));
+        mStencil[12] = mCache.getValue(ijk.offsetBy( 0,  3,  0));
+
+        mStencil[13] = mCache.getValue(ijk.offsetBy( 0,  0, -3));
+        mStencil[14] = mCache.getValue(ijk.offsetBy( 0,  0, -2));
+        mStencil[15] = mCache.getValue(ijk.offsetBy( 0,  0, -1));
+        mStencil[16] = mCache.getValue(ijk.offsetBy( 0,  0,  1));
+        mStencil[17] = mCache.getValue(ijk.offsetBy( 0,  0,  2));
+        mStencil[18] = mCache.getValue(ijk.offsetBy( 0,  0,  3));
+    }
+
+    template<typename, typename, bool> friend class BaseStencil; // allow base class to call init()
+    using BaseType::mCache;
+    using BaseType::mStencil;
+    const ValueType mDx2, mInv2Dx, mInvDx2; // dx^2, 1/(2*dx) and 1/dx^2, fixed at construction
+}; // WenoStencil class
+
+
+//////////////////////////////////////////////////////////////////////
+
+
+template<typename GridT, bool IsSafe = true>
+class CurvatureStencil: public BaseStencil<CurvatureStencil<GridT, IsSafe>, GridT, IsSafe>
+{ // 19-point stencil supporting mean curvature, Laplacian and central-difference gradient
+    typedef CurvatureStencil<GridT, IsSafe>   SelfT;
+    typedef BaseStencil<SelfT, GridT, IsSafe> BaseType;
+public:
+    typedef GridT                             GridType;
+    typedef typename GridT::TreeType          TreeType;
+    typedef typename GridT::ValueType         ValueType;
+    /// Buffer length: the center point, six face neighbors and twelve edge neighbors.
+    static const int SIZE = 19;
+    /// Construct from @a grid, taking the uniform spacing dx from component 0 of grid.voxelSize().
+    CurvatureStencil(const GridType& grid)
+        : BaseType(grid, SIZE)
+        , mInv2Dx(ValueType(0.5 / grid.voxelSize()[0]))
+        , mInvDx2(ValueType(4.0 * mInv2Dx * mInv2Dx))
+    {
+    }
+    /// Construct from @a grid with an explicitly supplied grid spacing @a dx.
+    CurvatureStencil(const GridType& grid, Real dx)
+        : BaseType(grid, SIZE)
+        , mInv2Dx(ValueType(0.5 / dx))
+        , mInvDx2(ValueType(4.0 * mInv2Dx * mInv2Dx))
+    {
+    }
+
+    /// @brief Return the mean curvature at the previously buffered location.
+    ///
+    /// @note This method should not be called until the stencil
+    /// buffer has been populated via a call to moveTo(ijk).
+    inline ValueType meanCurvature() const
+    {
+        Real alpha, beta;
+        return this->meanCurvature(alpha, beta) ? ValueType(alpha*mInv2Dx/math::Pow3(beta)) : 0;
+    }
+
+    /// Return the mean curvature multiplied by the norm of the
+    /// central-difference gradient. This method is very useful for
+    /// mean-curvature flow of level sets!
+    ///
+    /// @note This method should not be called until the stencil
+    /// buffer has been populated via a call to moveTo(ijk).
+    inline ValueType meanCurvatureNormGrad() const
+    {
+        Real alpha, beta;
+        return this->meanCurvature(alpha, beta) ? ValueType(alpha*mInvDx2/(2*math::Pow2(beta))) : 0;
+    }
+
+    /// Return the Laplacian computed at the previously buffered
+    /// location by second-order central differencing.
+    ///
+    /// @note This method should not be called until the stencil
+    /// buffer has been populated via a call to moveTo(ijk).
+    inline ValueType laplacian() const
+    {
+        return mInvDx2 * (
+            mStencil[1] + mStencil[2] +
+            mStencil[3] + mStencil[4] +
+            mStencil[5] + mStencil[6] - 6*mStencil[0]);
+    }
+
+    /// Return the gradient computed at the previously buffered
+    /// location by second-order central differencing.
+    ///
+    /// @note This method should not be called until the stencil
+    /// buffer has been populated via a call to moveTo(ijk).
+    inline math::Vec3<ValueType> gradient() const
+    {
+        return math::Vec3<ValueType>(
+            mStencil[2] - mStencil[1],
+            mStencil[4] - mStencil[3],
+            mStencil[6] - mStencil[5])*mInv2Dx;
+    }
+
+private:
+    inline void init(const Coord &ijk) // load the 18 off-center points around ijk through mCache
+    { // slots 1-6: face neighbors; 7-10: xy-plane diagonals; 11-14: xz-plane; 15-18: yz-plane
+        mStencil[ 1] = mCache.getValue(ijk.offsetBy(-1,  0,  0));
+        mStencil[ 2] = mCache.getValue(ijk.offsetBy( 1,  0,  0));
+
+        mStencil[ 3] = mCache.getValue(ijk.offsetBy( 0, -1,  0));
+        mStencil[ 4] = mCache.getValue(ijk.offsetBy( 0,  1,  0));
+
+        mStencil[ 5] = mCache.getValue(ijk.offsetBy( 0,  0, -1));
+        mStencil[ 6] = mCache.getValue(ijk.offsetBy( 0,  0,  1));
+
+        mStencil[ 7] = mCache.getValue(ijk.offsetBy(-1, -1,  0));
+        mStencil[ 8] = mCache.getValue(ijk.offsetBy( 1, -1,  0));
+        mStencil[ 9] = mCache.getValue(ijk.offsetBy(-1,  1,  0));
+        mStencil[10] = mCache.getValue(ijk.offsetBy( 1,  1,  0));
+
+        mStencil[11] = mCache.getValue(ijk.offsetBy(-1,  0, -1));
+        mStencil[12] = mCache.getValue(ijk.offsetBy( 1,  0, -1));
+        mStencil[13] = mCache.getValue(ijk.offsetBy(-1,  0,  1));
+        mStencil[14] = mCache.getValue(ijk.offsetBy( 1,  0,  1));
+
+        mStencil[15] = mCache.getValue(ijk.offsetBy( 0, -1, -1));
+        mStencil[16] = mCache.getValue(ijk.offsetBy( 0,  1, -1));
+        mStencil[17] = mCache.getValue(ijk.offsetBy( 0, -1,  1));
+        mStencil[18] = mCache.getValue(ijk.offsetBy( 0,  1,  1));
+    }
+    /// Set @a alpha (curvature numerator) and @a beta (gradient norm); false if the gradient is ~0.
+    inline bool meanCurvature(Real& alpha, Real& beta) const
+    {
+        // For performance all finite differences are unscaled wrt dx
+        const Real
+            Half(0.5), Quarter(0.25),
+            Dx  = Half * (mStencil[2] - mStencil[1]), Dx2 = Dx * Dx, // * 1/dx
+            Dy  = Half * (mStencil[4] - mStencil[3]), Dy2 = Dy * Dy, // * 1/dx
+            Dz  = Half * (mStencil[6] - mStencil[5]), Dz2 = Dz * Dz, // * 1/dx
+            normGrad = Dx2 + Dy2 + Dz2;
+        if (normGrad <= math::Tolerance<Real>::value()) {
+            alpha = beta = 0;
+            return false;
+        }
+        const Real
+            Dxx = mStencil[2] - 2 * mStencil[0] + mStencil[1], // * 1/dx2
+            Dyy = mStencil[4] - 2 * mStencil[0] + mStencil[3], // * 1/dx2
+            Dzz = mStencil[6] - 2 * mStencil[0] + mStencil[5], // * 1/dx2
+            Dxy = Quarter * (mStencil[10] - mStencil[ 8] + mStencil[7] - mStencil[ 9]), // * 1/dx2
+            Dxz = Quarter * (mStencil[14] - mStencil[12] + mStencil[11] - mStencil[13]), // * 1/dx2
+            Dyz = Quarter * (mStencil[18] - mStencil[16] + mStencil[15] - mStencil[17]); // * 1/dx2
+        alpha = (Dx2*(Dyy+Dzz)+Dy2*(Dxx+Dzz)+Dz2*(Dxx+Dyy)-2*(Dx*(Dy*Dxy+Dz*Dxz)+Dy*Dz*Dyz));
+        beta  = std::sqrt(normGrad); // * 1/dx
+        return true;
+    }
+
+    template<typename, typename, bool> friend class BaseStencil; // allow base class to call init()
+    using BaseType::mCache;
+    using BaseType::mStencil;
+    const ValueType mInv2Dx, mInvDx2; // 1/(2*dx) and 1/dx^2, fixed at construction
+}; // CurvatureStencil class
+
+
+//////////////////////////////////////////////////////////////////////
+
+
+/// @brief Dense cubic stencil spanning @c halfWidth voxels on each side of the center
+template<typename GridT, bool IsSafe = true>
+class DenseStencil: public BaseStencil<DenseStencil<GridT, IsSafe>, GridT, IsSafe>
+{
+    typedef DenseStencil<GridT, IsSafe>       SelfT;
+    typedef BaseStencil<SelfT, GridT, IsSafe> BaseType;
+public:
+    typedef GridT                             GridType;
+    typedef typename GridT::TreeType          TreeType;
+    typedef typename GridType::ValueType      ValueType;
+    /// Construct a stencil of (2*halfWidth+1)^3 points over @a grid; @a halfWidth must be positive.
+    DenseStencil(const GridType& grid, int halfWidth)
+        : BaseType(grid, /*size=*/math::Pow3(2 * halfWidth + 1))
+        , mHalfWidth(halfWidth)
+    {
+        assert(halfWidth>0);
+    }
+    /// Return the middle buffer entry, which init() fills with the value at the center voxel.
+    inline const ValueType& getCenterValue() const { return mStencil[(mStencil.size()-1)>>1]; }
+
+    /// @brief Record @a ijk as the stencil center and fill the buffer with the
+    /// values of that voxel and all of its neighbors within the half-width.
+    inline void moveTo(const Coord& ijk)
+    {
+        BaseType::mCenter = ijk;
+        this->init(ijk);
+    }
+    /// @brief Record iter.getCoord() as the stencil center and fill the buffer
+    /// with the values of that voxel and all of its neighbors within the half-width.
+    template<typename IterType>
+    inline void moveTo(const IterType& iter)
+    {
+        BaseType::mCenter = iter.getCoord();
+        this->init(BaseType::mCenter);
+    }
+
+private:
+    /// Initialize the stencil buffer centered at (i, j, k).
+    /// @warning The center point is NOT at mStencil[0] for this DenseStencil!
+    inline void init(const Coord& ijk)
+    {
+        int n = 0; // running buffer index: the last coordinate varies fastest, the first slowest
+        for (Coord p=ijk.offsetBy(-mHalfWidth), q=ijk.offsetBy(mHalfWidth); p[0] <= q[0]; ++p[0]) {
+            for (p[1] = ijk[1]-mHalfWidth; p[1] <= q[1]; ++p[1]) {
+                for (p[2] = ijk[2]-mHalfWidth; p[2] <= q[2]; ++p[2]) {
+                    mStencil[n++] = mCache.getValue(p);
+                }
+            }
+        }
+    }
+
+    template<typename, typename, bool> friend class BaseStencil; // allow base class to call init()
+    using BaseType::mCache;
+    using BaseType::mStencil;
+    const int mHalfWidth; // number of voxels covered on each side of the center, per axis
+};// DenseStencil class
+
+
+} // end math namespace
+} // namespace OPENVDB_VERSION_NAME
+} // end openvdb namespace
+
+#endif // OPENVDB_MATH_STENCILS_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/math/Transform.h b/nuparu/include/openvdb_new/math/Transform.h
new file mode 100644
index 00000000..df37d0e4
--- /dev/null
+++ b/nuparu/include/openvdb_new/math/Transform.h
@@ -0,0 +1,310 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_MATH_TRANSFORM_HAS_BEEN_INCLUDED
+#define OPENVDB_MATH_TRANSFORM_HAS_BEEN_INCLUDED
+
+#include "Maps.h"
+#include <openvdb/Types.h>
+#include <iosfwd>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace math {
+
+// Forward declaration
+class Transform;
+
+
+// Utility methods
+
+/// @brief Calculate an axis-aligned bounding box in index space from an
+/// axis-aligned bounding box in world space.
+/// @see Transform::worldToIndex(const BBoxd&) const
+OPENVDB_API void
+calculateBounds(const Transform& t, const Vec3d& minWS, const Vec3d& maxWS,
+    Vec3d& minIS, Vec3d& maxIS);
+
+/// @brief Calculate an axis-aligned bounding box in index space from a
+/// bounding sphere in world space.
+/// @todo void calculateBounds(const Transform& t, const Vec3d& center, const Real radius,
+///     Vec3d& minIS, Vec3d& maxIS);
+
+
+////////////////////////////////////////
+
+
+/// @class Transform
+class OPENVDB_API Transform
+{
+public:
+    typedef boost::shared_ptr<Transform> Ptr;
+    typedef boost::shared_ptr<const Transform> ConstPtr;
+
+    Transform(): mMap(MapBase::Ptr(new ScaleMap())) {}
+    Transform(const MapBase::Ptr&);
+    Transform(const Transform&);
+    ~Transform() {}
+
+    Ptr copy() const { return Ptr(new Transform(mMap->copy())); }
+
+    //@{
+    /// @brief Create and return a shared pointer to a new transform.
+    static Transform::Ptr createLinearTransform(double voxelSize = 1.0);
+    static Transform::Ptr createLinearTransform(const Mat4R&);
+    static Transform::Ptr createFrustumTransform(const BBoxd&, double taper,
+        double depth, double voxelSize = 1.0);
+    //@}
+
+    /// Return @c true if the transformation map is exclusively linear/affine.
+    bool isLinear() const { return mMap->isLinear(); }
+
+    /// Return @c true if the transform is equivalent to an identity.
+    bool isIdentity() const ;
+    /// Return the transformation map's type-name
+    Name mapType() const { return mMap->type(); }
+
+
+    //@{
+    /// @brief Update the linear (affine) map by prepending or
+    /// postfixing the appropriate operation.  In the case of
+    /// a frustum, the pre-operations apply to the linear part
+    /// of the transform and not the entire transform, while the
+    /// post-operations are always applied last.
+    void preRotate(double radians, const Axis axis = X_AXIS);
+    void preTranslate(const Vec3d&);
+    void preScale(const Vec3d&);
+    void preScale(double);
+    void preShear(double shear, Axis axis0, Axis axis1);
+    void preMult(const Mat4d&);
+    void preMult(const Mat3d&);
+
+    void postRotate(double radians, const Axis axis = X_AXIS);
+    void postTranslate(const Vec3d&);
+    void postScale(const Vec3d&);
+    void postScale(double);
+    void postShear(double shear, Axis axis0, Axis axis1);
+    void postMult(const Mat4d&);
+    void postMult(const Mat3d&);
+    //@}
+
+    /// Return the size of a voxel using the linear component of the map.
+    Vec3d voxelSize() const { return mMap->voxelSize(); }
+    /// @brief Return the size of a voxel at position (x, y, z).
+    /// @note Maps that have a nonlinear component (e.g., perspective and frustum maps)
+    /// have position-dependent voxel sizes.
+    Vec3d voxelSize(const Vec3d& xyz) const { return mMap->voxelSize(xyz); }
+
+    /// Return the voxel volume of the linear component of the map.
+    double voxelVolume() const { return mMap->determinant(); }
+    /// Return the voxel volume at position (x, y, z).
+    double voxelVolume(const Vec3d& xyz) const { return mMap->determinant(xyz); }
+    /// Return true if the voxels in world space are uniformly sized cubes
+    bool hasUniformScale() const { return mMap->hasUniformScale(); }
+
+    //@{
+    /// @brief Apply this transformation to the given coordinates.
+    Vec3d indexToWorld(const Vec3d& xyz) const { return mMap->applyMap(xyz); }
+    Vec3d indexToWorld(const Coord& ijk) const { return mMap->applyMap(ijk.asVec3d()); }
+    Vec3d worldToIndex(const Vec3d& xyz) const { return mMap->applyInverseMap(xyz); }
+    Coord worldToIndexCellCentered(const Vec3d& xyz) const {return Coord::round(worldToIndex(xyz));}
+    Coord worldToIndexNodeCentered(const Vec3d& xyz) const {return Coord::floor(worldToIndex(xyz));}
+    //@}
+
+    //@{
+    /// @brief Apply this transformation to the given index-space bounding box.
+    /// @return an axis-aligned world-space bounding box
+    BBoxd indexToWorld(const CoordBBox&) const;
+    BBoxd indexToWorld(const BBoxd&) const;
+    //@}
+    //@{
+    /// @brief Apply the inverse of this transformation to the given world-space bounding box.
+    /// @return an axis-aligned index-space bounding box
+    BBoxd worldToIndex(const BBoxd&) const;
+    CoordBBox worldToIndexCellCentered(const BBoxd&) const;
+    CoordBBox worldToIndexNodeCentered(const BBoxd&) const;
+    //@}
+
+    //@{
+    /// Return a base pointer to the transformation map.
+    MapBase::ConstPtr baseMap() const { return mMap; }
+    MapBase::Ptr baseMap() { return mMap; }
+    //@}
+
+    //@{
+    /// @brief Return the result of downcasting the base map pointer to a
+    /// @c MapType pointer, or return a null pointer if the types are incompatible.
+    template<typename MapType> typename MapType::Ptr map();
+    template<typename MapType> typename MapType::ConstPtr map() const;
+    template<typename MapType> typename MapType::ConstPtr constMap() const;
+    //@}
+
+    /// Unserialize this transform from the given stream.
+    void read(std::istream&);
+    /// Serialize this transform to the given stream.
+    void write(std::ostream&) const;
+
+    /// @brief Print a description of this transform.
+    /// @param os      a stream to which to write textual information
+    /// @param indent  a string with which to prefix each line of text
+    void print(std::ostream& os = std::cout, const std::string& indent = "") const;
+
+    bool operator==(const Transform& other) const;
+    inline bool operator!=(const Transform& other) const { return !(*this == other); }
+
+private:
+    MapBase::Ptr mMap;
+}; // class Transform
+
+
+OPENVDB_API std::ostream& operator<<(std::ostream&, const Transform&);
+
+
+////////////////////////////////////////
+
+
+template<typename MapType>
+inline typename MapType::Ptr
+Transform::map()
+{
+    // Downcast only when the stored map's type name matches; otherwise yield a null pointer.
+    const bool sameType = (mMap->type() == MapType::mapType());
+    if (!sameType) return typename MapType::Ptr();
+    return boost::static_pointer_cast<MapType>(mMap);
+}
+
+
+template<typename MapType>
+inline typename MapType::ConstPtr
+Transform::map() const
+{
+    return boost::const_pointer_cast<const MapType>(
+        const_cast<Transform*>(this)->map<MapType>());
+}
+
+
+template<typename MapType>
+inline typename MapType::ConstPtr
+Transform::constMap() const
+{
+    return map<MapType>();
+}
+
+
+////////////////////////////////////////
+
+
+/// Helper function used internally by processTypedMap()
+template<typename ResolvedMapType, typename OpType>
+inline void
+doProcessTypedMap(Transform& transform, OpType& op)
+{
+    ResolvedMapType& resolvedMap = *transform.map<ResolvedMapType>();
+#ifdef _MSC_VER
+    op.operator()<ResolvedMapType>(resolvedMap);
+#else
+    op.template operator()<ResolvedMapType>(resolvedMap);
+#endif
+}
+
+/// Helper function used internally by processTypedMap()
+template<typename ResolvedMapType, typename OpType>
+inline void
+doProcessTypedMap(const Transform& transform, OpType& op)
+{
+    const ResolvedMapType& resolvedMap = *transform.map<ResolvedMapType>();
+#ifdef _MSC_VER
+    op.operator()<ResolvedMapType>(resolvedMap);
+#else
+    op.template operator()<ResolvedMapType>(resolvedMap);
+#endif
+}
+
+
+/// @brief Utility function that, given a transform holding a generic map pointer,
+/// calls a functor on the fully-resolved map
+///
+/// Usage:
+/// @code
+/// struct Foo {
+///     template<typename MapT>
+///     void operator()(const MapT&  map) const { blah }
+/// };
+/// Foo op; // an lvalue is needed: op is taken by non-const reference
+/// processTypedMap(myTransform, op);
+/// @endcode
+///
+/// @return @c false if the map type is unknown or unhandled.
+template<typename TransformType, typename OpType>
+bool
+processTypedMap(TransformType& transform, OpType& op)
+{
+    using namespace openvdb;
+
+    const Name mapType = transform.mapType();
+    if (mapType == UniformScaleMap::mapType()) {
+        doProcessTypedMap<UniformScaleMap, OpType>(transform, op);
+
+    } else if (mapType == UniformScaleTranslateMap::mapType()) {
+        doProcessTypedMap<UniformScaleTranslateMap, OpType>(transform, op);
+
+    } else if (mapType == ScaleMap::mapType()) {
+        doProcessTypedMap<ScaleMap, OpType>(transform, op);
+
+    } else if  (mapType == ScaleTranslateMap::mapType()) {
+        doProcessTypedMap<ScaleTranslateMap, OpType>(transform, op);
+
+    } else if (mapType == UnitaryMap::mapType()) {
+        doProcessTypedMap<UnitaryMap, OpType>(transform, op);
+
+    } else if (mapType == AffineMap::mapType()) {
+        doProcessTypedMap<AffineMap, OpType>(transform, op);
+
+    } else if (mapType == TranslationMap::mapType()) {
+        doProcessTypedMap<TranslationMap, OpType>(transform, op);
+
+    } else if (mapType == NonlinearFrustumMap::mapType()) {
+        doProcessTypedMap<NonlinearFrustumMap, OpType>(transform, op);
+    } else {
+        return false;
+    }
+    return true;
+}
+
+} // namespace math
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_MATH_TRANSFORM_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/math/Tuple.h b/nuparu/include/openvdb_new/math/Tuple.h
new file mode 100644
index 00000000..f46248a0
--- /dev/null
+++ b/nuparu/include/openvdb_new/math/Tuple.h
@@ -0,0 +1,212 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file Tuple.h
+/// @author Ben Kwa
+
+#ifndef OPENVDB_MATH_TUPLE_HAS_BEEN_INCLUDED
+#define OPENVDB_MATH_TUPLE_HAS_BEEN_INCLUDED
+
+#include <sstream>
+#include <boost/type_traits/is_integral.hpp>
+#include "Math.h"
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace math {
+
+/// @class Tuple "Tuple.h"
+/// A base class for homogeneous tuple types
+template<int SIZE, typename T>
+class Tuple {
+public:
+    typedef T value_type;
+    typedef T ValueType;
+
+    static const int size = SIZE;
+
+    /// Default ctor.  Does nothing.  Required because declaring a copy (or
+    /// other) constructor means the default constructor gets left out.
+    Tuple() {}
+
+    /// Copy constructor.  Used when the class signature matches exactly.
+    inline Tuple(Tuple const &src) {
+        for (int i = 0; i < SIZE; ++i) {
+            mm[i] = src.mm[i];
+        }
+    }
+
+    /// Conversion constructor.  Tuples with different value types and
+    /// different sizes can be interconverted using this member.  Converting
+    /// from a larger tuple results in truncation; converting from a smaller
+    /// tuple results in the extra data members being zeroed out.  This
+    /// function assumes that the integer 0 is convertible to the tuple's
+    /// value type.
+    template <int src_size, typename src_valtype>
+    explicit Tuple(Tuple<src_size, src_valtype> const &src) {
+        enum { COPY_END = (SIZE < src_size ? SIZE : src_size) };
+
+        for (int i = 0; i < COPY_END; ++i) {
+            mm[i] = src[i];
+        }
+        for (int i = COPY_END; i < SIZE; ++i) {
+            mm[i] = 0;
+        }
+    }
+
+    T operator[](int i) const {
+        // we'd prefer to use size_t, but can't because gcc3.2 doesn't like
+        // it - it conflicts with child class conversion operators to
+        // pointer types.
+//             assert(i >= 0 && i < SIZE);
+        return mm[i];
+    }
+
+    T& operator[](int i) {
+        // see above for size_t vs int
+//             assert(i >= 0 && i < SIZE);
+        return mm[i];
+    }
+
+    /// @name Compatibility
+    /// These are mostly for backwards compatibility with functions that take
+    /// old-style Vs (which are just arrays).
+    //@{
+    /// Copies this tuple into an array of a compatible type
+    template <typename S>
+    void toV(S *v) const {
+        for (int i = 0; i < SIZE; ++i) {
+            v[i] = mm[i];
+        }
+    }
+
+    /// Exposes the internal array.  Be careful when using this function.
+    value_type *asV() {
+        return mm;
+    }
+    /// Exposes the internal array.  Be careful when using this function.
+    value_type const *asV() const {
+        return mm;
+    }
+    //@}  Compatibility
+
+    /// @return a bracketed, comma-separated listing of the components, in index order
+    std::string
+    str() const {
+        std::ostringstream buffer;
+
+        buffer << "[";
+
+        // Signed counter: SIZE is an int, so this avoids a signed/unsigned comparison
+        for (int j = 0; j < SIZE; ++j) {
+            if (j) buffer << ", ";
+            buffer << mm[j];
+        }
+
+        buffer << "]";
+
+        return buffer.str();
+    }
+
+    void write(std::ostream& os) const {
+        os.write(reinterpret_cast<const char*>(&mm), sizeof(T)*SIZE);
+    }
+    void read(std::istream& is) {
+        is.read(reinterpret_cast<char*>(&mm), sizeof(T)*SIZE);
+    }
+
+protected:
+    T mm[SIZE];
+};
+
+
+////////////////////////////////////////
+
+
+/// @return true if t0 orders before t1 lexicographically (component 0 is most significant).
+template<int SIZE, typename T0, typename T1>
+bool
+operator<(const Tuple<SIZE, T0>& t0, const Tuple<SIZE, T1>& t1)
+{
+    int idx = 0;
+    // Skip the leading run of exactly-equal components, stopping at the last one.
+    while (idx < SIZE-1 && isExactlyEqual(t0[idx], t1[idx])) ++idx;
+    return t0[idx] < t1[idx];
+}
+
+
+/// @return true if t0 orders after t1 lexicographically (component 0 is most significant).
+template<int SIZE, typename T0, typename T1>
+bool
+operator>(const Tuple<SIZE, T0>& t0, const Tuple<SIZE, T1>& t1)
+{
+    int idx = 0;
+    // Advance past the leading exactly-equal components, stopping at the last one.
+    while (idx < SIZE-1 && isExactlyEqual(t0[idx], t1[idx])) ++idx;
+    return t0[idx] > t1[idx];
+}
+
+
+////////////////////////////////////////
+
+
+/// @return the absolute value of the given Tuple.
+template<int SIZE, typename T>
+Tuple<SIZE, T>
+Abs(const Tuple<SIZE, T>& t)
+{
+    Tuple<SIZE, T> result;
+    for (int i = 0; i < SIZE; ++i) result[i] = math::Abs(t[i]);
+    return result;
+}
+
+
+////////////////////////////////////////
+
+
+/// Stream the text produced by Tuple::str() and return the stream for chaining
+template <int SIZE, typename T>
+std::ostream& operator<<(std::ostream& ostr, const Tuple<SIZE, T>& classname)
+{
+    const std::string text = classname.str();
+    return ostr << text;
+}
+
+} // namespace math
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_MATH_TUPLE_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/math/Vec2.h b/nuparu/include/openvdb_new/math/Vec2.h
new file mode 100644
index 00000000..71a1d726
--- /dev/null
+++ b/nuparu/include/openvdb_new/math/Vec2.h
@@ -0,0 +1,553 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_MATH_VEC2_HAS_BEEN_INCLUDED
+#define OPENVDB_MATH_VEC2_HAS_BEEN_INCLUDED
+
+#include <cmath>
+#include <openvdb/Exceptions.h>
+#include "Math.h"
+#include "Tuple.h"
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace math {
+
+template<typename T> class Mat2;
+
+template<typename T>
+class Vec2: public Tuple<2, T>
+{
+public:
+    typedef T value_type;
+    typedef T ValueType;
+
+    /// Trivial constructor, the vector is NOT initialized
+    Vec2() {}
+
+    /// Constructor with one argument, e.g.   Vec2f v(0);
+    explicit Vec2(T val) { this->mm[0] = this->mm[1] = val; }
+
+    /// Constructor with two arguments, e.g.   Vec2f v(1,2);
+    Vec2(T x, T y)
+    {
+        this->mm[0] = x;
+        this->mm[1] = y;
+    }
+
+    /// Constructor with array argument, e.g.   float a[2]; Vec2f v(a);
+    template <typename Source>
+    Vec2(Source *a)
+    {
+        this->mm[0] = a[0];
+        this->mm[1] = a[1];
+    } // trivial
+
+    /// Conversion constructor
+    template<typename Source>
+    explicit Vec2(const Tuple<2, Source> &t)
+    {
+        this->mm[0] = static_cast<T>(t[0]);
+        this->mm[1] = static_cast<T>(t[1]);
+    }
+
+    /// Reference to the component, e.g.   v.x() = 4.5f;
+    T& x() {return this->mm[0];}
+    T& y() {return this->mm[1];}
+
+    /// Get the component, e.g.   float f = v.y();
+    T x() const {return this->mm[0];}
+    T y() const {return this->mm[1];}
+
+    /// Alternative indexed reference to the elements
+    T& operator()(int i) {return this->mm[i];}
+
+    /// Alternative indexed constant reference to the elements,
+    T operator()(int i) const {return this->mm[i];}
+
+    T* asPointer() {return this->mm;}
+    const T* asPointer() const {return this->mm;}
+
+    /// "this" vector gets initialized to [x, y],
+    /// calling v.init(); has same effect as calling v = Vec2::zero();
+    const Vec2<T>& init(T x=0, T y=0)
+    {
+        this->mm[0] = x; this->mm[1] = y;
+        return *this;
+    }
+
+    /// Set "this" vector to zero
+    const Vec2<T>& setZero()
+    {
+        this->mm[0] = 0; this->mm[1] = 0;
+        return *this;
+    }
+
+    /// Assignment operator
+    template<typename Source>
+    const Vec2<T>& operator=(const Vec2<Source> &v)
+    {
+        // note: don't static_cast because that suppresses warnings
+        this->mm[0] = v[0];
+        this->mm[1] = v[1];
+
+        return *this;
+    }
+
+    /// Equality operator, does exact floating point comparisons
+    bool operator==(const Vec2<T> &v) const
+    {
+        return (isExactlyEqual(this->mm[0], v.mm[0]) && isExactlyEqual(this->mm[1], v.mm[1]));
+    }
+
+    /// Inequality operator, does exact floating point comparisons
+    bool operator!=(const Vec2<T> &v) const { return !(*this==v); }
+
+    /// Test if "this" vector is equivalent to vector v with tolerance of eps
+    bool eq(const Vec2<T> &v, T eps = static_cast<T>(1.0e-7)) const
+    {
+        return isApproxEqual(this->mm[0], v.mm[0], eps) &&
+               isApproxEqual(this->mm[1], v.mm[1], eps);
+    } // trivial
+
+    /// Negation operator, for e.g.   v1 = -v2;
+    Vec2<T> operator-() const {return Vec2<T>(-this->mm[0], -this->mm[1]);}
+
+    /// this = v1 + v2
+    /// "this", v1 and v2 need not be distinct objects, e.g. v.add(v1,v);
+    template <typename T0, typename T1>
+    const Vec2<T>& add(const Vec2<T0> &v1, const Vec2<T1> &v2)
+    {
+        this->mm[0] = v1[0] + v2[0];
+        this->mm[1] = v1[1] + v2[1];
+
+        return *this;
+    }
+
+    /// this = v1 - v2
+    /// "this", v1 and v2 need not be distinct objects, e.g. v.sub(v1,v);
+    template <typename T0, typename T1>
+    const Vec2<T>& sub(const Vec2<T0> &v1, const Vec2<T1> &v2)
+    {
+        this->mm[0] = v1[0] - v2[0];
+        this->mm[1] = v1[1] - v2[1];
+
+        return *this;
+    }
+
+    /// this =  scalar*v, v need not be a distinct object from "this",
+    /// e.g. v.scale(1.5,v1);
+    template <typename T0, typename T1>
+    const Vec2<T>& scale(T0 scalar, const Vec2<T1> &v)
+    {
+        this->mm[0] = scalar * v[0];
+        this->mm[1] = scalar * v[1];
+
+        return *this;
+    }
+
+    template <typename T0, typename T1>
+    const Vec2<T> &div(T0 scalar, const Vec2<T1> &v)
+    {
+        this->mm[0] = v[0] / scalar;
+        this->mm[1] = v[1] / scalar;
+
+        return *this;
+    }
+
+    /// Dot product
+    T dot(const Vec2<T> &v) const { return this->mm[0]*v[0] + this->mm[1]*v[1]; } // trivial
+
+    /// Length of the vector
+    T length() const
+    {
+        return static_cast<T>(sqrt(double(this->mm[0]*this->mm[0] + this->mm[1]*this->mm[1])));
+    }
+
+    /// Squared length of the vector, much faster than length() as it
+    /// does not involve square root
+    T lengthSqr() const { return (this->mm[0]*this->mm[0] + this->mm[1]*this->mm[1]); }
+
+    /// Return a reference to itself after the exponent has been
+    /// applied to all the vector components.
+    inline const Vec2<T>& exp()
+    {
+        this->mm[0] = std::exp(this->mm[0]);
+        this->mm[1] = std::exp(this->mm[1]);
+        return *this;
+    }
+
+    /// Return the sum of all the vector components.
+    inline T sum() const
+    {
+        return this->mm[0] + this->mm[1];
+    }
+
+    /// this = normalized this
+    bool normalize(T eps=1.0e-8)
+    {
+        T d = length();
+        if (isApproxEqual(d, T(0), eps)) {
+            return false;
+        }
+        *this *= (T(1) / d);
+        return true;
+    }
+
+    /// return normalized this, throws if null vector
+    Vec2<T> unit(T eps=0) const
+    {
+        T d;
+        return unit(eps, d);
+    }
+
+    /// return normalized this and length, throws if null vector
+    Vec2<T> unit(T eps, T& len) const
+    {
+        len = length();
+        if (isApproxEqual(len, T(0), eps)) {
+            OPENVDB_THROW(ArithmeticError, "Normalizing null 2-vector");
+        }
+        return *this / len;
+    }
+
+    /// Returns v, where \f$v_i *= scalar\f$ for \f$i \in [0, 1]\f$
+    template <typename S>
+    const Vec2<T> &operator*=(S scalar)
+    {
+        this->mm[0] *= scalar;
+        this->mm[1] *= scalar;
+        return *this;
+    }
+
+    /// Returns v0, where \f$v0_i *= v1_i\f$ for \f$i \in [0, 1]\f$
+    template <typename S>
+    const Vec2<T> &operator*=(const Vec2<S> &v1)
+    {
+        this->mm[0] *= v1[0];
+        this->mm[1] *= v1[1];
+        return *this;
+    }
+
+    /// Returns v, where \f$v_i /= scalar\f$ for \f$i \in [0, 1]\f$
+    template <typename S>
+    const Vec2<T> &operator/=(S scalar)
+    {
+        this->mm[0] /= scalar;
+        this->mm[1] /= scalar;
+        return *this;
+    }
+
+    /// Returns v0, where \f$v0_i /= v1_i\f$ for \f$i \in [0, 1]\f$
+    template <typename S>
+    const Vec2<T> &operator/=(const Vec2<S> &v1)
+    {
+        this->mm[0] /= v1[0];
+        this->mm[1] /= v1[1];
+        return *this;
+    }
+
+    /// Returns v, where \f$v_i += scalar\f$ for \f$i \in [0, 1]\f$
+    template <typename S>
+    const Vec2<T> &operator+=(S scalar)
+    {
+        this->mm[0] += scalar;
+        this->mm[1] += scalar;
+        return *this;
+    }
+
+    /// Returns v0, where \f$v0_i += v1_i\f$ for \f$i \in [0, 1]\f$
+    template <typename S>
+    const Vec2<T> &operator+=(const Vec2<S> &v1)
+    {
+        this->mm[0] += v1[0];
+        this->mm[1] += v1[1];
+        return *this;
+    }
+
+    /// Returns v, where \f$v_i -= scalar\f$ for \f$i \in [0, 1]\f$
+    template <typename S>
+    const Vec2<T> &operator-=(S scalar)
+    {
+        this->mm[0] -= scalar;
+        this->mm[1] -= scalar;
+        return *this;
+    }
+
+    /// Returns v0, where \f$v0_i -= v1_i\f$ for \f$i \in [0, 1]\f$
+    template <typename S>
+    const Vec2<T> &operator-=(const Vec2<S> &v1)
+    {
+        this->mm[0] -= v1[0];
+        this->mm[1] -= v1[1];
+        return *this;
+    }
+
+    // Number of cols, rows, elements
+    static unsigned numRows() { return 1; }
+    static unsigned numColumns() { return 2; }
+    static unsigned numElements() { return 2; }
+
+    /// Returns the scalar component of this vector in the direction of onto (onto need not be unit),
+    /// or 0 when isApproxEqual(onto.length(), 0, eps) holds. e.g.   float c = v1.component(v2);
+    T component(const Vec2<T> &onto, T eps=1.0e-8) const
+    {
+        T l = onto.length();
+        if (isApproxEqual(l,  T(0), eps)) return 0;
+
+        return dot(onto)*(T(1)/l);
+    }
+
+    /// Return the projection of this vector onto the vector onto (onto need not be unit), or the zero
+    /// vector when isApproxEqual(onto.lengthSqr(), 0, eps) holds. e.g.   Vec2f p = v.projection(n);
+    Vec2<T> projection(const Vec2<T> &onto, T eps=1.0e-8) const
+    {
+        T l = onto.lengthSqr();
+        if (isApproxEqual(l, T(0), eps)) return Vec2::zero();
+
+        return onto*(dot(onto)*(T(1)/l));
+    }
+
+    /// Return a vector perpendicular to this one, namely (-y, x).
+    /// The result is a unit vector only if this vector is itself a unit vector.
+    /// e.g.   v.normalize(); Vec2f n = v.getArbPerpendicular();
+    Vec2<T> getArbPerpendicular() const { return Vec2<T>(-this->mm[1], this->mm[0]); }
+
+    /// True if a Nan is present in vector
+    bool isNan() const { return isnan(this->mm[0]) || isnan(this->mm[1]); }
+
+    /// True if an Inf is present in vector
+    bool isInfinite() const { return isinf(this->mm[0]) || isinf(this->mm[1]); }
+
+    /// True if no Nan or Inf values are present in vector (uses math::isFinite, not the non-standard finite())
+    bool isFinite() const { return math::isFinite(this->mm[0]) && math::isFinite(this->mm[1]); }
+
+    /// Predefined constants, e.g.   Vec2f v = Vec2f::zero();
+    static Vec2<T> zero() { return Vec2<T>(0, 0); }
+};
+
+
+/// Returns V, where \f$V_i = v_i * scalar\f$ for \f$i \in [0, 1]\f$
+template <typename S, typename T>
+inline Vec2<typename promote<S, T>::type> operator*(S scalar, const Vec2<T> &v)
+{
+    return v * scalar;
+}
+
+/// Returns V, where \f$V_i = v_i * scalar\f$ for \f$i \in [0, 1]\f$
+template <typename S, typename T>
+inline Vec2<typename promote<S, T>::type> operator*(const Vec2<T> &v, S scalar)
+{
+    Vec2<typename promote<S, T>::type> result(v);
+    result *= scalar;
+    return result;
+}
+
+/// Component-wise product: returns V with \f$V_i = v0_i * v1_i\f$ for \f$i \in [0, 1]\f$
+template <typename T0, typename T1>
+inline Vec2<typename promote<T0, T1>::type> operator*(const Vec2<T0> &v0, const Vec2<T1> &v1)
+{
+    typedef typename promote<T0, T1>::type ProductT;
+    return Vec2<ProductT>(v0[0] * v1[0], v0[1] * v1[1]);
+}
+
+/// Returns V, where \f$V_i = scalar / v_i\f$ for \f$i \in [0, 1]\f$
+template <typename S, typename T>
+inline Vec2<typename promote<S, T>::type> operator/(S scalar, const Vec2<T> &v)
+{
+    return Vec2<typename promote<S, T>::type>(scalar/v[0], scalar/v[1]);
+}
+
+/// Returns V, where \f$V_i = v_i / scalar\f$ for \f$i \in [0, 1]\f$
+template <typename S, typename T>
+inline Vec2<typename promote<S, T>::type> operator/(const Vec2<T> &v, S scalar)
+{
+    Vec2<typename promote<S, T>::type> result(v);
+    result /= scalar;
+    return result;
+}
+
+/// Component-wise quotient: returns V with \f$V_i = v0_i / v1_i\f$ for \f$i \in [0, 1]\f$
+template <typename T0, typename T1>
+inline Vec2<typename promote<T0, T1>::type> operator/(const Vec2<T0> &v0, const Vec2<T1> &v1)
+{
+    typedef typename promote<T0, T1>::type QuotientT;
+    return Vec2<QuotientT>(v0[0] / v1[0], v0[1] / v1[1]);
+}
+
+/// Returns V, where \f$V_i = v0_i + v1_i\f$ for \f$i \in [0, 1]\f$
+template <typename T0, typename T1>
+inline Vec2<typename promote<T0, T1>::type> operator+(const Vec2<T0> &v0, const Vec2<T1> &v1)
+{
+    Vec2<typename promote<T0, T1>::type> result(v0);
+    result += v1;
+    return result;
+}
+
+/// Returns V, where \f$V_i = v_i + scalar\f$ for \f$i \in [0, 1]\f$
+template <typename S, typename T>
+inline Vec2<typename promote<S, T>::type> operator+(const Vec2<T> &v, S scalar)
+{
+    Vec2<typename promote<S, T>::type> result(v);
+    result += scalar;
+    return result;
+}
+
+/// Returns V, where \f$V_i = v0_i - v1_i\f$ for \f$i \in [0, 1]\f$
+template <typename T0, typename T1>
+inline Vec2<typename promote<T0, T1>::type> operator-(const Vec2<T0> &v0, const Vec2<T1> &v1)
+{
+    Vec2<typename promote<T0, T1>::type> result(v0);
+    result -= v1;
+    return result;
+}
+
+/// Returns V, where \f$V_i = v_i - scalar\f$ for \f$i \in [0, 1]\f$
+template <typename S, typename T>
+inline Vec2<typename promote<S, T>::type> operator-(const Vec2<T> &v, S scalar)
+{
+    Vec2<typename promote<S, T>::type> result(v);
+    result -= scalar;
+    return result;
+}
+
+/// Angle between two vectors, in [0, pi]; computed as acos(v1.dot(v2)) with no normalization,
+/// so the inputs are presumably meant to be unit vectors. e.g.   float a = angle(v1, v2);
+template <typename T>
+inline T angle(const Vec2<T> &v1, const Vec2<T> &v2)
+{
+    T c = v1.dot(v2);
+    return acos(c);
+}
+
+template <typename T>
+inline bool
+isApproxEqual(const Vec2<T>& a, const Vec2<T>& b)
+{
+    return a.eq(b);
+}
+template <typename T>
+inline bool
+isApproxEqual(const Vec2<T>& a, const Vec2<T>& b, const Vec2<T>& eps)
+{
+    return isApproxEqual(a.x(), b.x(), eps.x()) &&
+           isApproxEqual(a.y(), b.y(), eps.y());
+}
+
+template<typename T>
+inline bool
+isFinite(const Vec2<T>& v)
+{
+    return isFinite(v[0]) && isFinite(v[1]);
+}
+
+/// Return @c true if all components are exactly equal to zero.
+template<typename T>
+inline bool
+isZero(const Vec2<T>& v)
+{
+    return isZero(v[0]) && isZero(v[1]);
+}
+
+template<typename T>
+inline Vec2<T>
+Abs(const Vec2<T>& v)
+{
+    return Vec2<T>(Abs(v[0]), Abs(v[1]));
+}
+
+/// Orthonormalize vectors v1 and v2 and store back the resulting basis
+/// e.g.   orthonormalize(v1, v2);
+template <typename T>
+inline void orthonormalize(Vec2<T> &v1, Vec2<T> &v2)
+{
+    // Given the input vectors v1 and v2, the Gram-Schmidt
+    // orthonormalization produces vectors u1 and u2 as follows,
+    //
+    //   u1 = v1/|v1|
+    //   u2 = (v2-(u1*v2)u1)/|v2-(u1*v2)u1|
+    //
+    // where |A| indicates length of vector A and A*B indicates dot
+    // product of vectors A and B.
+
+    // compute u1 (stored back into v1)
+    v1.normalize();
+
+    // compute u2 (stored back into v2)
+    T d0 = v1.dot(v2);
+    v2 -= v1*d0;
+    v2.normalize();
+}
+
+
+/// \remark We are switching to a more explicit name because the semantics
+/// are different from std::min/max. In that case, the function returns a
+/// reference to one of the objects based on a comparator. Here, we must
+/// fabricate a new object which might not match either of the inputs.
+
+/// Return a new vector holding the smaller of each pair of corresponding components.
+template <typename T>
+inline Vec2<T> minComponent(const Vec2<T> &v1, const Vec2<T> &v2)
+{
+    const T lowX = std::min(v1.x(), v2.x());
+    const T lowY = std::min(v1.y(), v2.y());
+    return Vec2<T>(lowX, lowY);
+}
+
+/// Return a new vector holding the larger of each pair of corresponding components.
+template <typename T>
+inline Vec2<T> maxComponent(const Vec2<T> &v1, const Vec2<T> &v2)
+{
+    const T highX = std::max(v1.x(), v2.x());
+    const T highY = std::max(v1.y(), v2.y());
+    return Vec2<T>(highX, highY);
+}
+
+/// @brief Return a vector with the exponent applied to each of
+/// the components of the input vector.
+template <typename T>
+inline Vec2<T> Exp(Vec2<T> v) { return v.exp(); }
+
+typedef Vec2<int32_t>   Vec2i;
+typedef Vec2<uint32_t>  Vec2ui;
+typedef Vec2<float>     Vec2s;
+typedef Vec2<double>    Vec2d;
+
+} // namespace math
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_MATH_VEC2_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/math/Vec3.h b/nuparu/include/openvdb_new/math/Vec3.h
new file mode 100644
index 00000000..9b4a8a85
--- /dev/null
+++ b/nuparu/include/openvdb_new/math/Vec3.h
@@ -0,0 +1,661 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_MATH_VEC3_HAS_BEEN_INCLUDED
+#define OPENVDB_MATH_VEC3_HAS_BEEN_INCLUDED
+
+#include <cmath>
+#include <openvdb/Exceptions.h>
+#include "Math.h"
+#include "Tuple.h"
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace math {
+
+template<typename T> class Mat3;
+
+/// @brief Three-component vector of T; the components live in the mm array inherited from Tuple<3, T>.
+template<typename T> class Vec3: public Tuple<3, T>
+{
+public:
+    typedef T value_type;
+    typedef T ValueType;
+
+    /// Trivial constructor, the vector is NOT initialized
+    Vec3() {}
+
+    /// Constructor with one argument (copied into all three components), e.g.   Vec3s v(0);
+    explicit Vec3(T val) { this->mm[0] = this->mm[1] = this->mm[2] = val; }
+
+    /// Constructor with three arguments, e.g.   Vec3d v(1,2,3);
+    Vec3(T x, T y, T z)
+    {
+        this->mm[0] = x;
+        this->mm[1] = y;
+        this->mm[2] = z;
+    }
+
+    /// Constructor with array argument (reads a[0], a[1], a[2]), e.g.   double a[3]; Vec3d v(a);
+    template <typename Source>
+    Vec3(Source *a)
+    {
+        this->mm[0] = a[0];
+        this->mm[1] = a[1];
+        this->mm[2] = a[2];
+    }
+
+    /// @brief Construct a Vec3 from a 3-Tuple with a possibly different value type.
+    /// @details Type conversion warnings are suppressed.
+    template<typename Source>
+    explicit Vec3(const Tuple<3, Source> &v)
+    {
+        this->mm[0] = static_cast<T>(v[0]);
+        this->mm[1] = static_cast<T>(v[1]);
+        this->mm[2] = static_cast<T>(v[2]);
+    }
+
+    /// @brief Construct a Vec3 from another Vec3 with a possibly different value type.
+    /// @details Type conversion warnings are suppressed.
+    template<typename Other>
+    Vec3(const Vec3<Other>& v)
+    {
+        this->mm[0] = static_cast<T>(v[0]);
+        this->mm[1] = static_cast<T>(v[1]);
+        this->mm[2] = static_cast<T>(v[2]);
+    }
+
+    /// Reference to the component, e.g.   v.x() = 4.5f;
+    T& x() { return this->mm[0]; }
+    T& y() { return this->mm[1]; }
+    T& z() { return this->mm[2]; }
+
+    /// Get the component by value, e.g.   float f = v.y();
+    T x() const { return this->mm[0]; }
+    T y() const { return this->mm[1]; }
+    T z() const { return this->mm[2]; }
+
+    T* asPointer() { return this->mm; }             // pointer to the first component
+    const T* asPointer() const { return this->mm; } // read-only pointer to the first component
+
+    /// Alternative indexed reference to the elements (the index is not range-checked)
+    T& operator()(int i) { return this->mm[i]; }
+
+    /// Alternative indexed access to the elements, returned by value (not range-checked)
+    T operator()(int i) const { return this->mm[i]; }
+
+    /// "this" vector gets initialized to [x, y, z],
+    /// calling v.init(); has same effect as calling v = Vec3::zero();
+    const Vec3<T>& init(T x=0, T y=0, T z=0)
+    {
+        this->mm[0] = x; this->mm[1] = y; this->mm[2] = z;
+        return *this;
+    }
+
+
+    /// Set "this" vector to zero
+    const Vec3<T>& setZero()
+    {
+        this->mm[0] = 0; this->mm[1] = 0; this->mm[2] = 0;
+        return *this;
+    }
+
+    /// @brief Assignment operator
+    /// @details Type conversion warnings are not suppressed.
+    template<typename Source>
+    const Vec3<T>& operator=(const Vec3<Source> &v)
+    {
+        // note: don't static_cast because that suppresses warnings
+        this->mm[0] = v[0];
+        this->mm[1] = v[1];
+        this->mm[2] = v[2];
+
+        return *this;
+    }
+
+    /// Test if "this" vector is equivalent to vector v; eps is passed as both tolerances of isRelOrApproxEqual
+    bool eq(const Vec3<T> &v, T eps = static_cast<T>(1.0e-7)) const
+    {
+        return isRelOrApproxEqual(this->mm[0], v.mm[0], eps, eps) &&
+               isRelOrApproxEqual(this->mm[1], v.mm[1], eps, eps) &&
+               isRelOrApproxEqual(this->mm[2], v.mm[2], eps, eps);
+    }
+
+
+    /// Negation operator, e.g.   v1 = -v2;
+    Vec3<T> operator-() const { return Vec3<T>(-this->mm[0], -this->mm[1], -this->mm[2]); }
+
+    /// this = v1 + v2
+    /// "this", v1 and v2 need not be distinct objects, e.g. v.add(v1,v);
+    template <typename T0, typename T1>
+    const Vec3<T>& add(const Vec3<T0> &v1, const Vec3<T1> &v2)
+    {
+        this->mm[0] = v1[0] + v2[0];
+        this->mm[1] = v1[1] + v2[1];
+        this->mm[2] = v1[2] + v2[2];
+
+        return *this;
+    }
+
+    /// this = v1 - v2
+    /// "this", v1 and v2 need not be distinct objects, e.g. v.sub(v1,v);
+    template <typename T0, typename T1>
+    const Vec3<T>& sub(const Vec3<T0> &v1, const Vec3<T1> &v2)
+    {
+        this->mm[0] = v1[0] - v2[0];
+        this->mm[1] = v1[1] - v2[1];
+        this->mm[2] = v1[2] - v2[2];
+
+        return *this;
+    }
+
+    /// this =  scalar*v, v need not be a distinct object from "this",
+    /// e.g. v.scale(1.5,v1);
+    template <typename T0, typename T1>
+    const Vec3<T>& scale(T0 scale, const Vec3<T1> &v)
+    {
+        this->mm[0] = scale * v[0];
+        this->mm[1] = scale * v[1];
+        this->mm[2] = scale * v[2];
+
+        return *this;
+    }
+
+    /// this = v/scale, v need not be a distinct object from "this", e.g. v.div(2.0,v1);
+    template <typename T0, typename T1> const Vec3<T> &div(T0 scale, const Vec3<T1> &v)
+    {
+        this->mm[0] = v[0] / scale;
+        this->mm[1] = v[1] / scale;
+        this->mm[2] = v[2] / scale;
+
+        return *this;
+    }
+
+    /// Dot product
+    T dot(const Vec3<T> &v) const
+    {
+        return
+            this->mm[0]*v.mm[0] +
+            this->mm[1]*v.mm[1] +
+            this->mm[2]*v.mm[2];
+    }
+
+    /// Length of the vector (the square root is taken in double, then cast back to T)
+    T length() const
+    {
+        return static_cast<T>(sqrt(double(
+            this->mm[0]*this->mm[0] +
+            this->mm[1]*this->mm[1] +
+            this->mm[2]*this->mm[2])));
+    }
+
+
+    /// Squared length of the vector, much faster than length() as it
+    /// does not involve square root
+    T lengthSqr() const
+    {
+        return
+            this->mm[0]*this->mm[0] +
+            this->mm[1]*this->mm[1] +
+            this->mm[2]*this->mm[2];
+    }
+
+    /// Return the cross product of "this" vector and v;
+    Vec3<T> cross(const Vec3<T> &v) const
+    {
+        return Vec3<T>(this->mm[1]*v.mm[2] - this->mm[2]*v.mm[1],
+                    this->mm[2]*v.mm[0] - this->mm[0]*v.mm[2],
+                    this->mm[0]*v.mm[1] - this->mm[1]*v.mm[0]);
+    }
+
+
+    /// this = v1 cross v2; v1 and v2 need not be distinct objects from "this"
+    const Vec3<T>& cross(const Vec3<T> &v1, const Vec3<T> &v2)
+    {
+        // Stage the result in locals so that v1 or v2 may safely alias "this".
+        const T cx = v1.mm[1]*v2.mm[2] - v1.mm[2]*v2.mm[1];
+        const T cy = v1.mm[2]*v2.mm[0] - v1.mm[0]*v2.mm[2];
+        const T cz = v1.mm[0]*v2.mm[1] - v1.mm[1]*v2.mm[0];
+        this->mm[0] = cx; this->mm[1] = cy; this->mm[2] = cz;
+        return *this;
+    }
+
+    /// Returns v, where \f$v_i *= scalar\f$ for \f$i \in [0, 2]\f$
+    template <typename S>
+    const Vec3<T> &operator*=(S scalar)
+    {
+        this->mm[0] = static_cast<T>(this->mm[0] * scalar);
+        this->mm[1] = static_cast<T>(this->mm[1] * scalar);
+        this->mm[2] = static_cast<T>(this->mm[2] * scalar);
+        return *this;
+    }
+
+    /// Returns v0, where \f$v0_i *= v1_i\f$ for \f$i \in [0, 2]\f$
+    template <typename S>
+    const Vec3<T> &operator*=(const Vec3<S> &v1)
+    {
+        this->mm[0] *= v1[0];
+        this->mm[1] *= v1[1];
+        this->mm[2] *= v1[2];
+        return *this;
+    }
+
+    /// Returns v, where \f$v_i /= scalar\f$ for \f$i \in [0, 2]\f$
+    template <typename S>
+    const Vec3<T> &operator/=(S scalar)
+    {
+        this->mm[0] /= scalar;
+        this->mm[1] /= scalar;
+        this->mm[2] /= scalar;
+        return *this;
+    }
+
+    /// Returns v0, where \f$v0_i /= v1_i\f$ for \f$i \in [0, 2]\f$
+    template <typename S>
+    const Vec3<T> &operator/=(const Vec3<S> &v1)
+    {
+        this->mm[0] /= v1[0];
+        this->mm[1] /= v1[1];
+        this->mm[2] /= v1[2];
+        return *this;
+    }
+
+    /// Returns v, where \f$v_i += scalar\f$ for \f$i \in [0, 2]\f$
+    template <typename S>
+    const Vec3<T> &operator+=(S scalar)
+    {
+        this->mm[0] = static_cast<T>(this->mm[0] + scalar);
+        this->mm[1] = static_cast<T>(this->mm[1] + scalar);
+        this->mm[2] = static_cast<T>(this->mm[2] + scalar);
+        return *this;
+    }
+
+    /// Returns v0, where \f$v0_i += v1_i\f$ for \f$i \in [0, 2]\f$
+    template <typename S>
+    const Vec3<T> &operator+=(const Vec3<S> &v1)
+    {
+        this->mm[0] += v1[0];
+        this->mm[1] += v1[1];
+        this->mm[2] += v1[2];
+        return *this;
+    }
+
+    /// Returns v, where \f$v_i -= scalar\f$ for \f$i \in [0, 2]\f$
+    template <typename S>
+    const Vec3<T> &operator-=(S scalar)
+    {
+        this->mm[0] -= scalar;
+        this->mm[1] -= scalar;
+        this->mm[2] -= scalar;
+        return *this;
+    }
+
+    /// Returns v0, where \f$v0_i -= v1_i\f$ for \f$i \in [0, 2]\f$
+    template <typename S>
+    const Vec3<T> &operator-=(const Vec3<S> &v1)
+    {
+        this->mm[0] -= v1[0];
+        this->mm[1] -= v1[1];
+        this->mm[2] -= v1[2];
+        return *this;
+    }
+
+    /// Return a reference to itself after the exponent has been
+    /// applied (in place) to all the vector components.
+    inline const Vec3<T>& exp()
+    {
+        this->mm[0] = std::exp(this->mm[0]);
+        this->mm[1] = std::exp(this->mm[1]);
+        this->mm[2] = std::exp(this->mm[2]);
+        return *this;
+    }
+
+    /// Return the sum of all the vector components.
+    inline T sum() const
+    {
+        return this->mm[0] + this->mm[1] + this->mm[2];
+    }
+
+    /// this = normalized this; returns false and leaves "this" untouched if isApproxEqual(length, 0, eps)
+    bool normalize(T eps = T(1.0e-7))
+    {
+        T d = length();
+        if (isApproxEqual(d, T(0), eps)) {
+            return false;
+        }
+        *this *= (T(1) / d);
+        return true;
+    }
+
+
+    /// return normalized this, throws ArithmeticError if null vector
+    Vec3<T> unit(T eps=0) const
+    {
+        T d;
+        return unit(eps, d);
+    }
+
+    /// return normalized this and store the length in len, throws ArithmeticError if null vector
+    Vec3<T> unit(T eps, T& len) const
+    {
+        len = length();
+        if (isApproxEqual(len, T(0), eps)) {
+            OPENVDB_THROW(ArithmeticError, "Normalizing null 3-vector");
+        }
+        return *this / len;
+    }
+
+    // Number of cols, rows, elements
+    static unsigned numRows() { return 1; }
+    static unsigned numColumns() { return 3; }
+    static unsigned numElements() { return 3; }
+
+    /// Returns the scalar component of "this" in the direction of onto, onto need not be
+    /// unit (0 is returned if onto is approximately null). e.g   double c = v1.component(v2);
+    T component(const Vec3<T> &onto, T eps = static_cast<T>(1.0e-7)) const
+    {
+        T l = onto.length();
+        if (isApproxEqual(l, T(0), eps)) return 0;
+
+        return dot(onto)*(T(1)/l);
+    }
+
+    /// Return the projection of "this" onto the vector onto, onto need not be unit
+    /// (zero() is returned if onto is approximately null), e.g.   Vec3d a = v.projection(n);
+    Vec3<T> projection(const Vec3<T> &onto, T eps = static_cast<T>(1.0e-7)) const
+    {
+        T l = onto.lengthSqr();
+        if (isApproxEqual(l, T(0), eps)) return Vec3::zero();
+
+        return onto*(dot(onto)*(T(1)/l));
+    }
+
+    /// Return an arbitrary unit vector perpendicular to v
+    /// Vector this must be a unit vector
+    /// e.g.   v.normalize(); Vec3d n = v.getArbPerpendicular();
+    Vec3<T> getArbPerpendicular() const
+    {
+        Vec3<T> u;
+        T l;
+
+        if ( fabs(this->mm[0]) >= fabs(this->mm[1]) ) {
+            // |x| >= |y|: build the result from the x and z components (swapped, one negated)
+            l = this->mm[0]*this->mm[0] + this->mm[2]*this->mm[2];
+            l = static_cast<T>(T(1)/sqrt(double(l)));
+            u.mm[0] = -this->mm[2]*l;
+            u.mm[1] = (T)0.0;
+            u.mm[2] = +this->mm[0]*l;
+        } else {
+            // |y| > |x|: build the result from the y and z components (swapped, one negated)
+            l = this->mm[1]*this->mm[1] + this->mm[2]*this->mm[2];
+            l = static_cast<T>(T(1)/sqrt(double(l)));
+            u.mm[0] = (T)0.0;
+            u.mm[1] = +this->mm[2]*l;
+            u.mm[2] = -this->mm[1]*l;
+        }
+
+        return u;
+    }
+
+    /// True if a Nan is present in vector
+    bool isNan() const { return isnan(this->mm[0]) || isnan(this->mm[1]) || isnan(this->mm[2]); }
+
+    /// True if an Inf is present in vector
+    bool isInfinite() const
+    {
+        return isinf(this->mm[0]) || isinf(this->mm[1]) || isinf(this->mm[2]);
+    }
+
+    /// True if no Nan or Inf values are present
+    bool isFinite() const
+    {
+        return finite(this->mm[0]) && finite(this->mm[1]) && finite(this->mm[2]);
+    }
+
+    /// Predefined constant (all components zero), e.g.   Vec3d v = Vec3d::zero();
+    static Vec3<T> zero() { return Vec3<T>(0, 0, 0); }
+};
+
+
+/// @brief Component-by-component equality test based on isExactlyEqual (no tolerance argument).
+template <typename T0, typename T1>
+inline bool operator==(const Vec3<T0> &v0, const Vec3<T1> &v1)
+{
+    for (int i = 0; i < 3; ++i) { if (!isExactlyEqual(v0[i], v1[i])) return false; }
+    return true;
+}
+
+/// @brief Inequality test: the logical negation of operator== for the same operands.
+template <typename T0, typename T1>
+inline bool operator!=(const Vec3<T0> &v0, const Vec3<T1> &v1) { return !(v0 == v1); }
+
+/// @brief Scalar-on-the-left product; forwards to v * scalar, so \f$V_i = v_i * scalar\f$ for \f$i \in [0, 2]\f$
+template <typename S, typename T>
+inline Vec3<typename promote<S, T>::type> operator*(S scalar, const Vec3<T> &v) { return v * scalar; }
+
+/// @brief Vector-times-scalar product in the promoted type: \f$V_i = v_i * scalar\f$ for \f$i \in [0, 2]\f$
+template <typename S, typename T>
+inline Vec3<typename promote<S, T>::type> operator*(const Vec3<T> &v, S scalar)
+{
+    typedef Vec3<typename promote<S, T>::type> PromotedVec;
+    PromotedVec scaled(v);  // convert first, then scale in the promoted type
+    return scaled *= scalar;
+}
+
+/// @brief Component-wise product of two vectors: \f$V_i = v0_i * v1_i\f$ for \f$i \in [0, 2]\f$
+template <typename T0, typename T1>
+inline Vec3<typename promote<T0, T1>::type> operator*(const Vec3<T0> &v0, const Vec3<T1> &v1)
+{
+    typedef typename promote<T0, T1>::type ProductT;
+    return Vec3<ProductT>(v0[0] * v1[0], v0[1] * v1[1], v0[2] * v1[2]);
+}
+
+
+/// @brief Divide a scalar by each component: \f$V_i = scalar / v_i\f$ for \f$i \in [0, 2]\f$
+template <typename S, typename T>
+inline Vec3<typename promote<S, T>::type> operator/(S scalar, const Vec3<T> &v)
+{
+    return Vec3<typename promote<S, T>::type>(scalar / v[0], scalar / v[1], scalar / v[2]);
+}
+
+/// @brief Vector-over-scalar quotient in the promoted type: \f$V_i = v_i / scalar\f$ for \f$i \in [0, 2]\f$
+template <typename S, typename T>
+inline Vec3<typename promote<S, T>::type> operator/(const Vec3<T> &v, S scalar)
+{
+    typedef Vec3<typename promote<S, T>::type> PromotedVec;
+    PromotedVec quotient(v);  // convert first, then divide in the promoted type
+    return quotient /= scalar;
+}
+
+/// @brief Component-wise quotient of two vectors: \f$V_i = v0_i / v1_i\f$ for \f$i \in [0, 2]\f$
+template <typename T0, typename T1>
+inline Vec3<typename promote<T0, T1>::type> operator/(const Vec3<T0> &v0, const Vec3<T1> &v1)
+{
+    typedef typename promote<T0, T1>::type QuotientT;
+    return Vec3<QuotientT>(v0[0] / v1[0], v0[1] / v1[1], v0[2] / v1[2]);
+}
+
+/// @brief Component-wise sum of two vectors in the promoted type: \f$V_i = v0_i + v1_i\f$ for \f$i \in [0, 2]\f$
+template <typename T0, typename T1>
+inline Vec3<typename promote<T0, T1>::type> operator+(const Vec3<T0> &v0, const Vec3<T1> &v1)
+{
+    typedef Vec3<typename promote<T0, T1>::type> PromotedVec;
+    PromotedVec total(v0);  // start from a converted copy of the left operand
+    return total += v1;
+}
+
+/// @brief Add a scalar to every component in the promoted type: \f$V_i = v_i + scalar\f$ for \f$i \in [0, 2]\f$
+template <typename S, typename T>
+inline Vec3<typename promote<S, T>::type> operator+(const Vec3<T> &v, S scalar)
+{
+    typedef Vec3<typename promote<S, T>::type> PromotedVec;
+    PromotedVec shifted(v);  // convert first, then offset in the promoted type
+    return shifted += scalar;
+}
+
+/// @brief Component-wise difference of two vectors in the promoted type: \f$V_i = v0_i - v1_i\f$ for \f$i \in [0, 2]\f$
+template <typename T0, typename T1>
+inline Vec3<typename promote<T0, T1>::type> operator-(const Vec3<T0> &v0, const Vec3<T1> &v1)
+{
+    typedef Vec3<typename promote<T0, T1>::type> PromotedVec;
+    PromotedVec difference(v0);  // start from a converted copy of the left operand
+    return difference -= v1;
+}
+
+/// @brief Subtract a scalar from every component in the promoted type: \f$V_i = v_i - scalar\f$ for \f$i \in [0, 2]\f$
+template <typename S, typename T>
+inline Vec3<typename promote<S, T>::type> operator-(const Vec3<T> &v, S scalar)
+{
+    typedef Vec3<typename promote<S, T>::type> PromotedVec;
+    PromotedVec shifted(v);  // convert first, then offset in the promoted type
+    return shifted -= scalar;
+}
+
+/// @brief Angle between @a v1 and @a v2 in [0, pi], computed as atan2(|v1 x v2|, v1 . v2),
+/// e.g.   double a = angle(v1, v2);
+template <typename T>
+inline T angle(const Vec3<T> &v1, const Vec3<T> &v2)
+{
+    const T sinePart = v1.cross(v2).length(), cosinePart = v1.dot(v2);
+    return static_cast<T>(atan2(sinePart, cosinePart));
+}
+
+/// @brief Approximate equality of two vectors, delegating to Vec3::eq() with its default tolerance.
+template <typename T>
+inline bool isApproxEqual(const Vec3<T>& a, const Vec3<T>& b)
+{
+    return a.eq(b);
+}
+/// @brief Approximate equality with a separate tolerance per component, taken from @a eps.
+template <typename T>
+inline bool isApproxEqual(const Vec3<T>& a, const Vec3<T>& b, const Vec3<T>& eps)
+{
+    if (!isApproxEqual(a.x(), b.x(), eps.x())) return false;
+    if (!isApproxEqual(a.y(), b.y(), eps.y())) return false;
+    return isApproxEqual(a.z(), b.z(), eps.z());
+}
+
+/// @brief Return @c true only if the scalar isFinite() test passes for all three components.
+template<typename T>
+inline bool isFinite(const Vec3<T>& v)
+{
+    return isFinite(v[0]) && isFinite(v[1]) && isFinite(v[2]);
+}
+
+/// @brief Return @c true only if the scalar isZero() test passes for all three components.
+template<typename T>
+inline bool isZero(const Vec3<T>& v)
+{
+    for (int i = 0; i < 3; ++i) { if (!isZero(v[i])) return false; }
+    return true;
+}
+
+/// @brief Return a new vector built by applying the scalar Abs() to each component of @a v.
+template<typename T>
+inline Vec3<T> Abs(const Vec3<T>& v)
+{
+    return Vec3<T>(Abs(v.x()), Abs(v.y()), Abs(v.z()));
+}
+
+/// @brief Gram-Schmidt orthonormalization of @a v1, @a v2 and @a v3; the resulting
+/// basis overwrites the arguments, e.g.   orthonormalize(v1, v2, v3);
+template <typename T>
+inline void orthonormalize(Vec3<T> &v1, Vec3<T> &v2, Vec3<T> &v3)
+{
+    // Writing the inputs as v1, v2, v3 and the outputs as u1, u2, u3
+    // (each u overwrites the matching v), the procedure is
+    //
+    //   u1 = v1/|v1|
+    //   u2 = (v2-(u1*v2)u1)/|v2-(u1*v2)u1|
+    //   u3 = (v3-(u1*v3)u1-(u2*v3)u2)/|v3-(u1*v3)u1-(u2*v3)u2|
+    //
+    // where |A| is the length of vector A and A*B is the dot product
+    // of vectors A and B.  The normalize() return values are not checked.
+
+    // u1: normalize the first vector
+    v1.normalize();
+
+    // u2: remove the u1 component from the second vector, then normalize
+    const T v2AlongU1 = v1.dot(v2);
+    v2 -= v1*v2AlongU1;
+    v2.normalize();
+
+    // u3: remove the u1 and u2 components from the third vector, then normalize
+    const T v3AlongU1 = v1.dot(v3);
+    const T v3AlongU2 = v2.dot(v3);
+    v3 -= v1*v3AlongU1 + v2*v3AlongU2;
+    v3.normalize();
+}
+
+/// @remark We are switching to a more explicit name because the semantics
+/// are different from std::min/max. In that case, the function returns a
+/// reference to one of the objects based on a comparator. Here, we must
+/// fabricate a new object which might not match either of the inputs.
+
+/// @brief Build a new vector whose components are the pairwise minima of @a v1 and @a v2.
+template <typename T>
+inline Vec3<T> minComponent(const Vec3<T> &v1, const Vec3<T> &v2)
+{
+    const T lowX = std::min(v1.x(), v2.x());
+    const T lowY = std::min(v1.y(), v2.y());
+    const T lowZ = std::min(v1.z(), v2.z());
+    return Vec3<T>(lowX, lowY, lowZ);
+}
+
+/// @brief Build a new vector whose components are the pairwise maxima of @a v1 and @a v2.
+template <typename T>
+inline Vec3<T> maxComponent(const Vec3<T> &v1, const Vec3<T> &v2)
+{
+    const T highX = std::max(v1.x(), v2.x());
+    const T highY = std::max(v1.y(), v2.y());
+    const T highZ = std::max(v1.z(), v2.z());
+    return Vec3<T>(highX, highY, highZ);
+}
+
+/// @brief Exponentiate each component of the by-value copy @a v in place
+/// (via Vec3::exp()) and return that copy; the caller's vector is untouched.
+template <typename T>
+inline Vec3<T> Exp(Vec3<T> v) { v.exp(); return v; }
+
+typedef Vec3<int32_t>   Vec3i;   ///< 3-vector of int32_t components
+typedef Vec3<uint32_t>  Vec3ui;  ///< 3-vector of uint32_t components
+typedef Vec3<float>     Vec3s;   ///< 3-vector of float components
+typedef Vec3<double>    Vec3d;   ///< 3-vector of double components
+
+} // namespace math
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_MATH_VEC3_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/math/Vec4.h b/nuparu/include/openvdb_new/math/Vec4.h
new file mode 100644
index 00000000..64ac29fb
--- /dev/null
+++ b/nuparu/include/openvdb_new/math/Vec4.h
@@ -0,0 +1,594 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_MATH_VEC4_HAS_BEEN_INCLUDED
+#define OPENVDB_MATH_VEC4_HAS_BEEN_INCLUDED
+
+#include <cmath>
+#include <openvdb/Exceptions.h>
+#include "Math.h"
+#include "Tuple.h"
+#include "Vec3.h"
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace math {
+
+template<typename T> class Mat3;
+
+template<typename T>
+class Vec4: public Tuple<4, T>
+{
+public:
+    typedef T value_type;
+    typedef T ValueType;
+
+    /// Trivial constructor, the vector is NOT initialized
+    Vec4() {}
+
+    /// Constructor with one argument, e.g.   Vec4f v(0);
+    explicit Vec4(T val) { this->mm[0] = this->mm[1] = this->mm[2] = this->mm[3] = val; }
+
+    /// Constructor with four arguments, e.g.   Vec4f v(1,2,3,4);
+    Vec4(T x, T y, T z, T w)
+    {
+        this->mm[0] = x;
+        this->mm[1] = y;
+        this->mm[2] = z;
+        this->mm[3] = w;
+    }
+
+    /// Constructor with array argument, e.g.   float a[4]; Vec4f v(a);
+    template <typename Source>
+    Vec4(Source *a)
+    {
+        this->mm[0] = a[0];
+        this->mm[1] = a[1];
+        this->mm[2] = a[2];
+        this->mm[3] = a[3];
+    }
+
+    /// Conversion constructor
+    template<typename Source>
+    explicit Vec4(const Tuple<4, Source> &v)
+    {
+        this->mm[0] = static_cast<T>(v[0]);
+        this->mm[1] = static_cast<T>(v[1]);
+        this->mm[2] = static_cast<T>(v[2]);
+        this->mm[3] = static_cast<T>(v[3]);
+    }
+
+    /// Reference to the component, e.g.   v.x() = 4.5f;
+    T& x() { return this->mm[0]; }
+    T& y() { return this->mm[1]; }
+    T& z() { return this->mm[2]; }
+    T& w() { return this->mm[3]; }
+
+    /// Get the component, e.g.   float f = v.y();
+    T x() const { return this->mm[0]; }
+    T y() const { return this->mm[1]; }
+    T z() const { return this->mm[2]; }
+    T w() const { return this->mm[3]; }
+
+    T* asPointer() { return this->mm; }
+    const T* asPointer() const { return this->mm; }
+
+    /// Alternative indexed reference to the elements
+    T& operator()(int i) { return this->mm[i]; }
+
+    /// Alternative indexed constant reference to the elements,
+    T operator()(int i) const { return this->mm[i]; }
+
+    /// Returns a Vec3 with the first three elements of the Vec4.
+    Vec3<T> getVec3() const { return Vec3<T>(this->mm[0], this->mm[1], this->mm[2]); }
+
+    /// "this" vector gets initialized to [x, y, z, w],
+    /// calling v.init(); has same effect as calling v = Vec4::zero();
+    const Vec4<T>& init(T x=0, T y=0, T z=0, T w=0)
+    {
+        this->mm[0] = x; this->mm[1] = y; this->mm[2] = z; this->mm[3] = w;
+        return *this;
+    }
+
+    /// Set "this" vector to zero
+    const Vec4<T>& setZero()
+    {
+        this->mm[0] = 0; this->mm[1] = 0; this->mm[2] = 0; this->mm[3] = 0;
+        return *this;
+    }
+
+    /// Assignment operator
+    template<typename Source>
+    const Vec4<T>& operator=(const Vec4<Source> &v)
+    {
+        // note: don't static_cast because that suppresses warnings
+        this->mm[0] = v[0];
+        this->mm[1] = v[1];
+        this->mm[2] = v[2];
+        this->mm[3] = v[3];
+
+        return *this;
+    }
+
+    /// Test if "this" vector is equivalent to vector v with tolerance
+    /// of eps
+    bool eq(const Vec4<T> &v, T eps=1.0e-8) const
+    {
+        return isApproxEqual(this->mm[0], v.mm[0], eps) &&
+            isApproxEqual(this->mm[1], v.mm[1], eps) &&
+            isApproxEqual(this->mm[2], v.mm[2], eps) &&
+            isApproxEqual(this->mm[3], v.mm[3], eps);
+    }
+
+    /// Negation operator, for e.g.   v1 = -v2;
+    Vec4<T> operator-() const
+    {
+        return Vec4<T>(
+            -this->mm[0],
+            -this->mm[1],
+            -this->mm[2],
+            -this->mm[3]);
+    }
+
+    /// this = v1 + v2
+    /// "this", v1 and v2 need not be distinct objects, e.g. v.add(v1,v);
+    template <typename T0, typename T1>
+    const Vec4<T>& add(const Vec4<T0> &v1, const Vec4<T1> &v2)
+    {
+        this->mm[0] = v1[0] + v2[0];
+        this->mm[1] = v1[1] + v2[1];
+        this->mm[2] = v1[2] + v2[2];
+        this->mm[3] = v1[3] + v2[3];
+
+        return *this;
+    }
+
+
+    /// this = v1 - v2
+    /// "this", v1 and v2 need not be distinct objects, e.g. v.sub(v1,v);
+    template <typename T0, typename T1>
+    const Vec4<T>& sub(const Vec4<T0> &v1, const Vec4<T1> &v2)
+    {
+        this->mm[0] = v1[0] - v2[0];
+        this->mm[1] = v1[1] - v2[1];
+        this->mm[2] = v1[2] - v2[2];
+        this->mm[3] = v1[3] - v2[3];
+
+        return *this;
+    }
+
+    /// this =  scalar*v, v need not be a distinct object from "this",
+    /// e.g. v.scale(1.5,v1);
+    template <typename T0, typename T1>
+    const Vec4<T>& scale(T0 scale, const Vec4<T1> &v)
+    {
+        this->mm[0] = scale * v[0];
+        this->mm[1] = scale * v[1];
+        this->mm[2] = scale * v[2];
+        this->mm[3] = scale * v[3];
+
+        return *this;
+    }
+
+    template <typename T0, typename T1>
+    const Vec4<T> &div(T0 scalar, const Vec4<T1> &v)
+    {
+        this->mm[0] = v[0] / scalar;
+        this->mm[1] = v[1] / scalar;
+        this->mm[2] = v[2] / scalar;
+        this->mm[3] = v[3] / scalar;
+
+        return *this;
+    }
+
+    /// Dot product
+    T dot(const Vec4<T> &v) const
+    {
+        return (this->mm[0]*v.mm[0] + this->mm[1]*v.mm[1]
+            + this->mm[2]*v.mm[2] + this->mm[3]*v.mm[3]);
+    }
+
+    /// Length of the vector
+    T length() const
+    {
+        return sqrt(
+            this->mm[0]*this->mm[0] +
+            this->mm[1]*this->mm[1] +
+            this->mm[2]*this->mm[2] +
+            this->mm[3]*this->mm[3]);
+    }
+
+
+    /// Squared length of the vector, much faster than length() as it
+    /// does not involve square root
+    T lengthSqr() const
+    {
+        return (this->mm[0]*this->mm[0] + this->mm[1]*this->mm[1]
+            + this->mm[2]*this->mm[2] + this->mm[3]*this->mm[3]);
+    }
+
+    /// Return a reference to itsef after the exponent has been
+    /// applied to all the vector components.
+    inline const Vec4<T>& exp()
+    {
+        this->mm[0] = std::exp(this->mm[0]);
+        this->mm[1] = std::exp(this->mm[1]);
+        this->mm[2] = std::exp(this->mm[2]);
+        this->mm[3] = std::exp(this->mm[3]);
+        return *this;
+    }
+
+    /// Return the sum of all the vector components.
+    inline T sum() const
+    {
+        return this->mm[0] + this->mm[1] + this->mm[2] + this->mm[3];
+    }
+
+
+    /// this = normalized this
+    bool normalize(T eps=1.0e-8)
+    {
+        T d = length();
+        if (isApproxEqual(d, T(0), eps)) {
+            return false;
+        }
+        *this *= (T(1) / d);
+        return true;
+    }
+
+    /// return normalized this, throws if null vector
+    Vec4<T> unit(T eps=0) const
+    {
+        T d;
+        return unit(eps, d);
+    }
+
+    /// return normalized this and length, throws if null vector
+    Vec4<T> unit(T eps, T& len) const
+    {
+        len = length();
+        if (isApproxEqual(len, T(0), eps)) {
+            throw ArithmeticError("Normalizing null 4-vector");
+        }
+        return *this / len;
+    }
+
+    /// Returns v, where \f$v_i *= scalar\f$ for \f$i \in [0, 3]\f$
+    template <typename S>
+    const Vec4<T> &operator*=(S scalar)
+    {
+        this->mm[0] *= scalar;
+        this->mm[1] *= scalar;
+        this->mm[2] *= scalar;
+        this->mm[3] *= scalar;
+        return *this;
+    }
+
+    /// Returns v0, where \f$v0_i *= v1_i\f$ for \f$i \in [0, 3]\f$
+    template <typename S>
+    const Vec4<T> &operator*=(const Vec4<S> &v1)
+    {
+        this->mm[0] *= v1[0];
+        this->mm[1] *= v1[1];
+        this->mm[2] *= v1[2];
+        this->mm[3] *= v1[3];
+
+        return *this;
+    }
+
+    /// Returns v, where \f$v_i /= scalar\f$ for \f$i \in [0, 3]\f$
+    template <typename S>
+    const Vec4<T> &operator/=(S scalar)
+    {
+        this->mm[0] /= scalar;
+        this->mm[1] /= scalar;
+        this->mm[2] /= scalar;
+        this->mm[3] /= scalar;
+        return *this;
+    }
+
+    /// Returns v0, where \f$v0_i /= v1_i\f$ for \f$i \in [0, 3]\f$
+    template <typename S>
+    const Vec4<T> &operator/=(const Vec4<S> &v1)
+    {
+        this->mm[0] /= v1[0];
+        this->mm[1] /= v1[1];
+        this->mm[2] /= v1[2];
+        this->mm[3] /= v1[3];
+        return *this;
+    }
+
+    /// Returns v, where \f$v_i += scalar\f$ for \f$i \in [0, 3]\f$
+    template <typename S>
+    const Vec4<T> &operator+=(S scalar)
+    {
+        this->mm[0] += scalar;
+        this->mm[1] += scalar;
+        this->mm[2] += scalar;
+        this->mm[3] += scalar;
+        return *this;
+    }
+
+    /// Component-wise in-place sum: \f$v0_i += v1_i\f$ for \f$i \in [0, 3]\f$
+    template <typename S>
+    const Vec4<T> &operator+=(const Vec4<S> &v1)
+    {
+        // Accumulate v1 into this vector one component at a time.
+        for (int i = 0; i < 4; ++i) {
+            this->mm[i] += v1[i];
+        }
+        return *this;
+    }
+
+    /// Subtracts @a scalar from every component in place: \f$v_i -= scalar\f$ for \f$i \in [0, 3]\f$
+    template <typename S>
+    const Vec4<T> &operator-=(S scalar)
+    {
+        // The same scalar is subtracted from each of the four components in turn.
+        for (int axis = 0; axis != 4; ++axis) {
+            this->mm[axis] -= scalar;
+        }
+        return *this;
+    }
+
+    /// Component-wise in-place difference: \f$v0_i -= v1_i\f$ for \f$i \in [0, 3]\f$
+    template <typename S>
+    const Vec4<T> &operator-=(const Vec4<S> &v1)
+    {
+        // Subtract v1 from this vector one component at a time.
+        for (int axis = 0; axis != 4; ++axis) {
+            this->mm[axis] -= v1[axis];
+        }
+        return *this;
+    }
+
+    // Shape of the vector viewed as a single row: row count, column count, total elements
+    static unsigned numRows()     { return 1u; }
+    static unsigned numColumns()  { return 4u; }
+    static unsigned numElements() { return 4u; }
+
+    /// True if at least one of the four components is a Nan
+    bool isNan() const
+    {
+        for (int i = 0; i < 4; ++i) { if (isnan(this->mm[i])) return true; }
+        return false;
+    }
+
+    /// True if at least one of the four components is an Inf
+    bool isInfinite() const
+    {
+        for (int i = 0; i < 4; ++i) { if (isinf(this->mm[i])) return true; }
+        return false;
+    }
+
+    /// True if no Nan or Inf values are present, i.e. finite() holds for all four components
+    bool isFinite() const
+    {
+        for (int i = 0; i < 4; ++i) { if (!finite(this->mm[i])) return false; }
+        return true;
+    }
+
+    /// Predefined constants, e.g.   Vec4s v = Vec4s::origin();
+    static Vec4<T> zero() { return Vec4<T>(0, 0, 0, 0); }   // all four components zero
+    static Vec4<T> origin() { return Vec4<T>(0, 0, 0, 1); } // (0, 0, 0, 1)
+};
+
+/// Equality operator: every component pair must pass isExactlyEqual() (no tolerance)
+template <typename T0, typename T1>
+inline bool operator==(const Vec4<T0> &v0, const Vec4<T1> &v1)
+{
+    // Short-circuits on the first mismatching component.
+    for (int i = 0; i < 4; ++i) {
+        if (!isExactlyEqual(v0[i], v1[i])) return false;
+    }
+    return true;
+}
+
+/// Inequality operator: the negation of operator== (exact floating point comparisons)
+template <typename T0, typename T1>
+inline bool operator!=(const Vec4<T0> &v0, const Vec4<T1> &v1) { const bool equal = (v0 == v1); return !equal; }
+
+/// Returns V, where \f$V_i = v_i * scalar\f$ for \f$i \in [0, 3]\f$ (scalar on the left)
+template <typename S, typename T>
+inline Vec4<typename promote<S, T>::type>
+operator*(S scalar, const Vec4<T> &v) { return v * scalar; }
+
+/// Returns V, where \f$V_i = v_i * scalar\f$ for \f$i \in [0, 3]\f$, in the promoted type
+template <typename S, typename T>
+inline Vec4<typename promote<S, T>::type> operator*(const Vec4<T> &v, S scalar)
+{
+    typedef Vec4<typename promote<S, T>::type> ResultT;
+    ResultT scaled(v); // start from a copy of v in the promoted component type
+    return scaled *= scalar;
+}
+
+/// Component-wise product: \f$V_i = v0_i * v1_i\f$ for \f$i \in [0, 3]\f$
+template <typename T0, typename T1>
+inline Vec4<typename promote<T0, T1>::type> operator*(const Vec4<T0> &v0,
+                                               const Vec4<T1> &v1)
+{
+    typedef Vec4<typename promote<T0, T1>::type> ResultT;
+    // Each product is formed from the operands as given and then passed
+    // to the four-component constructor of the promoted vector type.
+    return ResultT(v0[0]*v1[0], v0[1]*v1[1],
+                   v0[2]*v1[2], v0[3]*v1[3]);
+}
+
+/// Returns V, where \f$V_i = scalar / v_i\f$ for \f$i \in [0, 3]\f$
+template <typename S, typename T>
+inline Vec4<typename promote<S, T>::type> operator/(S scalar, const Vec4<T> &v)
+{
+    typedef Vec4<typename promote<S, T>::type> ResultT;
+    // No check is made for zero components in v.
+    ResultT quotient(scalar/v[0], scalar/v[1], scalar/v[2], scalar/v[3]);
+    return quotient;
+}
+
+/// Returns V, where \f$V_i = v_i / scalar\f$ for \f$i \in [0, 3]\f$, in the promoted type
+template <typename S, typename T>
+inline Vec4<typename promote<S, T>::type> operator/(const Vec4<T> &v, S scalar)
+{
+    typedef Vec4<typename promote<S, T>::type> ResultT;
+    ResultT quotient(v); // start from a copy of v in the promoted component type
+    return quotient /= scalar;
+}
+
+/// Component-wise quotient: \f$V_i = v0_i / v1_i\f$ for \f$i \in [0, 3]\f$
+template <typename T0, typename T1>
+inline Vec4<typename promote<T0, T1>::type> operator/(const Vec4<T0> &v0,
+                                               const Vec4<T1> &v1)
+{
+    typedef Vec4<typename promote<T0, T1>::type> ResultT;
+    // No check is made for zero components in v1.
+    return ResultT(v0[0]/v1[0], v0[1]/v1[1], v0[2]/v1[2], v0[3]/v1[3]);
+}
+
+/// Component-wise sum: \f$V_i = v0_i + v1_i\f$ for \f$i \in [0, 3]\f$
+template <typename T0, typename T1>
+inline Vec4<typename promote<T0, T1>::type> operator+(const Vec4<T0> &v0, const Vec4<T1> &v1)
+{
+    typedef Vec4<typename promote<T0, T1>::type> ResultT;
+    ResultT sum(v0); // v0 copied into the promoted type, then v1 accumulated into it
+    return sum += v1;
+}
+
+/// Returns V, where \f$V_i = v_i + scalar\f$ for \f$i \in [0, 3]\f$, in the promoted type
+template <typename S, typename T>
+inline Vec4<typename promote<S, T>::type> operator+(const Vec4<T> &v, S scalar)
+{
+    typedef Vec4<typename promote<S, T>::type> ResultT;
+    ResultT sum(v); // start from a copy of v in the promoted component type
+    return sum += scalar;
+}
+
+/// Component-wise difference: \f$V_i = v0_i - v1_i\f$ for \f$i \in [0, 3]\f$
+template <typename T0, typename T1>
+inline Vec4<typename promote<T0, T1>::type> operator-(const Vec4<T0> &v0, const Vec4<T1> &v1)
+{
+    typedef Vec4<typename promote<T0, T1>::type> ResultT;
+    ResultT difference(v0); // v0 copied into the promoted type, then v1 subtracted from it
+    return difference -= v1;
+}
+
+/// Returns V, where \f$V_i = v_i - scalar\f$ for \f$i \in [0, 3]\f$, in the promoted type
+template <typename S, typename T>
+inline Vec4<typename promote<S, T>::type> operator-(const Vec4<T> &v, S scalar)
+{
+    typedef Vec4<typename promote<S, T>::type> ResultT;
+    ResultT difference(v); // start from a copy of v in the promoted component type
+    return difference -= scalar;
+}
+
+/// Approximate equality of two vectors, delegated to Vec4::eq() with no explicit tolerance.
+template <typename T>
+inline bool isApproxEqual(const Vec4<T>& a, const Vec4<T>& b)
+{
+    return a.eq(b);
+}
+/// Component-wise approximate equality; component i of @a eps is the tolerance for component i.
+template <typename T>
+inline bool isApproxEqual(const Vec4<T>& a, const Vec4<T>& b, const Vec4<T>& eps)
+{
+    for (int i = 0; i < 4; ++i) {
+        if (!isApproxEqual(a[i], b[i], eps[i])) return false;
+    }
+    return true;
+}
+
+/// Return @c true if the isFinite() test passes for each of the four components.
+template<typename T> inline bool
+isFinite(const Vec4<T>& v)
+{
+    return !(!isFinite(v[0]) || !isFinite(v[1]) || !isFinite(v[2]) || !isFinite(v[3]));
+}
+
+/// Return @c true if all components are exactly equal to zero.
+template<typename T>
+inline bool isZero(const Vec4<T>& v)
+{
+    for (int i = 0; i < 4; ++i) if (!isZero(v[i])) return false;
+    return true;
+}
+
+/// Return a vector whose components are Abs() of the corresponding components of @a v.
+template<typename T>
+inline Vec4<T> Abs(const Vec4<T>& v)
+{
+    return Vec4<T>(Abs(v[0]), Abs(v[1]), Abs(v[2]), Abs(v[3]));
+}
+
+/// @remark These functions have explicit names because their semantics
+/// differ from std::min/max: the standard functions return a reference to
+/// one of their arguments, chosen by a comparator, whereas here we must
+/// fabricate a new object which might not match either of the inputs.
+
+/// Return the component-wise minimum of the two vectors as a new vector.
+template <typename T>
+inline Vec4<T> minComponent(const Vec4<T> &v1, const Vec4<T> &v2)
+{
+    const T minX = std::min(v1.x(), v2.x());
+    const T minY = std::min(v1.y(), v2.y());
+    const T minZ = std::min(v1.z(), v2.z());
+    const T minW = std::min(v1.w(), v2.w());
+    return Vec4<T>(minX, minY, minZ, minW);
+}
+
+/// Return the component-wise maximum of the two vectors as a new vector.
+template <typename T>
+inline Vec4<T> maxComponent(const Vec4<T> &v1, const Vec4<T> &v2)
+{
+    const T maxX = std::max(v1.x(), v2.x());
+    const T maxY = std::max(v1.y(), v2.y());
+    const T maxZ = std::max(v1.z(), v2.z());
+    const T maxW = std::max(v1.w(), v2.w());
+    return Vec4<T>(maxX, maxY, maxZ, maxW);
+}
+
+/// @brief Return a vector with the exponent applied to each of the components
+/// of the input vector. @a v is taken by value and the result is whatever v.exp() yields.
+template <typename T>
+inline Vec4<T> Exp(Vec4<T> v) { return v.exp(); }
+
+typedef Vec4<int32_t>   Vec4i;
+typedef Vec4<uint32_t>  Vec4ui;
+typedef Vec4<float>     Vec4s;
+typedef Vec4<double>    Vec4d;
+
+} // namespace math
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_MATH_VEC4_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/metadata/MetaMap.h b/nuparu/include/openvdb_new/metadata/MetaMap.h
new file mode 100644
index 00000000..455b31a8
--- /dev/null
+++ b/nuparu/include/openvdb_new/metadata/MetaMap.h
@@ -0,0 +1,251 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_METADATA_METAMAP_HAS_BEEN_INCLUDED
+#define OPENVDB_METADATA_METAMAP_HAS_BEEN_INCLUDED
+
+#include <iosfwd>
+#include <map>
+#include <openvdb/metadata/Metadata.h>
+#include <openvdb/Types.h>
+#include <openvdb/Exceptions.h>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+
+/// Container that maps names (strings) to values of arbitrary types, held as Metadata::Ptr
+class OPENVDB_API MetaMap
+{
+public:
+    typedef boost::shared_ptr<MetaMap> Ptr;
+    typedef boost::shared_ptr<const MetaMap> ConstPtr;
+
+    typedef std::map<Name, Metadata::Ptr> MetadataMap;
+    typedef MetadataMap::iterator MetaIterator;
+    typedef MetadataMap::const_iterator ConstMetaIterator;
+        ///< @todo this should really iterate over a map of Metadata::ConstPtrs
+
+    MetaMap() {} ///< Construct an empty map.
+    MetaMap(const MetaMap& other);
+    virtual ~MetaMap() {}
+
+    /// Return a copy of this map whose fields are shared with this map.
+    MetaMap::Ptr copyMeta() const;
+    /// Return a deep copy of this map that shares no data with this map.
+    MetaMap::Ptr deepCopyMeta() const;
+
+    /// Assign a deep copy of another map to this map.
+    MetaMap& operator=(const MetaMap&);
+
+    /// Unserialize metadata from the given stream.
+    void readMeta(std::istream&);
+    /// Serialize metadata to the given stream.
+    void writeMeta(std::ostream&) const;
+
+    /// @brief Insert a new metadata field or overwrite the value of an existing field.
+    /// @details If a field with the given name doesn't already exist, add a new field.
+    /// Otherwise, if the new value's type is the same as the existing field's value type,
+    /// overwrite the existing value with the new value.
+    /// @throw TypeError if a field with the given name already exists, but its value type
+    /// is not the same as the new value's
+    /// @throw ValueError if the given field name is empty.
+    void insertMeta(const Name&, const Metadata& value);
+    /// @brief Deep copy all of the metadata fields from the given map into this map.
+    /// @throw TypeError if any field in the given map has the same name as
+    /// but a different value type than one of this map's fields.
+    void insertMeta(const MetaMap&);
+
+    /// Remove the given metadata field if it exists.
+    void removeMeta(const Name&);
+
+    //@{
+    /// @brief Return a pointer to the metadata with the given name.
+    /// If no such field exists, return a null pointer.
+    Metadata::Ptr operator[](const Name&);
+    Metadata::ConstPtr operator[](const Name&) const;
+    //@}
+
+    //@{
+    /// @brief Return a pointer to a TypedMetadata object of type @c T and with the given name.
+    /// If no such field exists or if there is a type mismatch, return a null pointer.
+    template<typename T> typename T::Ptr getMetadata(const Name&);
+    template<typename T> typename T::ConstPtr getMetadata(const Name&) const;
+    //@}
+
+    /// @brief Return a reference to the value of type @c T stored in the given metadata field.
+    /// @throw LookupError if no field with the given name exists.
+    /// @throw TypeError if the given field is not of type @c T.
+    template<typename T> T& metaValue(const Name&);
+    template<typename T> const T& metaValue(const Name&) const; ///< Const overload; same exceptions.
+
+    // Functions for iterating over the metadata (these are the underlying std::map iterators)
+    MetaIterator beginMeta() { return mMeta.begin(); }
+    MetaIterator endMeta() { return mMeta.end(); }
+    ConstMetaIterator beginMeta() const { return mMeta.begin(); }
+    ConstMetaIterator endMeta() const { return mMeta.end(); }
+
+    void clearMetadata() { mMeta.clear(); } ///< Remove every field from this map.
+
+    size_t metaCount() const { return mMeta.size(); } ///< Return the number of fields in this map.
+
+    /// Return a string describing this metadata map.  Prefix each line with @a indent.
+    std::string str(const std::string& indent = "") const;
+
+    /// Return @c true if the given map is equivalent to this map.
+    bool operator==(const MetaMap& other) const;
+    /// Return @c true if the given map is different from this map.
+    bool operator!=(const MetaMap& other) const { return !(*this == other); }
+
+private:
+    /// @brief Return a pointer to TypedMetadata with the given template parameter.
+    /// @throw LookupError if no field with the given name is found.
+    /// @throw TypeError if the given field is not of type T.
+    template<typename T>
+    typename TypedMetadata<T>::Ptr getValidTypedMetadata(const Name&) const;
+
+    MetadataMap mMeta; // field name -> metadata pointer
+};
+
+/// Write a MetaMap to an output stream
+std::ostream& operator<<(std::ostream&, const MetaMap&);
+
+
+////////////////////////////////////////
+
+
+inline Metadata::Ptr
+MetaMap::operator[](const Name& name)
+{
+    const MetaIterator pos = mMeta.find(name);
+    return (pos != mMeta.end()) ? pos->second : Metadata::Ptr(); // null pointer if absent
+}
+
+inline Metadata::ConstPtr
+MetaMap::operator[](const Name &name) const
+{
+    const ConstMetaIterator pos = mMeta.find(name);
+    return (pos != mMeta.end()) ? Metadata::ConstPtr(pos->second) : Metadata::ConstPtr(); // null if absent
+}
+
+
+////////////////////////////////////////
+
+
+/// Return the field named @a name as a @c T::Ptr, or a null pointer if no such
+/// field exists or the stored metadata's type name is not T::staticTypeName().
+template <typename T>
+inline typename T::Ptr
+MetaMap::getMetadata(const Name &name)
+{
+    typename T::Ptr typed = typename T::Ptr(); // stays null unless both checks succeed
+    const ConstMetaIterator pos = mMeta.find(name);
+    if (pos != mMeta.end()) {
+        // To get a valid conversion when metadata pointers cross dso
+        // boundaries, compare the qualified type names and then static-cast.
+        // This is slower than a dynamic_pointer_cast, but safer in that case.
+        if (pos->second->typeName() == T::staticTypeName()) {
+            typed = boost::static_pointer_cast<T, Metadata>(pos->second);
+        }
+    }
+    return typed;
+}
+
+/// Const overload: return the field named @a name as a @c T::ConstPtr, or a null
+/// pointer if the field is absent or its type name is not T::staticTypeName().
+template <typename T>
+inline typename T::ConstPtr
+MetaMap::getMetadata(const Name &name) const
+{
+    typename T::ConstPtr typed = typename T::ConstPtr(); // null unless both checks succeed
+    const ConstMetaIterator pos = mMeta.find(name);
+    if (pos != mMeta.end()) {
+        // Qualified type names are compared before static-casting so that the
+        // conversion stays valid when metadata pointers cross dso boundaries.
+        if (pos->second->typeName() == T::staticTypeName()) {
+            typed = boost::static_pointer_cast<const T, const Metadata>(pos->second);
+        }
+    }
+    return typed;
+}
+
+
+////////////////////////////////////////
+
+
+template <typename T>
+inline typename TypedMetadata<T>::Ptr
+MetaMap::getValidTypedMetadata(const Name &name) const
+{
+    typedef TypedMetadata<T> MetadataT;
+
+    const ConstMetaIterator pos = mMeta.find(name);
+    if (pos == mMeta.end()) OPENVDB_THROW(LookupError, "Cannot find metadata " << name);
+
+    // The type check compares qualified type names instead of relying on
+    // dynamic_pointer_cast, so that it stays valid when metadata pointers
+    // cross dso boundaries; only a matching name is followed by a static cast.
+    const bool typeMatches = (pos->second->typeName() == MetadataT::staticTypeName());
+    if (!typeMatches) OPENVDB_THROW(TypeError, "Invalid type for metadata " << name);
+
+    // Note: a non-const pointer is handed out even though this method is const.
+    return boost::static_pointer_cast<MetadataT, Metadata>(pos->second);
+}
+
+
+////////////////////////////////////////
+
+
+template <typename T>
+inline T&
+MetaMap::metaValue(const Name &name)
+{
+    // getValidTypedMetadata() throws LookupError or TypeError instead of returning null.
+    return getValidTypedMetadata<T>(name)->value();
+}
+
+
+template <typename T>
+inline const T&
+MetaMap::metaValue(const Name &name) const
+{
+    // getValidTypedMetadata() throws LookupError or TypeError instead of returning null.
+    return getValidTypedMetadata<T>(name)->value();
+}
+
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_METADATA_METAMAP_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/metadata/Metadata.h b/nuparu/include/openvdb_new/metadata/Metadata.h
new file mode 100644
index 00000000..96c44ef3
--- /dev/null
+++ b/nuparu/include/openvdb_new/metadata/Metadata.h
@@ -0,0 +1,397 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_METADATA_METADATA_HAS_BEEN_INCLUDED
+#define OPENVDB_METADATA_METADATA_HAS_BEEN_INCLUDED
+
+#include <iostream>
+#include <string>
+#include <openvdb/Types.h>
+#include <openvdb/math/Math.h> // for math::isZero()
+#include <openvdb/util/Name.h>
+#include <openvdb/Exceptions.h>
+#include <boost/shared_ptr.hpp>
+#include <boost/cstdint.hpp>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+
+/// @brief Abstract base class for storing metadata information in a grid.
+class OPENVDB_API Metadata
+{
+public:
+    typedef boost::shared_ptr<Metadata> Ptr;
+    typedef boost::shared_ptr<const Metadata> ConstPtr;
+
+    Metadata() {}
+    virtual ~Metadata() {}
+
+    /// Return the type name of the metadata.
+    virtual Name typeName() const = 0;
+
+    /// Return a copy of the metadata.
+    virtual Metadata::Ptr copy() const = 0;
+
+    /// Copy the given metadata into this metadata.
+    virtual void copy(const Metadata& other) = 0;
+
+    /// Return a textual representation of this metadata.
+    virtual std::string str() const = 0;
+
+    /// Return the boolean representation of this metadata (empty strings
+    /// and zeroVals evaluate to false; most other values evaluate to true).
+    virtual bool asBool() const = 0;
+
+    /// Return @c true if the given metadata is equivalent to this metadata.
+    bool operator==(const Metadata& other) const;
+    /// Return @c true if the given metadata is different from this metadata.
+    bool operator!=(const Metadata& other) const { return !(*this == other); }
+
+    /// Return the size of this metadata in bytes.
+    virtual Index32 size() const = 0;
+
+    /// Unserialize this metadata from a stream: readSize() followed by readValue().
+    void read(std::istream&);
+    /// Serialize this metadata to a stream: writeSize() followed by writeValue().
+    void write(std::ostream&) const;
+
+    /// Create new metadata of the given type.
+    static Metadata::Ptr createMetadata(const Name& typeName);
+
+    /// Return @c true if the given type is known by the metadata type registry.
+    static bool isRegisteredType(const Name& typeName);
+
+    /// Clear out the metadata registry.
+    static void clearRegistry();
+
+    /// Register the given metadata type along with a factory function.
+    static void registerType(const Name& typeName, Metadata::Ptr (*createMetadata)());
+    static void unregisterType(const Name& typeName); // presumably the inverse of registerType() - confirm
+
+protected:
+    /// Read the size of the metadata (an Index32 stored as raw bytes) from a stream.
+    static Index32 readSize(std::istream&);
+    /// Write the size of the metadata (the value of size(), as raw bytes) to a stream.
+    void writeSize(std::ostream&) const;
+
+    /// Read the metadata from a stream.
+    virtual void readValue(std::istream&, Index32 numBytes) = 0;
+    /// Write the metadata to a stream.
+    virtual void writeValue(std::ostream&) const = 0;
+
+private:
+    // Disallow copying of instances of this class (private declarations only).
+    Metadata(const Metadata&);
+    Metadata& operator=(const Metadata&);
+};
+
+
+/// @brief Subclass to read (and ignore) data of an unregistered type
+class OPENVDB_API UnknownMetadata: public Metadata
+{
+public:
+    UnknownMetadata() {}
+    virtual ~UnknownMetadata() {}
+    virtual Name typeName() const { return "<unknown>"; }
+    virtual Metadata::Ptr copy() const { OPENVDB_THROW(TypeError, "Metadata has unknown type"); } // always throws
+    virtual void copy(const Metadata&) { OPENVDB_THROW(TypeError, "Destination has unknown type"); } // always throws
+    virtual std::string str() const { return "<unknown>"; }
+    virtual bool asBool() const { return false; }
+    virtual Index32 size() const { return 0; } // reported size is always zero
+
+protected:
+    virtual void readValue(std::istream&s, Index32 numBytes);
+    virtual void writeValue(std::ostream&) const;
+};
+
+
+/// @brief Templated metadata class that stores a single value of type @c T.
+template<typename T>
+class TypedMetadata: public Metadata
+{
+public:
+    typedef boost::shared_ptr<TypedMetadata<T> > Ptr;
+    typedef boost::shared_ptr<const TypedMetadata<T> > ConstPtr;
+
+    TypedMetadata();
+    TypedMetadata(const T& value);
+    TypedMetadata(const TypedMetadata<T>& other);
+    virtual ~TypedMetadata();
+
+    virtual Name typeName() const;
+    virtual Metadata::Ptr copy() const;
+    virtual void copy(const Metadata& other);
+    virtual std::string str() const;
+    virtual bool asBool() const;
+    virtual Index32 size() const { return static_cast<Index32>(sizeof(T)); } // generic case: sizeof(T)
+
+    /// Set this metadata's value.
+    void setValue(const T&);
+    /// Return this metadata's value.
+    T& value();
+    const T& value() const;
+
+    // Return the type name for T, as produced by typeNameAsString<T>().
+    // NOTE(review): presumably typeNameAsString is what gets specialized per type T - confirm.
+    static Name staticTypeName() { return typeNameAsString<T>(); }
+
+    /// Create new metadata of this type.
+    static Metadata::Ptr createMetadata();
+
+    static void registerType();     // registers staticTypeName() with createMetadata as factory
+    static void unregisterType();   // unregisters staticTypeName()
+    static bool isRegisteredType(); // queries the registry for staticTypeName()
+
+protected:
+    virtual void readValue(std::istream&, Index32 numBytes);
+    virtual void writeValue(std::ostream&) const;
+
+private:
+    T mValue; // the stored value
+};
+
+/// Write a Metadata to an output stream
+std::ostream& operator<<(std::ostream& ostr, const Metadata& metadata);
+
+
+////////////////////////////////////////
+
+
+inline void
+Metadata::writeSize(std::ostream& os) const
+{
+    const Index32 numBytes = this->size(); // virtual: size reported by the subclass
+    os.write(reinterpret_cast<const char*>(&numBytes), sizeof(numBytes)); // raw bytes of the count
+}
+
+
+inline Index32
+Metadata::readSize(std::istream& is)
+{
+    Index32 byteCount = 0;
+    is.read(reinterpret_cast<char*>(&byteCount), sizeof(byteCount)); // stream state is not checked
+    return byteCount;
+}
+
+
+inline void
+Metadata::read(std::istream& is)
+{
+    // Stream layout: an Index32 byte count, then the value itself.
+    this->readValue(is, readSize(is));
+}
+
+
+inline void
+Metadata::write(std::ostream& os) const
+{
+    writeSize(os);  // the Index32 byte count goes first...
+    writeValue(os); // ...followed by the value (virtual)
+}
+
+
+////////////////////////////////////////
+
+
+/// Default constructor: the stored value is initialized from T().
+template <typename T>
+inline TypedMetadata<T>::TypedMetadata() : mValue(T())
+{
+}
+
+/// Construct metadata holding a copy of @a value.
+template <typename T>
+inline TypedMetadata<T>::TypedMetadata(const T &value) : mValue(value)
+{
+}
+
+/// Copy constructor: default-constructs the Metadata base and copies the
+/// stored value from @a other.
+template <typename T>
+inline TypedMetadata<T>::TypedMetadata(const TypedMetadata<T> &other)
+    : Metadata(), mValue(other.mValue)
+{
+}
+
+/// Destructor; the body is empty.
+template <typename T>
+inline TypedMetadata<T>::~TypedMetadata()
+{
+}
+
+/// Return the type name of this metadata, i.e. the value of staticTypeName().
+template <typename T>
+inline Name TypedMetadata<T>::typeName() const
+{
+    return staticTypeName();
+}
+
+/// Replace the stored value with a copy of @a val.
+template <typename T>
+inline void TypedMetadata<T>::setValue(const T& val)
+{
+    this->mValue = val;
+}
+
+/// Return a mutable reference to the stored value.
+template <typename T>
+inline T& TypedMetadata<T>::value()
+{
+    return this->mValue;
+}
+
+/// Return a read-only reference to the stored value.
+template <typename T>
+inline const T& TypedMetadata<T>::value() const
+{
+    return this->mValue;
+}
+
+template <typename T>
+inline Metadata::Ptr
+TypedMetadata<T>::copy() const
+{
+    Metadata::Ptr duplicate(new TypedMetadata<T>()); // starts out default-constructed
+    duplicate->copy(*this); // virtual copy(const Metadata&) transfers the value
+    return duplicate;
+}
+
+template <typename T>
+inline void
+TypedMetadata<T>::copy(const Metadata &other)
+{
+    const TypedMetadata<T>* source = dynamic_cast<const TypedMetadata<T>*>(&other);
+    if (!source) OPENVDB_THROW(TypeError, "Incompatible type during copy"); // not a TypedMetadata<T>
+    mValue = source->mValue;
+}
+
+
+template<typename T>
+inline void TypedMetadata<T>::readValue(std::istream& is, Index32 /*numBytes*/)
+{
+    // numBytes is not used: exactly size() raw bytes are read into the stored value.
+    char* const raw = reinterpret_cast<char*>(&mValue);
+    is.read(raw, this->size());
+}
+
+template<typename T>
+inline void TypedMetadata<T>::writeValue(std::ostream& os) const
+{
+    const char* const raw = reinterpret_cast<const char*>(&mValue); // raw bytes of the value
+    os.write(raw, this->size());
+}
+
+template <typename T>
+inline std::string
+TypedMetadata<T>::str() const
+{
+    std::ostringstream buffer;
+    buffer << mValue; // formatted with the stream insertion operator for T
+    return buffer.str();
+}
+
+/// Return @c false if math::isZero() reports the stored value as zero, @c true otherwise.
+template<typename T>
+inline bool TypedMetadata<T>::asBool() const
+{
+    return !math::isZero(mValue);
+}
+
+/// Factory function: return a new, default-constructed TypedMetadata<T>.
+template <typename T>
+inline Metadata::Ptr
+TypedMetadata<T>::createMetadata()
+{
+    return Metadata::Ptr(new TypedMetadata<T>());
+}
+
+template <typename T>
+inline void
+TypedMetadata<T>::registerType()
+{
+    const Name name = TypedMetadata<T>::staticTypeName(); // registry key for this type
+    Metadata::registerType(name, TypedMetadata<T>::createMetadata);
+}
+
+/// Unregister this metadata type, identified by staticTypeName().
+template <typename T>
+inline void TypedMetadata<T>::unregisterType()
+{
+    Metadata::unregisterType(TypedMetadata<T>::staticTypeName());
+}
+
+/// Return @c true if staticTypeName() is known by the metadata type registry.
+template <typename T>
+inline bool TypedMetadata<T>::isRegisteredType()
+{
+    return Metadata::isRegisteredType(TypedMetadata<T>::staticTypeName());
+}
+
+
+template<>
+inline std::string TypedMetadata<bool>::str() const
+{
+    if (mValue) return "true"; // booleans are spelled out rather than streamed
+    return "false";
+}
+
+
+/// Write the textual representation (str()) of @a metadata to @a ostr.
+inline std::ostream&
+operator<<(std::ostream& ostr, const Metadata& metadata)
+{
+    return ostr << metadata.str();
+}
+
+
+typedef TypedMetadata<bool>            BoolMetadata;
+typedef TypedMetadata<double>          DoubleMetadata;
+typedef TypedMetadata<float>           FloatMetadata;
+typedef TypedMetadata<boost::int32_t>  Int32Metadata;
+typedef TypedMetadata<boost::int64_t>  Int64Metadata;
+typedef TypedMetadata<Vec2d>           Vec2DMetadata;
+typedef TypedMetadata<Vec2i>           Vec2IMetadata;
+typedef TypedMetadata<Vec2s>           Vec2SMetadata;
+typedef TypedMetadata<Vec3d>           Vec3DMetadata;
+typedef TypedMetadata<Vec3i>           Vec3IMetadata;
+typedef TypedMetadata<Vec3s>           Vec3SMetadata;
+typedef TypedMetadata<Mat4s>           Mat4SMetadata;
+typedef TypedMetadata<Mat4d>           Mat4DMetadata;
+
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_METADATA_METADATA_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/metadata/StringMetadata.h b/nuparu/include/openvdb_new/metadata/StringMetadata.h
new file mode 100644
index 00000000..53030cf1
--- /dev/null
+++ b/nuparu/include/openvdb_new/metadata/StringMetadata.h
@@ -0,0 +1,75 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_METADATA_STRINGMETADATA_HAS_BEEN_INCLUDED
+#define OPENVDB_METADATA_STRINGMETADATA_HAS_BEEN_INCLUDED
+
+#include <string>
+#include <openvdb/metadata/Metadata.h>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+
+typedef TypedMetadata<std::string> StringMetadata;
+
+
+/// The size of string metadata is the string's length, not sizeof(std::string).
+template <>
+inline Index32 StringMetadata::size() const
+{
+    return static_cast<Index32>(mValue.size());
+}
+
+
+/// Resize the string to @a size bytes and fill it directly from the stream.
+template<>
+inline void StringMetadata::readValue(std::istream& is, Index32 size)
+{
+    mValue.resize(size, '\0');
+    if (size > 0) is.read(&mValue[0], size); // never take &mValue[0] of an empty string
+}
+
+/// Write the string's characters (size() bytes) to the stream.
+template<>
+inline void StringMetadata::writeValue(std::ostream &os) const
+{
+    os.write(mValue.data(), this->size());
+}
+
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_METADATA_STRINGMETADATA_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/openvdb.h b/nuparu/include/openvdb_new/openvdb.h
new file mode 100644
index 00000000..ef29f51f
--- /dev/null
+++ b/nuparu/include/openvdb_new/openvdb.h
@@ -0,0 +1,99 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_OPENVDB_HAS_BEEN_INCLUDED
+#define OPENVDB_OPENVDB_HAS_BEEN_INCLUDED
+
+#include "Platform.h"
+#include "Types.h"
+#include "Metadata.h"
+#include "math/Maps.h"
+#include "math/Transform.h"
+#include "Grid.h"
+#include "tree/Tree.h"
+#include "io/File.h"
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+
+/// Common tree types: one tree::Tree4<ValueType, 5, 4, 3>::Type per value type, then aliases
+typedef tree::Tree4<ValueMask,   5, 4, 3>::Type  MaskTree;
+typedef tree::Tree4<bool,        5, 4, 3>::Type  BoolTree;
+typedef tree::Tree4<float,       5, 4, 3>::Type  FloatTree;
+typedef tree::Tree4<double,      5, 4, 3>::Type  DoubleTree;
+typedef tree::Tree4<int32_t,     5, 4, 3>::Type  Int32Tree;
+typedef tree::Tree4<uint32_t,    5, 4, 3>::Type  UInt32Tree;
+typedef tree::Tree4<int64_t,     5, 4, 3>::Type  Int64Tree;
+typedef tree::Tree4<Vec2i,       5, 4, 3>::Type  Vec2ITree;
+typedef tree::Tree4<Vec2s,       5, 4, 3>::Type  Vec2STree;
+typedef tree::Tree4<Vec2d,       5, 4, 3>::Type  Vec2DTree;
+typedef tree::Tree4<Vec3i,       5, 4, 3>::Type  Vec3ITree;
+typedef tree::Tree4<Vec3f,       5, 4, 3>::Type  Vec3STree;
+typedef tree::Tree4<Vec3d,       5, 4, 3>::Type  Vec3DTree;
+typedef tree::Tree4<std::string, 5, 4, 3>::Type  StringTree;
+typedef MaskTree  TopologyTree;    // alias of MaskTree
+typedef Vec3STree Vec3fTree;       // alias of Vec3STree (value type Vec3f)
+typedef Vec3DTree Vec3dTree;       // alias of Vec3DTree (value type Vec3d)
+typedef FloatTree ScalarTree;      // alias of FloatTree
+typedef Vec3fTree VectorTree;      // alias of Vec3fTree, i.e. of Vec3STree
+
+/// Common grid types: Grid<> over the tree types above (UInt32 and Vec2 trees have no grid typedef)
+typedef Grid<MaskTree>      MaskGrid;
+typedef Grid<BoolTree>      BoolGrid;
+typedef Grid<FloatTree>     FloatGrid;
+typedef Grid<DoubleTree>    DoubleGrid;
+typedef Grid<Int32Tree>     Int32Grid;
+typedef Grid<Int64Tree>     Int64Grid;
+typedef Grid<Vec3ITree>     Vec3IGrid;
+typedef Grid<Vec3STree>     Vec3SGrid;
+typedef Grid<Vec3DTree>     Vec3DGrid;
+typedef Grid<StringTree>    StringGrid;
+typedef Vec3SGrid           Vec3fGrid;    // alias of Vec3SGrid
+typedef Vec3DGrid           Vec3dGrid;    // alias of Vec3DGrid
+typedef FloatGrid           ScalarGrid;   // alias of FloatGrid
+typedef Vec3fGrid           VectorGrid;   // alias of Vec3fGrid, i.e. of Vec3SGrid
+typedef MaskGrid            TopologyGrid; // alias of MaskGrid
+
+/// @brief Global registration of basic types
+OPENVDB_API void initialize();
+
+/// @brief Global deregistration of basic types (the counterpart of initialize())
+OPENVDB_API void uninitialize();
+
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_OPENVDB_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/ChangeBackground.h b/nuparu/include/openvdb_new/tools/ChangeBackground.h
new file mode 100644
index 00000000..3e1e61d0
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/ChangeBackground.h
@@ -0,0 +1,278 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file ChangeBackground.h
+///
+/// @brief Efficient multi-threaded replacement of the background
+/// values in a tree.
+///
+/// @author Ken Museth
+
+#ifndef OPENVDB_TOOLS_ChangeBACKGROUND_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_ChangeBACKGROUND_HAS_BEEN_INCLUDED
+
+#include <openvdb/math/Math.h> // for isNegative and negative
+#include <openvdb/Types.h> // for Index typedef
+#include <openvdb/tree/NodeManager.h>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+/// @brief Replace the background value in all the nodes of a tree.
+/// @details Only inactive values approximately equal to the old background value,
+/// or to its negation, are replaced (by the new value or its negation, respectively).
+///
+/// @note If a LeafManager is used the cached leaf nodes are reused,
+/// resulting in slightly better overall performance.
+///
+/// @param tree          Tree (or LeafManager) that will have its background value changed
+/// @param background    the new background value
+/// @param threaded      enable or disable threading  (threading is enabled by default)
+/// @param grainSize     used to control the threading granularity (default is 32)
+template<typename TreeOrLeafManagerT>
+inline void
+changeBackground(
+    TreeOrLeafManagerT& tree,
+    const typename TreeOrLeafManagerT::ValueType& background,
+    bool threaded = true,
+    size_t grainSize = 32);
+
+
+/// @brief Replace the background value in all the nodes of a floating-point tree
+/// containing a symmetric narrow-band level set.
+/// @details All inactive values will be set to +| @a halfWidth | if outside
+/// (value not below zero) and -| @a halfWidth | if inside (value below zero),
+/// where @a halfWidth is half the width of the symmetric narrow band.
+///
+/// @note This method is faster than changeBackground since it does not
+/// perform tests to see if inactive values are equal to the old background value.
+/// @note If a LeafManager is used the cached leaf nodes are reused,
+/// resulting in slightly better overall performance.
+/// @note Forwards to changeAsymmetricLevelSetBackground with math::negative(halfWidth) inside.
+/// @param tree          Tree (or LeafManager) that will have its background value changed
+/// @param halfWidth     half of the width of the symmetric narrow band
+/// @param threaded      enable or disable threading  (threading is enabled by default)
+/// @param grainSize     used to control the threading granularity (default is 32)
+///
+/// @throw ValueError if @a halfWidth is negative (as defined by math::isNegative)
+template<typename TreeOrLeafManagerT>
+inline void
+changeLevelSetBackground(
+    TreeOrLeafManagerT& tree,
+    const typename TreeOrLeafManagerT::ValueType& halfWidth,
+    bool threaded = true,
+    size_t grainSize = 32);
+
+
+/// @brief Replace the background values in all the nodes of a floating-point tree
+/// containing a possibly asymmetric narrow-band level set.
+/// @details All inactive values will be set to @a outsideWidth if outside (value
+/// not below zero) and to @a insideWidth if inside (value below zero), where
+/// @a outsideWidth must not be negative and @a insideWidth must be negative.
+///
+/// @note This method is faster than changeBackground since it does not
+/// perform tests to see if inactive values are equal to the old background value.
+/// @note If a LeafManager is used the cached leaf nodes are reused,
+/// resulting in slightly better overall performance.
+///
+/// @param tree          Tree (or LeafManager) that will have its background value changed
+/// @param outsideWidth  The width of the outside of the narrow band
+/// @param insideWidth   The width of the inside of the narrow band, given as a negative value
+/// @param threaded      enable or disable threading  (threading is enabled by default)
+/// @param grainSize     used to control the threading granularity (default is 32)
+///
+/// @throw ValueError if @a outsideWidth is negative or @a insideWidth is
+/// not negative (as defined by math::isNegative)
+template<typename TreeOrLeafManagerT>
+inline void
+changeAsymmetricLevelSetBackground(
+    TreeOrLeafManagerT& tree,
+    const typename TreeOrLeafManagerT::ValueType& outsideWidth,
+    const typename TreeOrLeafManagerT::ValueType& insideWidth,
+    bool threaded = true,
+    size_t grainSize = 32);
+
+
+//////////////////////////////////////////////////////
+
+
+// Replaces the old background value (and its negation) in a Tree of any type.
+template<typename TreeOrLeafManagerT>
+class ChangeBackgroundOp
+{
+public:
+    typedef typename TreeOrLeafManagerT::ValueType    ValueT;
+    typedef typename TreeOrLeafManagerT::RootNodeType RootT;
+    typedef typename TreeOrLeafManagerT::LeafNodeType LeafT;
+
+    /// Records the background currently stored in @a tree's root and the replacement value.
+    ChangeBackgroundOp(const TreeOrLeafManagerT& tree, const ValueT& newValue)
+        : mOldValue(tree.root().background()), mNewValue(newValue) {}
+    /// Root node: processes its inactive values, then installs the new background.
+    void operator()(RootT& root) const
+    {
+        typename RootT::ValueOffIter offIter = root.beginValueOff();
+        while (offIter) { this->set(offIter); ++offIter; }
+        root.setBackground(mNewValue, false);
+    }
+    /// Leaf node: processes every inactive value.
+    void operator()(LeafT& node) const
+    {
+        typename LeafT::ValueOffIter offIter = node.beginValueOff();
+        while (offIter) { this->set(offIter); ++offIter; }
+    }
+    /// Any other node: processes the entries selected by the node's value-off mask.
+    template<typename NodeT> void operator()(NodeT& node) const
+    {
+        typename NodeT::NodeMaskType offMask = node.getValueOffMask();
+        typename NodeT::ValueOnIter maskedIter(offMask.beginOn(), &node);
+        while (maskedIter) { this->set(maskedIter); ++maskedIter; }
+    }
+private:
+    /// Overwrites the value at @a iter if it approximately equals +/- the old background.
+    template<typename IterT> inline void set(IterT& iter) const
+    {
+        if (math::isApproxEqual(*iter, mOldValue)) { iter.setValue(mNewValue); return; }
+        if (math::isApproxEqual(*iter, math::negative(mOldValue))) {
+            iter.setValue(math::negative(mNewValue));
+        }
+    }
+    const ValueT mOldValue, mNewValue;
+};// ChangeBackgroundOp
+
+
+// Overwrites the background in a Tree assumed to represent a level set, without comparing
+// values against the old background; it is generally faster than ChangeBackgroundOp.
+// Note that it follows the sign-convention that outside is positive and inside is negative!
+template<typename TreeOrLeafManagerT>
+class ChangeLevelSetBackgroundOp
+{
+public:
+    typedef typename TreeOrLeafManagerT::ValueType    ValueT;
+    typedef typename TreeOrLeafManagerT::RootNodeType RootT;
+    typedef typename TreeOrLeafManagerT::LeafNodeType LeafT;
+
+    /// @throw ValueError if @a outside is negative or if @a inside is not negative
+    ChangeLevelSetBackgroundOp(const ValueT& outside, const ValueT& inside)
+        : mOutside(outside), mInside(inside)
+    {
+        if (math::isNegative(outside)) {
+            OPENVDB_THROW(ValueError,
+                          "ChangeLevelSetBackgroundOp: the outside value cannot be negative!");
+        }
+        if (!math::isNegative(inside)) {
+            OPENVDB_THROW(ValueError,
+                          "ChangeLevelSetBackgroundOp: the inside value must be negative!");
+        }
+    }
+    /// Root node: resets its inactive values, then stores the outside value as background.
+    void operator()(RootT& root) const
+    {
+        typename RootT::ValueOffIter offIter = root.beginValueOff();
+        while (offIter) { this->set(offIter); ++offIter; }
+        root.setBackground(mOutside, false);
+    }
+    /// Leaf node: resets every inactive value.
+    void operator()(LeafT& node) const
+    {
+        typename LeafT::ValueOffIter offIter = node.beginValueOff();
+        while (offIter) { this->set(offIter); ++offIter; }
+    }
+    /// Any other node: resets every entry whose bit in the node's child mask is off.
+    template<typename NodeT> void operator()(NodeT& node) const
+    {
+        typename NodeT::ValueOffIter tileIter(node.getChildMask().beginOff(), &node);
+        while (tileIter) { this->set(tileIter); ++tileIter; }
+    }
+private:
+    /// Stores the inside value over a negative entry and the outside value over any other.
+    template<typename IterT> inline void set(IterT& iter) const
+    {
+        ValueT& value = const_cast<ValueT&>(*iter); // safe since ValueType is_floating_point
+        if (value < 0) value = mInside; else value = mOutside;
+    }
+    const ValueT mOutside, mInside;
+};// ChangeLevelSetBackgroundOp
+
+
+// Implementation: builds a tree::NodeManager over the tree and applies a
+// ChangeBackgroundOp to its nodes through foreachTopDown.
+template<typename TreeOrLeafManagerT>
+inline void
+changeBackground(TreeOrLeafManagerT& tree,
+    const typename TreeOrLeafManagerT::ValueType& background,
+    bool threaded, size_t grainSize)
+{
+    tree::NodeManager<TreeOrLeafManagerT> nodes(tree);
+    ChangeBackgroundOp<TreeOrLeafManagerT> replaceOp(tree, background);
+    nodes.foreachTopDown(replaceOp, threaded, grainSize);
+}
+
+
+// Implementation: applies a ChangeLevelSetBackgroundOp (whose constructor validates
+// both values and may throw ValueError) to the tree's nodes through foreachTopDown.
+template<typename TreeOrLeafManagerT>
+inline void
+changeAsymmetricLevelSetBackground(TreeOrLeafManagerT& tree,
+    const typename TreeOrLeafManagerT::ValueType& outsideValue,
+    const typename TreeOrLeafManagerT::ValueType& insideValue,
+    bool threaded, size_t grainSize)
+{
+    tree::NodeManager<TreeOrLeafManagerT> nodes(tree);
+    ChangeLevelSetBackgroundOp<TreeOrLeafManagerT> levelSetOp(outsideValue, insideValue);
+    nodes.foreachTopDown(levelSetOp, threaded, grainSize);
+}
+
+
+// If the narrow-band is symmetric only one background value is required:
+// the inside value passed on is simply the negation of the outside value.
+template<typename TreeOrLeafManagerT>
+inline void
+changeLevelSetBackground(TreeOrLeafManagerT& tree,
+    const typename TreeOrLeafManagerT::ValueType& background,
+    bool threaded, size_t grainSize)
+{
+    typedef typename TreeOrLeafManagerT::ValueType ValueT;
+    const ValueT insideValue = math::negative(background);
+    changeAsymmetricLevelSetBackground(tree, background, insideValue, threaded, grainSize);
+}
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_ChangeBACKGROUND_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/Clip.h b/nuparu/include/openvdb_new/tools/Clip.h
new file mode 100644
index 00000000..c3f09d49
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/Clip.h
@@ -0,0 +1,413 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file Clip.h
+///
+/// @brief Functions to clip a grid against a bounding box or against
+/// another grid's active voxel topology
+
+#ifndef OPENVDB_TOOLS_CLIP_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_CLIP_HAS_BEEN_INCLUDED
+
+#include <openvdb/Types.h>// for ValueMask
+#include <openvdb/Grid.h>
+#include <openvdb/math/Math.h>// for isNegative
+#include <openvdb/tree/LeafManager.h>
+#include "GridTransformer.h" // for resampleToMatch()
+#include <boost/type_traits/is_same.hpp>
+#include <boost/type_traits/is_signed.hpp>
+#include <boost/utility/enable_if.hpp>
+#include <tbb/blocked_range.h>
+#include <tbb/parallel_reduce.h>
+#include "Prune.h"
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+/// @brief Clip the given grid against a world-space bounding box
+/// and return a new grid containing the result.
+/// @details The box is first transformed into the index space of @a grid.
+/// @warning Clipping a level set will likely produce a grid that is no longer a valid level set.
+template<typename GridType> OPENVDB_STATIC_SPECIALIZATION
+inline typename GridType::Ptr clip(const GridType& grid, const BBoxd&);
+
+/// @brief Clip a grid against the active voxels of another grid
+/// and return a new grid containing the result.
+/// @param grid  the grid to be clipped
+/// @param mask  a grid whose active voxels form a boolean clipping mask
+/// @details The mask grid need not have the same transform as the source grid: when the
+/// transforms differ, the mask is first resampled into the source grid's index space.
+/// Also, if the mask grid is a level set, consider using tools::sdfInteriorMask to construct
+/// a new mask comprising the interior (rather than the narrow band) of the level set.
+/// @warning Clipping a level set will likely produce a grid that is
+/// no longer a valid level set.
+template<typename GridType, typename MaskTreeType> OPENVDB_STATIC_SPECIALIZATION
+inline typename GridType::Ptr clip(const GridType& grid, const Grid<MaskTreeType>& mask);
+
+
+////////////////////////////////////////
+
+
+namespace clip_internal {
+
+
+////////////////////////////////////////
+
+
+/// @brief Leaf functor that sets each inactive value of the visited leaf active
+/// exactly when the reference tree's value at the same position is negative.
+template<typename TreeT>
+class MaskInteriorVoxels
+{
+public:
+    typedef typename TreeT::ValueType ValueT;
+    typedef typename TreeT::LeafNodeType LeafNodeT;
+
+    MaskInteriorVoxels(const TreeT& tree): mAcc(tree) {}
+
+    /// Leaves with no same-origin leaf in the reference tree are left unchanged.
+    template <typename LeafNodeType>
+    void operator()(LeafNodeType &leaf, size_t /*leafIndex*/) const
+    {
+        const LeafNodeT* const refLeaf = mAcc.probeConstLeaf(leaf.origin());
+        if (!refLeaf) return;
+        for (typename LeafNodeType::ValueOffIter it = leaf.beginValueOff(); it; ++it) {
+            const Index offset = it.pos();
+            leaf.setActiveState(offset, math::isNegative(refLeaf->getValue(offset)));
+        }
+    }
+private:
+    tree::ValueAccessor<const TreeT> mAcc;
+};
+
+
+////////////////////////////////////////
+
+
+/// @brief tbb::parallel_reduce body that, for every leaf of a mask leaf manager, copies
+/// the source tree's values at the mask's active positions into a newly built tree.
+template<typename TreeT>
+class CopyLeafNodes
+{
+public:
+    typedef typename TreeT::template ValueConverter<ValueMask>::Type MaskTreeT;
+    typedef tree::LeafManager<const MaskTreeT> MaskLeafManagerT;
+
+    CopyLeafNodes(const TreeT& tree, const MaskLeafManagerT& leafNodes);
+    void run(bool threaded = true); ///< process every mask leaf (in parallel if threaded)
+    typename TreeT::Ptr tree() const { return mNewTree; } ///< the tree being built
+
+    // Members used by tbb::parallel_reduce: splitting constructor, range body and join
+    CopyLeafNodes(CopyLeafNodes&, tbb::split);
+    void operator()(const tbb::blocked_range<size_t>&);
+    void join(const CopyLeafNodes& rhs) { mNewTree->merge(*rhs.mNewTree); }
+
+private:
+    const TreeT* mTree;                 // source tree whose values are copied
+    const MaskLeafManagerT* mLeafNodes; // mask leaves selecting what gets copied
+    typename TreeT::Ptr mNewTree;       // output tree of this body
+};
+
+
+/// Primary constructor: keeps pointers to the source tree and the mask leaves, and
+/// creates the output tree with the source tree's background value.
+template<typename TreeT>
+CopyLeafNodes<TreeT>::CopyLeafNodes(const TreeT& tree, const MaskLeafManagerT& leafNodes)
+    : mTree(&tree)
+    , mLeafNodes(&leafNodes)
+    , mNewTree(new TreeT(tree.background())) {}
+
+
+/// Splitting constructor (tbb::split): shares the inputs of @a rhs but starts its own
+/// output tree, again created with the source tree's background value.
+template<typename TreeT>
+CopyLeafNodes<TreeT>::CopyLeafNodes(CopyLeafNodes& rhs, tbb::split)
+    : mTree(rhs.mTree)
+    , mLeafNodes(rhs.mLeafNodes)
+    , mNewTree(new TreeT(rhs.mTree->background())) {}
+
+
+/// Visits every mask leaf: through tbb::parallel_reduce if @a threaded, else in the caller.
+template<typename TreeT>
+void CopyLeafNodes<TreeT>::run(bool threaded)
+{
+    if (!threaded) (*this)(mLeafNodes->getRange());
+    else tbb::parallel_reduce(mLeafNodes->getRange(), *this);
+}
+
+
+/// Range body: for each mask leaf in @a range, touches the matching leaf in the
+/// new tree and copies values/active states at the mask's active positions.
+template<typename TreeT>
+void CopyLeafNodes<TreeT>::operator()(const tbb::blocked_range<size_t>& range)
+{
+    typedef typename TreeT::LeafNodeType LeafT;
+    // Use this class's MaskTreeT (not the global MaskTree) so that the leaf type
+    // matches what mLeafNodes stores, whatever the configuration of TreeT.
+    typedef typename MaskTreeT::LeafNodeType MaskLeafT;
+    typename MaskLeafT::ValueOnCIter it;
+
+    tree::ValueAccessor<TreeT> acc(*mNewTree);
+    tree::ValueAccessor<const TreeT> refAcc(*mTree);
+
+    for (size_t n = range.begin(); n != range.end(); ++n) {
+        const MaskLeafT& maskLeaf = mLeafNodes->leaf(n);
+        const Coord& ijk = maskLeaf.origin();
+        const LeafT* refLeaf = refAcc.probeConstLeaf(ijk);
+        LeafT* newLeaf = acc.touchLeaf(ijk);
+
+        if (refLeaf) { // the source tree has a leaf here: copy values and states one by one
+            for (it = maskLeaf.cbeginValueOn(); it; ++it) {
+                const Index pos = it.pos();
+                newLeaf->setValueOnly(pos, refLeaf->getValue(pos));
+                newLeaf->setActiveState(pos, refLeaf->isValueOn(pos));
+            }
+        } else { // no source leaf: replicate the value/state probed at the leaf origin
+            typename TreeT::ValueType value;
+            bool isActive = refAcc.probeValue(ijk, value);
+            for (it = maskLeaf.cbeginValueOn(); it; ++it) {
+                const Index pos = it.pos();
+                newLeaf->setValueOnly(pos, value);
+                newLeaf->setActiveState(pos, isActive);
+            }
+        }
+    }
+}
+
+
+////////////////////////////////////////
+
+
+/// Sampler used to resample the clipping mask: probes the voxel containing the sample point.
+struct BoolSampler
+{
+    static const char* name() { return "bin"; }
+    static int radius() { return 2; }
+    static bool mipmap() { return false; }
+    static bool consistent() { return true; }
+    /// Floors @a inCoord componentwise and probes @a inTree at that coordinate.
+    /// @return what probeValue() returns; the probed value is stored in @a result
+    template<class TreeT>
+    static bool sample(const TreeT& inTree,
+        const Vec3R& inCoord, typename TreeT::ValueType& result)
+    {
+        Coord ijk;
+        for (int axis = 0; axis < 3; ++axis) ijk[axis] = int(std::floor(inCoord[axis]));
+        return inTree.probeValue(ijk, result);
+    }
+};
+
+
+////////////////////////////////////////
+
+
+// Functor converting a grid pointer of one type to a newly constructed grid of another type
+template<typename FromGridT, typename ToGridT>
+struct ConvertGrid
+{
+    typedef typename FromGridT::Ptr FromGridPtrT;
+    typedef typename ToGridT::Ptr ToGridPtrT;
+    ToGridPtrT operator()(const FromGridPtrT& grid)
+        { ToGridPtrT converted(new ToGridT(*grid)); return converted; }
+};
+
+// Partial specialization for equal grid types: returns the input pointer, so nothing is copied
+template<typename GridT>
+struct ConvertGrid<GridT, GridT>
+{
+    typedef typename GridT::Ptr GridPtrT;
+    GridPtrT operator()(const GridPtrT& grid) { return grid; }
+};
+
+
+////////////////////////////////////////
+
+
+// Convert a grid of arbitrary type (other than a mask grid) to a mask grid
+// and return a pointer to the new grid.
+template<typename GridT>
+inline typename boost::disable_if<boost::is_same<ValueMask, typename GridT::BuildType>,
+    typename GridT::template ValueConverter<ValueMask>::Type::Ptr>::type
+convertToMaskGrid(const GridT& grid)
+{
+    typedef typename GridT::template ValueConverter<ValueMask>::Type MaskGridT;
+    // Create a mask grid with the same tree configuration, union in the input
+    // grid's topology and hand it a copy of the input grid's transform.
+    typename MaskGridT::Ptr maskGrid = MaskGridT::create(/*background=*/false);
+    maskGrid->topologyUnion(grid);
+    maskGrid->setTransform(grid.constTransform().copy());
+    return maskGrid;
+}
+
+// Overload for inputs that already are mask grids (BuildType is ValueMask): no conversion
+template<typename GridT>
+inline typename boost::enable_if<boost::is_same<ValueMask, typename GridT::BuildType>,
+                                 typename GridT::Ptr>::type
+convertToMaskGrid(const GridT& grid)
+{
+    return grid.copy(); // shallow copy
+}
+
+
+////////////////////////////////////////
+
+/// Shared implementation of clip(): copy the part of @a grid selected by the mask grid @a aMask.
+template<typename GridType>
+inline typename GridType::Ptr
+doClip(const GridType& grid, const typename GridType::template ValueConverter<ValueMask>::Type& aMask)
+{
+    typedef typename GridType::TreeType TreeT;
+    typedef typename GridType::TreeType::template ValueConverter<ValueMask>::Type MaskTreeT;
+
+    const GridClass gridClass = grid.getGridClass();
+    const TreeT& tree = grid.tree();
+    // Seed the working mask with the topology of the source tree.
+    MaskTreeT mask(false);
+    mask.topologyUnion(tree);
+    // Level sets: derive mask states from the sign of the source values (negative => active).
+    if (gridClass == GRID_LEVEL_SET) {
+        tree::LeafManager<MaskTreeT> leafNodes(mask);
+        leafNodes.foreach(MaskInteriorVoxels<TreeT>(tree));
+        // Leaf values are handled above; the loop below handles the tile values.
+        tree::ValueAccessor<const TreeT> acc(tree);
+
+        typename MaskTreeT::ValueAllIter iter(mask);
+        iter.setMaxDepth(MaskTreeT::ValueAllIter::LEAF_DEPTH - 1);
+
+        for ( ; iter; ++iter) {
+            iter.setActiveState(math::isNegative(acc.getValue(iter.getCoord())));
+        }
+    }
+    // Keep only the part of the working mask that is also present in the clipping mask.
+    mask.topologyIntersection(aMask.constTree());
+
+    typename GridType::Ptr outGrid;
+    {
+        // Copy voxel values and states.
+        tree::LeafManager<const MaskTreeT> leafNodes(mask);
+        CopyLeafNodes<TreeT> maskOp(tree, leafNodes);
+        maskOp.run();
+        outGrid = GridType::create(maskOp.tree());
+    }
+    {
+        // Copy tile values and states.
+        tree::ValueAccessor<const TreeT> refAcc(tree);
+        tree::ValueAccessor<const MaskTreeT> maskAcc(mask);
+        // As above, the iterator is limited to depths above the leaf level.
+        typename TreeT::ValueAllIter it(outGrid->tree());
+        it.setMaxDepth(TreeT::ValueAllIter::LEAF_DEPTH - 1);
+        for ( ; it; ++it) {
+            Coord ijk = it.getCoord();
+            // Where the mask is on, take value and active state from the source tree.
+            if (maskAcc.isValueOn(ijk)) {
+                typename TreeT::ValueType value;
+                bool isActive = refAcc.probeValue(ijk, value);
+
+                it.setValue(value);
+                if (!isActive) it.setValueOff();
+            }
+        }
+    }
+    // Carry over the transform, and the grid class unless it is GRID_LEVEL_SET.
+    outGrid->setTransform(grid.transform().copy());
+    if (gridClass != GRID_LEVEL_SET) outGrid->setGridClass(gridClass);
+
+    return outGrid;
+}
+
+} // namespace clip_internal
+
+
+////////////////////////////////////////
+
+
+/// Clips @a grid against a world-space box by filling a mask grid over the box's index region.
+template<typename GridType>
+OPENVDB_STATIC_SPECIALIZATION
+inline typename GridType::Ptr
+clip(const GridType& grid, const BBoxd& bbox)
+{
+    typedef typename GridType::template ValueConverter<ValueMask>::Type MaskGridT;
+
+    // Map the world-space box to index-space bounds of the source grid.
+    Vec3d lower, upper;
+    math::calculateBounds(grid.constTransform(), bbox.min(), bbox.max(), lower, upper);
+    CoordBBox indexRegion(Coord::floor(lower), Coord::floor(upper));
+    // Build a mask grid that is true (and active) inside that region and false elsewhere.
+    MaskGridT boxMask(/*background=*/false);
+    boxMask.fill(indexRegion, /*value=*/true, /*active=*/true);
+
+    return clip_internal::doClip(grid, boxMask);
+}
+
+
+/// Clips @a grid1 against the active voxels of @a grid2; the mask is resampled into
+/// the index space of @a grid1 first if the two grids' transforms differ.
+template<typename GridType1, typename TreeType2>
+OPENVDB_STATIC_SPECIALIZATION
+inline typename GridType1::Ptr
+clip(const GridType1& grid1, const Grid<TreeType2>& grid2)
+{
+    typedef typename GridType1::template ValueConverter<ValueMask>::Type MaskGridT1;
+    typedef typename Grid<TreeType2>::template ValueConverter<ValueMask>::Type MaskGridT2;
+    typedef typename MaskGridT1::Ptr MaskGridPtrT1;
+    typedef typename MaskGridT2::Ptr MaskGridPtrT2;
+
+    // Obtain a mask grid that has the tree configuration of grid2.
+    MaskGridPtrT2 mask2 = clip_internal::convertToMaskGrid(grid2);
+
+    // If the transforms differ, resample that mask into the source grid's index space.
+    if (grid1.constTransform() != mask2->constTransform()) {
+        MaskGridPtrT2 resampled = MaskGridT2::create(/*background=*/false);
+        resampled->setTransform(grid1.constTransform().copy());
+        tools::resampleToMatch<clip_internal::BoolSampler>(*mask2, *resampled);
+        tools::prune(resampled->tree());
+        mask2 = resampled;
+    }
+
+    // Bring the mask to the tree configuration of the source grid,
+    // then clip the source grid against it.
+    clip_internal::ConvertGrid</*from=*/MaskGridT2, /*to=*/MaskGridT1> toSourceConfig;
+    MaskGridPtrT1 clipMask = toSourceConfig(mask2);
+
+    return clip_internal::doClip(grid1, *clipMask);
+}
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_CLIP_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/Composite.h b/nuparu/include/openvdb_new/tools/Composite.h
new file mode 100644
index 00000000..fe4f6fd3
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/Composite.h
@@ -0,0 +1,1107 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file Composite.h
+///
+/// @brief Functions to efficiently perform various compositing operations on grids
+///
+/// @authors Peter Cucka, Mihai Alden
+
+#ifndef OPENVDB_TOOLS_COMPOSITE_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_COMPOSITE_HAS_BEEN_INCLUDED
+
+#include <openvdb/Platform.h>
+#include <openvdb/Exceptions.h>
+#include <openvdb/Types.h>
+#include <openvdb/Grid.h>
+#include <openvdb/math/Math.h> // for isExactlyEqual()
+#include "ValueTransformer.h" // for transformValues()
+#include "Prune.h"// for prune
+#include "SignedFloodFill.h" // for signedFloodFill()
+#include <boost/utility/enable_if.hpp>
+
+#include <tbb/blocked_range.h>
+#include <tbb/parallel_for.h>
+#include <tbb/parallel_reduce.h>
+#include <tbb/task_group.h>
+#include <tbb/task_scheduler_init.h>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+/// @brief Given two level set grids, replace the A grid with the union of A and B.
+/// @throw ValueError if the background value of either grid is not greater than zero.
+/// @note This operation always leaves the B grid empty.
+template<typename GridOrTreeT> OPENVDB_STATIC_SPECIALIZATION
+inline void csgUnion(GridOrTreeT& a, GridOrTreeT& b, bool prune = true);
+/// @brief Given two level set grids, replace the A grid with the intersection of A and B.
+/// @throw ValueError if the background value of either grid is not greater than zero.
+/// @note This operation always leaves the B grid empty.
+template<typename GridOrTreeT> OPENVDB_STATIC_SPECIALIZATION
+inline void csgIntersection(GridOrTreeT& a, GridOrTreeT& b, bool prune = true);
+/// @brief Given two level set grids, replace the A grid with the difference A / B.
+/// @throw ValueError if the background value of either grid is not greater than zero.
+/// @note This operation always leaves the B grid empty.
+template<typename GridOrTreeT> OPENVDB_STATIC_SPECIALIZATION
+inline void csgDifference(GridOrTreeT& a, GridOrTreeT& b, bool prune = true);
+
+/// @brief  Threaded CSG union operation that produces a new grid or tree from
+///         immutable inputs.
+/// @return The CSG union of the @a a and @b b level set inputs.
+template<typename GridOrTreeT> OPENVDB_STATIC_SPECIALIZATION
+inline typename GridOrTreeT::Ptr csgUnionCopy(const GridOrTreeT& a, const GridOrTreeT& b);
+/// @brief  Threaded CSG intersection operation that produces a new grid or tree from
+///         immutable inputs.
+/// @return The CSG intersection of the @a a and @b b level set inputs.
+template<typename GridOrTreeT> OPENVDB_STATIC_SPECIALIZATION
+inline typename GridOrTreeT::Ptr csgIntersectionCopy(const GridOrTreeT& a, const GridOrTreeT& b);
+/// @brief  Threaded CSG difference operation that produces a new grid or tree from
+///         immutable inputs.
+/// @return The CSG difference of the @a a and @b b level set inputs.
+template<typename GridOrTreeT> OPENVDB_STATIC_SPECIALIZATION
+inline typename GridOrTreeT::Ptr csgDifferenceCopy(const GridOrTreeT& a, const GridOrTreeT& b);
+
+/// @brief Given grids A and B, compute max(a, b) per voxel (using sparse traversal).
+/// Store the result in the A grid and leave the B grid empty.
+template<typename GridOrTreeT> OPENVDB_STATIC_SPECIALIZATION
+inline void compMax(GridOrTreeT& a, GridOrTreeT& b);
+/// @brief Given grids A and B, compute min(a, b) per voxel (using sparse traversal).
+/// Store the result in the A grid and leave the B grid empty.
+template<typename GridOrTreeT> OPENVDB_STATIC_SPECIALIZATION
+inline void compMin(GridOrTreeT& a, GridOrTreeT& b);
+/// @brief Given grids A and B, compute a + b per voxel (using sparse traversal).
+/// Store the result in the A grid and leave the B grid empty.
+template<typename GridOrTreeT> OPENVDB_STATIC_SPECIALIZATION
+inline void compSum(GridOrTreeT& a, GridOrTreeT& b);
+/// @brief Given grids A and B, compute a * b per voxel (using sparse traversal).
+/// Store the result in the A grid and leave the B grid empty.
+template<typename GridOrTreeT> OPENVDB_STATIC_SPECIALIZATION
+inline void compMul(GridOrTreeT& a, GridOrTreeT& b);
+/// @brief Given grids A and B, compute a / b per voxel (using sparse traversal).
+/// Store the result in the A grid and leave the B grid empty.
+template<typename GridOrTreeT> OPENVDB_STATIC_SPECIALIZATION
+inline void compDiv(GridOrTreeT& a, GridOrTreeT& b);
+
+/// Copy the active voxels of B into A.
+template<typename GridOrTreeT> OPENVDB_STATIC_SPECIALIZATION
+inline void compReplace(GridOrTreeT& a, const GridOrTreeT& b);
+
+
+////////////////////////////////////////
+
+
+namespace composite {
+
+// composite::min() and composite::max() for non-vector types compare with operator<().
+template<typename T> inline
+const typename boost::disable_if_c<VecTraits<T>::IsVec, T>::type& // = T if T is not a vector type
+min(const T& a, const T& b) { return (b < a) ? b : a; } // same pick as std::min(a, b): a wins ties
+
+template<typename T> inline
+const typename boost::disable_if_c<VecTraits<T>::IsVec, T>::type& // = T if T is not a vector type
+max(const T& a, const T& b) { return (a < b) ? b : a; } // same pick as std::max(a, b): a wins ties
+
+
+// composite::min() and composite::max() for OpenVDB vector types compare by magnitude.
+template<typename T> inline
+const typename boost::enable_if_c<VecTraits<T>::IsVec, T>::type& // = T if T is a vector type
+min(const T& a, const T& b) {
+    const typename T::ValueType lenSqrA = a.lengthSqr(), lenSqrB = b.lengthSqr();
+    if (lenSqrA < lenSqrB) return a; // a has the strictly smaller squared length
+    return (lenSqrB < lenSqrA) ? b : std::min(a, b); // neither is smaller: defer to std::min
+}
+
+template<typename T> inline
+const typename boost::enable_if_c<VecTraits<T>::IsVec, T>::type& // = T if T is a vector type
+max(const T& a, const T& b) {
+    const typename T::ValueType lenSqrA = a.lengthSqr(), lenSqrB = b.lengthSqr();
+    if (lenSqrA < lenSqrB) return b; // b has the strictly larger squared length
+    return (lenSqrB < lenSqrA) ? a : std::max(a, b); // neither is larger: defer to std::max
+}
+
+
+/// Non-integer types: plain operator/ with no special handling of a zero divisor.
+template<typename T> inline
+typename boost::disable_if<boost::is_integral<T>, T>::type // = T if T is not an integer type
+divide(const T& a, const T& b) { return a / b; }
+/// Integer types: a zero divisor never divides; 0/0 gives 0, otherwise +max or -max by a's sign.
+template<typename T> inline
+typename boost::enable_if<boost::is_integral<T>, T>::type // = T if T is an integer type
+divide(const T& a, const T& b)
+{
+    if (b == T(0)) {
+        if (a == T(0)) return 0;
+        return (a > 0 ? std::numeric_limits<T>::max() : -std::numeric_limits<T>::max());
+    }
+    return a / b;
+}
+// Boolean division always yields a: if b is true, a / 1 = a; if b is false,
+// 1 / 0 = inf = MAX_BOOL = 1 = a and 0 / 0 = NaN = 0 = a.
+inline bool divide(bool a, bool /*b*/) { return a; }
+
+
+enum CSGOperation { CSG_UNION, CSG_INTERSECTION, CSG_DIFFERENCE }; // template selector for Build*Segment and doCSGCopy()
+
+template<typename TreeType, CSGOperation Operation>
+struct BuildPrimarySegment // task functor: builds the part of the CSG result driven by the LHS tree's nodes
+{
+    typedef typename TreeType::ValueType                                            ValueType;
+    typedef typename TreeType::Ptr                                                  TreePtrType;
+    typedef typename TreeType::LeafNodeType                                         LeafNodeType;
+    typedef typename LeafNodeType::NodeMaskType                                     NodeMaskType;
+    typedef typename TreeType::RootNodeType                                         RootNodeType;
+    typedef typename RootNodeType::NodeChainType                                    NodeChainType;
+    typedef typename boost::mpl::at<NodeChainType, boost::mpl::int_<1> >::type      InternalNodeType;
+    // NOTE(review): element 1 of the node chain is presumably the level just above the leaves; confirm.
+    BuildPrimarySegment(const TreeType& lhs, const TreeType& rhs)
+        : mSegment(new TreeType(lhs.background()))
+        , mLhsTree(&lhs)
+        , mRhsTree(&rhs)
+    {
+    }
+    /// Run both passes: classify the LHS internal nodes, then resolve the LHS leaf nodes they queued.
+    void operator()() const
+    {
+        std::vector<const LeafNodeType*> leafNodes;
+
+        {
+            std::vector<const InternalNodeType*> internalNodes;
+            mLhsTree->getNodes(internalNodes);
+
+            ProcessInternalNodes op(internalNodes, *mRhsTree, *mSegment, leafNodes);
+            tbb::parallel_reduce(tbb::blocked_range<size_t>(0, internalNodes.size()), op);
+        }
+
+        ProcessLeafNodes op(leafNodes, *mRhsTree, *mSegment);
+        tbb::parallel_reduce(tbb::blocked_range<size_t>(0, leafNodes.size()), op);
+    }
+
+    TreePtrType& segment() { return mSegment; } // the output tree that operator()() fills in
+
+private:
+    // First pass: parallel_reduce body over the LHS internal nodes.
+    struct ProcessInternalNodes {
+
+        ProcessInternalNodes(std::vector<const InternalNodeType*>& lhsNodes, const TreeType& rhsTree,
+            TreeType& outputTree, std::vector<const LeafNodeType*>& outputLeafNodes)
+            : mLhsNodes(lhsNodes.empty() ? NULL : &lhsNodes.front())
+            , mRhsTree(&rhsTree)
+            , mLocalTree(mRhsTree->background())
+            , mOutputTree(&outputTree)
+            , mLocalLeafNodes()
+            , mOutputLeafNodes(&outputLeafNodes)
+        {
+        }
+        // Splitting constructor: the new body writes to its own local tree and local leaf list.
+        ProcessInternalNodes(ProcessInternalNodes& other, tbb::split)
+            : mLhsNodes(other.mLhsNodes)
+            , mRhsTree(other.mRhsTree)
+            , mLocalTree(mRhsTree->background())
+            , mOutputTree(&mLocalTree)
+            , mLocalLeafNodes()
+            , mOutputLeafNodes(&mLocalLeafNodes)
+        {
+        }
+        // Merge another body's partial tree and queued leaf pointers into this body's outputs.
+        void join(ProcessInternalNodes& other)
+        {
+            mOutputTree->merge(*other.mOutputTree);
+            mOutputLeafNodes->insert(mOutputLeafNodes->end(),
+                other.mOutputLeafNodes->begin(), other.mOutputLeafNodes->end());
+        }
+
+        void operator()(const tbb::blocked_range<size_t>& range)
+        {
+            tree::ValueAccessor<const TreeType> rhsAcc(*mRhsTree);
+            tree::ValueAccessor<TreeType>       outputAcc(*mOutputTree);
+
+            std::vector<const LeafNodeType*> tmpLeafNodes;
+
+            for (size_t n = range.begin(), N = range.end(); n < N; ++n) {
+
+                const InternalNodeType& lhsNode = *mLhsNodes[n];
+                const Coord& ijk = lhsNode.origin();
+                const InternalNodeType * rhsNode = rhsAcc.template probeConstNode<InternalNodeType>(ijk);
+
+                if (rhsNode) { // RHS has a node here too: queue the LHS leaves for the leaf pass
+                    lhsNode.getNodes(*mOutputLeafNodes);
+                } else { // no RHS node at this origin: decide from the sign of the RHS value there
+                    if (Operation == CSG_INTERSECTION) {
+                        if (rhsAcc.getValue(ijk) < ValueType(0.0)) { // negative: copy every LHS leaf
+                            tmpLeafNodes.clear();
+                            lhsNode.getNodes(tmpLeafNodes);
+                            for (size_t i = 0, I = tmpLeafNodes.size(); i < I; ++i) {
+                                outputAcc.addLeaf(new LeafNodeType(*tmpLeafNodes[i]));
+                            }
+                        }
+                    } else { // Union & Difference
+                        if (!(rhsAcc.getValue(ijk) < ValueType(0.0))) { // not negative: copy every LHS leaf
+                            tmpLeafNodes.clear();
+                            lhsNode.getNodes(tmpLeafNodes);
+                            for (size_t i = 0, I = tmpLeafNodes.size(); i < I; ++i) {
+                                outputAcc.addLeaf(new LeafNodeType(*tmpLeafNodes[i]));
+                            }
+                        }
+                    }
+                }
+            } //  end range loop
+        }
+
+        InternalNodeType const * const * const mLhsNodes;
+        TreeType                 const * const mRhsTree;
+        TreeType                               mLocalTree;
+        TreeType                       * const mOutputTree;
+
+        std::vector<const LeafNodeType*>         mLocalLeafNodes;
+        std::vector<const LeafNodeType*> * const mOutputLeafNodes;
+    }; // struct ProcessInternalNodes
+    // Second pass: parallel_reduce body over the LHS leaf nodes queued by the first pass.
+    struct ProcessLeafNodes {
+
+        ProcessLeafNodes(std::vector<const LeafNodeType*>& lhsNodes, const TreeType& rhsTree, TreeType& output)
+            : mLhsNodes(lhsNodes.empty() ? NULL : &lhsNodes.front())
+            , mRhsTree(&rhsTree)
+            , mLocalTree(mRhsTree->background())
+            , mOutputTree(&output)
+        {
+        }
+        // Splitting constructor: the new body writes to its own local tree.
+        ProcessLeafNodes(ProcessLeafNodes& other, tbb::split)
+            : mLhsNodes(other.mLhsNodes)
+            , mRhsTree(other.mRhsTree)
+            , mLocalTree(mRhsTree->background())
+            , mOutputTree(&mLocalTree)
+        {
+        }
+        // Merge another body's partial tree into this body's output tree.
+        void join(ProcessLeafNodes& rhs) { mOutputTree->merge(*rhs.mOutputTree); }
+
+        void operator()(const tbb::blocked_range<size_t>& range)
+        {
+            tree::ValueAccessor<const TreeType> rhsAcc(*mRhsTree);
+            tree::ValueAccessor<TreeType>       outputAcc(*mOutputTree);
+
+            for (size_t n = range.begin(), N = range.end(); n < N; ++n) {
+
+                const LeafNodeType& lhsNode = *mLhsNodes[n];
+                const Coord& ijk = lhsNode.origin();
+
+                const LeafNodeType* rhsNodePt = rhsAcc.probeConstLeaf(ijk);
+
+                if (rhsNodePt) { // combine overlapping nodes
+
+                    LeafNodeType* outputNode = outputAcc.touchLeaf(ijk);
+                    ValueType * outputData = outputNode->buffer().data();
+                    NodeMaskType& outputMask = outputNode->getValueMask();
+
+                    const ValueType * lhsData = lhsNode.buffer().data();
+                    const NodeMaskType& lhsMask = lhsNode.getValueMask();
+
+                    const ValueType * rhsData = rhsNodePt->buffer().data();
+                    const NodeMaskType& rhsMask = rhsNodePt->getValueMask();
+
+                    if (Operation == CSG_INTERSECTION) { // per voxel: larger value, with its source's active state
+                        for (Index pos = 0; pos < LeafNodeType::SIZE; ++pos) {
+                            const bool fromRhs = lhsData[pos] < rhsData[pos];
+                            outputData[pos] = fromRhs ? rhsData[pos] : lhsData[pos];
+                            outputMask.set(pos, fromRhs ? rhsMask.isOn(pos) : lhsMask.isOn(pos));
+                        }
+                    } else if (Operation == CSG_DIFFERENCE){ // per voxel: larger of lhs and negated rhs
+                        for (Index pos = 0; pos < LeafNodeType::SIZE; ++pos) {
+                            const ValueType rhsVal = math::negative(rhsData[pos]);
+                            const bool fromRhs = lhsData[pos] < rhsVal;
+                            outputData[pos] = fromRhs ? rhsVal : lhsData[pos];
+                            outputMask.set(pos, fromRhs ? rhsMask.isOn(pos) : lhsMask.isOn(pos));
+                        }
+                    } else { // Union: per voxel, smaller value, with its source's active state
+                        for (Index pos = 0; pos < LeafNodeType::SIZE; ++pos) {
+                            const bool fromRhs = lhsData[pos] > rhsData[pos];
+                            outputData[pos] = fromRhs ? rhsData[pos] : lhsData[pos];
+                            outputMask.set(pos, fromRhs ? rhsMask.isOn(pos) : lhsMask.isOn(pos));
+                        }
+                    }
+
+                } else { // no RHS leaf here: copy or skip the whole LHS leaf by the sign of the RHS value
+                    if (Operation == CSG_INTERSECTION) {
+                        if (rhsAcc.getValue(ijk) < ValueType(0.0)) {
+                            outputAcc.addLeaf(new LeafNodeType(lhsNode));
+                        }
+                    } else { // Union & Difference
+                        if (!(rhsAcc.getValue(ijk) < ValueType(0.0))) {
+                            outputAcc.addLeaf(new LeafNodeType(lhsNode));
+                        }
+                    }
+                }
+            } //  end range loop
+        }
+
+        LeafNodeType const * const * const mLhsNodes;
+        TreeType             const * const mRhsTree;
+        TreeType                           mLocalTree;
+        TreeType                   * const mOutputTree;
+    }; // struct ProcessLeafNodes
+
+    TreePtrType               mSegment;
+    TreeType    const * const mLhsTree;
+    TreeType    const * const mRhsTree;
+}; // struct BuildPrimarySegment
+
+
+template<typename TreeType, CSGOperation Operation>
+struct BuildSecondarySegment // task functor: builds the part of the CSG result driven by the RHS tree's nodes
+{
+    typedef typename TreeType::ValueType                                            ValueType;
+    typedef typename TreeType::Ptr                                                  TreePtrType;
+    typedef typename TreeType::LeafNodeType                                         LeafNodeType;
+    typedef typename LeafNodeType::NodeMaskType                                     NodeMaskType;
+    typedef typename TreeType::RootNodeType                                         RootNodeType;
+    typedef typename RootNodeType::NodeChainType                                    NodeChainType;
+    typedef typename boost::mpl::at<NodeChainType, boost::mpl::int_<1> >::type      InternalNodeType;
+    // NOTE(review): element 1 of the node chain is presumably the level just above the leaves; confirm.
+    BuildSecondarySegment(const TreeType& lhs, const TreeType& rhs)
+        : mSegment(new TreeType(lhs.background()))
+        , mLhsTree(&lhs)
+        , mRhsTree(&rhs)
+    {
+    }
+    /// Run both passes: classify the RHS internal nodes, then resolve the RHS leaf nodes they queued.
+    void operator()() const
+    {
+        std::vector<const LeafNodeType*> leafNodes;
+
+        {
+            std::vector<const InternalNodeType*> internalNodes;
+            mRhsTree->getNodes(internalNodes);
+
+            ProcessInternalNodes op(internalNodes, *mLhsTree, *mSegment, leafNodes);
+            tbb::parallel_reduce(tbb::blocked_range<size_t>(0, internalNodes.size()), op);
+        }
+
+        ProcessLeafNodes op(leafNodes, *mLhsTree, *mSegment);
+        tbb::parallel_reduce(tbb::blocked_range<size_t>(0, leafNodes.size()), op);
+    }
+
+    TreePtrType& segment() { return mSegment; } // the output tree that operator()() fills in
+
+private:
+    // First pass: parallel_reduce body over the RHS internal nodes.
+    struct ProcessInternalNodes {
+
+        ProcessInternalNodes(std::vector<const InternalNodeType*>& rhsNodes, const TreeType& lhsTree,
+            TreeType& outputTree, std::vector<const LeafNodeType*>& outputLeafNodes)
+            : mRhsNodes(rhsNodes.empty() ? NULL : &rhsNodes.front())
+            , mLhsTree(&lhsTree)
+            , mLocalTree(mLhsTree->background())
+            , mOutputTree(&outputTree)
+            , mLocalLeafNodes()
+            , mOutputLeafNodes(&outputLeafNodes)
+        {
+        }
+        // Splitting constructor: the new body writes to its own local tree and local leaf list.
+        ProcessInternalNodes(ProcessInternalNodes& other, tbb::split)
+            : mRhsNodes(other.mRhsNodes)
+            , mLhsTree(other.mLhsTree)
+            , mLocalTree(mLhsTree->background())
+            , mOutputTree(&mLocalTree)
+            , mLocalLeafNodes()
+            , mOutputLeafNodes(&mLocalLeafNodes)
+        {
+        }
+        // Merge another body's partial tree and queued leaf pointers into this body's outputs.
+        void join(ProcessInternalNodes& other)
+        {
+            mOutputTree->merge(*other.mOutputTree);
+            mOutputLeafNodes->insert(mOutputLeafNodes->end(),
+                other.mOutputLeafNodes->begin(), other.mOutputLeafNodes->end());
+        }
+
+        void operator()(const tbb::blocked_range<size_t>& range)
+        {
+            tree::ValueAccessor<const TreeType> lhsAcc(*mLhsTree);
+            tree::ValueAccessor<TreeType>       outputAcc(*mOutputTree);
+
+            std::vector<const LeafNodeType*> tmpLeafNodes;
+
+            for (size_t n = range.begin(), N = range.end(); n < N; ++n) {
+
+                const InternalNodeType& rhsNode = *mRhsNodes[n];
+                const Coord& ijk = rhsNode.origin();
+                const InternalNodeType * lhsNode = lhsAcc.template probeConstNode<InternalNodeType>(ijk);
+
+                if (lhsNode) { // LHS has a node here too: queue the RHS leaves for the leaf pass
+                   rhsNode.getNodes(*mOutputLeafNodes);
+                } else { // no LHS node at this origin: decide from the sign of the LHS value there
+                    if (Operation == CSG_INTERSECTION) {
+                        if (lhsAcc.getValue(ijk) < ValueType(0.0)) { // negative: copy every RHS leaf
+                            tmpLeafNodes.clear();
+                            rhsNode.getNodes(tmpLeafNodes);
+                            for (size_t i = 0, I = tmpLeafNodes.size(); i < I; ++i) {
+                                outputAcc.addLeaf(new LeafNodeType(*tmpLeafNodes[i]));
+                            }
+                        }
+                    } else if (Operation == CSG_DIFFERENCE) {
+                        if (lhsAcc.getValue(ijk) < ValueType(0.0)) { // negative: copy every RHS leaf, negated
+                            tmpLeafNodes.clear();
+                            rhsNode.getNodes(tmpLeafNodes);
+                            for (size_t i = 0, I = tmpLeafNodes.size(); i < I; ++i) {
+                                LeafNodeType* outputNode = new LeafNodeType(*tmpLeafNodes[i]);
+                                outputNode->negate();
+                                outputAcc.addLeaf(outputNode);
+                            }
+                        }
+                    } else { // Union
+                        if (!(lhsAcc.getValue(ijk) < ValueType(0.0))) { // not negative: copy every RHS leaf
+                            tmpLeafNodes.clear();
+                            rhsNode.getNodes(tmpLeafNodes);
+                            for (size_t i = 0, I = tmpLeafNodes.size(); i < I; ++i) {
+                                outputAcc.addLeaf(new LeafNodeType(*tmpLeafNodes[i]));
+                            }
+                        }
+                    }
+                }
+            } //  end range loop
+        }
+
+        InternalNodeType const * const * const mRhsNodes;
+        TreeType                 const * const mLhsTree;
+        TreeType                               mLocalTree;
+        TreeType                       * const mOutputTree;
+
+        std::vector<const LeafNodeType*>         mLocalLeafNodes;
+        std::vector<const LeafNodeType*> * const mOutputLeafNodes;
+    }; // struct ProcessInternalNodes
+    // Second pass: parallel_reduce body over the RHS leaf nodes queued by the first pass.
+    struct ProcessLeafNodes {
+
+        ProcessLeafNodes(std::vector<const LeafNodeType*>& rhsNodes, const TreeType& lhsTree, TreeType& output)
+            : mRhsNodes(rhsNodes.empty() ? NULL : &rhsNodes.front())
+            , mLhsTree(&lhsTree)
+            , mLocalTree(mLhsTree->background())
+            , mOutputTree(&output)
+        {
+        }
+        // Splitting constructor: the new body writes to its own local tree.
+        ProcessLeafNodes(ProcessLeafNodes& rhs, tbb::split)
+            : mRhsNodes(rhs.mRhsNodes)
+            , mLhsTree(rhs.mLhsTree)
+            , mLocalTree(mLhsTree->background())
+            , mOutputTree(&mLocalTree)
+        {
+        }
+        // Merge another body's partial tree into this body's output tree.
+        void join(ProcessLeafNodes& rhs) { mOutputTree->merge(*rhs.mOutputTree); }
+
+        void operator()(const tbb::blocked_range<size_t>& range)
+        {
+            tree::ValueAccessor<const TreeType> lhsAcc(*mLhsTree);
+            tree::ValueAccessor<TreeType>       outputAcc(*mOutputTree);
+
+            for (size_t n = range.begin(), N = range.end(); n < N; ++n) {
+
+                const LeafNodeType& rhsNode = *mRhsNodes[n];
+                const Coord& ijk = rhsNode.origin();
+
+                const LeafNodeType* lhsNode = lhsAcc.probeConstLeaf(ijk);
+
+                if (!lhsNode) { // only RHS leaves with no LHS leaf at the same origin are handled here
+                    if (Operation == CSG_INTERSECTION) {
+                        if (lhsAcc.getValue(ijk) < ValueType(0.0)) {
+                            outputAcc.addLeaf(new LeafNodeType(rhsNode));
+                        }
+                    } else if (Operation == CSG_DIFFERENCE) {
+                        if (lhsAcc.getValue(ijk) < ValueType(0.0)) { // the RHS leaf copy is negated before insertion
+                            LeafNodeType* outputNode = new LeafNodeType(rhsNode);
+                            outputNode->negate();
+                            outputAcc.addLeaf(outputNode);
+                        }
+                    } else { // Union
+                        if (!(lhsAcc.getValue(ijk) < ValueType(0.0))) {
+                            outputAcc.addLeaf(new LeafNodeType(rhsNode));
+                        }
+                    }
+                }
+            } //  end range loop
+        }
+
+        LeafNodeType const * const * const mRhsNodes;
+        TreeType             const * const mLhsTree;
+        TreeType                           mLocalTree;
+        TreeType                   * const mOutputTree;
+    }; // struct ProcessLeafNodes
+
+    TreePtrType               mSegment;
+    TreeType    const * const mLhsTree;
+    TreeType    const * const mRhsTree;
+}; // struct BuildSecondarySegment
+
+
+template<CSGOperation Operation, typename TreeType>
+inline typename TreeType::Ptr
+doCSGCopy(const TreeType& lhs, const TreeType& rhs)
+{
+    BuildPrimarySegment<TreeType, Operation> lhsPass(lhs, rhs);
+    BuildSecondarySegment<TreeType, Operation> rhsPass(lhs, rhs);
+
+    // Build both segments as concurrent tasks; each one runs its own parallel_reduce internally.
+    tbb::task_group builders;
+    builders.run(lhsPass);
+    builders.run(rhsPass);
+    builders.wait();
+
+    typename TreeType::Ptr& result = lhsPass.segment();
+    result->merge(*rhsPass.segment());
+    // Leaf node (level 0) signs were set during segment construction, so start at level 1.
+    tools::signedFloodFill(*result, /*threaded=*/true, /*grainSize=*/1, /*minLevel=*/1);
+
+    return result;
+}
+
+
+////////////////////////////////////////
+
+
+/// Tree case: the supplied tree pointer is already the result, so it is returned unchanged.
+template<typename TreeType>
+struct GridOrTreeConstructor {
+    typedef typename TreeType::Ptr TreeTypePtr;
+    static TreeTypePtr construct(const TreeType&, TreeTypePtr& tree) { return tree; }
+};
+
+/// Grid case: wrap the tree in a new grid, then copy @a grid's transform and insert its metadata.
+template<typename TreeType>
+struct GridOrTreeConstructor<Grid<TreeType> >
+{
+    typedef Grid<TreeType>                  GridType;
+    typedef typename Grid<TreeType>::Ptr    GridTypePtr;
+    typedef typename TreeType::Ptr          TreeTypePtr;
+
+    static GridTypePtr construct(const GridType& grid, TreeTypePtr& tree)
+    {
+        GridTypePtr wrapped = GridType::create(tree);
+        wrapped->setTransform(grid.transform().copy());
+        wrapped->insertMeta(grid);
+        return wrapped;
+    }
+
+
+////////////////////////////////////////
+
+
+} // namespace composite
+
+
+template<typename GridOrTreeT>
+OPENVDB_STATIC_SPECIALIZATION inline void
+compMax(GridOrTreeT& aTree, GridOrTreeT& bTree)
+{
+    typedef TreeAdapter<GridOrTreeT>                 AdapterT;
+    typedef typename AdapterT::TreeType::ValueType   ElemT;
+    // Per-value functor handed to combineExtended(): result = composite::max(a, b).
+    struct MaxOp {
+        static inline void apply(CombineArgs<ElemT>& pair) {
+            pair.setResult(composite::max(pair.a(), pair.b()));
+        }
+    };
+    AdapterT::tree(aTree).combineExtended(AdapterT::tree(bTree), MaxOp::apply, /*prune=*/false);
+}
+
+
+template<typename GridOrTreeT>
+OPENVDB_STATIC_SPECIALIZATION inline void
+compMin(GridOrTreeT& aTree, GridOrTreeT& bTree)
+{
+    typedef TreeAdapter<GridOrTreeT>                 AdapterT;
+    typedef typename AdapterT::TreeType::ValueType   ElemT;
+    // Per-value functor handed to combineExtended(): result = composite::min(a, b).
+    struct MinOp {
+        static inline void apply(CombineArgs<ElemT>& pair) {
+            pair.setResult(composite::min(pair.a(), pair.b()));
+        }
+    };
+    AdapterT::tree(aTree).combineExtended(AdapterT::tree(bTree), MinOp::apply, /*prune=*/false);
+}
+
+
+template<typename GridOrTreeT>
+OPENVDB_STATIC_SPECIALIZATION inline void
+compSum(GridOrTreeT& aTree, GridOrTreeT& bTree)
+{
+    typedef TreeAdapter<GridOrTreeT> AdapterT;
+    typedef typename AdapterT::TreeType::ValueType ElemT;
+    struct SumOp { // per-value functor handed to combineExtended(): result = a + b
+        static inline void apply(CombineArgs<ElemT>& pair) {
+            pair.setResult(pair.a() + pair.b());
+        }
+    };
+    AdapterT::tree(aTree).combineExtended(AdapterT::tree(bTree), SumOp::apply, /*prune=*/false);
+}
+
+
+template<typename GridOrTreeT>
+OPENVDB_STATIC_SPECIALIZATION inline void
+compMul(GridOrTreeT& aTree, GridOrTreeT& bTree)
+{
+    typedef TreeAdapter<GridOrTreeT> AdapterT;
+    typedef typename AdapterT::TreeType::ValueType ElemT;
+    struct MulOp { // per-value functor handed to combineExtended(): result = a * b
+        static inline void apply(CombineArgs<ElemT>& pair) {
+            pair.setResult(pair.a() * pair.b());
+        }
+    };
+    AdapterT::tree(aTree).combineExtended(AdapterT::tree(bTree), MulOp::apply, /*prune=*/false);
+}
+
+
+template<typename GridOrTreeT>
+OPENVDB_STATIC_SPECIALIZATION inline void
+compDiv(GridOrTreeT& aTree, GridOrTreeT& bTree)
+{
+    typedef TreeAdapter<GridOrTreeT> AdapterT;
+    typedef typename AdapterT::TreeType::ValueType ElemT;
+    struct DivOp { // per-value functor handed to combineExtended(): result = composite::divide(a, b)
+        static inline void apply(CombineArgs<ElemT>& pair) {
+            pair.setResult(composite::divide(pair.a(), pair.b()));
+        }
+    };
+    AdapterT::tree(aTree).combineExtended(AdapterT::tree(bTree), DivOp::apply, /*prune=*/false);
+}
+
+
+////////////////////////////////////////
+
+
+template<typename TreeT>
+struct CompReplaceOp
+{
+    TreeT* const aTree; ///< destination tree that both overloads write into
+
+    CompReplaceOp(TreeT& _aTree): aTree(&_aTree) {}
+
+    /// Fill the bounding box of the value at @a iter in the destination tree with that value.
+    /// @note fill operation is not thread safe
+    void operator()(const typename TreeT::ValueOnCIter& iter) const
+    {
+        CoordBBox tileBounds;
+        iter.getBoundingBox(tileBounds);
+        aTree->fill(tileBounds, *iter);
+    }
+    /// Copy every active value of the leaf at @a leafIter into the destination via a local accessor.
+    void operator()(const typename TreeT::LeafCIter& leafIter) const
+    {
+        typedef typename TreeT::LeafCIter::LeafNodeT::ValueOnCIter VoxelIterT;
+        tree::ValueAccessor<TreeT> writer(*aTree);
+        for (VoxelIterT voxel = leafIter->cbeginValueOn(); voxel; ++voxel) {
+            writer.setValue(voxel.getCoord(), *voxel);
+        }
+    }
+};
+
+
+template<typename GridOrTreeT>
+OPENVDB_STATIC_SPECIALIZATION inline void
+compReplace(GridOrTreeT& aTree, const GridOrTreeT& bTree)
+{
+    typedef TreeAdapter<GridOrTreeT> Adapter;
+    typedef typename Adapter::TreeType TreeT;
+    typedef typename TreeT::ValueOnCIter ValueOnCIterT;
+    // Both arguments are always reached through the adapter, so grids and trees take one path.
+    // Copy active states (but not values) from B to A.
+    Adapter::tree(aTree).topologyUnion(Adapter::tree(bTree));
+
+    CompReplaceOp<TreeT> op(Adapter::tree(aTree));
+
+    // Copy all active tile values from B to A (serially: the tile overload of op calls fill()).
+    ValueOnCIterT iter = Adapter::tree(bTree).cbeginValueOn();
+    iter.setMaxDepth(iter.getLeafDepth() - 1); // don't descend into leaf nodes
+    foreach(iter, op, /*threaded=*/false);
+
+    // Copy all active voxel values from B to A.
+    foreach(Adapter::tree(bTree).cbeginLeaf(), op);
+}
+
+
+////////////////////////////////////////
+
+
+/// Base visitor class for CSG operations
+/// (not intended to be used polymorphically, so no virtual functions)
+template<typename TreeType>
+class CsgVisitorBase
+{
+public:
+    typedef TreeType TreeT;
+    typedef typename TreeT::ValueType ValueT;
+    typedef typename TreeT::LeafNodeType::ChildAllIter ChildIterT;
+    // Value that derived visitors return to stop traversing the current branch.
+    enum { STOP = 3 };
+    /// @throw ValueError unless each tree's background is > 0 and its negation is < 0.
+    CsgVisitorBase(const TreeT& aTree, const TreeT& bTree):
+        mAOutside(aTree.background()),
+        mAInside(math::negative(mAOutside)),
+        mBOutside(bTree.background()),
+        mBInside(math::negative(mBOutside))
+    {
+        const ValueT zero = zeroVal<ValueT>();
+        if (!(mAOutside > zero)) {
+            OPENVDB_THROW(ValueError,
+                "expected grid A outside value > 0, got " << mAOutside);
+        }
+        if (!(mAInside < zero)) {
+            OPENVDB_THROW(ValueError,
+                "expected grid A inside value < 0, got " << mAInside);
+        }
+        if (!(mBOutside > zero)) {
+            OPENVDB_THROW(ValueError,
+                "expected grid B outside value > 0, got " << mBOutside);
+        }
+        if (!(mBInside < zero)) { // report the inside value, which is what this check tests
+            OPENVDB_THROW(ValueError,
+                "expected grid B inside value < 0, got " << mBInside);
+        }
+    }
+
+protected:
+    ValueT mAOutside, mAInside, mBOutside, mBInside; // each tree's background and its negation
+};
+
+
+////////////////////////////////////////
+
+
+template<typename TreeType>
+struct CsgUnionVisitor: public CsgVisitorBase<TreeType>
+{
+    typedef TreeType TreeT;
+    typedef typename TreeT::ValueType ValueT;
+    typedef typename TreeT::LeafNodeType::ChildAllIter ChildIterT;
+
+    enum { STOP = CsgVisitorBase<TreeT>::STOP };
+
+    CsgUnionVisitor(const TreeT& a, const TreeT& b): CsgVisitorBase<TreeT>(a, b) {}
+
+    /// Iterators of two different types (nodes at different tree levels): do nothing.
+    template<typename AIterT, typename BIterT>
+    inline int operator()(AIterT&, BIterT&) { return 0; }
+
+    /// Root and internal nodes: settle tile cases here, descend only into child pairs.
+    template<typename IterT>
+    inline int operator()(IterT& aIter, IterT& bIter)
+    {
+        typedef typename IterT::ChildNodeType ChildT;
+        const ValueT zero = zeroVal<ValueT>();
+
+        ValueT aTile = zero;
+        ChildT* aChild = aIter.probeChild(aTile);
+        // An inside tile in A is left untouched; nothing below it needs visiting.
+        if (!aChild && aTile < zero) return STOP;
+
+        ValueT bTile = zero;
+        ChildT* bChild = bIter.probeChild(bTile);
+        if (!bChild && bTile < zero) {
+            // An inside tile in B turns A's entry into an inside tile as well.
+            aIter.setValue(this->mAInside);
+            aIter.setValueOn(bIter.isValueOn());
+            delete aChild;
+            return STOP;
+        }
+
+        if (!aChild && aTile > zero) {
+            // A is an outside tile: take over B's child when B has one.
+            if (bChild) {
+                bIter.setValue(this->mBOutside);
+                bIter.setValueOff();
+                bChild->resetBackground(this->mBOutside, this->mAOutside);
+                aIter.setChild(bChild); // the child node now hangs off A
+                delete aChild; // no-op: aChild is null in this branch
+            }
+            return STOP;
+        }
+
+        // Traversal continues only where A and B both have a child node.
+        if (aChild && bChild) return 0;
+        return STOP;
+    }
+
+    /// Leaf node values: keep the smaller of the two values in A.
+    inline int operator()(ChildIterT& aIter, ChildIterT& bIter)
+    {
+        ValueT aVoxel, bVoxel;
+        aIter.probeValue(aVoxel);
+        bIter.probeValue(bVoxel);
+        if (aVoxel > bVoxel) { // a = min(a, b); B's active state travels with its value
+            aIter.setValue(bVoxel);
+            aIter.setValueOn(bIter.isValueOn());
+        }
+        return 0;
+    }
+};
+
+
+
+////////////////////////////////////////
+
+
+template<typename TreeType>
+struct CsgIntersectVisitor: public CsgVisitorBase<TreeType>
+{
+    typedef TreeType TreeT;
+    typedef typename TreeT::ValueType ValueT;
+    typedef typename TreeT::LeafNodeType::ChildAllIter ChildIterT;
+
+    enum { STOP = CsgVisitorBase<TreeT>::STOP };
+
+    CsgIntersectVisitor(const TreeT& a, const TreeT& b): CsgVisitorBase<TreeT>(a, b) {}
+
+    /// Don't process nodes that are at different tree levels (iterator types differ).
+    template<typename AIterT, typename BIterT>
+    inline int operator()(AIterT&, BIterT&) { return 0; }
+
+    /// Process root and internal nodes; returns 0 to keep descending or STOP to end the branch.
+    template<typename IterT>
+    inline int operator()(IterT& aIter, IterT& bIter)
+    {
+        ValueT aValue = zeroVal<ValueT>();
+        typename IterT::ChildNodeType* aChild = aIter.probeChild(aValue);
+        if (!aChild && !(aValue < zeroVal<ValueT>())) {
+            // A is a non-inside (value >= 0) tile.  Leave it alone and stop traversing this branch.
+            return STOP;
+        }
+
+        ValueT bValue = zeroVal<ValueT>();
+        typename IterT::ChildNodeType* bChild = bIter.probeChild(bValue);
+        if (!bChild && !(bValue < zeroVal<ValueT>())) {
+            // B is an outside tile.  Make A an outside tile and stop traversing this branch.
+            aIter.setValue(this->mAOutside);
+            aIter.setValueOn(bIter.isValueOn());
+            delete aChild;
+            return STOP;
+        }
+
+        if (!aChild && aValue < zeroVal<ValueT>()) {
+            // A is an inside tile.  If B has a child, transfer it to A,
+            // otherwise leave A alone.
+            if (bChild) {
+                bIter.setValue(this->mBOutside);
+                bIter.setValueOff();
+                bChild->resetBackground(this->mBOutside, this->mAOutside);
+                aIter.setChild(bChild); // transfer child
+                delete aChild; // no-op: aChild is null in this branch
+            }
+            return STOP;
+        }
+
+        // If A has a child and B is an inside tile, stop traversing this branch.
+        // Continue traversal only if A and B both have children.
+        return (aChild && bChild) ? 0 : STOP;
+    }
+
+    /// Process leaf node values: A keeps the larger of the two values.
+    inline int operator()(ChildIterT& aIter, ChildIterT& bIter)
+    {
+        ValueT aValue, bValue;
+        aIter.probeValue(aValue);
+        bIter.probeValue(bValue);
+        if (aValue < bValue) { // a = max(a, b)
+            aIter.setValue(bValue);
+            aIter.setValueOn(bIter.isValueOn());
+        }
+        return 0;
+    }
+};
+
+
+////////////////////////////////////////
+
+
+template<typename TreeType>
+struct CsgDiffVisitor: public CsgVisitorBase<TreeType>
+{
+    typedef TreeType TreeT;
+    typedef typename TreeT::ValueType ValueT;
+    typedef typename TreeT::LeafNodeType::ChildAllIter ChildIterT;
+
+    enum { STOP = CsgVisitorBase<TreeT>::STOP };
+
+    CsgDiffVisitor(const TreeT& a, const TreeT& b): CsgVisitorBase<TreeT>(a, b) {}
+
+    /// Don't process nodes that are at different tree levels (iterator types differ).
+    template<typename AIterT, typename BIterT>
+    inline int operator()(AIterT&, BIterT&) { return 0; }
+
+    /// Process root and internal nodes; returns 0 to keep descending or STOP to end the branch.
+    template<typename IterT>
+    inline int operator()(IterT& aIter, IterT& bIter)
+    {
+        ValueT aValue = zeroVal<ValueT>();
+        typename IterT::ChildNodeType* aChild = aIter.probeChild(aValue);
+        if (!aChild && !(aValue < zeroVal<ValueT>())) {
+            // A is a non-inside (value >= 0) tile.  Leave it alone and stop traversing this branch.
+            return STOP;
+        }
+
+        ValueT bValue = zeroVal<ValueT>();
+        typename IterT::ChildNodeType* bChild = bIter.probeChild(bValue);
+        if (!bChild && bValue < zeroVal<ValueT>()) {
+            // B is an inside tile.  Make A an outside tile and stop traversing this branch.
+            aIter.setValue(this->mAOutside);
+            aIter.setValueOn(bIter.isValueOn());
+            delete aChild;
+            return STOP;
+        }
+
+        if (!aChild && aValue < zeroVal<ValueT>()) {
+            // A is an inside tile.  If B has a child, transfer it to A and
+            // negate it, otherwise leave A alone.
+            if (bChild) {
+                bIter.setValue(this->mBOutside);
+                bIter.setValueOff();
+                bChild->resetBackground(this->mBOutside, this->mAOutside);
+                aIter.setChild(bChild); // transfer child
+                bChild->negate(); // the transferred child is negated after the hand-over
+                delete aChild; // no-op: aChild is null in this branch
+            }
+            return STOP;
+        }
+
+        // If A has a child and B is an outside tile, stop traversing this branch.
+        // Continue traversal only if A and B both have children.
+        return (aChild && bChild) ? 0 : STOP;
+    }
+
+    /// Process leaf node values: A keeps the larger of its own value and B's negated value.
+    inline int operator()(ChildIterT& aIter, ChildIterT& bIter)
+    {
+        ValueT aValue, bValue;
+        aIter.probeValue(aValue);
+        bIter.probeValue(bValue);
+        bValue = math::negative(bValue);
+        if (aValue < bValue) { // a = max(a, -b)
+            aIter.setValue(bValue);
+            aIter.setValueOn(bIter.isValueOn());
+        }
+        return 0;
+    }
+};
+
+
+////////////////////////////////////////
+
+
+template<typename GridOrTreeT>
+OPENVDB_STATIC_SPECIALIZATION inline void
+csgUnion(GridOrTreeT& a, GridOrTreeT& b, bool prune)
+{
+    typedef typename TreeAdapter<GridOrTreeT>::TreeType TreeT;
+    TreeT& treeA = TreeAdapter<GridOrTreeT>::tree(a);
+    TreeT& treeB = TreeAdapter<GridOrTreeT>::tree(b);
+    CsgUnionVisitor<TreeT> unionOp(treeA, treeB);
+    treeA.visit2(treeB, unionOp); // the visitor may modify both A and B
+    if (prune) { tools::pruneLevelSet(treeA); } // optional pruning of the result in A
+}
+
+template<typename GridOrTreeT>
+OPENVDB_STATIC_SPECIALIZATION inline void
+csgIntersection(GridOrTreeT& a, GridOrTreeT& b, bool prune)
+{
+    typedef typename TreeAdapter<GridOrTreeT>::TreeType TreeT;
+    TreeT& treeA = TreeAdapter<GridOrTreeT>::tree(a);
+    TreeT& treeB = TreeAdapter<GridOrTreeT>::tree(b);
+    CsgIntersectVisitor<TreeT> intersectOp(treeA, treeB);
+    treeA.visit2(treeB, intersectOp); // the visitor may modify both A and B
+    if (prune) { tools::pruneLevelSet(treeA); } // optional pruning of the result in A
+}
+
+template<typename GridOrTreeT>
+OPENVDB_STATIC_SPECIALIZATION inline void
+csgDifference(GridOrTreeT& a, GridOrTreeT& b, bool prune)
+{
+    typedef typename TreeAdapter<GridOrTreeT>::TreeType TreeT;
+    TreeT& treeA = TreeAdapter<GridOrTreeT>::tree(a);
+    TreeT& treeB = TreeAdapter<GridOrTreeT>::tree(b);
+    CsgDiffVisitor<TreeT> differenceOp(treeA, treeB);
+    treeA.visit2(treeB, differenceOp); // the visitor may modify both A and B
+    if (prune) { tools::pruneLevelSet(treeA); } // optional pruning of the result in A
+}
+
+
+template<typename GridOrTreeT>
+OPENVDB_STATIC_SPECIALIZATION inline typename GridOrTreeT::Ptr
+csgUnionCopy(const GridOrTreeT& a, const GridOrTreeT& b)
+{
+    typedef TreeAdapter<GridOrTreeT> AdapterT;
+    typedef composite::GridOrTreeConstructor<GridOrTreeT> ResultBuilderT;
+
+    // Run the union on the underlying trees, then build the return object from A and the result.
+    typename AdapterT::TreeType::Ptr unionTree =
+        composite::doCSGCopy<composite::CSG_UNION>(AdapterT::tree(a), AdapterT::tree(b));
+    return ResultBuilderT::construct(a, unionTree);
+}
+
+
+template<typename GridOrTreeT>
+OPENVDB_STATIC_SPECIALIZATION inline typename GridOrTreeT::Ptr
+csgIntersectionCopy(const GridOrTreeT& a, const GridOrTreeT& b)
+{
+    typedef TreeAdapter<GridOrTreeT> AdapterT;
+    typedef composite::GridOrTreeConstructor<GridOrTreeT> ResultBuilderT;
+
+    // Run the intersection on the underlying trees, then build the return object from A and the result.
+    typename AdapterT::TreeType::Ptr intersectionTree =
+        composite::doCSGCopy<composite::CSG_INTERSECTION>(AdapterT::tree(a), AdapterT::tree(b));
+    return ResultBuilderT::construct(a, intersectionTree);
+}
+
+
+template<typename GridOrTreeT>
+OPENVDB_STATIC_SPECIALIZATION inline typename GridOrTreeT::Ptr
+csgDifferenceCopy(const GridOrTreeT& a, const GridOrTreeT& b)
+{
+    typedef TreeAdapter<GridOrTreeT> AdapterT;
+    typedef composite::GridOrTreeConstructor<GridOrTreeT> ResultBuilderT;
+
+    // Run the difference on the underlying trees, then build the return object from A and the result.
+    typename AdapterT::TreeType::Ptr differenceTree =
+        composite::doCSGCopy<composite::CSG_DIFFERENCE>(AdapterT::tree(a), AdapterT::tree(b));
+    return ResultBuilderT::construct(a, differenceTree);
+}
+
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_COMPOSITE_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/Dense.h b/nuparu/include/openvdb_new/tools/Dense.h
new file mode 100644
index 00000000..e4c091ec
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/Dense.h
@@ -0,0 +1,608 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file Dense.h
+///
+/// @brief This file defines a simple dense grid and efficient
+/// converters to and from VDB grids.
+
+#ifndef OPENVDB_TOOLS_DENSE_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_DENSE_HAS_BEEN_INCLUDED
+
+#include <openvdb/Types.h>
+#include <openvdb/Grid.h>
+#include <openvdb/tree/ValueAccessor.h>
+#include <openvdb/Exceptions.h>
+#include <openvdb/util/Formats.h>
+#include <tbb/parallel_for.h>
+#include <boost/scoped_array.hpp>
+#include <boost/scoped_ptr.hpp>
+#include "Prune.h"
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+/// @brief Populate a dense grid with the values of voxels from a sparse grid,
+/// where the sparse grid intersects the dense grid.
+/// @param sparse  an OpenVDB grid or tree from which to copy values
+/// @param dense   the dense grid into which to copy values
+/// @param serial  if false (the default), process voxels in parallel
+template<typename DenseT, typename GridOrTreeT>
+void
+copyToDense(
+    const GridOrTreeT& sparse,
+    DenseT& dense,
+    bool serial = false);
+
+
+/// @brief Populate a sparse grid with the values of all of the voxels of a dense grid.
+/// @param dense      the dense grid from which to copy values
+/// @param sparse     an OpenVDB grid or tree into which to copy values
+/// @param tolerance  values in the dense grid that are within this tolerance of the sparse
+///     grid's background value become inactive background voxels or tiles in the sparse grid
+/// @param serial     if false (the default), process voxels in parallel
+template<typename DenseT, typename GridOrTreeT>
+void
+copyFromDense(
+    const DenseT& dense,
+    GridOrTreeT& sparse,
+    const typename GridOrTreeT::ValueType& tolerance,
+    bool serial = false);
+
+
+////////////////////////////////////////
+
+/// @brief The two supported 3D memory layouts for dense volumes.
+/// @details LayoutXYZ means x is the fastest moving index; LayoutZYX means z
+/// is the fastest moving index. The ZYX memory layout leads to nested
+/// for-loops of the order x, y, z, which we find to be the most
+/// intuitive. Hence, ZYX is the layout used throughout VDB. However,
+/// other data structures, e.g. Houdini and Maya, employ the XYZ
+/// layout. Clearly a dense volume with the ZYX layout converts more
+/// efficiently to a VDB, but we support both for convenience.
+enum MemoryLayout { LayoutXYZ, LayoutZYX };
+
+/// @brief Base class for Dense (defined below); only declared here, with one
+/// partial specialization per MemoryLayout value following.
+/// @note The specializations' constructors are protected to prevent direct instantiation.
+template<typename ValueT, MemoryLayout Layout> class DenseBase;
+
+/// @brief Partial template specialization of DenseBase for the ZYX layout.
+/// @note ZYX is the memory-layout in VDB. It leads to nested
+/// for-loops of the order x, y, z which we find to be the most intuitive.
+template<typename ValueT>
+class DenseBase<ValueT, LayoutZYX>
+{
+public:
+    /// @brief Return the linear offset into this grid's value array given by
+    /// unsigned coordinates (i, j, k), i.e., coordinates relative to
+    /// the origin of this grid's bounding box.
+    ///
+    /// @warning The input coordinates are assumed to be relative to
+    /// the grid's origin, i.e. minimum of its index bounding box!
+    inline size_t coordToOffset(size_t i, size_t j, size_t k) const { return i*mX + j*mY + k; }
+
+    /// @brief Return the local coordinate corresponding to the specified linear offset.
+    ///
+    /// @warning The returned coordinate is relative to the origin of this
+    /// grid's bounding box so add dense.origin() to get absolute coordinates.
+    inline Coord offsetToLocalCoord(size_t n) const
+    {
+      const size_t x = n / mX; // number of whole x-strides in the offset
+      n -= mX*x; // remainder within that x-slab
+      const size_t y = n / mY; // number of whole y-strides in the remainder
+      return Coord(Coord::ValueType(x), Coord::ValueType(y), Coord::ValueType(n - mY*y));
+    }
+
+    /// @brief Return the stride of the array in the x direction ( = dimY*dimZ).
+    /// @note This method is required by both CopyToDense and CopyFromDense.
+    inline size_t xStride() const { return mX; }
+
+    /// @brief Return the stride of the array in the y direction ( = dimZ).
+    /// @note This method is required by both CopyToDense and CopyFromDense.
+    inline size_t yStride() const { return mY; }
+
+    /// @brief Return the stride of the array in the z direction ( = 1).
+    /// @note This method is required by both CopyToDense and CopyFromDense.
+    static size_t zStride() { return 1; }
+
+protected:
+    /// Protected constructor so as to prevent direct instantiation; derives both strides from @a bbox.
+    DenseBase(const CoordBBox& bbox) : mBBox(bbox), mY(bbox.dim()[2]), mX(mY*bbox.dim()[1]) {}
+
+    const CoordBBox mBBox;//signed coordinates of the domain represented by the grid
+    const size_t mY, mX;//strides in the y and x direction (mY is declared first: mX's initializer reads it)
+};// end of DenseBase<ValueT, LayoutZYX>
+
+/// @brief Partial template specialization of DenseBase for the XYZ layout.
+/// @note This is the memory-layout employed in Houdini and Maya. It leads
+/// to nested for-loops of the order z, y, x.
+template<typename ValueT>
+class DenseBase<ValueT, LayoutXYZ>
+{
+public:
+    /// @brief Return the linear offset into this grid's value array given by
+    /// unsigned coordinates (i, j, k), i.e., coordinates relative to
+    /// the origin of this grid's bounding box.
+    ///
+    /// @warning The input coordinates are assumed to be relative to
+    /// the grid's origin, i.e. minimum of its index bounding box!
+    inline size_t coordToOffset(size_t i, size_t j, size_t k) const { return i + j*mY + k*mZ; }
+
+    /// @brief Return the local coordinate corresponding to the specified linear offset.
+    ///
+    /// @warning The returned coordinate is relative to the origin of this
+    /// grid's bounding box so add dense.origin() to get absolute coordinates.
+    inline Coord offsetToLocalCoord(size_t n) const
+    {
+        const size_t z = n / mZ; // number of whole z-strides in the offset
+        n -= mZ*z; // remainder within that z-slab
+        const size_t y = n / mY; // number of whole y-strides in the remainder
+        return Coord(Coord::ValueType(n - mY*y), Coord::ValueType(y), Coord::ValueType(z));
+    }
+
+    /// @brief Return the stride of the array in the x direction ( = 1).
+    /// @note This method is required by both CopyToDense and CopyFromDense.
+    static size_t xStride() { return 1; }
+
+    /// @brief Return the stride of the array in the y direction ( = dimX).
+    /// @note This method is required by both CopyToDense and CopyFromDense.
+    inline size_t yStride() const { return mY; }
+
+    /// @brief Return the stride of the array in the z direction ( = dimX*dimY).
+    /// @note This method is required by both CopyToDense and CopyFromDense.
+    inline size_t zStride() const { return mZ; }
+
+protected:
+    /// Protected constructor so as to prevent direct instantiation; derives both strides from @a bbox.
+    DenseBase(const CoordBBox& bbox) : mBBox(bbox), mY(bbox.dim()[0]), mZ(mY*bbox.dim()[1]) {}
+
+    const CoordBBox mBBox;//signed coordinates of the domain represented by the grid
+    const size_t mY, mZ;//strides in the y and z direction (mY is declared first: mZ's initializer reads it)
+};// end of DenseBase<ValueT, LayoutXYZ>
+
+/// @brief Dense is a simple dense grid API used by the CopyToDense and
+/// CopyFromDense classes defined below.
+/// @details Use the Dense class to efficiently produce a dense in-memory
+/// representation of an OpenVDB grid.  However, be aware that a dense grid
+/// could have a memory footprint that is orders of magnitude larger than
+/// the sparse grid from which it originates.
+///
+/// @note This class can be used as a simple wrapper for existing dense grid
+/// classes if they provide access to the raw data array.
+/// @note This implementation allows for the 3D memory layout to be
+/// defined by the MemoryLayout template parameter (see above for definition).
+/// The default memory layout is ZYX since that's the layout used by OpenVDB grids.
+template<typename ValueT, MemoryLayout Layout = LayoutZYX>
+class Dense : public DenseBase<ValueT, Layout>
+{
+public:
+    typedef ValueT ValueType;
+    typedef DenseBase<ValueT, Layout> BaseT;
+    typedef boost::shared_ptr<Dense> Ptr;
+    typedef boost::shared_ptr<const Dense> ConstPtr;
+
+    /// @brief Construct a dense grid with a given range of coordinates.
+    ///
+    /// @param bbox  the bounding box of the (signed) coordinate range of this grid
+    /// @throw ValueError if the bounding box is empty.
+    /// @note The min and max coordinates of the bounding box are inclusive.
+    Dense(const CoordBBox& bbox) : BaseT(bbox) { this->init(); }
+
+    /// @brief Construct a dense grid with a given range of coordinates and initial value
+    ///
+    /// @param bbox  the bounding box of the (signed) coordinate range of this grid
+    /// @param value the initial value of the grid.
+    /// @throw ValueError if the bounding box is empty.
+    /// @note The min and max coordinates of the bounding box are inclusive.
+    Dense(const CoordBBox& bbox, const ValueT& value) : BaseT(bbox)
+    {
+        this->init();
+        this->fill(value);
+    }
+
+    /// @brief Construct a dense grid that wraps an external array.
+    ///
+    /// @param bbox  the bounding box of the (signed) coordinate range of this grid
+    /// @param data  a raw C-style array whose size is commensurate with
+    ///     the coordinate domain of @a bbox; it is stored as-is and not copied
+    /// @note The data array is assumed to follow the memory layout selected by @c Layout,
+    ///     i.e. a stride of one in @e z for LayoutZYX and in @e x for LayoutXYZ.
+    /// @throw ValueError if the bounding box is empty.
+    /// @note The min and max coordinates of the bounding box are inclusive.
+    Dense(const CoordBBox& bbox, ValueT* data) : BaseT(bbox), mData(data)
+    {
+        if (BaseT::mBBox.empty()) {
+            OPENVDB_THROW(ValueError, "can't construct a dense grid with an empty bounding box");
+        }
+    }
+
+    /// @brief Construct a dense grid with a given origin and dimensions.
+    ///
+    /// @param dim  the desired dimensions of the grid
+    /// @param min  the signed coordinates of the first voxel in the dense grid
+    /// @throw ValueError if the resulting bounding box is empty (see init()).
+    /// @note The @a min coordinate is inclusive, and the max coordinate will be
+    /// @a min + @a dim - 1.
+    Dense(const Coord& dim, const Coord& min = Coord(0))
+        : BaseT(CoordBBox(min, min+dim.offsetBy(-1)))
+    {
+        this->init();
+    }
+
+    /// @brief Return the memory layout for this grid (see above for definitions).
+    static MemoryLayout memoryLayout() { return Layout; }
+
+    /// @brief Return a raw pointer to this grid's value array.
+    /// @note This method is required by CopyToDense.
+    inline ValueT* data() { return mData; }
+
+    /// @brief Return a raw pointer to this grid's value array.
+    /// @note This method is required by CopyFromDense.
+    inline const ValueT* data() const { return mData; }
+
+    /// @brief Return the bounding box of the signed index domain of this grid.
+    /// @note This method is required by both CopyToDense and CopyFromDense.
+    inline const CoordBBox& bbox() const { return BaseT::mBBox; }
+
+    /// @brief Return the grid's origin (the minimum of its bounding box) in index coordinates.
+    inline const Coord& origin() const { return BaseT::mBBox.min(); }
+
+    /// @brief Return the number of voxels contained in this grid.
+    inline Index64 valueCount() const { return BaseT::mBBox.volume(); }
+
+    /// @brief Set the value of the voxel at the given array offset.
+    inline void setValue(size_t offset, const ValueT& value) { mData[offset] = value; }
+
+    /// @brief Return a const reference to the value of the voxel at the given array offset.
+    const ValueT& getValue(size_t offset) const { return mData[offset]; }
+
+    /// @brief Return a non-const reference to the value of the voxel at the given array offset.
+    ValueT& getValue(size_t offset) { return mData[offset]; }
+
+    /// @brief Set the value of the voxel at unsigned index coordinates (i, j, k).
+    /// @note This is somewhat slower than using an array offset.
+    inline void setValue(size_t i, size_t j, size_t k, const ValueT& value)
+    {
+        mData[BaseT::coordToOffset(i,j,k)] = value;
+    }
+
+    /// @brief Return a const reference to the value of the voxel at unsigned (i, j, k).
+    /// @note This is somewhat slower than using an array offset.
+    inline const ValueT& getValue(size_t i, size_t j, size_t k) const
+    {
+        return mData[BaseT::coordToOffset(i,j,k)];
+    }
+
+    /// @brief Return a non-const reference to the value of the voxel at unsigned (i, j, k).
+    /// @note This is somewhat slower than using an array offset.
+    inline ValueT& getValue(size_t i, size_t j, size_t k)
+    {
+        return mData[BaseT::coordToOffset(i,j,k)];
+    }
+
+    /// @brief Set the value of the voxel at the given signed coordinates.
+    /// @note This is slower than using either an array offset or unsigned index coordinates.
+    inline void setValue(const Coord& xyz, const ValueT& value)
+    {
+        mData[this->coordToOffset(xyz)] = value;
+    }
+
+    /// @brief Return a const reference to the value of the voxel at the given signed coordinates.
+    /// @note This is slower than using either an array offset or unsigned index coordinates.
+    inline const ValueT& getValue(const Coord& xyz) const
+    {
+        return mData[this->coordToOffset(xyz)];
+    }
+
+    /// @brief Return a non-const reference to the value of the voxel at the given signed coordinates.
+    /// @note This is slower than using either an array offset or unsigned index coordinates.
+    inline ValueT& getValue(const Coord& xyz)
+    {
+        return mData[this->coordToOffset(xyz)];
+    }
+
+    /// @brief Fill this grid with a constant value.
+    inline void fill(const ValueT& value)
+    {
+        size_t size = this->valueCount();
+        ValueT* a = mData;
+        while(size--) *a++ = value; // assign every element of the value array in order
+    }
+
+    /// @brief Return the linear offset into this grid's value array given by
+    /// the specified signed coordinates, i.e., coordinates in the space of
+    /// this grid's bounding box.
+    ///
+    /// @note The offset is computed by BaseT::coordToOffset and therefore follows the
+    /// memory layout selected by @c Layout. Bounds are checked only by an assert.
+    inline size_t coordToOffset(const Coord& xyz) const
+    {
+        assert(BaseT::mBBox.isInside(xyz));
+        return BaseT::coordToOffset(size_t(xyz[0]-BaseT::mBBox.min()[0]),
+                                    size_t(xyz[1]-BaseT::mBBox.min()[1]),
+                                    size_t(xyz[2]-BaseT::mBBox.min()[2]));
+    }
+
+    /// @brief Return the global coordinate corresponding to the specified linear offset.
+    inline Coord offsetToCoord(size_t n) const
+    {
+      return this->offsetToLocalCoord(n) + BaseT::mBBox.min();
+    }
+
+    /// @brief Return sizeof(*this) plus the size in bytes of the value array (owned or external).
+    inline Index64 memUsage() const
+    {
+        return sizeof(*this) + BaseT::mBBox.volume() * sizeof(ValueType);
+    }
+
+    /// @brief Output a human-readable description of this grid to the
+    /// specified stream.
+    void print(const std::string& name = "", std::ostream& os = std::cout) const
+    {
+        const Coord dim = BaseT::mBBox.dim();
+        os << "Dense Grid";
+        if (!name.empty()) os << " \"" << name << "\"";
+        util::printBytes(os, this->memUsage(), ":\n  Memory footprint:     ");
+        os << "  Dimensions of grid  :   " << dim[0] << " x " << dim[1] << " x " << dim[2] << "\n";
+        os << "  Number of voxels:       " << util::formattedInt(this->valueCount()) << "\n";
+        os << "  Bounding box of voxels: " << BaseT::mBBox << "\n";
+        os << "  Memory layout:          " << (Layout == LayoutZYX ? "ZYX (" : "XYZ (dis")
+           << "similar to VDB)\n";        
+    }
+    
+private:
+
+    /// @brief Private method to allocate the owned value array (values are not initialized here).
+    void init()
+    {
+        if (BaseT::mBBox.empty()) {
+            OPENVDB_THROW(ValueError, "can't construct a dense grid with an empty bounding box");
+        }
+        mArray.reset(new ValueT[BaseT::mBBox.volume()]);
+        mData = mArray.get();
+    }
+
+    boost::scoped_array<ValueT> mArray;//owned storage; stays empty when wrapping an external array
+    ValueT* mData;//raw c-style pointer to values (mArray's storage or the external array)
+};// end of Dense
+
+////////////////////////////////////////
+
+
+/// @brief Copy an OpenVDB tree into an existing dense grid.
+///
+/// @note Values are copied only where the OpenVDB tree intersects the dense
+/// grid's bounding box.  Active and inactive voxels are both copied, so all
+/// existing values in the dense grid are overwritten, regardless of the
+/// OpenVDB tree's topology.
+template<typename _TreeT, typename _DenseT = Dense<typename _TreeT::ValueType> >
+class CopyToDense
+{
+public:
+    typedef _DenseT                      DenseT;
+    typedef _TreeT                       TreeT;
+    typedef typename TreeT::ValueType    ValueT;
+
+    /// @brief Bind the source @a tree (through its root node) and the target @a dense grid.
+    CopyToDense(const TreeT& tree, DenseT& dense)
+        : mRoot(&(tree.root())), mDense(&dense) {}
+
+    /// @brief Copy over the dense grid's whole bounding box.
+    /// @param serial  if true, copy with one direct call; otherwise use tbb::parallel_for
+    void copy(bool serial = false) const
+    {
+        const CoordBBox& region = mDense->bbox();
+        // Serial path: a single direct call covering the whole box.
+        if (serial) { (*this)(region); return; }
+        // Parallel path: tbb::parallel_for invokes operator() on boxes it derives from region.
+        tbb::parallel_for(region, *this);
+    }
+
+    /// @brief Copy the box @a bbox; public because it is called by tbb::parallel_for.
+    void operator()(const CoordBBox& bbox) const { mRoot->copyToDense(bbox, *mDense); }
+
+private:
+    const typename TreeT::RootNodeType* mRoot; // root node of the source tree
+    DenseT* mDense; // destination dense grid, held by pointer
+};// CopyToDense
+
+
+// Convenience wrapper: builds a CopyToDense functor for the given grid or
+// tree and dense grid, then runs it immediately.
+template<typename DenseT, typename GridOrTreeT>
+void
+copyToDense(const GridOrTreeT& sparse, DenseT& dense, bool serial)
+{
+    typedef TreeAdapter<GridOrTreeT> AdapterT;
+    typedef CopyToDense<typename AdapterT::TreeType, DenseT> CopierT;
+    const CopierT copier(AdapterT::constTree(sparse), dense);
+    copier.copy(serial);
+}
+
+
+////////////////////////////////////////
+
+
+/// @brief Copy the values from a dense grid into an OpenVDB tree.
+///
+/// @details Values in the dense grid that are within a tolerance of
+/// the background value are truncated to inactive background voxels or tiles.
+/// This allows the tree to form a sparse representation of the dense grid.
+///
+/// The work is done in three stages by copy(): the dense bounding box is cut
+/// into leaf-aligned blocks, each block is converted into either a leaf node
+/// or a constant tile (optionally in parallel), and finally the leaves and
+/// active tiles are inserted into the tree, which is then pruned.
+///
+/// @note Since this class allocates leaf nodes concurrently it is recommended
+/// to use a scalable implementation of @c new like the one provided by TBB,
+/// rather than the mutex-protected standard library @c new.
+///
+/// @note Leaf nodes that have been built but not yet handed to the tree are
+/// released if an exception interrupts copy().
+template<typename _TreeT, typename _DenseT = Dense<typename _TreeT::ValueType> >
+class CopyFromDense
+{
+public:
+    typedef _DenseT                      DenseT;
+    typedef _TreeT                       TreeT;
+    typedef typename TreeT::ValueType    ValueT;
+    typedef typename TreeT::LeafNodeType LeafT;
+    typedef tree::ValueAccessor<TreeT>   AccessorT;
+
+    /// @param dense      source of the values (only read)
+    /// @param tree       destination tree (modified by copy())
+    /// @param tolerance  passed to the leaf copy, the constant-leaf test and the final prune
+    ///
+    /// An accessor into @a tree is created only when the tree is not empty;
+    /// a NULL accessor tells operator() that there is no existing data to merge with.
+    CopyFromDense(const DenseT& dense, TreeT& tree, const ValueT& tolerance)
+        : mDense(&dense),
+          mTree(&tree),
+          mBlocks(NULL),
+          mTolerance(tolerance),
+          mAccessor(tree.empty() ? NULL : new AccessorT(tree))
+    {
+    }
+
+    /// @brief Copy constructor (used when tbb::parallel_for copies the body).
+    /// The block list is shared with @a other, but every copy gets its own
+    /// accessor (mTree is initialized before mAccessor, so *mTree is valid here).
+    CopyFromDense(const CopyFromDense& other)
+        : mDense(other.mDense),
+          mTree(other.mTree),
+          mBlocks(other.mBlocks),
+          mTolerance(other.mTolerance),
+          mAccessor(other.mAccessor.get() == NULL ? NULL : new AccessorT(*mTree))
+    {
+    }
+
+    /// @brief Copy values from the dense grid to the sparse tree.
+    /// @param serial  if true the blocks are processed on the calling thread,
+    ///                otherwise through tbb::parallel_for
+    void copy(bool serial = false)
+    {
+        // The block list lives on this stack frame. The guard publishes it through
+        // mBlocks for the duration of the call and, on every exit path (including
+        // exceptions), deletes leaves that were never handed to the tree and
+        // resets mBlocks to NULL.
+        std::vector<Block> blocks;
+        BlockListGuard guard(mBlocks, blocks);
+
+        const CoordBBox& bbox = mDense->bbox();
+        // Pre-process: Construct a list of blocks aligned with (potential) leaf nodes
+        for (CoordBBox sub=bbox; sub.min()[0] <= bbox.max()[0]; sub.min()[0] = sub.max()[0] + 1) {
+            for (sub.min()[1] = bbox.min()[1]; sub.min()[1] <= bbox.max()[1];
+                 sub.min()[1] = sub.max()[1] + 1)
+            {
+                for (sub.min()[2] = bbox.min()[2]; sub.min()[2] <= bbox.max()[2];
+                     sub.min()[2] = sub.max()[2] + 1)
+                {
+                    // Upper corner: end of the leaf containing sub.min(), clamped to the grid
+                    sub.max() = Coord::minComponent(bbox.max(),
+                        (sub.min()&(~(LeafT::DIM-1u))).offsetBy(LeafT::DIM-1u));
+                    blocks.push_back(Block(sub));
+                }
+            }
+        }
+
+        // Multi-threaded process: Convert dense grid into leaf nodes and tiles
+        if (serial) {
+            (*this)(tbb::blocked_range<size_t>(0, blocks.size()));
+        } else {
+            tbb::parallel_for(tbb::blocked_range<size_t>(0, blocks.size()), *this);
+        }
+
+        // Post-process: Insert leaf nodes and tiles into the tree, and prune the tiles only!
+        tree::ValueAccessor<TreeT> acc(*mTree);
+        for (size_t m=0, size = blocks.size(); m<size; ++m) {
+            Block& block = blocks[m];
+            if (block.leaf) {
+                // Detach the leaf from the block before inserting it, so the guard
+                // can never delete a node the tree may already have adopted.
+                LeafT* leaf = block.leaf;
+                block.leaf = NULL;
+                acc.addLeaf(leaf);
+            } else if (block.tile.second) {//only background tiles are inactive
+                acc.addTile(1, block.bbox.min(), block.tile.first, true);//leaf tile
+            }
+        }
+
+        tools::pruneTiles(*mTree, mTolerance);//multi-threaded
+    }
+
+    /// @brief Public method called by tbb::parallel_for
+    /// @warning Never call this method directly!
+    ///
+    /// @details For every block in @a r a scratch leaf is seeded with the data
+    /// already present in the target tree (or with the background if the tree
+    /// was empty), overwritten with the dense values, and then either attached
+    /// to the block (non-constant) or summarized in the block's tile entry.
+    void operator()(const tbb::blocked_range<size_t> &r) const
+    {
+        assert(mBlocks);
+        LeafT* leaf = new LeafT();
+
+        try {
+            for (size_t m=r.begin(), end = r.end(); m != end; ++m) {
+
+                Block& block = (*mBlocks)[m];
+                const CoordBBox &bbox = block.bbox;
+
+                if (mAccessor.get() == NULL) {//i.e. empty target tree
+                    leaf->fill(mTree->background(), false);
+                } else {//account for existing leaf nodes in the target tree
+                    if (const LeafT* target = mAccessor->probeConstLeaf(bbox.min())) {
+                        (*leaf) = (*target);
+                    } else {
+                        ValueT value = zeroVal<ValueT>();
+                        bool state = mAccessor->probeValue(bbox.min(), value);
+                        leaf->fill(value, state);
+                    }
+                }
+
+                leaf->copyFromDense(bbox, *mDense, mTree->background(), mTolerance);
+
+                if (!leaf->isConstant(block.tile.first, block.tile.second, mTolerance)) {
+                    leaf->setOrigin(bbox.min() & (~(LeafT::DIM - 1)));
+                    block.leaf = leaf;
+                    // The block owns that node now; clear the local pointer first so
+                    // the handler below cannot delete it if the allocation throws.
+                    leaf = NULL;
+                    leaf = new LeafT();
+                }
+            }// loop over blocks
+        } catch (...) {
+            delete leaf;
+            throw;
+        }
+
+        delete leaf;
+    }
+
+private:
+    /// One leaf-aligned piece of the dense bounding box and its conversion result:
+    /// either a non-constant @c leaf, or (leaf == NULL) a constant @c tile
+    /// given as (value, active state).
+    struct Block {
+        CoordBBox               bbox;
+        LeafT*                  leaf;
+        std::pair<ValueT, bool> tile;
+        Block(const CoordBBox& b) : bbox(b), leaf(NULL) {}
+    };
+
+    /// Scope guard used by copy(): publishes a block list through the given
+    /// pointer slot and, on destruction, deletes any leaves still attached to
+    /// the blocks and resets the slot to NULL.
+    struct BlockListGuard {
+        BlockListGuard(std::vector<Block>*& slot, std::vector<Block>& blocks)
+            : mSlot(slot), mList(blocks)
+        {
+            mSlot = &mList;
+        }
+        ~BlockListGuard()
+        {
+            for (size_t m = 0, size = mList.size(); m < size; ++m) {
+                delete mList[m].leaf;
+            }
+            mSlot = NULL;
+        }
+    private:
+        BlockListGuard(const BlockListGuard&);            // not copyable
+        BlockListGuard& operator=(const BlockListGuard&); // not assignable
+
+        std::vector<Block>*& mSlot;
+        std::vector<Block>&  mList;
+    };
+
+    const DenseT*                mDense;     // source grid (not owned)
+    TreeT*                       mTree;      // destination tree (not owned)
+    std::vector<Block>*          mBlocks;    // non-NULL only while copy() is running
+    ValueT                       mTolerance;
+    boost::scoped_ptr<AccessorT> mAccessor;  // NULL if the tree was empty at construction
+};// CopyFromDense
+
+
+// Free-function front end for the CopyFromDense class: resolves the tree type
+// of @a sparse (a grid or a tree) through TreeAdapter and runs the copy.
+template<typename DenseT, typename GridOrTreeT>
+void
+copyFromDense(const DenseT& dense, GridOrTreeT& sparse,
+    const typename GridOrTreeT::ValueType& tolerance, bool serial)
+{
+    typedef typename TreeAdapter<GridOrTreeT>::TreeType SparseTreeT;
+    typedef CopyFromDense<SparseTreeT, DenseT>          CopierT;
+
+    CopierT copier(dense, TreeAdapter<GridOrTreeT>::tree(sparse), tolerance);
+    copier.copy(serial);
+}
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_DENSE_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/DenseSparseTools.h b/nuparu/include/openvdb_new/tools/DenseSparseTools.h
new file mode 100644
index 00000000..c8ec5e75
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/DenseSparseTools.h
@@ -0,0 +1,1259 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_TOOLS_DENSESPARSETOOLS_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_DENSESPARSETOOLS_HAS_BEEN_INCLUDED
+
+#include <tbb/parallel_reduce.h>
+#include <tbb/blocked_range3d.h>
+#include <tbb/blocked_range2d.h>
+#include <tbb/blocked_range.h>
+#include <openvdb/Types.h>
+#include <openvdb/tree/LeafManager.h>
+#include "Dense.h"
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+/// @brief Selectively extract and transform data from a dense grid, producing a
+/// sparse tree with leaf nodes only (e.g. create a tree from the square
+/// of values greater than a cutoff.)
+/// @param dense       A dense grid that acts as a data source
+/// @param functor     A functor that selects and transforms data for output
+/// @param background  The background value of the resulting sparse grid
+/// @param threaded    Option to use threaded or serial code path
+/// @return @c Ptr to tree with the valuetype and configuration defined
+/// by typedefs in the @c functor.
+/// @note To achieve optimal sparsity consider calling the prune()
+/// method on the result.
+/// @note To simply copy all the data from a Dense grid to an
+/// OpenVDB Grid, use tools::copyFromDense() for better performance.
+///
+/// The type of the sparse tree is determined by the specified OpType
+/// functor by means of the typedef OpType::ResultTreeType
+///
+/// The OpType functor is responsible for the transformation of
+/// dense grid data to sparse grid data on a per-voxel basis.
+///
+/// Only leaf nodes with active values will be added to the sparse grid.
+///
+/// The OpType must be a struct with the following minimal form
+/// @code
+/// struct ExampleOp
+/// {
+///     typedef DesiredTreeType   ResultTreeType;
+///
+///     template<typename IndexOrCoord>
+///      void OpType::operator() (const DenseValueType a, const IndexOrCoord& ijk,
+///                    ResultTreeType::LeafNodeType* leaf);
+/// };
+/// @endcode
+///
+/// @note The signature of this function also names @c OpType::ResultValueType
+/// (the type of @c background), so a functor passed here must provide that
+/// typedef as well, as the @c Rule example below does.
+///
+/// For example, to generate a <ValueType, 5, 4, 3> tree with valuesOn
+/// at locations greater than a given maskvalue
+/// @code
+/// template <typename ValueType>
+/// class Rule
+/// {
+/// public:
+///     // Standard tree type (e.g. MaskTree or FloatTree in openvdb.h)
+///     typedef typename openvdb::tree::Tree4<ValueType, 5, 4, 3>::Type  ResultTreeType;
+///
+///     typedef typename ResultTreeType::LeafNodeType  ResultLeafNodeType;
+///     typedef typename ResultTreeType::ValueType     ResultValueType;
+///
+///     typedef float                         DenseValueType;
+///
+///     typedef vdbmath::Coord::ValueType     Index;
+///
+///     Rule(const DenseValueType& value): mMaskValue(value){};
+///
+///     template <typename IndexOrCoord>
+///     void operator()(const DenseValueType& a, const IndexOrCoord& offset,
+///                 ResultLeafNodeType* leaf) const
+///     {
+///             if (a > mMaskValue) {
+///                 leaf->setValueOn(offset, a);
+///             }
+///     }
+///
+/// private:
+///     const DenseValueType mMaskValue;
+/// };
+/// @endcode
+template<typename OpType, typename DenseType>
+typename OpType::ResultTreeType::Ptr
+extractSparseTree(const DenseType& dense, const OpType& functor,
+                  const typename OpType::ResultValueType& background,
+                  bool threaded = true);
+
+/// @brief Helper that aids template resolution of a new tree type.
+/// @c Type has the same configuration as @c TreeType, but takes its
+/// value type from @c DenseType (via TreeType's @c ValueConverter).
+template <typename DenseType, typename TreeType> struct DSConverter {
+    /// Value type stored in the dense grid
+    typedef typename DenseType::ValueType  ValueType;
+
+    /// TreeType rebound to hold ValueType
+    typedef typename TreeType::template ValueConverter<ValueType>::Type Type;
+};
+
+
+/// @brief Copy data from the intersection of a sparse tree and a dense input grid.
+/// The resulting tree has the same configuration as the sparse tree, but holds
+/// the data type specified by the dense input.
+/// @tparam DenseType     Type of the dense grid; supplies the result's value type
+/// @tparam MaskTreeType  Type of the mask tree; supplies the result's configuration
+/// @param dense       A dense grid that acts as a data source
+/// @param mask        The active voxels and tiles intersected with dense define iteration mask
+/// @param background  The background value of the resulting sparse grid
+/// @param threaded    Option to use threaded or serial code path
+/// @return @c Ptr to tree with the same configuration as @c mask but of value type
+/// defined by @c dense (i.e. @c DSConverter<DenseType, MaskTreeType>::Type).
+template<typename DenseType, typename MaskTreeType>
+typename DSConverter<DenseType, MaskTreeType>::Type::Ptr
+extractSparseTreeWithMask(const DenseType& dense,
+                          const MaskTreeType& mask,
+                          const typename DenseType::ValueType& background,
+                          bool threaded = true);
+
+
+/// @brief Apply a point-wise functor to the intersection of a dense grid and a
+/// given bounding box
+/// @param dense A dense grid to be transformed
+/// @param bbox  Index space bounding box, define region where the transformation is applied
+/// @param op    A functor that acts on the dense grid value type
+/// @param parallel Used to select multithreaded or single threaded
+/// Minimally, the @c op class has to support a @c operator() method,
+/// @code
+/// // Square values in a grid
+/// struct Op
+/// {
+///     ValueT operator()(const ValueT& in) const
+///     {
+///       // do work
+///       ValueT result = in * in;
+///
+///       return result;
+///     }
+/// };
+/// @endcode
+/// NB: only Dense grids with memory layout ZYX (@c LayoutZYX) are supported
+template<typename ValueT, typename OpType>
+void transformDense(Dense<ValueT, openvdb::tools::LayoutZYX>& dense,
+                    const openvdb::CoordBBox& bbox, const OpType& op, bool parallel=true);
+
+/// @brief The operations currently supported when compositing sparse
+/// data into a dense grid (selected through the first template argument
+/// of @c compositeToDense).
+enum DSCompositeOp {
+    DS_OVER, DS_ADD, DS_SUB, DS_MIN, DS_MAX, DS_MULT, DS_SET
+};
+
+/// @brief Composite data from a sparse tree into a dense array of the same value type.
+/// @tparam DSCompositeOp  (unnamed, first template argument) the compositing operation to apply
+/// @tparam TreeT    Tree type of both @c source and @c alpha; its @c ValueType
+///                  is also the value type of @c dense
+/// @param dense    Dense grid to be altered by the operation
+/// @param source   Sparse data to composite into @c dense
+/// @param alpha    Sparse Alpha mask used in compositing operations.
+/// @param beta     Constant multiplier on src
+/// @param strength Constant multiplier on alpha
+/// @param threaded Enable threading for this operation.
+/// NB: as the signature shows, only Dense grids with @c LayoutZYX are accepted.
+template<DSCompositeOp, typename TreeT>
+void compositeToDense(Dense<typename TreeT::ValueType, LayoutZYX>& dense,
+                      const TreeT& source,
+                      const TreeT& alpha,
+                      const typename TreeT::ValueType beta,
+                      const typename TreeT::ValueType strength,
+                      bool threaded = true);
+
+
+/// @brief Functor-based class used to extract data that satisfies some
+/// criteria defined by the embedded @c OpType functor. The @c extractSparseTree
+/// function wraps this class.
+///
+/// @details extract() builds a 3D range of candidate leaf positions covering
+/// the extraction window and runs this object over it, either directly or as
+/// a tbb::parallel_reduce body (splitting constructor + join()). Each body
+/// fills its own result tree; join() merges them.
+template<typename OpType, typename DenseType>
+class SparseExtractor
+{
+
+public:
+
+    typedef openvdb::math::Coord::ValueType              Index;
+
+    typedef typename DenseType::ValueType                 DenseValueType;
+    typedef typename OpType::ResultTreeType               ResultTreeType;
+    typedef typename ResultTreeType::ValueType            ResultValueType;
+    typedef typename ResultTreeType::LeafNodeType         ResultLeafNodeType;
+    // Not referenced anywhere else in this class.
+    typedef typename ResultTreeType::template ValueConverter<ValueMask>::Type MaskTree;
+
+    // Range over (x, y, z) leaf indices, counted from the padded minimum mMin
+    typedef tbb::blocked_range3d<Index, Index, Index>     Range3d;
+
+
+private:
+
+    const DenseType&                     mDense;      // data source (held by reference)
+    const OpType&                        mFunctor;    // per-voxel rule (held by reference)
+    const ResultValueType                mBackground; // background of the result tree
+    const openvdb::math::CoordBBox       mBBox;       // extraction window in index space
+    const Index                          mWidth;      // leaf dimension (ResultLeafNodeType::DIM)
+    typename ResultTreeType::Ptr         mMask;       // the result tree of this body
+    openvdb::math::Coord                 mMin;        // leaf-aligned minimum, set by extract()
+
+
+public:
+
+    /// @brief Extract from the whole bounding box of @a dense.
+    SparseExtractor(const DenseType& dense, const OpType& functor,
+                    const ResultValueType background) :
+        mDense(dense), mFunctor(functor),
+        mBackground(background),
+        mBBox(dense.bbox()),
+        mWidth(ResultLeafNodeType::DIM),
+        mMask( new ResultTreeType(mBackground))
+    {}
+
+
+    /// @brief Extract from the sub-window @a bbox of @a dense.
+    /// @throw ValueError if @a bbox is not inside the bounding box of @a dense.
+    SparseExtractor(const DenseType& dense,
+                    const openvdb::math::CoordBBox& bbox,
+                    const OpType& functor,
+                    const ResultValueType background) :
+        mDense(dense), mFunctor(functor),
+        mBackground(background),
+        mBBox(bbox),
+        mWidth(ResultLeafNodeType::DIM),
+        mMask( new ResultTreeType(mBackground))
+    {
+        // mBBox must be inside the coordinate range of the dense grid
+        if (!dense.bbox().isInside(mBBox)) {
+            OPENVDB_THROW(ValueError, "Data extraction window out of bound");
+        }
+    }
+
+
+    /// @brief Splitting constructor used by tbb::parallel_reduce: shares the
+    /// inputs of @a other but starts with a fresh, empty result tree.
+    SparseExtractor(SparseExtractor& other, tbb::split):
+        mDense(other.mDense), mFunctor(other.mFunctor),
+        mBackground(other.mBackground), mBBox(other.mBBox),
+        mWidth(other.mWidth),
+        mMask(new ResultTreeType(mBackground)),
+        mMin(other.mMin)
+    {}
+
+    /// @brief Run the extraction and return the resulting tree.
+    /// @param threaded  use tbb::parallel_reduce if true, otherwise run on the calling thread
+    typename ResultTreeType::Ptr extract(bool threaded = true) {
+
+
+        // Construct 3D range of leaf nodes that
+        // intersect mBBox.
+
+        // Snap the bbox to nearest leaf nodes min and max
+
+        openvdb::math::Coord padded_min = mBBox.min();
+        openvdb::math::Coord padded_max = mBBox.max();
+
+
+        // Clear the low bits of each component, i.e. round down to a multiple
+        // of mWidth (assumes the leaf dimension is a power of two)
+        padded_min &= ~(mWidth - 1);
+        padded_max &= ~(mWidth - 1);
+
+        // Move the maximum to the last voxel of its leaf (inclusive bound)
+        padded_max[0] += mWidth - 1;
+        padded_max[1] += mWidth - 1;
+        padded_max[2] += mWidth - 1;
+
+
+        // number of leaf nodes in each direction
+        // division by leaf width, e.g. 8 in most cases
+
+        const Index xleafCount = ( padded_max.x() - padded_min.x() + 1 ) / mWidth;
+        const Index yleafCount = ( padded_max.y() - padded_min.y() + 1 ) / mWidth;
+        const Index zleafCount = ( padded_max.z() - padded_min.z() + 1 ) / mWidth;
+
+        // Must be set before the range is processed: operator() derives leaf
+        // origins from it, and the splitting constructor copies it.
+        mMin = padded_min;
+
+
+        Range3d  leafRange(0, xleafCount, 1,
+                           0, yleafCount, 1,
+                           0, zleafCount, 1);
+
+
+        // Iterate over the leafnodes applying *this as a functor.
+        if (threaded) {
+            tbb::parallel_reduce(leafRange, *this);
+        } else {
+            (*this)(leafRange);
+        }
+
+        return mMask;
+    }
+
+
+    /// @brief Process the candidate leaves addressed by @a range, adding every
+    /// leaf that ends up non-empty to this body's result tree.
+    void operator()(const Range3d& range) {
+
+        // Scratch leaf. It is reused for the next candidate whenever the
+        // current one was not added to the tree.
+        ResultLeafNodeType* leaf = NULL;
+
+        // Unpack the range3d item.
+        const Index imin = range.pages().begin();
+        const Index imax = range.pages().end();
+
+        const Index jmin = range.rows().begin();
+        const Index jmax = range.rows().end();
+
+        const Index kmin = range.cols().begin();
+        const Index kmax = range.cols().end();
+
+
+        // loop over all the candidate leafs. Adding only those with 'true' values
+        // to the tree
+
+        for (Index i = imin; i < imax; ++i) {
+            for (Index j = jmin; j < jmax; ++j) {
+                for (Index k = kmin; k < kmax; ++k) {
+
+                    // Calculate the origin of candidate leaf
+                    const openvdb::math::Coord origin =
+                        mMin + openvdb::math::Coord(mWidth * i,
+                                                    mWidth * j,
+                                                    mWidth * k );
+
+                    if (leaf == NULL) {
+                        leaf = new ResultLeafNodeType(origin, mBackground);
+                    } else {
+                        // Recycle the unused leaf: move it and reset it to
+                        // inactive background
+                        leaf->setOrigin(origin);
+                        leaf->fill(mBackground);
+                        leaf->setValuesOff();
+                    }
+
+                    // The bounding box for this leaf
+
+                    openvdb::math::CoordBBox localBBox = leaf->getNodeBoundingBox();
+
+                    // Shrink to the intersection with mBBox (i.e. the dense
+                    // volume)
+
+                    localBBox.intersect(mBBox);
+
+                    // Early out for non-intersecting leafs
+
+                    if (localBBox.empty()) continue;
+
+
+                    // NOTE(review): the loops below treat 'end' as exclusive;
+                    // presumably getEnd() is max() + 1 — confirm against CoordBBox.
+                    const openvdb::math::Coord start = localBBox.getStart();
+                    const openvdb::math::Coord end   = localBBox.getEnd();
+
+                    // Order the looping to respect the memory layout in
+                    // the Dense source
+
+                    if (mDense.memoryLayout() == openvdb::tools::LayoutZYX) {
+
+                        // z is the innermost loop: the leaf offset and the dense
+                        // pointer are computed once per (x, y) column and then
+                        // advanced in lock step with z.
+                        openvdb::math::Coord ijk;
+                        Index offset;
+                        const DenseValueType* dp;
+                        for (ijk[0] = start.x(); ijk[0] < end.x(); ++ijk[0] ) {
+                            for (ijk[1] = start.y(); ijk[1] < end.y(); ++ijk[1] ) {
+                                for (ijk[2] = start.z(),
+                                         offset = ResultLeafNodeType::coordToOffset(ijk),
+                                         dp = &mDense.getValue(ijk);
+                                     ijk[2] < end.z(); ++ijk[2], ++offset, ++dp) {
+
+                                    mFunctor(*dp, offset, leaf);
+                                }
+                            }
+                        }
+
+                    } else {
+
+                        // Other layout: x is the innermost loop. Only the dense
+                        // pointer is stepped; the functor receives the coordinate
+                        // rather than a leaf offset.
+                        openvdb::math::Coord ijk;
+                        const DenseValueType* dp;
+                        for (ijk[2] = start.z(); ijk[2] < end.z(); ++ijk[2]) {
+                            for (ijk[1] = start.y(); ijk[1] < end.y(); ++ijk[1]) {
+                                for (ijk[0] = start.x(),
+                                         dp = &mDense.getValue(ijk);
+                                     ijk[0] < end.x(); ++ijk[0], ++dp) {
+
+                                    mFunctor(*dp, ijk, leaf);
+
+                                }
+                            }
+                        }
+                    }
+
+                    // Only add non-empty leafs (empty is defined as all inactive)
+
+                    if (!leaf->isEmpty()) {
+                        // The pointer is dropped without a delete, so the tree is
+                        // presumably the owner from here on; a new scratch leaf is
+                        // allocated on the next iteration.
+                        mMask->addLeaf(*leaf);
+                        leaf = NULL;
+                    }
+
+                }
+            }
+        }
+
+        // Clean up an unused leaf.
+
+        if (leaf != NULL) delete leaf;
+    }
+
+    /// @brief Reduction step of tbb::parallel_reduce: merge the result tree of
+    /// @a rhs into this body's result tree.
+    void join(SparseExtractor& rhs) {
+        mMask->merge(*rhs.mMask);
+    }
+}; // class SparseExtractor
+
+
+template<typename OpType, typename DenseType>
+typename OpType::ResultTreeType::Ptr
+extractSparseTree(const DenseType& dense, const OpType& functor,
+                  const typename OpType::ResultValueType& background,
+                  bool threaded)
+{
+    // The extractor runs as a parallel-reduce body: every thread fills a
+    // tree of its own, and the joins fold those trees into the one that
+    // is returned here.
+    typedef SparseExtractor<OpType, DenseType> ExtractorT;
+
+    ExtractorT worker(dense, functor, background);
+    typename OpType::ResultTreeType::Ptr extracted = worker.extract(threaded);
+
+    return extracted;
+}
+
+
+/// @brief Functor-based class used to extract data from a dense grid, at
+/// the index-space intersection with a supplied mask in the form of a sparse tree.
+/// The @c extractSparseTreeWithMask function wraps this class.
+///
+/// @details The mask is supplied as a vector of pointers to mask leaf nodes.
+/// extract() runs this object over the indices of that vector, either directly
+/// or as a tbb::parallel_reduce body (splitting constructor + join()).
+template <typename DenseType, typename MaskTreeType>
+class SparseMaskedExtractor
+{
+public:
+
+    typedef typename DSConverter<DenseType, MaskTreeType>::Type  _ResultTreeType;
+    typedef _ResultTreeType                                      ResultTreeType;
+    typedef typename ResultTreeType::LeafNodeType                ResultLeafNodeType;
+    typedef typename ResultTreeType::ValueType                   ResultValueType;
+    typedef ResultValueType                                      DenseValueType;
+
+    typedef typename ResultTreeType::template ValueConverter<ValueMask>::Type  MaskTree;
+    typedef typename MaskTree::LeafCIter                         MaskLeafCIter;
+    typedef std::vector<const typename MaskTree::LeafNodeType*>  MaskLeafVec;
+
+
+    /// @param dense       data source
+    /// @param background  background value of the result tree
+    /// @param leafVec     mask leaves to visit; held by reference, so it must
+    ///                    outlive this object
+    SparseMaskedExtractor(const DenseType& dense,
+                  const ResultValueType& background,
+                  const MaskLeafVec& leafVec
+                  ):
+        mDense(dense), mBackground(background), mBBox(dense.bbox()),
+        mLeafVec(leafVec),
+        mResult(new ResultTreeType(mBackground))
+    {}
+
+
+
+    /// @brief Splitting constructor used by tbb::parallel_reduce: shares the
+    /// inputs of @a other but starts with a fresh, empty result tree.
+    SparseMaskedExtractor(const SparseMaskedExtractor& other, tbb::split):
+        mDense(other.mDense), mBackground(other.mBackground), mBBox(other.mBBox),
+        mLeafVec(other.mLeafVec), mResult( new ResultTreeType(mBackground))
+    {}
+
+    /// @brief Run the extraction over all mask leaves and return the resulting tree.
+    /// @param threaded  use tbb::parallel_reduce if true, otherwise run on the calling thread
+    typename ResultTreeType::Ptr extract(bool threaded = true) {
+
+        tbb::blocked_range<size_t> range(0, mLeafVec.size());
+
+        if (threaded) {
+            tbb::parallel_reduce(range, *this);
+        } else {
+            (*this)(range);
+        }
+
+        return mResult;
+    }
+
+
+    // Used in looping over leaf nodes in the masked grid
+    // and using the active mask to select the data to copy
+    // from the dense grid into the result tree.
+    void operator()(const tbb::blocked_range<size_t>& range) {
+
+        // Scratch leaf. It is reused for the next mask leaf whenever the
+        // current one was not added to the tree.
+        ResultLeafNodeType* leaf = NULL;
+
+
+        // loop over all the candidate leafs. Adding only those with 'true' values
+        // to the tree
+
+        for (size_t idx = range.begin(); idx < range.end(); ++ idx) {
+
+            const typename MaskTree::LeafNodeType* maskLeaf = mLeafVec[idx];
+
+            // The bounding box for this leaf
+
+            openvdb::math::CoordBBox localBBox = maskLeaf->getNodeBoundingBox();
+
+            // Shrink to the intersection with the dense volume
+
+            localBBox.intersect(mBBox);
+
+            // Early out if there was no intersection
+
+            if (localBBox.empty()) continue;
+
+            // Reset or allocate the target leaf
+
+            if (leaf == NULL) {
+                leaf = new ResultLeafNodeType(maskLeaf->origin(), mBackground);
+            } else {
+                leaf->setOrigin(maskLeaf->origin());
+                leaf->fill(mBackground);
+                leaf->setValuesOff();
+            }
+
+
+            // Iterate over the intersecting bounding box
+            // copying active values to the result tree
+
+            // NOTE(review): the loops below treat 'end' as exclusive;
+            // presumably getEnd() is max() + 1 — confirm against CoordBBox.
+            const openvdb::math::Coord start = localBBox.getStart();
+            const openvdb::math::Coord end   = localBBox.getEnd();
+
+
+            openvdb::math::Coord ijk;
+
+            if (mDense.memoryLayout() == openvdb::tools::LayoutZYX
+                  && maskLeaf->isDense()) {
+
+                // Fast path: no per-voxel mask test is made (the mask leaf
+                // reports isDense()), and the leaf offset and the dense pointer
+                // are advanced in lock step with z.
+                Index offset;
+                const DenseValueType* src;
+                for (ijk[0] = start.x(); ijk[0] < end.x(); ++ijk[0] ) {
+                    for (ijk[1] = start.y(); ijk[1] < end.y(); ++ijk[1] ) {
+                        for (ijk[2] = start.z(),
+                                 offset = ResultLeafNodeType::coordToOffset(ijk),
+                                 src  = &mDense.getValue(ijk);
+                             ijk[2] < end.z(); ++ijk[2], ++offset, ++src) {
+
+                            // copy into leaf
+                            leaf->setValueOn(offset, *src);
+                        }
+
+                    }
+                }
+
+            } else {
+
+                // General path: test the mask voxel by voxel and look every
+                // selected value up in the dense grid by coordinate.
+                Index offset;
+                for (ijk[0] = start.x(); ijk[0] < end.x(); ++ijk[0] ) {
+                    for (ijk[1] = start.y(); ijk[1] < end.y(); ++ijk[1] ) {
+                        for (ijk[2] = start.z(),
+                                 offset = ResultLeafNodeType::coordToOffset(ijk);
+                             ijk[2] < end.z(); ++ijk[2], ++offset) {
+
+                            if (maskLeaf->isValueOn(offset)) {
+                                const ResultValueType denseValue =  mDense.getValue(ijk);
+                                leaf->setValueOn(offset, denseValue);
+                            }
+                        }
+                    }
+                }
+            }
+            // Only add non-empty leafs (empty is defined as all inactive)
+
+            if (!leaf->isEmpty()) {
+                // The pointer is dropped without a delete, so the tree is
+                // presumably the owner from here on.
+                mResult->addLeaf(*leaf);
+                leaf = NULL;
+            }
+        }
+
+        // Clean up an unused leaf.
+
+        if (leaf != NULL) delete leaf;
+    }
+
+    /// @brief Reduction step of tbb::parallel_reduce: merge the result tree of
+    /// @a rhs into this body's result tree.
+    void join(SparseMaskedExtractor& rhs) {
+        mResult->merge(*rhs.mResult);
+    }
+
+
+private:
+    const DenseType&                   mDense;      // data source (held by reference)
+    const ResultValueType              mBackground; // background of the result tree
+    // NOTE(review): reference member bound to the result of dense.bbox(); this is
+    // only safe if bbox() returns a reference to storage that outlives this
+    // object — confirm against Dense.
+    const openvdb::math::CoordBBox&    mBBox;
+    const MaskLeafVec&                 mLeafVec;    // mask leaves to visit (held by reference)
+
+    typename ResultTreeType::Ptr       mResult;     // the result tree of this body
+
+}; // class SparseMaskedExtractor
+
+
+/// @brief Extraction rule used by @c extractSparseTreeWithMask: every value
+/// it is handed is written into the leaf through @c setValueOn, without any
+/// selection or transformation.
+template<typename _ResultTreeType, typename DenseValueType>
+struct ExtractAll
+{
+    typedef _ResultTreeType                         ResultTreeType;
+    typedef typename _ResultTreeType::LeafNodeType  ResultLeafNodeType;
+
+    /// @param a       value read from the dense grid
+    /// @param offset  position in the leaf, as a linear offset or a coordinate
+    /// @param leaf    leaf node that receives the value
+    template<typename CoordOrIndex>
+    void operator()(const DenseValueType& a,
+                    const CoordOrIndex& offset,
+                    ResultLeafNodeType* leaf) const
+    {
+        leaf->setValueOn(offset, a);
+    }
+};
+
+
+template <typename DenseType, typename MaskTreeType>
+typename DSConverter<DenseType, MaskTreeType>::Type::Ptr
+extractSparseTreeWithMask(const DenseType& dense,
+                          const MaskTreeType& maskProxy,
+                          const typename DenseType::ValueType& background,
+                          bool threaded)
+{
+    typedef SparseMaskedExtractor<DenseType, MaskTreeType>  LeafExtractor;
+    typedef typename LeafExtractor::DenseValueType          DenseValueType;
+    typedef typename LeafExtractor::ResultTreeType          ResultTreeType;
+    typedef typename LeafExtractor::MaskLeafVec             MaskLeafVec;
+    typedef typename LeafExtractor::MaskTree                MaskTree;
+    typedef typename LeafExtractor::MaskLeafCIter           MaskLeafCIter;
+    typedef ExtractAll<ResultTreeType, DenseValueType>      ExtractionRule;
+    typedef SparseExtractor<ExtractionRule, DenseType>      TileExtractor;
+    typedef typename ResultTreeType::Ptr                    ResultTreePtr;
+
+    // --- Stage 1: data selected by the leaf nodes of the mask ---
+
+    // A mask tree holding only the topology of the input mask
+    MaskTree topology(maskProxy, false, TopologyCopy());
+
+    // Gather a pointer to every leaf of the topology tree
+    MaskLeafVec maskLeaves(topology.leafCount());
+    {
+        MaskLeafCIter leafIt = topology.cbeginLeaf();
+        for (size_t n = 0, count = maskLeaves.size(); n != count; ++n, ++leafIt) {
+            maskLeaves[n] = leafIt.getLeaf();
+        }
+    }
+
+    LeafExtractor leafStage(dense, background, maskLeaves);
+    ResultTreePtr result = leafStage.extract(threaded);
+
+    // --- Stage 2: data selected by the active tiles of the mask ---
+
+    // The tiles are visited one after another; the extraction inside each
+    // tile may itself be threaded. The per-tile trees are leaf-orthogonal to
+    // the stage-1 tree (no leaf nodes overlap) and are merged into it. If the
+    // mask has no active tiles the loop is skipped and the stage-1 tree is
+    // returned unchanged.
+
+    typename MaskTreeType::ValueOnCIter tile(maskProxy);
+    tile.setMaxDepth(MaskTreeType::ValueOnCIter::LEAF_DEPTH - 1);
+
+    ExtractionRule copyEverything;
+    CoordBBox window;
+
+    while (tile) {
+
+        // The part of this tile that overlaps the dense grid
+        tile.getBoundingBox(window);
+        window.intersect(dense.bbox());
+
+        if (!window.empty()) {
+            TileExtractor tileStage(dense, window, copyEverything, background);
+            ResultTreePtr fromTile = tileStage.extract(threaded);
+            result->merge(*fromTile);
+        }
+
+        ++tile;
+    }
+
+    return result;
+}
+
+
+/// @brief Applies a functor to the part of a dense grid that lies inside a
+/// prescribed index-space bounding box.
+/// @details The x/y extent of that region forms a 2D range which is processed
+/// either serially or with tbb::parallel_for; for every (x, y) pair the
+/// functor's @c transform method is handed the whole run of values along z.
+/// NB: This class only supports DenseGrids with ZYX memory layout.
+template <typename _ValueT, typename OpType>
+class DenseTransformer
+{
+public:
+
+    typedef _ValueT                                    ValueT;
+    typedef Dense<ValueT, openvdb::tools::LayoutZYX>   DenseT;
+    typedef openvdb::math::Coord::ValueType            IntType;
+    typedef tbb::blocked_range2d<IntType, IntType>     RangeType;
+
+
+private:
+
+    DenseT&                  mDense;  // grid that is modified in place
+    const OpType&            mOp;     // functor, held by reference
+    openvdb::math::CoordBBox mBBox;   // iteration space (already clipped to the grid)
+
+public:
+    /// @param dense    grid to transform
+    /// @param bbox     requested region; it is clipped against the grid's bounding box
+    /// @param functor  object providing @c transform(dense, xyz, zlength)
+    DenseTransformer(DenseT& dense,
+                     const openvdb::math::CoordBBox& bbox,
+                     const OpType& functor):
+        mDense(dense), mOp(functor), mBBox(dense.bbox())
+    {
+        // Start from the grid's own bounding box and shrink it to the
+        // overlap with the requested region.
+        mBBox.intersect(bbox);
+    }
+
+    /// @brief Copy constructor (tbb::parallel_for copies the body).
+    DenseTransformer(const DenseTransformer& other) :
+        mDense(other.mDense), mOp(other.mOp), mBBox(other.mBBox) {}
+
+    /// @brief Run the functor over the iteration space.
+    /// @param threaded  use tbb::parallel_for if true, otherwise run on the calling thread
+    void apply(bool threaded = true) {
+
+        // Nothing to do when the region misses the grid entirely
+        if (mBBox.empty()) return;
+
+        const openvdb::math::Coord lower = mBBox.getStart();
+        const openvdb::math::Coord upper = mBBox.getEnd();
+
+        // Only the two slower directions (x, y) are part of the range
+        const RangeType xyRange(lower.x(), upper.x(), 1,
+                                lower.y(), upper.y(), 1);
+
+        if (!threaded) {
+            (*this)(xyRange);
+            return;
+        }
+
+        tbb::parallel_for(xyRange, *this);
+    }
+
+    /// @brief Process every (x, y) pair of @a range.
+    void operator()(const RangeType& range) const {
+
+        // Number of values along z (the bbox is [inclusive, inclusive])
+        const size_t zlength = size_t(mBBox.max().z() - mBBox.min().z() + 1);
+
+        const IntType xBegin = range.rows().begin();
+        const IntType xEnd   = range.rows().end();
+        const IntType yBegin = range.cols().begin();
+        const IntType yEnd   = range.cols().end();
+
+        // z stays at the lower bound of the region; the functor walks along z
+        openvdb::math::Coord xyz(xBegin, yBegin, mBBox.min().z());
+
+        xyz[0] = xBegin;
+        while (xyz[0] != xEnd) {
+            xyz[1] = yBegin;
+            while (xyz[1] != yEnd) {
+                mOp.transform(mDense, xyz, zlength);
+                ++xyz[1];
+            }
+            ++xyz[0];
+        }
+    }
+}; // class DenseTransformer
+
+
+/// @brief Adapter that applies a point-wise functor to a run of values
+/// that are adjacent in memory, resolving the @c Coord to a pointer once
+/// instead of once per value.
+template <typename ValueT, typename PointWiseOp>
+struct ContiguousOp
+{
+    typedef Dense<ValueT, openvdb::tools::LayoutZYX>  DenseT;
+    ContiguousOp(const PointWiseOp& op) : mOp(op){}
+
+    inline void transform(DenseT& dense, openvdb::math::Coord& ijk, size_t size) const
+    {
+        // Address of the value at ijk; it and the following size-1 values are updated in place.
+        ValueT* const first = const_cast<ValueT*>(&dense.getValue(ijk));
+        ValueT* const last  = first + size;
+        for (ValueT* value = first; value != last; ++value) {
+            *value = mOp(*value);
+        }
+    }
+    const PointWiseOp mOp;
+};
+
+
+/// Apply a point-wise functor to the intersection of a dense grid and a given bounding box
+template <typename ValueT, typename PointwiseOpT>
+void
+transformDense(Dense<ValueT, openvdb::tools::LayoutZYX>& dense,
+               const openvdb::CoordBBox& bbox,
+               const PointwiseOpT& functor, bool parallel)
+{
+    // Wrap the functor so that it is applied along whole
+    // runs of values that are contiguous in memory.
+    typedef ContiguousOp<ValueT, PointwiseOpT>     LineOpT;
+    typedef DenseTransformer<ValueT, LineOpT>      TransformerT;
+
+    const LineOpT lineOp(functor);
+
+    // The transformer intersects bbox with the dense grid before applying the op.
+    TransformerT(dense, bbox, lineOp).apply(parallel);
+}
+
+
+template <typename CompositeMethod, typename _TreeT>
+class SparseToDenseCompositor
+{
+
+public:
+    typedef _TreeT                                               TreeT;
+    typedef typename TreeT::ValueType                            ValueT;
+    typedef typename TreeT::LeafNodeType                         LeafT;
+    typedef typename TreeT::template ValueConverter<ValueMask>::Type  MaskTreeT;
+    typedef typename MaskTreeT::LeafNodeType                     MaskLeafT;
+    typedef Dense<ValueT, openvdb::tools::LayoutZYX>             DenseT;
+    typedef openvdb::math::Coord::ValueType                      Index;
+    typedef tbb::blocked_range3d<Index, Index, Index>            Range3d;
+
+    SparseToDenseCompositor(DenseT& dense, const TreeT& source, const TreeT& alpha,
+                            const ValueT beta, const ValueT strength) :
+        mDense(dense), mSource(source), mAlpha(alpha), mBeta(beta), mStrength(strength)
+    {}
+
+    SparseToDenseCompositor(const SparseToDenseCompositor& other):
+        mDense(other.mDense), mSource(other.mSource), mAlpha(other.mAlpha),
+        mBeta(other.mBeta), mStrength(other.mStrength) {}
+
+
+
+    /// @brief Composite using the union of the source and alpha topologies as the
+    /// iteration space: leaf nodes first, then active tiles.
+    void sparseComposite(bool threaded) {
+
+        const ValueT beta = mBeta;
+        const ValueT strength = mStrength;
+
+        // Construct a tree that defines the iteration space
+        MaskTreeT maskTree(mSource, false /*background*/, openvdb::TopologyCopy());
+        maskTree.topologyUnion(mAlpha);
+
+        // Composite regions that are represented by leafnodes in either mAlpha or mSource
+        // Parallelize over the mask leafs
+        openvdb::tree::LeafManager<const MaskTreeT> maskLeafs(maskTree);
+        maskLeafs.foreach(*this, threaded);
+
+        // Composite regions that are represented by tiles
+        // Parallelize within each tile.
+        // Visit tiles only: stop one level above the leaf depth of this mask tree type.
+        typename MaskTreeT::ValueOnCIter citer = maskTree.cbeginValueOn();
+        citer.setMaxDepth(MaskTreeT::ValueOnCIter::LEAF_DEPTH - 1);
+
+        if (!citer) return;
+
+        typename tree::ValueAccessor<const TreeT>   alphaAccessor(mAlpha);
+        typename tree::ValueAccessor<const TreeT>   sourceAccessor(mSource);
+
+        for (; citer; ++citer) {
+
+            const openvdb::math::Coord org = citer.getCoord();
+
+            // Early out if both alpha and source are zero in this tile.
+
+            const ValueT alphaValue = alphaAccessor.getValue(org);
+            const ValueT sourceValue = sourceAccessor.getValue(org);
+
+            if (openvdb::math::isZero(alphaValue) &&
+                openvdb::math::isZero(sourceValue) ) continue;
+
+            // Compute overlap of tile with the dense grid
+
+            openvdb::math::CoordBBox localBBox = citer.getBoundingBox();
+            localBBox.intersect(mDense.bbox());
+
+            // Early out if there is no intersection
+
+            if (localBBox.empty()) continue;
+
+            // Composite the tile-uniform values into the dense grid.
+            compositeFromTile(mDense, localBBox, sourceValue,
+                              alphaValue, beta, strength, threaded);
+        }
+    }
+
+    // Composites the region covered by one leaf of the mask tree into the dense grid.
+    // Invoked through the LeafManager in sparseComposite.
+    void inline operator()(const MaskLeafT& maskLeaf, size_t /*i*/) const
+    {
+
+        typedef UniformLeaf   ULeaf;
+        openvdb::math::CoordBBox localBBox = maskLeaf.getNodeBoundingBox();
+        localBBox.intersect(mDense.bbox());
+
+        // Early out for non-overlapping leafs
+
+        if (localBBox.empty()) return;
+
+        const openvdb::math::Coord org = maskLeaf.origin();
+        const LeafT* alphaLeaf = mAlpha.probeLeaf(org);
+        const LeafT* sourceLeaf   = mSource.probeLeaf(org);
+        // When a tree yields no leaf at org, a uniform proxy holding its value at org is used.
+        if (!sourceLeaf) {
+
+            // Create a source leaf proxy with the correct value
+            ULeaf uniformSource(mSource.getValue(org));
+
+            if (!alphaLeaf) {
+
+                // Create an alpha leaf proxy with the correct value
+                ULeaf uniformAlpha(mAlpha.getValue(org));
+
+                compositeFromLeaf(mDense, localBBox, uniformSource, uniformAlpha,
+                                  mBeta, mStrength);
+            } else {
+
+                compositeFromLeaf(mDense, localBBox, uniformSource, *alphaLeaf,
+                                  mBeta, mStrength);
+            }
+        } else {
+            if (!alphaLeaf) {
+
+                // Create an alpha leaf proxy with the correct value
+                ULeaf uniformAlpha(mAlpha.getValue(org));
+
+                compositeFromLeaf(mDense, localBBox, *sourceLeaf, uniformAlpha,
+                                  mBeta, mStrength);
+            } else {
+
+                compositeFromLeaf(mDense, localBBox, *sourceLeaf, *alphaLeaf,
+                                  mBeta, mStrength);
+            }
+        }
+    }
+    // Composites source into dense over bbox.  When alpha is not dense, only voxels with
+    // active alpha are updated, i.e. it assumes that all valueOff alpha voxels have value 0.
+    template <typename LeafT1, typename LeafT2>
+    inline static void compositeFromLeaf(DenseT& dense, const openvdb::math::CoordBBox& bbox,
+                                         const LeafT1& source, const LeafT2& alpha,
+                                         const ValueT beta, const ValueT strength)
+    {
+        typedef openvdb::math::Coord::ValueType  IntType;
+
+        const ValueT sbeta = strength * beta;
+        openvdb::math::Coord ijk = bbox.min();
+
+
+        if (alpha.isDense() /*all active values*/) {
+
+            // Optimal path for dense alphaLeaf
+            const IntType size = bbox.max().z() + 1 - bbox.min().z();
+
+            for (ijk[0] = bbox.min().x(); ijk[0] < bbox.max().x() + 1; ++ijk[0]) {
+                for (ijk[1] = bbox.min().y(); ijk[1] < bbox.max().y() + 1; ++ijk[1]) {
+                    // d, a and s each address the first of `size` consecutive values along z.
+                    ValueT* d = const_cast<ValueT*>(&dense.getValue(ijk));
+                    const ValueT* a = &alpha.getValue(ijk);
+                    const ValueT* s = &source.getValue(ijk);
+
+                    for (IntType idx = 0; idx < size; ++idx) {
+                        d[idx] = CompositeMethod::apply(d[idx], a[idx], s[idx],
+                                                        strength, beta, sbeta);
+                    }
+                }
+            }
+        }  else {
+
+            // AlphaLeaf has non-active cells.
+            // Only voxels where alpha is active are composited; the others are left untouched.
+            for (ijk[0] = bbox.min().x(); ijk[0] < bbox.max().x() + 1; ++ijk[0]) {
+                for (ijk[1] = bbox.min().y(); ijk[1] < bbox.max().y() + 1; ++ijk[1]) {
+                    for (ijk[2] = bbox.min().z(); ijk[2] < bbox.max().z() + 1; ++ijk[2]) {
+
+                        if (alpha.isValueOn(ijk)) {
+
+                            dense.setValue(ijk,
+                             CompositeMethod::apply(dense.getValue(ijk),
+                                                    alpha.getValue(ijk), source.getValue(ijk),
+                                                    strength, beta, sbeta)
+                                           );
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    // Composites uniform (tile) source and alpha values into the part of
+    // the dense grid covered by bbox.
+    inline static void compositeFromTile(DenseT& dense, openvdb::math::CoordBBox& bbox,
+                                         const ValueT& sourceValue, const ValueT& alphaValue,
+                                         const ValueT& beta, const ValueT& strength,
+                                         bool threaded)
+    {
+        // Bind the tile-uniform values into a point-wise functor ...
+        const UniformTransformer tileOp(sourceValue, alphaValue, beta, strength);
+
+        // ... and let transformDense apply it to the dense values
+        // inside bbox.
+        transformDense(dense, bbox, tileOp, threaded);
+    }
+
+
+    /// @brief Composite over the entire bounding box of the dense grid.
+    void denseComposite(bool threaded)
+    {
+        // Construct a half-open range covering the bounding box of the dense
+        // volume.  bbox.max() is inclusive, so the range has to end at getEnd().
+        const openvdb::math::CoordBBox& bbox = mDense.bbox();
+        const openvdb::math::Coord begin = bbox.getStart(), end = bbox.getEnd();
+
+        Range3d  range(begin.x(), end.x(), LeafT::DIM,
+                       begin.y(), end.y(), LeafT::DIM,
+                       begin.z(), end.z(), LeafT::DIM);
+
+        // Iterate over the range, compositing into the dense grid
+        // using value accessors for the sparse grids.
+        if (threaded) {
+            tbb::parallel_for(range, *this);
+        } else {
+            (*this)(range);
+        }
+    }
+
+    // Composites every voxel of the given index range into the dense grid,
+    // reading alpha and source through value accessors.  Used by denseComposite.
+    void inline operator()(const Range3d& range) const
+    {
+        // Use value accessors to alpha and source
+        // (constructed anew on every call)
+        typename tree::ValueAccessor<const TreeT>   alphaAccessor(mAlpha);
+        typename tree::ValueAccessor<const TreeT>   sourceAccessor(mSource);
+
+        const ValueT strength = mStrength;
+        const ValueT beta     = mBeta;
+        const ValueT sbeta    = strength * beta;
+
+        // Unpack the range3d item: pages -> x, rows -> y, cols -> z.
+        const Index imin = range.pages().begin();
+        const Index imax = range.pages().end();
+
+        const Index jmin = range.rows().begin();
+        const Index jmax = range.rows().end();
+
+        const Index kmin = range.cols().begin();
+        const Index kmax = range.cols().end();
+
+        openvdb::Coord ijk;
+        for (ijk[0] = imin; ijk[0] < imax; ++ijk[0]) {
+            for (ijk[1] = jmin; ijk[1] < jmax; ++ijk[1]) {
+                for (ijk[2] = kmin; ijk[2] < kmax; ++ijk[2]) {
+                    const ValueT d_old = mDense.getValue(ijk);
+                    const ValueT& alpha = alphaAccessor.getValue(ijk);
+                    const ValueT& src   = sourceAccessor.getValue(ijk);
+
+                    mDense.setValue(ijk, CompositeMethod::apply(d_old, alpha, src,
+                                                                strength, beta, sbeta));
+                }
+            }
+        }
+
+    }
+
+
+private:
+
+    // Internal class that wraps the templated composite method
+    // for use when both alpha and source are uniform over
+    // a prescribed bbox (e.g. a tile).  strength * beta is computed once on construction.
+    class UniformTransformer
+    {
+    public:
+        UniformTransformer(const ValueT& source, const ValueT& alpha, const ValueT& _beta,
+                           const ValueT& _strength) :
+            mSource(source), mAlpha(alpha), mBeta(_beta),
+            mStrength(_strength), mSBeta(_strength * _beta)
+        {}
+        // Composite a single dense value with the stored uniform source and alpha.
+        ValueT operator()(const ValueT& input) const
+        {
+            return CompositeMethod::apply(input, mAlpha, mSource,
+                                          mStrength, mBeta, mSBeta);
+        }
+
+    private:
+        const ValueT mSource;   const ValueT mAlpha; const ValueT mBeta;
+        const ValueT mStrength; const ValueT mSBeta;
+    };
+
+
+    // Proxy that mimics a leaf with uniform values.  It holds LeafT::DIM copies
+    // of the value so that code indexing consecutive values from the address
+    // returned by getValue() (as compositeFromLeaf does) reads the same value.
+    struct Line {  ValueT mValues[LeafT::DIM]; };
+    class UniformLeaf : private Line
+    {
+    public:
+        typedef typename LeafT::ValueType ValueT;
+
+        typedef Line   BaseT;
+        UniformLeaf(const ValueT& value) : BaseT(init(value)) {}
+        // Build a Line filled with LeafT::DIM copies of value.
+        static const BaseT init(const ValueT& value) {
+            BaseT tmp;
+            for (openvdb::Index i = 0; i < LeafT::DIM; ++i) {
+                tmp.mValues[i] = value;
+            }
+            return tmp;
+        }
+        // A uniform leaf reports every voxel as active and ignores the coordinate.
+        bool isDense() const { return true; }
+        bool isValueOn(openvdb::math::Coord&) const { return true; }
+
+        inline const ValueT& getValue(const openvdb::math::Coord& ) const
+        {return  BaseT::mValues[0];}
+    };
+
+private:
+    DenseT&       mDense;
+    const TreeT&  mSource;
+    const TreeT&  mAlpha;
+    ValueT        mBeta;
+    ValueT        mStrength;
+}; // class SparseToDenseCompositor
+
+
+namespace ds
+{
+    //@{
+    /// @brief Point-wise compositing methods; @a u is the dense value and @a v the source value.
+    template <typename ValueT>
+    struct OpOver
+    {
+        static inline ValueT apply(const ValueT u, const ValueT alpha,
+                                   const ValueT v,
+                                   const ValueT strength,
+                                   const ValueT beta,
+                                   const ValueT /*sbeta*/)
+        { return (u + strength * alpha * (beta * v - u)); }
+    };
+
+
+    template <typename ValueT>
+    struct OpAdd
+    {
+        static inline ValueT apply(const ValueT u, const ValueT alpha,
+                                   const ValueT v,
+                                   const ValueT /*strength*/,
+                                   const ValueT /*beta*/,
+                                   const ValueT sbeta)
+        { return (u + sbeta * alpha * v); }
+    };
+
+    template <typename ValueT>
+    struct OpSub
+    {
+        static inline ValueT apply(const ValueT u, const ValueT alpha,
+                                   const ValueT v,
+                                   const ValueT /*strength*/,
+                                   const ValueT /*beta*/,
+                                   const ValueT sbeta)
+        { return (u - sbeta * alpha * v); }
+    };
+
+    template <typename ValueT>
+    struct OpMin
+    {
+        static inline ValueT apply(const ValueT u, const ValueT alpha,
+                                   const ValueT v,
+                                   const ValueT s /*trength*/,
+                                   const ValueT beta,
+                                   const ValueT /*sbeta*/)
+        { return ( ( 1 - s * alpha) * u + s * alpha * std::min(u, beta * v) ); }
+    };
+
+    /// Blend @a u toward max(u, beta * v), weighted by strength * alpha.
+    template <typename ValueT>
+    struct OpMax
+    {
+        static inline ValueT apply(const ValueT u, const ValueT alpha,
+                                   const ValueT v,
+                                   const ValueT s/*trength*/,
+                                   const ValueT beta,
+                                   const ValueT /*sbeta*/)
+        { return ( ( 1 - s * alpha ) * u + s * alpha * std::max(u, beta * v) ); }
+    };
+
+    template <typename ValueT>
+    struct OpMult
+    {
+        static inline ValueT apply(const ValueT u, const ValueT alpha,
+                                   const ValueT v,
+                                   const ValueT s/*trength*/,
+                                   const ValueT /*beta*/,
+                                   const ValueT sbeta)
+        { return ( ( 1 + alpha * (sbeta * v - s)) * u ); }
+    };
+    //@}
+
+    //@{
+    /// Translator that converts an enum to compositing functor types
+    template <DSCompositeOp OP, typename ValueT>
+    struct CompositeFunctorTranslator{};
+
+    template <typename ValueT>
+    struct CompositeFunctorTranslator<DS_OVER, ValueT>{ typedef OpOver<ValueT>   OpT; };
+
+    template <typename ValueT>
+    struct CompositeFunctorTranslator<DS_ADD, ValueT>{ typedef OpAdd<ValueT>   OpT; };
+
+    template <typename ValueT>
+    struct CompositeFunctorTranslator<DS_SUB, ValueT>{ typedef OpSub<ValueT>   OpT; };
+
+    template <typename ValueT>
+    struct CompositeFunctorTranslator<DS_MIN, ValueT>{ typedef OpMin<ValueT>   OpT; };
+
+    template <typename ValueT>
+    struct CompositeFunctorTranslator<DS_MAX, ValueT>{ typedef OpMax<ValueT>   OpT; };
+
+    template <typename ValueT>
+    struct CompositeFunctorTranslator<DS_MULT, ValueT>{ typedef OpMult<ValueT>   OpT; };
+    //@}
+
+} // namespace ds
+
+
+template <DSCompositeOp OpT, typename TreeT>
+void compositeToDense(
+    Dense<typename TreeT::ValueType, LayoutZYX>& dense,
+    const TreeT& source, const TreeT& alpha,
+    const typename TreeT::ValueType beta,
+    const typename TreeT::ValueType strength,
+    bool threaded)
+{
+    typedef typename TreeT::ValueType  ValueT;
+    typedef typename ds::CompositeFunctorTranslator<OpT, ValueT>::OpT  Method;
+    typedef SparseToDenseCompositor<Method, TreeT>  CompositorT;
+
+    // Zero strength: return without touching the dense grid.
+    if (openvdb::math::isZero(strength)) return;
+
+    CompositorT compositor(dense, source, alpha, beta, strength);
+    const bool zeroBackgrounds = openvdb::math::isZero(alpha.background())
+        && openvdb::math::isZero(source.background());
+    if (!zeroBackgrounds) {
+        // Use the bounding box of dense as the iteration space.
+        compositor.denseComposite(threaded);
+    } else {
+        // Use the sparsity of (alpha U source) as the iteration space.
+        compositor.sparseComposite(threaded);
+    }
+}
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif //OPENVDB_TOOLS_DENSESPARSETOOLS_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/Diagnostics.h b/nuparu/include/openvdb_new/tools/Diagnostics.h
new file mode 100644
index 00000000..68bc4853
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/Diagnostics.h
@@ -0,0 +1,1344 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+///
+/// @file Diagnostics.h
+///
+/// @author Ken Museth
+///
+/// @brief Various diagnostic tools to identify potential issues with
+///        for example narrow-band level sets or fog volumes
+///
+#ifndef OPENVDB_TOOLS_DIAGNOSTICS_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_DIAGNOSTICS_HAS_BEEN_INCLUDED
+
+#include <openvdb/Grid.h>
+#include <openvdb/math/Math.h>
+#include <openvdb/math/Vec3.h>
+#include <openvdb/math/Stencils.h>
+#include <openvdb/math/Operators.h>
+#include <openvdb/tree/LeafManager.h>
+#include <tbb/blocked_range.h>
+#include <tbb/parallel_reduce.h>
+#include <set>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <boost/utility/enable_if.hpp>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Perform checks on a grid to see if it is a valid symmetric,
+/// narrow-band level set.
+///
+/// @param grid      Grid to be checked
+/// @param number    Number of the checks to be performed (see below)
+/// @return string with a message indicating the nature of the
+/// issue. If no issue is detected the return string is empty.
+///
+/// @details @a number refers to the following ordered list of
+/// checks - always starting from the top.
+/// Fast checks
+/// 1: value type is floating point
+/// 2: has level set class type
+/// 3: has uniform scale
+/// 4: background value is positive and n*dx
+///
+/// Slower checks
+/// 5: no active tiles
+/// 6: all the values are finite, i.e. not NaN or infinite
+/// 7: active values in range between +-background
+/// 8: abs of inactive values = background, i.e. assuming a symmetric
+/// narrow band!
+///
+/// Relatively slow check (however multithreaded)
+/// 9: the norm of the gradient is close to one, i.e. it satisfies the Eikonal equation.
+template<class GridType>
+std::string
+checkLevelSet(const GridType& grid, size_t number=9);
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Perform checks on a grid to see if it is a valid fog volume.
+///
+/// @param grid      Grid to be checked
+/// @param number    Number of the checks to be performed (see below)
+/// @return string with a message indicating the nature of the
+/// issue. If no issue is detected the return string is empty.
+///
+/// @details @a number refers to the following ordered list of
+/// checks - always starting from the top.
+/// Fast checks
+/// 1: value type is floating point
+/// 2: has FOG volume class type
+/// 3: background value is zero
+///
+/// Slower checks
+/// 4: all the values are finite, i.e. not NaN or infinite
+/// 5: inactive values are zero
+/// 6: active values are in the range [0,1]
+template<class GridType>
+std::string
+checkFogVolume(const GridType& grid, size_t number=6);
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief  Threaded method to find unique inactive values.
+///
+/// @param grid         A VDB volume.
+/// @param values       List of unique inactive values, returned by this method.
+/// @param numValues    Number of values to look for.
+/// @return @c false if the @a grid has more than @a numValues inactive values.
+template<class GridType>
+bool
+uniqueInactiveValues(const GridType& grid,
+    std::vector<typename GridType::ValueType>& values, size_t numValues);
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Functor that checks whether a value (or any component of a vector value) is NaN
+template <typename GridT,
+          typename TreeIterT = typename GridT::ValueOnCIter>
+struct CheckNan
+{
+    typedef typename VecTraits<typename GridT::ValueType>::ElementType ElementType;
+    typedef TreeIterT TileIterT;
+    typedef typename tree::IterTraits<typename TreeIterT::NodeT, typename TreeIterT::ValueIterT>
+    ::template NodeConverter<typename GridT::TreeType::LeafNodeType>::Type VoxelIterT;
+
+    /// @brief Default constructor
+    CheckNan() {}
+
+    /// Return true if the scalar value is NaN
+    inline bool operator()(const ElementType& v) const { return boost::math::isnan(v); }
+
+    /// @brief Return true if any of the vector components are NaN (checked component-wise)
+    template <typename T>
+    inline typename boost::enable_if_c<VecTraits<T>::IsVec, bool>::type
+    operator()(const T& v) const
+    {
+        for (int i=0; i<VecTraits<T>::Size; ++i) if ((*this)(v[i])) return true;//should unroll
+        return false;
+    }
+
+    /// @brief Return true if the tile at the iterator location is NaN
+    bool operator()(const TreeIterT  &iter) const { return (*this)(*iter); }
+
+    /// @brief Return true if the voxel at the iterator location is NaN
+    bool operator()(const VoxelIterT &iter) const { return (*this)(*iter); }
+
+    /// @brief Return a string describing a failed check.
+    std::string str() const { return "NaN"; }
+
+};// CheckNan
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Checks for infinite values, e.g. 1/0 or -1/0
+template <typename GridT,
+          typename TreeIterT = typename GridT::ValueOnCIter>
+struct CheckInf
+{
+    typedef typename VecTraits<typename GridT::ValueType>::ElementType ElementType;
+    typedef TreeIterT TileIterT;
+    typedef typename tree::IterTraits<typename TreeIterT::NodeT, typename TreeIterT::ValueIterT>
+    ::template NodeConverter<typename GridT::TreeType::LeafNodeType>::Type VoxelIterT;
+
+    /// @brief Default constructor
+    CheckInf() {}
+
+    /// Return true if the value is infinite
+    inline bool operator()(const ElementType& v) const { return boost::math::isinf(v); }
+
+    /// Return true if any of the vector components are infinite.
+    template <typename T> inline typename boost::enable_if_c<VecTraits<T>::IsVec, bool>::type
+    operator()(const T& v) const
+    {
+        for (int i=0; i<VecTraits<T>::Size; ++i) if ((*this)(v[i])) return true;
+        return false;
+    }
+
+    /// @brief Return true if the tile at the iterator location is infinite
+    bool operator()(const TreeIterT  &iter) const { return (*this)(*iter); }
+
+    /// @brief Return true if the voxel at the iterator location is infinite
+    bool operator()(const VoxelIterT &iter) const { return (*this)(*iter); }
+
+    /// @brief Return a string describing a failed check.
+    std::string str() const { return "infinite"; }
+};// CheckInf
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Checks for both NaN and inf values, i.e. any value that is not finite.
+template <typename GridT,
+          typename TreeIterT = typename GridT::ValueOnCIter>
+struct CheckFinite
+{
+    typedef typename VecTraits<typename GridT::ValueType>::ElementType ElementType;
+    typedef TreeIterT TileIterT;
+    typedef typename tree::IterTraits<typename TreeIterT::NodeT, typename TreeIterT::ValueIterT>
+    ::template NodeConverter<typename GridT::TreeType::LeafNodeType>::Type VoxelIterT;
+
+    /// @brief Default constructor
+    CheckFinite() {}
+
+    /// Return true if the value is NOT finite, i.e. it's NaN or infinite
+    inline bool operator()(const ElementType& v) const { return !boost::math::isfinite(v); }
+
+    /// Return true if any of the vector components are NaN or infinite.
+    template <typename T>
+    inline typename boost::enable_if_c<VecTraits<T>::IsVec, bool>::type
+    operator()(const T& v) const {
+        for (int i=0; i<VecTraits<T>::Size; ++i) if ((*this)(v[i])) return true;
+        return false;
+    }
+
+    /// @brief Return true if the tile at the iterator location is NaN or infinite.
+    bool operator()(const TreeIterT  &iter) const { return (*this)(*iter); }
+
+    /// @brief Return true if the voxel at the iterator location is NaN or infinite.
+    bool operator()(const VoxelIterT &iter) const { return (*this)(*iter); }
+
+    /// @brief Return a string describing a failed check.
+    std::string str() const { return "not finite"; }
+};// CheckFinite
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Check that the magnitude of a value, a, is close to a fixed
+/// magnitude, b, given a fixed tolerance c. That is | |a| - |b| | <= c
+template <typename GridT,
+          typename TreeIterT = typename GridT::ValueOffCIter>
+struct CheckMagnitude
+{
+    typedef typename VecTraits<typename GridT::ValueType>::ElementType ElementType;
+    typedef TreeIterT TileIterT;
+    typedef typename tree::IterTraits<typename TreeIterT::NodeT, typename TreeIterT::ValueIterT>
+    ::template NodeConverter<typename GridT::TreeType::LeafNodeType>::Type VoxelIterT;
+
+    /// @brief Constructor taking the reference value and a tolerance (absolute values are stored)
+    CheckMagnitude(const ElementType& a,
+                   const ElementType& t = math::Tolerance<ElementType>::value())
+        : absVal(math::Abs(a)), tolVal(math::Abs(t))
+    {
+    }
+
+    /// Return true if the magnitude of the value differs from
+    /// absVal by more than tolVal.
+    inline bool operator()(const ElementType& v) const
+    {
+        return math::Abs(math::Abs(v) - absVal) > tolVal;
+    }
+
+    /// Return true if any of the vector components fail the magnitude check.
+    template <typename T> inline typename boost::enable_if_c<VecTraits<T>::IsVec, bool>::type
+    operator()(const T& v) const
+    {
+        for (int i=0; i<VecTraits<T>::Size; ++i) if ((*this)(v[i])) return true;
+        return false;
+    }
+
+    /// @brief Return true if the tile at the iterator location fails the magnitude check
+    bool operator()(const TreeIterT  &iter) const { return (*this)(*iter); }
+
+    /// @brief Return true if the voxel at the iterator location fails the magnitude check
+    bool operator()(const VoxelIterT &iter) const { return (*this)(*iter); }
+
+    /// @brief Return a string describing a failed check.
+    std::string str() const
+    {
+        std::ostringstream ss;
+        ss << "not equal to +/-"<<absVal<<" with a tolerance of "<<tolVal;
+        return ss.str();
+    }
+
+    const ElementType absVal, tolVal;
+};// CheckMagnitude
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Checks a value against a range
+template <typename GridT,
+          bool MinInclusive = true,//is min part of the range?
+          bool MaxInclusive = true,//is max part of the range?
+          typename TreeIterT = typename GridT::ValueOnCIter>
+struct CheckRange
+{
+    typedef typename VecTraits<typename GridT::ValueType>::ElementType ElementType;
+    typedef TreeIterT TileIterT;
+    typedef typename tree::IterTraits<typename TreeIterT::NodeT, typename TreeIterT::ValueIterT>
+    ::template NodeConverter<typename GridT::TreeType::LeafNodeType>::Type VoxelIterT;
+
+    /// @brief Constructor taking a range to be tested against; throws ValueError if min > max.
+    CheckRange(const ElementType& _min, const ElementType& _max) : minVal(_min), maxVal(_max)
+    {
+        if (minVal > maxVal) {
+            OPENVDB_THROW(ValueError, "CheckRange: Invalid range (min > max)");
+        }
+    }
+
+    /// Return true if the value is outside the range; MinInclusive and
+    /// MaxInclusive decide whether a value equal to min or max counts as inside.
+    inline bool operator()(const ElementType& v) const
+    {
+        return (MinInclusive ? v<minVal : v<=minVal) ||
+               (MaxInclusive ? v>maxVal : v>=maxVal);
+    }
+
+    /// Return true if any of the vector components are out of range.
+    template <typename T>
+    inline typename boost::enable_if_c<VecTraits<T>::IsVec, bool>::type
+    operator()(const T& v) const {
+        for (int i=0; i<VecTraits<T>::Size; ++i) if ((*this)(v[i])) return true;
+        return false;
+    }
+
+    /// @brief Return true if the tile at the iterator location is out of range.
+    bool operator()(const TreeIterT  &iter) const { return (*this)(*iter); }
+
+    /// @brief Return true if the voxel at the iterator location is out of range.
+    bool operator()(const VoxelIterT &iter) const { return (*this)(*iter); }
+
+    /// @brief Return a string describing a failed check.
+    std::string str() const
+    {
+        std::ostringstream ss;
+        ss << "outside the value range " << (MinInclusive ? "[" : "]")
+           << minVal << "," << maxVal    << (MaxInclusive ? "]" : "[");
+        return ss.str();
+    }
+
+    const ElementType minVal, maxVal;
+};// CheckRange
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Functor that flags values falling below a minimum
+template <typename GridT,
+          typename TreeIterT = typename GridT::ValueOnCIter>
+struct CheckMin
+{
+    typedef typename VecTraits<typename GridT::ValueType>::ElementType ElementType;
+    typedef TreeIterT TileIterT;
+    typedef typename tree::IterTraits<typename TreeIterT::NodeT, typename TreeIterT::ValueIterT>
+    ::template NodeConverter<typename GridT::TreeType::LeafNodeType>::Type VoxelIterT;
+
+    /// @brief Constructor taking the minimum that values are compared with.
+    CheckMin(const ElementType& _min)
+        : minVal(_min)
+    {
+    }
+
+    /// @brief Return true if the scalar @a v is smaller than the minimum.
+    inline bool operator()(const ElementType& v) const
+    {
+        return v < minVal;
+    }
+
+    /// @brief Return true if at least one component of the vector @a v
+    /// is smaller than the minimum.
+    template <typename T>
+    inline typename boost::enable_if_c<VecTraits<T>::IsVec, bool>::type
+    operator()(const T& v) const
+    {
+        bool tooSmall = false;
+        for (int n = 0; !tooSmall && n < VecTraits<T>::Size; ++n) {
+            tooSmall = (*this)(v[n]);
+        }
+        return tooSmall;
+    }
+
+    /// @brief Return true if the value at the tree iterator location is smaller than min.
+    bool operator()(const TreeIterT& iter) const
+    {
+        return (*this)(*iter);
+    }
+
+    /// @brief Return true if the voxel at the iterator location is smaller than min.
+    bool operator()(const VoxelIterT& iter) const
+    {
+        return (*this)(*iter);
+    }
+
+    /// @brief Return a string describing a failed check.
+    std::string str() const
+    {
+        std::ostringstream os;
+        os << "smaller than ";
+        os << minVal;
+        return os.str();
+    }
+
+    const ElementType minVal;
+};// CheckMin
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Functor that flags values exceeding a maximum
+template <typename GridT,
+          typename TreeIterT = typename GridT::ValueOnCIter>
+struct CheckMax
+{
+    typedef typename VecTraits<typename GridT::ValueType>::ElementType ElementType;
+    typedef TreeIterT TileIterT;
+    typedef typename tree::IterTraits<typename TreeIterT::NodeT, typename TreeIterT::ValueIterT>
+    ::template NodeConverter<typename GridT::TreeType::LeafNodeType>::Type VoxelIterT;
+
+    /// @brief Constructor taking the maximum that values are compared with.
+    CheckMax(const ElementType& _max)
+        : maxVal(_max)
+    {
+    }
+
+    /// @brief Return true if the scalar @a v is larger than the maximum.
+    inline bool operator()(const ElementType& v) const
+    {
+        return v > maxVal;
+    }
+
+    /// @brief Return true if at least one component of the vector @a v
+    /// is larger than the maximum.
+    template <typename T>
+    inline typename boost::enable_if_c<VecTraits<T>::IsVec, bool>::type
+    operator()(const T& v) const
+    {
+        bool tooLarge = false;
+        for (int n = 0; !tooLarge && n < VecTraits<T>::Size; ++n) {
+            tooLarge = (*this)(v[n]);
+        }
+        return tooLarge;
+    }
+
+    /// @brief Return true if the value at the tree iterator location is larger than max.
+    bool operator()(const TreeIterT& iter) const
+    {
+        return (*this)(*iter);
+    }
+
+    /// @brief Return true if the voxel at the iterator location is larger than max.
+    bool operator()(const VoxelIterT& iter) const
+    {
+        return (*this)(*iter);
+    }
+
+    /// @brief Return a string describing a failed check.
+    std::string str() const
+    {
+        std::ostringstream os;
+        os << "larger than ";
+        os << maxVal;
+        return os.str();
+    }
+
+    const ElementType maxVal;
+};// CheckMax
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Functor that tests the norm of the gradient against a range,
+/// i.e. @f$|\nabla\phi|\in[min,max]@f$
+///
+/// @note The comparison is carried out on squared quantities,
+/// i.e. @f$|\nabla\phi|^2\in[min^2,max^2]@f$, which is why the
+/// squares of the end points are stored.
+template<typename GridT,
+         typename TreeIterT = typename GridT::ValueOnCIter,
+         math::BiasedGradientScheme GradScheme = math::FIRST_BIAS>//math::WENO5_BIAS>
+struct CheckNormGrad
+{
+    typedef typename GridT::ValueType ValueType;
+    BOOST_STATIC_ASSERT(boost::is_floating_point<ValueType>::value);
+    typedef TreeIterT TileIterT;
+    typedef typename tree::IterTraits<typename TreeIterT::NodeT, typename TreeIterT::ValueIterT>
+    ::template NodeConverter<typename GridT::TreeType::LeafNodeType>::Type VoxelIterT;
+    typedef typename GridT::ConstAccessor AccT;
+
+    /// @brief Constructor taking the grid and the end points of the range.
+    /// @throw ValueError if the grid does not have uniform voxels
+    /// or if @a _min is larger than @a _max.
+    CheckNormGrad(const GridT&  grid, const ValueType& _min, const ValueType& _max)
+        : acc(grid.getConstAccessor())
+        , invdx2(ValueType(1.0/math::Pow2(grid.voxelSize()[0])))
+        , minVal2(_min*_min)
+        , maxVal2(_max*_max)
+    {
+        if (!grid.hasUniformVoxels()) {
+            OPENVDB_THROW(ValueError, "CheckNormGrad: The transform must have uniform scale");
+        }
+        if (_min > _max) {
+            OPENVDB_THROW(ValueError, "CheckNormGrad: Invalid range (min > max)");
+        }
+    }
+
+    /// @brief Copy constructor that builds a new accessor on the tree
+    /// referenced by the accessor of @a other.
+    CheckNormGrad(const CheckNormGrad& other)
+        : acc(other.acc.tree())
+        , invdx2(other.invdx2)
+        , minVal2(other.minVal2)
+        , maxVal2(other.maxVal2)
+    {
+    }
+
+    /// @brief Return true if @a v, interpreted as a squared norm, is
+    /// smaller than min^2 or larger than max^2.
+    inline bool operator()(const ValueType& v) const
+    {
+        if (v < minVal2) return true;
+        return v > maxVal2;
+    }
+
+    /// @brief Return true if zero is outside the range.
+    /// @note The norm of the gradient of a tile is assumed to always be zero.
+    inline bool operator()(const TreeIterT&) const
+    {
+        return (*this)(ValueType(0));
+    }
+
+    /// @brief Return true if the norm of the gradient at the voxel
+    /// location of the iterator is out of range.
+    inline bool operator()(const VoxelIterT &iter) const
+    {
+        const Coord xyz = iter.getCoord();
+        // Index-space squared norm, scaled by 1/dx^2.
+        return (*this)(invdx2 * math::ISGradientNormSqrd<GradScheme>::result(acc, xyz));
+    }
+
+    /// @brief Return a string describing a failed check.
+    std::string str() const
+    {
+        std::ostringstream os;
+        os << "outside the range of NormGrad [" << math::Sqrt(minVal2);
+        os << "," << math::Sqrt(maxVal2) << "]";
+        return os.str();
+    }
+
+    AccT acc;
+    const ValueType invdx2, minVal2, maxVal2;
+};// CheckNormGrad
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Checks the norm of the gradient at zero-crossing voxels against a range
+/// @details CheckEikonal differs from CheckNormGrad in that it only
+/// checks the norm of the gradient at voxel locations where the
+/// FD-stencil crosses the zero isosurface!
+template<typename GridT,
+         typename TreeIterT = typename GridT::ValueOnCIter,
+         typename StencilT  = math::WenoStencil<GridT> >//math::GradStencil<GridT>
+struct CheckEikonal
+{
+    typedef typename GridT::ValueType ValueType;
+    BOOST_STATIC_ASSERT(boost::is_floating_point<ValueType>::value);
+    typedef TreeIterT TileIterT;
+    typedef typename tree::IterTraits<typename TreeIterT::NodeT, typename TreeIterT::ValueIterT>
+    ::template NodeConverter<typename GridT::TreeType::LeafNodeType>::Type VoxelIterT;
+
+    /// @brief Constructor taking a grid and a range to be tested against.
+    /// @throw ValueError if the grid does not have uniform voxels
+    /// or if @a _min is larger than @a _max.
+    CheckEikonal(const GridT&  grid, const ValueType& _min, const ValueType& _max)
+        : stencil(grid), minVal(_min), maxVal(_max)
+    {
+        if ( !grid.hasUniformVoxels() ) {
+            OPENVDB_THROW(ValueError, "CheckEikonal: The transform must have uniform scale");
+        }
+        if (minVal > maxVal) {
+            OPENVDB_THROW(ValueError, "CheckEikonal: Invalid range (min > max)");
+        }
+    }
+
+    /// @brief Copy constructor that builds a new stencil on the grid
+    /// referenced by the stencil of @a other.
+    CheckEikonal(const CheckEikonal& other)
+        : stencil(other.stencil.grid()), minVal(other.minVal), maxVal(other.maxVal)
+    {
+    }
+
+    /// Return true if the value is smaller than min or larger than max.
+    inline bool operator()(const ValueType& v) const { return v<minVal || v>maxVal; }
+
+    /// @brief Return true if zero is outside the range.
+    /// @note We assume that the norm of the gradient of a tile is always zero.
+    inline bool operator()(const TreeIterT&) const { return (*this)(ValueType(0)); }
+
+    /// @brief Return true if the norm of the gradient at a
+    /// zero-crossing voxel location of the iterator is out of range.
+    inline bool operator()(const VoxelIterT &iter) const
+    {
+        stencil.moveTo(iter);
+        if (!stencil.zeroCrossing()) return false;
+        // The stencil reports the squared norm whereas minVal and maxVal bound
+        // the norm itself, so take the square root before the range test.
+        return (*this)(math::Sqrt(stencil.normSqGrad()));
+    }
+
+    /// @brief Return a string describing a failed check.
+    std::string str() const
+    {
+        std::ostringstream ss;
+        ss << "outside the range of NormGrad ["<<minVal<<","<<maxVal<<"]";
+        return ss.str();
+    }
+
+    mutable StencilT stencil;// mutable because moveTo() is called from const operators
+    const ValueType minVal, maxVal;
+};// CheckEikonal
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Checks the divergence against a range
+///
+/// @details The divergence of a vector-valued grid is a scalar, so the range
+/// and the inverse voxel size are stored with the scalar ElementType of the
+/// grid's value type.
+template<typename GridT,
+         typename TreeIterT = typename GridT::ValueOnCIter,
+         math::DScheme DiffScheme = math::CD_2ND>
+struct CheckDivergence
+{
+    typedef typename GridT::ValueType ValueType;
+    typedef typename VecTraits<ValueType>::ElementType ElementType;
+    BOOST_STATIC_ASSERT(boost::is_floating_point<ElementType>::value);
+    typedef TreeIterT TileIterT;
+    typedef typename tree::IterTraits<typename TreeIterT::NodeT, typename TreeIterT::ValueIterT>
+    ::template NodeConverter<typename GridT::TreeType::LeafNodeType>::Type VoxelIterT;
+    typedef typename GridT::ConstAccessor AccT;
+
+    /// @brief Constructor taking a grid and a (scalar) range to be tested against.
+    /// @throw ValueError if the grid does not have uniform voxels
+    /// or if @a _min is larger than @a _max.
+    CheckDivergence(const GridT&  grid,
+                    const ElementType& _min,
+                    const ElementType& _max)
+        : acc(grid.getConstAccessor())
+        , invdx(ElementType(1.0/grid.voxelSize()[0]))
+        , minVal(_min)
+        , maxVal(_max)
+    {
+        if ( !grid.hasUniformVoxels() ) {
+            OPENVDB_THROW(ValueError, "CheckDivergence: The transform must have uniform scale");
+        }
+        if (minVal > maxVal) {
+            OPENVDB_THROW(ValueError, "CheckDivergence: Invalid range (min > max)");
+        }
+    }
+
+    /// Return true if the value is smaller than min or larger than max.
+    inline bool operator()(const ElementType& v) const { return v<minVal || v>maxVal; }
+
+    /// @brief Return true if zero is outside the range.
+    /// @note We assume that the divergence of a tile is always zero.
+    inline bool operator()(const TreeIterT&) const { return (*this)(ElementType(0)); }
+
+    /// @brief Return true if the divergence at a voxel location of
+    /// the iterator is out of range.
+    inline bool operator()(const VoxelIterT &iter) const
+    {
+        const Coord ijk = iter.getCoord();
+        // Index-space divergence, scaled by 1/dx.
+        return (*this)(invdx * math::ISDivergence<DiffScheme>::result(acc, ijk));
+    }
+
+    /// @brief Return a string describing a failed check.
+    std::string str() const
+    {
+        std::ostringstream ss;
+        ss << "outside the range of divergence ["<<minVal<<","<<maxVal<<"]";
+        return ss.str();
+    }
+
+    AccT acc;
+    const ElementType invdx, minVal, maxVal;
+};// CheckDivergence
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Performs multithreaded diagnostics of a grid
+///
+/// @details A check is a functor (e.g. CheckRange or CheckMin) that returns
+/// true for a value that fails. Diagnose applies such a functor to the
+/// background, the tiles and the voxels of a grid, counts the failures and
+/// optionally records their locations in a boolean mask grid.
+template <typename GridT>
+class Diagnose
+{
+public:
+    typedef typename GridT::template ValueConverter<bool>::Type  MaskType;
+
+    /// @brief Constructor taking the grid to be diagnosed.
+    /// @note Only a pointer to @a grid is stored. The mask is given a copy
+    /// of the grid's transform.
+    Diagnose(const GridT& grid) : mGrid(&grid), mMask(new MaskType()), mCount(0)
+    {
+        mMask->setTransform(grid.transformPtr()->copy());
+    }
+
+    /// @brief Apply a check to the grid and return a description of the failures.
+    ///
+    /// @param check            functor that returns true for a failing value
+    /// @param updateMask       if true, failing tiles and voxels are marked in the mask
+    /// @param checkVoxels      if true, the voxels visited by CheckT::VoxelIterT are tested
+    /// @param checkTiles       if true, the tiles visited by CheckT::TileIterT are tested
+    /// @param checkBackground  if true, the background value is tested
+    ///
+    /// @return The concatenated messages of the background, tile and voxel
+    /// tests (in that order), or an empty string if nothing failed.
+    template <typename CheckT>
+    std::string check(const CheckT& check,
+                      bool updateMask = false,
+                      bool checkVoxels = true,
+                      bool checkTiles = true,
+                      bool checkBackground = true)
+    {
+        typename MaskType::TreeType* mask = updateMask ? &(mMask->tree()) : NULL;
+        CheckValues<CheckT> cc(mask, mGrid, check);
+        std::ostringstream ss;
+        if (checkBackground) ss << cc.checkBackground();
+        if (checkTiles)      ss << cc.checkTiles();
+        if (checkVoxels)     ss << cc.checkVoxels();
+        mCount += cc.mCount;
+        return ss.str();
+    }
+
+    //@{
+    /// @brief Return a boolean mask of all the values
+    /// (i.e. tiles and/or voxels) that have failed one or
+    /// more checks.
+    typename MaskType::ConstPtr mask() const { return mMask; }
+    typename MaskType::Ptr mask() { return mMask; }
+    //@}
+
+    /// @brief Return the number of active voxels in the mask, i.e. the
+    /// number of values recorded as having failed one or more checks.
+    Index64 valueCount() const { return mMask->activeVoxelCount(); }
+
+    /// @brief Return total number of failed checks
+    /// @note If only one check was performed and the mask was updated
+    /// failureCount equals valueCount.
+    Index64 failureCount() const { return mCount; }
+
+    /// @brief Return a const reference to the grid
+    const GridT& grid() const { return *mGrid; }
+
+    /// @brief Clear the mask and error counter
+    /// @note The new mask is given a copy of the grid's transform, exactly
+    /// like the mask created by the constructor.
+    void clear()
+    {
+        // A raw pointer cannot be assigned to a shared pointer, hence reset().
+        mMask.reset(new MaskType());
+        mMask->setTransform(mGrid->transformPtr()->copy());
+        mCount = 0;
+    }
+
+private:
+    // disallow copy construction and copy by assignment!
+    Diagnose(const Diagnose&);// not implemented
+    Diagnose& operator=(const Diagnose&);// not implemented
+
+    const GridT*           mGrid;
+    typename MaskType::Ptr mMask;
+    Index64                mCount;
+
+    /// @brief Private class that performs the multithreaded checks
+    ///
+    /// @details It is used as the body of tbb::parallel_reduce, i.e. it has a
+    /// splitting constructor, a range operator and a join method.
+    template <typename CheckT>
+    struct CheckValues
+    {
+        typedef typename MaskType::TreeType MaskT;
+        typedef typename GridT::TreeType::LeafNodeType LeafT;
+        typedef typename tree::LeafManager<const typename GridT::TreeType> LeafManagerT;
+        const bool      mOwnsMask;// true only for instances created by splitting
+        MaskT*          mMask;// NULL if failures are not to be recorded
+        const GridT*    mGrid;
+        const CheckT    mCheck;
+        Index64         mCount;
+
+        /// @brief Constructor that borrows (does not own) the mask tree.
+        CheckValues(MaskT* mask, const GridT* grid, const CheckT& check)
+            : mOwnsMask(false)
+            , mMask(mask)
+            , mGrid(grid)
+            , mCheck(check)
+            , mCount(0)
+        {
+        }
+        /// @brief Splitting constructor: allocates a private mask tree if
+        /// the original has a mask, and starts with a zero failure count.
+        CheckValues(CheckValues& other, tbb::split)
+            : mOwnsMask(true)
+            , mMask(other.mMask ? new MaskT() : NULL)
+            , mGrid(other.mGrid)
+            , mCheck(other.mCheck)
+            , mCount(0)
+        {
+        }
+        ~CheckValues() { if (mOwnsMask) delete mMask; }
+
+        /// @brief Test the background value and return a message if it fails.
+        std::string checkBackground()
+        {
+            std::ostringstream ss;
+            if (mCheck(mGrid->background())) {
+                ++mCount;
+                ss << "Background is " + mCheck.str() << std::endl;
+            }
+            return ss.str();
+        }
+
+        /// @brief Serially test the tiles and return a message if any fail.
+        std::string checkTiles()
+        {
+            std::ostringstream ss;
+            const Index64 n = mCount;
+            typename CheckT::TileIterT i(mGrid->tree());
+            // Limit the iterator depth to one level less than the root's level.
+            for (i.setMaxDepth(GridT::TreeType::RootNodeType::LEVEL - 1); i; ++i) {
+                if (mCheck(i)) {
+                    ++mCount;
+                    if (mMask) mMask->fill(i.getBoundingBox(), true, true);
+                }
+            }
+            if (const Index64 m = mCount - n) {
+                ss << m << " tile" << (m==1 ? " is " : "s are ") + mCheck.str() << std::endl;
+            }
+            return ss.str();
+        }
+
+        /// @brief Test the voxels of all leaf nodes with tbb::parallel_reduce
+        /// and return a message if any fail.
+        std::string checkVoxels()
+        {
+            std::ostringstream ss;
+            LeafManagerT leafs(mGrid->tree());
+            const Index64 n = mCount;
+            tbb::parallel_reduce(leafs.leafRange(), *this);
+            if (const Index64 m = mCount - n) {
+                ss << m << " voxel" << (m==1 ? " is " : "s are ") + mCheck.str() << std::endl;
+            }
+            return ss.str();
+        }
+
+        /// @brief Test the voxels of the leaf nodes in the range @a r.
+        void operator()(const typename LeafManagerT::LeafRange& r)
+        {
+            typedef typename CheckT::VoxelIterT VoxelIterT;
+            if (mMask) {
+                for (typename LeafManagerT::LeafRange::Iterator i=r.begin(); i; ++i) {
+                    // The mask leaf is only created once the first failure is found.
+                    typename MaskT::LeafNodeType* maskLeaf = NULL;
+                    for (VoxelIterT j = tree::IterTraits<LeafT, VoxelIterT>::begin(*i); j; ++j) {
+                        if (mCheck(j)) {
+                            ++mCount;
+                            if (maskLeaf == NULL) maskLeaf = mMask->touchLeaf(j.getCoord());
+                            maskLeaf->setValueOn(j.pos(), true);
+                        }
+                    }
+                }
+            } else {
+                for (typename LeafManagerT::LeafRange::Iterator i=r.begin(); i; ++i) {
+                    for (VoxelIterT j = tree::IterTraits<LeafT, VoxelIterT>::begin(*i); j; ++j) {
+                        if (mCheck(j)) ++mCount;
+                    }
+                }
+            }
+        }
+        /// @brief Merge the mask and the failure count of @a other into this instance.
+        void join(const CheckValues& other)
+        {
+            if (mMask) mMask->merge(*(other.mMask), openvdb::MERGE_ACTIVE_STATES_AND_NODES);
+            mCount += other.mCount;
+        }
+    };//End of private class CheckValues
+
+};// End of public class Diagnose
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Runs a collection of consistency checks on a narrow-band level set.
+///
+/// @note Typical usage is a single call to CheckLevelSet::check()
+template<class GridType>
+class CheckLevelSet
+{
+public:
+    typedef typename GridType::ValueType ValueType;
+    typedef typename GridType::template ValueConverter<bool>::Type  MaskType;
+
+    /// @brief Constructor taking the grid to be checked.
+    CheckLevelSet(const GridType& grid)
+        : mDiagnose(grid)
+    {
+    }
+
+    //@{
+    /// @brief Return the boolean mask of the values (i.e. tiles and/or
+    /// voxels) that have failed at least one check.
+    typename MaskType::ConstPtr mask() const
+    {
+        return mDiagnose.mask();
+    }
+    typename MaskType::Ptr mask()
+    {
+        return mDiagnose.mask();
+    }
+    //@}
+
+    /// @brief Return the number of values that have failed at least one
+    /// check, as reported by the underlying Diagnose object.
+    Index64 valueCount() const
+    {
+        return mDiagnose.valueCount();
+    }
+
+    /// @brief Return the total number of failed checks
+    /// @note If only one check was performed and the mask was updated
+    /// failureCount equals valueCount.
+    Index64 failureCount() const
+    {
+        return mDiagnose.failureCount();
+    }
+
+    /// @brief Return a const reference to the grid being checked
+    const GridType& grid() const
+    {
+        return mDiagnose.grid();
+    }
+
+    /// @brief Reset the mask and the failure counter
+    void clear()
+    {
+        mDiagnose.clear();
+    }
+
+    /// @brief Return a nonempty message if the grid's value type is
+    /// not a floating point type.
+    ///
+    /// @note No run-time overhead
+    static std::string checkValueType()
+    {
+        if (boost::is_floating_point<ValueType>::value) return "";
+        return "Value type is not floating point\n";
+    }
+
+    /// @brief Return a nonempty message if the grid's class is not a level set.
+    ///
+    /// @note Small run-time overhead
+    std::string checkClassType() const
+    {
+        if (mDiagnose.grid().getGridClass() == GRID_LEVEL_SET) return "";
+        return "Class type is not \"GRID_LEVEL_SET\"\n";
+    }
+
+    /// @brief Return a nonempty message if the grid does not have uniform voxels.
+    ///
+    /// @note Small run-time overhead
+    std::string checkTransform() const
+    {
+        if (mDiagnose.grid().hasUniformVoxels()) return "";
+        return "Does not have uniform voxels\n";
+    }
+
+    /// @brief Return a nonempty message if the background value, measured
+    /// in voxel units, is smaller than @a halfWidth.
+    ///
+    /// @note Small run-time overhead
+    std::string checkBackground(Real halfWidth = LEVEL_SET_HALF_WIDTH) const
+    {
+        const Real widthInVoxels =
+            mDiagnose.grid().background() / mDiagnose.grid().voxelSize()[0];
+        if (!(widthInVoxels < halfWidth)) return "";
+
+        std::ostringstream os;
+        os << "The background value (" << mDiagnose.grid().background();
+        os << ") is less than " << halfWidth << " voxel units\n";
+        return os.str();
+    }
+
+    /// @brief Return a nonempty message if the grid has active tile values.
+    ///
+    /// @note Medium run-time overhead
+    std::string checkTiles() const
+    {
+        if (mDiagnose.grid().tree().hasActiveTiles()) return "Has active tile values\n";
+        return "";
+    }
+
+    /// @brief Return a nonempty message if any value (voxel, tile or
+    /// background) fails the CheckFinite test.
+    ///
+    /// @note Medium run-time overhead
+    std::string checkFinite(bool updateMask = false)
+    {
+        const bool voxels = true, tiles = true, background = true;
+        CheckFinite<GridType,typename GridType::ValueAllCIter> finiteOp;
+        return mDiagnose.check(finiteOp, updateMask, voxels, tiles, background);
+    }
+
+    /// @brief Return a nonempty message if any active voxel value is
+    /// outside of the range [-background, background].
+    ///
+    /// @note Medium run-time overhead
+    std::string checkRange(bool updateMask = false)
+    {
+        const bool voxels = true, tiles = false, background = false;
+        const ValueType& bg = mDiagnose.grid().background();
+        CheckRange<GridType> rangeOp(-bg, bg);
+        return mDiagnose.check(rangeOp, updateMask, voxels, tiles, background);
+    }
+
+    /// @brief Return a nonempty message if any inactive value (voxel or
+    /// tile) fails the CheckMagnitude test against the background value.
+    ///
+    /// @note Medium run-time overhead
+    std::string checkInactiveValues(bool updateMask = false)
+    {
+        const bool voxels = true, tiles = true, background = false;
+        const ValueType& bg = mDiagnose.grid().background();
+        CheckMagnitude<GridType, typename GridType::ValueOffCIter> magnitudeOp(bg);
+        return mDiagnose.check(magnitudeOp, updateMask, voxels, tiles, background);
+    }
+
+    /// @brief Return a nonempty message if any active voxel fails the
+    /// CheckEikonal test for the range @a minV to @a maxV.
+    ///
+    /// @note Significant run-time overhead
+    std::string checkEikonal(bool updateMask = false, ValueType minV = 0.5, ValueType maxV = 1.5)
+    {
+        const bool voxels = true, tiles = false, background = false;
+        CheckEikonal<GridType> eikonalOp(mDiagnose.grid(), minV, maxV);
+        return mDiagnose.check(eikonalOp, updateMask, voxels, tiles, background);
+    }
+
+    /// @brief Return the message of the first test that fails, or an empty
+    /// string if none of them do. Test 1 is always performed; the remaining
+    /// tests are only performed if their number is lower than or equal to n:
+    ///
+    /// Fast checks
+    /// 1: value type is floating point
+    /// 2: has level set class type
+    /// 3: has uniform scale
+    /// 4: background value is at least the default half width in voxel units
+    ///
+    /// Slower checks
+    /// 5: no active tiles
+    /// 6: all the values are finite, i.e not NaN or infinite
+    /// 7: active values in range between +-background
+    /// 8: abs of inactive values = background, i.e. assuming a symmetric narrow band!
+    ///
+    /// Relatively slow check (however multi-threaded)
+    /// 9: the CheckEikonal test with its default range
+    std::string check(size_t n=9, bool updateMask = false)
+    {
+        // Stop as soon as a test reports a problem or the limit n is reached.
+        std::string msg = this->checkValueType();
+        if (!msg.empty() || n < 2) return msg;
+
+        msg = this->checkClassType();
+        if (!msg.empty() || n < 3) return msg;
+
+        msg = this->checkTransform();
+        if (!msg.empty() || n < 4) return msg;
+
+        msg = this->checkBackground();
+        if (!msg.empty() || n < 5) return msg;
+
+        msg = this->checkTiles();
+        if (!msg.empty() || n < 6) return msg;
+
+        msg = this->checkFinite(updateMask);
+        if (!msg.empty() || n < 7) return msg;
+
+        msg = this->checkRange(updateMask);
+        if (!msg.empty() || n < 8) return msg;
+
+        msg = this->checkInactiveValues(updateMask);
+        if (!msg.empty() || n < 9) return msg;
+
+        return this->checkEikonal(updateMask);
+    }
+
+private:
+    // copy construction and copy assignment are intentionally unavailable
+    CheckLevelSet(const CheckLevelSet&);// not implemented
+    CheckLevelSet& operator=(const CheckLevelSet&);// not implemented
+
+    // Member data
+    Diagnose<GridType> mDiagnose;
+};// CheckLevelSet
+
+/// @brief Run tests 1 through @a n of CheckLevelSet::check() on @a grid,
+/// without updating the mask, and return the resulting message.
+template<class GridType>
+std::string
+checkLevelSet(const GridType& grid, size_t n)
+{
+    const bool updateMask = false;
+    CheckLevelSet<GridType> checker(grid);
+    return checker.check(n, updateMask);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Class that performs various types of checks on fog volumes.
+///
+/// @note The most common usage is to simply call CheckFogVolume::check()
+template<class GridType>
+class CheckFogVolume
+{
+public:
+    typedef typename GridType::ValueType ValueType;
+    typedef typename GridType::template ValueConverter<bool>::Type  MaskType;
+
+    /// @brief Constructor taking the grid to be checked.
+    CheckFogVolume(const GridType& grid) : mDiagnose(grid) {}
+
+    //@{
+    /// @brief Return a boolean mask of all the values
+    /// (i.e. tiles and/or voxels) that have failed one or
+    /// more checks.
+    typename MaskType::ConstPtr mask() const { return mDiagnose.mask(); }
+    typename MaskType::Ptr mask() { return mDiagnose.mask(); }
+    //@}
+
+    /// @brief Return the number of values (i.e. background, tiles or
+    /// voxels) that have failed one or more checks.
+    Index64 valueCount() const { return mDiagnose.valueCount(); }
+
+    /// @brief Return total number of failed checks
+    /// @note If only one check was performed and the mask was updated
+    /// failureCount equals valueCount.
+    Index64 failureCount() const { return mDiagnose.failureCount(); }
+
+    /// @brief Return a const reference to the grid
+    const GridType& grid() const { return mDiagnose.grid(); }
+
+    /// @brief Clear the mask and error counter
+    void clear() { mDiagnose.clear(); }
+
+    /// @brief Return a nonempty message if the grid's value type is
+    /// not a floating point type.
+    ///
+    /// @note No run-time overhead
+    static std::string checkValueType()
+    {
+        static const bool test = boost::is_floating_point<ValueType>::value;
+        return test ? "" : "Value type is not floating point";
+    }
+
+    /// @brief Return a nonempty message if the grid's class is not a fog volume.
+    ///
+    /// @note Small run-time overhead
+    std::string checkClassType() const
+    {
+        const bool test = mDiagnose.grid().getGridClass() == GRID_FOG_VOLUME;
+        // Name the class that is actually tested for (this used to say GRID_LEVEL_SET).
+        return test ? "" : "Class type is not \"GRID_FOG_VOLUME\"";
+    }
+
+    /// @brief Return a nonempty message if the background value is not
+    /// approximately zero.
+    ///
+    /// @note Small run-time overhead
+    std::string checkBackground() const
+    {
+        if (!math::isApproxZero(mDiagnose.grid().background())) {
+            std::ostringstream ss;
+            ss << "The background value ("<< mDiagnose.grid().background()<<") is not zero";
+            return ss.str();
+        }
+        return "";
+    }
+
+    /// @brief Return a nonempty message if any value (voxel, tile or
+    /// background) fails the CheckFinite test.
+    ///
+    /// @note Medium run-time overhead
+    std::string checkFinite(bool updateMask = false)
+    {
+        CheckFinite<GridType,typename GridType::ValueAllCIter> c;
+        return mDiagnose.check(c, updateMask, /*voxel*/true, /*tiles*/true, /*background*/true);
+    }
+
+    /// @brief Return a nonempty message if any inactive value (voxel or
+    /// tile) or the background fails the CheckMagnitude test against zero.
+    ///
+    /// @note Medium run-time overhead
+    std::string checkInactiveValues(bool updateMask = false)
+    {
+        CheckMagnitude<GridType, typename GridType::ValueOffCIter> c(0);
+        return mDiagnose.check(c, updateMask, /*voxel*/true, /*tiles*/true, /*background*/true);
+    }
+
+    /// @brief Return a nonempty message if any active value (voxel or
+    /// tile) is out-of-range, i.e. not in the range [0,1].
+    ///
+    /// @note Medium run-time overhead
+    std::string checkRange(bool updateMask = false)
+    {
+        CheckRange<GridType> c(0, 1);
+        return mDiagnose.check(c, updateMask, /*voxel*/true, /*tiles*/true, /*background*/false);
+    }
+
+    /// @brief Return the message of the first test that fails, or an empty
+    /// string if none of them do. Test 1 is always performed; the remaining
+    /// tests are only performed if their number is lower than or equal to n:
+    ///
+    /// Fast checks
+    /// 1: value type is floating point
+    /// 2: has FOG volume class type
+    /// 3: background value is zero
+    ///
+    /// Slower checks
+    /// 4: all the values are finite, i.e not NaN or infinite
+    /// 5: inactive values are zero
+    /// 6: active values are in the range [0,1]
+    std::string check(size_t n=6, bool updateMask = false)
+    {
+        std::string str = this->checkValueType();
+        if (str.empty() && n>1) str = this->checkClassType();
+        if (str.empty() && n>2) str = this->checkBackground();
+        if (str.empty() && n>3) str = this->checkFinite(updateMask);
+        if (str.empty() && n>4) str = this->checkInactiveValues(updateMask);
+        if (str.empty() && n>5) str = this->checkRange(updateMask);
+        return str;
+    }
+
+private:
+    // disallow copy construction and copy by assignment!
+    CheckFogVolume(const CheckFogVolume&);// not implemented
+    CheckFogVolume& operator=(const CheckFogVolume&);// not implemented
+
+    // Member data
+    Diagnose<GridType> mDiagnose;
+};// CheckFogVolume
+
+/// @brief Run tests 1 through @a n of CheckFogVolume::check() on @a grid,
+/// without updating the mask, and return the resulting message.
+template<class GridType>
+std::string
+checkFogVolume(const GridType& grid, size_t n)
+{
+    const bool updateMask = false;
+    CheckFogVolume<GridType> checker(grid);
+    return checker.check(n, updateMask);
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Internal utility objects and implementation details
+
+
+namespace diagnostics_internal {
+
+
+/// @brief Collects the distinct values of the inactive voxels stored in the
+/// leaf nodes of a LeafManager.
+///
+/// @details The class has a splitting constructor, a range operator and a
+/// join method, and runParallel() passes it to tbb::parallel_reduce. The
+/// range operator requests cancellation once more than @c numValues
+/// distinct values have been collected.
+template<typename TreeType>
+class InactiveVoxelValues
+{
+public:
+    typedef tree::LeafManager<TreeType> LeafArray;
+    typedef typename TreeType::ValueType ValueType;
+    typedef std::set<ValueType> SetType;
+
+    /// @brief Constructor taking the leaf nodes to visit and the number of
+    /// distinct values above which the search is cancelled.
+    InactiveVoxelValues(LeafArray&, size_t numValues);
+
+    /// @brief Process all leaf nodes with tbb::parallel_reduce.
+    void runParallel();
+    /// @brief Process all leaf nodes in the calling thread.
+    void runSerial();
+
+    /// @brief Insert the collected values into the given set.
+    void getInactiveValues(SetType&) const;
+
+    // Methods used by tbb::parallel_reduce
+    inline InactiveVoxelValues(const InactiveVoxelValues<TreeType>&, tbb::split);
+    inline void operator()(const tbb::blocked_range<size_t>&);
+    inline void join(const InactiveVoxelValues<TreeType>&);
+
+private:
+    LeafArray& mLeafArray;// leaf nodes to visit (not owned)
+    SetType mInactiveValues;// distinct inactive values found so far
+    size_t mNumValues;// cancel once more distinct values than this are found
+};// InactiveVoxelValues
+
+/// Store a reference to the leaf array and the limit on the number of
+/// distinct values; the set of collected values starts out empty.
+template<typename TreeType>
+InactiveVoxelValues<TreeType>::InactiveVoxelValues(LeafArray& leafs, size_t numValues)
+    : mLeafArray(leafs), mInactiveValues(), mNumValues(numValues)
+{
+}
+
+/// Splitting constructor: shares the leaf array and the limit of @a rhs
+/// but starts with an empty set of collected values.
+template <typename TreeType>
+inline
+InactiveVoxelValues<TreeType>::InactiveVoxelValues(
+    const InactiveVoxelValues<TreeType>& rhs, tbb::split)
+    : mLeafArray(rhs.mLeafArray), mInactiveValues(), mNumValues(rhs.mNumValues)
+{
+}
+
+// Reduce over all leaves in parallel, using this object as the TBB reduction body.
+template<typename TreeType> void
+InactiveVoxelValues<TreeType>::runParallel()
+{
+    tbb::parallel_reduce(this->mLeafArray.getRange(), *this);
+}
+
+
+// Process the complete leaf range on the calling thread.
+template<typename TreeType> void
+InactiveVoxelValues<TreeType>::runSerial()
+{
+    this->operator()(mLeafArray.getRange());
+}
+
+
+// Insert the inactive voxel values of the leaves in @a range into the set, and cancel the
+// enclosing TBB task group once the set holds more than mNumValues distinct values.
+template<typename TreeType>
+inline void
+InactiveVoxelValues<TreeType>::operator()(const tbb::blocked_range<size_t>& range)
+{
+    typedef typename TreeType::LeafNodeType::ValueOffCIter OffIterT;
+    const size_t last = range.end();
+    for (size_t leafIdx = range.begin(); leafIdx < last; ++leafIdx) {
+        if (tbb::task::self().is_cancelled()) break; // group already cancelled: stop scanning
+        for (OffIterT it = mLeafArray.leaf(leafIdx).cbeginValueOff(); it; ++it) {
+            mInactiveValues.insert(it.getValue());
+        }
+        if (mInactiveValues.size() > mNumValues) tbb::task::self().cancel_group_execution();
+    }
+}
+
+template<typename TreeType> inline void
+InactiveVoxelValues<TreeType>::join(const InactiveVoxelValues<TreeType>& rhs)
+{
+    const SetType& theirs = rhs.mInactiveValues; // values gathered by the other reduction body
+    mInactiveValues.insert(theirs.begin(), theirs.end());
+}
+
+template<typename TreeType> inline void
+InactiveVoxelValues<TreeType>::getInactiveValues(SetType& values) const
+{
+    const SetType& collected = mInactiveValues; // added to the caller's set, which is not cleared first
+    values.insert(collected.begin(), collected.end());
+}
+
+
+////////////////////////////////////////
+
+
+template<typename TreeType>
+class InactiveTileValues // TBB reduction body that collects the distinct inactive values seen through a ValueOff iterator range
+{
+public:
+    typedef tree::IteratorRange<typename TreeType::ValueOffCIter> IterRange;
+    typedef typename TreeType::ValueType ValueType;
+    typedef std::set<ValueType> SetType; // a set, so each value is stored once
+
+    InactiveTileValues(size_t numValues); // numValues: set size above which the scan is cancelled
+
+    void runParallel(IterRange&); // tbb::parallel_reduce over the given range
+    void runSerial(IterRange&); // process the given range on the calling thread
+
+    void getInactiveValues(SetType&) const; // inserts the collected values into the given set
+
+    inline InactiveTileValues(const InactiveTileValues<TreeType>&, tbb::split); // splitting ctor: same limit, empty set
+    inline void operator()(IterRange&); // accumulate values while advancing the range
+    inline void join(const InactiveTileValues<TreeType>&); // merge another body's values into this one
+
+private:
+    SetType mInactiveValues; // distinct values collected so far
+    size_t mNumValues; // cancellation threshold for mInactiveValues.size()
+};
+
+
+// Start with an empty value set; numValues is the set size above which scanning is cancelled.
+template<typename TreeType>
+InactiveTileValues<TreeType>::InactiveTileValues(size_t numValues)
+    : mInactiveValues(), mNumValues(numValues)
+{
+}
+
+// Splitting constructor for tbb::parallel_reduce: copies the value limit from rhs
+// and starts with an empty value set.
+template <typename TreeType> inline
+InactiveTileValues<TreeType>::InactiveTileValues(
+    const InactiveTileValues<TreeType>& rhs, tbb::split)
+    : mInactiveValues(), mNumValues(rhs.mNumValues)
+{
+}
+
+// Reduce over @a range in parallel, using this object as the TBB reduction body.
+template<typename TreeType> void
+InactiveTileValues<TreeType>::runParallel(IterRange& range)
+{
+    tbb::parallel_reduce(range, *this);
+}
+
+
+// Process @a range on the calling thread.
+template<typename TreeType> void
+InactiveTileValues<TreeType>::runSerial(IterRange& range)
+{
+    this->operator()(range);
+}
+
+
+// Gather inactive values while advancing @a range, cancelling the enclosing TBB
+// task group once more than mNumValues distinct values are held.
+// NOTE(review): the inner loop runs a copy of the range's iterator until that copy is
+// exhausted, which looks like a rescan beyond the current position -- confirm intent.
+template<typename TreeType>
+inline void
+InactiveTileValues<TreeType>::operator()(IterRange& range)
+{
+    while (range && !tbb::task::self().is_cancelled()) {
+        typedef typename TreeType::ValueOffCIter OffIterT;
+        for (OffIterT it = range.iterator(); it; ++it) mInactiveValues.insert(it.getValue());
+        if (mInactiveValues.size() > mNumValues) tbb::task::self().cancel_group_execution();
+        ++range;
+    }
+}
+
+template<typename TreeType> inline void
+InactiveTileValues<TreeType>::join(const InactiveTileValues<TreeType>& rhs)
+{
+    const SetType& theirs = rhs.mInactiveValues; // values gathered by the other reduction body
+    mInactiveValues.insert(theirs.begin(), theirs.end());
+}
+
+template<typename TreeType> inline void
+InactiveTileValues<TreeType>::getInactiveValues(SetType& values) const
+{
+    const SetType& collected = mInactiveValues; // added to the caller's set, which is not cleared first
+    values.insert(collected.begin(), collected.end());
+}
+
+} // namespace diagnostics_internal
+
+
+////////////////////////////////////////
+
+
+/// Collect the distinct inactive values of @a grid (leaf voxels first, then tiles)
+/// into @a values, replacing its previous contents, and return true if at most
+/// @a numValues distinct values were collected.
+template<class GridType>
+bool
+uniqueInactiveValues(const GridType& grid,
+    std::vector<typename GridType::ValueType>& values, size_t numValues)
+{
+    typedef typename GridType::TreeType TreeType;
+    typedef typename GridType::ValueType ValueType;
+    typedef std::set<ValueType> SetType;
+    typedef typename TreeType::ValueOffCIter OffIterT;
+
+    SetType found;
+
+    { // Pass 1: inactive voxels stored in leaf nodes
+        // The leaf manager is instantiated on the non-const tree type, hence the cast.
+        tree::LeafManager<TreeType> leafArray(const_cast<TreeType&>(grid.tree()));
+        diagnostics_internal::InactiveVoxelValues<TreeType> leafOp(leafArray, numValues);
+        leafOp.runParallel();
+        leafOp.getInactiveValues(found);
+    }
+
+    // Pass 2: inactive tiles, skipped when the voxel pass already exceeded the limit
+    if (!(found.size() > numValues)) {
+        OffIterT tileIter(grid.tree());
+        // Limit the iterator to the levels above the leaf depth.
+        tileIter.setMaxDepth(TreeType::ValueAllIter::LEAF_DEPTH - 1);
+
+        diagnostics_internal::InactiveTileValues<TreeType> tileOp(numValues);
+        tree::IteratorRange<OffIterT> tileRange(tileIter);
+        tileOp.runParallel(tileRange);
+        tileOp.getInactiveValues(found);
+    }
+
+    // Hand the values back in the set's iteration order.
+    values.clear();
+    values.reserve(found.size());
+    values.insert(values.end(), found.begin(), found.end());
+
+    return !(values.size() > numValues);
+}
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_DIAGNOSTICS_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/Filter.h b/nuparu/include/openvdb_new/tools/Filter.h
new file mode 100644
index 00000000..9bc124b8
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/Filter.h
@@ -0,0 +1,459 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @author Ken Museth
+///
+/// @file Filter.h
+///
+/// @brief Filtering of VDB volumes. Note that only the values in the
+/// grid are changed, not its topology! All operations can optionally
+/// be masked with another grid that acts as an alpha-mask.
+
+#ifndef OPENVDB_TOOLS_FILTER_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_FILTER_HAS_BEEN_INCLUDED
+
+#include <tbb/parallel_for.h>
+#include <boost/bind.hpp>
+#include <boost/function.hpp>
+#include <boost/type_traits/is_floating_point.hpp>
+#include <openvdb/Types.h>
+#include <openvdb/math/Math.h>
+#include <openvdb/math/Stencils.h>
+#include <openvdb/math/Transform.h>
+#include <openvdb/tree/LeafManager.h>
+#include <openvdb/util/NullInterrupter.h>
+#include <openvdb/Grid.h>
+#include "Interpolation.h"
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+/// @brief Volume filtering (e.g., diffusion) with optional alpha masking
+///
+/// @note Only the values in the grid are changed, not its topology!
+template<typename GridT,
+         typename MaskT = typename GridT::template ValueConverter<float>::Type,
+         typename InterruptT = util::NullInterrupter>
+class Filter
+{
+public:
+    typedef GridT                                GridType;
+    typedef MaskT                                MaskType;
+    typedef typename GridType::TreeType          TreeType;
+    typedef typename TreeType::LeafNodeType      LeafType;
+    typedef typename GridType::ValueType         ValueType;
+    typedef typename MaskType::ValueType         AlphaType;
+    typedef typename tree::LeafManager<TreeType> LeafManagerType;
+    typedef typename LeafManagerType::LeafRange  RangeType;
+    typedef typename LeafManagerType::BufferType BufferType;
+    BOOST_STATIC_ASSERT(boost::is_floating_point<AlphaType>::value); // mask values must be floating-point
+
+    /// Constructor
+    /// @param grid Grid to be filtered.
+    /// @param interrupt Optional interrupter.
+    Filter(GridT& grid, InterruptT* interrupt = NULL)
+        : mGrid(&grid)
+        , mTask(0)
+        , mInterrupter(interrupt)
+        , mMask(NULL)
+        , mGrainSize(1)
+        , mMinMask(0)
+        , mMaxMask(1)
+        , mInvertMask(false)
+    {
+    }
+
+    /// @brief Shallow copy constructor called by tbb::parallel_for()
+    /// threads during filtering.
+    /// @param other The other Filter from which to copy.
+    Filter(const Filter& other)
+        : mGrid(other.mGrid)
+        , mTask(other.mTask)
+        , mInterrupter(other.mInterrupter)
+        , mMask(other.mMask)
+        , mGrainSize(other.mGrainSize)
+        , mMinMask(other.mMinMask)
+        , mMaxMask(other.mMaxMask)
+        , mInvertMask(other.mInvertMask)
+    {
+    }
+
+    /// @return the grain-size used for multi-threading
+    int  getGrainSize() const { return mGrainSize; }
+    /// @brief Set the grain-size used for multi-threading.
+    /// @note A grain size of 0 or less disables multi-threading!
+    void setGrainSize(int grainsize) { mGrainSize = grainsize; }
+
+    /// @brief Return the minimum value of the mask to be used for the
+    /// derivation of a smooth alpha value.
+    AlphaType minMask() const { return mMinMask; }
+    /// @brief Return the maximum value of the mask to be used for the
+    /// derivation of a smooth alpha value.
+    AlphaType maxMask() const { return mMaxMask; }
+    /// @brief Define the range for the (optional) scalar mask.
+    /// @param min Minimum value of the range.
+    /// @param max Maximum value of the range.
+    /// @details Mask values outside the range are clamped to zero or one, and
+    /// values inside the range map smoothly to 0->1 (unless the mask is inverted).
+    /// @throw ValueError if @a min is not smaller than @a max.
+    void setMaskRange(AlphaType min, AlphaType max)
+    {
+        if (!(min < max)) OPENVDB_THROW(ValueError, "Invalid mask range (expects min < max)");
+        mMinMask = min;
+        mMaxMask = max;
+    }
+
+    /// @brief Return true if the mask is inverted, i.e. min->max in the
+    /// original mask maps to 1->0 in the inverted alpha mask.
+    bool isMaskInverted() const { return mInvertMask; }
+    /// @brief Invert the optional mask, i.e. min->max in the original
+    /// mask maps to 1->0 in the inverted alpha mask.
+    void invertMask(bool invert=true) { mInvertMask = invert; }
+
+    /// @brief Apply a fast separable mean-value (i.e. box) filter.
+    /// @param width The width of the mean-value filter is 2*width+1 voxels.
+    /// @param iterations Number of times the mean-value filter is applied.
+    /// @param mask Optional alpha mask.
+    void mean(int width = 1, int iterations = 1, const MaskType* mask = NULL);
+
+    /// @brief Apply a fast separable Gaussian filter.
+    ///
+    /// @note This is approximated as 4 iterations of a separable mean filter
+    /// which typically leads to an approximation that's better than 95%!
+    /// @param width The width of the mean-value filter is 2*width+1 voxels.
+    /// @param iterations Number of times the 4-pass Gaussian approximation is applied.
+    /// @param mask Optional alpha mask.
+    void gaussian(int width = 1, int iterations = 1, const MaskType* mask = NULL);
+
+    /// @brief Apply a median-value filter.
+    ///
+    /// @note This filter is not separable and is hence relatively slow!
+    /// @param width The width of the median-value filter is 2*width+1 voxels.
+    /// @param iterations Number of times the median-value filter is applied.
+    /// @param mask Optional alpha mask.
+    void median(int width = 1, int iterations = 1, const MaskType* mask = NULL);
+
+    /// Offsets (i.e. adds) a constant value to all active voxels.
+    /// @param offset Offset in the same units as the grid.
+    /// @param mask Optional alpha mask.
+    void offset(ValueType offset, const MaskType* mask = NULL);
+
+    /// @brief Used internally by tbb::parallel_for()
+    /// @param range Range of LeafNodes over which to multi-thread.
+    ///
+    /// @warning Never call this method directly!
+    void operator()(const RangeType& range) const
+    {
+        if (mTask) mTask(const_cast<Filter*>(this), range);
+        else OPENVDB_THROW(ValueError, "task is undefined - call median(), mean(), etc.");
+    }
+
+private:
+    typedef typename TreeType::LeafNodeType                  LeafT;
+    typedef typename LeafT::ValueOnIter                      VoxelIterT;
+    typedef typename LeafT::ValueOnCIter                     VoxelCIterT;
+    typedef typename tree::LeafManager<TreeType>::BufferType BufferT;
+    typedef typename RangeType::Iterator                     LeafIterT;
+    typedef tools::AlphaMask<GridT, MaskT>                   AlphaMaskT;
+
+    void cook(LeafManagerType& leafs);
+
+    template<size_t Axis>
+    struct Avg {
+        Avg(const GridT* grid, Int32 w): acc(grid->tree()), width(w), frac(1.f/float(2*w+1)) {}
+        inline ValueType operator()(Coord xyz);
+        typename GridT::ConstAccessor acc;
+        const Int32 width;
+        const float frac;
+    };
+
+    // Private filter methods called by tbb::parallel_for threads; Avg<N> averages along Coord axis N
+    template <typename AvgT>
+    void doBox( const RangeType& r, Int32 w);
+    void doBoxX(const RangeType& r, Int32 w) { this->doBox<Avg<0> >(r,w); }
+    void doBoxY(const RangeType& r, Int32 w) { this->doBox<Avg<1> >(r,w); } // was bound to Avg<2>
+    void doBoxZ(const RangeType& r, Int32 w) { this->doBox<Avg<2> >(r,w); } // was bound to Avg<1>
+    void doMedian(const RangeType&, int);
+    void doOffset(const RangeType&, ValueType);
+    /// @return true if the process was interrupted
+    bool wasInterrupted();
+
+    GridType*        mGrid;
+    typename boost::function<void (Filter*, const RangeType&)> mTask;
+    InterruptT*      mInterrupter;
+    const MaskType*  mMask;
+    int              mGrainSize;
+    AlphaType        mMinMask, mMaxMask;
+    bool             mInvertMask;
+}; // end of Filter class
+
+
+////////////////////////////////////////
+
+
+namespace filter_internal {
+// Helper function for Filter::Avg::operator(): adds addend to the running sum via operator+=
+template<typename T> static inline void accum(T& sum, T addend) { sum += addend; }
+// Overload for bool ValueType: accumulates with logical OR instead of addition
+inline void accum(bool& sum, bool addend) { sum = sum || addend; }
+} // namespace filter_internal
+
+
+template<typename GridT, typename MaskT, typename InterruptT>
+template<size_t Axis> inline typename GridT::ValueType
+Filter<GridT, MaskT, InterruptT>::Avg<Axis>::operator()(Coord xyz)
+{
+    // Accumulate the 2*width+1 samples centred on xyz along Axis, then scale by frac.
+    Int32 &pos = xyz[Axis], last = pos + width; // xyz is a by-value copy, so it is stepped in place
+    ValueType total = zeroVal<ValueType>();
+    for (pos -= width; pos <= last; ++pos) filter_internal::accum(total, acc.getValue(xyz));
+    return static_cast<ValueType>(total * frac);
+}
+
+
+////////////////////////////////////////
+
+
+/// Apply @a iterations rounds of the box filter; each round runs doBoxX, doBoxY, then doBoxZ.
+template<typename GridT, typename MaskT, typename InterruptT>
+inline void
+Filter<GridT, MaskT, InterruptT>::mean(int width, int iterations, const MaskType* mask)
+{
+    mMask = mask;
+    if (mInterrupter) mInterrupter->start("Applying mean filter");
+
+    const int halfWidth = std::max(1, width); // widths below 1 are clamped to 1
+
+    // The passes write into leaf buffer 1, which cook() swaps in after each pass.
+    LeafManagerType leafs(mGrid->tree(), 1, mGrainSize==0);
+
+    for (int round = 0; round < iterations; ++round) {
+        if (this->wasInterrupted()) break;
+        mTask = boost::bind(&Filter::doBoxX, _1, _2, halfWidth);
+        this->cook(leafs);
+        mTask = boost::bind(&Filter::doBoxY, _1, _2, halfWidth);
+        this->cook(leafs);
+        mTask = boost::bind(&Filter::doBoxZ, _1, _2, halfWidth);
+        this->cook(leafs);
+    }
+
+    if (mInterrupter) mInterrupter->end();
+}
+
+
+/// Approximate a Gaussian blur with four box-filter rounds per iteration; stops once interrupted.
+template<typename GridT, typename MaskT, typename InterruptT>
+inline void
+Filter<GridT, MaskT, InterruptT>::gaussian(int width, int iterations, const MaskType* mask)
+{
+    mMask = mask;
+    if (mInterrupter) mInterrupter->start("Applying Gaussian filter");
+
+    const int w = std::max(1, width); // widths below 1 are clamped to 1
+    LeafManagerType leafs(mGrid->tree(), 1, mGrainSize==0);
+
+    // Remember an interrupt so that the outer loop stops as well, instead of
+    // re-polling the interrupter once for every remaining iteration.
+    bool interrupted = false;
+    for (int i=0; i<iterations && !interrupted; ++i) {
+        for (int n=0; n<4 && !(interrupted = this->wasInterrupted()); ++n) {
+            mTask = boost::bind(&Filter::doBoxX, _1, _2, w);
+            this->cook(leafs);
+            mTask = boost::bind(&Filter::doBoxY, _1, _2, w);
+            this->cook(leafs);
+            mTask = boost::bind(&Filter::doBoxZ, _1, _2, w);
+            this->cook(leafs);
+        }
+    }
+
+    if (mInterrupter) mInterrupter->end();
+}
+
+
+/// Run up to @a iterations passes of the median filter; widths below 1 are clamped to 1.
+template<typename GridT, typename MaskT, typename InterruptT>
+inline void
+Filter<GridT, MaskT, InterruptT>::median(int width, int iterations, const MaskType* mask)
+{
+    mMask = mask;
+    if (mInterrupter) mInterrupter->start("Applying median filter");
+    LeafManagerType leafs(mGrid->tree(), 1, mGrainSize==0);
+    mTask = boost::bind(&Filter::doMedian, _1, _2, std::max(1, width)); // reused by every pass
+    for (int pass = 0; pass < iterations; ++pass) {
+        if (this->wasInterrupted()) break;
+        this->cook(leafs);
+    }
+    if (mInterrupter) mInterrupter->end();
+}
+
+
+/// Add @a value to the active leaf voxels in a single pass, scaled by the optional alpha mask.
+template<typename GridT, typename MaskT, typename InterruptT>
+inline void
+Filter<GridT, MaskT, InterruptT>::offset(ValueType value, const MaskType* mask)
+{
+    this->mMask = mask;
+    if (mInterrupter) mInterrupter->start("Applying offset");
+
+    // Unlike the other filters, the leaf manager's second argument is 0 here; doOffset writes in place.
+    LeafManagerType leafs(mGrid->tree(), 0, mGrainSize==0);
+    this->mTask = boost::bind(&Filter::doOffset, _1, _2, value);
+    this->cook(leafs);
+
+    if (mInterrupter) mInterrupter->end();
+}
+
+
+////////////////////////////////////////
+
+
+/// Private method to perform the task (serial or threaded) and
+/// subsequently swap the leaf buffers.
+template<typename GridT, typename MaskT, typename InterruptT>
+inline void
+Filter<GridT, MaskT, InterruptT>::cook(LeafManagerType& leafs)
+{
+    // A grain size of 0 or less disables multi-threading (see setGrainSize). The same flag
+    // now drives the buffer swap, which used to be threaded for negative grain sizes.
+    const bool serial = !(mGrainSize>0);
+    if (serial) (*this)(leafs.leafRange());
+    else tbb::parallel_for(leafs.leafRange(mGrainSize), *this);
+    leafs.swapLeafBuffer(1, serial);
+}
+
+
+/// One-dimensional pass of the separable box filter: for each active voxel in
+/// @a range, write the (optionally alpha-blended) average into leaf buffer 1.
+template<typename GridT, typename MaskT, typename InterruptT>
+template <typename AvgT>
+inline void
+Filter<GridT, MaskT, InterruptT>::doBox(const RangeType& range, Int32 w)
+{
+    this->wasInterrupted(); // return value unused; the call cancels the task group on interrupt
+    AvgT avg(mGrid, w);
+    if (!mMask) {
+        for (LeafIterT leaf = range.begin(); leaf; ++leaf) {
+            BufferT& out = leaf.buffer(1);
+            for (VoxelCIterT vox = leaf->cbeginValueOn(); vox; ++vox) {
+                out.setValue(vox.pos(), avg(vox.getCoord()));
+            }
+        }
+        return;
+    }
+    // Masked: blend the current value (weight b) with the average (weight a).
+    typename AlphaMaskT::FloatType a, b;
+    AlphaMaskT alpha(*mGrid, *mMask, mMinMask, mMaxMask, mInvertMask);
+    for (LeafIterT leaf = range.begin(); leaf; ++leaf) {
+        BufferT& out = leaf.buffer(1);
+        for (VoxelCIterT vox = leaf->cbeginValueOn(); vox; ++vox) {
+            const Coord xyz = vox.getCoord();
+            if (alpha(xyz, a, b)) out.setValue(vox.pos(), ValueType(b*(*vox) + a*avg(xyz)));
+        }
+    }
+}
+
+
+/// Median pass: writes the stencil median of each active voxel in @a range into leaf buffer 1.
+template<typename GridT, typename MaskT, typename InterruptT>
+inline void
+Filter<GridT, MaskT, InterruptT>::doMedian(const RangeType& range, int width)
+{
+    this->wasInterrupted(); // return value unused; the call cancels the task group on interrupt
+    typename math::DenseStencil<GridType> stencil(*mGrid, width);//creates local cache!
+    if (!mMask) {
+        for (LeafIterT leaf = range.begin(); leaf; ++leaf) {
+            BufferT& out = leaf.buffer(1);
+            for (VoxelCIterT vox = leaf->cbeginValueOn(); vox; ++vox) {
+                stencil.moveTo(vox);
+                out.setValue(vox.pos(), stencil.median());
+            }
+        }
+        return;
+    }
+    // Masked: blend the current value (weight b) with the median (weight a).
+    typename AlphaMaskT::FloatType a, b;
+    AlphaMaskT alpha(*mGrid, *mMask, mMinMask, mMaxMask, mInvertMask);
+    for (LeafIterT leaf = range.begin(); leaf; ++leaf) {
+        BufferT& out = leaf.buffer(1);
+        for (VoxelCIterT vox = leaf->cbeginValueOn(); vox; ++vox) {
+            if (!alpha(vox.getCoord(), a, b)) continue; // voxels rejected by the mask are not written
+            stencil.moveTo(vox);
+            out.setValue(vox.pos(), ValueType(b*(*vox) + a*stencil.median()));
+        }
+    }
+}
+
+
+/// Adds @a offset in place to the active voxels in @a range (scaled by alpha when masked).
+template<typename GridT, typename MaskT, typename InterruptT>
+inline void
+Filter<GridT, MaskT, InterruptT>::doOffset(const RangeType& range, ValueType offset)
+{
+    this->wasInterrupted(); // return value unused; the call cancels the task group on interrupt
+    if (!mMask) {
+        for (LeafIterT leaf = range.begin(); leaf; ++leaf) {
+            for (VoxelIterT vox = leaf->beginValueOn(); vox; ++vox) {
+                vox.setValue(*vox + offset);
+            }
+        }
+        return;
+    }
+    typename AlphaMaskT::FloatType a, b;
+    AlphaMaskT alpha(*mGrid, *mMask, mMinMask, mMaxMask, mInvertMask);
+    for (LeafIterT leaf = range.begin(); leaf; ++leaf) {
+        for (VoxelIterT vox = leaf->beginValueOn(); vox; ++vox) {
+            if (alpha(vox.getCoord(), a, b)) vox.setValue(ValueType(*vox + a*offset));
+        }
+    }
+}
+
+
+/// Poll the interrupter through util::wasInterrupted(); on interrupt, also cancel
+/// the current TBB task group. Returns true if an interrupt was reported.
+template<typename GridT, typename MaskT, typename InterruptT>
+inline bool
+Filter<GridT, MaskT, InterruptT>::wasInterrupted()
+{
+    const bool stop = util::wasInterrupted(mInterrupter);
+    if (stop) tbb::task::self().cancel_group_execution();
+    return stop;
+}
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_FILTER_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/GridOperators.h b/nuparu/include/openvdb_new/tools/GridOperators.h
new file mode 100644
index 00000000..a5b94a7d
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/GridOperators.h
@@ -0,0 +1,1090 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file GridOperators.h
+///
+/// @brief Applies an operator on an input grid to produce an output
+/// grid with the same topology but potentially different value type.
+
+#ifndef OPENVDB_TOOLS_GRID_OPERATORS_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_GRID_OPERATORS_HAS_BEEN_INCLUDED
+
+#include <openvdb/Grid.h>
+#include <openvdb/math/Operators.h>
+#include <openvdb/util/NullInterrupter.h>
+#include <openvdb/tree/LeafManager.h>
+#include <openvdb/tree/ValueAccessor.h>
+#include <tbb/parallel_for.h>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+/// @brief VectorToScalarConverter<VectorGridType>::Type is the type of a grid
+/// having the same tree configuration as VectorGridType but a scalar value type, T,
+/// where T is the type of the original vector components.
+/// @details For example, VectorToScalarConverter<Vec3DGrid>::Type is equivalent to DoubleGrid.
+template<typename VectorGridType> struct VectorToScalarConverter {
+    typedef typename VectorGridType::ValueType::value_type VecComponentValueT; // type of one vector component
+    typedef typename VectorGridType::template ValueConverter<VecComponentValueT>::Type Type; // grid type rebound to it
+};
+
+/// @brief ScalarToVectorConverter<ScalarGridType>::Type is the type of a grid
+/// having the same tree configuration as ScalarGridType but value type Vec3<T>
+/// where T is ScalarGridType::ValueType.
+/// @details For example, ScalarToVectorConverter<DoubleGrid>::Type is equivalent to Vec3DGrid.
+template<typename ScalarGridType> struct ScalarToVectorConverter {
+    typedef math::Vec3<typename ScalarGridType::ValueType> VectorValueT; // Vec3 of the grid's scalar value type
+    typedef typename ScalarGridType::template ValueConverter<VectorValueT>::Type Type; // grid type rebound to it
+};
+
+
+/// @brief Compute the Closest-Point Transform (CPT) from a distance field.
+/// @return a new vector-valued grid with the same numerical precision as the input grid
+///     (for example, if the input grid is a DoubleGrid, the output grid will be a Vec3DGrid)
+/// @details When a mask grid is specified, the solution is calculated only in
+/// the intersection of the mask active topology and the input active topology
+/// independent of the transforms associated with either grid.
+/// @note The current implementation assumes all the input distance values
+/// are represented by leaf voxels and not tiles.  This is true for all
+/// narrow-band level sets, which this class was originally developed for.
+/// In the future we will expand this class to also handle tile values.
+template<typename GridType, typename InterruptT> inline
+typename ScalarToVectorConverter<GridType>::Type::Ptr
+cpt(const GridType& grid, bool threaded, InterruptT* interrupt);
+
+template<typename GridType, typename MaskT, typename InterruptT> inline
+typename ScalarToVectorConverter<GridType>::Type::Ptr
+cpt(const GridType& grid, const MaskT& mask, bool threaded, InterruptT* interrupt);
+
+template<typename GridType> inline typename ScalarToVectorConverter<GridType>::Type::Ptr
+cpt(const GridType& grid, bool threaded = true)
+{   // Convenience overload: forward to the interruptible version without an interrupter.
+    util::NullInterrupter* const noInterrupt = NULL;
+    return cpt<GridType, util::NullInterrupter>(grid, threaded, noInterrupt);
+}
+
+template<typename GridType, typename MaskT> inline typename ScalarToVectorConverter<GridType>::Type::Ptr
+cpt(const GridType& grid, const MaskT& mask, bool threaded = true)
+{   // Masked convenience overload: forward to the interruptible version without an interrupter.
+    util::NullInterrupter* const noInterrupt = NULL;
+    return cpt<GridType, MaskT, util::NullInterrupter>(grid, mask, threaded, noInterrupt);
+}
+
+
+/// @brief Compute the curl of the given vector-valued grid.
+/// @return a new vector-valued grid
+/// @details When a mask grid is specified, the solution is calculated only in
+/// the intersection of the mask active topology and the input active topology
+/// independent of the transforms associated with either grid.
+template<typename GridType, typename InterruptT> inline
+typename GridType::Ptr
+curl(const GridType& grid, bool threaded, InterruptT* interrupt);
+
+template<typename GridType, typename MaskT, typename InterruptT> inline
+typename GridType::Ptr
+curl(const GridType& grid, const MaskT& mask, bool threaded, InterruptT* interrupt);
+
+template<typename GridType> inline typename GridType::Ptr
+curl(const GridType& grid, bool threaded = true)
+{   // Convenience overload: forward to the interruptible version without an interrupter.
+    util::NullInterrupter* const noInterrupt = NULL;
+    return curl<GridType, util::NullInterrupter>(grid, threaded, noInterrupt);
+}
+
+template<typename GridType, typename MaskT> inline typename GridType::Ptr
+curl(const GridType& grid, const MaskT& mask, bool threaded = true)
+{   // Masked convenience overload: forward to the interruptible version without an interrupter.
+    util::NullInterrupter* const noInterrupt = NULL;
+    return curl<GridType, MaskT, util::NullInterrupter>(grid, mask, threaded, noInterrupt);
+}
+
+
+/// @brief Compute the divergence of the given vector-valued grid.
+/// @return a new scalar-valued grid with the same numerical precision as the input grid
+///     (for example, if the input grid is a Vec3DGrid, the output grid will be a DoubleGrid)
+/// @details When a mask grid is specified, the solution is calculated only in
+/// the intersection of the mask active topology and the input active topology
+/// independent of the transforms associated with either grid.
+template<typename GridType, typename InterruptT> inline
+typename VectorToScalarConverter<GridType>::Type::Ptr
+divergence(const GridType& grid, bool threaded, InterruptT* interrupt);
+
+template<typename GridType, typename MaskT, typename InterruptT> inline
+typename VectorToScalarConverter<GridType>::Type::Ptr
+divergence(const GridType& grid, const MaskT& mask, bool threaded, InterruptT* interrupt);
+
+template<typename GridType> inline typename VectorToScalarConverter<GridType>::Type::Ptr
+divergence(const GridType& grid, bool threaded = true)
+{   // Convenience overload: forward to the interruptible version without an interrupter.
+    util::NullInterrupter* const noInterrupt = NULL;
+    return divergence<GridType, util::NullInterrupter>(grid, threaded, noInterrupt);
+}
+
+template<typename GridType, typename MaskT> inline typename VectorToScalarConverter<GridType>::Type::Ptr
+divergence(const GridType& grid, const MaskT& mask, bool threaded = true)
+{   // Masked convenience overload: forward to the interruptible version without an interrupter.
+    util::NullInterrupter* const noInterrupt = NULL;
+    return divergence<GridType, MaskT, util::NullInterrupter>(grid, mask, threaded, noInterrupt);
+}
+
+
+/// @brief Compute the gradient of the given scalar grid.
+/// @return a new vector-valued grid with the same numerical precision as the input grid
+///     (for example, if the input grid is a DoubleGrid, the output grid will be a Vec3DGrid)
+/// @details When a mask grid is specified, the solution is calculated only in
+/// the intersection of the mask active topology and the input active topology
+/// independent of the transforms associated with either grid.
+template<typename GridType, typename InterruptT> inline
+typename ScalarToVectorConverter<GridType>::Type::Ptr
+gradient(const GridType& grid, bool threaded, InterruptT* interrupt);
+
+template<typename GridType, typename MaskT, typename InterruptT> inline
+typename ScalarToVectorConverter<GridType>::Type::Ptr
+gradient(const GridType& grid, const MaskT& mask, bool threaded, InterruptT* interrupt);
+
+template<typename GridType> inline typename ScalarToVectorConverter<GridType>::Type::Ptr
+gradient(const GridType& grid, bool threaded = true)
+{   // Convenience overload: forward to the interruptible version without an interrupter.
+    util::NullInterrupter* const noInterrupt = NULL;
+    return gradient<GridType, util::NullInterrupter>(grid, threaded, noInterrupt);
+}
+
+template<typename GridType, typename MaskT> inline typename ScalarToVectorConverter<GridType>::Type::Ptr
+gradient(const GridType& grid, const MaskT& mask, bool threaded = true)
+{   // Masked convenience overload: forward to the interruptible version without an interrupter.
+    util::NullInterrupter* const noInterrupt = NULL;
+    return gradient<GridType, MaskT, util::NullInterrupter>(grid, mask, threaded, noInterrupt);
+}
+
+
+/// @brief Compute the Laplacian of the given scalar grid.
+/// @return a new scalar grid
+/// @details When a mask grid is specified, the solution is calculated only in
+/// the intersection of the mask active topology and the input active topology
+/// independent of the transforms associated with either grid.
+template<typename GridType, typename InterruptT> inline
+typename GridType::Ptr
+laplacian(const GridType& grid, bool threaded, InterruptT* interrupt);
+
+template<typename GridType, typename MaskT, typename InterruptT> inline
+typename GridType::Ptr
+laplacian(const GridType& grid, const MaskT& mask, bool threaded, InterruptT* interrupt);
+/// @brief Unmasked Laplacian without an interrupter (forwards a null util::NullInterrupter*).
+template<typename GridType> inline
+typename GridType::Ptr
+laplacian(const GridType& grid, bool threaded = true)
+{
+    return laplacian<GridType, util::NullInterrupter>(grid, threaded, NULL);
+}
+/// @brief Masked Laplacian without an interrupter; @a mask is taken by const reference (no copy).
+template<typename GridType, typename MaskT> inline
+typename GridType::Ptr
+laplacian(const GridType& grid, const MaskT& mask, bool threaded = true)
+{
+    return laplacian<GridType, MaskT, util::NullInterrupter>(grid, mask, threaded, NULL);
+}
+
+
+/// @brief Compute the mean curvature of the given grid.
+/// @return a new grid
+/// @details When a mask grid is specified, the solution is calculated only in
+/// the intersection of the mask active topology and the input active topology
+/// independent of the transforms associated with either grid.
+template<typename GridType, typename InterruptT> inline
+typename GridType::Ptr
+meanCurvature(const GridType& grid, bool threaded, InterruptT* interrupt);
+
+template<typename GridType, typename MaskT, typename InterruptT> inline
+typename GridType::Ptr
+meanCurvature(const GridType& grid, const MaskT& mask, bool threaded, InterruptT* interrupt);
+/// @brief Unmasked mean curvature without an interrupter (forwards a null interrupter pointer).
+template<typename GridType> inline
+typename GridType::Ptr
+meanCurvature(const GridType& grid, bool threaded = true)
+{
+    return meanCurvature<GridType, util::NullInterrupter>(grid, threaded, NULL);
+}
+/// @brief Masked mean curvature without an interrupter (forwards a null interrupter pointer).
+template<typename GridType, typename MaskT> inline
+typename GridType::Ptr
+meanCurvature(const GridType& grid, const MaskT& mask, bool threaded = true)
+{
+    return meanCurvature<GridType, MaskT, util::NullInterrupter>(grid, mask, threaded, NULL);
+}
+
+
+/// @brief Compute the magnitudes of the vectors of the given vector-valued grid.
+/// @return a new scalar-valued grid with the same numerical precision as the input grid
+///     (for example, if the input grid is a Vec3DGrid, the output grid will be a DoubleGrid)
+/// @details When a mask grid is specified, the solution is calculated only in
+/// the intersection of the mask active topology and the input active topology
+/// independent of the transforms associated with either grid.
+template<typename GridType, typename InterruptT> inline
+typename VectorToScalarConverter<GridType>::Type::Ptr
+magnitude(const GridType& grid, bool threaded, InterruptT* interrupt);
+
+template<typename GridType, typename MaskT, typename InterruptT> inline
+typename VectorToScalarConverter<GridType>::Type::Ptr
+magnitude(const GridType& grid, const MaskT& mask, bool threaded, InterruptT* interrupt);
+/// @brief Unmasked magnitude without an interrupter (forwards a null util::NullInterrupter*).
+template<typename GridType> inline
+typename VectorToScalarConverter<GridType>::Type::Ptr
+magnitude(const GridType& grid, bool threaded = true)
+{
+    return magnitude<GridType, util::NullInterrupter>(grid, threaded, NULL);
+}
+/// @brief Masked magnitude without an interrupter (forwards a null util::NullInterrupter*).
+template<typename GridType, typename MaskT> inline
+typename VectorToScalarConverter<GridType>::Type::Ptr
+magnitude(const GridType& grid, const MaskT& mask, bool threaded = true)
+{
+    return magnitude<GridType, MaskT, util::NullInterrupter>(grid, mask, threaded, NULL);
+}
+
+
+/// @brief Normalize the vectors of the given vector-valued grid.
+/// @return a new vector-valued grid
+/// @details When a mask grid is specified, the solution is calculated only in
+/// the intersection of the mask active topology and the input active topology
+/// independent of the transforms associated with either grid.
+template<typename GridType, typename InterruptT> inline
+typename GridType::Ptr
+normalize(const GridType& grid, bool threaded, InterruptT* interrupt);
+
+template<typename GridType, typename MaskT, typename InterruptT> inline
+typename GridType::Ptr
+normalize(const GridType& grid, const MaskT& mask, bool threaded, InterruptT* interrupt);
+/// @brief Unmasked normalization without an interrupter (forwards a null interrupter pointer).
+template<typename GridType> inline
+typename GridType::Ptr
+normalize(const GridType& grid, bool threaded = true)
+{
+    return normalize<GridType, util::NullInterrupter>(grid, threaded, NULL);
+}
+/// @brief Masked normalization without an interrupter (forwards a null interrupter pointer).
+template<typename GridType, typename MaskT> inline
+typename GridType::Ptr
+normalize(const GridType& grid, const MaskT& mask, bool threaded = true)
+{
+    return normalize<GridType, MaskT, util::NullInterrupter>(grid, mask, threaded, NULL);
+}
+
+
+////////////////////////////////////////
+
+
+namespace gridop {
+
+/// @brief Metafunction: ToMaskGrid<T>::Type is a Grid whose tree is T's TreeType rebound
+/// (via ValueConverter) to ValueMask, i.e. same tree hierarchy, value equal to the active state.
+/// @details For example, ToMaskGrid<FloatGrid>::Type is equivalent to MaskGrid.
+template<typename GridType>
+struct ToMaskGrid {
+    typedef Grid<typename GridType::TreeType::template ValueConverter<ValueMask>::Type> Type;
+};
+
+
+/// @brief Apply an operator on an input grid to produce an output grid
+/// with the same topology (intersected with the mask's, if a mask is given)
+/// and a possibly different value type.
+/// @details To facilitate inlining, this class is also templated on a Map type.
+/// OperatorT must provide a static result(map, accessor, ijk) function.
+/// @note This is a helper class and should never be used directly.
+/// @note The current implementation assumes all the input
+/// values are represented by leaf voxels and not tiles. In the
+/// future we will expand this class to also handle tile values.
+template<
+    typename InGridT,
+    typename MaskGridType,
+    typename OutGridT,
+    typename MapT,
+    typename OperatorT,
+    typename InterruptT = util::NullInterrupter>
+class GridOperator
+{
+public:
+    typedef typename OutGridT::TreeType           OutTreeT;
+    typedef typename OutTreeT::LeafNodeType       OutLeafT;
+    typedef typename tree::LeafManager<OutTreeT>  LeafManagerT;
+    /// @brief Take a const accessor to @a grid; @a mask may be null, @a map is kept by reference.
+    GridOperator(const InGridT& grid, const MaskGridType* mask, const MapT& map,
+        InterruptT* interrupt = NULL):
+        mAcc(grid.getConstAccessor()), mMap(map), mInterrupt(interrupt), mMask(mask)
+    {
+    }
+
+    virtual ~GridOperator() {}
+    typename OutGridT::Ptr process(bool threaded = true) // threaded: run via tbb::parallel_for
+    {
+        if (mInterrupt) mInterrupt->start("Processing grid");
+        // Derive the background value of the output grid by evaluating the operator
+        // at the origin of a temporary tree that holds only the input background.
+        typename InGridT::TreeType tmp(mAcc.tree().background());
+        typename OutGridT::ValueType backg = OperatorT::result(mMap, tmp, math::Coord(0));
+
+        // output tree = topology copy of input tree!
+        typename OutTreeT::Ptr tree(new OutTreeT(mAcc.tree(), backg, TopologyCopy()));
+
+
+        // create grid with output tree and unit transform
+        typename OutGridT::Ptr result(new OutGridT(tree));
+
+        // Modify the solution area if a mask was supplied.
+        if (mMask) {
+            result->topologyIntersection(*mMask);
+        }
+
+        // transform of output grid = transform of input grid
+        result->setTransform(math::Transform::Ptr(new math::Transform( mMap.copy() )));
+
+        LeafManagerT leafManager(*tree);
+
+        if (threaded) {
+            tbb::parallel_for(leafManager.leafRange(), *this);
+        } else {
+            (*this)(leafManager.leafRange());
+        }
+
+        if (mInterrupt) mInterrupt->end();
+        return result;
+    }
+
+    /// @brief Iterate sequentially over LeafNodes and active voxels in the output
+    /// grid and set each one to OperatorT::result() evaluated with the
+    /// value accessor for the input grid.
+    ///
+    /// @note Never call this public method directly - it is invoked by
+    /// process(), either inline or through tbb::parallel_for.
+    void operator()(const typename LeafManagerT::LeafRange& range) const
+    {
+        if (util::wasInterrupted(mInterrupt)) tbb::task::self().cancel_group_execution();
+
+        for (typename LeafManagerT::LeafRange::Iterator leaf=range.begin(); leaf; ++leaf) {
+            for (typename OutLeafT::ValueOnIter value=leaf->beginValueOn(); value; ++value) {
+                value.setValue(OperatorT::result(mMap, mAcc, value.getCoord()));
+            }
+        }
+    }
+
+protected:
+    typedef typename InGridT::ConstAccessor  AccessorT;
+    mutable AccessorT   mAcc;       // const accessor into the input grid, used by const operator()
+    const MapT&         mMap;       // reference only: the map must outlive this operator
+    InterruptT*         mInterrupt; // optional, may be null
+    const MaskGridType* mMask;      // optional, may be null (no topology intersection)
+}; // end of GridOperator class
+
+} // namespace gridop
+
+
+////////////////////////////////////////
+
+
+/// @brief Compute the closest-point transform of a scalar grid.
+template<
+    typename InGridT,
+    typename MaskGridType = typename gridop::ToMaskGrid<InGridT>::Type,
+    typename InterruptT = util::NullInterrupter>
+class Cpt
+{
+public:
+    typedef InGridT                                         InGridType;
+    typedef typename ScalarToVectorConverter<InGridT>::Type OutGridType;
+    /// @brief Unmasked operator; @a grid is held by reference and must outlive this object.
+    Cpt(const InGridType& grid, InterruptT* interrupt = NULL):
+        mInputGrid(grid), mInterrupt(interrupt), mMask(NULL)
+    {
+    }
+    /// @brief Masked operator; @a mask is stored by address and must outlive this object.
+    Cpt(const InGridType& grid, const MaskGridType& mask, InterruptT* interrupt = NULL):
+        mInputGrid(grid), mInterrupt(interrupt), mMask(&mask)
+    {
+    }
+    /// @brief Run the operator; @a useWorldTransform selects WsOpT (CPT_RANGE) over IsOpT (CPT).
+    typename OutGridType::Ptr process(bool threaded = true, bool useWorldTransform = true)
+    {
+        Functor functor(mInputGrid, mMask, threaded, useWorldTransform, mInterrupt);
+        processTypedMap(mInputGrid.transform(), functor);
+        if (functor.mOutputGrid) functor.mOutputGrid->setVectorType(VEC_CONTRAVARIANT_ABSOLUTE);
+        return functor.mOutputGrid;
+    }
+
+private:
+    struct IsOpT // forwards to math::CPT with the CD_2ND scheme
+    {
+        template<typename MapT, typename AccT>
+        static typename OutGridType::ValueType
+        result(const MapT& map, const AccT& acc, const Coord& xyz)
+        {
+            return math::CPT<MapT, math::CD_2ND>::result(map, acc, xyz);
+        }
+    };
+    struct WsOpT // forwards to math::CPT_RANGE with the CD_2ND scheme
+    {
+        template<typename MapT, typename AccT>
+        static typename OutGridType::ValueType
+        result(const MapT& map, const AccT& acc, const Coord& xyz)
+        {
+            return math::CPT_RANGE<MapT, math::CD_2ND>::result(map, acc, xyz);
+        }
+    };
+    struct Functor // callable handed to processTypedMap(); receives the concrete map
+    {
+        Functor(const InGridType& grid, const MaskGridType* mask,
+            bool threaded, bool worldspace, InterruptT* interrupt)
+        : mThreaded(threaded)
+        , mWorldSpace(worldspace)
+        , mInputGrid(grid)
+        , mInterrupt(interrupt)
+        , mMask(mask)
+        {}
+        /// @brief Build a GridOperator for the concrete map type and cache its output grid.
+        template<typename MapT>
+        void operator()(const MapT& map)
+        {
+            if (mWorldSpace) {
+                gridop::GridOperator<InGridType, MaskGridType, OutGridType, MapT, WsOpT, InterruptT>
+                    op(mInputGrid, mMask, map, mInterrupt);
+                mOutputGrid = op.process(mThreaded); // cache the result
+            } else {
+                gridop::GridOperator<InGridType, MaskGridType, OutGridType, MapT, IsOpT, InterruptT>
+                    op(mInputGrid, mMask, map, mInterrupt);
+                mOutputGrid = op.process(mThreaded); // cache the result
+            }
+        }
+        const bool                mThreaded;
+        const bool                mWorldSpace;
+        const InGridType&         mInputGrid;
+        typename OutGridType::Ptr mOutputGrid; // stays null until operator() has run
+        InterruptT*               mInterrupt;
+        const MaskGridType*       mMask;
+    };
+    const InGridType&   mInputGrid;
+    InterruptT*         mInterrupt; // optional, may be null
+    const MaskGridType* mMask;      // null for the unmasked constructor
+}; // end of Cpt class
+
+
+////////////////////////////////////////
+
+
+/// @brief Compute the curl of a vector grid.
+template<
+    typename GridT,
+    typename MaskGridType = typename gridop::ToMaskGrid<GridT>::Type,
+    typename InterruptT = util::NullInterrupter>
+class Curl
+{
+public:
+    typedef GridT  InGridType;
+    typedef GridT  OutGridType;
+    /// @brief Unmasked operator; @a grid is held by reference and must outlive this object.
+    Curl(const GridT& grid, InterruptT* interrupt = NULL):
+        mInputGrid(grid), mInterrupt(interrupt), mMask(NULL)
+    {
+    }
+    /// @brief Masked operator; @a mask is stored by address and must outlive this object.
+    Curl(const GridT& grid, const MaskGridType& mask, InterruptT* interrupt = NULL):
+        mInputGrid(grid), mInterrupt(interrupt), mMask(&mask)
+    {
+    }
+    /// @brief Run the operator; a non-null result is tagged VEC_COVARIANT before it is returned.
+    typename GridT::Ptr process(bool threaded = true)
+    {
+        Functor functor(mInputGrid, mMask, threaded, mInterrupt);
+        processTypedMap(mInputGrid.transform(), functor);
+        if (functor.mOutputGrid) functor.mOutputGrid->setVectorType(VEC_COVARIANT);
+        return functor.mOutputGrid;
+    }
+
+private:
+    struct Functor // callable handed to processTypedMap(); receives the concrete map
+    {
+        Functor(const GridT& grid, const MaskGridType* mask,
+                bool threaded, InterruptT* interrupt):
+            mThreaded(threaded), mInputGrid(grid), mInterrupt(interrupt), mMask(mask) {}
+        /// @brief Apply math::Curl (CD_2ND scheme) through a GridOperator and cache the output.
+        template<typename MapT>
+        void operator()(const MapT& map)
+        {
+            typedef math::Curl<MapT, math::CD_2ND> OpT;
+            gridop::GridOperator<GridT, MaskGridType, GridT, MapT, OpT, InterruptT>
+                op(mInputGrid, mMask, map, mInterrupt);
+            mOutputGrid = op.process(mThreaded); // cache the result
+        }
+
+        const bool           mThreaded;
+        const GridT&         mInputGrid;
+        typename GridT::Ptr  mOutputGrid; // stays null until operator() has run
+        InterruptT*          mInterrupt;
+        const MaskGridType*  mMask;
+    }; // Private Functor
+
+    const GridT&         mInputGrid;
+    InterruptT*          mInterrupt; // optional, may be null
+    const MaskGridType*  mMask;      // null for the unmasked constructor
+}; // end of Curl class
+
+
+////////////////////////////////////////
+
+
+/// @brief Compute the divergence of a vector grid.
+template<
+    typename InGridT,
+    typename MaskGridType = typename gridop::ToMaskGrid<InGridT>::Type,
+    typename InterruptT = util::NullInterrupter>
+class Divergence
+{
+public:
+    typedef InGridT                                         InGridType;
+    typedef typename VectorToScalarConverter<InGridT>::Type OutGridType;
+    /// @brief Unmasked operator; @a grid is held by reference and must outlive this object.
+    Divergence(const InGridT& grid, InterruptT* interrupt = NULL):
+        mInputGrid(grid), mInterrupt(interrupt), mMask(NULL)
+    {
+    }
+    /// @brief Masked operator; @a mask is stored by address and must outlive this object.
+    Divergence(const InGridT& grid, const MaskGridType& mask, InterruptT* interrupt = NULL):
+        mInputGrid(grid), mInterrupt(interrupt), mMask(&mask)
+    {
+    }
+    /// @brief Run the operator: FD_1ST differencing for GRID_STAGGERED grids, CD_2ND otherwise.
+    typename OutGridType::Ptr process(bool threaded = true)
+    {
+        if (mInputGrid.getGridClass() == GRID_STAGGERED) {
+            Functor<math::FD_1ST> functor(mInputGrid, mMask, threaded, mInterrupt);
+            processTypedMap(mInputGrid.transform(), functor);
+            return functor.mOutputGrid;
+        } else {
+            Functor<math::CD_2ND> functor(mInputGrid, mMask, threaded, mInterrupt);
+            processTypedMap(mInputGrid.transform(), functor);
+            return functor.mOutputGrid;
+        }
+    }
+
+protected:
+    template<math::DScheme DiffScheme>
+    struct Functor // callable handed to processTypedMap(); receives the concrete map
+    {
+        Functor(const InGridT& grid, const MaskGridType* mask,
+            bool threaded, InterruptT* interrupt):
+            mThreaded(threaded), mInputGrid(grid), mInterrupt(interrupt), mMask(mask) {}
+        /// @brief Apply math::Divergence with DiffScheme through a GridOperator; cache the output.
+        template<typename MapT>
+        void operator()(const MapT& map)
+        {
+            typedef math::Divergence<MapT, DiffScheme> OpT;
+            gridop::GridOperator<InGridType, MaskGridType, OutGridType, MapT, OpT, InterruptT>
+                op(mInputGrid, mMask, map, mInterrupt);
+            mOutputGrid = op.process(mThreaded); // cache the result
+        }
+
+        const bool                 mThreaded;
+        const InGridType&          mInputGrid;
+        typename OutGridType::Ptr  mOutputGrid; // stays null until operator() has run
+        InterruptT*                mInterrupt;
+        const MaskGridType*        mMask;
+    }; // Private Functor
+
+    const InGridType&    mInputGrid;
+    InterruptT*          mInterrupt; // optional, may be null
+    const MaskGridType*  mMask;      // null for the unmasked constructor
+}; // end of Divergence class
+
+
+////////////////////////////////////////
+
+
+/// @brief Compute the gradient of a scalar grid.
+template<
+    typename InGridT,
+    typename MaskGridType = typename gridop::ToMaskGrid<InGridT>::Type,
+    typename InterruptT = util::NullInterrupter>
+class Gradient
+{
+public:
+    typedef InGridT                                         InGridType;
+    typedef typename ScalarToVectorConverter<InGridT>::Type OutGridType;
+    /// @brief Unmasked operator; @a grid is held by reference and must outlive this object.
+    Gradient(const InGridT& grid, InterruptT* interrupt = NULL):
+        mInputGrid(grid), mInterrupt(interrupt), mMask(NULL)
+    {
+    }
+    /// @brief Masked operator; @a mask is stored by address and must outlive this object.
+    Gradient(const InGridT& grid, const MaskGridType& mask, InterruptT* interrupt = NULL):
+        mInputGrid(grid), mInterrupt(interrupt), mMask(&mask)
+    {
+    }
+    /// @brief Run the operator; a non-null result is tagged VEC_COVARIANT before it is returned.
+    typename OutGridType::Ptr process(bool threaded = true)
+    {
+        Functor functor(mInputGrid, mMask, threaded, mInterrupt);
+        processTypedMap(mInputGrid.transform(), functor);
+        if (functor.mOutputGrid) functor.mOutputGrid->setVectorType(VEC_COVARIANT);
+        return functor.mOutputGrid;
+    }
+
+protected:
+    struct Functor // callable handed to processTypedMap(); receives the concrete map
+    {
+        Functor(const InGridT& grid, const MaskGridType* mask,
+            bool threaded, InterruptT* interrupt):
+            mThreaded(threaded), mInputGrid(grid), mInterrupt(interrupt), mMask(mask) {}
+        /// @brief Apply math::Gradient (CD_2ND scheme) through a GridOperator; cache the output.
+        template<typename MapT>
+        void operator()(const MapT& map)
+        {
+            typedef math::Gradient<MapT, math::CD_2ND> OpT;
+            gridop::GridOperator<InGridType, MaskGridType, OutGridType, MapT, OpT, InterruptT>
+                op(mInputGrid, mMask, map, mInterrupt);
+            mOutputGrid = op.process(mThreaded); // cache the result
+        }
+
+        const bool                 mThreaded;
+        const InGridT&             mInputGrid;
+        typename OutGridType::Ptr  mOutputGrid; // stays null until operator() has run
+        InterruptT*                mInterrupt;
+        const MaskGridType*        mMask;
+    }; // Private Functor
+
+    const InGridT&       mInputGrid;
+    InterruptT*          mInterrupt; // optional, may be null
+    const MaskGridType*  mMask;      // null for the unmasked constructor
+}; // end of Gradient class
+
+
+////////////////////////////////////////
+
+/// @brief Compute the Laplacian of a scalar grid.
+template<
+    typename GridT,
+    typename MaskGridType = typename gridop::ToMaskGrid<GridT>::Type,
+    typename InterruptT = util::NullInterrupter>
+class Laplacian
+{
+public:
+    typedef GridT  InGridType;
+    typedef GridT  OutGridType;
+    /// @brief Unmasked operator; @a grid is held by reference and must outlive this object.
+    Laplacian(const GridT& grid, InterruptT* interrupt = NULL):
+        mInputGrid(grid), mInterrupt(interrupt), mMask(NULL)
+    {
+    }
+    /// @brief Masked operator; @a mask is stored by address and must outlive this object.
+    Laplacian(const GridT& grid, const MaskGridType& mask, InterruptT* interrupt = NULL):
+        mInputGrid(grid), mInterrupt(interrupt), mMask(&mask)
+    {
+    }
+    /// @brief Run the operator; a non-null result is tagged VEC_COVARIANT before it is returned.
+    typename GridT::Ptr process(bool threaded = true)
+    {
+        Functor functor(mInputGrid, mMask, threaded, mInterrupt);
+        processTypedMap(mInputGrid.transform(), functor);
+        // NOTE(review): the output is a scalar grid, so this vector-type tag looks vestigial.
+        if (functor.mOutputGrid) functor.mOutputGrid->setVectorType(VEC_COVARIANT);
+        return functor.mOutputGrid;
+    }
+protected:
+    struct Functor // callable handed to processTypedMap(); receives the concrete map
+    {
+        Functor(const GridT& grid, const MaskGridType* mask, bool threaded, InterruptT* interrupt):
+            mThreaded(threaded), mInputGrid(grid), mInterrupt(interrupt), mMask(mask) {}
+        /// @brief Apply math::Laplacian (CD_SECOND scheme) via a GridOperator; cache the output.
+        template<typename MapT>
+        void operator()(const MapT& map)
+        {
+            typedef math::Laplacian<MapT, math::CD_SECOND> OpT;
+            gridop::GridOperator<GridT, MaskGridType, GridT, MapT, OpT, InterruptT>
+                op(mInputGrid, mMask, map, mInterrupt); // forward the interrupter so it is honored
+            mOutputGrid = op.process(mThreaded); // cache the result
+        }
+
+        const bool           mThreaded;
+        const GridT&         mInputGrid;
+        typename GridT::Ptr  mOutputGrid; // stays null until operator() has run
+        InterruptT*          mInterrupt;
+        const MaskGridType*  mMask;
+    }; // Private Functor
+
+    const GridT&        mInputGrid;
+    InterruptT*         mInterrupt; // optional, may be null
+    const MaskGridType* mMask;      // null for the unmasked constructor
+}; // end of Laplacian class
+
+
+////////////////////////////////////////
+
+/// @brief Compute the mean curvature of a grid.
+template<
+    typename GridT,
+    typename MaskGridType = typename gridop::ToMaskGrid<GridT>::Type,
+    typename InterruptT = util::NullInterrupter>
+class MeanCurvature
+{
+public:
+    typedef GridT  InGridType;
+    typedef GridT  OutGridType;
+    /// @brief Unmasked operator; @a grid is held by reference and must outlive this object.
+    MeanCurvature(const GridT& grid, InterruptT* interrupt = NULL):
+        mInputGrid(grid), mInterrupt(interrupt), mMask(NULL)
+    {
+    }
+    /// @brief Masked operator; @a mask is stored by address and must outlive this object.
+    MeanCurvature(const GridT& grid, const MaskGridType& mask, InterruptT* interrupt = NULL):
+        mInputGrid(grid), mInterrupt(interrupt), mMask(&mask)
+    {
+    }
+    /// @brief Run the operator; a non-null result is tagged VEC_COVARIANT before it is returned.
+    typename GridT::Ptr process(bool threaded = true)
+    {
+        Functor functor(mInputGrid, mMask, threaded, mInterrupt);
+        processTypedMap(mInputGrid.transform(), functor);
+        // NOTE(review): input and output share GridT, so this vector-type tag may be vestigial.
+        if (functor.mOutputGrid) functor.mOutputGrid->setVectorType(VEC_COVARIANT);
+        return functor.mOutputGrid;
+    }
+protected:
+    struct Functor // callable handed to processTypedMap(); receives the concrete map
+    {
+        Functor(const GridT& grid, const MaskGridType* mask, bool threaded, InterruptT* interrupt):
+            mThreaded(threaded), mInputGrid(grid), mInterrupt(interrupt), mMask(mask) {}
+        /// @brief Apply math::MeanCurvature (CD_SECOND, CD_2ND) via a GridOperator; cache output.
+        template<typename MapT>
+        void operator()(const MapT& map)
+        {
+            typedef math::MeanCurvature<MapT, math::CD_SECOND, math::CD_2ND> OpT;
+            gridop::GridOperator<GridT, MaskGridType, GridT, MapT, OpT, InterruptT>
+                op(mInputGrid, mMask, map, mInterrupt); // forward the interrupter so it is honored
+            mOutputGrid = op.process(mThreaded); // cache the result
+        }
+
+        const bool           mThreaded;
+        const GridT&         mInputGrid;
+        typename GridT::Ptr  mOutputGrid; // stays null until operator() has run
+        InterruptT*          mInterrupt;
+        const MaskGridType*  mMask;
+    }; // Private Functor
+
+    const GridT&        mInputGrid;
+    InterruptT*         mInterrupt; // optional, may be null
+    const MaskGridType* mMask;      // null for the unmasked constructor
+}; // end of MeanCurvature class
+
+
+////////////////////////////////////////
+
+/// @brief Compute the magnitudes of the vectors of a vector-valued grid.
+template<
+    typename InGridT,
+    typename MaskGridType = typename gridop::ToMaskGrid<InGridT>::Type,
+    typename InterruptT = util::NullInterrupter>
+class Magnitude
+{
+public:
+    typedef InGridT                                         InGridType;
+    typedef typename VectorToScalarConverter<InGridT>::Type OutGridType;
+    /// @brief Unmasked operator; @a grid is held by reference and must outlive this object.
+    Magnitude(const InGridType& grid, InterruptT* interrupt = NULL):
+        mInputGrid(grid), mInterrupt(interrupt), mMask(NULL)
+    {
+    }
+    /// @brief Masked operator; @a mask is stored by address and must outlive this object.
+    Magnitude(const InGridType& grid, const MaskGridType& mask, InterruptT* interrupt = NULL):
+        mInputGrid(grid), mInterrupt(interrupt), mMask(&mask)
+    {
+    }
+    /// @brief Run the operator and return the scalar output grid.
+    typename OutGridType::Ptr process(bool threaded = true)
+    {
+        Functor functor(mInputGrid, mMask, threaded, mInterrupt);
+        processTypedMap(mInputGrid.transform(), functor);
+        return functor.mOutputGrid;
+    }
+
+protected:
+    struct OpT // per-voxel operator: length of the input vector; the map is unused
+    {
+        template<typename MapT, typename AccT>
+        static typename OutGridType::ValueType
+        result(const MapT&, const AccT& acc, const Coord& xyz) { return acc.getValue(xyz).length();}
+    };
+    struct Functor // callable handed to processTypedMap(); receives the concrete map
+    {
+        Functor(const InGridT& grid, const MaskGridType* mask,
+            bool threaded, InterruptT* interrupt):
+            mThreaded(threaded), mInputGrid(grid), mInterrupt(interrupt), mMask(mask) {}
+        /// @brief Apply OpT through a GridOperator for the concrete map type; cache the output.
+        template<typename MapT>
+        void operator()(const MapT& map)
+        {
+            gridop::GridOperator<InGridType, MaskGridType, OutGridType, MapT, OpT, InterruptT>
+                op(mInputGrid, mMask, map, mInterrupt); // forward the interrupter so it is honored
+            mOutputGrid = op.process(mThreaded); // cache the result
+        }
+
+        const bool                 mThreaded;
+        const InGridType&          mInputGrid;
+        typename OutGridType::Ptr  mOutputGrid; // stays null until operator() has run
+        InterruptT*                mInterrupt;
+        const MaskGridType*        mMask;
+    }; // Private Functor
+
+    const InGridType&    mInputGrid;
+    InterruptT*          mInterrupt; // optional, may be null
+    const MaskGridType*  mMask;      // null for the unmasked constructor
+}; // end of Magnitude class
+
+
+////////////////////////////////////////
+
+/// @brief Normalize the vectors of a vector-valued grid.
+template<
+    typename GridT,
+    typename MaskGridType = typename gridop::ToMaskGrid<GridT>::Type,
+    typename InterruptT = util::NullInterrupter>
+class Normalize
+{
+public:
+    typedef GridT  InGridType;
+    typedef GridT  OutGridType;
+    /// @brief Unmasked operator; @a grid is held by reference and must outlive this object.
+    Normalize(const GridT& grid, InterruptT* interrupt = NULL):
+        mInputGrid(grid), mInterrupt(interrupt), mMask(NULL)
+    {
+    }
+    /// @brief Masked operator; @a mask is stored by address and must outlive this object.
+    Normalize(const GridT& grid, const MaskGridType& mask, InterruptT* interrupt = NULL):
+        mInputGrid(grid), mInterrupt(interrupt), mMask(&mask)
+    {
+    }
+    /// @brief Run the operator; the output inherits the input vector type (see below).
+    typename GridT::Ptr process(bool threaded = true)
+    {
+        Functor functor(mInputGrid, mMask, threaded, mInterrupt);
+        processTypedMap(mInputGrid.transform(), functor);
+        if (typename GridT::Ptr outGrid = functor.mOutputGrid) {
+            const VecType vecType = mInputGrid.getVectorType();
+            if (vecType == VEC_COVARIANT) {
+                outGrid->setVectorType(VEC_COVARIANT_NORMALIZE); // covariant input is promoted
+            } else {
+                outGrid->setVectorType(vecType);
+            }
+        }
+        return functor.mOutputGrid;
+    }
+
+protected:
+    struct OpT // per-voxel operator: unit-length copy of the input vector; the map is unused
+    {
+        template<typename MapT, typename AccT>
+        static typename OutGridType::ValueType
+        result(const MapT&, const AccT& acc, const Coord& xyz)
+        {
+            typename OutGridType::ValueType vec = acc.getValue(xyz);
+            if ( !vec.normalize() ) vec.setZero(); // normalize() reported failure: return zero
+            return vec;
+        }
+    };
+    struct Functor // callable handed to processTypedMap(); receives the concrete map
+    {
+        Functor(const GridT& grid, const MaskGridType* mask, bool threaded, InterruptT* interrupt):
+            mThreaded(threaded), mInputGrid(grid), mInterrupt(interrupt), mMask(mask) {}
+        /// @brief Apply OpT through a GridOperator for the concrete map type; cache the output.
+        template<typename MapT>
+        void operator()(const MapT& map)
+        {
+            gridop::GridOperator<GridT, MaskGridType, GridT, MapT, OpT, InterruptT>
+                op(mInputGrid, mMask, map, mInterrupt); // forward the interrupter so it is honored
+            mOutputGrid = op.process(mThreaded); // cache the result
+        }
+
+        const bool           mThreaded;
+        const GridT&         mInputGrid;
+        typename GridT::Ptr  mOutputGrid; // stays null until operator() has run
+        InterruptT*          mInterrupt;
+        const MaskGridType*  mMask;
+    }; // Private Functor
+
+    const GridT&        mInputGrid;
+    InterruptT*         mInterrupt; // optional, may be null
+    const MaskGridType* mMask;      // null for the unmasked constructor
+}; // end of Normalize class
+
+
+////////////////////////////////////////
+
+
+template<typename GridType, typename InterruptT> inline
+typename ScalarToVectorConverter<GridType>::Type::Ptr
+cpt(const GridType& grid, bool threaded, InterruptT* interrupt)
+{   // Unmasked closest-point transform; the mask type is only the ToMaskGrid default.
+    typedef Cpt<GridType, typename gridop::ToMaskGrid<GridType>::Type, InterruptT> CptOpT;
+    return CptOpT(grid, interrupt).process(threaded);
+}
+
+template<typename GridType, typename MaskT, typename InterruptT> inline
+typename ScalarToVectorConverter<GridType>::Type::Ptr
+cpt(const GridType& grid, const MaskT& mask, bool threaded, InterruptT* interrupt)
+{   // Closest-point transform with the caller's mask handed to the Cpt operator.
+    typedef Cpt<GridType, MaskT, InterruptT> CptOpT;
+    return CptOpT(grid, mask, interrupt).process(threaded);
+}
+
+template<typename GridType, typename InterruptT> inline
+typename GridType::Ptr
+curl(const GridType& grid, bool threaded, InterruptT* interrupt)
+{   // Unmasked curl; the mask type is only the ToMaskGrid default.
+    typedef Curl<GridType, typename gridop::ToMaskGrid<GridType>::Type, InterruptT> CurlOpT;
+    return CurlOpT(grid, interrupt).process(threaded);
+}
+
+template<typename GridType, typename MaskT, typename InterruptT> inline
+typename GridType::Ptr
+curl(const GridType& grid, const MaskT& mask, bool threaded, InterruptT* interrupt)
+{   // Curl with the caller's mask handed to the Curl operator.
+    typedef Curl<GridType, MaskT, InterruptT> CurlOpT;
+    return CurlOpT(grid, mask, interrupt).process(threaded);
+}
+
+template<typename GridType, typename InterruptT> inline
+typename VectorToScalarConverter<GridType>::Type::Ptr
+divergence(const GridType& grid, bool threaded, InterruptT* interrupt)
+{   // Unmasked divergence; the mask type is only the ToMaskGrid default.
+    typedef Divergence<GridType, typename gridop::ToMaskGrid<GridType>::Type, InterruptT>
+        DivergenceOpT;
+    return DivergenceOpT(grid, interrupt).process(threaded);
+}
+
+template<typename GridType, typename MaskT, typename InterruptT> inline
+typename VectorToScalarConverter<GridType>::Type::Ptr
+divergence(const GridType& grid, const MaskT& mask, bool threaded, InterruptT* interrupt)
+{   // Divergence with the caller's mask handed to the Divergence operator.
+    typedef Divergence<GridType, MaskT, InterruptT> DivergenceOpT;
+    return DivergenceOpT(grid, mask, interrupt).process(threaded);
+}
+
+template<typename GridType, typename InterruptT> inline
+typename ScalarToVectorConverter<GridType>::Type::Ptr
+gradient(const GridType& grid, bool threaded, InterruptT* interrupt)
+{   // Unmasked gradient; the mask type is only the ToMaskGrid default.
+    typedef Gradient<GridType, typename gridop::ToMaskGrid<GridType>::Type, InterruptT>
+        GradientOpT;
+    return GradientOpT(grid, interrupt).process(threaded);
+}
+
+template<typename GridType, typename MaskT, typename InterruptT> inline
+typename ScalarToVectorConverter<GridType>::Type::Ptr
+gradient(const GridType& grid, const MaskT& mask, bool threaded, InterruptT* interrupt)
+{   // Gradient with the caller's mask handed to the Gradient operator.
+    typedef Gradient<GridType, MaskT, InterruptT> GradientOpT;
+    return GradientOpT(grid, mask, interrupt).process(threaded);
+}
+
+template<typename GridType, typename InterruptT> inline
+typename GridType::Ptr
+laplacian(const GridType& grid, bool threaded, InterruptT* interrupt)
+{   // Unmasked Laplacian; the mask type is only the ToMaskGrid default.
+    typedef Laplacian<GridType, typename gridop::ToMaskGrid<GridType>::Type, InterruptT>
+        LaplacianOpT;
+    return LaplacianOpT(grid, interrupt).process(threaded);
+}
+
+template<typename GridType, typename MaskT, typename InterruptT> inline
+typename GridType::Ptr
+laplacian(const GridType& grid, const MaskT& mask, bool threaded, InterruptT* interrupt)
+{   // Laplacian with the caller's mask handed to the Laplacian operator.
+    typedef Laplacian<GridType, MaskT, InterruptT> LaplacianOpT;
+    return LaplacianOpT(grid, mask, interrupt).process(threaded);
+}
+
+template<typename GridType, typename InterruptT> inline
+typename GridType::Ptr
+meanCurvature(const GridType& grid, bool threaded, InterruptT* interrupt)
+{   // Unmasked mean curvature; the mask type is only the ToMaskGrid default.
+    typedef MeanCurvature<GridType, typename gridop::ToMaskGrid<GridType>::Type, InterruptT>
+        MeanCurvatureOpT;
+    return MeanCurvatureOpT(grid, interrupt).process(threaded);
+}
+
+template<typename GridType, typename MaskT, typename InterruptT> inline
+typename GridType::Ptr
+meanCurvature(const GridType& grid, const MaskT& mask, bool threaded, InterruptT* interrupt)
+{   // Mean curvature with the caller's mask handed to the MeanCurvature operator.
+    typedef MeanCurvature<GridType, MaskT, InterruptT> MeanCurvatureOpT;
+    return MeanCurvatureOpT(grid, mask, interrupt).process(threaded);
+}
+
+template<typename GridType, typename InterruptT> inline
+typename VectorToScalarConverter<GridType>::Type::Ptr
+magnitude(const GridType& grid, bool threaded, InterruptT* interrupt)
+{   // Unmasked magnitude; the mask type is only the ToMaskGrid default.
+    typedef Magnitude<GridType, typename gridop::ToMaskGrid<GridType>::Type, InterruptT>
+        MagnitudeOpT;
+    return MagnitudeOpT(grid, interrupt).process(threaded);
+}
+
+template<typename GridType, typename MaskT, typename InterruptT> inline
+typename VectorToScalarConverter<GridType>::Type::Ptr
+magnitude(const GridType& grid, const MaskT& mask, bool threaded, InterruptT* interrupt)
+{   // Magnitude with the caller's mask handed to the Magnitude operator.
+    typedef Magnitude<GridType, MaskT, InterruptT> MagnitudeOpT;
+    return MagnitudeOpT(grid, mask, interrupt).process(threaded);
+}
+
+template<typename GridType, typename InterruptT> inline
+typename GridType::Ptr
+normalize(const GridType& grid, bool threaded, InterruptT* interrupt)
+{   // Unmasked normalization; the mask type is only the ToMaskGrid default.
+    typedef Normalize<GridType, typename gridop::ToMaskGrid<GridType>::Type, InterruptT>
+        NormalizeOpT;
+    return NormalizeOpT(grid, interrupt).process(threaded);
+}
+
+template<typename GridType, typename MaskT, typename InterruptT> inline
+typename GridType::Ptr
+normalize(const GridType& grid, const MaskT& mask, bool threaded, InterruptT* interrupt)
+{   // Normalization with the caller's mask handed to the Normalize operator.
+    typedef Normalize<GridType, MaskT, InterruptT> NormalizeOpT;
+    return NormalizeOpT(grid, mask, interrupt).process(threaded);
+}
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_GRID_OPERATORS_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/GridTransformer.h b/nuparu/include/openvdb_new/tools/GridTransformer.h
new file mode 100644
index 00000000..b23f94ab
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/GridTransformer.h
@@ -0,0 +1,1039 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file GridTransformer.h
+/// @author Peter Cucka
+
+#ifndef OPENVDB_TOOLS_GRIDTRANSFORMER_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_GRIDTRANSFORMER_HAS_BEEN_INCLUDED
+
+#include <cmath>
+#include <boost/bind.hpp>
+#include <boost/function.hpp>
+#include <boost/shared_ptr.hpp>
+#include <tbb/blocked_range.h>
+#include <tbb/parallel_reduce.h>
+#include <openvdb/Grid.h>
+#include <openvdb/Types.h>
+#include <openvdb/math/Math.h> // for isApproxEqual()
+#include <openvdb/util/NullInterrupter.h>
+#include "ChangeBackground.h"
+#include "Interpolation.h"
+#include "LevelSetRebuild.h" // for doLevelSetRebuild()
+#include "SignedFloodFill.h" // for signedFloodFill
+#include "Prune.h" // for pruneLevelSet
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+/// @brief Resample an input grid into an output grid of the same type such that,
+/// after resampling, the input and output grids coincide (apart from sampling
+/// artifacts), but the output grid's transform is unchanged.
+/// @details Specifically, this function resamples the input grid into the output
+/// grid's index space, using a sampling kernel like PointSampler, BoxSampler,
+/// or QuadraticSampler.
+/// @param inGrid       the grid to be resampled
+/// @param outGrid      the grid into which to write the resampled voxel data
+/// @param interrupter  an object adhering to the util::NullInterrupter interface
+/// @note If @a inGrid is classified as a level set, it is resampled with the
+/// level set rebuild tool (or deep-copied when both grids share the same
+/// transform); if the rebuild tool rejects the grid's value type, or if the
+/// grid is not a level set, the generic resampler is used instead.
+/// @par Example:
+/// @code
+/// // Create an input grid with the default identity transform
+/// // and populate it with a level-set sphere.
+/// FloatGrid::ConstPtr src = tools::makeSphere(...);
+/// // Create an output grid and give it a uniform-scale transform.
+/// FloatGrid::Ptr dest = FloatGrid::create();
+/// const float voxelSize = 0.5;
+/// dest->setTransform(math::Transform::createLinearTransform(voxelSize));
+/// // Resample the input grid into the output grid, reproducing
+/// // the level-set sphere at a smaller voxel size.
+/// MyInterrupter interrupter = ...;
+/// tools::resampleToMatch<tools::QuadraticSampler>(*src, *dest, interrupter);
+/// @endcode
+template<typename Sampler, typename Interrupter, typename GridType>
+inline void
+resampleToMatch(const GridType& inGrid, GridType& outGrid, Interrupter& interrupter);
+
+/// @brief Resample an input grid into an output grid of the same type such that,
+/// after resampling, the input and output grids coincide (apart from sampling
+/// artifacts), but the output grid's transform is unchanged.
+/// @details Specifically, this function resamples the input grid into the output
+/// grid's index space, using a sampling kernel like PointSampler, BoxSampler,
+/// or QuadraticSampler.
+/// @param inGrid       the grid to be resampled
+/// @param outGrid      the grid into which to write the resampled voxel data
+/// @note This overload cannot be interrupted: it forwards to the three-argument
+/// overload with a locally constructed util::NullInterrupter.
+/// @par Example:
+/// @code
+/// // Create an input grid with the default identity transform
+/// // and populate it with a level-set sphere.
+/// FloatGrid::ConstPtr src = tools::makeSphere(...);
+/// // Create an output grid and give it a uniform-scale transform.
+/// FloatGrid::Ptr dest = FloatGrid::create();
+/// const float voxelSize = 0.5;
+/// dest->setTransform(math::Transform::createLinearTransform(voxelSize));
+/// // Resample the input grid into the output grid, reproducing
+/// // the level-set sphere at a smaller voxel size.
+/// tools::resampleToMatch<tools::QuadraticSampler>(*src, *dest);
+/// @endcode
+template<typename Sampler, typename GridType>
+inline void
+resampleToMatch(const GridType& inGrid, GridType& outGrid);
+
+
+////////////////////////////////////////
+
+
+namespace internal {
+
+/// @brief Sampler decorator that short-circuits lookups inside a single tile.
+/// @details A TileSampler derives from the wrapped sampler type (BoxSampler,
+/// QuadraticSampler, etc.).  Sample positions that lie inside the tile's
+/// bounding box, shrunk by the sampler's radius, are answered from the cached
+/// tile value and active state; all other positions are forwarded to the
+/// wrapped sampler.
+template<typename Sampler, typename TreeT>
+class TileSampler: public Sampler
+{
+public:
+    typedef typename TreeT::ValueType ValueT;
+
+    /// @param b        the index-space bounding box of a particular grid tile
+    /// @param tileVal  the tile's value
+    /// @param on       the tile's active state
+    TileSampler(const CoordBBox& b, const ValueT& tileVal, bool on)
+        : mBBox(b.min().asVec3d(), b.max().asVec3d())
+        , mVal(tileVal)
+        , mActive(on)
+        , mEmpty(false)
+    {
+        // Contract the box by the sample radius, then remember whether
+        // anything is left of it.
+        mBBox.expand(-this->radius());
+        mEmpty = mBBox.empty();
+    }
+
+    /// @brief Sample @a inTree at @a inCoord, storing the value in @a result.
+    /// @return the active state associated with the sample
+    bool sample(const TreeT& inTree, const Vec3R& inCoord, ValueT& result) const
+    {
+        const bool insideTile = !mEmpty && mBBox.isInside(inCoord);
+        if (!insideTile) {
+            // Outside the shrunken tile: defer to the wrapped sampler.
+            return Sampler::sample(inTree, inCoord, result);
+        }
+        result = mVal;
+        return mActive;
+    }
+
+protected:
+    BBoxd mBBox;   // tile bounding box, shrunk by the sampler radius
+    ValueT mVal;   // cached tile value
+    bool mActive;  // cached tile active state
+    bool mEmpty;   // true if the shrunken bounding box is empty
+};
+
+
+/// @brief For point sampling, tree traversal is less expensive than testing
+/// bounding box membership.
+/// @details This specialization therefore caches nothing: its constructor
+/// ignores all three arguments and sample() is inherited unchanged from
+/// PointSampler.
+template<typename TreeT>
+class TileSampler<PointSampler, TreeT>: public PointSampler {
+public:
+    /// Accepts the same arguments as the primary template, but discards them.
+    TileSampler(const CoordBBox&, const typename TreeT::ValueType&, bool) {}
+};
+
+/// @brief For point sampling, tree traversal is less expensive than testing
+/// bounding box membership.
+/// @details This specialization therefore caches nothing: its constructor
+/// ignores all three arguments and sample() is inherited unchanged from
+/// StaggeredPointSampler.
+template<typename TreeT>
+class TileSampler<StaggeredPointSampler, TreeT>: public StaggeredPointSampler {
+public:
+    /// Accepts the same arguments as the primary template, but discards them.
+    TileSampler(const CoordBBox&, const typename TreeT::ValueType&, bool) {}
+};
+
+} // namespace internal
+
+
+////////////////////////////////////////
+
+
+/// A GridResampler applies a geometric transformation to an
+/// input grid using one of several sampling schemes, and stores
+/// the result in an output grid.
+///
+/// Usage:
+/// @code
+/// GridResampler resampler;
+/// resampler.transformGrid<BoxSampler>(xform, inGrid, outGrid);
+/// @endcode
+/// where @c xform is a functor that implements the following methods:
+/// @code
+/// bool isAffine() const
+/// openvdb::Vec3d transform(const openvdb::Vec3d&) const
+/// openvdb::Vec3d invTransform(const openvdb::Vec3d&) const
+/// @endcode
+/// @note When the transform is affine and can be expressed as a 4 x 4 matrix,
+/// a GridTransformer is much more efficient than a GridResampler.
+class GridResampler
+{
+public:
+    typedef boost::shared_ptr<GridResampler> Ptr;
+    /// Callback polled during processing; a @c true result aborts the work.
+    typedef boost::function<bool (void)> InterruptFunc;
+
+    /// Construct a resampler with threading and tile processing both enabled
+    /// and no interrupter installed.
+    GridResampler(): mThreaded(true), mTransformTiles(true) {}
+    virtual ~GridResampler() {}
+
+    /// Enable or disable threading.  (Threading is enabled by default.)
+    void setThreaded(bool b) { mThreaded = b; }
+    /// Return @c true if threading is enabled.
+    bool threaded() const { return mThreaded; }
+    /// Enable or disable processing of tiles.  (Enabled by default, except for level set grids.)
+    void setTransformTiles(bool b) { mTransformTiles = b; }
+    /// Return @c true if tile processing is enabled.
+    bool transformTiles() const { return mTransformTiles; }
+
+    /// @brief Allow processing to be aborted by providing an interrupter object.
+    /// The interrupter will be queried periodically during processing.
+    /// @see util/NullInterrupter.h for interrupter interface requirements.
+    /// @note Only the address of the interrupter is stored, so the object
+    /// must outlive any subsequent use of this resampler.
+    template<typename InterrupterType> void setInterrupter(InterrupterType&);
+
+    /// @brief Resample @a inGrid through the given transformer functor and
+    /// store the result in @a outGrid.
+    /// @details The background of the output grid's tree is first changed to
+    /// the input grid's background value.
+    template<typename Sampler, typename GridT, typename Transformer>
+    void transformGrid(const Transformer&,
+        const GridT& inGrid, GridT& outGrid) const;
+
+protected:
+    /// Worker shared with subclasses: resample @a inGrid into @a outGrid
+    /// without touching the output background.
+    template<typename Sampler, typename GridT, typename Transformer>
+    void applyTransform(const Transformer&, const GridT& inGrid, GridT& outGrid) const;
+
+    /// Return @c true if an interrupter is installed and it requests an abort.
+    bool interrupt() const { return mInterrupt && mInterrupt(); }
+
+private:
+    /// Resample the region of the input tree bounded by @a inBBox into the output tree.
+    template<typename Sampler, typename InTreeT, typename OutTreeT, typename Transformer>
+    static void transformBBox(const Transformer&, const CoordBBox& inBBox,
+        const InTreeT& inTree, OutTreeT& outTree, const InterruptFunc&,
+        const Sampler& = Sampler());
+
+    /// Functor that resamples ranges of leaf nodes or tiles (defined below).
+    template<typename Sampler, typename TreeT, typename Transformer>
+    class RangeProcessor;
+
+    bool mThreaded, mTransformTiles;
+    InterruptFunc mInterrupt; // empty until setInterrupter() is called
+};
+
+
+////////////////////////////////////////
+
+
+/// @brief A GridTransformer applies a geometric transformation to an
+/// input grid using one of several sampling schemes, and stores
+/// the result in an output grid.
+///
+/// @note GridTransformer is optimized for affine transformations.
+///
+/// Usage:
+/// @code
+/// Mat4R xform = ...;
+/// GridTransformer transformer(xform);
+/// transformer.transformGrid<BoxSampler>(inGrid, outGrid);
+/// @endcode
+/// or
+/// @code
+/// Vec3R pivot = ..., scale = ..., rotate = ..., translate = ...;
+/// GridTransformer transformer(pivot, scale, rotate, translate);
+/// transformer.transformGrid<QuadraticSampler>(inGrid, outGrid);
+/// @endcode
+class GridTransformer: public GridResampler
+{
+public:
+    typedef boost::shared_ptr<GridTransformer> Ptr;
+
+    /// @brief Construct a transformer from a 4 x 4 matrix.
+    /// @details If the matrix can be decomposed into scale, rotation and
+    /// translation, those components are used to set up multi-pass
+    /// downsampling; otherwise the matrix is applied in a single pass.
+    GridTransformer(const Mat4R& xform);
+    /// @brief Construct a transformer from separate components.
+    /// @param pivot          the point about which scaling and rotation are applied
+    /// @param scale          per-axis scale factors
+    /// @param rotate         per-axis rotation angles
+    ///                       (presumably radians -- confirm against Mat4::preRotate)
+    /// @param translate      the translation
+    /// @param xformOrder     a three-character string containing exactly one each
+    ///                       of 't', 'r' and 's'
+    /// @param rotationOrder  a three-character string containing exactly one each
+    ///                       of 'x', 'y' and 'z'
+    /// @throw ValueError if either order string is malformed
+    GridTransformer(
+        const Vec3R& pivot,
+        const Vec3R& scale,
+        const Vec3R& rotate,
+        const Vec3R& translate,
+        const std::string& xformOrder = "tsr",
+        const std::string& rotationOrder = "zyx");
+    virtual ~GridTransformer() {}
+
+    /// Return the full transformation matrix.
+    const Mat4R& getTransform() const { return mTransform; }
+
+    /// @brief Transform @a inGrid and store the result in @a outGrid.
+    /// @details The background of the output grid's tree is first changed to
+    /// the input grid's background value.
+    template<class Sampler, class GridT>
+    void transformGrid(const GridT& inGrid, GridT& outGrid) const;
+
+private:
+    /// Adapter that exposes a Mat4R through the Transformer functor interface.
+    struct MatrixTransform;
+
+    /// Build mTransform, the pre- and post-scale matrices and the mip levels.
+    inline void init(const Vec3R& pivot, const Vec3R& scale,
+        const Vec3R& rotate, const Vec3R& translate,
+        const std::string& xformOrder, const std::string& rotOrder);
+
+    Vec3R mPivot;     // pivot point for scaling and rotation
+    Vec3i mMipLevels; // number of successive halvings required per axis
+    // Full transform, and the parts applied before and after the halving passes
+    Mat4R mTransform, mPreScaleTransform, mPostScaleTransform;
+};
+
+
+////////////////////////////////////////
+
+
+namespace local_util {
+
+/// @brief Decompose an affine transform into scale, rotation and translation components.
+/// @param m               the matrix to decompose
+/// @param[out] scale      the per-axis (possibly negative) scale factors
+/// @param[out] rotate     the Euler angles of the rotation component
+/// @param[out] translate  the translation component
+/// @return @c false if the given matrix is not affine or cannot otherwise be decomposed.
+/// @note @a translate is written whenever the matrix is affine, even if the
+/// function subsequently returns @c false; @a scale and @a rotate are written
+/// only when a candidate decomposition reproduces the matrix.
+template<typename T>
+inline bool
+decompose(const math::Mat4<T>& m, math::Vec3<T>& scale,
+    math::Vec3<T>& rotate, math::Vec3<T>& translate)
+{
+    if (!math::isAffine(m)) return false;
+
+    // This is the translation in world space
+    translate = m.getTranslation();
+    // Extract the 3 x 3 part of the matrix (everything except the translation).
+    const math::Mat3<T> xform = m.getMat3();
+
+    // The magnitude of each axis scale is the length of the transformed unit axis.
+    const math::Vec3<T> unsignedScale(
+        (math::Vec3<T>(1, 0, 0) * xform).length(),
+        (math::Vec3<T>(0, 1, 0) * xform).length(),
+        (math::Vec3<T>(0, 0, 1) * xform).length());
+
+    const bool hasUniformScale = unsignedScale.eq(math::Vec3<T>(unsignedScale[0]));
+
+    bool hasRotation = false;
+    bool validDecomposition = false;
+
+    // Smallest "largest absolute Euler angle" seen among the valid candidates so far
+    T minAngle = std::numeric_limits<T>::max();
+
+    // If the transformation matrix contains a reflection,
+    // test different negative scales to find a decomposition
+    // that favors the optimal resampling algorithm.
+    // The three low bits of n select which axes receive a negative scale.
+    for (size_t n = 0; n < 8; ++n) {
+
+        const math::Vec3<T> signedScale(
+            n & 0x1 ? -unsignedScale.x() : unsignedScale.x(),
+            n & 0x2 ? -unsignedScale.y() : unsignedScale.y(),
+            n & 0x4 ? -unsignedScale.z() : unsignedScale.z());
+
+        // Extract scale and potentially reflection.
+        const math::Mat3<T> mat = xform * math::scale<math::Mat3<T> >(signedScale).inverse();
+        if (mat.det() < T(0.0)) continue; // Skip if mat contains a reflection.
+
+        const math::Vec3<T> tmpAngle = math::eulerAngles(mat, math::XYZ_ROTATION);
+
+        // Recompose the candidate rotation and scale so that it can be
+        // compared against the original 3 x 3 matrix.
+        const math::Mat3<T> rebuild =
+            math::rotation<math::Mat3<T> >(math::Vec3<T>(1, 0, 0), tmpAngle.x()) *
+            math::rotation<math::Mat3<T> >(math::Vec3<T>(0, 1, 0), tmpAngle.y()) *
+            math::rotation<math::Mat3<T> >(math::Vec3<T>(0, 0, 1), tmpAngle.z()) *
+            math::scale<math::Mat3<T> >(signedScale);
+
+        // The candidate is valid only if it reproduces the original matrix.
+        if (xform.eq(rebuild)) {
+
+            const T maxAngle = std::max(std::abs(tmpAngle[0]),
+                std::max(std::abs(tmpAngle[1]), std::abs(tmpAngle[2])));
+
+            if (!(minAngle < maxAngle)) { // Update if less or equal.
+
+                minAngle = maxAngle;
+                rotate = tmpAngle;
+                scale = signedScale;
+
+                hasRotation = !rotate.eq(math::Vec3<T>::zero());
+                validDecomposition = true;
+
+                if (hasUniformScale || !hasRotation) {
+                    // Current decomposition is optimal.
+                    break;
+                }
+            }
+        }
+    }
+
+    if (!validDecomposition || (hasRotation && !hasUniformScale)) {
+        // The decomposition is invalid if the transformation matrix contains shear.
+        // No unique decomposition if scale is nonuniform and rotation is nonzero.
+        return false;
+    }
+
+    return true;
+}
+
+} // namespace local_util
+
+
+////////////////////////////////////////
+
+
+/// Adapter that presents a 4 x 4 matrix through the Transformer functor
+/// interface, i.e., the isAffine(), transform() and invTransform() methods.
+/// The matrix and its inverse are stored side by side in public members.
+struct GridTransformer::MatrixTransform
+{
+    /// Construct the identity transform.
+    MatrixTransform()
+        : mat(Mat4R::identity())
+        , invMat(Mat4R::identity())
+    {
+    }
+
+    /// Wrap @a xform and compute its inverse once, up front.
+    MatrixTransform(const Mat4R& xform)
+        : mat(xform)
+        , invMat(xform.inverse())
+    {
+    }
+
+    /// Return @c true if the forward matrix is affine.
+    bool isAffine() const
+    {
+        return math::isAffine(mat);
+    }
+
+    /// Apply the forward matrix to @a pos.
+    Vec3R transform(const Vec3R& pos) const
+    {
+        return mat.transformH(pos);
+    }
+
+    /// Apply the inverse matrix to @a pos.
+    Vec3R invTransform(const Vec3R& pos) const
+    {
+        return invMat.transformH(pos);
+    }
+
+    Mat4R mat, invMat;
+};
+
+
+////////////////////////////////////////
+
+
+/// @brief Transformer functor (isAffine(), transform() and invTransform())
+/// that carries positions from an A grid's index space into a B grid's
+/// index space by way of world space, so that after resampling, A's index
+/// space and transform match those of B.
+/// @note Both transforms are held by reference; they must outlive this object.
+class ABTransform
+{
+public:
+    /// @param aXform  the A grid's transform
+    /// @param bXform  the B grid's transform
+    ABTransform(const math::Transform& aXform, const math::Transform& bXform)
+        : mAXform(aXform)
+        , mBXform(bXform)
+        , mIsAffine(mAXform.isLinear() && mBXform.isLinear())
+        , mIsIdentity(mIsAffine && mAXform == mBXform)
+    {
+    }
+
+    /// Return @c true if both the A and the B transform are linear.
+    bool isAffine() const
+    {
+        return mIsAffine;
+    }
+
+    /// Return @c true if both transforms are linear and compare equal.
+    bool isIdentity() const
+    {
+        return mIsIdentity;
+    }
+
+    /// Map a position from A's index space to B's index space.
+    openvdb::Vec3R transform(const openvdb::Vec3R& pos) const
+    {
+        const openvdb::Vec3R worldPos = mAXform.indexToWorld(pos);
+        return mBXform.worldToIndex(worldPos);
+    }
+
+    /// Map a position from B's index space to A's index space.
+    openvdb::Vec3R invTransform(const openvdb::Vec3R& pos) const
+    {
+        const openvdb::Vec3R worldPos = mBXform.indexToWorld(pos);
+        return mAXform.worldToIndex(worldPos);
+    }
+
+    /// Return the A grid's transform.
+    const math::Transform& getA() const
+    {
+        return mAXform;
+    }
+
+    /// Return the B grid's transform.
+    const math::Transform& getB() const
+    {
+        return mBXform;
+    }
+
+private:
+    const math::Transform& mAXform;
+    const math::Transform& mBXform;
+    const bool mIsAffine;
+    const bool mIsIdentity;
+};
+
+
+/// Low-level resampling routine used by the resampleToMatch() functions.
+/// Unlike those functions, it gives level set grids no special treatment,
+/// which typically makes it faster for level sets, but it is correct for
+/// them only when no scaling or shearing is involved.
+///
+/// @warning Do not use this function to scale or shear a level set grid.
+template<typename Sampler, typename Interrupter, typename GridType>
+inline void
+doResampleToMatch(const GridType& inGrid, GridType& outGrid, Interrupter& interrupter)
+{
+    const ABTransform inToOut(inGrid.transform(), outGrid.transform());
+
+    // Identical transforms and a consistent sampler: the output tree is
+    // simply a deep copy of the input tree.
+    if (Sampler::consistent() && inToOut.isIdentity()) {
+        outGrid.setTree(inGrid.tree().copy());
+        return;
+    }
+
+    // At least one transform is non-affine: only the slower, general-purpose
+    // GridResampler API can be used.
+    if (!inToOut.isAffine()) {
+        GridResampler resampler;
+        resampler.setInterrupter(interrupter);
+        resampler.transformGrid<Sampler>(inToOut, inGrid, outGrid);
+        return;
+    }
+
+    // Both transforms are affine: compose an input-to-output matrix
+    // (in:index-to-world * out:world-to-index) and hand it to the fast
+    // GridTransformer API.
+    const Mat4R inIndexToWorld =
+        inToOut.getA().baseMap()->getAffineMap()->getMat4();
+    const Mat4R outWorldToIndex =
+        inToOut.getB().baseMap()->getAffineMap()->getMat4().inverse();
+    const Mat4R inToOutMat = inIndexToWorld * outWorldToIndex;
+
+    GridTransformer transformer(inToOutMat);
+    transformer.setInterrupter(interrupter);
+    transformer.transformGrid<Sampler>(inGrid, outGrid);
+}
+
+
+// Resample inGrid into outGrid's index space.  Level set inputs are routed
+// through the level set rebuild tool; everything else (including level sets
+// whose value type the rebuild tool rejects) goes through doResampleToMatch().
+template<typename Sampler, typename Interrupter, typename GridType>
+inline void
+resampleToMatch(const GridType& inGrid, GridType& outGrid, Interrupter& interrupter)
+{
+    if (inGrid.getGridClass() == GRID_LEVEL_SET) {
+        // If the input grid is a level set, resample it using the level set rebuild tool.
+
+        if (inGrid.constTransform() == outGrid.constTransform()) {
+            // If the transforms of the input and output grids are identical,
+            // the output tree is simply a deep copy of the input tree.
+            outGrid.setTree(inGrid.tree().copy());
+            return;
+        }
+
+        // If the output grid is a level set, resample the input grid to have the output grid's
+        // background value.  Otherwise, preserve the input grid's background value.
+        // The half width is the background value divided by the voxel size.
+        // Only the x component of the voxel size is consulted.
+        typedef typename GridType::ValueType ValueT;
+        const ValueT halfWidth = ((outGrid.getGridClass() == openvdb::GRID_LEVEL_SET)
+            ? ValueT(outGrid.background() * (1.0 / outGrid.voxelSize()[0]))
+            : ValueT(inGrid.background() * (1.0 / inGrid.voxelSize()[0])));
+
+        typename GridType::Ptr tempGrid;
+        try {
+            // Rebuild the zero isosurface directly in the output grid's transform,
+            // using the same width on both sides of the surface.
+            tempGrid = doLevelSetRebuild(inGrid, /*iso=*/zeroVal<ValueT>(),
+                /*exWidth=*/halfWidth, /*inWidth=*/halfWidth,
+                &outGrid.constTransform(), &interrupter);
+        } catch (TypeError&) {
+            // The input grid is classified as a level set, but it has a value type
+            // that is not supported by the level set rebuild tool.  Fall back to
+            // using the generic resampler.
+            tempGrid.reset();
+        }
+        if (tempGrid) {
+            // Adopt the rebuilt tree; outGrid's transform is left untouched.
+            outGrid.setTree(tempGrid->treePtr());
+            return;
+        }
+    }
+
+    // If the input grid is not a level set, use the generic resampler.
+    // (This is also reached when the level set rebuild yielded no grid.)
+    doResampleToMatch<Sampler>(inGrid, outGrid, interrupter);
+}
+
+
+// Convenience overload for callers that do not need to abort the operation:
+// supplies a do-nothing interrupter and forwards to the overload above.
+template<typename Sampler, typename GridType>
+inline void
+resampleToMatch(const GridType& inGrid, GridType& outGrid)
+{
+    util::NullInterrupter neverInterrupts;
+    resampleToMatch<Sampler>(inGrid, outGrid, neverInterrupts);
+}
+
+
+////////////////////////////////////////
+
+
+// Construct from a 4 x 4 matrix.  The matrix is stored as given; if it can
+// also be split into scale, rotation and translation, init() is run on those
+// components to set up a mipmapping-like scheme for downsampling.
+inline
+GridTransformer::GridTransformer(const Mat4R& xform):
+    mPivot(0, 0, 0),
+    mMipLevels(0, 0, 0),
+    mTransform(xform),
+    mPreScaleTransform(Mat4R::identity()),
+    mPostScaleTransform(Mat4R::identity())
+{
+    Vec3R scalePart, rotatePart, translatePart;
+    const bool decomposable =
+        local_util::decompose(mTransform, scalePart, rotatePart, translatePart);
+
+    // No decomposition available: keep the single-pass setup established by
+    // the member initializers above.
+    if (!decomposable) return;
+
+    init(mPivot, scalePart, rotatePart, translatePart, "srt", "zyx");
+}
+
+
+// Construct from separate pivot, scale, rotation and translation components.
+// The member initializers only establish neutral defaults (mTransform is
+// default-constructed here); init() then validates the order strings and
+// fills in the pivot, the mip levels and all three matrices.
+// Throws ValueError (from init()) if either order string is invalid.
+inline
+GridTransformer::GridTransformer(
+    const Vec3R& pivot, const Vec3R& scale,
+    const Vec3R& rotate, const Vec3R& translate,
+    const std::string& xformOrder, const std::string& rotOrder):
+    mPivot(0, 0, 0),
+    mMipLevels(0, 0, 0),
+    mPreScaleTransform(Mat4R::identity()),
+    mPostScaleTransform(Mat4R::identity())
+{
+    init(pivot, scale, rotate, translate, xformOrder, rotOrder);
+}
+
+
+////////////////////////////////////////
+
+
+// Build the transformer's state from separate components:
+//  - mPivot is set to the given pivot;
+//  - mMipLevels receives, per axis, the number of halvings needed to bring
+//    the scale factor's magnitude up to more than 1/2;
+//  - mTransform receives the full transform, while mPreScaleTransform and
+//    mPostScaleTransform receive the parts to be applied before and after
+//    the halving passes.
+// Throws ValueError if xformOrder is not a permutation of "t", "r" and "s"
+// or rotOrder is not a permutation of "x", "y" and "z".
+inline void
+GridTransformer::init(
+    const Vec3R& pivot, const Vec3R& scale,
+    const Vec3R& rotate, const Vec3R& translate,
+    const std::string& xformOrder, const std::string& rotOrder)
+{
+    if (xformOrder.size() != 3) {
+        OPENVDB_THROW(ValueError, "invalid transform order (" + xformOrder + ")");
+    }
+    if (rotOrder.size() != 3) {
+        OPENVDB_THROW(ValueError, "invalid rotation order (" + rotOrder + ")");
+    }
+
+    mPivot = pivot;
+
+    // Scaling is handled via a mipmapping-like scheme of successive
+    // halvings of the tree resolution, until the remaining scale
+    // factor is greater than or equal to 1/2.
+    // NOTE(review): a scale component of exactly zero is not guarded against;
+    // std::log(0) would feed a non-finite value into the int conversion below.
+    Vec3R scaleRemainder = scale;
+    for (int i = 0; i < 3; ++i) {
+        double s = std::fabs(scale(i));
+        if (s < 0.5) {
+            // levels = floor(log2(1/s)); the leftover factor is scale * 2^levels.
+            mMipLevels(i) = int(std::floor(-std::log(s)/std::log(2.0)));
+            scaleRemainder(i) = scale(i) * (1 << mMipLevels(i));
+        }
+    }
+
+    // Build pre-scale and post-scale transform matrices based on
+    // the user-specified order of operations.
+    // Note that we iterate over the transform order string in reverse order
+    // (e.g., "t", "r", "s", given "srt").  This is because math::Mat matrices
+    // postmultiply row vectors rather than premultiplying column vectors.
+    mTransform = mPreScaleTransform = mPostScaleTransform = Mat4R::identity();
+    // Operations visited before the 's' entry (in reverse iteration), and the
+    // leftover scale itself, accumulate into the post-scale matrix; once 's'
+    // has been handled, the rest accumulate into the pre-scale matrix.
+    Mat4R* remainder = &mPostScaleTransform;
+    // Position of each operation within xformOrder; 3 means "not seen".
+    int rpos, spos, tpos;
+    rpos = spos = tpos = 3;
+    for (int ix = 2; ix >= 0; --ix) { // reverse iteration
+        switch (xformOrder[ix]) {
+
+        case 'r':
+            rpos = ix;
+            // Rotate about the pivot: translate, rotate, translate back.
+            mTransform.preTranslate(pivot);
+            remainder->preTranslate(pivot);
+
+            // Position of each axis within rotOrder; 3 means "not seen".
+            int xpos, ypos, zpos;
+            xpos = ypos = zpos = 3;
+            for (int ir = 2; ir >= 0; --ir) {
+                switch (rotOrder[ir]) {
+                case 'x':
+                    xpos = ir;
+                    mTransform.preRotate(math::X_AXIS, rotate.x());
+                    remainder->preRotate(math::X_AXIS, rotate.x());
+                    break;
+                case 'y':
+                    ypos = ir;
+                    mTransform.preRotate(math::Y_AXIS, rotate.y());
+                    remainder->preRotate(math::Y_AXIS, rotate.y());
+                    break;
+                case 'z':
+                    zpos = ir;
+                    mTransform.preRotate(math::Z_AXIS, rotate.z());
+                    remainder->preRotate(math::Z_AXIS, rotate.z());
+                    break;
+                }
+            }
+            // Reject rotation order strings that don't contain exactly one
+            // instance of "x", "y" and "z".
+            if (xpos > 2 || ypos > 2 || zpos > 2) {
+                OPENVDB_THROW(ValueError, "invalid rotation order (" + rotOrder + ")");
+            }
+
+            mTransform.preTranslate(-pivot);
+            remainder->preTranslate(-pivot);
+            break;
+
+        case 's':
+            spos = ix;
+            // The full transform gets the entire scale about the pivot...
+            mTransform.preTranslate(pivot);
+            mTransform.preScale(scale);
+            mTransform.preTranslate(-pivot);
+
+            // ...whereas the post-scale matrix gets only the part of the scale
+            // that is not covered by the halving passes.
+            remainder->preTranslate(pivot);
+            remainder->preScale(scaleRemainder);
+            remainder->preTranslate(-pivot);
+            remainder = &mPreScaleTransform;
+            break;
+
+        case 't':
+            tpos = ix;
+            mTransform.preTranslate(translate);
+            remainder->preTranslate(translate);
+            break;
+        }
+    }
+    // Reject transform order strings that don't contain exactly one
+    // instance of "t", "r" and "s".
+    if (tpos > 2 || rpos > 2 || spos > 2) {
+        OPENVDB_THROW(ValueError, "invalid transform order (" + xformOrder + ")");
+    }
+}
+
+
+////////////////////////////////////////
+
+
+// Install an interrupt callback that invokes wasInterrupted(-1) on the given
+// interrupter.  Only the object's address is bound into the callback, so the
+// interrupter must remain alive for as long as the callback may be invoked.
+template<typename InterrupterType>
+void
+GridResampler::setInterrupter(InterrupterType& interrupter)
+{
+    InterrupterType* const target = &interrupter;
+    const int percent = -1;
+    mInterrupt = boost::bind(&InterrupterType::wasInterrupted, target, percent);
+}
+
+
+// Resample inGrid through the given transformer functor into outGrid, after
+// first giving the output tree the same background value as the input grid.
+template<typename Sampler, typename GridT, typename Transformer>
+void
+GridResampler::transformGrid(const Transformer& xform,
+    const GridT& inGrid, GridT& outGrid) const
+{
+    const typename GridT::ValueType inBackground = inGrid.background();
+    tools::changeBackground(outGrid.tree(), inBackground);
+
+    applyTransform<Sampler>(xform, inGrid, outGrid);
+}
+
+
+// Transform inGrid by this transformer's matrix and store the result in outGrid.
+// When the sampler does not ask for mipmapping, or no axis needs halving, the
+// full matrix is applied in one pass.  Otherwise the work is split into an
+// optional pre-scale pass, one or more passes that each halve the resolution
+// along the axes that still require it, and a final post-scale pass.
+template<class Sampler, class GridT>
+void
+GridTransformer::transformGrid(const GridT& inGrid, GridT& outGrid) const
+{
+    // The output grid takes on the input grid's background value.
+    tools::changeBackground(outGrid.tree(), inGrid.background());
+
+    if (!Sampler::mipmap() || mMipLevels == Vec3i::zero()) {
+        // Skip the mipmapping step.
+        const MatrixTransform xform(mTransform);
+        applyTransform<Sampler>(xform, inGrid, outGrid);
+
+    } else {
+        // firstPass stays true until something has been written to tempGrid,
+        // i.e., for as long as the next pass must read from inGrid.
+        bool firstPass = true;
+        const typename GridT::ValueType background = inGrid.background();
+        typename GridT::Ptr tempGrid = GridT::create(background);
+
+        if (!mPreScaleTransform.eq(Mat4R::identity())) {
+            firstPass = false;
+            // Apply the pre-scale transform to the input grid
+            // and store the result in a temporary grid.
+            const MatrixTransform xform(mPreScaleTransform);
+            applyTransform<Sampler>(xform, inGrid, *tempGrid);
+        }
+
+        // While the scale factor along one or more axes is less than 1/2,
+        // scale the grid by half along those axes.
+        Vec3i count = mMipLevels; // # of halvings remaining per axis
+        while (count != Vec3i::zero()) {
+            // Build a matrix that scales by 1/2 about the pivot along every
+            // axis that still has halvings remaining.
+            MatrixTransform xform;
+            xform.mat.setTranslation(mPivot);
+            xform.mat.preScale(Vec3R(
+                count.x() ? .5 : 1, count.y() ? .5 : 1, count.z() ? .5 : 1));
+            xform.mat.preTranslate(-mPivot);
+            xform.invMat = xform.mat.inverse();
+
+            if (firstPass) {
+                firstPass = false;
+                // Scale the input grid and store the result in a temporary grid.
+                applyTransform<Sampler>(xform, inGrid, *tempGrid);
+            } else {
+                // Scale the temporary grid and store the result in a transient grid,
+                // then swap the two and discard the transient grid.
+                typename GridT::Ptr destGrid = GridT::create(background);
+                applyTransform<Sampler>(xform, *tempGrid, *destGrid);
+                tempGrid.swap(destGrid);
+            }
+            // (3, 2, 1) -> (2, 1, 0) -> (1, 0, 0) -> (0, 0, 0), etc.
+            count = math::maxComponent(count - 1, Vec3i::zero());
+        }
+
+        // Apply the post-scale transform and store the result in the output grid.
+        if (!mPostScaleTransform.eq(Mat4R::identity())) {
+            const MatrixTransform xform(mPostScaleTransform);
+            applyTransform<Sampler>(xform, *tempGrid, outGrid);
+        } else {
+            // Nothing left to apply: hand the temporary grid's tree to the output grid.
+            outGrid.setTree(tempGrid->treePtr());
+        }
+    }
+}
+
+
+////////////////////////////////////////
+
+
+template<class Sampler, class TreeT, typename Transformer>
+class GridResampler::RangeProcessor
+{
+public:
+    typedef typename TreeT::LeafCIter LeafIterT;
+    typedef typename TreeT::ValueAllCIter TileIterT;
+    typedef typename tree::IteratorRange<LeafIterT> LeafRange;
+    typedef typename tree::IteratorRange<TileIterT> TileRange;
+    typedef typename tree::ValueAccessor<const TreeT> InTreeAccessor;
+    typedef typename tree::ValueAccessor<TreeT> OutTreeAccessor;
+
+    RangeProcessor(const Transformer& xform, const CoordBBox& b, const TreeT& inT, TreeT& outT):
+        mIsRoot(true), mXform(xform), mBBox(b),
+        mInTree(inT), mOutTree(&outT), mInAcc(mInTree), mOutAcc(*mOutTree)
+    {}
+
+    RangeProcessor(const Transformer& xform, const CoordBBox& b, const TreeT& inTree):
+        mIsRoot(false), mXform(xform), mBBox(b),
+        mInTree(inTree), mOutTree(new TreeT(inTree.background())),
+        mInAcc(mInTree), mOutAcc(*mOutTree)
+    {}
+
+    ~RangeProcessor() { if (!mIsRoot) delete mOutTree; }
+
+    /// Splitting constructor: don't copy the original processor's output tree
+    RangeProcessor(RangeProcessor& other, tbb::split):
+        mIsRoot(false),
+        mXform(other.mXform),
+        mBBox(other.mBBox),
+        mInTree(other.mInTree),
+        mOutTree(new TreeT(mInTree.background())),
+        mInAcc(mInTree),
+        mOutAcc(*mOutTree),
+        mInterrupt(other.mInterrupt)
+    {}
+
+    void setInterrupt(const InterruptFunc& f) { mInterrupt = f; }
+
+    /// Transform each leaf node in the given range.
+    void operator()(LeafRange& r)
+    {
+        for ( ; r; ++r) {
+            if (interrupt()) break;
+            LeafIterT i = r.iterator();
+            CoordBBox bbox(i->origin(), i->origin() + Coord(i->dim()));
+            if (!mBBox.empty()) {
+                // Intersect the leaf node's bounding box with mBBox.
+                bbox = CoordBBox(
+                    Coord::maxComponent(bbox.min(), mBBox.min()),
+                    Coord::minComponent(bbox.max(), mBBox.max()));
+            }
+            if (!bbox.empty()) {
+                transformBBox<Sampler>(mXform, bbox, mInAcc, mOutAcc, mInterrupt);
+            }
+        }
+    }
+
+    /// Resample each non-background tile visited by @a r into the output tree.
+    void operator()(TileRange& r)
+    {
+        for ( ; r; ++r) {
+            if (interrupt()) break;
+            TileIterT tileIter = r.iterator();
+            // Voxels are not handled here, and neither are inactive background tiles.
+            if (!tileIter.isTileValue()) continue;
+            const bool tileIsActive = tileIter.isValueOn();
+            if (!tileIsActive && math::isApproxEqual(*tileIter, mOutTree->background())) continue;
+
+            CoordBBox region;
+            tileIter.getBoundingBox(region);
+            if (!mBBox.empty()) {
+                // Clip the tile's box to mBBox.
+                region = CoordBBox(
+                    Coord::maxComponent(region.min(), mBBox.min()),
+                    Coord::minComponent(region.max(), mBBox.max()));
+            }
+            if (!region.empty()) {
+                /// @todo This samples the tile voxel-by-voxel, which is much too slow.
+                /// Instead, compute the largest axis-aligned bounding box that is
+                /// contained in the transformed tile (adjusted for the sampler radius)
+                /// and fill it with the tile value.  Then transform the remaining voxels.
+                internal::TileSampler<Sampler, InTreeAccessor>
+                    sampler(region, tileIter.getValue(), tileIsActive);
+                transformBBox(mXform, region, mInAcc, mOutAcc, mInterrupt, sampler);
+            }
+        }
+    }
+
+    /// Fold the output tree of @a other into this processor's output tree (skipped once interrupted).
+    void join(RangeProcessor& other) {
+        if (interrupt()) return;
+        mOutTree->merge(*other.mOutTree);
+    }
+
+private:
+    bool interrupt() const { return mInterrupt && mInterrupt(); } // short-circuits if mInterrupt tests false
+
+    const bool mIsRoot; // true if mOutTree is the top-level tree
+    Transformer mXform; // transform handed to transformBBox() for every region
+    CoordBBox mBBox; // when non-empty, leaf and tile boxes are intersected with it
+    const TreeT& mInTree;
+    TreeT* mOutTree; // join() merges other processors' output trees into this one
+    InTreeAccessor mInAcc; // passed to transformBBox() as the input
+    OutTreeAccessor mOutAcc; // accessor on *mOutTree, passed to transformBBox() as the output
+    InterruptFunc mInterrupt; // replaced via setInterrupt()
+};
+
+
+////////////////////////////////////////
+
+/// Resample @a inGrid's tree into @a outGrid's tree via @a xform: tiles first (optional), then leaf nodes.
+template<class Sampler, class GridT, typename Transformer>
+void
+GridResampler::applyTransform(const Transformer& xform,
+    const GridT& inGrid, GridT& outGrid) const
+{
+    typedef typename GridT::TreeType TreeT;
+    const TreeT& inTree = inGrid.tree();
+    TreeT& outTree = outGrid.tree();
+
+    typedef RangeProcessor<Sampler, TreeT, Transformer> RangeProc;
+
+    const GridClass gridClass = inGrid.getGridClass();
+
+    if (gridClass != GRID_LEVEL_SET && mTransformTiles) {
+        // Independently transform the tiles of the input grid.
+        // Note: Tiles in level sets can only be background tiles, and they
+        // are handled more efficiently with a signed flood fill (see below).
+
+        RangeProc proc(xform, CoordBBox(), inTree, outTree); // default-constructed CoordBBox as the clip box
+        proc.setInterrupt(mInterrupt);
+
+        typename RangeProc::TileIterT tileIter = inTree.cbeginValueAll();
+        tileIter.setMaxDepth(tileIter.getLeafDepth() - 1); // skip leaf nodes
+        typename RangeProc::TileRange tileRange(tileIter);
+
+        if (mThreaded) {
+            tbb::parallel_reduce(tileRange, proc);
+        } else {
+            proc(tileRange); // serial path: invoke the functor directly on the whole range
+        }
+    }
+
+    CoordBBox clipBBox; // left default-constructed unless the grid is a level set
+    if (gridClass == GRID_LEVEL_SET) {
+        // Inactive voxels in level sets can only be background voxels, and they
+        // are handled more efficiently with a signed flood fill (see below).
+        clipBBox = inGrid.evalActiveVoxelBoundingBox();
+    }
+
+    // Independently transform the leaf nodes of the input grid.
+
+    RangeProc proc(xform, clipBBox, inTree, outTree);
+    proc.setInterrupt(mInterrupt);
+
+    typename RangeProc::LeafRange leafRange(inTree.cbeginLeaf());
+
+    if (mThreaded) {
+        tbb::parallel_reduce(leafRange, proc);
+    } else {
+        proc(leafRange); // serial path: invoke the functor directly on the whole range
+    }
+
+    // If the grid is a level set, mark inactive voxels as inside or outside.
+    if (gridClass == GRID_LEVEL_SET) {
+        tools::pruneLevelSet(outTree);
+        tools::signedFloodFill(outTree);
+    }
+}
+
+
+////////////////////////////////////////
+
+/// Fill the output voxels enclosing the image of @a bbox under @a xform with samples of @a inTree.
+//static
+template<class Sampler, class InTreeT, class OutTreeT, class Transformer>
+void
+GridResampler::transformBBox(
+    const Transformer& xform,
+    const CoordBBox& bbox,
+    const InTreeT& inTree,
+    OutTreeT& outTree,
+    const InterruptFunc& interrupt,
+    const Sampler& sampler)
+{
+    typedef typename OutTreeT::ValueType ValueT;
+
+    // Transform the corners of the input tree's bounding box
+    // and compute the enclosing bounding box in the output tree.
+    Vec3R
+        inRMin(bbox.min().x(), bbox.min().y(), bbox.min().z()),
+        inRMax(bbox.max().x(), bbox.max().y(), bbox.max().z()),
+        outRMin = math::minComponent(xform.transform(inRMin), xform.transform(inRMax)),
+        outRMax = math::maxComponent(xform.transform(inRMin), xform.transform(inRMax));
+    for (int i = 0; i < 8; ++i) { // bits 0, 1 and 2 of i pick min or max along x, y and z
+        Vec3R corner(
+            i & 1 ? inRMax.x() : inRMin.x(),
+            i & 2 ? inRMax.y() : inRMin.y(),
+            i & 4 ? inRMax.z() : inRMin.z());
+        outRMin = math::minComponent(outRMin, xform.transform(corner));
+        outRMax = math::maxComponent(outRMax, xform.transform(corner));
+    }
+    Vec3i
+        outMin = local_util::floorVec3(outRMin) - Sampler::radius(), // widen by the sampler radius
+        outMax = local_util::ceilVec3(outRMax) + Sampler::radius();
+
+    if (!xform.isAffine()) {
+        // If the transform is not affine, back-project each output voxel
+        // into the input tree.
+        Vec3R xyz, inXYZ;
+        Coord outXYZ;
+        int &x = outXYZ.x(), &y = outXYZ.y(), &z = outXYZ.z(); // loop counters alias outXYZ's components
+        for (x = outMin.x(); x <= outMax.x(); ++x) {
+            if (interrupt && interrupt()) break;
+            xyz.x() = x;
+            for (y = outMin.y(); y <= outMax.y(); ++y) {
+                if (interrupt && interrupt()) break;
+                xyz.y() = y;
+                for (z = outMin.z(); z <= outMax.z(); ++z) {
+                    xyz.z() = z;
+                    inXYZ = xform.invTransform(xyz);
+                    ValueT result;
+                    if (sampler.sample(inTree, inXYZ, result)) {
+                        outTree.setValueOn(outXYZ, result);
+                    } else {
+                        // Note: Don't overwrite existing active values with inactive values.
+                        if (!outTree.isValueOn(outXYZ)) {
+                            outTree.setValueOff(outXYZ, result);
+                        }
+                    }
+                }
+            }
+        }
+    } else { // affine
+        // Compute step sizes in the input tree that correspond to
+        // unit steps in x, y and z in the output tree.
+        const Vec3R
+            translation = xform.invTransform(Vec3R(0, 0, 0)),
+            deltaX = xform.invTransform(Vec3R(1, 0, 0)) - translation,
+            deltaY = xform.invTransform(Vec3R(0, 1, 0)) - translation,
+            deltaZ = xform.invTransform(Vec3R(0, 0, 1)) - translation;
+
+#if defined(__ICC)
+        /// @todo The following line is a workaround for bad code generation
+        /// in opt-icc11.1_64 (but not debug or gcc) builds.  It should be
+        /// removed once the problem has been addressed at its source.
+        const Vec3R dummy = deltaX;
+#endif
+
+        // Step by whole voxels through the output tree, sampling the
+        // corresponding fractional voxels of the input tree.
+        Vec3R inStartX = xform.invTransform(Vec3R(outMin)); // input-space position of the first output voxel
+        Coord outXYZ;
+        int &x = outXYZ.x(), &y = outXYZ.y(), &z = outXYZ.z(); // loop counters alias outXYZ's components
+        for (x = outMin.x(); x <= outMax.x(); ++x, inStartX += deltaX) {
+            if (interrupt && interrupt()) break;
+            Vec3R inStartY = inStartX;
+            for (y = outMin.y(); y <= outMax.y(); ++y, inStartY += deltaY) {
+                if (interrupt && interrupt()) break;
+                Vec3R inXYZ = inStartY;
+                for (z = outMin.z(); z <= outMax.z(); ++z, inXYZ += deltaZ) {
+                    ValueT result;
+                    if (sampler.sample(inTree, inXYZ, result)) {
+                        outTree.setValueOn(outXYZ, result);
+                    } else {
+                        // Note: Don't overwrite existing active values with inactive values.
+                        if (!outTree.isValueOn(outXYZ)) {
+                            outTree.setValueOff(outXYZ, result);
+                        }
+                    }
+                }
+            }
+        }
+    }
+} // GridResampler::transformBBox()
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_GRIDTRANSFORMER_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/Interpolation.h b/nuparu/include/openvdb_new/tools/Interpolation.h
new file mode 100644
index 00000000..81decbad
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/Interpolation.h
@@ -0,0 +1,1040 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file Interpolation.h
+///
+/// Sampler classes such as PointSampler and BoxSampler that are intended for use
+/// with tools::GridTransformer should operate in voxel space and must adhere to
+/// the interface described in the example below:
+/// @code
+/// struct MySampler
+/// {
+///     // Return a short name that can be used to identify this sampler
+///     // in error messages and elsewhere.
+///     static const char* name() { return "mysampler"; }
+///
+///     // Return the radius of the sampling kernel in voxels, not including
+///     // the center voxel.  This is the number of voxels of padding that
+///     // are added to all sides of a volume as a result of resampling.
+///     static int radius() { return 2; }
+///
+///     // Return true if scaling by a factor smaller than 0.5 (along any axis)
+///     // should be handled via a mipmapping-like scheme of successive halvings
+///     // of a grid's resolution, until the remaining scale factor is
+///     // greater than or equal to 1/2.  Set this to false only when high-quality
+///     // scaling is not required.
+///     static bool mipmap() { return true; }
+///
+///     // Specify if sampling at a location that is collocated with a grid point
+///     // is guaranteed to return the exact value at that grid point.
+///     // For most sampling kernels, this should be false.
+///     static bool consistent() { return false; }
+///
+///     // Sample the tree at the given coordinates and return the result in val.
+///     // Return true if the sampled value is active.
+///     template<class TreeT>
+///     static bool sample(const TreeT& tree, const Vec3R& coord, typename TreeT::ValueType& val);
+/// };
+/// @endcode
+
+#ifndef OPENVDB_TOOLS_INTERPOLATION_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_INTERPOLATION_HAS_BEEN_INCLUDED
+
+#include <cmath>
+#include <boost/shared_ptr.hpp>
+#include <openvdb/version.h> // for OPENVDB_VERSION_NAME
+#include <openvdb/Platform.h> // for round()
+#include <openvdb/math/Math.h>// for SmoothUnitStep
+#include <openvdb/math/Transform.h> // for Transform
+#include <openvdb/Grid.h>
+#include <openvdb/tree/ValueAccessor.h>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+/// @brief Provides a unified interface for sampling, i.e. interpolation.
+/// @details Order = 0: closest point
+///          Order = 1: tri-linear
+///          Order = 2: tri-quadratic
+///          Staggered: Set to true for MAC grids
+template <size_t Order, bool Staggered = false>
+struct Sampler
+{
+    BOOST_STATIC_ASSERT(Order < 3); // only orders 0, 1 and 2 are accepted
+    static const char* name();
+    static int radius();
+    static bool mipmap();
+    static bool consistent();
+    static bool staggered();
+    static size_t order();
+
+    /// @brief Sample @a inTree at the floating-point index coordinate @a inCoord
+    /// and store the result in @a result.
+    ///
+    /// @return @c true if the sampled value is active.
+    template<class TreeT>
+    static bool sample(const TreeT& inTree, const Vec3R& inCoord,
+                       typename TreeT::ValueType& result);
+
+    /// @brief Sample @a inTree at the floating-point index coordinate @a inCoord.
+    ///
+    /// @return the reconstructed value
+    template<class TreeT>
+    static typename TreeT::ValueType sample(const TreeT& inTree, const Vec3R& inCoord);
+};
+
+//////////////////////////////////////// Non-Staggered Samplers
+
+// The following samplers operate in voxel space.
+// When the samplers are applied to grids holding vector or other non-scalar data,
+// the data is assumed to be collocated.  For example, using the BoxSampler on a grid
+// with ValueType Vec3f assumes that all three elements in a vector can be assigned
+// the same physical location. Consider using the GridSampler below instead.
+
+struct PointSampler
+{
+    static const char* name() { return "point"; } // short identifier for this sampler
+    static int radius() { return 0; } // kernel radius in voxels, excluding the center voxel
+    static bool mipmap() { return false; } // whether strong downscaling uses successive halvings
+    static bool consistent() { return true; } // whether sampling at a grid point returns its exact value
+    static bool staggered() { return false; } // true only for staggered (MAC) grid samplers
+    static size_t order() { return 0; } // 0: closest point
+
+    /// @brief Sample @a inTree at the nearest neighbor to @a inCoord
+    /// and store the result in @a result.
+    /// @return @c true if the sampled value is active.
+    template<class TreeT>
+    static bool sample(const TreeT& inTree, const Vec3R& inCoord,
+                       typename TreeT::ValueType& result);
+
+    /// @brief Sample @a inTree at the nearest neighbor to @a inCoord
+    /// @return the reconstructed value
+    template<class TreeT>
+    static typename TreeT::ValueType sample(const TreeT& inTree, const Vec3R& inCoord);
+};
+
+
+struct BoxSampler
+{
+    static const char* name() { return "box"; } // short identifier for this sampler
+    static int radius() { return 1; } // kernel radius in voxels, excluding the center voxel
+    static bool mipmap() { return true; } // whether strong downscaling uses successive halvings
+    static bool consistent() { return true; } // whether sampling at a grid point returns its exact value
+    static bool staggered() { return false; } // true only for staggered (MAC) grid samplers
+    static size_t order() { return 1; } // 1: tri-linear
+
+    /// @brief Trilinearly reconstruct @a inTree at @a inCoord
+    /// and store the result in @a result.
+    /// @return @c true if any one of the sampled values is active.
+    template<class TreeT>
+    static bool sample(const TreeT& inTree, const Vec3R& inCoord,
+                       typename TreeT::ValueType& result);
+
+    /// @brief Trilinearly reconstruct @a inTree at @a inCoord.
+    /// @return the reconstructed value
+    template<class TreeT>
+    static typename TreeT::ValueType sample(const TreeT& inTree, const Vec3R& inCoord);
+
+    /// @brief Import all eight values from @a inTree to support
+    /// tri-linear interpolation.
+    template<class ValueT, class TreeT, size_t N>
+    static inline void getValues(ValueT (&data)[N][N][N], const TreeT& inTree, Coord ijk);
+
+    /// @brief Import all eight values from @a inTree to support
+    /// tri-linear interpolation.
+    /// @return @c true if any of the eight values are active
+    template<class ValueT, class TreeT, size_t N>
+    static inline bool probeValues(ValueT (&data)[N][N][N], const TreeT& inTree, Coord ijk);
+
+    /// @brief Find the minimum and maximum values of the eight cell
+    /// values in @a data.
+    template<class ValueT, size_t N>
+    static inline void extrema(ValueT (&data)[N][N][N], ValueT& vMin, ValueT& vMax);
+
+    /// @return the tri-linear interpolation with the unit cell coordinates @a uvw
+    template<class ValueT, size_t N>
+    static inline ValueT trilinearInterpolation(ValueT (&data)[N][N][N], const Vec3R& uvw);
+};
+
+
+struct QuadraticSampler
+{
+    static const char* name() { return "quadratic"; } // short identifier for this sampler
+    static int radius() { return 1; } // kernel radius in voxels, excluding the center voxel
+    static bool mipmap() { return true; } // whether strong downscaling uses successive halvings
+    static bool consistent() { return false; } // whether sampling at a grid point returns its exact value
+    static bool staggered() { return false; } // true only for staggered (MAC) grid samplers
+    static size_t order() { return 2; } // 2: tri-quadratic
+
+    /// @brief Triquadratically reconstruct @a inTree at @a inCoord
+    /// and store the result in @a result.
+    /// @return @c true if any one of the sampled values is active.
+    template<class TreeT>
+    static bool sample(const TreeT& inTree, const Vec3R& inCoord,
+                       typename TreeT::ValueType& result);
+
+    /// @brief Triquadratically reconstruct @a inTree at @a inCoord.
+    /// @return the reconstructed value
+    template<class TreeT>
+    static typename TreeT::ValueType sample(const TreeT& inTree, const Vec3R& inCoord);
+
+    template<class ValueT, size_t N>
+    static inline ValueT triquadraticInterpolation(ValueT (&data)[N][N][N], const Vec3R& uvw);
+};
+
+
+//////////////////////////////////////// Staggered Samplers
+
+
+// The following samplers operate in voxel space and are designed for Vec3
+// staggered grid data (e.g., fluid simulations using the Marker-and-Cell approach
+// associate elements of the velocity vector with different physical locations:
+// the faces of a cube).
+
+struct StaggeredPointSampler
+{
+    static const char* name() { return "point"; } // same identifier string as PointSampler
+    static int radius() { return 0; } // kernel radius in voxels, excluding the center voxel
+    static bool mipmap() { return false; } // whether strong downscaling uses successive halvings
+    static bool consistent() { return false; } // whether sampling at a grid point returns its exact value
+    static bool staggered() { return true; } // this sampler is meant for staggered (MAC) grids
+    static size_t order() { return 0; } // 0: closest point
+
+    /// @brief Sample @a inTree at the nearest neighbor to @a inCoord
+    /// and store the result in @a result.
+    /// @return true if the sampled value is active.
+    template<class TreeT>
+    static bool sample(const TreeT& inTree, const Vec3R& inCoord,
+                       typename TreeT::ValueType& result);
+
+    /// @brief Sample @a inTree at the nearest neighbor to @a inCoord
+    /// @return the reconstructed value
+    template<class TreeT>
+    static typename TreeT::ValueType sample(const TreeT& inTree, const Vec3R& inCoord);
+};
+
+
+struct StaggeredBoxSampler
+{
+    static const char* name() { return "box"; } // same identifier string as BoxSampler
+    static int radius() { return 1; } // kernel radius in voxels, excluding the center voxel
+    static bool mipmap() { return true; } // whether strong downscaling uses successive halvings
+    static bool consistent() { return false; } // whether sampling at a grid point returns its exact value
+    static bool staggered() { return true; } // this sampler is meant for staggered (MAC) grids
+    static size_t order() { return 1; } // 1: tri-linear
+
+    /// @brief Trilinearly reconstruct @a inTree at @a inCoord
+    /// and store the result in @a result.
+    /// @return true if any one of the sampled values is active.
+    template<class TreeT>
+    static bool sample(const TreeT& inTree, const Vec3R& inCoord,
+                       typename TreeT::ValueType& result);
+
+    /// @brief Trilinearly reconstruct @a inTree at @a inCoord.
+    /// @return the reconstructed value
+    template<class TreeT>
+    static typename TreeT::ValueType sample(const TreeT& inTree, const Vec3R& inCoord);
+};
+
+
+struct StaggeredQuadraticSampler
+{
+    static const char* name() { return "quadratic"; } // same identifier string as QuadraticSampler
+    static int radius() { return 1; } // kernel radius in voxels, excluding the center voxel
+    static bool mipmap() { return true; } // whether strong downscaling uses successive halvings
+    static bool consistent() { return false; } // whether sampling at a grid point returns its exact value
+    static bool staggered() { return true; } // this sampler is meant for staggered (MAC) grids
+    static size_t order() { return 2; } // 2: tri-quadratic
+
+    /// @brief Triquadratically reconstruct @a inTree at @a inCoord
+    /// and store the result in @a result.
+    /// @return true if any one of the sampled values is active.
+    template<class TreeT>
+    static bool sample(const TreeT& inTree, const Vec3R& inCoord,
+                       typename TreeT::ValueType& result);
+
+    /// @brief Triquadratically reconstruct @a inTree at @a inCoord.
+    /// @return the reconstructed value
+    template<class TreeT>
+    static typename TreeT::ValueType sample(const TreeT& inTree, const Vec3R& inCoord);
+};
+
+
+//////////////////////////////////////// GridSampler
+
+
+/// @brief Class that provides the interface for continuous sampling
+/// of values in a tree.
+///
+/// @details Since trees support only discrete voxel sampling, GridSampler
+/// must be used to sample arbitrary continuous points in (world or
+/// index) space.
+///
+/// @warning This implementation of the GridSampler stores a pointer
+/// to a Tree for value access. While this is thread-safe it is
+/// uncached and hence slow compared to using a
+/// ValueAccessor. Consequently it is normally advisable to use the
+/// template specialization below that employs a
+/// ValueAccessor. However, care must be taken when dealing with
+/// multi-threading (see warning below).
+template<typename GridOrTreeType, typename SamplerType>
+class GridSampler
+{
+public:
+    typedef boost::shared_ptr<GridSampler>                      Ptr;
+    typedef typename GridOrTreeType::ValueType                  ValueType;
+    typedef typename TreeAdapter<GridOrTreeType>::GridType      GridType;
+    typedef typename TreeAdapter<GridOrTreeType>::TreeType      TreeType;
+    typedef typename TreeAdapter<GridOrTreeType>::AccessorType  AccessorType;
+
+    /// @param grid  a grid to be sampled (only pointers to its tree and transform are stored)
+    explicit GridSampler(const GridType& grid)
+        : mTree(&(grid.tree())), mTransform(&(grid.transform())) {}
+
+    /// @param tree  a tree to be sampled, or a ValueAccessor for the tree
+    /// @param transform is used when sampling world space locations.
+    GridSampler(const TreeType& tree, const math::Transform& transform)
+        : mTree(&tree), mTransform(&transform) {}
+
+    const math::Transform& transform() const { return *mTransform; }
+
+    /// @brief Sample a point in index space in the grid.
+    /// @param x Fractional x-coordinate of point in index-coordinates of grid
+    /// @param y Fractional y-coordinate of point in index-coordinates of grid
+    /// @param z Fractional z-coordinate of point in index-coordinates of grid
+    template<typename RealType>
+    ValueType sampleVoxel(const RealType& x, const RealType& y, const RealType& z) const
+    {
+        return this->isSample(Vec3d(x,y,z));
+    }
+
+    /// @brief Sample value in integer index space
+    /// @param i Integer x-coordinate in index space
+    /// @param j Integer y-coordinate in index space
+    /// @param k Integer z-coordinate in index space
+    ValueType sampleVoxel(typename Coord::ValueType i,
+                          typename Coord::ValueType j,
+                          typename Coord::ValueType k) const
+    {
+        return this->isSample(Coord(i,j,k));
+    }
+
+    /// @brief Sample value in integer index space
+    /// @param ijk the location in index space
+    ValueType isSample(const Coord& ijk) const { return mTree->getValue(ijk); }
+
+    /// @brief Sample in fractional index space
+    /// @param ispoint the location in index space
+    ValueType isSample(const Vec3d& ispoint) const
+    {
+        ValueType result = zeroVal<ValueType>();
+        SamplerType::sample(*mTree, ispoint, result); // the returned active-state flag is ignored
+        return result;
+    }
+
+    /// @brief Sample in world space
+    /// @param wspoint the location in world space
+    ValueType wsSample(const Vec3d& wspoint) const
+    {
+        ValueType result = zeroVal<ValueType>();
+        SamplerType::sample(*mTree, mTransform->worldToIndex(wspoint), result);
+        return result;
+    }
+
+private:
+    const TreeType*        mTree;
+    const math::Transform* mTransform;
+}; // class GridSampler
+
+
+/// @brief Specialization of GridSampler for construction from a ValueAccessor type
+///
+/// @note This version should normally be favored over the one above
+/// that takes a Grid or Tree. The reason is this version uses a
+/// ValueAccessor that performs fast (cached) access where the
+/// tree-based flavor performs slower (uncached) access.
+///
+/// @warning Since this version stores a pointer to an (externally
+/// allocated) value accessor it is not threadsafe. Hence each thread
+/// should have its own instance of a GridSampler constructed from a
+/// local ValueAccessor. Alternatively the Grid/Tree-based GridSampler
+/// is threadsafe, but also slower.
+template<typename TreeT, typename SamplerType>
+class GridSampler<tree::ValueAccessor<TreeT>, SamplerType>
+{
+public:
+    typedef boost::shared_ptr<GridSampler>      Ptr;
+    typedef typename TreeT::ValueType           ValueType;
+    typedef TreeT                               TreeType;
+    typedef Grid<TreeType>                      GridType;
+    typedef typename tree::ValueAccessor<TreeT> AccessorType;
+
+    /// @param acc  a ValueAccessor to be sampled
+    /// @param transform is used when sampling world space locations.
+    GridSampler(const AccessorType& acc,
+                const math::Transform& transform)
+        : mAccessor(&acc), mTransform(&transform) {}
+
+     const math::Transform& transform() const { return *mTransform; }
+
+    /// @brief Sample a point in index space in the grid.
+    /// @param x Fractional x-coordinate of point in index-coordinates of grid
+    /// @param y Fractional y-coordinate of point in index-coordinates of grid
+    /// @param z Fractional z-coordinate of point in index-coordinates of grid
+    template<typename RealType>
+    ValueType sampleVoxel(const RealType& x, const RealType& y, const RealType& z) const
+    {
+        return this->isSample(Vec3d(x,y,z));
+    }
+
+    /// @brief Sample value in integer index space
+    /// @param i Integer x-coordinate in index space
+    /// @param j Integer y-coordinate in index space
+    /// @param k Integer z-coordinate in index space
+    ValueType sampleVoxel(typename Coord::ValueType i,
+                          typename Coord::ValueType j,
+                          typename Coord::ValueType k) const
+    {
+        return this->isSample(Coord(i,j,k));
+    }
+
+    /// @brief Sample value in integer index space
+    /// @param ijk the location in index space
+    ValueType isSample(const Coord& ijk) const { return mAccessor->getValue(ijk); }
+
+    /// @brief Sample in fractional index space
+    /// @param ispoint the location in index space
+    ValueType isSample(const Vec3d& ispoint) const
+    {
+        ValueType result = zeroVal<ValueType>();
+        SamplerType::sample(*mAccessor, ispoint, result); // the returned active-state flag is ignored
+        return result;
+    }
+
+    /// @brief Sample in world space
+    /// @param wspoint the location in world space
+    ValueType wsSample(const Vec3d& wspoint) const
+    {
+        ValueType result = zeroVal<ValueType>();
+        SamplerType::sample(*mAccessor, mTransform->worldToIndex(wspoint), result);
+        return result;
+    }
+
+private:
+    const AccessorType*    mAccessor;//not thread-safe!
+    const math::Transform* mTransform;
+};//Specialization of GridSampler
+
+
+//////////////////////////////////////// DualGridSampler
+
+
+/// @brief This is a simple convenience class that allows for sampling
+/// from a source grid into the index space of a target grid. At
+/// construction the source and target grids are checked for alignment
+/// which potentially renders interpolation unnecessary. Else
+/// interpolation is performed according to the templated Sampler
+/// type.
+///
+/// @warning For performance reasons the check for alignment of the
+/// two grids is only performed at construction time!
+template<typename GridOrTreeT,
+         typename SamplerT>
+class DualGridSampler
+{
+public:
+    typedef typename GridOrTreeT::ValueType               ValueType;
+    typedef typename TreeAdapter<GridOrTreeT>::GridType   GridType;
+    typedef typename TreeAdapter<GridOrTreeT>::TreeType   TreeType;
+    typedef typename TreeAdapter<GridType>::AccessorType  AccessorType;
+
+    /// @brief Grid and transform constructor.
+    /// @param sourceGrid Source grid.
+    /// @param targetXform Transform of the target grid.
+    DualGridSampler(const GridType& sourceGrid,
+                    const math::Transform& targetXform)
+        : mSourceTree(&(sourceGrid.tree()))
+        , mSourceXform(&(sourceGrid.transform()))
+        , mTargetXform(&targetXform)
+        , mAligned(targetXform == *mSourceXform)
+    {
+    }
+    /// @brief Tree and transform constructor.
+    /// @param sourceTree Source tree.
+    /// @param sourceXform Transform of the source grid.
+    /// @param targetXform Transform of the target grid.
+    DualGridSampler(const TreeType& sourceTree,
+                    const math::Transform& sourceXform,
+                    const math::Transform& targetXform)
+        : mSourceTree(&sourceTree)
+        , mSourceXform(&sourceXform)
+        , mTargetXform(&targetXform)
+        , mAligned(targetXform == sourceXform)
+    {
+    }
+    /// @brief Return the value of the source grid at the index
+    /// coordinates, ijk, relative to the target grid (or its transform).
+    inline ValueType operator()(const Coord& ijk) const
+    {
+        if (mAligned) return mSourceTree->getValue(ijk); // equal transforms: read the voxel directly
+        const Vec3R world = mTargetXform->indexToWorld(ijk);
+        return SamplerT::sample(*mSourceTree, mSourceXform->worldToIndex(world));
+    }
+    /// @brief Return true if the two grids are aligned.
+    inline bool isAligned() const { return mAligned; }
+private:
+    const TreeType*        mSourceTree;
+    const math::Transform* mSourceXform;
+    const math::Transform* mTargetXform;
+    const bool             mAligned; // set once in the constructor from the transform comparison
+};// DualGridSampler
+
+/// @brief Specialization of DualGridSampler for construction from a ValueAccessor type.
+template<typename TreeT,
+         typename SamplerT>
+class DualGridSampler<tree::ValueAccessor<TreeT>, SamplerT>
+{
+    public:
+    typedef typename TreeT::ValueType ValueType;
+    typedef TreeT                     TreeType;
+    typedef Grid<TreeType>            GridType;
+    typedef typename tree::ValueAccessor<TreeT> AccessorType;
+
+    /// @brief ValueAccessor and transform constructor.
+    /// @param sourceAccessor ValueAccessor into the source grid (only its address is stored).
+    /// @param sourceXform Transform for the source grid.
+    /// @param targetXform Transform for the target grid.
+    DualGridSampler(const AccessorType& sourceAccessor,
+                    const math::Transform& sourceXform,
+                    const math::Transform& targetXform)
+        : mSourceAcc(&sourceAccessor)
+        , mSourceXform(&sourceXform)
+        , mTargetXform(&targetXform)
+        , mAligned(targetXform == sourceXform)
+    {
+    }
+    /// @brief Return the value of the source grid at the index
+    /// coordinates, ijk, relative to the target grid.
+    inline ValueType operator()(const Coord& ijk) const
+    {
+        if (mAligned) return mSourceAcc->getValue(ijk); // equal transforms: read the voxel directly
+        const Vec3R world = mTargetXform->indexToWorld(ijk);
+        return SamplerT::sample(*mSourceAcc, mSourceXform->worldToIndex(world));
+    }
+    /// @brief Return true if the two grids are aligned.
+    inline bool isAligned() const { return mAligned; }
+private:
+    const AccessorType*    mSourceAcc;
+    const math::Transform* mSourceXform;
+    const math::Transform* mTargetXform;
+    const bool             mAligned; // set once in the constructor from the transform comparison
+};//Specialization of DualGridSampler
+
+//////////////////////////////////////// AlphaMask
+
+
+/// @brief Functor that derives a normalized alpha value and its complement from a mask grid.
+template <typename GridT, typename MaskT,
+          typename SamplerT = tools::BoxSampler, typename FloatT = float>
+class AlphaMask
+{
+public:
+    BOOST_STATIC_ASSERT(boost::is_floating_point<FloatT>::value);
+    typedef GridT    GridType;
+    typedef MaskT    MaskType;
+    typedef SamplerT SamplerType;
+    typedef SamplerT SamlerType; // misspelled legacy name, kept for backward compatibility
+    typedef FloatT   FloatType;
+
+    AlphaMask(const GridT& grid, const MaskT& mask, FloatT min, FloatT max, bool invert)
+        : mAcc(mask.tree())
+        , mSampler(mAcc, mask.transform() , grid.transform())
+        , mMin(min)
+        , mInvNorm(1/(max-min))
+        , mInvert(invert)
+    {
+        assert(min < max); // mInvNorm above divides by (max - min)
+    }
+
+    /// Set @a a to the smoothed, normalized mask value at @a xyz and @a b to 1 - a (swapped if inverted).
+    inline bool operator()(const Coord& xyz, FloatT& a, FloatT& b) const
+    {
+        a = math::SmoothUnitStep( (mSampler(xyz) - mMin) * mInvNorm );//smooth mapping to 0->1
+        b = 1 - a;
+        if (mInvert) std::swap(a,b);
+        return a>0;
+    }
+
+protected:
+    typedef typename MaskType::ConstAccessor AccT;
+    AccT mAcc;
+    tools::DualGridSampler<AccT, SamplerT> mSampler; // stores &mAcc, so a copied sampler still points here
+    const FloatT mMin, mInvNorm;
+    const bool mInvert;
+};// AlphaMask
+
+////////////////////////////////////////
+
+namespace local_util {
+
+/// Component-wise floor of @a v, converted to an integer vector.
+inline Vec3i
+floorVec3(const Vec3R& v)
+{
+    const int x = int(std::floor(v(0)));
+    const int y = int(std::floor(v(1)));
+    const int z = int(std::floor(v(2)));
+    return Vec3i(x, y, z);
+}
+
+
+/// Component-wise ceiling of @a v, converted to an integer vector.
+inline Vec3i
+ceilVec3(const Vec3R& v)
+{
+    const int x = int(std::ceil(v(0)));
+    const int y = int(std::ceil(v(1)));
+    const int z = int(std::ceil(v(2)));
+    return Vec3i(x, y, z);
+}
+
+
+/// Component-wise rounding of @a v (via the C library round()),
+/// converted to an integer vector.
+inline Vec3i
+roundVec3(const Vec3R& v)
+{
+    const int x = int(::round(v(0)));
+    const int y = int(::round(v(1)));
+    const int z = int(::round(v(2)));
+    return Vec3i(x, y, z);
+}
+
+} // namespace local_util
+
+
+//////////////////////////////////////// PointSampler
+
+
+/// Nearest-voxel lookup: round @a inCoord to integer coordinates, write the
+/// value found there into @a result and return whatever probeValue() reports.
+template<class TreeT>
+inline bool
+PointSampler::sample(const TreeT& inTree, const Vec3R& inCoord,
+                     typename TreeT::ValueType& result)
+{
+    const Coord nearest(local_util::roundVec3(inCoord));
+    return inTree.probeValue(nearest, result);
+}
+
+/// Nearest-voxel lookup: return the tree value at @a inCoord rounded to
+/// integer coordinates.
+template<class TreeT>
+inline typename TreeT::ValueType
+PointSampler::sample(const TreeT& inTree, const Vec3R& inCoord)
+{
+    const Coord nearest(local_util::roundVec3(inCoord));
+    return inTree.getValue(nearest);
+}
+
+
+//////////////////////////////////////// BoxSampler
+
+/// Fill data[0..1][0..1][0..1] with the tree values at ijk + (di, dj, dk).
+/// The local copy of @a ijk is walked from corner to corner instead of
+/// building a fresh coordinate for every lookup.
+template<class ValueT, class TreeT, size_t N>
+inline void
+BoxSampler::getValues(ValueT (&data)[N][N][N], const TreeT& inTree, Coord ijk)
+{
+    // Face at i: (j, k) -> (j, k+1) -> (j+1, k+1) -> (j+1, k)
+    data[0][0][0] = inTree.getValue(ijk);
+    ++ijk[2];
+    data[0][0][1] = inTree.getValue(ijk);
+    ++ijk[1];
+    data[0][1][1] = inTree.getValue(ijk);
+    --ijk[2];
+    data[0][1][0] = inTree.getValue(ijk);
+
+    // Step to the face at i+1 and return to row j, i.e. (i+1, j, k).
+    ++ijk[0];
+    --ijk[1];
+
+    // Face at i+1, visited in the same (j, k) order as above.
+    data[1][0][0] = inTree.getValue(ijk);
+    ++ijk[2];
+    data[1][0][1] = inTree.getValue(ijk);
+    ++ijk[1];
+    data[1][1][1] = inTree.getValue(ijk);
+    --ijk[2];
+    data[1][1][0] = inTree.getValue(ijk);
+}
+
+/// Probe the eight voxels at ijk + (di, dj, dk), storing each value in
+/// data[di][dj][dk].
+/// @return true if probeValue() returned true for at least one of them.
+template<class ValueT, class TreeT, size_t N>
+inline bool
+BoxSampler::probeValues(ValueT (&data)[N][N][N], const TreeT& inTree, Coord ijk)
+{
+    bool anyActive = false;
+
+    // Face at i: (j, k) -> (j, k+1) -> (j+1, k+1) -> (j+1, k)
+    if (inTree.probeValue(ijk, data[0][0][0])) anyActive = true;
+    ++ijk[2];
+    if (inTree.probeValue(ijk, data[0][0][1])) anyActive = true;
+    ++ijk[1];
+    if (inTree.probeValue(ijk, data[0][1][1])) anyActive = true;
+    --ijk[2];
+    if (inTree.probeValue(ijk, data[0][1][0])) anyActive = true;
+
+    // Step to the face at i+1 and return to row j, i.e. (i+1, j, k).
+    ++ijk[0];
+    --ijk[1];
+
+    // Face at i+1, visited in the same (j, k) order as above.
+    if (inTree.probeValue(ijk, data[1][0][0])) anyActive = true;
+    ++ijk[2];
+    if (inTree.probeValue(ijk, data[1][0][1])) anyActive = true;
+    ++ijk[1];
+    if (inTree.probeValue(ijk, data[1][1][1])) anyActive = true;
+    --ijk[2];
+    if (inTree.probeValue(ijk, data[1][1][0])) anyActive = true;
+
+    return anyActive;
+}
+
+/// Compute the minimum and maximum over the 2x2x2 corner block
+/// data[0..1][0..1][0..1] and store them in @a vMin and @a vMax.
+template<class ValueT, size_t N>
+inline void
+BoxSampler::extrema(ValueT (&data)[N][N][N], ValueT& vMin, ValueT &vMax)
+{
+    vMin = vMax = data[0][0][0];
+    // The three low bits of 'corner' encode the (i, j, k) offsets, so the
+    // loop visits 001, 010, 011, 100, 101, 110, 111 in that order.
+    for (int corner = 1; corner < 8; ++corner) {
+        const ValueT& value = data[(corner >> 2) & 1][(corner >> 1) & 1][corner & 1];
+        vMin = math::Min(vMin, value);
+        vMax = math::Max(vMax, value);
+    }
+}
+
+
+/// Trilinearly interpolate the eight corner values data[0..1][0..1][0..1]
+/// at the fractional offsets @a uvw (u along the first index, v along the
+/// second, w along the third).
+template<class ValueT, size_t N>
+inline ValueT
+BoxSampler::trilinearInterpolation(ValueT (&data)[N][N][N], const Vec3R& uvw)
+{
+    // The eight surrounding lattice values are combined as
+    // result(x,y,z) =
+    //     v000 (1-x)(1-y)(1-z) + v001 (1-x)(1-y)z + v010 (1-x)y(1-z) + v011 (1-x)yz
+    //   + v100 x(1-y)(1-z)     + v101 x(1-y)z     + v110 xy(1-z)     + v111 xyz
+    // evaluated as successive linear interpolations in z, then y, then x.
+
+    const ValueT& v000 = data[0][0][0];
+    const ValueT& v001 = data[0][0][1];
+    const ValueT& v010 = data[0][1][0];
+    const ValueT& v011 = data[0][1][1];
+    const ValueT& v100 = data[1][0][0];
+    const ValueT& v101 = data[1][0][1];
+    const ValueT& v110 = data[1][1][0];
+    const ValueT& v111 = data[1][1][1];
+
+    // Face x = 0
+    const ValueT lo0 = v000 + ValueT((v001 - v000) * uvw[2]);
+    const ValueT hi0 = v010 + ValueT((v011 - v010) * uvw[2]);
+    const ValueT face0 = lo0 + ValueT((hi0-lo0) * uvw[1]);
+
+    // Face x = 1
+    const ValueT lo1 = v100 + ValueT((v101 - v100) * uvw[2]);
+    const ValueT hi1 = v110 + ValueT((v111 - v110) * uvw[2]);
+    const ValueT face1 = lo1 + ValueT((hi1 - lo1) * uvw[1]);
+
+    return face0 + ValueT(uvw[0] * (face1 - face0));
+}
+
+
+/// Trilinear sample of @a inTree at the fractional index coordinate
+/// @a inCoord; the interpolated value is written to @a result.
+/// @return the flag produced by probeValues() for the eight voxels used.
+template<class TreeT>
+inline bool
+BoxSampler::sample(const TreeT& inTree, const Vec3R& inCoord,
+                   typename TreeT::ValueType& result)
+{
+    // Split the coordinate into its integer base and fractional remainder.
+    const Vec3i base = local_util::floorVec3(inCoord);
+    const Vec3R frac = inCoord - base;
+
+    // Values of the eight voxels that enclose the sample position.
+    typename TreeT::ValueType corners[2][2][2];
+    const bool anyActive = BoxSampler::probeValues(corners, inTree, Coord(base));
+
+    result = BoxSampler::trilinearInterpolation(corners, frac);
+    return anyActive;
+}
+
+
+/// Trilinear sample of @a inTree at the fractional index coordinate
+/// @a inCoord; returns the interpolated value.
+template<class TreeT>
+inline typename TreeT::ValueType
+BoxSampler::sample(const TreeT& inTree, const Vec3R& inCoord)
+{
+    // Split the coordinate into its integer base and fractional remainder.
+    const Vec3i base = local_util::floorVec3(inCoord);
+    const Vec3R frac = inCoord - base;
+
+    // Values of the eight voxels that enclose the sample position.
+    typename TreeT::ValueType corners[2][2][2];
+    BoxSampler::getValues(corners, inTree, Coord(base));
+
+    return BoxSampler::trilinearInterpolation(corners, frac);
+}
+
+
+//////////////////////////////////////// QuadraticSampler
+
+/// Triquadratic interpolation over the 3x3x3 block data[0..2][0..2][0..2].
+/// A parabola is fitted through three samples along z, the results are
+/// interpolated the same way along y and finally along x, each time
+/// evaluating the parabola at the matching component of @a uvw.
+/// @param data  sample values; entries with any index above 2 are not read
+/// @param uvw   position at which the fitted parabolas are evaluated
+/// @return the interpolated value, cast back to ValueT
+template<class ValueT, size_t N>
+inline ValueT
+QuadraticSampler::triquadraticInterpolation(ValueT (&data)[N][N][N], const Vec3R& uvw)
+{
+    /// @todo For vector types, interpolate over each component independently.
+    ValueT vx[3];
+    for (int dx = 0; dx < 3; ++dx) {
+        ValueT vy[3];
+        for (int dy = 0; dy < 3; ++dy) {
+            // Fit a parabola to three contiguous samples in z
+            // (at z=-1, z=0 and z=1), then evaluate the parabola at z',
+            // where z' is the fractional part of inCoord.z, i.e.,
+            // inCoord.z - inIdx.z.  The coefficients come from solving
+            //
+            // | (-1)^2  -1   1 || a |   | v0 |
+            // |    0     0   1 || b | = | v1 |
+            // |   1^2    1   1 || c |   | v2 |
+            //
+            // for a, b and c.
+            const ValueT* vz = &data[dx][dy][0];
+            const ValueT
+                az = static_cast<ValueT>(0.5 * (vz[0] + vz[2]) - vz[1]),
+                bz = static_cast<ValueT>(0.5 * (vz[2] - vz[0])),
+                cz = static_cast<ValueT>(vz[1]);
+            // Horner form of az*z'^2 + bz*z' + cz
+            vy[dy] = static_cast<ValueT>(uvw.z() * (uvw.z() * az + bz) + cz);
+        }//loop over y
+        // Fit a parabola to three interpolated samples in y, then
+        // evaluate the parabola at y', where y' is the fractional
+        // part of inCoord.y.
+        const ValueT
+            ay = static_cast<ValueT>(0.5 * (vy[0] + vy[2]) - vy[1]),
+            by = static_cast<ValueT>(0.5 * (vy[2] - vy[0])),
+            cy = static_cast<ValueT>(vy[1]);
+        vx[dx] = static_cast<ValueT>(uvw.y() * (uvw.y() * ay + by) + cy);
+    }//loop over x
+    // Fit a parabola to three interpolated samples in x, then
+    // evaluate the parabola at the fractional part of inCoord.x.
+    const ValueT
+        ax = static_cast<ValueT>(0.5 * (vx[0] + vx[2]) - vx[1]),
+        bx = static_cast<ValueT>(0.5 * (vx[2] - vx[0])),
+        cx = static_cast<ValueT>(vx[1]);
+    return static_cast<ValueT>(uvw.x() * (uvw.x() * ax + bx) + cx);
+}
+
+/// Triquadratic sample of @a inTree at the fractional index coordinate
+/// @a inCoord; the interpolated value is written to @a result.
+/// @return true if probeValue() returned true for any of the 27 voxels read.
+template<class TreeT>
+inline bool
+QuadraticSampler::sample(const TreeT& inTree, const Vec3R& inCoord,
+    typename TreeT::ValueType& result)
+{
+    typedef typename TreeT::ValueType ValueT;
+
+    const Vec3i center = local_util::floorVec3(inCoord);
+    const Vec3i lowest = center - Vec3i(1, 1, 1);
+    const Vec3R frac = inCoord - center;
+
+    // Gather the 3x3x3 stencil whose lowest corner is center - (1, 1, 1).
+    ValueT stencil[3][3][3];
+    bool anyActive = false;
+    for (int dx = 0; dx < 3; ++dx) {
+        const int ix = lowest.x() + dx;
+        for (int dy = 0; dy < 3; ++dy) {
+            const int iy = lowest.y() + dy;
+            for (int dz = 0; dz < 3; ++dz) {
+                const int iz = lowest.z() + dz;
+                if (inTree.probeValue(Coord(ix, iy, iz), stencil[dx][dy][dz])) {
+                    anyActive = true;
+                }
+            }
+        }
+    }
+
+    result = QuadraticSampler::triquadraticInterpolation(stencil, frac);
+    return anyActive;
+}
+
+/// Triquadratic sample of @a inTree at the fractional index coordinate
+/// @a inCoord; returns the interpolated value.
+template<class TreeT>
+inline typename TreeT::ValueType
+QuadraticSampler::sample(const TreeT& inTree, const Vec3R& inCoord)
+{
+    typedef typename TreeT::ValueType ValueT;
+
+    const Vec3i center = local_util::floorVec3(inCoord);
+    const Vec3i lowest = center - Vec3i(1, 1, 1);
+    const Vec3R frac = inCoord - center;
+
+    // Gather the 3x3x3 stencil whose lowest corner is center - (1, 1, 1).
+    ValueT stencil[3][3][3];
+    for (int dx = 0; dx < 3; ++dx) {
+        const int ix = lowest.x() + dx;
+        for (int dy = 0; dy < 3; ++dy) {
+            const int iy = lowest.y() + dy;
+            for (int dz = 0; dz < 3; ++dz) {
+                const int iz = lowest.z() + dz;
+                stencil[dx][dy][dz] = inTree.getValue(Coord(ix, iy, iz));
+            }
+        }
+    }
+
+    return QuadraticSampler::triquadraticInterpolation(stencil, frac);
+}
+
+
+//////////////////////////////////////// StaggeredPointSampler
+
+
+/// Staggered nearest-voxel sample: each component of @a result is taken
+/// from a PointSampler lookup shifted by half a voxel along that
+/// component's own axis.
+/// @return true if any of the three lookups returned true.
+template<class TreeT>
+inline bool
+StaggeredPointSampler::sample(const TreeT& inTree, const Vec3R& inCoord,
+                              typename TreeT::ValueType& result)
+{
+    typedef typename TreeT::ValueType ValueType;
+
+    ValueType atX, atY, atZ;
+
+    const bool activeX =
+        PointSampler::sample<TreeT>(inTree, inCoord + Vec3R(0.5, 0.0, 0.0), atX);
+    const bool activeY =
+        PointSampler::sample<TreeT>(inTree, inCoord + Vec3R(0.0, 0.5, 0.0), atY);
+    const bool activeZ =
+        PointSampler::sample<TreeT>(inTree, inCoord + Vec3R(0.0, 0.0, 0.5), atZ);
+
+    // Keep only the component that belongs to each shifted lookup.
+    result.x() = atX.x();
+    result.y() = atY.y();
+    result.z() = atZ.z();
+
+    return activeX || activeY || activeZ;
+}
+
+/// Staggered nearest-voxel sample: each component of the returned vector is
+/// taken from a PointSampler lookup shifted by half a voxel along that
+/// component's own axis.
+template<class TreeT>
+inline typename TreeT::ValueType
+StaggeredPointSampler::sample(const TreeT& inTree, const Vec3R& inCoord)
+{
+    typedef typename TreeT::ValueType ValueT;
+
+    const Vec3R shiftedX = inCoord + Vec3R(0.5, 0.0, 0.0);
+    const ValueT atX = PointSampler::sample<TreeT>(inTree, shiftedX);
+
+    const Vec3R shiftedY = inCoord + Vec3R(0.0, 0.5, 0.0);
+    const ValueT atY = PointSampler::sample<TreeT>(inTree, shiftedY);
+
+    const Vec3R shiftedZ = inCoord + Vec3R(0.0, 0.0, 0.5);
+    const ValueT atZ = PointSampler::sample<TreeT>(inTree, shiftedZ);
+
+    return ValueT(atX.x(), atY.y(), atZ.z());
+}
+
+
+//////////////////////////////////////// StaggeredBoxSampler
+
+
+/// Staggered trilinear sample: each component of @a result is taken from a
+/// BoxSampler lookup shifted by half a voxel along that component's own axis.
+/// @return true if any of the three lookups returned true.
+template<class TreeT>
+inline bool
+StaggeredBoxSampler::sample(const TreeT& inTree, const Vec3R& inCoord,
+                            typename TreeT::ValueType& result)
+{
+    typedef typename TreeT::ValueType ValueType;
+
+    // All three temporaries start out as the zero value of the grid type.
+    ValueType atX, atY, atZ;
+    atZ = zeroVal<ValueType>();
+    atY = atZ;
+    atX = atY;
+
+    const bool activeX =
+        BoxSampler::sample<TreeT>(inTree, inCoord + Vec3R(0.5, 0.0, 0.0), atX);
+    const bool activeY =
+        BoxSampler::sample<TreeT>(inTree, inCoord + Vec3R(0.0, 0.5, 0.0), atY);
+    const bool activeZ =
+        BoxSampler::sample<TreeT>(inTree, inCoord + Vec3R(0.0, 0.0, 0.5), atZ);
+
+    // Keep only the component that belongs to each shifted lookup.
+    result.x() = atX.x();
+    result.y() = atY.y();
+    result.z() = atZ.z();
+
+    return activeX || activeY || activeZ;
+}
+
+/// Staggered trilinear sample: each component of the returned vector is
+/// taken from a BoxSampler lookup shifted by half a voxel along that
+/// component's own axis.
+template<class TreeT>
+inline typename TreeT::ValueType
+StaggeredBoxSampler::sample(const TreeT& inTree, const Vec3R& inCoord)
+{
+    typedef typename TreeT::ValueType ValueT;
+
+    const Vec3R shiftedX = inCoord + Vec3R(0.5, 0.0, 0.0);
+    const ValueT atX = BoxSampler::sample<TreeT>(inTree, shiftedX);
+
+    const Vec3R shiftedY = inCoord + Vec3R(0.0, 0.5, 0.0);
+    const ValueT atY = BoxSampler::sample<TreeT>(inTree, shiftedY);
+
+    const Vec3R shiftedZ = inCoord + Vec3R(0.0, 0.0, 0.5);
+    const ValueT atZ = BoxSampler::sample<TreeT>(inTree, shiftedZ);
+
+    return ValueT(atX.x(), atY.y(), atZ.z());
+}
+
+
+//////////////////////////////////////// StaggeredQuadraticSampler
+
+
+/// Staggered triquadratic sample: each component of @a result is taken from
+/// a QuadraticSampler lookup shifted by half a voxel along that component's
+/// own axis.
+/// @return true if any of the three lookups returned true.
+template<class TreeT>
+inline bool
+StaggeredQuadraticSampler::sample(const TreeT& inTree, const Vec3R& inCoord,
+    typename TreeT::ValueType& result)
+{
+    typedef typename TreeT::ValueType ValueType;
+
+    ValueType atX, atY, atZ;
+
+    const bool activeX =
+        QuadraticSampler::sample<TreeT>(inTree, inCoord + Vec3R(0.5, 0.0, 0.0), atX);
+    const bool activeY =
+        QuadraticSampler::sample<TreeT>(inTree, inCoord + Vec3R(0.0, 0.5, 0.0), atY);
+    const bool activeZ =
+        QuadraticSampler::sample<TreeT>(inTree, inCoord + Vec3R(0.0, 0.0, 0.5), atZ);
+
+    // Keep only the component that belongs to each shifted lookup.
+    result.x() = atX.x();
+    result.y() = atY.y();
+    result.z() = atZ.z();
+
+    return activeX || activeY || activeZ;
+}
+
+/// Staggered triquadratic sample: each component of the returned vector is
+/// taken from a QuadraticSampler lookup shifted by half a voxel along that
+/// component's own axis.
+template<class TreeT>
+inline typename TreeT::ValueType
+StaggeredQuadraticSampler::sample(const TreeT& inTree, const Vec3R& inCoord)
+{
+    typedef typename TreeT::ValueType ValueT;
+
+    const Vec3R shiftedX = inCoord + Vec3R(0.5, 0.0, 0.0);
+    const ValueT atX = QuadraticSampler::sample<TreeT>(inTree, shiftedX);
+
+    const Vec3R shiftedY = inCoord + Vec3R(0.0, 0.5, 0.0);
+    const ValueT atY = QuadraticSampler::sample<TreeT>(inTree, shiftedY);
+
+    const Vec3R shiftedZ = inCoord + Vec3R(0.0, 0.0, 0.5);
+    const ValueT atZ = QuadraticSampler::sample<TreeT>(inTree, shiftedZ);
+
+    return ValueT(atX.x(), atY.y(), atZ.z());
+}
+
+//////////////////////////////////////// Sampler
+
+// Explicit specializations that map Sampler<Order, Staggered> onto the
+// concrete sampler classes: the first argument selects point (0), box (1)
+// or quadratic (2) sampling, the second selects the staggered variant.
+
+/// Order 0, collocated: PointSampler
+template <>
+struct Sampler<0, false> : public PointSampler {};
+
+/// Order 1, collocated: BoxSampler
+template <>
+struct Sampler<1, false> : public BoxSampler {};
+
+/// Order 2, collocated: QuadraticSampler
+template <>
+struct Sampler<2, false> : public QuadraticSampler {};
+
+/// Order 0, staggered: StaggeredPointSampler
+template <>
+struct Sampler<0, true> : public StaggeredPointSampler {};
+
+/// Order 1, staggered: StaggeredBoxSampler
+template <>
+struct Sampler<1, true> : public StaggeredBoxSampler {};
+
+/// Order 2, staggered: StaggeredQuadraticSampler
+template <>
+struct Sampler<2, true> : public StaggeredQuadraticSampler {};
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_INTERPOLATION_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/LevelSetAdvect.h b/nuparu/include/openvdb_new/tools/LevelSetAdvect.h
new file mode 100644
index 00000000..2139315a
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/LevelSetAdvect.h
@@ -0,0 +1,569 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @author Ken Museth
+///
+/// @file LevelSetAdvect.h
+///
+/// @brief Hyperbolic advection of narrow-band level sets
+
+#ifndef OPENVDB_TOOLS_LEVEL_SET_ADVECT_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_LEVEL_SET_ADVECT_HAS_BEEN_INCLUDED
+
+#include <tbb/parallel_for.h>
+#include <tbb/parallel_reduce.h>
+#include <openvdb/Platform.h>
+#include "LevelSetTracker.h"
+#include "VelocityFields.h" // for EnrightField
+#include <openvdb/math/FiniteDifference.h>
+#include <boost/math/constants/constants.hpp>
+#include <openvdb/util/CpuTimer.h>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+/// @brief  Hyperbolic advection of narrow-band level sets in an
+/// external velocity field
+///
+/// The @c FieldType template argument below refers to any functor
+/// with the following interface (see tools/VelocityFields.h
+/// for examples):
+///
+/// @code
+/// class VelocityField {
+///   ...
+/// public:
+///   openvdb::VectorType operator() (const openvdb::Coord& xyz, ValueType time) const;
+///   ...
+/// };
+/// @endcode
+///
+/// @note The functor method returns the velocity field at coordinate
+/// position xyz of the advection grid, and for the specified
+/// time. Note that since the velocity is returned in the local
+/// coordinate space of the grid that is being advected, the functor
+/// typically depends on the transformation of that grid. This design
+/// is chosen for performance reasons. Finally, we assume that the
+/// functor method is NOT thread-safe (it typically uses a ValueAccessor)
+/// and that the functor is lightweight enough to be copied per thread.
+///
+/// The @c InterruptType template argument below refers to any class
+/// with the following interface:
+/// @code
+/// class Interrupter {
+///   ...
+/// public:
+///   void start(const char* name = NULL)// called when computations begin
+///   void end()                         // called when computations end
+///   bool wasInterrupted(int percent=-1)// return true to break computation
+///};
+/// @endcode
+///
+/// @note If no template argument is provided for this InterruptType
+/// the util::NullInterrupter is used which implies that all
+/// interrupter calls are no-ops (i.e. incurs no computational overhead).
+///
+
+template<typename GridT,
+         typename FieldT     = EnrightField<typename GridT::ValueType>,
+         typename InterruptT = util::NullInterrupter>
+class LevelSetAdvection
+{
+public:
+    typedef GridT                              GridType;
+    typedef LevelSetTracker<GridT, InterruptT> TrackerT;
+    typedef typename TrackerT::LeafRange       LeafRange;
+    typedef typename TrackerT::LeafType        LeafType;
+    typedef typename TrackerT::BufferType      BufferType;
+    typedef typename TrackerT::ValueType       ValueType;
+    typedef typename FieldT::VectorType        VectorType;
+
+    /// @brief Main constructor
+    /// @param grid      level set grid handed to the internal tracker
+    /// @param field     velocity field functor; a copy is stored in this object
+    /// @param interrupt optional interrupter forwarded to the tracker
+    /// @note The spatial scheme defaults to HJWENO5_BIAS and the temporal
+    /// scheme to TVD_RK2.
+    LevelSetAdvection(GridT& grid, const FieldT& field, InterruptT* interrupt = NULL):
+        mTracker(grid, interrupt), mField(field),
+        mSpatialScheme(math::HJWENO5_BIAS),
+        mTemporalScheme(math::TVD_RK2) {}
+
+    virtual ~LevelSetAdvection() {}
+
+    /// @return the spatial finite difference scheme
+    math::BiasedGradientScheme getSpatialScheme() const { return mSpatialScheme; }
+    /// @brief Set the spatial finite difference scheme
+    void setSpatialScheme(math::BiasedGradientScheme scheme) { mSpatialScheme = scheme; }
+
+    /// @return the temporal integration scheme
+    math::TemporalIntegrationScheme getTemporalScheme() const { return mTemporalScheme; }
+    /// @brief Set the temporal integration scheme
+    void setTemporalScheme(math::TemporalIntegrationScheme scheme) { mTemporalScheme = scheme; }
+
+    /// @return the spatial finite difference scheme of the internal tracker
+    math::BiasedGradientScheme getTrackerSpatialScheme() const { return mTracker.getSpatialScheme(); }
+    /// @brief Set the spatial finite difference scheme of the internal tracker
+    void setTrackerSpatialScheme(math::BiasedGradientScheme scheme) { mTracker.setSpatialScheme(scheme); }
+
+    /// @return the temporal integration scheme of the internal tracker
+    math::TemporalIntegrationScheme getTrackerTemporalScheme() const { return mTracker.getTemporalScheme(); }
+    /// @brief Set the temporal integration scheme of the internal tracker
+    void setTrackerTemporalScheme(math::TemporalIntegrationScheme scheme) { mTracker.setTemporalScheme(scheme); }
+
+    /// @return The number of normalizations performed per track or
+    /// normalize call.
+    int  getNormCount() const { return mTracker.getNormCount(); }
+    /// @brief Set the number of normalizations performed per track or
+    /// normalize call.
+    void setNormCount(int n) { mTracker.setNormCount(n); }
+
+    /// @return the grain-size used for multi-threading
+    int  getGrainSize() const { return mTracker.getGrainSize(); }
+    /// @brief Set the grain-size used for multi-threading.
+    /// @note A grain size of 0 or less disables multi-threading!
+    void setGrainSize(int grainsize) { mTracker.setGrainSize(grainsize); }
+
+    /// Advect the level set from its current time, time0, to its
+    /// final time, time1. If time0>time1 backward advection is performed.
+    ///
+    /// @return number of CFL iterations used to advect from time0 to time1
+    size_t advect(ValueType time0, ValueType time1);
+
+private:
+
+    // disallow copy construction and copy by assignment!
+    LevelSetAdvection(const LevelSetAdvection&);// not implemented
+    LevelSetAdvection& operator=(const LevelSetAdvection&);// not implemented
+
+    // This templated private struct implements all the level set magic.
+    // The map type and both schemes are template arguments; they are
+    // selected from the run-time settings by advect() and the advect1/2/3
+    // helpers declared below.
+    template<typename MapT, math::BiasedGradientScheme SpatialScheme,
+             math::TemporalIntegrationScheme TemporalScheme>
+    struct Advect
+    {
+        /// Main constructor
+        Advect(LevelSetAdvection& parent);
+        /// Shallow copy constructor called by tbb::parallel_for() threads
+        Advect(const Advect& other);
+        /// Destructor; only the master instance (not a shallow copy) clears the field
+        virtual ~Advect() { if (mIsMaster) this->clearField(); }
+        /// Advect the level set from its current time, time0, to its final time, time1.
+        /// @return number of CFL iterations
+        size_t advect(ValueType time0, ValueType time1);
+        /// Used internally by tbb::parallel_for()
+        /// @throw ValueError if no task has been bound to mTask
+        void operator()(const LeafRange& r) const
+        {
+            if (mTask) mTask(const_cast<Advect*>(this), r);
+            else OPENVDB_THROW(ValueError, "task is undefined - don\'t call this method directly");
+        }
+        /// method calling tbb
+        void cook(const char* msg, size_t swapBuffer = 0);
+        /// Sample field and return the CFL time step
+        typename GridT::ValueType sampleField(ValueType time0, ValueType time1);
+        /// Sample the velocity field over a leaf range; the @c Aligned flag
+        /// is fixed by the two wrappers below.
+        template <bool Aligned> void sample(const LeafRange& r, ValueType t0, ValueType t1);
+        /// Wrapper for sample<false>()
+        inline void sampleXformed(const LeafRange& r, ValueType t0, ValueType t1)
+        {
+            this->sample<false>(r, t0, t1);
+        }
+        /// Wrapper for sample<true>()
+        inline void sampleAligned(const LeafRange& r, ValueType t0, ValueType t1)
+        {
+            this->sample<true>(r, t0, t1);
+        }
+        void clearField();
+        // Convex combination of Phi and a forward Euler advection steps:
+        // Phi(result) = alpha * Phi(phi) + (1-alpha) * (Phi(0) - dt * Speed(speed)*|Grad[Phi(0)]|);
+        template <int Nominator, int Denominator>
+        void euler(const LeafRange&, ValueType, Index, Index);
+        // Fixed-argument wrappers around euler<Nominator, Denominator>() that
+        // advect() binds to mTask; the two trailing Index arguments are
+        // presumably buffer indices (confirm against the definition of euler()).
+        inline void euler01(const LeafRange& r, ValueType t) {this->euler<0,1>(r, t, 0, 1);}
+        inline void euler12(const LeafRange& r, ValueType t) {this->euler<1,2>(r, t, 1, 1);}
+        inline void euler34(const LeafRange& r, ValueType t) {this->euler<3,4>(r, t, 1, 2);}
+        inline void euler13(const LeafRange& r, ValueType t) {this->euler<1,3>(r, t, 1, 2);}
+
+        LevelSetAdvection& mParent;
+        VectorType*        mVelocity;
+        size_t*            mOffsets;
+        const MapT*        mMap;
+        typename boost::function<void (Advect*, const LeafRange&)> mTask;
+        const bool         mIsMaster;
+    }; // end of private Advect struct
+    
+    // Dispatch on the run-time temporal scheme for a fixed spatial scheme.
+    template<math::BiasedGradientScheme SpatialScheme>
+    size_t advect1(ValueType time0, ValueType time1);
+
+    // Dispatch on the map type of the grid transform for fixed schemes.
+    template<math::BiasedGradientScheme SpatialScheme,
+             math::TemporalIntegrationScheme TemporalScheme>
+    size_t advect2(ValueType time0, ValueType time1);
+
+    // Instantiate the Advect struct for the selected schemes and map type and run it.
+    template<math::BiasedGradientScheme SpatialScheme,
+             math::TemporalIntegrationScheme TemporalScheme,
+             typename MapType>
+    size_t advect3(ValueType time0, ValueType time1);
+
+    TrackerT                        mTracker;
+    //each thread needs a deep copy of the field since it might contain a ValueAccessor
+    const FieldT                    mField;
+    math::BiasedGradientScheme      mSpatialScheme;
+    math::TemporalIntegrationScheme mTemporalScheme;
+
+};//end of LevelSetAdvection
+
+/// Entry point of the advection: forwards to the advect1() instantiation
+/// that matches the spatial scheme currently selected.
+/// @throw ValueError if the spatial scheme is not one of the five handled here.
+template<typename GridT, typename FieldT, typename InterruptT>
+inline size_t
+LevelSetAdvection<GridT, FieldT, InterruptT>::advect(ValueType time0, ValueType time1)
+{
+    if (mSpatialScheme == math::FIRST_BIAS) {
+        return this->advect1<math::FIRST_BIAS>(time0, time1);
+    }
+    if (mSpatialScheme == math::SECOND_BIAS) {
+        return this->advect1<math::SECOND_BIAS>(time0, time1);
+    }
+    if (mSpatialScheme == math::THIRD_BIAS) {
+        return this->advect1<math::THIRD_BIAS>(time0, time1);
+    }
+    if (mSpatialScheme == math::WENO5_BIAS) {
+        return this->advect1<math::WENO5_BIAS>(time0, time1);
+    }
+    if (mSpatialScheme == math::HJWENO5_BIAS) {
+        return this->advect1<math::HJWENO5_BIAS>(time0, time1);
+    }
+    OPENVDB_THROW(ValueError, "Spatial difference scheme not supported!");
+    return 0;
+}
+
+/// Second dispatch stage: the spatial scheme is already a template argument,
+/// so pick the advect2() instantiation for the selected temporal scheme.
+/// @throw ValueError if the temporal scheme is not TVD_RK1, TVD_RK2 or TVD_RK3.
+template<typename GridT, typename FieldT, typename InterruptT>
+template<math::BiasedGradientScheme SpatialScheme>
+inline size_t
+LevelSetAdvection<GridT, FieldT, InterruptT>::advect1(ValueType time0, ValueType time1)
+{
+    if (mTemporalScheme == math::TVD_RK1) {
+        return this->advect2<SpatialScheme, math::TVD_RK1>(time0, time1);
+    }
+    if (mTemporalScheme == math::TVD_RK2) {
+        return this->advect2<SpatialScheme, math::TVD_RK2>(time0, time1);
+    }
+    if (mTemporalScheme == math::TVD_RK3) {
+        return this->advect2<SpatialScheme, math::TVD_RK3>(time0, time1);
+    }
+    OPENVDB_THROW(ValueError, "Temporal integration scheme not supported!");
+    return 0;
+}
+
+/// Third dispatch stage: choose the advect3() instantiation that matches the
+/// map type of the tracked grid's transform.
+/// @throw ValueError if the map type is not one of the four handled here.
+template<typename GridT, typename FieldT, typename InterruptT>
+template<math::BiasedGradientScheme SpatialScheme,
+         math::TemporalIntegrationScheme TemporalScheme>
+inline size_t
+LevelSetAdvection<GridT, FieldT, InterruptT>::advect2(ValueType time0, ValueType time1)
+{
+    const math::Transform& xform = mTracker.grid().transform();
+
+    if (xform.mapType() == math::UniformScaleMap::mapType()) {
+        return this->advect3<SpatialScheme, TemporalScheme, math::UniformScaleMap>(time0, time1);
+    }
+    if (xform.mapType() == math::UniformScaleTranslateMap::mapType()) {
+        return this->advect3<SpatialScheme, TemporalScheme, math::UniformScaleTranslateMap>(time0, time1);
+    }
+    if (xform.mapType() == math::UnitaryMap::mapType()) {
+        return this->advect3<SpatialScheme, TemporalScheme, math::UnitaryMap>(time0, time1);
+    }
+    if (xform.mapType() == math::TranslationMap::mapType()) {
+        return this->advect3<SpatialScheme, TemporalScheme, math::TranslationMap>(time0, time1);
+    }
+
+    OPENVDB_THROW(ValueError, "MapType not supported!");
+    return 0;
+}
+
+/// Final dispatch stage: build the fully specialized Advect worker for this
+/// object and let it perform the advection from @a time0 to @a time1.
+template<typename GridT, typename FieldT, typename InterruptT>
+template<math::BiasedGradientScheme SpatialScheme,
+         math::TemporalIntegrationScheme TemporalScheme,
+         typename MapT>
+inline size_t
+LevelSetAdvection<GridT, FieldT, InterruptT>::advect3(ValueType time0, ValueType time1)
+{
+    // The temporary worker lives until advect() has returned.
+    return Advect<MapT, SpatialScheme, TemporalScheme>(*this).advect(time0, time1);
+}
+
+
+///////////////////////////////////////////////////////////////////////
+
+
+/// Main constructor: binds the worker to @a parent, starts without velocity
+/// or offset buffers and without a task, caches a pointer to the MapT of the
+/// tracked grid's transform, and marks this instance as the master (the
+/// instance whose destructor calls clearField()).
+template<typename GridT, typename FieldT, typename InterruptT>
+template <typename MapT, math::BiasedGradientScheme SpatialScheme,
+          math::TemporalIntegrationScheme TemporalScheme>
+inline
+LevelSetAdvection<GridT, FieldT, InterruptT>::
+Advect<MapT, SpatialScheme, TemporalScheme>::
+Advect(LevelSetAdvection& parent)
+    : mParent(parent)
+    , mVelocity(NULL)
+    , mOffsets(NULL)
+    , mMap(parent.mTracker.grid().transform().template constMap<MapT>().get())
+    , mTask(0)
+    , mIsMaster(true)
+{
+}
+
+/// Shallow copy constructor: shares the parent, the velocity and offset
+/// pointers, the map pointer and the task with @a other. The copy is never
+/// the master, so its destructor does not call clearField().
+template<typename GridT, typename FieldT, typename InterruptT>
+template <typename MapT, math::BiasedGradientScheme SpatialScheme,
+          math::TemporalIntegrationScheme TemporalScheme>
+inline
+LevelSetAdvection<GridT, FieldT, InterruptT>::
+Advect<MapT, SpatialScheme, TemporalScheme>::
+Advect(const Advect& other)
+    : mParent(other.mParent)
+    , mVelocity(other.mVelocity)
+    , mOffsets(other.mOffsets)
+    , mMap(other.mMap)
+    , mTask(other.mTask)
+    , mIsMaster(false)
+{
+}
+   
+/// @brief Advect the level set from @a time0 to @a time1 in a sequence of
+/// sub-steps whose size is returned by sampleField().
+///
+/// Each sub-step allocates the auxiliary buffers needed by the temporal
+/// scheme, performs the Euler stage(s) of that scheme through cook(),
+/// releases the buffers and the sampled field, and re-tracks the narrow band.
+/// The loop ends when @a time1 is reached, when the tracker's interrupter
+/// check fails, or when the sampled time step is zero.
+///
+/// @param time0  start time; if larger than @a time1 the advection runs backward
+/// @param time1  end time
+/// @return the number of sub-steps (CFL iterations) performed
+/// @throw ValueError if TemporalScheme is not TVD_RK1, TVD_RK2 or TVD_RK3
+template<typename GridT, typename FieldT, typename InterruptT>
+template <typename MapT, math::BiasedGradientScheme SpatialScheme,
+          math::TemporalIntegrationScheme TemporalScheme>
+inline size_t
+LevelSetAdvection<GridT, FieldT, InterruptT>::
+Advect<MapT, SpatialScheme, TemporalScheme>::
+advect(ValueType time0, ValueType time1)
+{
+    size_t countCFL = 0;
+    if ( math::isZero(time0 - time1) ) return countCFL;
+    const bool isForward = time0 < time1;
+    while ((isForward ? time0<time1 : time0>time1) && mParent.mTracker.checkInterrupter()) {
+        // Make sure we have enough temporal auxiliary buffers:
+        // two for TVD_RK3, one for the other schemes.
+        mParent.mTracker.leafs().rebuildAuxBuffers(TemporalScheme == math::TVD_RK3 ? 2 : 1);
+
+        const ValueType dt = this->sampleField(time0, time1);
+        if ( math::isZero(dt) ) break;//V is essentially zero so terminate
+
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN //switch is resolved at compile-time
+        switch(TemporalScheme) {
+        case math::TVD_RK1:
+            // Perform one explicit Euler step: t1 = t0 + dt
+            // Phi_t1(1) = Phi_t0(0) - dt * VdotG_t0(0)
+            mTask = boost::bind(&Advect::euler01, _1, _2, dt);
+
+            // Cook and swap buffer 0 and 1 such that Phi_t1(0) and Phi_t0(1)
+            this->cook("Advecting level set using TVD_RK1", 1);
+            break;
+        case math::TVD_RK2:
+            // Perform one explicit Euler step: t1 = t0 + dt
+            // Phi_t1(1) = Phi_t0(0) - dt * VdotG_t0(0)
+            mTask = boost::bind(&Advect::euler01, _1, _2, dt);
+
+            // Cook and swap buffer 0 and 1 such that Phi_t1(0) and Phi_t0(1)
+            this->cook("Advecting level set using TVD_RK2 (step 1 of 2)", 1);
+
+            // Convex combine explicit Euler step: t2 = t0 + dt
+            // Phi_t2(1) = 1/2 * Phi_t0(1) + 1/2 * (Phi_t1(0) - dt * V.Grad_t1(0))
+            mTask = boost::bind(&Advect::euler12, _1, _2, dt);
+
+            // Cook and swap buffer 0 and 1 such that Phi_t2(0) and Phi_t1(1)
+            this->cook("Advecting level set using TVD_RK2 (step 2 of 2)", 1);
+            break;
+        case math::TVD_RK3:
+            // Perform one explicit Euler step: t1 = t0 + dt
+            // Phi_t1(1) = Phi_t0(0) - dt * VdotG_t0(0)
+            mTask = boost::bind(&Advect::euler01, _1, _2, dt);
+
+            // Cook and swap buffer 0 and 1 such that Phi_t1(0) and Phi_t0(1)
+            this->cook("Advecting level set using TVD_RK3 (step 1 of 3)", 1);
+
+            // Convex combine explicit Euler step: t2 = t0 + dt/2
+            // Phi_t2(2) = 3/4 * Phi_t0(1) + 1/4 * (Phi_t1(0) - dt * V.Grad_t1(0))
+            mTask = boost::bind(&Advect::euler34, _1, _2, dt);
+
+            // Cook and swap buffer 0 and 2 such that Phi_t2(0) and Phi_t1(2)
+            this->cook("Advecting level set using TVD_RK3 (step 2 of 3)", 2);
+
+            // Convex combine explicit Euler step: t3 = t0 + dt
+            // Phi_t3(2) = 1/3 * Phi_t0(1) + 2/3 * (Phi_t2(0) - dt * V.Grad_t2(0))
+            mTask = boost::bind(&Advect::euler13, _1, _2, dt);
+
+            // Cook and swap buffer 0 and 2 such that Phi_t3(0) and Phi_t2(2)
+            this->cook("Advecting level set using TVD_RK3 (step 3 of 3)", 2);
+            break;
+        default:
+            OPENVDB_THROW(ValueError, "Temporal integration scheme not supported!");
+        }//end of compile-time resolved switch
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+
+        // Step the current time towards time1 in the direction of advection.
+        time0 += isForward ? dt : -dt;
+        ++countCFL;
+        mParent.mTracker.leafs().removeAuxBuffers();
+        this->clearField();
+        // Track the narrow band
+        mParent.mTracker.track();
+    }//end while-loop over time
+    return countCFL;//number of CFL propagation steps
+}
+
+template<typename GridT, typename FieldT, typename InterruptT>
+template<typename MapT, math::BiasedGradientScheme SpatialScheme,
+         math::TemporalIntegrationScheme TemporalScheme>
+inline typename GridT::ValueType
+LevelSetAdvection<GridT, FieldT, InterruptT>::
+Advect<MapT, SpatialScheme, TemporalScheme>::
+sampleField(ValueType time0, ValueType time1)
+{
+    // Samples the velocity at every active voxel and returns the size of the next time step.
+    const int grainSize = mParent.mTracker.getGrainSize();
+    if (mParent.mTracker.leafs().leafCount() == 0) return ValueType(0.0);// nothing to sample
+
+    // Compute the prefix sum of offsets to active voxels
+    size_t size=0, voxelCount=mParent.mTracker.leafs().getPreFixSum(mOffsets, size, grainSize);
+
+    // Sample the velocity field (skip the coordinate transform when both grids share one)
+    if (mParent.mField.transform() == mParent.mTracker.grid().transform()) {
+        mTask = boost::bind(&Advect::sampleAligned, _1, _2, time0, time1);
+    } else {
+        mTask = boost::bind(&Advect::sampleXformed, _1, _2, time0, time1);
+    }
+    assert(voxelCount == mParent.mTracker.grid().activeVoxelCount());// one slot per active voxel
+    mVelocity = new VectorType[ voxelCount ];
+    this->cook("Sampling advection field");
+
+    // Find the largest squared magnitude among the sampled velocities
+    ValueType maxAbsV = 0;
+    VectorType* v = mVelocity;
+    for (size_t i=0; i<voxelCount; ++i, ++v) maxAbsV = math::Max(maxAbsV, ValueType(v->lengthSqr()));
+
+    // Compute the CFL number (a zero return tells the caller the field is essentially zero)
+    if (math::isApproxZero(maxAbsV, math::Delta<ValueType>::value())) return ValueType(0);
+#ifndef _MSC_VER // Visual C++ doesn't guarantee thread-safe initialization of local statics
+    static
+#endif
+    const ValueType CFL = (TemporalScheme == math::TVD_RK1 ? ValueType(0.3) :
+        TemporalScheme == math::TVD_RK2 ? ValueType(0.9) :
+        ValueType(1.0))/math::Sqrt(ValueType(3.0));
+    const ValueType dt = math::Abs(time1 - time0), dx = mParent.mTracker.voxelSize();
+    return math::Min(dt, ValueType(CFL*dx/math::Sqrt(maxAbsV)));// never larger than |time1-time0|
+}
+
+template<typename GridT, typename FieldT, typename InterruptT>
+template <typename MapT, math::BiasedGradientScheme SpatialScheme,
+          math::TemporalIntegrationScheme TemporalScheme>
+template <bool Aligned>
+inline void
+LevelSetAdvection<GridT, FieldT, InterruptT>::
+Advect<MapT, SpatialScheme, TemporalScheme>::
+sample(const LeafRange& range, ValueType time0, ValueType time1)
+{
+    // Stores one velocity sample per active voxel in mVelocity, negated when time0 >= time1.
+    const bool backward = !(time0 < time1);
+    const MapT& xform = *mMap;
+    const FieldT velocityField( mParent.mField );// local copy of the parent's field
+    mParent.mTracker.checkInterrupter();
+    for (typename LeafRange::Iterator leaf = range.begin(); leaf; ++leaf) {
+        VectorType* out = mVelocity + mOffsets[ leaf.pos() ];// first slot of this leaf
+        for (typename LeafType::ValueOnCIter voxel = leaf->cbeginValueOn(); voxel; ++voxel, ++out) {
+            const VectorType v = Aligned ? velocityField(voxel.getCoord(), time0) ://compile-time
+                                 velocityField(xform.applyMap(voxel.getCoord().asVec3d()), time0);
+            *out = backward ? -v : v;
+        }
+    }
+}
+
+template<typename GridT, typename FieldT, typename InterruptT>
+template <typename MapT, math::BiasedGradientScheme SpatialScheme,
+          math::TemporalIntegrationScheme TemporalScheme>
+inline void
+LevelSetAdvection<GridT, FieldT, InterruptT>::
+Advect<MapT, SpatialScheme, TemporalScheme>::
+clearField()
+{
+    // Release the offset and velocity arrays and null both pointers.
+    delete [] mOffsets;  mOffsets  = NULL;
+    delete [] mVelocity; mVelocity = NULL;
+    // delete[] on a null pointer is a no-op, so calling this twice in a row is harmless.
+}
+
+template<typename GridT, typename FieldT, typename InterruptT>
+template <typename MapT, math::BiasedGradientScheme SpatialScheme,
+          math::TemporalIntegrationScheme TemporalScheme>
+inline void
+LevelSetAdvection<GridT, FieldT, InterruptT>::
+Advect<MapT, SpatialScheme, TemporalScheme>::
+cook(const char* msg, size_t swapBuffer)
+{
+    // Runs *this over all leafs (serially when the grain size is 0), then swaps leaf buffers.
+    mParent.mTracker.startInterrupter( msg );
+    const int grain = mParent.mTracker.getGrainSize();
+    const bool serial = (grain == 0);
+    const LeafRange leaves = mParent.mTracker.leafs().leafRange(grain);
+
+    if (serial) (*this)(leaves);
+    else tbb::parallel_for(leaves, *this);
+    mParent.mTracker.leafs().swapLeafBuffer(swapBuffer, serial);
+    mParent.mTracker.endInterrupter();
+}
+
+// Blend of a stored Phi with one forward Euler advection step (alpha = Nominator/Denominator):
+// Phi(result) = alpha * Phi(phi) + (1-alpha) * (Phi(0) - dt * V.Grad(0));
+template<typename GridT, typename FieldT, typename InterruptT>
+template<typename MapT, math::BiasedGradientScheme SpatialScheme,
+         math::TemporalIntegrationScheme TemporalScheme>
+template <int Nominator, int Denominator>
+inline void
+LevelSetAdvection<GridT, FieldT, InterruptT>::
+Advect<MapT, SpatialScheme, TemporalScheme>::
+euler(const LeafRange& range, ValueType dt, Index phiBuffer, Index resultBuffer)
+{
+    typedef math::BIAS_SCHEME<SpatialScheme>                             BiasT;
+    typedef typename BiasT::template ISStencil<GridType>::StencilType    BiasedStencilT;
+    typedef typename LeafType::ValueOnCIter                              ActiveCIterT;
+    typedef math::GradientBiased<MapT, SpatialScheme>                    GradT;
+
+    static const ValueType weightPhi   = ValueType(Nominator)/ValueType(Denominator);
+    static const ValueType weightEuler = ValueType(1) - weightPhi;
+
+    mParent.mTracker.checkInterrupter();
+    const MapT& xform = *mMap;
+    BiasedStencilT stencil(mParent.mTracker.grid());
+    for (typename LeafRange::Iterator leaf = range.begin(); leaf; ++leaf) {
+        const VectorType* v = mVelocity + mOffsets[ leaf.pos() ];// this leaf's first velocity
+        const ValueType* src = leaf.buffer(phiBuffer).data();
+        ValueType* dst = leaf.buffer(resultBuffer).data();
+        for (ActiveCIterT voxel = leaf->cbeginValueOn(); voxel; ++voxel, ++v) {
+            stencil.moveTo(voxel);
+            const Index n = voxel.pos();
+            const ValueType fwd = stencil.getValue() - dt * v->dot(GradT::result(xform, stencil, *v));
+            dst[n] = Nominator ? weightPhi * src[n] + weightEuler * fwd : fwd;// plain step if 0
+        }//loop over the active voxels of this leaf
+    }//loop over the leafs in the range
+}
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_LEVEL_SET_ADVECT_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/LevelSetFilter.h b/nuparu/include/openvdb_new/tools/LevelSetFilter.h
new file mode 100644
index 00000000..4e9a864b
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/LevelSetFilter.h
@@ -0,0 +1,546 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @author Ken Museth
+///
+/// @file LevelSetFilter.h
+///
+/// @brief Performs various types of level set deformations with
+/// interface tracking. These unrestricted deformations include
+/// surface smoothing (e.g., Laplacian flow), filtering (e.g., mean
+/// value) and morphological operations (e.g., morphological opening).
+/// All these operations can optionally be masked with another grid that
+/// acts as an alpha-mask.
+
+#ifndef OPENVDB_TOOLS_LEVELSETFILTER_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_LEVELSETFILTER_HAS_BEEN_INCLUDED
+
+#include <assert.h>
+#include <boost/type_traits/is_floating_point.hpp>
+#include "LevelSetTracker.h"
+#include "Interpolation.h"
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+/// @brief Filtering (e.g. diffusion) of narrow-band level sets. An
+/// optional scalar field can be used to produce a (smooth) alpha mask
+/// for the filtering.
+///
+/// @note This class performs proper interface tracking which allows
+/// for unrestricted surface deformations
+template<typename GridT,
+         typename MaskT = typename GridT::template ValueConverter<float>::Type,
+         typename InterruptT = util::NullInterrupter>
+class LevelSetFilter : public LevelSetTracker<GridT, InterruptT>
+{
+public:
+    typedef LevelSetTracker<GridT, InterruptT>              BaseType;
+    typedef GridT                                           GridType;
+    typedef MaskT                                           MaskType;
+    typedef typename GridType::TreeType                     TreeType;
+    typedef typename TreeType::ValueType                    ValueType;
+    typedef typename MaskType::ValueType                    AlphaType;
+    BOOST_STATIC_ASSERT(boost::is_floating_point<AlphaType>::value);// mask values must be floats
+
+    /// @brief Main constructor from a grid (mask range starts as [0,1], mask not inverted)
+    /// @param grid The level set to be filtered.
+    /// @param interrupt Optional interrupter.
+    LevelSetFilter(GridType& grid, InterruptT* interrupt = NULL)
+        : BaseType(grid, interrupt)
+        , mMinMask(0)
+        , mMaxMask(1)
+        , mInvertMask(false)
+    {
+    }
+    /// @brief Default destructor
+    virtual ~LevelSetFilter() {}
+
+    /// @brief Return the minimum value of the mask to be used for the
+    /// derivation of a smooth alpha value.
+    AlphaType minMask() const { return mMinMask; }
+    /// @brief Return the maximum value of the mask to be used for the
+    /// derivation of a smooth alpha value.
+    AlphaType maxMask() const { return mMaxMask; }
+    /// @brief Define the range for the (optional) scalar mask.
+    /// @param min Minimum value of the range.
+    /// @param max Maximum value of the range.
+    /// @details Mask values outside the range map to alpha values of
+    /// respectively zero and one, and values inside the range map
+    /// smoothly to 0->1 (unless of course the mask is inverted).
+    /// @throw ValueError if @a min is not smaller than @a max.
+    void setMaskRange(AlphaType min, AlphaType max)
+    {
+        if (!(min < max)) OPENVDB_THROW(ValueError, "Invalid mask range (expects min < max)");
+        mMinMask = min;
+        mMaxMask = max;
+    }
+
+    /// @brief Return true if the mask is inverted, i.e. min->max in the
+    /// original mask maps to 1->0 in the inverted alpha mask.
+    bool isMaskInverted() const { return mInvertMask; }
+    /// @brief Invert the optional mask, i.e. min->max in the original
+    /// mask maps to 1->0 in the inverted alpha mask.
+    void invertMask(bool invert=true) { mInvertMask = invert; }
+
+    /// @brief One iteration of mean-curvature flow of the level set.
+    /// @param mask Optional alpha mask.
+    void meanCurvature(const MaskType* mask = NULL)
+    {
+        Filter f(this, mask); f.meanCurvature();
+    }
+
+    /// @brief One iteration of Laplacian flow of the level set.
+    /// @param mask Optional alpha mask.
+    void laplacian(const MaskType* mask = NULL)
+    {
+        Filter f(this, mask); f.laplacian();
+    }
+
+    /// @brief One iteration of a fast separable Gaussian filter.
+    /// @param width Width of the Gaussian kernel in voxel units.
+    /// @param mask Optional alpha mask.
+    ///
+    /// @note This is approximated as 4 iterations of a separable mean filter
+    /// which typically leads to an approximation that's better than 95%!
+    void gaussian(int width = 1, const MaskType* mask = NULL)
+    {
+        Filter f(this, mask); f.gaussian(width);
+    }
+
+    /// @brief Offset the level set by the specified (world) distance.
+    /// @param offset Value of the offset.
+    /// @param mask Optional alpha mask.
+    void offset(ValueType offset, const MaskType* mask = NULL)
+    {
+        Filter f(this, mask); f.offset(offset);
+    }
+
+    /// @brief One iteration of median-value flow of the level set.
+    /// @param width Width of the median-value kernel in voxel units.
+    /// @param mask Optional alpha mask.
+    ///
+    /// @warning This filter is not separable and is hence relatively
+    /// slow!
+    void median(int width = 1, const MaskType* mask = NULL)
+    {
+        Filter f(this, mask); f.median(width);
+    }
+
+    /// @brief One iteration of mean-value flow of the level set.
+    /// @param width Width of the mean-value kernel in voxel units.
+    /// @param mask Optional alpha mask.
+    ///
+    /// @note This filter is separable so it's fast!
+    void mean(int width = 1, const MaskType* mask = NULL)
+    {
+        Filter f(this, mask); f.mean(width);
+    }
+
+private:
+    // disallow copy construction and copy by assignment!
+    LevelSetFilter(const LevelSetFilter&);// not implemented
+    LevelSetFilter& operator=(const LevelSetFilter&);// not implemented
+
+    // Private struct that implements all the filtering; the public methods above each
+    // build a short-lived Filter on the stack and call the matching driver on it.
+    struct Filter
+    {
+        typedef typename TreeType::LeafNodeType                  LeafT;
+        typedef typename LeafT::ValueOnIter                      VoxelIterT;
+        typedef typename LeafT::ValueOnCIter                     VoxelCIterT;
+        typedef typename tree::LeafManager<TreeType>::BufferType BufferT;
+        typedef typename tree::LeafManager<TreeType>::LeafRange  LeafRange;
+        typedef typename LeafRange::Iterator                     LeafIterT;
+        typedef tools::AlphaMask<GridT, MaskT>                   AlphaMaskT;
+
+        Filter(LevelSetFilter* parent, const MaskType* mask) : mParent(parent), mMask(mask) {}
+        virtual ~Filter() {}
+
+        // Drivers: each sets mTask to one of the per-range kernels below and runs it via cook()
+        void box(int width);
+        void median(int width);
+        void mean(int width);
+        void gaussian(int width);
+        void laplacian();
+        void meanCurvature();
+        void offset(ValueType value);
+        void operator()(const LeafRange& r) const // called by cook(), directly or through TBB
+        {
+            if (mTask) mTask(const_cast<Filter*>(this), r);// mTask expects a non-const Filter*
+            else OPENVDB_THROW(ValueError, "task is undefined - don\'t call this method directly");
+        }
+        void cook(bool swap) // run mTask over every leaf, then optionally swap in buffer 1
+        {
+            const int n = mParent->getGrainSize();
+            if (n>0) {
+                tbb::parallel_for(mParent->leafs().leafRange(n), *this);
+            } else {
+                (*this)(mParent->leafs().leafRange());// grain size 0: plain serial call
+            }
+            if (swap) mParent->leafs().swapLeafBuffer(1, n==0);
+        }
+
+        template <size_t Axis>
+        struct Avg { // averages the 2*width+1 grid values centered on a coordinate along Axis
+            Avg(const GridT& grid, Int32 w) :
+                acc(grid.tree()), width(w), frac(1/ValueType(2*w+1)) {}
+            inline ValueType operator()(Coord xyz) // xyz is taken by value and modified locally
+            {
+                ValueType sum = zeroVal<ValueType>();
+                Int32& i = xyz[Axis], j = i + width;// i aliases the swept coordinate, j is its end
+                for (i -= width; i <= j; ++i) sum += acc.getValue(xyz);
+                return sum*frac;
+            }
+            typename GridT::ConstAccessor acc;
+            const Int32 width;
+            const ValueType frac;// 1/(2*width+1), the reciprocal of the sample count
+        };
+
+        template <typename AvgT>
+        void box( const LeafRange& r, Int32 w);
+
+        // NOTE(review): boxZ uses Avg<1> and boxY uses Avg<2>, so the names look swapped — confirm.
+        void boxX(const LeafRange& r, Int32 w) { this->box<Avg<0> >(r,w); }
+        void boxZ(const LeafRange& r, Int32 w) { this->box<Avg<1> >(r,w); }
+        void boxY(const LeafRange& r, Int32 w) { this->box<Avg<2> >(r,w); }
+
+        // Per-range kernels, bound into mTask by the drivers of the same name
+        void median(const LeafRange&, int);
+        void meanCurvature(const LeafRange&);
+        void laplacian(const LeafRange&);
+        void offset(const LeafRange&, ValueType);
+
+        LevelSetFilter* mParent;// the filter that created this Filter (not owned)
+        const MaskType* mMask;// optional alpha mask; kernels test it for null
+        typename boost::function<void (Filter*, const LeafRange&)> mTask;// current kernel
+    }; // end of private Filter struct
+
+    AlphaType mMinMask, mMaxMask;// range set by setMaskRange()
+    bool      mInvertMask;
+
+}; // end of LevelSetFilter class
+
+
+////////////////////////////////////////
+
+template<typename GridT, typename MaskT, typename InterruptT>
+inline void
+LevelSetFilter<GridT, MaskT, InterruptT>::
+Filter::median(int width)
+{
+    mParent->startInterrupter("Median-value flow of level set");
+    // Grain size 0 makes cook() run serially; the same flag is passed to the buffer rebuild
+    const bool serial = (mParent->getGrainSize() == 0);
+    mParent->leafs().rebuildAuxBuffers(1, serial);
+    // Kernel widths below one are raised to one
+    const int kernelWidth = std::max(1, width);
+    mTask = boost::bind(&Filter::median, _1, _2, kernelWidth);
+    this->cook(true);// run the kernel over all leafs, then swap in buffer 1
+    mParent->track();
+    mParent->endInterrupter();
+}
+
+template<typename GridT, typename MaskT, typename InterruptT>
+inline void
+LevelSetFilter<GridT, MaskT, InterruptT>::
+Filter::mean(int width)
+{
+    // A mean-value filter is exactly one call to the separable box driver;
+    // the interrupter is notified before and after it.
+    mParent->startInterrupter("Mean-value flow of level set");
+    box(width);
+    mParent->endInterrupter();
+}
+
+template<typename GridT, typename MaskT, typename InterruptT>
+inline void
+LevelSetFilter<GridT, MaskT, InterruptT>::
+Filter::gaussian(int width)
+{
+    // Approximates a Gaussian by applying the separable box driver four times in a row
+    mParent->startInterrupter("Gaussian flow of level set");
+    int remaining = 4;
+    while (remaining-- > 0) this->box(width);
+    mParent->endInterrupter();
+}
+
+template<typename GridT, typename MaskT, typename InterruptT>
+inline void
+LevelSetFilter<GridT, MaskT, InterruptT>::
+Filter::box(int width)
+{
+    // Separable box filter: three 1D averaging passes, each followed by a buffer swap
+    const bool serial = (mParent->getGrainSize() == 0);
+    mParent->leafs().rebuildAuxBuffers(1, serial);
+    const int kernelWidth = std::max(1, width);// widths below one are raised to one
+
+    // Passes run in the order boxX, boxY, boxZ
+    typedef void (Filter::*PassT)(const LeafRange&, Int32);
+    const PassT passes[3] = { &Filter::boxX, &Filter::boxY, &Filter::boxZ };
+    for (int n = 0; n < 3; ++n) {
+        mTask = boost::bind(passes[n], _1, _2, kernelWidth);
+        this->cook(true);
+    }
+
+    mParent->track();
+}
+
+template<typename GridT, typename MaskT, typename InterruptT>
+inline void
+LevelSetFilter<GridT, MaskT, InterruptT>::
+Filter::meanCurvature()
+{
+    mParent->startInterrupter("Mean-curvature flow of level set");
+    // Grain size 0 makes cook() run serially; the same flag is passed to the buffer rebuild
+    const bool serial = (mParent->getGrainSize() == 0);
+    mParent->leafs().rebuildAuxBuffers(1, serial);
+
+    // Bind the per-range kernel, run it over all leafs and swap in buffer 1
+    mTask = boost::bind(&Filter::meanCurvature, _1, _2);
+    this->cook(true);
+    mParent->track();
+    mParent->endInterrupter();
+}
+
+template<typename GridT, typename MaskT, typename InterruptT>
+inline void
+LevelSetFilter<GridT, MaskT, InterruptT>::
+Filter::laplacian()
+{
+    mParent->startInterrupter("Laplacian flow of level set");
+    // Grain size 0 makes cook() run serially; the same flag is passed to the buffer rebuild
+    const bool serial = (mParent->getGrainSize() == 0);
+    mParent->leafs().rebuildAuxBuffers(1, serial);
+
+    // Bind the per-range kernel, run it over all leafs and swap in buffer 1
+    mTask = boost::bind(&Filter::laplacian, _1, _2);
+    this->cook(true);
+    mParent->track();
+    mParent->endInterrupter();
+}
+
+template<typename GridT, typename MaskT, typename InterruptT>
+inline void
+LevelSetFilter<GridT, MaskT, InterruptT>::
+Filter::offset(ValueType value)
+{
+    mParent->startInterrupter("Offsetting level set");
+    mParent->leafs().removeAuxBuffers();// the kernel edits values in place: no aux buffers
+
+    // Move in sub-steps of at most half a voxel, re-tracking after each one
+    const ValueType maxStep = ValueType(0.5) * mParent->voxelSize();
+    const ValueType total = openvdb::math::Abs(value);
+    ValueType done = 0.0;
+    for (;;) {
+        if (!(total-done > ValueType(0.001)*maxStep)) break;// remaining distance is negligible
+        if (!mParent->checkInterrupter()) break;
+        const ValueType step = openvdb::math::Min(total-done, maxStep);
+        done += step;
+        mTask = boost::bind(&Filter::offset, _1, _2, copysign(step, value));// keep value's sign
+        this->cook(false);// false: no buffer swap
+        mParent->track();
+    }
+    mParent->endInterrupter();
+}
+
+
+///////////////////////// PRIVATE METHODS //////////////////////
+
+/// One explicit step of parabolic mean-curvature diffusion; results are written to buffer 1
+template<typename GridT, typename MaskT, typename InterruptT>
+inline void
+LevelSetFilter<GridT, MaskT, InterruptT>::
+Filter::meanCurvature(const LeafRange& range)
+{
+    mParent->checkInterrupter();
+    const ValueType dx = mParent->voxelSize();
+    const ValueType dt = math::Pow2(dx) / ValueType(3.0);// time step of the update
+    math::CurvatureStencil<GridType> stencil(mParent->grid(), dx);
+    if (!mMask) {// no mask: every active voxel receives the full update
+        for (LeafIterT leaf=range.begin(); leaf; ++leaf) {
+            ValueType* out = leaf.buffer(1).data();
+            for (VoxelCIterT voxel = leaf->cbeginValueOn(); voxel; ++voxel) {
+                stencil.moveTo(voxel);
+                out[voxel.pos()] = *voxel + dt*stencil.meanCurvatureNormGrad();
+            }
+        }
+    } else {
+        // Masked: blend the old value (weight wOld) with the updated one (weight wNew)
+        typename AlphaMaskT::FloatType wNew, wOld;
+        AlphaMaskT weights(mParent->grid(), *mMask, mParent->minMask(),
+                           mParent->maxMask(), mParent->isMaskInverted());
+        for (LeafIterT leaf=range.begin(); leaf; ++leaf) {
+            ValueType* out = leaf.buffer(1).data();
+            for (VoxelCIterT voxel = leaf->cbeginValueOn(); voxel; ++voxel) {
+                if (!weights(voxel.getCoord(), wNew, wOld)) continue;// nothing written here
+                stencil.moveTo(voxel);
+                const ValueType phi0 = *voxel, phi1 = phi0 + dt*stencil.meanCurvatureNormGrad();
+                out[voxel.pos()] = wOld * phi0 + wNew * phi1;
+            }
+        }
+    }
+}
+
+/// One explicit step of Laplacian diffusion; results are written to buffer 1.
+/// When the grid holds a true signed distance field (e.g. a solution
+/// to the Eikonal equation), Laplacian diffusion (the geometric heat
+/// equation) is actually identical to mean curvature diffusion, yet
+/// less computationally expensive. So if renormalization is performed
+/// anyway (e.g. by rebuilding the narrow band), consider Laplacian
+/// diffusion instead of mean curvature flow.
+template<typename GridT, typename MaskT, typename InterruptT>
+inline void
+LevelSetFilter<GridT, MaskT, InterruptT>::
+Filter::laplacian(const LeafRange& range)
+{
+    mParent->checkInterrupter();
+    const ValueType dx = mParent->voxelSize();
+    const ValueType dt = math::Pow2(dx) / ValueType(6.0);// time step of the update
+    math::GradStencil<GridType> stencil(mParent->grid(), dx);
+    if (!mMask) {// no mask: every active voxel receives the full update
+        for (LeafIterT leaf=range.begin(); leaf; ++leaf) {
+            ValueType* out = leaf.buffer(1).data();
+            for (VoxelCIterT voxel = leaf->cbeginValueOn(); voxel; ++voxel) {
+                stencil.moveTo(voxel);
+                out[voxel.pos()] = *voxel + dt*stencil.laplacian();
+            }
+        }
+    } else {
+        // Masked: blend the old value (weight wOld) with the updated one (weight wNew)
+        typename AlphaMaskT::FloatType wNew, wOld;
+        AlphaMaskT weights(mParent->grid(), *mMask, mParent->minMask(),
+                           mParent->maxMask(), mParent->isMaskInverted());
+        for (LeafIterT leaf=range.begin(); leaf; ++leaf) {
+            ValueType* out = leaf.buffer(1).data();
+            for (VoxelCIterT voxel = leaf->cbeginValueOn(); voxel; ++voxel) {
+                if (!weights(voxel.getCoord(), wNew, wOld)) continue;// nothing written here
+                stencil.moveTo(voxel);
+                const ValueType phi0 = *voxel, phi1 = phi0 + dt*stencil.laplacian();
+                out[voxel.pos()] = wOld * phi0 + wNew * phi1;
+            }
+        }
+    }
+}
+
+/// Adds a constant to every active value in place (scaled per voxel when a mask is set)
+template<typename GridT, typename MaskT, typename InterruptT>
+inline void
+LevelSetFilter<GridT, MaskT, InterruptT>::
+Filter::offset(const LeafRange& range, ValueType offset)
+{
+    mParent->checkInterrupter();
+    if (!mMask) {// no mask: shift every active value by the full offset
+        for (LeafIterT leaf=range.begin(); leaf; ++leaf) {
+            for (VoxelIterT voxel = leaf->beginValueOn(); voxel; ++voxel) {
+                voxel.setValue(*voxel + offset);
+            }
+        }
+        return;
+    }
+    typename AlphaMaskT::FloatType w, rest;// only w, the first weight, scales the offset
+    AlphaMaskT weights(mParent->grid(), *mMask, mParent->minMask(),
+                       mParent->maxMask(), mParent->isMaskInverted());
+    for (LeafIterT leaf=range.begin(); leaf; ++leaf) {
+        for (VoxelIterT voxel = leaf->beginValueOn(); voxel; ++voxel) {
+            if (weights(voxel.getCoord(), w, rest)) voxel.setValue(*voxel + w*offset);
+        }
+    }
+}
+
+/// Simple but slow median-value diffusion; results are written to buffer 1
+template<typename GridT, typename MaskT, typename InterruptT>
+inline void
+LevelSetFilter<GridT, MaskT, InterruptT>::
+Filter::median(const LeafRange& range, int width)
+{
+    mParent->checkInterrupter();
+    typename math::DenseStencil<GridType> stencil(mParent->grid(), width);//creates local cache!
+    if (!mMask) {// no mask: every active voxel is replaced by the stencil median
+        for (LeafIterT leaf=range.begin(); leaf; ++leaf) {
+            ValueType* out = leaf.buffer(1).data();
+            for (VoxelCIterT voxel = leaf->cbeginValueOn(); voxel; ++voxel) {
+                stencil.moveTo(voxel);
+                out[voxel.pos()] = stencil.median();
+            }
+        }
+    } else {
+        // Masked: blend the old value (weight wOld) with the median (weight wNew)
+        typename AlphaMaskT::FloatType wNew, wOld;
+        AlphaMaskT weights(mParent->grid(), *mMask, mParent->minMask(),
+                           mParent->maxMask(), mParent->isMaskInverted());
+        for (LeafIterT leaf=range.begin(); leaf; ++leaf) {
+            ValueType* out = leaf.buffer(1).data();
+            for (VoxelCIterT voxel = leaf->cbeginValueOn(); voxel; ++voxel) {
+                if (!weights(voxel.getCoord(), wNew, wOld)) continue;// nothing written here
+                stencil.moveTo(voxel);
+                out[voxel.pos()] = wOld * (*voxel) + wNew * stencil.median();
+            }
+        }
+    }
+}
+
+/// One 1D pass of the separable box filter (AvgT picks the axis); writes to buffer 1
+template<typename GridT, typename MaskT, typename InterruptT>
+template <typename AvgT>
+inline void
+LevelSetFilter<GridT, MaskT, InterruptT>::
+Filter::box(const LeafRange& range, Int32 w)
+{
+    mParent->checkInterrupter();
+    AvgT average(mParent->grid(), w);
+    if (!mMask) {// no mask: every active voxel is replaced by the average
+        for (LeafIterT leaf=range.begin(); leaf; ++leaf) {
+            ValueType* out = leaf.buffer(1).data();
+            for (VoxelCIterT voxel = leaf->cbeginValueOn(); voxel; ++voxel) {
+                out[voxel.pos()] = average(voxel.getCoord());
+            }
+        }
+    } else {// masked: blend the old value (wOld) with the average (wNew)
+        typename AlphaMaskT::FloatType wNew, wOld;
+        AlphaMaskT weights(mParent->grid(), *mMask, mParent->minMask(),
+                           mParent->maxMask(), mParent->isMaskInverted());
+        for (LeafIterT leaf=range.begin(); leaf; ++leaf) {
+            ValueType* out = leaf.buffer(1).data();
+            for (VoxelCIterT voxel = leaf->cbeginValueOn(); voxel; ++voxel) {
+                const Coord ijk = voxel.getCoord();
+                if (weights(ijk, wNew, wOld)) out[voxel.pos()] = wOld * (*voxel)+ wNew * average(ijk);
+            }
+        }
+    }
+}
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_LEVELSETFILTER_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/LevelSetFracture.h b/nuparu/include/openvdb_new/tools/LevelSetFracture.h
new file mode 100644
index 00000000..db73ee08
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/LevelSetFracture.h
@@ -0,0 +1,346 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file tools/LevelSetFracture.h
+///
+/// @brief Divide volumes represented by level set grids into multiple,
+/// disjoint pieces by intersecting them with one or more "cutter" volumes,
+/// also represented by level sets.
+
+#ifndef OPENVDB_TOOLS_LEVELSETFRACTURE_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_LEVELSETFRACTURE_HAS_BEEN_INCLUDED
+
+#include <openvdb/Grid.h>
+#include <openvdb/math/Quat.h>
+#include <openvdb/util/NullInterrupter.h>
+
+#include "Composite.h" // for csgIntersectionCopy() and csgDifferenceCopy()
+#include "GridTransformer.h" // for resampleToMatch()
+#include "LevelSetUtil.h" // for sdfSegmentation()
+
+#include <limits>
+#include <list>
+
+#include <tbb/blocked_range.h>
+#include <tbb/parallel_reduce.h>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+/// @brief Level set fracturing
+///
+/// @details Splits level set grids into fragments by intersecting them with a
+/// cutter level set. Newly created fragments accumulate in an internal list
+/// (see fragments()) across calls to fracture() until clear() is called, while
+/// the residuals stay in the caller's grid list.
+template<class GridType, class InterruptType = util::NullInterrupter>
+class LevelSetFracture
+{
+public:
+    typedef std::vector<Vec3s> Vec3sList;
+    typedef std::vector<math::Quats> QuatsList;
+    typedef std::list<typename GridType::Ptr> GridPtrList;
+    typedef typename GridPtrList::iterator GridPtrListIter;
+
+
+    /// @brief Default constructor
+    ///
+    /// @param interrupter  optional interrupter object; only the pointer is
+    ///                     stored and it is never deleted by this class
+    explicit LevelSetFracture(InterruptType* interrupter = NULL);
+
+    /// @brief Divide volumes represented by level set grids into multiple,
+    /// disjoint pieces by intersecting them with one or more "cutter" volumes,
+    /// also represented by level sets.
+    /// @details If desired, the process can be applied iteratively, so that
+    /// fragments created with one cutter are subdivided by other cutters.
+    ///
+    /// @note  The incoming @a grids and the @a cutter are required to have matching
+    ///        transforms and narrow band widths!
+    ///
+    /// @note  The @a rotations are applied only when the list holds exactly one
+    ///        entry per point; otherwise the cutter instances are only translated.
+    ///
+    /// @param grids          list of grids to fracture. The residuals of the
+    ///                       fractured grids will remain in this list
+    /// @param cutter         a level set grid to use as the cutter object
+    /// @param segment        toggle to split disjoint fragments into their own grids
+    /// @param points         optional list of world space points at which to instance the
+    ///                       cutter object (if null or empty, use the cutter's current
+    ///                       position only)
+    /// @param rotations      optional list of custom rotations for each cutter instance
+    /// @param cutterOverlap  toggle to allow consecutive cutter instances to fracture
+    ///                       previously generated fragments
+    void fracture(GridPtrList& grids, const GridType& cutter, bool segment = false,
+        const Vec3sList* points = NULL, const QuatsList* rotations = NULL,
+        bool cutterOverlap = true);
+
+    /// Return a list of new fragments, not including the residuals from the input grids.
+    GridPtrList& fragments() { return mFragments; }
+
+    /// Remove all elements from the fragment list.
+    void clear() { mFragments.clear(); }
+
+private:
+    // disallow copy by assignment (private, with an empty body)
+    void operator=(const LevelSetFracture&) {}
+
+    // Return true if an interrupter is set and it reports an interruption.
+    // The percentage is passed through to the interrupter unchanged.
+    bool wasInterrupted(int percent = -1) const {
+        return mInterrupter && mInterrupter->wasInterrupted(percent);
+    }
+
+    // Reject very small grids, and small grids whose values do not change sign.
+    bool isValidFragment(GridType&) const;
+    // Replace every grid in the list with the segments returned by segmentSDF().
+    void segmentFragments(GridPtrList&) const;
+    // Cut every grid in the list with the cutter; valid fragments are appended
+    // to mFragments and each cut grid is replaced by its residual.
+    void process(GridPtrList&, const GridType& cutter);
+
+    InterruptType* mInterrupter; // may be NULL
+    GridPtrList mFragments;      // fragments produced so far, see fragments()
+};
+
+
+////////////////////////////////////////
+
+
+// Internal utility objects and implementation details
+
+namespace level_set_fracture_internal {
+
+
+/// @brief Reduction body for tbb::parallel_reduce that finds the smallest and
+/// largest value stored in the buffers of a set of leaf nodes.
+///
+/// @note Every buffer entry of every node is inspected (LeafNodeType::SIZE
+/// entries per node), whether or not the corresponding voxel is active.
+template<typename LeafNodeType>
+struct FindMinMaxVoxelValue {
+
+    typedef typename LeafNodeType::ValueType    ValueType;
+
+    /// @param nodes  leaf nodes to scan. Only a pointer to the first element is
+    ///               kept, so the vector has to outlive this object.
+    FindMinMaxVoxelValue(const std::vector<const LeafNodeType*>& nodes)
+        : minValue(std::numeric_limits<ValueType>::max())
+        , maxValue(-minValue) // relies on minValue being declared (and initialized) first
+        , mNodes(nodes.empty() ? NULL : &nodes[0])
+    {
+    }
+
+    /// Splitting constructor: shares the node array and starts from a fresh range.
+    FindMinMaxVoxelValue(FindMinMaxVoxelValue& rhs, tbb::split)
+        : minValue(std::numeric_limits<ValueType>::max())
+        , maxValue(-minValue)
+        , mNodes(rhs.mNodes)
+    {
+    }
+
+    /// Fold the buffer values of the nodes in @a range into minValue and maxValue.
+    void operator()(const tbb::blocked_range<size_t>& range) {
+        const size_t last = range.end();
+        for (size_t idx = range.begin(); idx < last; ++idx) {
+            const ValueType* const values = mNodes[idx]->buffer().data();
+            const ValueType* const stop = values + LeafNodeType::SIZE;
+            for (const ValueType* v = values; v != stop; ++v) {
+                if (*v < minValue) minValue = *v;
+                if (maxValue < *v) maxValue = *v;
+            }
+        }
+    }
+
+    /// Merge the extrema found by another body into this one.
+    void join(FindMinMaxVoxelValue& rhs) {
+        if (rhs.minValue < minValue) minValue = rhs.minValue;
+        if (maxValue < rhs.maxValue) maxValue = rhs.maxValue;
+    }
+
+    ValueType minValue, maxValue;
+
+    LeafNodeType const * const * const mNodes;
+}; // struct FindMinMaxVoxelValue
+
+
+} // namespace level_set_fracture_internal
+
+
+////////////////////////////////////////
+
+
+/// Store the optional @a interrupter (which may be NULL) and start with an
+/// empty fragment list.
+template<class GridType, class InterruptType>
+LevelSetFracture<GridType, InterruptType>::LevelSetFracture(InterruptType* interrupter):
+    mInterrupter(interrupter),
+    mFragments()
+{
+}
+
+
+/// @brief Fracture @a grids with @a cutter, optionally instancing the cutter at
+/// each of the given points.
+///
+/// @details Without points the cutter is used where it is. With points, a
+/// shallow copy of the cutter is given a translated (and, when one rotation per
+/// point is supplied, rotated) transform and resampled onto a grid that carries
+/// the cutter's original transform; that resampled instance is then used to cut.
+/// The loop over points stops early when the interrupter reports an interruption.
+/// If @a segmentation is set, both the fragment list and @a grids are segmented
+/// at the end.
+template<class GridType, class InterruptType>
+void
+LevelSetFracture<GridType, InterruptType>::fracture(GridPtrList& grids, const GridType& cutter,
+    bool segmentation, const Vec3sList* points, const QuatsList* rotations, bool cutterOverlap)
+{
+    // We can process all incoming grids with the same cutter instance,
+    // this optimization is enabled by the requirement of having matching
+    // transforms between all incoming grids and the cutter object.
+    const bool instanced = (points != NULL) && !points->empty();
+
+    if (!instanced) {
+        // use cutter in place
+        if (cutterOverlap && !mFragments.empty()) process(mFragments, cutter);
+        process(grids, cutter);
+    } else {
+        math::Transform::Ptr baseXform = cutter.transform().copy();
+        GridType sourceCutter(cutter, ShallowCopy());
+
+        const size_t pointCount = points->size();
+        // Rotations are honored only when there is exactly one per point.
+        const bool rotated = (rotations != NULL) && (pointCount == rotations->size());
+
+        // for each instance point..
+        for (size_t idx = 0; idx < pointCount; ++idx) {
+            int percent = int((float(idx) / float(pointCount)) * 100.0);
+            if (wasInterrupted(percent)) break;
+
+            // Build the transform that places this cutter instance.
+            math::Transform::Ptr instXform = baseXform->copy();
+            if (rotated) {
+                const Vec3s& angles = (*rotations)[idx].eulerAngles(math::XYZ_ROTATION);
+                instXform->preRotate(angles[0], math::X_AXIS);
+                instXform->preRotate(angles[1], math::Y_AXIS);
+                instXform->preRotate(angles[2], math::Z_AXIS);
+            }
+            instXform->postTranslate((*points)[idx]);
+            sourceCutter.setTransform(instXform);
+
+            // Destination grid for the resampled instance, in the original frame.
+            GridType instance;
+            instance.setTransform(baseXform->copy());
+
+            // Since there is no scaling, use the generic resampler instead of
+            // the more expensive level set rebuild tool.
+            if (mInterrupter == NULL) {
+                util::NullInterrupter nullInterrupter;
+                if (rotated) {
+                    doResampleToMatch<BoxSampler>(sourceCutter, instance, nullInterrupter);
+                } else {
+                    doResampleToMatch<PointSampler>(sourceCutter, instance, nullInterrupter);
+                }
+            } else {
+                if (rotated) {
+                    doResampleToMatch<BoxSampler>(sourceCutter, instance, *mInterrupter);
+                } else {
+                    doResampleToMatch<PointSampler>(sourceCutter, instance, *mInterrupter);
+                }
+            }
+
+            if (wasInterrupted(percent)) break;
+
+            if (cutterOverlap && !mFragments.empty()) process(mFragments, instance);
+            process(grids, instance);
+        }
+    }
+
+    if (segmentation) {
+        segmentFragments(mFragments);
+        segmentFragments(grids);
+    }
+}
+
+
+/// @brief Decide whether @a grid is worth keeping as a fragment.
+///
+/// @details Grids with nine or more leaf nodes are always accepted. Smaller
+/// grids are rejected when they have fewer than 27 active voxels, or when the
+/// smallest and largest values in their leaf buffers are both negative or both
+/// non-negative (no sign change).
+template<class GridType, class InterruptType>
+bool
+LevelSetFracture<GridType, InterruptType>::isValidFragment(GridType& grid) const
+{
+    typedef typename GridType::TreeType::LeafNodeType LeafNodeType;
+    typedef std::vector<const LeafNodeType*> LeafPtrVec;
+
+    // Large enough to be accepted without looking at the values.
+    if (!(grid.tree().leafCount() < 9)) return true;
+
+    LeafPtrVec leaves;
+    grid.tree().getNodes(leaves);
+
+    Index64 onVoxels = 0;
+    for (typename LeafPtrVec::const_iterator it = leaves.begin(); it != leaves.end(); ++it) {
+        onVoxels += (*it)->onVoxelCount();
+    }
+
+    if (onVoxels < 27) return false;
+
+    level_set_fracture_internal::FindMinMaxVoxelValue<LeafNodeType> extrema(leaves);
+    tbb::parallel_reduce(tbb::blocked_range<size_t>(0, leaves.size()), extrema);
+
+    // Valid only if the values change sign somewhere.
+    const bool minIsNegative = extrema.minValue < 0;
+    const bool maxIsNegative = extrema.maxValue < 0;
+    return minIsNegative != maxIsNegative;
+}
+
+
+/// @brief Replace every grid in @a grids by the segments that segmentSDF()
+/// produces for it, keeping the order of the input grids.
+template<class GridType, class InterruptType>
+void
+LevelSetFracture<GridType, InterruptType>::segmentFragments(GridPtrList& grids) const
+{
+    GridPtrList segmented;
+
+    for (GridPtrListIter gridIt = grids.begin(); gridIt != grids.end(); ++gridIt) {
+
+        std::vector<typename GridType::Ptr> pieces;
+        segmentSDF(**gridIt, pieces);
+
+        // Append all pieces of this grid in the order they were returned.
+        segmented.insert(segmented.end(), pieces.begin(), pieces.end());
+    }
+
+    grids.swap(segmented);
+}
+
+
+/// @brief Cut each grid in @a grids with @a cutter.
+///
+/// @details For every grid the intersection with the cutter (the fragment) and
+/// the difference (the residual) are computed. Only when both pass
+/// isValidFragment() is the fragment recorded and the grid's tree replaced by
+/// the residual's tree; otherwise the grid is left unchanged. Recorded fragments
+/// are appended to mFragments after the loop, which also happens when the loop
+/// was cut short by an interruption.
+template<class GridType, class InterruptType>
+void
+LevelSetFracture<GridType, InterruptType>::process(
+    GridPtrList& grids, const GridType& cutter)
+{
+    typedef typename GridType::Ptr GridPtr;
+    GridPtrList produced;
+
+    for (GridPtrListIter gridIt = grids.begin(); gridIt != grids.end(); ++gridIt) {
+
+        if (wasInterrupted()) break;
+
+        GridPtr& source = *gridIt;
+
+        GridPtr inside = csgIntersectionCopy(*source, cutter);
+        if (isValidFragment(*inside)) {
+
+            // The residual is only computed once the fragment is known to be valid.
+            GridPtr outside = csgDifferenceCopy(*source, cutter);
+            if (isValidFragment(*outside)) {
+
+                produced.push_back(inside);
+
+                source->tree().clear();
+                source->tree().merge(outside->tree());
+            }
+        }
+    }
+
+    if (!produced.empty()) {
+        mFragments.splice(mFragments.end(), produced);
+    }
+}
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_LEVELSETFRACTURE_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/LevelSetMeasure.h b/nuparu/include/openvdb_new/tools/LevelSetMeasure.h
new file mode 100644
index 00000000..f7880182
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/LevelSetMeasure.h
@@ -0,0 +1,567 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @author Ken Museth
+///
+/// @file LevelSetMeasure.h
+
+#ifndef OPENVDB_TOOLS_LEVELSETMEASURE_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_LEVELSETMEASURE_HAS_BEEN_INCLUDED
+
+#include <tbb/parallel_for.h>
+#include <tbb/parallel_sort.h>
+#include <boost/bind.hpp>
+#include <boost/function.hpp>
+#include <boost/type_traits/is_floating_point.hpp>
+#include <boost/utility/enable_if.hpp>
+#include <boost/math/constants/constants.hpp>//for Pi
+#include <openvdb/math/Math.h>
+#include <openvdb/Types.h>
+#include <openvdb/Grid.h>
+#include <openvdb/tree/LeafManager.h>
+#include <openvdb/tree/ValueAccessor.h>
+#include <openvdb/math/FiniteDifference.h>
+#include <openvdb/math/Operators.h>
+#include <openvdb/util/NullInterrupter.h>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+/// @brief Return the surface area of a narrow-band level set.
+///
+/// @param grid          a scalar, floating-point level set grid with uniform
+///                      voxels, representing one or more disjoint, closed surfaces
+/// @param useWorldSpace if true the area is computed in
+///                      world space units, else in voxel units.
+///
+/// @throw TypeError if @a grid is not a scalar, floating-point grid.
+/// @throw RuntimeError if @a grid is not a level set or has non-uniform voxels.
+template<class GridType>
+inline Real
+levelSetArea(const GridType& grid, bool useWorldSpace = true);
+
+/// @brief Return the volume of a narrow-band level set surface.
+///
+/// @param grid          a scalar, floating-point level set grid with uniform
+///                      voxels, representing one or more disjoint, closed surfaces
+/// @param useWorldSpace if true the volume is computed in
+///                      world space units, else in voxel units.
+///
+/// @throw TypeError if @a grid is not a scalar, floating-point grid.
+/// @throw RuntimeError if @a grid is not a level set or has non-uniform voxels.
+template<class GridType>
+inline Real
+levelSetVolume(const GridType& grid, bool useWorldSpace = true);
+
+/// @brief Compute the surface area and volume of a narrow-band level set.
+///
+/// @param grid          a scalar, floating-point level set grid with uniform
+///                      voxels, representing one or more disjoint, closed surfaces
+/// @param area          surface area of the level set
+/// @param volume        volume of the level set surface
+/// @param useWorldSpace if true the area and volume are computed in
+///                      world space units, else in voxel units.
+///
+/// @throw TypeError if @a grid is not a scalar, floating-point grid.
+/// @throw RuntimeError if @a grid is not a level set or has non-uniform voxels.
+template<class GridType>
+inline void
+levelSetMeasure(const GridType& grid, Real& area, Real& volume, bool useWorldSpace = true);
+
+/// @brief Compute the surface area, volume and average mean curvature of a
+/// narrow-band level set.
+///
+/// @param grid          a scalar, floating-point level set grid with uniform
+///                      voxels, representing one or more disjoint, closed surfaces
+/// @param area          surface area of the level set
+/// @param volume        volume of the level set surface
+/// @param avgCurvature  average mean curvature of the level set surface
+/// @param useWorldSpace if true the area, volume and curvature are computed in
+///                      world space units, else in voxel units.
+///
+/// @throw TypeError if @a grid is not a scalar, floating-point grid.
+/// @throw RuntimeError if @a grid is not a level set or has non-uniform voxels.
+template<class GridType>
+inline void
+levelSetMeasure(const GridType& grid, Real& area, Real& volume, Real& avgCurvature,
+                bool useWorldSpace = true);
+
+/// @brief Smeared-out and continuous Dirac Delta function.
+///
+/// @details For |phi| <= eps the value is (1 + cos(pi * phi / eps)) / (2 * eps);
+/// outside that interval it is zero.
+template<typename RealT>
+class DiracDelta
+{
+public:
+    /// @param eps  half-width of the interval on which the function is non-zero
+    DiracDelta(RealT eps)
+        : mC(0.5/eps)
+        , mD(2*boost::math::constants::pi<RealT>()*mC) // uses mC, which is declared first
+        , mE(eps)
+    {
+    }
+
+    /// Evaluate the smeared delta function at @a phi.
+    inline RealT operator()(RealT phi) const
+    {
+        if (math::Abs(phi) > mE) return 0;
+        return mC*(1+cos(mD*phi));
+    }
+
+private:
+    const RealT mC, mD, mE;
+};
+
+
+/// @brief Multi-threaded computation of surface area, volume and
+/// average mean-curvature for narrow band level sets.
+///
+/// @details To reduce the risk of round-off errors (primarily due to
+/// catastrophic cancellation) and guarantee determinism during
+/// multi-threading this class is implemented using parallel_for, and
+/// delayed reduction of a sorted list.
+///
+/// @note The class can only be instantiated for grids with a floating-point
+/// value type (enforced by a static assertion).
+template<typename GridT,
+         typename InterruptT = util::NullInterrupter>
+class LevelSetMeasure
+{
+public:
+    typedef GridT                                GridType;
+    typedef typename GridType::TreeType          TreeType;
+    typedef typename TreeType::ValueType         ValueType;
+    typedef typename tree::LeafManager<const TreeType> ManagerType;
+    BOOST_STATIC_ASSERT(boost::is_floating_point<ValueType>::value);
+
+    /// @brief Main constructor from a grid
+    /// @param grid The level set to be measured.
+    /// @param interrupt Optional interrupter.
+    /// @throw RuntimeError if the grid is not a level set or if its
+    /// voxels are not uniform.
+    LevelSetMeasure(const GridType& grid, InterruptT* interrupt = NULL);
+
+    /// @brief Constructor from an existing LeafManager and voxel size.
+    /// @note Only the address of @a leafs is stored, so it has to outlive
+    /// this object (or the next call to reinit). No validation is performed.
+    LevelSetMeasure(ManagerType& leafs, Real Dx, InterruptT* interrupt);
+
+    /// @brief Re-initialize using the specified grid.
+    /// @throw RuntimeError if the grid is not a level set or if its
+    /// voxels are not uniform.
+    void reinit(const GridType& grid);
+
+    /// @brief Re-initialize using the specified LeafManager and voxelSize.
+    void reinit(ManagerType& leafs, Real dx);
+
+    /// @brief Destructor
+    virtual ~LevelSetMeasure() {}
+
+    /// @return the grain-size used for multi-threading
+    int getGrainSize() const { return mGrainSize; }
+
+    /// @brief Set the grain-size used for multi-threading.
+    /// @note A grain size of 0 or less disables multi-threading!
+    void setGrainSize(int grainsize) { mGrainSize = grainsize; }
+
+    /// @brief Compute the surface area and volume of the level
+    /// set. Use the last argument to specify the result in world or
+    /// voxel units.
+    /// @note This method is faster (about 3x) than the measure method
+    /// below that also computes the average mean-curvature.
+    void measure(Real& area, Real& volume, bool useWorldUnits = true);
+
+    /// @brief Compute the surface area, volume, and average
+    /// mean-curvature of the level set. Use the last argument to
+    /// specify the result in world or voxel units.
+    /// @note This method is slower (about 3x) than the measure method
+    /// above that only computes the area and volume.
+    void measure(Real& area, Real& volume, Real& avgMeanCurvature, bool useWorldUnits = true);
+
+private:
+    // disallow copy construction and copy by assignment!
+    LevelSetMeasure(const LevelSetMeasure&);// not implemented
+    LevelSetMeasure& operator=(const LevelSetMeasure&);// not implemented
+
+    const TreeType* mTree;        // tree being measured
+    ManagerType*    mLeafs;       // caller-supplied manager, or NULL when constructed from a grid
+    InterruptT*     mInterrupter; // optional, may be NULL
+    double          mDx;          // voxel size
+    double*         mArray;       // scratch array of per-leaf partial sums, allocated by measure()
+    int             mGrainSize;   // grain size for parallel_for; <= 0 selects serial execution
+
+    // @brief Return false if the process was interrupted
+    bool checkInterrupter();
+
+    typedef typename TreeType::LeafNodeType  LeafT;
+    typedef typename LeafT::ValueOnCIter     VoxelCIterT;
+    typedef typename ManagerType::LeafRange  LeafRange;
+    typedef typename LeafRange::Iterator     LeafIterT;
+
+    // Functor that fills mArray with per-leaf area and volume sums.
+    // Constructing it from the parent immediately runs it over all leaf
+    // nodes, in parallel if the grain size is positive and serially otherwise.
+    struct Measure2
+    {
+        Measure2(LevelSetMeasure* parent) : mParent(parent), mAcc(*mParent->mTree)
+        {
+            if (parent->mGrainSize>0) {
+                tbb::parallel_for(parent->mLeafs->leafRange(parent->mGrainSize), *this);
+            } else {
+                (*this)(parent->mLeafs->leafRange());
+            }
+        }
+        // Copies share the parent but get a fresh accessor of their own.
+        Measure2(const Measure2& other) : mParent(other.mParent), mAcc(*mParent->mTree) {}
+        void operator()(const LeafRange& range) const;
+        LevelSetMeasure* mParent;
+        typename GridT::ConstAccessor mAcc;
+    };
+    // Same as Measure2, but additionally fills in per-leaf curvature sums.
+    struct Measure3
+    {
+        Measure3(LevelSetMeasure* parent) : mParent(parent), mAcc(*mParent->mTree)
+        {
+            if (parent->mGrainSize>0) {
+                tbb::parallel_for(parent->mLeafs->leafRange(parent->mGrainSize), *this);
+            } else {
+                (*this)(parent->mLeafs->leafRange());
+            }
+        }
+        // Copies share the parent but get a fresh accessor of their own.
+        Measure3(const Measure3& other) : mParent(other.mParent), mAcc(*mParent->mTree) {}
+        void operator()(const LeafRange& range) const;
+        LevelSetMeasure* mParent;
+        typename GridT::ConstAccessor mAcc;
+    };
+    // Sort the leafCount() entries starting at first (in place), add them up
+    // sequentially in sorted order and return the sum multiplied by scale.
+    inline double reduce(double* first, double scale)
+    {
+        double* last = first + mLeafs->leafCount();
+        tbb::parallel_sort(first, last);//reduces catastrophic cancellation
+        Real sum = 0.0;
+        while(first != last) sum += *first++;
+        return scale * sum;
+    }
+
+}; // end of LevelSetMeasure class
+
+
+/// @brief Construct from a grid.
+///
+/// @details The members are given neutral defaults (grain size 1, no scratch
+/// array) and the grid-dependent state is then set up, and validated, by
+/// reinit().
+///
+/// @throw RuntimeError if the grid has non-uniform voxels or is not a level set.
+template<typename GridT, typename InterruptT>
+inline
+LevelSetMeasure<GridT, InterruptT>::LevelSetMeasure(const GridType& grid, InterruptT* interrupt)
+    : mTree(NULL)
+    , mLeafs(NULL)
+    , mInterrupter(interrupt)
+    , mDx(1.0)
+    , mArray(NULL)
+    , mGrainSize(1)
+{
+    // Performs the same checks and assignments the constructor needs:
+    // mTree, mLeafs (NULL) and mDx are all taken from the grid.
+    this->reinit(grid);
+}
+
+
+/// @brief Construct from an existing LeafManager and an explicit voxel size.
+/// @note No validation is performed; only the address of @a leafs is stored.
+template<typename GridT, typename InterruptT>
+inline
+LevelSetMeasure<GridT, InterruptT>::LevelSetMeasure(
+    ManagerType& leafs, Real dx, InterruptT* interrupt):
+    mTree(&(leafs.tree())),
+    mLeafs(&leafs),
+    mInterrupter(interrupt),
+    mDx(dx),
+    mArray(NULL),
+    mGrainSize(1)
+{
+}
+
+/// @brief Point this object at a different grid.
+///
+/// @details After validation the tree and voxel size are taken from @a grid and
+/// any previously set LeafManager pointer is reset to NULL. Nothing is modified
+/// if validation fails.
+///
+/// @throw RuntimeError if the grid has non-uniform voxels or is not a level set.
+template<typename GridT, typename InterruptT>
+inline void
+LevelSetMeasure<GridT, InterruptT>::reinit(const GridType& grid)
+{
+    const bool uniformVoxels = grid.hasUniformVoxels();
+    if (!uniformVoxels) {
+         OPENVDB_THROW(RuntimeError,
+             "The transform must have uniform scale for the LevelSetMeasure to function");
+    }
+
+    const bool isLevelSet = (grid.getGridClass() == GRID_LEVEL_SET);
+    if (!isLevelSet) {
+        OPENVDB_THROW(RuntimeError,
+            "LevelSetMeasure only supports level sets;"
+            " try setting the grid class to \"level set\"");
+    }
+
+    mDx = grid.voxelSize()[0];
+    mLeafs = NULL;
+    mTree = &(grid.tree());
+}
+
+
+/// @brief Point this object at a different LeafManager and voxel size.
+/// @note No validation is performed; only the address of @a leafs is stored.
+template<typename GridT, typename InterruptT>
+inline void
+LevelSetMeasure<GridT, InterruptT>::reinit(ManagerType& leafs, Real dx)
+{
+    mDx = dx;
+    mTree = &(leafs.tree());
+    mLeafs = &leafs;
+}
+
+////////////////////////////////////////
+
+
+/// @brief Compute the surface area and volume of the level set.
+///
+/// @details If no LeafManager was supplied, a temporary one is built from the
+/// tree and released again before returning. Per-leaf partial sums are written
+/// into a scratch array by Measure2 and then reduced.
+///
+/// @param area           set to the surface area (0 if the tree has no leaf nodes)
+/// @param volume         set to the volume (0 if the tree has no leaf nodes)
+/// @param useWorldUnits  if true the sums are scaled by powers of the voxel
+///                       size, otherwise the results are in voxel units
+template<typename GridT, typename InterruptT>
+inline void
+LevelSetMeasure<GridT, InterruptT>::measure(Real& area, Real& volume, bool useWorldUnits)
+{
+    if (mInterrupter) mInterrupter->start("Measuring level set");
+
+    const bool newLeafs = mLeafs == NULL;
+    if (newLeafs) mLeafs = new ManagerType(*mTree);
+    const size_t leafCount = mLeafs->leafCount();
+
+    if (leafCount == 0) {
+        // Nothing to measure. Do NOT return from here: the temporary manager
+        // still has to be released and the interrupter still has to be ended.
+        area = volume = 0;
+    } else {
+        // Layout of the scratch array: [area sums | volume sums]
+        mArray = new double[2*leafCount];
+
+        Measure2 m(this); // the constructor runs the functor over all leaf nodes
+
+        const double dx = useWorldUnits ? mDx : 1.0;
+        area = this->reduce(mArray, math::Pow2(dx));
+        volume = this->reduce(mArray + leafCount, math::Pow3(dx) / 3.0);
+
+        delete [] mArray;
+        mArray = NULL; // don't keep a dangling pointer around
+    }
+
+    if (newLeafs) {
+        delete mLeafs;
+        mLeafs = NULL;
+    }
+
+    if (mInterrupter) mInterrupter->end();
+}
+
+
+/// @brief Compute the surface area, volume and average mean curvature of the
+/// level set.
+///
+/// @details If no LeafManager was supplied, a temporary one is built from the
+/// tree and released again before returning. Per-leaf partial sums are written
+/// into a scratch array by Measure3 and then reduced.
+///
+/// @param area              set to the surface area (0 if the tree has no leaf nodes)
+/// @param volume            set to the volume (0 if the tree has no leaf nodes)
+/// @param avgMeanCurvature  set to the curvature sum normalized by the area
+///                          (0 if the tree has no leaf nodes or the area is zero)
+/// @param useWorldUnits     if true the sums are scaled by powers of the voxel
+///                          size, otherwise the results are in voxel units
+template<typename GridT, typename InterruptT>
+inline void
+LevelSetMeasure<GridT, InterruptT>::measure(Real& area, Real& volume,
+                                            Real& avgMeanCurvature,
+                                            bool useWorldUnits)
+{
+    if (mInterrupter) mInterrupter->start("Measuring level set");
+
+    const bool newLeafs = mLeafs == NULL;
+    if (newLeafs) mLeafs = new ManagerType(*mTree);
+    const size_t leafCount = mLeafs->leafCount();
+
+    if (leafCount == 0) {
+        // Nothing to measure. Do NOT return from here: the temporary manager
+        // still has to be released and the interrupter still has to be ended.
+        area = volume = avgMeanCurvature = 0;
+    } else {
+        // Layout of the scratch array: [area sums | volume sums | curvature sums]
+        mArray = new double[3*leafCount];
+
+        Measure3 m(this); // the constructor runs the functor over all leaf nodes
+
+        const double dx = useWorldUnits ? mDx : 1.0;
+        area = this->reduce(mArray, math::Pow2(dx));
+        volume = this->reduce(mArray + leafCount, math::Pow3(dx) / 3.0);
+        if (area == 0) {
+            // Without this guard the scale dx/area is infinite and the result
+            // NaN; report zero, as is done for a tree without leaf nodes.
+            avgMeanCurvature = 0;
+        } else {
+            avgMeanCurvature = this->reduce(mArray + 2*leafCount, dx/area);
+        }
+
+        delete [] mArray;
+        mArray = NULL; // don't keep a dangling pointer around
+    }
+
+    if (newLeafs) {
+        delete mLeafs;
+        mLeafs = NULL;
+    }
+
+    if (mInterrupter) mInterrupter->end();
+}
+
+
+///////////////////////// PRIVATE METHODS //////////////////////
+
+
+/// @brief Poll the interrupter and cancel the current TBB task group if an
+/// interruption was requested.
+/// @return false if the process was interrupted, true otherwise
+template<typename GridT, typename InterruptT>
+inline bool
+LevelSetMeasure<GridT, InterruptT>::checkInterrupter()
+{
+    const bool interrupted = util::wasInterrupted(mInterrupter);
+    if (interrupted) tbb::task::self().cancel_group_execution();
+    return !interrupted;
+}
+
+/// @brief Accumulate the area and volume contributions of the leaf nodes in
+/// @a range.
+///
+/// @details For each leaf the contributions of its active voxels are summed
+/// locally and stored in the parent's scratch array: the area sum at the leaf's
+/// position and the volume sum one leaf count further on. Only voxels at which
+/// the smeared delta function (half-width 1.5, evaluated at value / voxel size)
+/// is positive contribute.
+template<typename GridT, typename InterruptT>
+inline void
+LevelSetMeasure<GridT, InterruptT>::
+Measure2::operator()(const LeafRange& range) const
+{
+    typedef math::Vec3<ValueType> Vec3T;
+    typedef math::ISGradient<math::CD_2ND> Grad;
+    mParent->checkInterrupter();
+    const DiracDelta<Real> delta(1.5);
+    const Real invDx = 1.0/mParent->mDx;
+    // Offset between the area section and the volume section of the array.
+    const size_t stride = mParent->mLeafs->leafCount();
+    for (LeafIterT leaf = range.begin(); leaf; ++leaf) {
+        Real sumA = 0, sumV = 0;//reduce risk of catastrophic cancellation
+        for (VoxelCIterT voxel = leaf->cbeginValueOn(); voxel; ++voxel) {
+            const Real dd = delta(invDx * (*voxel));
+            if (!(dd > 0.0)) continue; // no contribution from this voxel
+            const Coord p = voxel.getCoord();
+            const Vec3T g = invDx*Grad::result(mAcc, p);//voxel units
+            sumA += dd * g.dot(g);
+            sumV += dd * (g[0]*p[0]+g[1]*p[1]+g[2]*p[2]);
+        }
+        double* const slot = mParent->mArray + leaf.pos();
+        slot[0] = sumA;
+        slot[stride] = sumV;
+    }
+}
+
+/// @brief Accumulate the area, volume and curvature contributions of the leaf
+/// nodes in @a range.
+///
+/// @details For each leaf the contributions of its active voxels are summed
+/// locally and stored in the parent's scratch array: the area sum at the leaf's
+/// position, the volume sum one leaf count further on and the curvature sum two
+/// leaf counts further on. Only voxels at which the smeared delta function
+/// (half-width 1.5, evaluated at value / voxel size) is positive contribute.
+template<typename GridT, typename InterruptT>
+inline void
+LevelSetMeasure<GridT, InterruptT>::
+Measure3::operator()(const LeafRange& range) const
+{
+    typedef math::Vec3<ValueType> Vec3T;
+    typedef math::ISGradient<math::CD_2ND> Grad;
+    typedef math::ISMeanCurvature<math::CD_SECOND, math::CD_2ND> Curv;
+    mParent->checkInterrupter();
+    const DiracDelta<Real> delta(1.5);
+    const Real invDx = 1.0/mParent->mDx;
+    ValueType alpha, beta; // outputs of Curv::result()
+    // Offset between consecutive sections (area, volume, curvature) of the array.
+    const size_t stride = mParent->mLeafs->leafCount();
+    for (LeafIterT leaf = range.begin(); leaf; ++leaf) {
+        Real sumA = 0, sumV = 0, sumC = 0;//reduce risk of catastrophic cancellation
+        for (VoxelCIterT voxel = leaf->cbeginValueOn(); voxel; ++voxel) {
+            const Real dd = delta(invDx * (*voxel));
+            if (!(dd > 0.0)) continue; // no contribution from this voxel
+            const Coord p = voxel.getCoord();
+            const Vec3T g = invDx*Grad::result(mAcc, p);//voxel units
+            const Real dA = dd * g.dot(g);
+            sumA += dA;
+            sumV += dd * (g[0]*p[0]+g[1]*p[1]+g[2]*p[2]);
+            Curv::result(mAcc, p, alpha, beta);
+            sumC += dA * alpha/(2*math::Pow2(beta))*invDx;
+        }
+        double* const slot = mParent->mArray + leaf.pos();
+        slot[0] = sumA;
+        slot[stride] = sumV;
+        slot[2*stride] = sumC;
+    }
+}
+
+////////////////////////////////////////
+
+/// Implementation of levelSetArea() for grids with a floating-point value type:
+/// measures area and volume and returns the area.
+template<class GridT>
+inline typename boost::enable_if<boost::is_floating_point<typename GridT::ValueType>, Real>::type
+doLevelSetArea(const GridT& grid, bool useWorldSpace)
+{
+    Real surfaceArea, unusedVolume;
+    LevelSetMeasure<GridT> measurer(grid);
+    measurer.measure(surfaceArea, unusedVolume, useWorldSpace);
+    return surfaceArea;
+}
+
+/// Fallback for all other value types: always throws TypeError.
+template<class GridT>
+inline typename boost::disable_if<boost::is_floating_point<typename GridT::ValueType>, Real>::type
+doLevelSetArea(const GridT&, bool)
+{
+    OPENVDB_THROW(TypeError,
+        "level set area is supported only for scalar, floating-point grids");
+}
+
+/// Public entry point; dispatches on the value type of the grid.
+template<class GridT>
+inline Real
+levelSetArea(const GridT& grid, bool useWorldSpace)
+{
+    const Real surfaceArea = doLevelSetArea<GridT>(grid, useWorldSpace);
+    return surfaceArea;
+}
+
+////////////////////////////////////////
+
+/// Implementation of levelSetVolume() for grids with a floating-point value
+/// type: measures area and volume and returns the volume.
+template<class GridT>
+inline typename boost::enable_if<boost::is_floating_point<typename GridT::ValueType>, Real>::type
+doLevelSetVolume(const GridT& grid, bool useWorldSpace)
+{
+    Real unusedArea, enclosedVolume;
+    LevelSetMeasure<GridT> measurer(grid);
+    measurer.measure(unusedArea, enclosedVolume, useWorldSpace);
+    return enclosedVolume;
+}
+
+/// Fallback for all other value types: always throws TypeError.
+template<class GridT>
+inline typename boost::disable_if<boost::is_floating_point<typename GridT::ValueType>, Real>::type
+doLevelSetVolume(const GridT&, bool)
+{
+    OPENVDB_THROW(TypeError,
+        "level set volume is supported only for scalar, floating-point grids");
+}
+
+/// Public entry point; dispatches on the value type of the grid.
+template<class GridT>
+inline Real
+levelSetVolume(const GridT& grid, bool useWorldSpace)
+{
+    const Real enclosedVolume = doLevelSetVolume<GridT>(grid, useWorldSpace);
+    return enclosedVolume;
+}
+
+////////////////////////////////////////
+
+/// Implementation of levelSetMeasure() (area and volume) for grids with a
+/// floating-point value type.
+template<class GridT>
+inline typename boost::enable_if<boost::is_floating_point<typename GridT::ValueType> >::type
+doLevelSetMeasure(const GridT& grid, Real& area, Real& volume, bool useWorldSpace)
+{
+    LevelSetMeasure<GridT> measurer(grid);
+    measurer.measure(area, volume, useWorldSpace);
+}
+
+/// Fallback for all other value types: always throws TypeError and leaves the
+/// output arguments untouched.
+template<class GridT>
+inline typename boost::disable_if<boost::is_floating_point<typename GridT::ValueType> >::type
+doLevelSetMeasure(const GridT&, Real&, Real&, bool)
+{
+    OPENVDB_THROW(TypeError,
+        "level set measure is supported only for scalar, floating-point grids");
+}
+
+/// Public entry point; dispatches on the value type of the grid.
+template<class GridT>
+inline void
+levelSetMeasure(const GridT& grid, Real& area, Real& volume, bool useWorldSpace)
+{
+    doLevelSetMeasure<GridT>(
+        grid, area, volume, useWorldSpace);
+}
+
+////////////////////////////////////////
+
+/// Implementation of levelSetMeasure() (area, volume and average mean
+/// curvature) for grids with a floating-point value type.
+template<class GridT>
+inline typename boost::enable_if<boost::is_floating_point<typename GridT::ValueType> >::type
+doLevelSetMeasure(const GridT& grid, Real& area, Real& volume, Real& avgCurvature,
+                  bool useWorldSpace)
+{
+    LevelSetMeasure<GridT> measurer(grid);
+    measurer.measure(area, volume, avgCurvature, useWorldSpace);
+}
+
+/// Fallback for all other value types: always throws TypeError and leaves the
+/// output arguments untouched.
+template<class GridT>
+inline typename boost::disable_if<boost::is_floating_point<typename GridT::ValueType> >::type
+doLevelSetMeasure(const GridT&, Real&, Real&, Real&, bool)
+{
+    OPENVDB_THROW(TypeError,
+        "level set measure is supported only for scalar, floating-point grids");
+}
+
+/// Public entry point; dispatches on the value type of the grid.
+template<class GridT>
+inline void
+levelSetMeasure(const GridT& grid, Real& area, Real& volume, Real& avgCurvature, bool useWorldSpace)
+{
+    doLevelSetMeasure<GridT>(
+        grid, area, volume, avgCurvature, useWorldSpace);
+}
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_LEVELSETMEASURE_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/LevelSetMorph.h b/nuparu/include/openvdb_new/tools/LevelSetMorph.h
new file mode 100644
index 00000000..618d5bea
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/LevelSetMorph.h
@@ -0,0 +1,664 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @author Ken Museth
+///
+/// @file LevelSetMorph.h
+///
+/// @brief Shape morphology of level sets. Morphing from a source
+/// narrow-band level set to a target narrow-band level set.
+
+#ifndef OPENVDB_TOOLS_LEVEL_SET_MORPH_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_LEVEL_SET_MORPH_HAS_BEEN_INCLUDED
+
+#include "LevelSetTracker.h"
+#include "Interpolation.h" // for BoxSampler, etc.
+#include <openvdb/math/FiniteDifference.h>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+
+/// @brief Shape morphology of level sets. Morphing from a source
+/// narrow-band level set to a target narrow-band level set.
+///
+/// @details
+/// The @c InterruptType template argument below refers to any class
+/// with the following interface:
+/// @code
+/// class Interrupter {
+///   ...
+/// public:
+///   void start(const char* name = NULL)// called when computations begin
+///   void end()                         // called when computations end
+///   bool wasInterrupted(int percent=-1)// return true to break computation
+/// };
+/// @endcode
+///
+/// @note If no template argument is provided for this InterruptType,
+/// the util::NullInterrupter is used, which implies that all interrupter
+/// calls are no-ops (i.e., they incur no computational overhead).
+template<typename GridT,
+         typename InterruptT = util::NullInterrupter>
+class LevelSetMorphing
+{
+public:
+    typedef GridT                              GridType;
+    typedef typename GridT::TreeType           TreeType;
+    typedef LevelSetTracker<GridT, InterruptT> TrackerT;
+    typedef typename TrackerT::LeafRange       LeafRange;
+    typedef typename TrackerT::LeafType        LeafType;
+    typedef typename TrackerT::BufferType      BufferType;
+    typedef typename TrackerT::ValueType       ValueType;
+
+    /// Main constructor: HJWENO5_BIAS / TVD_RK2 schemes, no alpha mask, mask range [0,1], mask not inverted
+    LevelSetMorphing(GridT& sourceGrid,
+                     const GridT& targetGrid,
+                     InterruptT* interrupt = NULL)
+        : mTracker(sourceGrid, interrupt)
+        , mTarget(&targetGrid)
+        , mMask(NULL)
+        , mSpatialScheme(math::HJWENO5_BIAS)
+        , mTemporalScheme(math::TVD_RK2)
+        , mMinMask(0)
+        , mDeltaMask(1)
+        , mInvertMask(false)
+    {
+    }
+
+    virtual ~LevelSetMorphing() {}
+
+    /// Redefine the target level set (only a pointer to @a targetGrid is stored)
+    void setTarget(const GridT& targetGrid) { mTarget = &targetGrid; }
+
+    /// Define the alpha mask (only a pointer to @a maskGrid is stored)
+    void setAlphaMask(const GridT& maskGrid) { mMask = &maskGrid; }
+
+    /// Return the spatial finite-difference scheme
+    math::BiasedGradientScheme getSpatialScheme() const { return mSpatialScheme; }
+    /// Set the spatial finite-difference scheme
+    void setSpatialScheme(math::BiasedGradientScheme scheme) { mSpatialScheme = scheme; }
+
+    /// Return the temporal integration scheme
+    math::TemporalIntegrationScheme getTemporalScheme() const { return mTemporalScheme; }
+    /// Set the temporal integration scheme
+    void setTemporalScheme(math::TemporalIntegrationScheme scheme) { mTemporalScheme = scheme; }
+
+    /// Return the spatial finite-difference scheme of the internal tracker
+    math::BiasedGradientScheme getTrackerSpatialScheme() const
+    {
+        return mTracker.getSpatialScheme();
+    }
+    /// Set the spatial finite-difference scheme of the internal tracker
+    void setTrackerSpatialScheme(math::BiasedGradientScheme scheme)
+    {
+        mTracker.setSpatialScheme(scheme);
+    }
+    /// Return the temporal integration scheme of the internal tracker
+    math::TemporalIntegrationScheme getTrackerTemporalScheme() const
+    {
+        return mTracker.getTemporalScheme();
+    }
+    /// Set the temporal integration scheme of the internal tracker
+    void setTrackerTemporalScheme(math::TemporalIntegrationScheme scheme)
+    {
+        mTracker.setTemporalScheme(scheme);
+    }
+    /// Return the number of normalizations performed per track or normalize call.
+    int  getNormCount() const { return mTracker.getNormCount(); }
+    /// Set the number of normalizations performed per track or normalize call.
+    void setNormCount(int n) { mTracker.setNormCount(n); }
+
+    /// Return the grain size used for multithreading
+    int  getGrainSize() const { return mTracker.getGrainSize(); }
+    /// @brief Set the grain size used for multithreading.
+    /// @note A grain size of 0 or less disables multithreading!
+    void setGrainSize(int grainsize) { mTracker.setGrainSize(grainsize); }
+
+    /// @brief Return the minimum value of the mask to be used for the
+    /// derivation of a smooth alpha value.
+    ValueType minMask() const { return mMinMask; }
+
+    /// @brief Return the maximum value of the mask to be used for the
+    /// derivation of a smooth alpha value.
+    ValueType maxMask() const { return mDeltaMask + mMinMask; }
+
+    /// @brief Define the range for the (optional) scalar mask.
+    /// @param min Minimum value of the range.
+    /// @param max Maximum value of the range.
+    /// @details Mask values outside the range map to alpha values of
+    /// respectively zero and one, and values inside the range map
+    /// smoothly to 0->1 (unless of course the mask is inverted).
+    /// @throw ValueError if @a min is not smaller than @a max.
+    void setMaskRange(ValueType min, ValueType max)
+    {
+        if (!(min < max)) OPENVDB_THROW(ValueError, "Invalid mask range (expects min < max)");
+        mMinMask   = min;
+        mDeltaMask = max-min;
+    }
+
+    /// @brief Return true if the mask is inverted, i.e. min->max in the
+    /// original mask maps to 1->0 in the inverted alpha mask.
+    bool isMaskInverted() const { return mInvertMask; }
+    /// @brief Invert the optional mask, i.e. min->max in the original
+    /// mask maps to 1->0 in the inverted alpha mask.
+    void invertMask(bool invert=true) { mInvertMask = invert; }
+
+    /// @brief Advect the level set from its current time, @a time0, to its
+    /// final time, @a time1. No step is taken unless @a time0 < @a time1
+    /// (the time loop in Morph::advect only runs while @a time0 < @a time1).
+    /// @return the number of CFL iterations used to advect from @a time0 to @a time1
+    size_t advect(ValueType time0, ValueType time1);
+
+private:
+
+    // disallow copy construction and copy by assignment!
+    LevelSetMorphing(const LevelSetMorphing&);// not implemented
+    LevelSetMorphing& operator=(const LevelSetMorphing&);// not implemented
+
+    template<math::BiasedGradientScheme SpatialScheme>
+    size_t advect1(ValueType time0, ValueType time1); // dispatches on mTemporalScheme
+
+    template<math::BiasedGradientScheme SpatialScheme,
+             math::TemporalIntegrationScheme TemporalScheme>
+    size_t advect2(ValueType time0, ValueType time1); // dispatches on the map type of the source grid's transform
+
+    template<math::BiasedGradientScheme SpatialScheme,
+             math::TemporalIntegrationScheme TemporalScheme,
+             typename MapType>
+    size_t advect3(ValueType time0, ValueType time1); // builds the Morph worker and runs it
+
+    TrackerT                        mTracker; // constructed from the source grid and the interrupter
+    const GridT                    *mTarget, *mMask; // plain pointers, never deleted here; mMask is NULL until setAlphaMask()
+    math::BiasedGradientScheme      mSpatialScheme;
+    math::TemporalIntegrationScheme mTemporalScheme;
+    ValueType                       mMinMask, mDeltaMask; // mask range stored as min and (max - min)
+    bool                            mInvertMask;
+
+    // This templated private class implements all the level set magic.
+    template<typename MapT, math::BiasedGradientScheme SpatialScheme,
+             math::TemporalIntegrationScheme TemporalScheme>
+    struct Morph
+    {
+        /// Main constructor
+        Morph(LevelSetMorphing<GridT, InterruptT>& parent);
+        /// Shallow copy constructor called by tbb::parallel_for() threads
+        Morph(const Morph& other);
+        /// Shallow copy constructor called by tbb::parallel_reduce() threads
+        Morph(Morph& other, tbb::split);
+        /// destructor
+        virtual ~Morph() {}
+        /// Advect the level set from its current time, time0, to its final time, time1.
+        /// @return number of CFL iterations
+        size_t advect(ValueType time0, ValueType time1);
+        /// Used internally by tbb::parallel_for()
+        void operator()(const LeafRange& r) const
+        {
+            if (mTask) mTask(const_cast<Morph*>(this), r);
+            else OPENVDB_THROW(ValueError, "task is undefined - don\'t call this method directly");
+        }
+        /// Used internally by tbb::parallel_reduce()
+        void operator()(const LeafRange& r)
+        {
+            if (mTask) mTask(this, r);
+            else OPENVDB_THROW(ValueError, "task is undefined - don\'t call this method directly");
+        }
+        /// This is only called by tbb::parallel_reduce() threads
+        void join(const Morph& other) { mMaxAbsS = math::Max(mMaxAbsS, other.mMaxAbsS); }
+
+        /// Enum to define the type of multithreading
+        enum ThreadingMode { PARALLEL_FOR, PARALLEL_REDUCE }; // for internal use
+        // method calling tbb
+        void cook(ThreadingMode mode, size_t swapBuffer = 0);
+
+        /// Sample field and return the CFL time step
+        typename GridT::ValueType sampleSpeed(ValueType time0, ValueType time1, Index speedBuffer);
+        void sampleXformedSpeed(const LeafRange& r, Index speedBuffer);
+        void sampleAlignedSpeed(const LeafRange& r, Index speedBuffer);
+
+        // Convex combination of Phi and a forward Euler advection step:
+        // Phi(result) = alpha * Phi(phi) + (1-alpha) * (Phi(0) - dt * Speed(speed)*|Grad[Phi(0)]|);
+        template <int Nominator, int Denominator>
+        void euler(const LeafRange&, ValueType, Index, Index, Index); // (range, dt, phiBuffer, resultBuffer, speedBuffer)
+        inline void euler01(const LeafRange& r, ValueType t, Index s) {this->euler<0,1>(r,t,0,1,s);}
+        inline void euler12(const LeafRange& r, ValueType t) {this->euler<1,2>(r, t, 1, 1, 2);}
+        inline void euler34(const LeafRange& r, ValueType t) {this->euler<3,4>(r, t, 1, 2, 3);}
+        inline void euler13(const LeafRange& r, ValueType t) {this->euler<1,3>(r, t, 1, 2, 3);}
+
+        typedef typename boost::function<void (Morph*, const LeafRange&)> FuncType;
+        LevelSetMorphing* mParent; // back-pointer to the enclosing LevelSetMorphing
+        ValueType         mMinAbsS, mMaxAbsS; // floor and running maximum of |speed| (see sampleSpeed and join)
+        const MapT*       mMap; // map taken from the source grid's transform
+        FuncType          mTask; // bound member function executed by operator()
+    }; // end of private Morph struct
+
+};//end of LevelSetMorphing
+
+template<typename GridT, typename InterruptT>
+inline size_t
+LevelSetMorphing<GridT, InterruptT>::advect(ValueType time0, ValueType time1)
+{
+    // First-level dispatch: turn the run-time spatial scheme into a
+    // compile-time template argument of advect1().
+    //
+    // Only FIRST_BIAS and HJWENO5_BIAS are instantiated here; every other
+    // value of mSpatialScheme (SECOND_BIAS, THIRD_BIAS, WENO5_BIAS, ...)
+    // ends in the ValueError below.
+    if (mSpatialScheme == math::FIRST_BIAS) {
+        return this->advect1<math::FIRST_BIAS>(time0, time1);
+    }
+    if (mSpatialScheme == math::HJWENO5_BIAS) {
+        return this->advect1<math::HJWENO5_BIAS>(time0, time1);
+    }
+
+    OPENVDB_THROW(ValueError, "Spatial difference scheme not supported!");
+    return 0; // fallback return value placed after the throw
+}
+
+template<typename GridT, typename InterruptT>
+template<math::BiasedGradientScheme SpatialScheme>
+inline size_t
+LevelSetMorphing<GridT, InterruptT>::advect1(ValueType time0, ValueType time1)
+{
+    // Second-level dispatch: fix the temporal integration scheme at compile time.
+    if (mTemporalScheme == math::TVD_RK1) {
+        return this->advect2<SpatialScheme, math::TVD_RK1>(time0, time1);
+    } else if (mTemporalScheme == math::TVD_RK2) {
+        return this->advect2<SpatialScheme, math::TVD_RK2>(time0, time1);
+    } else if (mTemporalScheme == math::TVD_RK3) {
+        return this->advect2<SpatialScheme, math::TVD_RK3>(time0, time1);
+    }
+    // Any other value of mTemporalScheme is rejected.
+    OPENVDB_THROW(ValueError, "Temporal integration scheme not supported!");
+    return 0; // fallback return value placed after the throw
+}
+
+template<typename GridT, typename InterruptT>
+template<math::BiasedGradientScheme SpatialScheme,
+         math::TemporalIntegrationScheme TemporalScheme>
+inline size_t
+LevelSetMorphing<GridT, InterruptT>::advect2(ValueType time0, ValueType time1)
+{
+    // Third-level dispatch: select advect3() by the map type of the source grid's transform.
+    const math::Transform& xform = mTracker.grid().transform();
+    if (xform.mapType() == math::UniformScaleMap::mapType())
+        return this->advect3<SpatialScheme, TemporalScheme, math::UniformScaleMap>(time0, time1);
+    if (xform.mapType() == math::UniformScaleTranslateMap::mapType())
+        return this->advect3<SpatialScheme, TemporalScheme, math::UniformScaleTranslateMap>(
+            time0, time1);
+    if (xform.mapType() == math::UnitaryMap::mapType())
+        return this->advect3<SpatialScheme, TemporalScheme, math::UnitaryMap>(time0, time1);
+    if (xform.mapType() == math::TranslationMap::mapType())
+        return this->advect3<SpatialScheme, TemporalScheme, math::TranslationMap>(time0, time1);
+    // None of the four handled map types matched.
+    OPENVDB_THROW(ValueError, "MapType not supported!");
+    return 0; // fallback return value placed after the throw
+}
+
+template<typename GridT, typename InterruptT>
+template<math::BiasedGradientScheme SpatialScheme,
+         math::TemporalIntegrationScheme TemporalScheme,
+         typename MapT>
+inline size_t
+LevelSetMorphing<GridT, InterruptT>::advect3(ValueType time0, ValueType time1)
+{
+    typedef Morph<MapT, SpatialScheme, TemporalScheme> MorphT; // worker with all three choices fixed at compile time
+    return MorphT(*this).advect(time0, time1); // the temporary worker lives only for this call
+}
+
+
+///////////////////////////////////////////////////////////////////////
+
+template<typename GridT, typename InterruptT>
+template <typename MapT, math::BiasedGradientScheme SpatialScheme,
+          math::TemporalIntegrationScheme TemporalScheme>
+inline
+LevelSetMorphing<GridT, InterruptT>::
+Morph<MapT, SpatialScheme, TemporalScheme>::
+Morph(LevelSetMorphing<GridT, InterruptT>& parent) // main constructor: binds the worker to its parent
+    : mParent(&parent)
+    , mMinAbsS(ValueType(1e-6)) // floor for the maximum absolute speed
+    , mMaxAbsS(ValueType(1e-6)) // defined start value (same as mMinAbsS) so the copy constructors never read an indeterminate value
+    , mMap(parent.mTracker.grid().transform().template constMap<MapT>().get()) // raw pointer to the source grid's map
+    , mTask(0) // no task bound yet; operator() throws until one is assigned
+{}
+
+template<typename GridT, typename InterruptT>
+template <typename MapT, math::BiasedGradientScheme SpatialScheme,
+          math::TemporalIntegrationScheme TemporalScheme>
+inline
+LevelSetMorphing<GridT, InterruptT>::
+Morph<MapT, SpatialScheme, TemporalScheme>::
+Morph(const Morph& other) // member-wise shallow copy; the class declaration documents it as used by tbb::parallel_for()
+    : mParent(other.mParent) // pointer is shared with the original, not cloned
+    , mMinAbsS(other.mMinAbsS)
+    , mMaxAbsS(other.mMaxAbsS)
+    , mMap(other.mMap) // pointer is shared with the original, not cloned
+    , mTask(other.mTask) // the copy runs the same bound task
+{
+}
+
+template<typename GridT, typename InterruptT>
+template <typename MapT, math::BiasedGradientScheme SpatialScheme,
+          math::TemporalIntegrationScheme TemporalScheme>
+inline
+LevelSetMorphing<GridT, InterruptT>::
+Morph<MapT, SpatialScheme, TemporalScheme>::
+Morph(Morph& other, tbb::split) // splitting constructor; the class declaration documents it as used by tbb::parallel_reduce()
+    : mParent(other.mParent) // pointer is shared with the original, not cloned
+    , mMinAbsS(other.mMinAbsS)
+    , mMaxAbsS(other.mMaxAbsS) // starts from the original's current maximum; join() later merges with math::Max
+    , mMap(other.mMap) // pointer is shared with the original, not cloned
+    , mTask(other.mTask) // the split body runs the same bound task
+{
+}
+
+template<typename GridT, typename InterruptT>
+template <typename MapT, math::BiasedGradientScheme SpatialScheme,
+          math::TemporalIntegrationScheme TemporalScheme>
+inline size_t
+LevelSetMorphing<GridT, InterruptT>::
+Morph<MapT, SpatialScheme, TemporalScheme>::
+advect(ValueType time0, ValueType time1) // steps from time0 towards time1; returns the number of steps taken
+{
+    // Make sure we have enough temporal auxiliary buffers for the time
+    // integration AS WELL AS an extra buffer with the speed function!
+    static const Index auxBuffers = 1 + (TemporalScheme == math::TVD_RK3 ? 2 : 1); // 3 for TVD_RK3, otherwise 2
+    size_t countCFL = 0; // number of completed time steps
+    while (time0 < time1 && mParent->mTracker.checkInterrupter()) { // no iteration at all when time0 >= time1
+        mParent->mTracker.leafs().rebuildAuxBuffers(auxBuffers);
+
+        const ValueType dt = this->sampleSpeed(time0, time1, auxBuffers); // speed is written to the buffer with index auxBuffers
+        if ( math::isZero(dt) ) break;//V is essentially zero so terminate
+
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN //switch is resolved at compile-time
+        switch(TemporalScheme) {
+        case math::TVD_RK1:
+            // Perform one explicit Euler step: t1 = t0 + dt
+            // Phi_t1(1) = Phi_t0(0) - dt * Speed(2) * |Grad[Phi(0)]|
+            mTask = boost::bind(&Morph::euler01, _1, _2, dt, /*speed*/2);
+
+            // Cook and swap buffer 0 and 1 such that Phi_t1(0) and Phi_t0(1)
+            this->cook(PARALLEL_FOR, 1);
+            break;
+        case math::TVD_RK2:
+            // Perform one explicit Euler step: t1 = t0 + dt
+            // Phi_t1(1) = Phi_t0(0) - dt * Speed(2) * |Grad[Phi(0)]|
+            mTask = boost::bind(&Morph::euler01, _1, _2, dt, /*speed*/2);
+
+            // Cook and swap buffer 0 and 1 such that Phi_t1(0) and Phi_t0(1)
+            this->cook(PARALLEL_FOR, 1);
+
+            // Convex combine explicit Euler step: t2 = t0 + dt
+            // Phi_t2(1) = 1/2 * Phi_t0(1) + 1/2 * (Phi_t1(0) - dt * Speed(2) * |Grad[Phi(0)]|)
+            mTask = boost::bind(&Morph::euler12, _1, _2, dt);
+
+            // Cook and swap buffer 0 and 1 such that Phi_t2(0) and Phi_t1(1)
+            this->cook(PARALLEL_FOR, 1);
+            break;
+        case math::TVD_RK3:
+            // Perform one explicit Euler step: t1 = t0 + dt
+            // Phi_t1(1) = Phi_t0(0) - dt * Speed(3) * |Grad[Phi(0)]|
+            mTask = boost::bind(&Morph::euler01, _1, _2, dt, /*speed*/3);
+
+            // Cook and swap buffer 0 and 1 such that Phi_t1(0) and Phi_t0(1)
+            this->cook(PARALLEL_FOR, 1);
+
+            // Convex combine explicit Euler step: t2 = t0 + dt/2
+            // Phi_t2(2) = 3/4 * Phi_t0(1) + 1/4 * (Phi_t1(0) - dt * Speed(3) * |Grad[Phi(0)]|)
+            mTask = boost::bind(&Morph::euler34, _1, _2, dt);
+
+            // Cook and swap buffer 0 and 2 such that Phi_t2(0) and Phi_t1(2)
+            this->cook(PARALLEL_FOR, 2);
+
+            // Convex combine explicit Euler step: t3 = t0 + dt
+            // Phi_t3(2) = 1/3 * Phi_t0(1) + 2/3 * (Phi_t2(0) - dt * Speed(3) * |Grad[Phi(0)]|)
+            mTask = boost::bind(&Morph::euler13, _1, _2, dt);
+
+            // Cook and swap buffer 0 and 2 such that Phi_t3(0) and Phi_t2(2)
+            this->cook(PARALLEL_FOR, 2);
+            break;
+        default:
+            OPENVDB_THROW(ValueError, "Temporal integration scheme not supported!");
+        }//end of compile-time resolved switch
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+
+        time0 += dt; // advance the local copy of the start time
+        ++countCFL;
+        mParent->mTracker.leafs().removeAuxBuffers(); // auxiliary buffers are rebuilt at the top of the next iteration
+
+        // Track the narrow band
+        mParent->mTracker.track();
+    }//end while-loop over time
+
+    return countCFL;//number of CFL propagation steps
+}
+
+template<typename GridT, typename InterruptT>
+template<typename MapT, math::BiasedGradientScheme SpatialScheme,
+         math::TemporalIntegrationScheme TemporalScheme>
+inline typename GridT::ValueType
+LevelSetMorphing<GridT, InterruptT>::
+Morph<MapT, SpatialScheme, TemporalScheme>::
+sampleSpeed(ValueType time0, ValueType time1, Index speedBuffer) // fills the speed buffer and returns the time step (0 means: stop)
+{
+    mMaxAbsS = mMinAbsS; // reset the running maximum of |speed| to its floor
+    const size_t leafCount = mParent->mTracker.leafs().leafCount();
+    if (leafCount==0 || time0 >= time1) return ValueType(0); // nothing to do: no leafs or no time interval left
+
+    const math::Transform& xform  = mParent->mTracker.grid().transform();
+    if (mParent->mTarget->transform() == xform && // target (and mask, if any) share the source transform:
+        (mParent->mMask == NULL || mParent->mMask->transform() == xform)) {
+        mTask = boost::bind(&Morph::sampleAlignedSpeed, _1, _2, speedBuffer); // read values directly by voxel coordinate
+    } else {
+        mTask = boost::bind(&Morph::sampleXformedSpeed, _1, _2, speedBuffer); // sample in world space instead
+    }
+    this->cook(PARALLEL_REDUCE); // runs the bound task over all leafs; mMaxAbsS is merged through join()
+    if (math::isApproxEqual(mMinAbsS, mMaxAbsS)) return ValueType(0);//speed is essentially zero
+    static const ValueType CFL = (TemporalScheme == math::TVD_RK1 ? ValueType(0.3) : // scheme-dependent factor ...
+                                  TemporalScheme == math::TVD_RK2 ? ValueType(0.9) :
+                                  ValueType(1.0))/math::Sqrt(ValueType(3.0)); // ... divided by sqrt(3)
+    const ValueType dt = math::Abs(time1 - time0), dx = mParent->mTracker.voxelSize(); // remaining time and voxel size
+    return math::Min(dt, ValueType(CFL*dx/mMaxAbsS)); // never step past time1
+}
+
+template<typename GridT, typename InterruptT>
+template <typename MapT, math::BiasedGradientScheme SpatialScheme,
+          math::TemporalIntegrationScheme TemporalScheme>
+inline void
+LevelSetMorphing<GridT, InterruptT>::
+Morph<MapT, SpatialScheme, TemporalScheme>::
+sampleXformedSpeed(const LeafRange& range, Index speedBuffer) // speed sampling via world-space lookups into target and mask
+{
+    typedef typename LeafType::ValueOnCIter VoxelIterT;
+    typedef tools::GridSampler<typename GridT::ConstAccessor, tools::BoxSampler> SamplerT;
+    const MapT& map = *mMap; // source-grid map, used to turn voxel coordinates into world positions
+    mParent->mTracker.checkInterrupter(); // return value is ignored here
+
+    typename GridT::ConstAccessor targetAcc = mParent->mTarget->getAccessor();
+    SamplerT target(targetAcc, mParent->mTarget->transform()); // sampler uses the target's own transform
+    if (mParent->mMask == NULL) { // no alpha mask set
+        for (typename LeafRange::Iterator leafIter = range.begin(); leafIter; ++leafIter) {
+            ValueType* speed = leafIter.buffer(speedBuffer).data(); // NOTE(review): presumably pre-filled with the source values by rebuildAuxBuffers - confirm
+            bool isZero = true; // stays true while every speed in this leaf is approximately zero
+            for (VoxelIterT voxelIter = leafIter->cbeginValueOn(); voxelIter; ++voxelIter) {
+                ValueType& s = speed[voxelIter.pos()];
+                s -= target.wsSample(map.applyMap(voxelIter.getCoord().asVec3d())); // subtract the target sampled at the voxel's mapped position
+                if (!math::isApproxZero(s)) isZero = false;
+                mMaxAbsS = math::Max(mMaxAbsS, math::Abs(s)); // running maximum of |speed| for this body
+            }
+            if (isZero) speed[0] = std::numeric_limits<ValueType>::max();//tag first voxel
+        }
+    } else {
+        const ValueType min = mParent->mMinMask, invNorm = 1.0f/(mParent->mDeltaMask); // offset and scale that normalize the mask range
+        const bool invMask = mParent->isMaskInverted();
+        typename GridT::ConstAccessor maskAcc = mParent->mMask->getAccessor();
+        SamplerT mask(maskAcc,  mParent->mMask->transform()); // sampler uses the mask's own transform
+        for (typename LeafRange::Iterator leafIter = range.begin(); leafIter; ++leafIter) {
+            ValueType* speed = leafIter.buffer(speedBuffer).data();
+            bool isZero = true; // stays true while every speed in this leaf is approximately zero
+            for (VoxelIterT voxelIter = leafIter->cbeginValueOn(); voxelIter; ++voxelIter) {
+                const Vec3R xyz = map.applyMap(voxelIter.getCoord().asVec3d());//world space
+                const ValueType a = math::SmoothUnitStep((mask.wsSample(xyz)-min)*invNorm); // alpha from the normalized mask value
+                ValueType& s = speed[voxelIter.pos()];
+                s -= target.wsSample(xyz);
+                s *= invMask ? 1 - a : a; // scale the speed by alpha, or by (1 - alpha) when the mask is inverted
+                if (!math::isApproxZero(s)) isZero = false;
+                mMaxAbsS = math::Max(mMaxAbsS, math::Abs(s)); // running maximum of |speed| for this body
+            }
+            if (isZero) speed[0] = std::numeric_limits<ValueType>::max();//tag first voxel
+        }
+    }
+}
+
+template<typename GridT, typename InterruptT>
+template <typename MapT, math::BiasedGradientScheme SpatialScheme,
+          math::TemporalIntegrationScheme TemporalScheme>
+inline void
+LevelSetMorphing<GridT, InterruptT>::
+Morph<MapT, SpatialScheme, TemporalScheme>::
+sampleAlignedSpeed(const LeafRange& range, Index speedBuffer) // speed sampling via direct per-coordinate reads of target and mask
+{
+    typedef typename LeafType::ValueOnCIter VoxelIterT;
+    mParent->mTracker.checkInterrupter(); // return value is ignored here
+
+    typename GridT::ConstAccessor target = mParent->mTarget->getAccessor();
+
+    if (mParent->mMask == NULL) { // no alpha mask set
+        for (typename LeafRange::Iterator leafIter = range.begin(); leafIter; ++leafIter) {
+            ValueType* speed = leafIter.buffer(speedBuffer).data(); // NOTE(review): presumably pre-filled with the source values by rebuildAuxBuffers - confirm
+            bool isZero = true; // stays true while every speed in this leaf is approximately zero
+            for (VoxelIterT voxelIter = leafIter->cbeginValueOn(); voxelIter; ++voxelIter) {
+                ValueType& s = speed[voxelIter.pos()];
+                s -= target.getValue(voxelIter.getCoord()); // subtract the target value at the same coordinate
+                if (!math::isApproxZero(s)) isZero = false;
+                mMaxAbsS = math::Max(mMaxAbsS, math::Abs(s)); // running maximum of |speed| for this body
+            }
+            if (isZero) speed[0] = std::numeric_limits<ValueType>::max();//tag first voxel
+        }
+    } else {
+        const ValueType min = mParent->mMinMask, invNorm = 1.0f/(mParent->mDeltaMask); // offset and scale that normalize the mask range
+        const bool invMask = mParent->isMaskInverted();
+        typename GridT::ConstAccessor mask = mParent->mMask->getAccessor();
+        for (typename LeafRange::Iterator leafIter = range.begin(); leafIter; ++leafIter) {
+            ValueType* speed = leafIter.buffer(speedBuffer).data();
+            bool isZero = true; // stays true while every speed in this leaf is approximately zero
+            for (VoxelIterT voxelIter = leafIter->cbeginValueOn(); voxelIter; ++voxelIter) {
+                const Coord ijk = voxelIter.getCoord();//index space
+                const ValueType a = math::SmoothUnitStep((mask.getValue(ijk)-min)*invNorm); // alpha from the normalized mask value
+                ValueType& s = speed[voxelIter.pos()];
+                s -= target.getValue(ijk);
+                s *= invMask ? 1 - a : a; // scale the speed by alpha, or by (1 - alpha) when the mask is inverted
+                if (!math::isApproxZero(s)) isZero = false;
+                mMaxAbsS = math::Max(mMaxAbsS, math::Abs(s)); // running maximum of |speed| for this body
+            }
+            if (isZero) speed[0] = std::numeric_limits<ValueType>::max();//tag first voxel
+        }
+    }
+}
+
+template<typename GridT, typename InterruptT>
+template <typename MapT, math::BiasedGradientScheme SpatialScheme,
+          math::TemporalIntegrationScheme TemporalScheme>
+inline void
+LevelSetMorphing<GridT, InterruptT>::
+Morph<MapT, SpatialScheme, TemporalScheme>::
+cook(ThreadingMode mode, size_t swapBuffer)
+{
+    TrackerT& tracker = mParent->mTracker; // every call below goes through the parent's tracker
+    tracker.startInterrupter("Morphing level set");
+    const int grainSize = tracker.getGrainSize();
+    const bool serial = (grainSize == 0); // grain size 0: run the task in the calling thread
+    const LeafRange range = tracker.leafs().leafRange(grainSize);
+
+    if (serial) {
+        (*this)(range);
+    } else {
+        switch (mode) { // hand the range to the requested TBB algorithm
+        case PARALLEL_FOR:    tbb::parallel_for(range, *this);    break;
+        case PARALLEL_REDUCE: tbb::parallel_reduce(range, *this); break;
+        default: throw std::runtime_error("Undefined threading mode");
+        }
+    }
+
+    tracker.leafs().swapLeafBuffer(swapBuffer, serial); // swap the leaf buffer with auxiliary buffer #swapBuffer
+    tracker.endInterrupter();
+}
+
+template<typename GridT, typename InterruptT>
+template<typename MapT, math::BiasedGradientScheme SpatialScheme,
+         math::TemporalIntegrationScheme TemporalScheme>
+template <int Nominator, int Denominator>
+inline void
+LevelSetMorphing<GridT,InterruptT>::
+Morph<MapT, SpatialScheme, TemporalScheme>::
+euler(const LeafRange& range, ValueType dt, // one Euler step, blended with phiBuffer when Nominator != 0
+      Index phiBuffer, Index resultBuffer, Index speedBuffer)
+{
+    typedef math::BIAS_SCHEME<SpatialScheme>                             SchemeT;
+    typedef typename SchemeT::template ISStencil<GridType>::StencilType  StencilT;
+    typedef typename LeafType::ValueOnCIter                              VoxelIterT;
+    typedef math::GradientNormSqrd<MapT, SpatialScheme>                  NumGrad;
+
+    static const ValueType Alpha = ValueType(Nominator)/ValueType(Denominator); // weight of the phiBuffer value
+    static const ValueType Beta  = ValueType(1) - Alpha; // weight of the Euler update
+
+    mParent->mTracker.checkInterrupter(); // return value is ignored here
+    const MapT& map = *mMap;
+    StencilT stencil(mParent->mTracker.grid()); // stencil is built on the tracker's grid
+
+    for (typename LeafRange::Iterator leafIter = range.begin(); leafIter; ++leafIter) {
+        const ValueType* speed = leafIter.buffer(speedBuffer).data();
+        if (math::isExactlyEqual(speed[0], std::numeric_limits<ValueType>::max())) continue; // leaf was tagged as all-zero speed by the samplers
+        const ValueType* phi = leafIter.buffer(phiBuffer).data();
+        ValueType* result = leafIter.buffer(resultBuffer).data();
+        for (VoxelIterT voxelIter = leafIter->cbeginValueOn(); voxelIter; ++voxelIter) {
+            const Index n = voxelIter.pos();
+            if (math::isApproxZero(speed[n])) continue; // result[n] is left untouched for (near-)zero speed
+            stencil.moveTo(voxelIter);
+            const ValueType v = stencil.getValue() - dt * speed[n] * NumGrad::result(map, stencil);
+            result[n] = Nominator ? Alpha * phi[n] + Beta * v : v; // Nominator == 0 stores the plain Euler value v
+        }//loop over active voxels in the leaf of the mask
+    }//loop over leafs of the level set
+}
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_LEVEL_SET_MORPH_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/LevelSetPlatonic.h b/nuparu/include/openvdb_new/tools/LevelSetPlatonic.h
new file mode 100644
index 00000000..215effb6
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/LevelSetPlatonic.h
@@ -0,0 +1,494 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+///
+/// @author Ken Museth
+///
+/// @file LevelSetPlatonic.h
+///
+/// @brief Generate narrow-band level sets of the five platonic solids.
+///
+/// @note By definition a level set has a fixed narrow band width
+/// (the half width is defined by LEVEL_SET_HALF_WIDTH in Types.h),
+/// whereas an SDF can have a variable narrow band width.
+
+#ifndef OPENVDB_TOOLS_LEVELSETPLATONIC_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_LEVELSETPLATONIC_HAS_BEEN_INCLUDED
+
+#include <vector>
+#include <openvdb/Grid.h>
+#include <openvdb/Types.h>
+#include <openvdb/math/Math.h>
+#include <openvdb/math/Transform.h>
+#include <openvdb/tools/MeshToVolume.h>
+#include <openvdb/util/NullInterrupter.h>
+#include <boost/utility.hpp>
+#include <boost/type_traits/is_floating_point.hpp>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+/// @brief Return a grid of type @c GridType containing a narrow-band level set
+/// representation of a platonic solid.
+///
+/// @param faceCount    number of faces of the platonic solid, i.e. 4, 6, 8, 12 or 20
+/// @param scale        scale of the platonic solid in world units
+/// @param center       center of the platonic solid in world units
+/// @param voxelSize    voxel size in world units
+/// @param halfWidth    half the width of the narrow band, in voxel units
+/// @param interrupt    a pointer adhering to the util::NullInterrupter interface
+///                     (the definition in this header accepts it but does not use it)
+/// @details Faces: TETRAHEDRON=4, CUBE=6, OCTAHEDRON=8, DODECAHEDRON=12, ICOSAHEDRON=20
+/// @throw RuntimeError if @a faceCount is none of the values listed above
+/// @note @c GridType::ValueType must be a floating-point scalar.
+template<typename GridType, typename InterruptT>
+typename GridType::Ptr
+createLevelSetPlatonic(int faceCount,// 4, 6, 8, 12 or 20
+                       float scale = 1.0f,
+                       const Vec3f& center = Vec3f(0.0f),
+                       float voxelSize = 0.1f,
+                       float halfWidth = float(LEVEL_SET_HALF_WIDTH),
+                       InterruptT* interrupt = NULL);
+
+/// @brief Return a grid of type @c GridType containing a narrow-band level set
+/// representation of a platonic solid.
+///
+/// @param faceCount    number of faces of the platonic solid, i.e. 4, 6, 8, 12 or 20
+/// @param scale        scale of the platonic solid in world units
+/// @param center       center of the platonic solid in world units
+/// @param voxelSize    voxel size in world units
+/// @param halfWidth    half the width of the narrow band, in voxel units
+///
+/// @details Faces: TETRAHEDRON=4, CUBE=6, OCTAHEDRON=8, DODECAHEDRON=12, ICOSAHEDRON=20
+///
+/// @note @c GridType::ValueType must be a floating-point scalar.
+template<typename GridType>
+typename GridType::Ptr
+createLevelSetPlatonic(
+    int faceCount, // 4, 6, 8, 12 or 20
+    float scale = 1.0f,
+    const Vec3f& center = Vec3f(0.0f),
+    float voxelSize = 0.1f,
+    float halfWidth = float(LEVEL_SET_HALF_WIDTH))
+{
+    // Delegate to the interruptible overload; its interrupter defaults to NULL.
+    typedef util::NullInterrupter InterruptT;
+    return createLevelSetPlatonic<GridType, InterruptT>(
+        faceCount, scale, center, voxelSize, halfWidth);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Return a grid of type @c GridType containing a narrow-band level set
+/// representation of a tetrahedron.
+///
+/// @param scale        scale of the platonic solid in world units
+/// @param center       center of the platonic solid in world units
+/// @param voxelSize    voxel size in world units
+/// @param halfWidth    half the width of the narrow band, in voxel units
+/// @param interrupt    a pointer adhering to the util::NullInterrupter interface
+///
+/// @note @c GridType::ValueType must be a floating-point scalar.
+template<typename GridType, typename InterruptT>
+typename GridType::Ptr
+createLevelSetTetrahedron(
+    float scale = 1.0f, const Vec3f& center = Vec3f(0.0f), float voxelSize = 0.1f,
+    float halfWidth = float(LEVEL_SET_HALF_WIDTH), InterruptT* interrupt = NULL)
+{
+    // A tetrahedron is the platonic solid with four faces.
+    const int faceCount = 4;
+    return createLevelSetPlatonic<GridType, InterruptT>(
+        faceCount, scale, center, voxelSize, halfWidth, interrupt);
+}
+
+/// @brief Return a grid of type @c GridType containing a narrow-band level set
+/// representation of a tetrahedron.
+///
+/// @param scale        scale of the platonic solid in world units
+/// @param center       center of the platonic solid in world units
+/// @param voxelSize    voxel size in world units
+/// @param halfWidth    half the width of the narrow band, in voxel units
+///
+/// @note @c GridType::ValueType must be a floating-point scalar.
+template<typename GridType>
+typename GridType::Ptr
+createLevelSetTetrahedron(
+    float scale = 1.0f, const Vec3f& center = Vec3f(0.0f), float voxelSize = 0.1f,
+    float halfWidth = float(LEVEL_SET_HALF_WIDTH))
+{
+    const int faceCount = 4; // a tetrahedron is the four-faced platonic solid
+    return createLevelSetPlatonic<GridType, util::NullInterrupter>(
+        faceCount, scale, center, voxelSize, halfWidth);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Return a grid of type @c GridType containing a narrow-band level set
+/// representation of a cube.
+///
+/// @param scale        scale of the platonic solid in world units
+/// @param center       center of the platonic solid in world units
+/// @param voxelSize    voxel size in world units
+/// @param halfWidth    half the width of the narrow band, in voxel units
+/// @param interrupt    a pointer adhering to the util::NullInterrupter interface
+///
+/// @note @c GridType::ValueType must be a floating-point scalar.
+template<typename GridType, typename InterruptT>
+typename GridType::Ptr
+createLevelSetCube(
+    float scale = 1.0f, const Vec3f& center = Vec3f(0.0f), float voxelSize = 0.1f,
+    float halfWidth = float(LEVEL_SET_HALF_WIDTH), InterruptT* interrupt = NULL)
+{
+    // A cube is the platonic solid with six faces.
+    const int faceCount = 6;
+    return createLevelSetPlatonic<GridType, InterruptT>(
+        faceCount, scale, center, voxelSize, halfWidth, interrupt);
+}
+
+/// @brief Return a grid of type @c GridType containing a narrow-band level set
+/// representation of a cube.
+///
+/// @param scale        scale of the platonic solid in world units
+/// @param center       center of the platonic solid in world units
+/// @param voxelSize    voxel size in world units
+/// @param halfWidth    half the width of the narrow band, in voxel units
+///
+/// @note @c GridType::ValueType must be a floating-point scalar.
+template<typename GridType>
+typename GridType::Ptr
+createLevelSetCube(
+    float scale = 1.0f, const Vec3f& center = Vec3f(0.0f), float voxelSize = 0.1f,
+    float halfWidth = float(LEVEL_SET_HALF_WIDTH))
+{
+    const int faceCount = 6; // a cube is the six-faced platonic solid
+    return createLevelSetPlatonic<GridType, util::NullInterrupter>(
+        faceCount, scale, center, voxelSize, halfWidth);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Return a grid of type @c GridType containing a narrow-band level set
+/// representation of an octahedron.
+///
+/// @param scale        scale of the platonic solid in world units
+/// @param center       center of the platonic solid in world units
+/// @param voxelSize    voxel size in world units
+/// @param halfWidth    half the width of the narrow band, in voxel units
+/// @param interrupt    a pointer adhering to the util::NullInterrupter interface
+///
+/// @note @c GridType::ValueType must be a floating-point scalar.
+template<typename GridType, typename InterruptT>
+typename GridType::Ptr
+createLevelSetOctahedron(
+    float scale = 1.0f, const Vec3f& center = Vec3f(0.0f), float voxelSize = 0.1f,
+    float halfWidth = float(LEVEL_SET_HALF_WIDTH), InterruptT* interrupt = NULL)
+{
+    // An octahedron is the platonic solid with eight faces.
+    const int faceCount = 8;
+    return createLevelSetPlatonic<GridType, InterruptT>(
+        faceCount, scale, center, voxelSize, halfWidth, interrupt);
+}
+
+/// @brief Return a grid of type @c GridType containing a narrow-band level set
+/// representation of an octahedron.
+///
+/// @param scale        scale of the platonic solid in world units
+/// @param center       center of the platonic solid in world units
+/// @param voxelSize    voxel size in world units
+/// @param halfWidth    half the width of the narrow band, in voxel units
+///
+/// @note @c GridType::ValueType must be a floating-point scalar.
+template<typename GridType>
+typename GridType::Ptr
+createLevelSetOctahedron(
+    float scale = 1.0f, const Vec3f& center = Vec3f(0.0f), float voxelSize = 0.1f,
+    float halfWidth = float(LEVEL_SET_HALF_WIDTH))
+{
+    const int faceCount = 8; // an octahedron is the eight-faced platonic solid
+    return createLevelSetPlatonic<GridType, util::NullInterrupter>(
+        faceCount, scale, center, voxelSize, halfWidth);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Return a grid of type @c GridType containing a narrow-band level set
+/// representation of a dodecahedron.
+///
+/// @param scale        scale of the platonic solid in world units
+/// @param center       center of the platonic solid in world units
+/// @param voxelSize    voxel size in world units
+/// @param halfWidth    half the width of the narrow band, in voxel units
+/// @param interrupt    a pointer adhering to the util::NullInterrupter interface
+///
+/// @note @c GridType::ValueType must be a floating-point scalar.
+template<typename GridType, typename InterruptT>
+typename GridType::Ptr
+createLevelSetDodecahedron(
+    float scale = 1.0f, const Vec3f& center = Vec3f(0.0f), float voxelSize = 0.1f,
+    float halfWidth = float(LEVEL_SET_HALF_WIDTH), InterruptT* interrupt = NULL)
+{
+    // A dodecahedron is the platonic solid with twelve faces.
+    const int faceCount = 12;
+    return createLevelSetPlatonic<GridType, InterruptT>(
+        faceCount, scale, center, voxelSize, halfWidth, interrupt);
+}
+
+/// @brief Return a grid of type @c GridType containing a narrow-band level set
+/// representation of a dodecahedron.
+///
+/// @param scale        scale of the platonic solid in world units
+/// @param center       center of the platonic solid in world units
+/// @param voxelSize    voxel size in world units
+/// @param halfWidth    half the width of the narrow band, in voxel units
+///
+/// @note @c GridType::ValueType must be a floating-point scalar.
+template<typename GridType>
+typename GridType::Ptr
+createLevelSetDodecahedron(
+    float scale = 1.0f, const Vec3f& center = Vec3f(0.0f), float voxelSize = 0.1f,
+    float halfWidth = float(LEVEL_SET_HALF_WIDTH))
+{
+    const int faceCount = 12; // a dodecahedron is the twelve-faced platonic solid
+    return createLevelSetPlatonic<GridType, util::NullInterrupter>(
+        faceCount, scale, center, voxelSize, halfWidth);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Return a grid of type @c GridType containing a narrow-band level set
+/// representation of an icosahedron.
+///
+/// @param scale        scale of the platonic solid in world units
+/// @param center       center of the platonic solid in world units
+/// @param voxelSize    voxel size in world units
+/// @param halfWidth    half the width of the narrow band, in voxel units
+/// @param interrupt    a pointer adhering to the util::NullInterrupter interface
+///
+/// @note @c GridType::ValueType must be a floating-point scalar.
+template<typename GridType, typename InterruptT>
+typename GridType::Ptr
+createLevelSetIcosahedron(
+    float scale = 1.0f, const Vec3f& center = Vec3f(0.0f), float voxelSize = 0.1f,
+    float halfWidth = float(LEVEL_SET_HALF_WIDTH), InterruptT* interrupt = NULL)
+{
+    // An icosahedron is the platonic solid with twenty faces.
+    const int faceCount = 20;
+    return createLevelSetPlatonic<GridType, InterruptT>(
+        faceCount, scale, center, voxelSize, halfWidth, interrupt);
+}
+
+/// @brief Return a grid of type @c GridType containing a narrow-band level set
+/// representation of an icosahedron.
+///
+/// @param scale        scale of the platonic solid in world units
+/// @param center       center of the platonic solid in world units
+/// @param voxelSize    voxel size in world units
+/// @param halfWidth    half the width of the narrow band, in voxel units
+///
+/// @note @c GridType::ValueType must be a floating-point scalar.
+template<typename GridType>
+typename GridType::Ptr
+createLevelSetIcosahedron(
+    float scale = 1.0f, const Vec3f& center = Vec3f(0.0f), float voxelSize = 0.1f,
+    float halfWidth = float(LEVEL_SET_HALF_WIDTH))
+{
+    const int faceCount = 20; // an icosahedron is the twenty-faced platonic solid
+    return createLevelSetPlatonic<GridType, util::NullInterrupter>(
+        faceCount, scale, center, voxelSize, halfWidth);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// @brief Build the vertex and face lists of the requested platonic solid, apply
+/// @a scale and @a center to the vertices, and convert the mesh to a level set
+/// with meshToLevelSet() using a linear transform of voxel size @a voxelSize.
+/// @throw RuntimeError if @a faceCount is not 4, 6, 8, 12 or 20.
+/// @note The interrupter argument is accepted but not used by this implementation.
+template<typename GridType, typename InterruptT>
+typename GridType::Ptr
+createLevelSetPlatonic(int faceCount, float scale, const Vec3f& center,
+                       float voxelSize, float halfWidth, InterruptT* /*interrupt*/)
+{
+    // GridType::ValueType is required to be a floating-point scalar.
+    BOOST_STATIC_ASSERT(boost::is_floating_point<typename GridType::ValueType>::value);
+
+    const math::Transform::Ptr xform = math::Transform::createLinearTransform( voxelSize );
+    // Polygon mesh of the solid: vertices plus triangle and quad faces that index into vtx.
+    std::vector<Vec3f> vtx;
+    std::vector<Vec3I> tri;
+    std::vector<Vec4I> qua;
+
+    if (faceCount == 4) {// Tetrahedron
+        vtx.push_back( Vec3f( 0.0f,          1.0f,          0.0f) );
+        vtx.push_back( Vec3f(-0.942810297f, -0.333329707f,  0.0f) );
+        vtx.push_back( Vec3f( 0.471405149f, -0.333329707f,  0.816497624f) );
+        vtx.push_back( Vec3f( 0.471405149f, -0.333329707f, -0.816497624f) );
+
+        tri.push_back( Vec3I(0, 2, 3) );
+        tri.push_back( Vec3I(0, 3, 1) );
+        tri.push_back( Vec3I(0, 1, 2) );
+        tri.push_back( Vec3I(1, 3, 2) );
+
+    } else if (faceCount == 6) {// Cube
+        vtx.push_back( Vec3f(-0.5f, -0.5f, -0.5f) );
+        vtx.push_back( Vec3f( 0.5f, -0.5f, -0.5f) );
+        vtx.push_back( Vec3f( 0.5f, -0.5f,  0.5f) );
+        vtx.push_back( Vec3f(-0.5f, -0.5f,  0.5f) );
+        vtx.push_back( Vec3f(-0.5f,  0.5f, -0.5f) );
+        vtx.push_back( Vec3f( 0.5f,  0.5f, -0.5f) );
+        vtx.push_back( Vec3f( 0.5f,  0.5f,  0.5f) );
+        vtx.push_back( Vec3f(-0.5f,  0.5f,  0.5f) );
+
+        qua.push_back( Vec4I(1, 0, 4, 5) );
+        qua.push_back( Vec4I(2, 1, 5, 6) );
+        qua.push_back( Vec4I(3, 2, 6, 7) );
+        qua.push_back( Vec4I(0, 3, 7, 4) );
+        qua.push_back( Vec4I(2, 3, 0, 1) );
+        qua.push_back( Vec4I(5, 4, 7, 6) );
+
+    } else if (faceCount == 8) {// Octahedron
+        vtx.push_back( Vec3f( 0.0f, 0.0f, -1.0f) );
+        vtx.push_back( Vec3f( 1.0f, 0.0f,  0.0f) );
+        vtx.push_back( Vec3f( 0.0f, 0.0f,  1.0f) );
+        vtx.push_back( Vec3f(-1.0f, 0.0f,  0.0f) );
+        vtx.push_back( Vec3f( 0.0f,-1.0f,  0.0f) );
+        vtx.push_back( Vec3f( 0.0f, 1.0f,  0.0f) );
+
+        tri.push_back( Vec3I(0, 4, 3) );
+        tri.push_back( Vec3I(0, 1, 4) );
+        tri.push_back( Vec3I(1, 2, 4) );
+        tri.push_back( Vec3I(2, 3, 4) );
+        tri.push_back( Vec3I(0, 3, 5) );
+        tri.push_back( Vec3I(0, 5, 1) );
+        tri.push_back( Vec3I(1, 5, 2) );
+        tri.push_back( Vec3I(2, 5, 3) );
+
+    } else if (faceCount == 12) {// Dodecahedron
+        vtx.push_back( Vec3f( 0.354437858f,  0.487842113f, -0.789344311f) );
+        vtx.push_back( Vec3f( 0.573492587f, -0.186338872f, -0.78934437f) );
+        vtx.push_back( Vec3f( 0.0f,         -0.603005826f, -0.78934443f) );
+        vtx.push_back( Vec3f(-0.573492587f, -0.186338872f, -0.78934437f) );
+        vtx.push_back( Vec3f(-0.354437858f,  0.487842113f, -0.789344311f) );
+        vtx.push_back( Vec3f(-0.573492587f,  0.789345026f, -0.186338797f) );
+        vtx.push_back( Vec3f(-0.927930415f, -0.301502913f, -0.186338872f) );
+        vtx.push_back( Vec3f( 0.0f,         -0.975683928f, -0.186338902f) );
+        vtx.push_back( Vec3f( 0.927930415f, -0.301502913f, -0.186338872f) );
+        vtx.push_back( Vec3f( 0.573492587f,  0.789345026f, -0.186338797f) );
+        vtx.push_back( Vec3f( 0.0f,          0.975683868f,  0.186338902f) );
+        vtx.push_back( Vec3f(-0.927930415f,  0.301502913f,  0.186338872f) );
+        vtx.push_back( Vec3f(-0.573492587f, -0.789345026f,  0.186338797f) );
+        vtx.push_back( Vec3f( 0.573492587f, -0.789345026f,  0.186338797f) );
+        vtx.push_back( Vec3f( 0.927930415f,  0.301502913f,  0.186338872f) );
+        vtx.push_back( Vec3f( 0.0f,          0.603005826f,  0.78934443f) );
+        vtx.push_back( Vec3f( 0.573492587f,  0.186338872f,  0.78934437f) );
+        vtx.push_back( Vec3f( 0.354437858f, -0.487842113f,  0.789344311f) );
+        vtx.push_back( Vec3f(-0.354437858f, -0.487842113f,  0.789344311f) );
+        vtx.push_back( Vec3f(-0.573492587f,  0.186338872f,  0.78934437f) );
+        // Each of the twelve pentagonal faces is emitted as one quad plus one triangle.
+        qua.push_back( Vec4I(0, 1, 2, 3) );
+        tri.push_back( Vec3I(0, 3, 4) );
+        qua.push_back( Vec4I(0, 4, 5, 10) );
+        tri.push_back( Vec3I(0, 10, 9) );
+        qua.push_back( Vec4I(0, 9, 14, 8) );
+        tri.push_back( Vec3I(0, 8, 1) );
+        qua.push_back( Vec4I(1, 8, 13, 7) );
+        tri.push_back( Vec3I(1, 7, 2) );
+        qua.push_back( Vec4I(2, 7, 12, 6) );
+        tri.push_back( Vec3I(2, 6, 3) );
+        qua.push_back( Vec4I(3, 6, 11, 5) );
+        tri.push_back( Vec3I(3, 5, 4) );
+        qua.push_back( Vec4I(5, 11, 19, 15) );
+        tri.push_back( Vec3I(5, 15, 10) );
+        qua.push_back( Vec4I(6, 12, 18, 19) );
+        tri.push_back( Vec3I(6, 19, 11) );
+        qua.push_back( Vec4I(7, 13, 17, 18) );
+        tri.push_back( Vec3I(7, 18, 12) );
+        qua.push_back( Vec4I(8, 14, 16, 17) );
+        tri.push_back( Vec3I(8, 17, 13) );
+        qua.push_back( Vec4I(9, 10, 15, 16) );
+        tri.push_back( Vec3I(9, 16, 14) );
+        qua.push_back( Vec4I(15, 19, 18, 17) );
+        tri.push_back( Vec3I(15, 17, 16) );
+
+    } else if (faceCount == 20) {// Icosahedron
+        vtx.push_back( Vec3f(0.0f, 0.0f, -1.0f) );
+        vtx.push_back( Vec3f(0.0f, 0.894427359f, -0.447213143f) );
+        vtx.push_back( Vec3f(0.850650847f, 0.276393682f, -0.447213203f) );
+        vtx.push_back( Vec3f(0.525731206f, -0.723606944f, -0.447213262f) );
+        vtx.push_back( Vec3f(-0.525731206f, -0.723606944f, -0.447213262f) );
+        vtx.push_back( Vec3f(-0.850650847f, 0.276393682f, -0.447213203f) );
+        vtx.push_back( Vec3f(-0.525731206f, 0.723606944f, 0.447213262f) );
+        vtx.push_back( Vec3f(-0.850650847f, -0.276393682f, 0.447213203f) );
+        vtx.push_back( Vec3f(0.0f, -0.894427359f, 0.447213143f) );
+        vtx.push_back( Vec3f(0.850650847f, -0.276393682f, 0.447213203f) );
+        vtx.push_back( Vec3f(0.525731206f, 0.723606944f, 0.447213262f) );
+        vtx.push_back( Vec3f(0.0f, 0.0f, 1.0f) );
+
+        tri.push_back( Vec3I( 2,  0,  1) );
+        tri.push_back( Vec3I( 3,  0,  2) );
+        tri.push_back( Vec3I( 4,  0,  3) );
+        tri.push_back( Vec3I( 5,  0,  4) );
+        tri.push_back( Vec3I( 1,  0,  5) );
+        tri.push_back( Vec3I( 6,  1,  5) );
+        tri.push_back( Vec3I( 7,  5,  4) );
+        tri.push_back( Vec3I( 8,  4,  3) );
+        tri.push_back( Vec3I( 9,  3,  2) );
+        tri.push_back( Vec3I(10,  2,  1) );
+        tri.push_back( Vec3I(10,  1,  6) );
+        tri.push_back( Vec3I( 6,  5,  7) );
+        tri.push_back( Vec3I( 7,  4,  8) );
+        tri.push_back( Vec3I( 8,  3,  9) );
+        tri.push_back( Vec3I( 9,  2, 10) );
+        tri.push_back( Vec3I( 6, 11, 10) );
+        tri.push_back( Vec3I(10, 11,  9) );
+        tri.push_back( Vec3I( 9, 11,  8) );
+        tri.push_back( Vec3I( 8, 11,  7) );
+        tri.push_back( Vec3I( 7, 11,  6) );
+
+    } else {
+        OPENVDB_THROW(RuntimeError, "Invalid face count");
+    }
+
+    // Apply scale and translation to all the vertices
+    for ( size_t i = 0; i<vtx.size(); ++i ) vtx[i] = scale * vtx[i] + center;
+
+    return meshToLevelSet<GridType>( *xform, vtx, tri, qua, halfWidth );
+}
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_LEVELSETPLATONIC_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/LevelSetRebuild.h b/nuparu/include/openvdb_new/tools/LevelSetRebuild.h
new file mode 100644
index 00000000..9b1c7ddc
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/LevelSetRebuild.h
@@ -0,0 +1,353 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_TOOLS_LEVELSETREBUILD_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_LEVELSETREBUILD_HAS_BEEN_INCLUDED
+
+#include <openvdb/Grid.h>
+#include <openvdb/Exceptions.h>
+#include <openvdb/math/Math.h>
+#include <openvdb/math/Transform.h>
+#include <openvdb/tools/VolumeToMesh.h>
+#include <openvdb/tools/MeshToVolume.h>
+#include <openvdb/util/NullInterrupter.h>
+#include <openvdb/util/Util.h>
+#include <boost/type_traits/is_floating_point.hpp>
+#include <boost/utility/enable_if.hpp>
+#include <tbb/blocked_range.h>
+#include <tbb/parallel_for.h>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+
+/// @brief Return a new grid of type @c GridType that contains a narrow-band level set
+/// representation of an isosurface of a given grid.
+///
+/// @param grid       a scalar, floating-point grid with one or more disjoint,
+///                   closed isosurfaces at the given @a isovalue
+/// @param isovalue   the isovalue that defines the implicit surface (defaults to zero,
+///                   which is typical if the input grid is already a level set or an SDF).
+/// @param halfWidth  half the width of the narrow band, in voxel units
+///                   (defaults to 3 voxels, which is required for some level set operations)
+/// @param xform      optional transform for the output grid
+///                   (if not provided, the transform of the input @a grid will be matched)
+///
+/// @throw TypeError if @a grid is not scalar or not floating-point
+///
+/// @note If the input grid contains overlapping isosurfaces, interior edges will be lost.
+template<class GridType>
+inline typename GridType::Ptr
+levelSetRebuild(const GridType& grid, float isovalue = 0,
+    float halfWidth = float(LEVEL_SET_HALF_WIDTH), const math::Transform* xform = NULL);
+
+
+/// @brief Return a new grid of type @c GridType that contains a narrow-band level set
+/// representation of an isosurface of a given grid.
+///
+/// @param grid         a scalar, floating-point grid with one or more disjoint,
+///                     closed isosurfaces at the given @a isovalue
+/// @param isovalue     the isovalue that defines the implicit surface
+/// @param exBandWidth  the exterior narrow-band width in voxel units
+/// @param inBandWidth  the interior narrow-band width in voxel units
+/// @param xform        optional transform for the output grid
+///                     (if not provided, the transform of the input @a grid will be matched)
+///
+/// @throw TypeError if @a grid is not scalar or not floating-point
+///
+/// @note If the input grid contains overlapping isosurfaces, interior edges will be lost.
+template<class GridType>
+inline typename GridType::Ptr
+levelSetRebuild(const GridType& grid, float isovalue, float exBandWidth, float inBandWidth,
+    const math::Transform* xform = NULL);
+
+
+/// @brief Return a new grid of type @c GridType that contains a narrow-band level set
+/// representation of an isosurface of a given grid.
+///
+/// @param grid         a scalar, floating-point grid with one or more disjoint,
+///                     closed isosurfaces at the given @a isovalue
+/// @param isovalue     the isovalue that defines the implicit surface
+/// @param exBandWidth  the exterior narrow-band width in voxel units
+/// @param inBandWidth  the interior narrow-band width in voxel units
+/// @param xform        optional transform for the output grid
+///                     (if not provided, the transform of the input @a grid will be matched)
+/// @param interrupter  optional interrupter object
+///
+/// @throw TypeError if @a grid is not scalar or not floating-point
+///
+/// @note If the input grid contains overlapping isosurfaces, interior edges will be lost.
+template<class GridType, typename InterruptT>
+inline typename GridType::Ptr
+levelSetRebuild(const GridType& grid, float isovalue, float exBandWidth, float inBandWidth,
+    const math::Transform* xform = NULL, InterruptT* interrupter = NULL);
+
+
+////////////////////////////////////////
+
+
+// Internal utility objects and implementation details
+
+namespace internal {
+
+/// Functor that maps mesh points from world space into the index space of a
+/// transform, storing the results in a caller-provided vector.
+class PointListTransform
+{
+public:
+    PointListTransform(const PointList& pointsIn, std::vector<Vec3s>& pointsOut,
+        const math::Transform& xform)
+        : mPointsIn(pointsIn), mPointsOut(&pointsOut), mXform(xform) {}
+
+    /// Convert every point using a TBB parallel loop.
+    void runParallel()
+    {
+        const tbb::blocked_range<size_t> all(0, mPointsOut->size());
+        tbb::parallel_for(all, *this);
+    }
+
+    /// Convert every point on the calling thread.
+    void runSerial() { (*this)(tbb::blocked_range<size_t>(0, mPointsOut->size())); }
+
+    /// Store the index-space image of each input point in @a range.
+    inline void operator()(const tbb::blocked_range<size_t>& range) const
+    {
+        std::vector<Vec3s>& out = *mPointsOut;
+        for (size_t i = range.begin(), end = range.end(); i < end; ++i) {
+            out[i] = Vec3s(mXform.worldToIndex(mPointsIn[i]));
+        }
+    }
+
+private:
+    const PointList& mPointsIn;            // points to convert
+    std::vector<Vec3s> * const mPointsOut; // converted points
+    const math::Transform& mXform;         // provides worldToIndex()
+};
+
+
+/// Functor that flattens a list of polygon pools into one array of Vec4I
+/// primitives. Each pool is emptied (clearQuads()/clearTriangles()) after it
+/// has been copied.
+class PrimCpy
+{
+public:
+    PrimCpy(const PolygonPoolList& primsIn, const std::vector<size_t>& indexList,
+        std::vector<Vec4I>& primsOut)
+        : mPrimsIn(primsIn), mIndexList(indexList), mPrimsOut(&primsOut) {}
+
+    /// Copy every pool using a TBB parallel loop.
+    void runParallel()
+    {
+        const tbb::blocked_range<size_t> all(0, mIndexList.size());
+        tbb::parallel_for(all, *this);
+    }
+
+    /// Copy every pool on the calling thread.
+    void runSerial() { (*this)(tbb::blocked_range<size_t>(0, mIndexList.size())); }
+
+    /// Copy the pools in @a range; pool @c n is written starting at output
+    /// slot @c mIndexList[n], quads first and then triangles.
+    inline void operator()(const tbb::blocked_range<size_t>& range) const
+    {
+        std::vector<Vec4I>& dst = *mPrimsOut;
+
+        for (size_t pool = range.begin(), last = range.end(); pool < last; ++pool) {
+            PolygonPool& polys = mPrimsIn[pool];
+            size_t cursor = mIndexList[pool];
+
+            // Quads are already four-index primitives and are copied verbatim.
+            const size_t quadCount = polys.numQuads();
+            for (size_t q = 0; q < quadCount; ++q, ++cursor) {
+                dst[cursor] = polys.quad(q);
+            }
+            polys.clearQuads();
+
+            // Triangles (adaptive mesh) are padded to four indices, with
+            // util::INVALID_IDX stored in the last slot.
+            const size_t triCount = polys.numTriangles();
+            for (size_t t = 0; t < triCount; ++t, ++cursor) {
+                const openvdb::Vec3I& tri = polys.triangle(t);
+                dst[cursor] = openvdb::Vec4I(tri[0], tri[1], tri[2],
+                    openvdb::util::INVALID_IDX);
+            }
+
+            polys.clearTriangles();
+        }
+    }
+
+private:
+    const PolygonPoolList& mPrimsIn;        // source pools (cleared while copying)
+    const std::vector<size_t>& mIndexList;  // first output slot of each pool
+    std::vector<Vec4I> * const mPrimsOut;   // destination primitive array
+};
+
+} // namespace internal
+
+
+////////////////////////////////////////
+
+
+/// The normal entry points for level set rebuild are the levelSetRebuild() functions.
+/// doLevelSetRebuild() is mainly for internal use, but when the isovalue and half band
+/// widths are given in ValueType units (for example, if they are queried from
+/// a grid), it might be more convenient to call this function directly.
+///
+/// @internal This overload is enabled only for grids with a scalar, floating-point ValueType.
+template<class GridType, typename InterruptT>
+inline typename boost::enable_if<boost::is_floating_point<typename GridType::ValueType>,
+typename GridType::Ptr>::type
+doLevelSetRebuild(const GridType& grid, typename GridType::ValueType iso,
+    typename GridType::ValueType exWidth, typename GridType::ValueType inWidth,
+    const math::Transform* xform, InterruptT* interrupter)
+{
+    // VolumeToMesh and meshToVolume() are driven with single-precision parameters here.
+    const float isovalue = float(iso);
+    const float exBandWidth = float(exWidth);
+    const float inBandWidth = float(inWidth);
+
+    // Step 1: extract the isosurface of the input grid as a polygon mesh.
+    tools::VolumeToMesh mesher(isovalue);
+    mesher(grid);
+
+    // The rebuilt grid uses the caller's transform when given, else a copy of the input's.
+    math::Transform::Ptr transform;
+    if (xform != NULL) transform = xform->copy();
+    else transform = grid.transform().copy();
+
+    // Step 2: copy the mesh points, transformed to grid space as MeshToVolume requires,
+    // then release the mesher's own point list.
+    std::vector<Vec3s> points(mesher.pointListSize());
+    internal::PointListTransform(mesher.pointList(), points, *transform).runParallel();
+    mesher.pointList().reset(NULL);
+
+    // Step 3: flatten the per-pool polygons into a single primitive array.
+    std::vector<Vec4I> primitives;
+    {
+        PolygonPoolList& pools = mesher.polygonPoolList();
+        const size_t poolCount = mesher.polygonPoolListSize();
+
+        // offsets[n] = index in 'primitives' where pool n's polygons begin (prefix sum).
+        std::vector<size_t> offsets(poolCount);
+        size_t total = 0;
+        for (size_t n = 0; n < poolCount; ++n) {
+            offsets[n] = total;
+            total += pools[n].numQuads() + pools[n].numTriangles();
+        }
+
+        primitives.resize(total);
+        internal::PrimCpy(pools, offsets, primitives).runParallel();
+    }
+
+    // Step 4: convert the mesh back to a volume, passing DISABLE_RENORMALIZATION.
+    QuadAndTriangleDataAdapter<Vec3s, Vec4I> mesh(points, primitives);
+
+    if (interrupter == NULL) {
+        return meshToVolume<GridType>(mesh, *transform, exBandWidth, inBandWidth,
+            DISABLE_RENORMALIZATION, NULL);
+    }
+    return meshToVolume<GridType>(*interrupter, mesh, *transform, exBandWidth, inBandWidth,
+        DISABLE_RENORMALIZATION, NULL);
+}
+
+
+/// @internal Fallback overload, selected when GridType::ValueType is not a floating-point
+/// type: it performs no rebuild and unconditionally throws TypeError.
+template<class GridType, typename InterruptT>
+inline typename boost::disable_if<boost::is_floating_point<typename GridType::ValueType>,
+typename GridType::Ptr>::type
+doLevelSetRebuild(const GridType& /*grid*/, typename GridType::ValueType /*isovalue*/,
+    typename GridType::ValueType /*exWidth*/, typename GridType::ValueType /*inWidth*/,
+    const math::Transform* /*xform*/, InterruptT* /*interrupter*/)
+{
+    OPENVDB_THROW(TypeError,
+        "level set rebuild is supported only for scalar, floating-point grids");
+}
+
+
+////////////////////////////////////////
+
+
+template<class GridType, typename InterruptT>
+inline typename GridType::Ptr
+levelSetRebuild(const GridType& grid, float iso, float exWidth, float inWidth,
+    const math::Transform* xform, InterruptT* interrupter)
+{
+    // Convert the float arguments to the grid's ValueType, then delegate.
+    typedef typename GridType::ValueType ValueT;
+    const ValueT isovalue(zeroVal<ValueT>() + ValueT(iso));
+    const ValueT exBandWidth(zeroVal<ValueT>() + ValueT(exWidth));
+    const ValueT inBandWidth(zeroVal<ValueT>() + ValueT(inWidth));
+
+    return doLevelSetRebuild(grid, isovalue, exBandWidth, inBandWidth, xform, interrupter);
+}
+
+
+template<class GridType>
+inline typename GridType::Ptr
+levelSetRebuild(const GridType& grid, float iso, float exWidth, float inWidth,
+    const math::Transform* xform)
+{
+    // Same conversion as the interruptible overload, but run with a null NullInterrupter.
+    typedef typename GridType::ValueType ValueT;
+    const ValueT isovalue(zeroVal<ValueT>() + ValueT(iso));
+    const ValueT exBandWidth(zeroVal<ValueT>() + ValueT(exWidth));
+    const ValueT inBandWidth(zeroVal<ValueT>() + ValueT(inWidth));
+
+    return doLevelSetRebuild<GridType, util::NullInterrupter>(
+        grid, isovalue, exBandWidth, inBandWidth, xform, NULL);
+}
+
+
+template<class GridType>
+inline typename GridType::Ptr
+levelSetRebuild(const GridType& grid, float iso, float halfVal, const math::Transform* xform)
+{
+    // Symmetric band: halfVal serves as both the exterior and the interior width.
+    typedef typename GridType::ValueType ValueT;
+    const ValueT isovalue(zeroVal<ValueT>() + ValueT(iso));
+    const ValueT halfWidth(zeroVal<ValueT>() + ValueT(halfVal));
+
+    return doLevelSetRebuild<GridType, util::NullInterrupter>(
+        grid, isovalue, halfWidth, halfWidth, xform, NULL);
+}
+
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_LEVELSETREBUILD_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/LevelSetSphere.h b/nuparu/include/openvdb_new/tools/LevelSetSphere.h
new file mode 100644
index 00000000..2b87d1a9
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/LevelSetSphere.h
@@ -0,0 +1,222 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+///
+/// @file LevelSetSphere.h
+///
+/// @brief Generate a narrow-band level set of a sphere.
+///
+/// @note By definition a level set has a fixed narrow band width
+/// (the half width is defined by LEVEL_SET_HALF_WIDTH in Types.h),
+/// whereas an SDF can have a variable narrow band width.
+
+#ifndef OPENVDB_TOOLS_LEVELSETSPHERE_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_LEVELSETSPHERE_HAS_BEEN_INCLUDED
+
+#include <openvdb/Grid.h>
+#include <openvdb/Types.h>
+#include <openvdb/math/Math.h>
+#include <openvdb/util/NullInterrupter.h>
+#include <boost/utility.hpp>
+#include <boost/type_traits/is_floating_point.hpp>
+#include "SignedFloodFill.h"
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+/// @brief Return a grid of type @c GridType containing a narrow-band level set
+/// representation of a sphere.
+///
+/// @param radius       radius of the sphere in world units
+/// @param center       center of the sphere in world units
+/// @param voxelSize    voxel size in world units
+/// @param halfWidth    half the width of the narrow band, in voxel units
+/// @param interrupt    a pointer adhering to the util::NullInterrupter interface
+///
+/// @throw ValueError if radius <= 0, if voxelSize is not > 0, or if halfWidth is not > 1
+/// @note @c GridType::ValueType must be a floating-point scalar.
+/// @note The leapfrog algorithm employed in this method is best suited for a single large
+/// sphere.  For multiple small spheres consider the faster algorithm in ParticlesToLevelSet.h
+template<typename GridType, typename InterruptT>
+typename GridType::Ptr
+createLevelSetSphere(float radius, const openvdb::Vec3f& center, float voxelSize,
+                     float halfWidth = float(LEVEL_SET_HALF_WIDTH), InterruptT* interrupt =  NULL);
+
+/// @brief Convenience overload of createLevelSetSphere() for callers that need no
+/// interrupter: returns a narrow-band level set representation of a sphere.
+///
+/// @param radius       radius of the sphere in world units
+/// @param center       center of the sphere in world units
+/// @param voxelSize    voxel size in world units
+/// @param halfWidth    half the width of the narrow band, in voxel units
+///
+/// @note @c GridType::ValueType must be a floating-point scalar.
+/// @note The leapfrog algorithm employed in this method is best suited for a single
+/// large sphere.  For multiple small spheres see ParticlesToLevelSet.h
+template<typename GridType>
+typename GridType::Ptr
+createLevelSetSphere(float radius, const openvdb::Vec3f& center, float voxelSize,
+                     float halfWidth = float(LEVEL_SET_HALF_WIDTH))
+{
+    return createLevelSetSphere<GridType, util::NullInterrupter>(
+        radius, center, voxelSize, halfWidth, /*interrupt=*/NULL);
+}
+
+
+////////////////////////////////////////
+
+
+/// @brief Generates a signed distance field (or narrow band level
+/// set) to a single sphere.
+///
+/// @note The leapfrog algorithm employed in this class is best
+/// suited for a single large sphere. For multiple small spheres consider
+/// using the faster algorithm in tools/ParticlesToLevelSet.h
+template<typename GridT, typename InterruptT = util::NullInterrupter>
+class LevelSetSphere
+{
+public:
+    typedef typename GridT::ValueType   ValueT;
+    typedef typename math::Vec3<ValueT> Vec3T;
+    BOOST_STATIC_ASSERT(boost::is_floating_point<ValueT>::value);
+
+    /// @brief Constructor
+    ///
+    /// @param radius radius of the sphere in world units
+    /// @param center center of the sphere in world units
+    /// @param interrupt pointer to optional interrupter. Use template
+    /// argument util::NullInterrupter if no interruption is desired.
+    ///
+    /// @throw ValueError if @a radius is not positive
+    /// @note If the radius of the sphere is smaller than 1.5*voxelSize, i.e. the sphere
+    /// is smaller than the Nyquist frequency of the grid, it is ignored!
+    LevelSetSphere(ValueT radius, const Vec3T &center, InterruptT* interrupt = NULL)
+        : mRadius(radius), mCenter(center), mInterrupt(interrupt)
+    {
+        if (mRadius<=0) OPENVDB_THROW(ValueError, "radius must be positive");
+    }
+
+    /// @return a narrow-band level set of the sphere
+    /// @throw ValueError if @a voxelSize is not > 0 or @a halfWidth is not > 1
+    /// @param voxelSize  Size of voxels in world units
+    /// @param halfWidth  Half-width of narrow-band in voxel units
+    typename GridT::Ptr getLevelSet(ValueT voxelSize, ValueT halfWidth)
+    {
+        mGrid = createLevelSet<GridT>(voxelSize, halfWidth);
+        this->rasterSphere(voxelSize, halfWidth);
+        mGrid->setGridClass(GRID_LEVEL_SET);
+        return mGrid;
+    }
+
+private:
+    void rasterSphere(ValueT dx, ValueT w)
+    {
+        if (!(dx>0.0f)) OPENVDB_THROW(ValueError, "voxel size must be positive");
+        if (!(w>1)) OPENVDB_THROW(ValueError, "half-width must be larger than one");
+
+        // Define radius of sphere and narrow-band in voxel units
+        const ValueT r0 = mRadius/dx, rmax = r0 + w;
+
+        // Radius below the Nyquist frequency: nothing is rasterized
+        if (r0 < 1.5f)  return;
+
+        // Define center of sphere in voxel units
+        const Vec3T c(mCenter[0]/dx, mCenter[1]/dx, mCenter[2]/dx);
+
+        // Define index coordinates and their respective bounds (m is the stride in k)
+        openvdb::Coord ijk;
+        int &i = ijk[0], &j = ijk[1], &k = ijk[2], m=1;
+        const int imin=math::Floor(c[0]-rmax), imax=math::Ceil(c[0]+rmax);
+        const int jmin=math::Floor(c[1]-rmax), jmax=math::Ceil(c[1]+rmax);
+        const int kmin=math::Floor(c[2]-rmax), kmax=math::Ceil(c[2]+rmax);
+
+        // Allocate a ValueAccessor for accelerated random access
+        typename GridT::Accessor accessor = mGrid->getAccessor();
+
+        if (mInterrupt) mInterrupt->start("Generating level set of sphere");
+        // Compute signed distances to sphere using leapfrogging in k
+        for ( i = imin; i <= imax; ++i ) {
+            if (util::wasInterrupted(mInterrupt)) { mInterrupt->end(); return; } // balance start()
+            const ValueT x2 = math::Pow2(i - c[0]);
+            for ( j = jmin; j <= jmax; ++j ) {
+                const ValueT x2y2 = math::Pow2(j - c[1]) + x2;
+                for (k=kmin; k<=kmax; k += m) {
+                    m = 1;
+                    // Signed distance to the sphere in voxel units, kept in ValueT precision
+                    const ValueT v = math::Sqrt(x2y2 + math::Pow2(k-c[2]))-r0,
+                        d = math::Abs(v);
+                    if ( d < w ){ // inside narrow band
+                        accessor.setValue(ijk, dx*v);// distance in world units
+                    } else {// outside narrow band
+                        m += math::Floor(d-w);// leapfrog: skip voxels that cannot reach the band
+                    }
+                }//end leapfrog over k
+            }//end loop over j
+        }//end loop over i
+
+        // Define consistent signed distances outside the narrow-band
+        tools::signedFloodFill(mGrid->tree());
+
+        if (mInterrupt) mInterrupt->end();
+    }
+
+    const ValueT        mRadius;
+    const Vec3T         mCenter;
+    InterruptT*         mInterrupt;
+    typename GridT::Ptr mGrid;
+};// LevelSetSphere
+
+
+////////////////////////////////////////
+
+
+template<typename GridType, typename InterruptT>
+typename GridType::Ptr
+createLevelSetSphere(float radius, const openvdb::Vec3f& center, float voxelSize,
+    float halfWidth, InterruptT* interrupt)
+{
+    typedef typename GridType::ValueType ValueT;
+    typedef LevelSetSphere<GridType, InterruptT> SphereFactoryT;
+    // Compile-time guard: GridType::ValueType is required to be a floating-point scalar.
+    BOOST_STATIC_ASSERT(boost::is_floating_point<ValueT>::value);
+    SphereFactoryT sphere(ValueT(radius), center, interrupt);
+    return sphere.getLevelSet(ValueT(voxelSize), ValueT(halfWidth));
+}
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_LEVELSETSPHERE_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/LevelSetTracker.h b/nuparu/include/openvdb_new/tools/LevelSetTracker.h
new file mode 100644
index 00000000..1d4acd8a
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/LevelSetTracker.h
@@ -0,0 +1,650 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @author Ken Museth
+///
+/// @file LevelSetTracker.h
+///
+/// @brief Performs multi-threaded interface tracking of narrow band
+/// level sets. This is the building-block for most level set
+/// computations that involve dynamic topology, e.g. advection.
+
+#ifndef OPENVDB_TOOLS_LEVEL_SET_TRACKER_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_LEVEL_SET_TRACKER_HAS_BEEN_INCLUDED
+
+#include <tbb/parallel_for.h>
+#include <boost/bind.hpp>
+#include <boost/function.hpp>
+#include <boost/type_traits/is_floating_point.hpp>
+#include <openvdb/Types.h>
+#include <openvdb/math/Math.h>
+#include <openvdb/math/FiniteDifference.h>
+#include <openvdb/math/Operators.h>
+#include <openvdb/math/Stencils.h>
+#include <openvdb/math/Transform.h>
+#include <openvdb/Grid.h>
+#include <openvdb/util/NullInterrupter.h>
+#include <openvdb/tree/ValueAccessor.h>
+#include <openvdb/tree/LeafManager.h>
+#include "ChangeBackground.h"// for changeLevelSetBackground
+#include "Morphology.h"//for dilateActiveValues
+#include "Prune.h"// for pruneLevelSet
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+/// @brief Performs multi-threaded interface tracking of narrow band level sets
+template<typename GridT, typename InterruptT = util::NullInterrupter>
+class LevelSetTracker
+{
+public:
+    typedef GridT                                GridType;
+    typedef typename GridT::TreeType             TreeType;
+    typedef typename TreeType::LeafNodeType      LeafType;
+    typedef typename TreeType::ValueType         ValueType;
+    typedef typename tree::LeafManager<TreeType> LeafManagerType; // leafs + buffers
+    typedef typename LeafManagerType::LeafRange  LeafRange;
+    typedef typename LeafManagerType::BufferType BufferType;
+    typedef typename TreeType::template ValueConverter<ValueMask>::Type MaskTreeType;
+    BOOST_STATIC_ASSERT(boost::is_floating_point<ValueType>::value);
+
+    /// Lightweight struct that stores the state of the LevelSetTracker
+    struct State {
+        State(math::BiasedGradientScheme s = math::HJWENO5_BIAS,
+              math::TemporalIntegrationScheme t = math::TVD_RK1,
+              int n = static_cast<int>(LEVEL_SET_HALF_WIDTH), int g = 1)
+            : spatialScheme(s), temporalScheme(t), normCount(n), grainSize(g) {}
+        math::BiasedGradientScheme      spatialScheme;
+        math::TemporalIntegrationScheme temporalScheme;
+        int                             normCount;// Number of iterations of normalization
+        int                             grainSize;// Grain size for multi-threading (<= 0 disables it)
+    };
+
+    /// @brief Main constructor
+    /// @throw RuntimeError if the grid is not a level set or has non-uniform voxels
+    LevelSetTracker(GridT& grid, InterruptT* interrupt = NULL);
+
+    virtual ~LevelSetTracker() { delete mLeafs; }
+
+    /// @brief Iterative normalization, i.e. solving the Eikonal equation
+    /// @note The mask is optional and by default it is ignored.
+    template <typename MaskType>
+    void normalize(const MaskType* mask);
+
+    /// @brief Iterative normalization, i.e. solving the Eikonal equation
+    void normalize() { this->normalize<MaskTreeType>(NULL); }
+
+    /// @brief Track the level set interface, i.e. rebuild and normalize the
+    /// narrow band of the level set.
+    void track();
+
+    /// @brief Remove voxels that are outside the narrow band. (substep of track)
+    void prune();
+
+    /// @brief Fast but approximate dilation of the narrow band - one
+    /// layer at a time. Normally we recommend using the resize method below
+    /// which internally calls dilate (or erode) with the correct
+    /// number of @a iterations to achieve the desired half voxel width
+    /// of the narrow band (3 is recommended for most level set applications).
+    ///
+    /// @note Since many level set applications perform
+    /// interface-tracking, which in turn rebuilds the narrow-band
+    /// accurately, this dilate method can often be used with a
+    /// single iteration of low-order re-normalization. This
+    /// effectively allows very narrow bands to be created from points
+    /// or polygons (e.g. with a half voxel width of 1), followed by a
+    /// fast but approximate dilation (typically with a half voxel
+    /// width of 3). This can be significantly faster than generating
+    /// the final width of the narrow band from points or polygons.
+    void dilate(int iterations = 1);
+
+    /// @brief Erodes the width of the narrow-band and update the background values
+    /// @throw ValueError if @a iterations is larger than the current half-width.
+    void erode(int iterations = 1);
+
+    /// @brief Resize the narrow band by dilation (with renormalization) or erosion, as
+    /// required. @return true if the rounded-down half width differed from @a halfWidth.
+    bool resize(Index halfWidth = static_cast<Index>(LEVEL_SET_HALF_WIDTH));
+
+    /// @brief Return the half width of the narrow band in floating-point voxel units.
+    ValueType getHalfWidth() const { return mGrid->background()/mDx; }
+
+    /// @brief Return the state of the tracker (see struct defined above)
+    State getState() const { return mState; }
+
+    /// @brief Set the state of the tracker (see struct defined above)
+    void setState(const State& s) { mState =s; }
+
+    /// @return the spatial finite difference scheme
+    math::BiasedGradientScheme getSpatialScheme() const { return mState.spatialScheme; }
+
+    /// @brief Set the spatial finite difference scheme
+    void setSpatialScheme(math::BiasedGradientScheme scheme) { mState.spatialScheme = scheme; }
+
+    /// @return the temporal integration scheme
+    math::TemporalIntegrationScheme getTemporalScheme() const { return mState.temporalScheme; }
+
+    /// @brief Set the temporal integration scheme
+    void setTemporalScheme(math::TemporalIntegrationScheme scheme) { mState.temporalScheme = scheme;}
+
+    /// @return The number of normalizations performed per track or
+    /// normalize call.
+    int  getNormCount() const { return mState.normCount; }
+
+    /// @brief Set the number of normalizations performed per track or
+    /// normalize call.
+    void setNormCount(int n) { mState.normCount = n; }
+
+    /// @return the grain-size used for multi-threading
+    int  getGrainSize() const { return mState.grainSize; }
+
+    /// @brief Set the grain-size used for multi-threading.
+    /// @note A grainsize of 0 or less disables multi-threading!
+    void setGrainSize(int grainsize) { mState.grainSize = grainsize; }
+    /// @return the voxel size cached at construction (first component of grid.voxelSize())
+    ValueType voxelSize() const { return mDx; }
+    /// @brief Forward @a msg to the interrupter's start(), if an interrupter was supplied
+    void startInterrupter(const char* msg);
+    /// @brief Call the interrupter's end(), if an interrupter was supplied
+    void endInterrupter();
+
+    /// @return false if the process was interrupted
+    bool checkInterrupter();
+    /// @return the grid being tracked (read-only)
+    const GridType& grid() const { return *mGrid; }
+    /// @return the LeafManager over the grid's tree
+    LeafManagerType& leafs() { return *mLeafs; }
+
+    const LeafManagerType& leafs() const { return *mLeafs; }
+
+private:
+
+    // disallow copy construction and copy by assignment!
+    LevelSetTracker(const LevelSetTracker&);// not implemented
+    LevelSetTracker& operator=(const LevelSetTracker&);// not implemented
+
+    // Private class to perform multi-threaded trimming of
+    // voxels that are too far away from the zero-crossing.
+    struct Trim
+    {
+        Trim(LevelSetTracker& tracker) : mTracker(tracker) {}
+        void trim();
+        void operator()(const LeafRange& r) const;
+        LevelSetTracker& mTracker;
+    };// Trim
+
+    // Private struct to perform multi-threaded normalization
+    template<math::BiasedGradientScheme      SpatialScheme,
+             math::TemporalIntegrationScheme TemporalScheme,
+             typename MaskT>
+    struct Normalizer
+    {
+        typedef math::BIAS_SCHEME<SpatialScheme>                             SchemeT;
+        typedef typename SchemeT::template ISStencil<GridType>::StencilType  StencilT;
+        typedef typename MaskT::LeafNodeType MaskLeafT;
+        typedef typename MaskLeafT::ValueOnCIter MaskIterT;
+        typedef typename LeafType::ValueOnCIter VoxelIterT;
+        Normalizer(LevelSetTracker& tracker, const MaskT* mask);
+        void normalize();
+        void operator()(const LeafRange& r) const {mTask(const_cast<Normalizer*>(this), r);}
+        void cook(const char* msg, int swapBuffer=0);
+        template <int Nominator, int Denominator>
+        void euler(const LeafRange& range, Index phiBuffer, Index resultBuffer);
+        inline void euler01(const LeafRange& r) {this->euler<0,1>(r, 0, 1);}
+        inline void euler12(const LeafRange& r) {this->euler<1,2>(r, 1, 1);}
+        inline void euler34(const LeafRange& r) {this->euler<3,4>(r, 1, 2);}
+        inline void euler13(const LeafRange& r) {this->euler<1,3>(r, 1, 2);}
+        template <int Nominator, int Denominator>
+        void eval(StencilT& stencil, const ValueType* phi, ValueType* result, Index n) const;
+        LevelSetTracker& mTracker;
+        const MaskT*     mMask;
+        const ValueType  mDt, mInvDx;
+        typename boost::function<void (Normalizer*, const LeafRange&)> mTask;
+    }; // Normalizer struct
+
+    template<math::BiasedGradientScheme SpatialScheme, typename MaskT>
+    void normalize1(const MaskT* mask);
+
+    template<math::BiasedGradientScheme SpatialScheme,
+             math::TemporalIntegrationScheme TemporalScheme, typename MaskT>
+    void normalize2(const MaskT* mask);
+
+    // Throughout the methods below mLeafs is always assumed to contain
+    // a list of the current LeafNodes! The auxiliary buffers on the
+    // other hand always have to be allocated locally, since some
+    // methods need them and others don't!
+    GridType*        mGrid;        // not owned: points at the grid passed to the constructor
+    LeafManagerType* mLeafs;       // owned: deleted in the destructor
+    InterruptT*      mInterrupter; // may be NULL
+    const ValueType  mDx;          // voxel size, see voxelSize()
+    State            mState;
+}; // end of LevelSetTracker class
+
+template<typename GridT, typename InterruptT>
+LevelSetTracker<GridT, InterruptT>::LevelSetTracker(GridT& grid, InterruptT* interrupt):
+    mGrid(&grid),
+    mLeafs(NULL), // allocated at the end of the body, once the grid has been validated
+    mInterrupter(interrupt),
+    mDx(static_cast<ValueType>(grid.voxelSize()[0])),
+    mState()
+{
+    if ( !grid.hasUniformVoxels() ) {
+         OPENVDB_THROW(RuntimeError,
+             "The transform must have uniform scale for the LevelSetTracker to function");
+    }
+    if ( grid.getGridClass() != GRID_LEVEL_SET) {
+        OPENVDB_THROW(RuntimeError,
+            "LevelSetTracker expected a level set, got a grid of class \""
+            + grid.gridClassToString(grid.getGridClass())
+            + "\" [hint: Grid::setGridClass(openvdb::GRID_LEVEL_SET)]");
+    }
+    mLeafs = new LeafManagerType(grid.tree()); // last: the dtor never runs if this ctor throws
+}
+
+template<typename GridT, typename InterruptT>
+inline void
+LevelSetTracker<GridT, InterruptT>::
+prune()
+{
+    this->startInterrupter("Pruning Level Set");
+
+    // Step 1: trim voxels that are too far away from the zero-crossing.
+    Trim(*this).trim();
+
+    // Step 2: remove inactive nodes from the tree.
+    tools::pruneLevelSet(mGrid->tree());
+
+    // Step 3: the tree topology has changed, so rebuild the list of leafs.
+    mLeafs->rebuildLeafArray();
+
+    this->endInterrupter();
+}
+
+template<typename GridT, typename InterruptT>
+inline void
+LevelSetTracker<GridT, InterruptT>::
+track()
+{
+    // Grow the narrow band by one face-adjacent layer (this also rebuilds the leaf array!)
+    tools::dilateActiveValues(this->leafs(), 1, tools::NN_FACE, tools::IGNORE_TILES);
+
+    // Recompute signed distances over the dilated band (normalization without a mask)
+    this->template normalize<MaskTreeType>(NULL);
+
+    // Drop the voxels that are outside the narrow band
+    this->prune();
+}
+
+template<typename GridT, typename InterruptT>
+inline void
+LevelSetTracker<GridT, InterruptT>::
+dilate(int iterations)
+{
+    if (this->getNormCount() != 0) { // renormalize the voxels added by each pass
+        for (int pass = iterations; pass > 0; --pass) {
+            const MaskTreeType before(mGrid->tree(), false, TopologyCopy());
+            tools::dilateActiveValues(this->leafs(), 1, tools::NN_FACE, tools::IGNORE_TILES);
+            tools::changeLevelSetBackground(this->leafs(), mDx + mGrid->background());
+            MaskTreeType added(mGrid->tree(), false, TopologyCopy());
+            added.topologyDifference(before); // keep only what this pass activated
+            this->normalize(&added);
+        }
+    } else { // normalization disabled: only grow the band and its background
+        for (int pass = iterations; pass > 0; --pass) {
+            tools::dilateActiveValues(this->leafs(), 1, tools::NN_FACE, tools::IGNORE_TILES);
+            tools::changeLevelSetBackground(this->leafs(), mDx + mGrid->background());
+        }
+    }
+}
+
+template<typename GridT, typename InterruptT>
+inline void
+LevelSetTracker<GridT, InterruptT>::
+erode(int iterations)
+{
+    LeafManagerType& leafManager = this->leafs();
+    tools::erodeVoxels(leafManager, iterations);
+    leafManager.rebuildLeafArray(); // refresh the leaf list after the erosion
+    tools::changeLevelSetBackground(leafManager, mGrid->background() - iterations*mDx);
+}
+
+template<typename GridT, typename InterruptT>
+inline bool
+LevelSetTracker<GridT, InterruptT>::
+resize(Index halfWidth)
+{
+    // Compare the current half width (rounded down to whole voxels) with the request.
+    const int current = static_cast<int>(math::RoundDown(this->getHalfWidth()));
+    const int target = static_cast<int>(halfWidth);
+    if (current == target) return false; // nothing to do
+
+    if (current < target) this->dilate(target - current);
+    else this->erode(current - target);
+    return true;
+}
+
+template<typename GridT, typename InterruptT>
+inline void
+LevelSetTracker<GridT, InterruptT>::
+startInterrupter(const char* msg)
+{
+    if (mInterrupter != NULL) mInterrupter->start(msg); // no-op without an interrupter
+}
+
+template<typename GridT, typename InterruptT>
+inline void
+LevelSetTracker<GridT, InterruptT>::
+endInterrupter()
+{
+    if (mInterrupter != NULL) mInterrupter->end(); // no-op without an interrupter
+}
+
+template<typename GridT, typename InterruptT>
+inline bool
+LevelSetTracker<GridT, InterruptT>::
+checkInterrupter()
+{
+    const bool interrupted = util::wasInterrupted(mInterrupter);
+    // On interruption, also cancel the TBB task group of the calling task.
+    if (interrupted) tbb::task::self().cancel_group_execution();
+
+    return !interrupted;
+}
+
+template<typename GridT, typename InterruptT>
+template<typename MaskT>
+inline void
+LevelSetTracker<GridT, InterruptT>::
+normalize(const MaskT* mask)
+{
+    const math::BiasedGradientScheme scheme = this->getSpatialScheme(); // run-time choice
+    if (scheme == math::FIRST_BIAS) {
+        this->normalize1<math::FIRST_BIAS, MaskT>(mask);
+    } else if (scheme == math::SECOND_BIAS) {
+        this->normalize1<math::SECOND_BIAS, MaskT>(mask);
+    } else if (scheme == math::THIRD_BIAS) {
+        this->normalize1<math::THIRD_BIAS, MaskT>(mask);
+    } else if (scheme == math::WENO5_BIAS) {
+        this->normalize1<math::WENO5_BIAS, MaskT>(mask);
+    } else if (scheme == math::HJWENO5_BIAS) {
+        this->normalize1<math::HJWENO5_BIAS, MaskT>(mask);
+    } else {
+        OPENVDB_THROW(ValueError, "Spatial difference scheme not supported!");
+    }
+}
+
+template<typename GridT, typename InterruptT>
+template<math::BiasedGradientScheme SpatialScheme, typename MaskT>
+inline void
+LevelSetTracker<GridT, InterruptT>::
+normalize1(const MaskT* mask)
+{
+    const math::TemporalIntegrationScheme scheme = this->getTemporalScheme(); // run-time choice
+    if (scheme == math::TVD_RK1) {
+        this->normalize2<SpatialScheme, math::TVD_RK1, MaskT>(mask);
+    } else if (scheme == math::TVD_RK2) {
+        this->normalize2<SpatialScheme, math::TVD_RK2, MaskT>(mask);
+    } else if (scheme == math::TVD_RK3) {
+        this->normalize2<SpatialScheme, math::TVD_RK3, MaskT>(mask);
+    } else {
+        OPENVDB_THROW(ValueError, "Temporal integration scheme not supported!");
+    }
+}
+
+template<typename GridT, typename InterruptT>
+template<math::BiasedGradientScheme SpatialScheme,
+         math::TemporalIntegrationScheme TemporalScheme,
+         typename MaskT>
+inline void
+LevelSetTracker<GridT, InterruptT>::
+normalize2(const MaskT* mask)
+{
+    typedef Normalizer<SpatialScheme, TemporalScheme, MaskT> NormalizerT;
+    NormalizerT(*this, mask).normalize(); // both schemes are now compile-time constants
+}
+
+////////////////////////////////////////////////////////////////////////////
+
+/// @brief Apply this Trim functor to the range returned by mTracker.leafs().leafRange().
+/// @details Runs through tbb::parallel_for when the tracker's grain size is
+/// positive and directly on the calling thread otherwise.
+template<typename GridT, typename InterruptT>
+inline void
+LevelSetTracker<GridT, InterruptT>::
+Trim::trim()
+{
+    const int grain = mTracker.getGrainSize();
+    const LeafRange leaves = mTracker.leafs().leafRange(grain);
+
+    // Both operands are void expressions, so the conditional is only a compact if/else.
+    grain > 0 ? tbb::parallel_for(leaves, *this) : (*this)(leaves);
+}
+
+/// @brief Prunes away voxels that have moved outside the narrow band.
+/// @details An active voxel with value <= -background (or >= +background) is
+/// switched off and assigned -background (or +background).
+template<typename GridT, typename InterruptT>
+inline void
+LevelSetTracker<GridT, InterruptT>::
+Trim::operator()(const LeafRange& range) const
+{
+    typedef typename LeafType::ValueOnIter ActiveIterT;
+    mTracker.checkInterrupter(); // the returned flag is not inspected here
+    const ValueType bg = mTracker.mGrid->background();
+
+    for (typename LeafRange::Iterator leafIt = range.begin(); leafIt; ++leafIt) {
+        LeafType& node = *leafIt;
+        for (ActiveIterT voxel = node.beginValueOn(); voxel; ++voxel) {
+            const ValueType phi = *voxel;
+            if (phi <= -bg)     node.setValueOff(voxel.pos(), -bg);
+            else if (phi >= bg) node.setValueOff(voxel.pos(),  bg);
+        }
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////
+
+/// @brief Bind the normalizer to @a tracker and @a mask. The time step mDt is the voxel
+/// size scaled by 0.3 (TVD_RK1), 0.9 (TVD_RK2) or 1.0 (any other temporal scheme).
+template<typename GridT, typename InterruptT>
+template<math::BiasedGradientScheme SpatialScheme,
+         math::TemporalIntegrationScheme TemporalScheme, typename MaskT>
+inline
+LevelSetTracker<GridT, InterruptT>::
+Normalizer<SpatialScheme, TemporalScheme, MaskT>::
+Normalizer(LevelSetTracker& tracker, const MaskT* mask):
+    mTracker(tracker),
+    mMask(mask),
+    mDt(tracker.voxelSize()*(TemporalScheme == math::TVD_RK1 ? 0.3f :
+                             TemporalScheme == math::TVD_RK2 ? 0.9f : 1.0f)),
+    mInvDx(1.0f/tracker.voxelSize()),
+    mTask(0)
+{}
+
+/// @brief Perform getNormCount() normalization passes with the compile-time temporal scheme.
+/// @details Requests one auxiliary leaf buffer (two for TVD_RK3) up front, runs the
+/// Euler sub-steps of each pass through cook(), and finally removes the auxiliary buffers.
+template<typename GridT, typename InterruptT>
+template<math::BiasedGradientScheme SpatialScheme,
+         math::TemporalIntegrationScheme TemporalScheme, typename MaskT>
+inline void
+LevelSetTracker<GridT, InterruptT>::
+Normalizer<SpatialScheme, TemporalScheme, MaskT>::
+normalize()
+{
+    // Make sure we have enough temporal auxiliary buffers
+    mTracker.mLeafs->rebuildAuxBuffers(TemporalScheme == math::TVD_RK3 ? 2 : 1);
+    for (int n=0, e=mTracker.getNormCount(); n < e; ++n) {
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        switch(TemporalScheme) {//switch is resolved at compile-time
+        case math::TVD_RK1:
+            // Perform one explicit Euler step: t1 = t0 + dt
+            // Phi_t1(0) = Phi_t0(0) - dt * VdotG_t0(1)
+            mTask = boost::bind(&Normalizer::euler01, _1, _2);
+
+            // Cook and swap buffer 0 and 1 such that Phi_t1(0) and Phi_t0(1)
+            this->cook("Normalizing level set using TVD_RK1", 1);
+            break;
+        case math::TVD_RK2:
+            // Perform one explicit Euler step: t1 = t0 + dt
+            // Phi_t1(1) = Phi_t0(0) - dt * VdotG_t0(1)
+            mTask = boost::bind(&Normalizer::euler01, _1, _2);
+
+            // Cook and swap buffer 0 and 1 such that Phi_t1(0) and Phi_t0(1)
+            this->cook("Normalizing level set using TVD_RK2 (step 1 of 2)", 1);
+
+            // Convex combine explicit Euler step: t2 = t0 + dt
+            // Phi_t2(1) = 1/2 * Phi_t0(1) + 1/2 * (Phi_t1(0) - dt * V.Grad_t1(0))
+            mTask = boost::bind(&Normalizer::euler12, _1, _2);
+
+            // Cook and swap buffer 0 and 1 such that Phi_t2(0) and Phi_t1(1)
+            this->cook("Normalizing level set using TVD_RK2 (step 2 of 2)", 1);
+            break;
+        case math::TVD_RK3:
+            // Perform one explicit Euler step: t1 = t0 + dt
+            // Phi_t1(1) = Phi_t0(0) - dt * VdotG_t0(1)
+            mTask = boost::bind(&Normalizer::euler01, _1, _2);
+
+            // Cook and swap buffer 0 and 1 such that Phi_t1(0) and Phi_t0(1)
+            this->cook("Normalizing level set using TVD_RK3 (step 1 of 3)", 1);
+
+            // Convex combine explicit Euler step: t2 = t0 + dt/2
+            // Phi_t2(2) = 3/4 * Phi_t0(1) + 1/4 * (Phi_t1(0) - dt * V.Grad_t1(0))
+            mTask = boost::bind(&Normalizer::euler34, _1, _2);
+
+            // Cook and swap buffer 0 and 2 such that Phi_t2(0) and Phi_t1(2)
+            this->cook("Normalizing level set using TVD_RK3 (step 2 of 3)", 2);
+
+            // Convex combine explicit Euler step: t3 = t0 + dt
+            // Phi_t3(2) = 1/3 * Phi_t0(1) + 2/3 * (Phi_t2(0) - dt * V.Grad_t2(0))
+            mTask = boost::bind(&Normalizer::euler13, _1, _2);
+
+            // Cook and swap buffer 0 and 2 such that Phi_t3(0) and Phi_t2(2)
+            this->cook("Normalizing level set using TVD_RK3 (step 3 of 3)", 2);
+            break;
+        default:
+            OPENVDB_THROW(ValueError, "Temporal integration scheme not supported!");
+        }
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+    mTracker.mLeafs->removeAuxBuffers();
+}
+
+/// @brief Private method that performs the current task (serial or threaded) between
+/// startInterrupter(msg) and endInterrupter(), and swaps the leaf buffers after the task.
+template<typename GridT, typename InterruptT>
+template<math::BiasedGradientScheme      SpatialScheme,
+         math::TemporalIntegrationScheme TemporalScheme, typename MaskT>
+inline void
+LevelSetTracker<GridT, InterruptT>::
+Normalizer<SpatialScheme, TemporalScheme, MaskT>::
+cook(const char* msg, int swapBuffer)
+{
+    mTracker.startInterrupter(msg);
+    const int grain = mTracker.getGrainSize();
+    const LeafRange leaves = mTracker.leafs().leafRange(grain);
+    if (grain > 0) {
+        tbb::parallel_for(leaves, *this);
+    } else {
+        (*this)(leaves);
+    }
+    // The second argument is true only when the grain size is exactly zero.
+    mTracker.leafs().swapLeafBuffer(swapBuffer, grain == 0);
+    mTracker.endInterrupter();
+}
+
+/// @brief Compute the update for voxel offset @a n from @a stencil and store it in result[n].
+/// @details If Nominator != 0 the update v is blended: alpha*phi[n] + (1-alpha)*v, alpha = Nominator/Denominator.
+template<typename GridT, typename InterruptT>
+template<math::BiasedGradientScheme      SpatialScheme,
+         math::TemporalIntegrationScheme TemporalScheme, typename MaskT>
+template <int Nominator, int Denominator>
+inline void
+LevelSetTracker<GridT, InterruptT>::
+Normalizer<SpatialScheme, TemporalScheme, MaskT>::
+eval(StencilT& stencil, const ValueType* phi, ValueType* result, Index n) const
+{
+    static const ValueType alpha = ValueType(Nominator)/ValueType(Denominator);
+    static const ValueType beta  = ValueType(1) - alpha;
+    const ValueType gradSq = math::ISGradientNormSqrd<SpatialScheme>::result(stencil);
+    const ValueType center = stencil.getValue();
+    const ValueType ratio = center / ( math::Sqrt(math::Pow2(center) + gradSq) +
+                                       math::Tolerance<ValueType>::value() );
+    const ValueType stepped = center - mDt * ratio * (math::Sqrt(gradSq) * mInvDx - 1.0f);
+    if (Nominator) result[n] = alpha * phi[n] + beta * stepped;
+    else           result[n] = stepped;
+}
+
+/// @brief Apply eval<Nominator, Denominator>() to the leaves in @a range, reading
+/// from leaf buffer @a phiBuffer and writing to leaf buffer @a resultBuffer.
+template<typename GridT, typename InterruptT>
+template<math::BiasedGradientScheme      SpatialScheme,
+         math::TemporalIntegrationScheme TemporalScheme, typename MaskT>
+template <int Nominator, int Denominator>
+inline void
+LevelSetTracker<GridT,InterruptT>::
+Normalizer<SpatialScheme, TemporalScheme, MaskT>::
+euler(const LeafRange& range, Index phiBuffer, Index resultBuffer)
+{
+    typedef typename LeafType::ValueOnCIter ActiveIterT;
+    mTracker.checkInterrupter();
+    StencilT stencil(mTracker.grid());
+    for (typename LeafRange::Iterator leafIt = range.begin(); leafIt; ++leafIt) {
+        const ValueType* src = leafIt.buffer(phiBuffer).data();
+        ValueType* dst = leafIt.buffer(resultBuffer).data();
+        if (mMask == NULL) {// no mask: visit every active voxel of the level set leaf
+            for (ActiveIterT voxel = leafIt->cbeginValueOn(); voxel; ++voxel) {
+                stencil.moveTo(voxel);
+                this->eval<Nominator, Denominator>(stencil, src, dst, voxel.pos());
+            }
+            continue;
+        }
+        const MaskLeafT* maskLeaf = mMask->probeLeaf(leafIt->origin());
+        if (maskLeaf == NULL) continue;// the mask has no leaf at this origin
+        const ValueType* center = leafIt->buffer().data();
+        for (MaskIterT m = maskLeaf->cbeginValueOn(); m; ++m) {// active mask voxels only
+            const Index offset = m.pos();
+            stencil.moveTo(m.getCoord(), center[offset]);
+            this->eval<Nominator, Denominator>(stencil, src, dst, offset);
+        }
+    }
+}
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_LEVEL_SET_TRACKER_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/LevelSetUtil.h b/nuparu/include/openvdb_new/tools/LevelSetUtil.h
new file mode 100644
index 00000000..97a91b87
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/LevelSetUtil.h
@@ -0,0 +1,2554 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file tools/LevelSetUtil.h
+///
+/// @brief  Miscellaneous utility methods that operate primarily
+///         or exclusively on level set grids.
+///
+/// @author Mihai Alden
+
+
+#ifndef OPENVDB_TOOLS_LEVEL_SET_UTIL_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_LEVEL_SET_UTIL_HAS_BEEN_INCLUDED
+
+#include "MeshToVolume.h" // for traceExteriorBoundaries
+#include "SignedFloodFill.h" // for signedFloodFillWithValues
+
+#include <openvdb/Types.h>
+#include <openvdb/Grid.h>
+
+#include <tbb/blocked_range.h>
+#include <tbb/parallel_for.h>
+#include <tbb/parallel_reduce.h>
+#include <tbb/parallel_sort.h>
+
+#include <limits>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+// MS Visual C++ requires this extra level of indirection in order to compile
+// THIS MUST EXIST IN AN UNNAMED NAMESPACE IN ORDER TO COMPILE ON WINDOWS
+namespace {
+/// Largest finite value of the grid's value type (default cutoff of sdfToFogVolume()).
+template<typename GridType>
+inline typename GridType::ValueType lsutilGridMax()
+{
+    typedef std::numeric_limits<typename GridType::ValueType> LimitsT;
+    return LimitsT::max();
+}
+/// zeroVal() of the grid's value type (default isovalue of the mask extraction functions).
+template<typename GridType>
+inline typename GridType::ValueType lsutilGridZero()
+{
+    return zeroVal<typename GridType::ValueType>();
+}
+} // unnamed namespace
+
+
+////////////////////////////////////////
+
+
+/// @brief Threaded method to convert a sparse level set/SDF into a sparse fog volume
+///
+/// @details For a level set, the active and negative-valued interior half of the
+/// narrow band becomes a linear ramp from 0 to 1; the inactive interior becomes
+/// active with a constant value of 1; and the exterior, including the background
+/// and the active exterior half of the narrow band, becomes inactive with a constant
+/// value of 0.  The interior, though active, remains sparse.
+/// @details For a generic SDF, a specified cutoff distance determines the width
+/// of the ramp, but otherwise the result is the same as for a level set.
+///
+/// @param grid            level set/SDF grid to transform
+/// @param cutoffDistance  optional world space cutoff distance for the ramp
+///                        (automatically clamped if greater than the interior
+///                        narrow band width)
+template<class GridType>
+inline void
+sdfToFogVolume(
+    GridType& grid,
+    typename GridType::ValueType cutoffDistance = lsutilGridMax<GridType>());
+
+
+/// @brief Threaded method to construct a boolean mask that represents interior regions
+///        in a signed distance field.
+///
+/// @return A shared pointer to either a boolean grid or tree with the same tree
+///         configuration and potentially transform as the input @c volume and whose active
+///         and @c true values correspond to the interior of the input signed distance field.
+///
+/// @param volume               Signed distance field / level set volume.
+/// @param isovalue             Threshold below which values are considered part of the
+///                             interior region.
+template<class GridOrTreeType>
+inline typename GridOrTreeType::template ValueConverter<bool>::Type::Ptr
+sdfInteriorMask(
+    const GridOrTreeType& volume,
+    typename GridOrTreeType::ValueType isovalue = lsutilGridZero<GridOrTreeType>());
+
+
+/// @brief  Extracts the interior regions of a signed distance field and topologically enclosed
+///         (watertight) regions of value greater than the @a isovalue (cavities) that can arise
+///         as the result of CSG union operations between different shapes where at least one of
+///         the shapes has a concavity that is capped.
+///
+///         For example the enclosed region of a capped bottle would include the walls and
+///         the interior cavity.
+///
+/// @return A shared pointer to either a boolean grid or tree with the same tree configuration
+///         and potentially transform as the input @c volume and whose active and @c true values
+///         correspond to the interior and enclosed regions in the input signed distance field.
+///
+/// @param volume       Signed distance field / level set volume.
+/// @param isovalue     Threshold below which values are considered part of the interior region.
+/// @param fillMask     Optional boolean tree; when provided, enclosed cavity regions that are
+///                     not completely filled by this mask are ignored.
+///
+///                     For instance, if the fill mask does not completely fill the bottle in the
+///                     previous example only the walls and cap are returned and the interior
+///                     cavity will be ignored.
+template<typename GridOrTreeType>
+inline typename GridOrTreeType::template ValueConverter<bool>::Type::Ptr
+extractEnclosedRegion(const GridOrTreeType& volume,
+    typename GridOrTreeType::ValueType isovalue = lsutilGridZero<GridOrTreeType>(),
+    const typename TreeAdapter<GridOrTreeType>::TreeType::template ValueConverter<bool>::Type* fillMask = NULL);
+
+
+/// @brief Return a mask of the voxels that intersect the implicit surface with the given @a isovalue.
+///
+/// @param volume       Signed distance field / level set volume.
+/// @param isovalue     The crossing point that is considered the surface.
+template<typename GridOrTreeType>
+inline typename GridOrTreeType::template ValueConverter<bool>::Type::Ptr
+extractIsosurfaceMask(const GridOrTreeType& volume, typename GridOrTreeType::ValueType isovalue);
+
+
+/// @brief Return a mask for each connected component of the given grid's active voxels.
+///
+/// @param volume   Input grid or tree
+/// @param masks    Output set of disjoint active topology masks sorted in descending order
+///                 based on the active voxel count.
+template<typename GridOrTreeType>
+inline void
+extractActiveVoxelSegmentMasks(const GridOrTreeType& volume,
+    std::vector<typename GridOrTreeType::template ValueConverter<bool>::Type::Ptr>& masks);
+
+
+/// @brief  Separates disjoint active topology components into distinct grids or trees.
+///
+/// @details Supports volumes with active tiles.
+///
+/// @param volume       Input grid or tree
+/// @param segments     Output set of disjoint active topology components sorted in
+///                     descending order based on the active voxel count.
+template<typename GridOrTreeType>
+inline void
+segmentActiveVoxels(const GridOrTreeType& volume, std::vector<typename GridOrTreeType::Ptr>& segments);
+
+
+/// @brief  Separates disjoint SDF surfaces into distinct grids or trees.
+///
+/// @details Supports asymmetric interior / exterior narrowband widths and
+///          SDF volumes with dense interior regions.
+///
+/// @param volume       Input signed distance field / level set volume
+/// @param segments     Output set of disjoint SDF surfaces found in @a volume sorted in
+///                     descending order based on the surface intersecting voxel count.
+template<typename GridOrTreeType>
+inline void
+segmentSDF(const GridOrTreeType& volume, std::vector<typename GridOrTreeType::Ptr>& segments);
+
+
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+
+// Internal utility objects and implementation details
+
+
+namespace level_set_util_internal {
+
+
+/// @brief Range functor that creates, for every input leaf, a bool leaf in which
+/// the voxels whose value is below the isovalue are active and @c true.
+/// @details Slot n of the output array stays NULL when leaf n has no such voxel.
+/// The created leaves are allocated with @c new and are not freed by this functor.
+template<typename LeafNodeType>
+struct MaskInteriorVoxels {
+
+    typedef typename LeafNodeType::ValueType                ValueType;
+    typedef tree::LeafNode<bool, LeafNodeType::LOG2DIM>     BoolLeafNodeType;
+
+    MaskInteriorVoxels(
+        ValueType isovalue, const LeafNodeType ** nodes, BoolLeafNodeType ** maskNodes)
+        : mNodes(nodes), mMaskNodes(maskNodes), mIsovalue(isovalue) { }
+
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+
+        // Scratch leaf, recycled from one iteration to the next until it gets an active voxel.
+        BoolLeafNodeType * scratch = NULL;
+
+        for (size_t idx = range.begin(), last = range.end(); idx < last; ++idx) {
+
+            const LeafNodeType& src = *mNodes[idx];
+            mMaskNodes[idx] = NULL;
+
+            if (scratch) scratch->setOrigin(src.origin());
+            else scratch = new BoolLeafNodeType(src.origin(), false);
+
+            const ValueType* data = &src.getValue(0);
+            for (Index i = 0; i < LeafNodeType::SIZE; ++i) {
+                if (data[i] < mIsovalue) scratch->setValueOn(i, true);
+            }
+
+            if (scratch->onVoxelCount() == 0) continue; // nothing marked, keep recycling
+
+            mMaskNodes[idx] = scratch; // publish the populated leaf
+            scratch = NULL;
+        }
+
+        delete scratch; // no-op when the last scratch leaf was published
+    }
+
+    LeafNodeType        const * const * const mNodes;
+    BoolLeafNodeType                 ** const mMaskNodes;
+    ValueType                           const mIsovalue;
+}; // MaskInteriorVoxels
+
+
+/// @brief Range functor that sets to @c true and turns on every entry visited by the mask
+/// nodes' ValueAll iterators where the value looked up in @c tree is below the isovalue.
+template<typename TreeType, typename InternalNodeType>
+struct MaskInteriorTiles {
+    typedef typename TreeType::ValueType    ValueType;
+
+    MaskInteriorTiles(ValueType isovalue, const TreeType& tree, InternalNodeType ** maskNodes)
+        : mTree(&tree), mMaskNodes(maskNodes), mIsovalue(isovalue) { }
+
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+        typedef typename InternalNodeType::ValueAllIter TileIterT;
+        tree::ValueAccessor<const TreeType> srcAcc(*mTree);
+        for (size_t idx = range.begin(), last = range.end(); idx < last; ++idx) {
+            for (TileIterT tile = mMaskNodes[idx]->beginValueAll(); tile; ++tile) {
+                if (!(srcAcc.getValue(tile.getCoord()) < mIsovalue)) continue;
+                tile.setValue(true);
+                tile.setValueOn(true);
+            }
+        }
+    }
+
+    TreeType            const * const mTree;
+    InternalNodeType         ** const mMaskNodes;
+    ValueType                   const mIsovalue;
+}; // MaskInteriorTiles
+
+
+/// @brief tbb::parallel_reduce body that adds leaf nodes to a tree; split copies fill
+/// a private tree that join() merges back into the tree being reduced into.
+/// @details With a non-NULL index map, range element n covers leaves [map[n], map[n+1]);
+/// without one it is leaf n itself. NULL leaf pointers are skipped in both modes.
+template<typename TreeType>
+struct PopulateTree {
+
+    typedef typename TreeType::ValueType    ValueType;
+    typedef typename TreeType::LeafNodeType LeafNodeType;
+
+    PopulateTree(TreeType& tree, LeafNodeType** leafnodes,
+        const size_t * nodexIndexMap, ValueType background)
+        : mNewTree(background)
+        , mTreePt(&tree)
+        , mNodes(leafnodes)
+        , mNodeIndexMap(nodexIndexMap)
+    { }
+
+    PopulateTree(PopulateTree& rhs, tbb::split)
+        : mNewTree(rhs.mNewTree.background())
+        , mTreePt(&mNewTree)
+        , mNodes(rhs.mNodes)
+        , mNodeIndexMap(rhs.mNodeIndexMap)
+    { }
+
+    void operator()(const tbb::blocked_range<size_t>& range) {
+
+        tree::ValueAccessor<TreeType> acc(*mTreePt);
+
+        if (mNodeIndexMap) {
+            for (size_t n = range.begin(), N = range.end(); n < N; ++n) {
+                for (size_t i = mNodeIndexMap[n], I = mNodeIndexMap[n + 1]; i < I; ++i) {
+                    if (mNodes[i] != NULL) acc.addLeaf(mNodes[i]);
+                }
+            }
+        } else {
+            for (size_t n = range.begin(), N = range.end(); n < N; ++n) {
+                if (mNodes[n] != NULL) acc.addLeaf(mNodes[n]); // same NULL guard as above
+            }
+        }
+    }
+
+    void join(PopulateTree& rhs) { mTreePt->merge(*rhs.mTreePt); }
+
+private:
+    TreeType                      mNewTree;
+    TreeType              * const mTreePt;
+    LeafNodeType         ** const mNodes;
+    size_t          const * const mNodeIndexMap;
+}; // PopulateTree
+
+
+/// @brief Range functor that labels the active voxels of each input leaf in a char leaf:
+/// @c 0 where (value - isovalue) is negative and @c 1 otherwise.
+/// @details Slot n of the output array stays NULL when leaf n has no active voxel.
+/// The created leaves are allocated with @c new and are not freed by this functor.
+template<typename LeafNodeType>
+struct LabelBoundaryVoxels {
+
+    typedef typename LeafNodeType::ValueType                ValueType;
+    typedef tree::LeafNode<char, LeafNodeType::LOG2DIM>     CharLeafNodeType;
+
+    LabelBoundaryVoxels(
+        ValueType isovalue, const LeafNodeType ** nodes, CharLeafNodeType ** maskNodes)
+        : mNodes(nodes), mMaskNodes(maskNodes), mIsovalue(isovalue) { }
+
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+
+        typedef typename LeafNodeType::ValueOnCIter ActiveIterT;
+        // Scratch leaf, recycled from one iteration to the next until it gets an active voxel.
+        CharLeafNodeType * scratch = NULL;
+
+        for (size_t idx = range.begin(), last = range.end(); idx < last; ++idx) {
+
+            const LeafNodeType& src = *mNodes[idx];
+            mMaskNodes[idx] = NULL;
+
+            if (scratch) scratch->setOrigin(src.origin());
+            else scratch = new CharLeafNodeType(src.origin(), 1);
+
+            for (ActiveIterT it = src.cbeginValueOn(); it; ++it) {
+                const bool below = (*it - mIsovalue) < 0.0;
+                scratch->setValueOn(it.pos(), below ? 0 : 1);
+            }
+
+            if (scratch->onVoxelCount() == 0) continue; // leaf had no active voxel
+
+            mMaskNodes[idx] = scratch; // publish the populated leaf
+            scratch = NULL;
+        }
+
+        delete scratch; // no-op when the last scratch leaf was published
+    }
+
+    LeafNodeType        const * const * const mNodes;
+    CharLeafNodeType                 ** const mMaskNodes;
+    ValueType                           const mIsovalue;
+}; // LabelBoundaryVoxels
+
+
+/// @brief Range functor that overwrites every voxel of the given leaves with @c 1 if
+/// its value is negative and with @c -1 otherwise (zero therefore becomes @c -1).
+template<typename LeafNodeType>
+struct FlipRegionSign {
+    typedef typename LeafNodeType::ValueType ValueType;
+    FlipRegionSign(LeafNodeType ** nodes) : mNodes(nodes) { }
+
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+        for (size_t idx = range.begin(), last = range.end(); idx < last; ++idx) {
+            ValueType* first = const_cast<ValueType*>(&mNodes[idx]->getValue(0));
+            for (ValueType *v = first, *end = first + LeafNodeType::SIZE; v != end; ++v) {
+                *v = (*v < 0) ? 1 : -1;
+            }
+        }
+    }
+    LeafNodeType ** const mNodes;
+}; // FlipRegionSign
+
+
+/// @brief tbb::parallel_reduce body that finds the smallest value stored in the
+/// buffers of the given leaf nodes (all SIZE entries of each buffer are examined).
+/// @details The result is available in @c minValue once the reduction has finished.
+template<typename LeafNodeType>
+struct FindMinVoxelValue {
+
+    typedef typename LeafNodeType::ValueType    ValueType;
+
+    FindMinVoxelValue(LeafNodeType const * const * const leafnodes)
+        : minValue(std::numeric_limits<ValueType>::max()), mNodes(leafnodes) { }
+
+    // Splitting constructor: a fresh body starts again from numeric_limits::max().
+    FindMinVoxelValue(FindMinVoxelValue& rhs, tbb::split)
+        : minValue(std::numeric_limits<ValueType>::max()), mNodes(rhs.mNodes) { }
+
+    void operator()(const tbb::blocked_range<size_t>& range) {
+        for (size_t idx = range.begin(), last = range.end(); idx < last; ++idx) {
+            const ValueType* data = mNodes[idx]->buffer().data();
+            for (const ValueType* v = data; v != data + LeafNodeType::SIZE; ++v) {
+                if (*v < minValue) minValue = *v;
+            }
+        }
+    }
+
+    void join(FindMinVoxelValue& rhs) {
+        if (rhs.minValue < minValue) minValue = rhs.minValue;
+    }
+
+    ValueType minValue; // running minimum
+
+    LeafNodeType const * const * const mNodes;
+}; // FindMinVoxelValue
+
+
+/// @brief tbb::parallel_reduce body that finds the smallest value visited by the
+/// ValueAll iterators of the given internal nodes.
+/// @details The result is available in @c minValue once the reduction has finished.
+template<typename InternalNodeType>
+struct FindMinTileValue {
+
+    typedef typename InternalNodeType::ValueType    ValueType;
+
+    FindMinTileValue(InternalNodeType const * const * const nodes)
+        : minValue(std::numeric_limits<ValueType>::max()), mNodes(nodes) { }
+
+    // Splitting constructor: a fresh body starts again from numeric_limits::max().
+    FindMinTileValue(FindMinTileValue& rhs, tbb::split)
+        : minValue(std::numeric_limits<ValueType>::max()), mNodes(rhs.mNodes) { }
+
+    void operator()(const tbb::blocked_range<size_t>& range) {
+        typedef typename InternalNodeType::ValueAllCIter TileIterT;
+        for (size_t idx = range.begin(), last = range.end(); idx < last; ++idx) {
+            for (TileIterT tile = mNodes[idx]->beginValueAll(); tile; ++tile) {
+                const ValueType value = *tile;
+                if (value < minValue) minValue = value;
+            }
+        }
+    }
+
+    void join(FindMinTileValue& rhs) {
+        if (rhs.minValue < minValue) minValue = rhs.minValue;
+    }
+
+    ValueType minValue; // running minimum
+    InternalNodeType const * const * const mNodes;
+}; // FindMinTileValue
+
+
+/// @brief Range functor that rewrites the values of the given leaves in place.
+/// @details Values greater than zero become 0; all others are multiplied by
+/// 1/cutoffDistance. Only voxels whose result is greater than zero are left active.
+/// A leaf that ends up with no active voxel is deleted and its slot set to NULL.
+template<typename LeafNodeType>
+struct SDFVoxelsToFogVolume {
+
+    typedef typename LeafNodeType::ValueType ValueType;
+
+    SDFVoxelsToFogVolume(LeafNodeType ** nodes, ValueType cutoffDistance)
+        : mNodes(nodes), mWeight(ValueType(1.0) / cutoffDistance) { }
+
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+        const ValueType zero(0.0);
+        for (size_t idx = range.begin(), last = range.end(); idx < last; ++idx) {
+            LeafNodeType& leaf = *mNodes[idx];
+            leaf.setValuesOff();
+            ValueType* data = leaf.buffer().data();
+            for (Index i = 0; i < LeafNodeType::SIZE; ++i) {
+                if (data[i] > zero) { data[i] = zero; continue; }
+                data[i] = data[i] * mWeight;
+                if (data[i] > zero) leaf.setValueOn(i);
+            }
+
+            if (leaf.onVoxelCount() != 0) continue;
+            delete mNodes[idx]; // `leaf` dangles from here on and is not used again
+            mNodes[idx] = NULL;
+        }
+    }
+
+    LeafNodeType    ** const mNodes;
+    ValueType          const mWeight;
+}; // SDFVoxelsToFogVolume
+
+
+/// @brief Range functor that sets to 1 and turns on every entry visited by the given
+/// nodes' ValueAll iterators where the value looked up in @c tree is negative.
+template<typename TreeType, typename InternalNodeType>
+struct SDFTilesToFogVolume {
+
+    SDFTilesToFogVolume(const TreeType& tree, InternalNodeType ** nodes)
+        : mTree(&tree), mNodes(nodes) { }
+
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+        typedef typename TreeType::ValueType ValueType;
+        typedef typename InternalNodeType::ValueAllIter TileIterT;
+        tree::ValueAccessor<const TreeType> srcAcc(*mTree);
+
+        for (size_t idx = range.begin(), last = range.end(); idx < last; ++idx) {
+            for (TileIterT tile = mNodes[idx]->beginValueAll(); tile; ++tile) {
+                if (!(srcAcc.getValue(tile.getCoord()) < ValueType(0.0))) continue;
+                tile.setValue(ValueType(1.0));
+                tile.setValueOn(true);
+            }
+        }
+    }
+
+    TreeType            const * const mTree;
+    InternalNodeType         ** const mNodes;
+}; // SDFTilesToFogVolume
+
+
+template<typename TreeType>
+struct FillMaskBoundary {
+
+    typedef typename TreeType::ValueType                            ValueType;
+    typedef typename TreeType::LeafNodeType                         LeafNodeType;
+    typedef typename TreeType::template ValueConverter<bool>::Type  BoolTreeType;
+    typedef typename BoolTreeType::LeafNodeType                     BoolLeafNodeType;
+
+    FillMaskBoundary(const TreeType& tree, ValueType isovalue, const BoolTreeType& fillMask,
+        const BoolLeafNodeType ** fillNodes, BoolLeafNodeType ** newNodes)
+        : mTree(&tree), mFillMask(&fillMask), mFillNodes(fillNodes), mNewNodes(newNodes), mIsovalue(isovalue)
+    {
+    }
+
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+
+        tree::ValueAccessor<const BoolTreeType> maskAcc(*mFillMask);
+        tree::ValueAccessor<const TreeType> distAcc(*mTree);
+
+        boost::scoped_array<char> valueMask(new char[BoolLeafNodeType::SIZE]);
+
+        for (size_t n = range.begin(), N = range.end(); n < N; ++n) {
+
+            mNewNodes[n] = NULL;
+            const BoolLeafNodeType& node = *mFillNodes[n];
+            const Coord& origin = node.origin();
+
+            const bool denseNode = node.isDense();
+
+            // possible early out if the fill mask is dense
+            if (denseNode) {
+
+                int denseNeighbors = 0;
+
+                const BoolLeafNodeType* neighborNode = maskAcc.probeConstLeaf(origin.offsetBy(-1, 0, 0));
+                if (neighborNode && neighborNode->isDense()) ++denseNeighbors;
+
+                neighborNode = maskAcc.probeConstLeaf(origin.offsetBy(BoolLeafNodeType::DIM, 0, 0));
+                if (neighborNode && neighborNode->isDense()) ++denseNeighbors;
+
+                neighborNode = maskAcc.probeConstLeaf(origin.offsetBy(0, -1, 0));
+                if (neighborNode && neighborNode->isDense()) ++denseNeighbors;
+
+                neighborNode = maskAcc.probeConstLeaf(origin.offsetBy(0, BoolLeafNodeType::DIM, 0));
+                if (neighborNode && neighborNode->isDense()) ++denseNeighbors;
+
+                neighborNode = maskAcc.probeConstLeaf(origin.offsetBy(0, 0, -1));
+                if (neighborNode && neighborNode->isDense()) ++denseNeighbors;
+
+                neighborNode = maskAcc.probeConstLeaf(origin.offsetBy(0, 0, BoolLeafNodeType::DIM));
+                if (neighborNode && neighborNode->isDense()) ++denseNeighbors;
+
+                if (denseNeighbors == 6) continue;
+            }
+
+            // rest value mask
+            memset(valueMask.get(), 0, sizeof(char) * BoolLeafNodeType::SIZE);
+
+            const typename TreeType::LeafNodeType* distNode = distAcc.probeConstLeaf(origin);
+
+            // check internal voxel neighbors
+
+            bool earlyTermination = false;
+
+            if (!denseNode) {
+                if (distNode) {
+                    evalInternalNeighborsP(valueMask.get(), node, *distNode);
+                    evalInternalNeighborsN(valueMask.get(), node, *distNode);
+                } else if (distAcc.getValue(origin) > mIsovalue) {
+                    earlyTermination = evalInternalNeighborsP(valueMask.get(), node);
+                    if (!earlyTermination) earlyTermination = evalInternalNeighborsN(valueMask.get(), node);
+                }
+            }
+
+            // check external voxel neighbors
+
+            if (!earlyTermination) {
+                evalExternalNeighborsX<true>(valueMask.get(), node, maskAcc, distAcc);
+                evalExternalNeighborsX<false>(valueMask.get(), node, maskAcc, distAcc);
+                evalExternalNeighborsY<true>(valueMask.get(), node, maskAcc, distAcc);
+                evalExternalNeighborsY<false>(valueMask.get(), node, maskAcc, distAcc);
+                evalExternalNeighborsZ<true>(valueMask.get(), node, maskAcc, distAcc);
+                evalExternalNeighborsZ<false>(valueMask.get(), node, maskAcc, distAcc);
+            }
+
+            // Export marked boundary voxels.
+
+            int numBoundaryValues = 0;
+            for (Index i = 0, I = BoolLeafNodeType::SIZE; i < I; ++i) {
+                numBoundaryValues += valueMask[i] == 1;
+            }
+
+            if (numBoundaryValues > 0) {
+                mNewNodes[n] = new BoolLeafNodeType(origin, false);
+                for (Index i = 0, I = BoolLeafNodeType::SIZE; i < I; ++i) {
+                    if (valueMask[i] == 1) mNewNodes[n]->setValueOn(i);
+                }
+            }
+        }
+    }
+
+private:
+
+    // Check internal voxel neighbors in positive {x, y, z} directions.
+
+    /// Flags active voxels of @a node that border, in the +z, +y or +x direction and
+    /// within this leaf, a voxel that is inactive in @a node and whose value in
+    /// @a distNode exceeds @c mIsovalue. Entries of @a valueMask that are already
+    /// nonzero are left as they are; newly flagged entries are set to 1.
+    void evalInternalNeighborsP(char* valueMask, const BoolLeafNodeType& node, const LeafNodeType& distNode) const {
+
+        const Index dim = BoolLeafNodeType::DIM;
+        const Index log2dim = BoolLeafNodeType::LOG2DIM;
+        const Index strideY = dim;        // linear offset of the +y neighbor
+        const Index strideX = dim * dim;  // linear offset of the +x neighbor
+
+        // +z pass: the last z slice has no in-leaf +z neighbor and is skipped.
+        for (Index x = 0; x < dim; ++x) {
+            for (Index y = 0; y < dim; ++y) {
+                const Index row = (x << (2 * log2dim)) + (y << log2dim);
+                for (Index z = 0; z + 1 < dim; ++z) {
+                    const Index pos = row + z;
+                    const Index npos = pos + 1;
+
+                    if (valueMask[pos] == 0 && node.isValueOn(pos) &&
+                        !node.isValueOn(npos) && distNode.getValue(npos) > mIsovalue) {
+                        valueMask[pos] = 1;
+                    }
+                }
+            }
+        }
+
+        // +y pass: the last y slice is skipped.
+        for (Index x = 0; x < dim; ++x) {
+            for (Index y = 0; y + 1 < dim; ++y) {
+                const Index row = (x << (2 * log2dim)) + (y << log2dim);
+                for (Index z = 0; z < dim; ++z) {
+                    const Index pos = row + z;
+                    const Index npos = pos + strideY;
+
+                    if (valueMask[pos] == 0 && node.isValueOn(pos) &&
+                        !node.isValueOn(npos) && distNode.getValue(npos) > mIsovalue) {
+                        valueMask[pos] = 1;
+                    }
+                }
+            }
+        }
+
+        // +x pass: the last x slice is skipped.
+        for (Index x = 0; x + 1 < dim; ++x) {
+            for (Index y = 0; y < dim; ++y) {
+                const Index row = (x << (2 * log2dim)) + (y << log2dim);
+                for (Index z = 0; z < dim; ++z) {
+                    const Index pos = row + z;
+                    const Index npos = pos + strideX;
+
+                    if (valueMask[pos] == 0 && node.isValueOn(pos) &&
+                        !node.isValueOn(npos) && distNode.getValue(npos) > mIsovalue) {
+                        valueMask[pos] = 1;
+                    }
+                }
+            }
+        }
+    }
+
+    /// Mask-only variant: searches for the first active voxel of @a node whose +z,
+    /// +y or +x neighbor inside this leaf is inactive (the +z pass runs first, then
+    /// +y, then +x). The first hit is flagged in @a valueMask and the search stops.
+    /// @return @c true if a voxel was flagged, @c false if no such voxel exists.
+    bool evalInternalNeighborsP(char* valueMask, const BoolLeafNodeType& node) const {
+
+        const Index dim = BoolLeafNodeType::DIM;
+        const Index log2dim = BoolLeafNodeType::LOG2DIM;
+        const Index strideY = dim;        // linear offset of the +y neighbor
+        const Index strideX = dim * dim;  // linear offset of the +x neighbor
+
+        // +z pass
+        for (Index x = 0; x < dim; ++x) {
+            for (Index y = 0; y < dim; ++y) {
+                const Index row = (x << (2 * log2dim)) + (y << log2dim);
+                for (Index z = 0; z + 1 < dim; ++z) {
+                    const Index pos = row + z;
+                    if (!node.isValueOn(pos) || node.isValueOn(pos + 1)) continue;
+                    valueMask[pos] = 1;
+                    return true;
+                }
+            }
+        }
+
+        // +y pass
+        for (Index x = 0; x < dim; ++x) {
+            for (Index y = 0; y + 1 < dim; ++y) {
+                const Index row = (x << (2 * log2dim)) + (y << log2dim);
+                for (Index z = 0; z < dim; ++z) {
+                    const Index pos = row + z;
+                    if (!node.isValueOn(pos) || node.isValueOn(pos + strideY)) continue;
+                    valueMask[pos] = 1;
+                    return true;
+                }
+            }
+        }
+
+        // +x pass
+        for (Index x = 0; x + 1 < dim; ++x) {
+            for (Index y = 0; y < dim; ++y) {
+                const Index row = (x << (2 * log2dim)) + (y << log2dim);
+                for (Index z = 0; z < dim; ++z) {
+                    const Index pos = row + z;
+                    if (!node.isValueOn(pos) || node.isValueOn(pos + strideX)) continue;
+                    valueMask[pos] = 1;
+                    return true;
+                }
+            }
+        }
+
+        return false;
+    }
+
+    // Check internal voxel neighbors in negative {x, y, z} directions.
+
+    /// Flags active voxels of @a node that border, in the -z, -y or -x direction and
+    /// within this leaf, a voxel that is inactive in @a node and whose value in
+    /// @a distNode exceeds @c mIsovalue. Entries of @a valueMask that are already
+    /// nonzero are left as they are; newly flagged entries are set to 1.
+    void evalInternalNeighborsN(char* valueMask, const BoolLeafNodeType& node, const LeafNodeType& distNode) const {
+
+        const Index dim = BoolLeafNodeType::DIM;
+        const Index log2dim = BoolLeafNodeType::LOG2DIM;
+        const Index strideY = dim;        // linear distance to the -y neighbor
+        const Index strideX = dim * dim;  // linear distance to the -x neighbor
+
+        // -z pass: the first z slice has no in-leaf -z neighbor and is skipped.
+        for (Index x = 0; x < dim; ++x) {
+            for (Index y = 0; y < dim; ++y) {
+                const Index row = (x << (2 * log2dim)) + (y << log2dim);
+                for (Index z = 1; z < dim; ++z) {
+                    const Index pos = row + z;
+                    const Index npos = pos - 1;
+
+                    if (valueMask[pos] == 0 && node.isValueOn(pos) &&
+                        !node.isValueOn(npos) && distNode.getValue(npos) > mIsovalue) {
+                        valueMask[pos] = 1;
+                    }
+                }
+            }
+        }
+
+        // -y pass: the first y slice is skipped.
+        for (Index x = 0; x < dim; ++x) {
+            for (Index y = 1; y < dim; ++y) {
+                const Index row = (x << (2 * log2dim)) + (y << log2dim);
+                for (Index z = 0; z < dim; ++z) {
+                    const Index pos = row + z;
+                    const Index npos = pos - strideY;
+
+                    if (valueMask[pos] == 0 && node.isValueOn(pos) &&
+                        !node.isValueOn(npos) && distNode.getValue(npos) > mIsovalue) {
+                        valueMask[pos] = 1;
+                    }
+                }
+            }
+        }
+
+        // -x pass: the first x slice is skipped.
+        for (Index x = 1; x < dim; ++x) {
+            for (Index y = 0; y < dim; ++y) {
+                const Index row = (x << (2 * log2dim)) + (y << log2dim);
+                for (Index z = 0; z < dim; ++z) {
+                    const Index pos = row + z;
+                    const Index npos = pos - strideX;
+
+                    if (valueMask[pos] == 0 && node.isValueOn(pos) &&
+                        !node.isValueOn(npos) && distNode.getValue(npos) > mIsovalue) {
+                        valueMask[pos] = 1;
+                    }
+                }
+            }
+        }
+    }
+
+
+    /// Mask-only variant: searches for the first active voxel of @a node whose -z,
+    /// -y or -x neighbor inside this leaf is inactive (the -z pass runs first, then
+    /// -y, then -x). The first hit is flagged in @a valueMask and the search stops.
+    /// @return @c true if a voxel was flagged, @c false if no such voxel exists.
+    bool evalInternalNeighborsN(char* valueMask, const BoolLeafNodeType& node) const {
+
+        const Index dim = BoolLeafNodeType::DIM;
+        const Index log2dim = BoolLeafNodeType::LOG2DIM;
+        const Index strideY = dim;        // linear distance to the -y neighbor
+        const Index strideX = dim * dim;  // linear distance to the -x neighbor
+
+        // -z pass
+        for (Index x = 0; x < dim; ++x) {
+            for (Index y = 0; y < dim; ++y) {
+                const Index row = (x << (2 * log2dim)) + (y << log2dim);
+                for (Index z = 1; z < dim; ++z) {
+                    const Index pos = row + z;
+                    if (!node.isValueOn(pos) || node.isValueOn(pos - 1)) continue;
+                    valueMask[pos] = 1;
+                    return true;
+                }
+            }
+        }
+
+        // -y pass
+        for (Index x = 0; x < dim; ++x) {
+            for (Index y = 1; y < dim; ++y) {
+                const Index row = (x << (2 * log2dim)) + (y << log2dim);
+                for (Index z = 0; z < dim; ++z) {
+                    const Index pos = row + z;
+                    if (!node.isValueOn(pos) || node.isValueOn(pos - strideY)) continue;
+                    valueMask[pos] = 1;
+                    return true;
+                }
+            }
+        }
+
+        // -x pass
+        for (Index x = 1; x < dim; ++x) {
+            for (Index y = 0; y < dim; ++y) {
+                const Index row = (x << (2 * log2dim)) + (y << log2dim);
+                for (Index z = 0; z < dim; ++z) {
+                    const Index pos = row + z;
+                    if (!node.isValueOn(pos) || node.isValueOn(pos - strideX)) continue;
+                    valueMask[pos] = 1;
+                    return true;
+                }
+            }
+        }
+
+        return false;
+    }
+
+
+    // Check external voxel neighbors
+
+    // Examines one x-facing side of the leaf: the X+ face when UpWind is true,
+    // otherwise the X- face. An active, not yet flagged voxel on that face is
+    // flagged when the voxel one step beyond the face is inactive in the fill
+    // mask and its distance value exceeds mIsovalue.
+    template<bool UpWind>
+    void evalExternalNeighborsX(char* valueMask, const BoolLeafNodeType& node,
+        const tree::ValueAccessor<const BoolTreeType>& maskAcc,
+        const tree::ValueAccessor<const TreeType>& distAcc) const {
+
+        const int dim = int(BoolLeafNodeType::DIM);
+        const int log2dim = int(BoolLeafNodeType::LOG2DIM);
+
+        const int faceX = UpWind ? dim - 1 : 0; // local x of the face being scanned
+        const int step = UpWind ? 1 : -1;       // direction out of the leaf
+
+        const Coord& origin = node.origin();
+        const Index xPos = faceX << (2 * log2dim);
+
+        for (int y = 0; y < dim; ++y) {
+            const Index yPos = xPos + (y << log2dim);
+
+            for (int z = 0; z < dim; ++z) {
+                const Index pos = yPos + z;
+
+                if (valueMask[pos] != 0 || !node.isValueOn(pos)) continue;
+
+                // global coordinate of the neighbor just outside this leaf
+                const Coord nijk = origin.offsetBy(faceX + step, y, z);
+
+                if (!maskAcc.isValueOn(nijk) && distAcc.getValue(nijk) > mIsovalue) {
+                    valueMask[pos] = 1;
+                }
+            }
+        }
+    }
+
+    // Examines one y-facing side of the leaf: the Y+ face when UpWind is true,
+    // otherwise the Y- face. An active, not yet flagged voxel on that face is
+    // flagged when the voxel one step beyond the face is inactive in the fill
+    // mask and its distance value exceeds mIsovalue.
+    template<bool UpWind>
+    void evalExternalNeighborsY(char* valueMask, const BoolLeafNodeType& node,
+        const tree::ValueAccessor<const BoolTreeType>& maskAcc,
+        const tree::ValueAccessor<const TreeType>& distAcc) const {
+
+        const int dim = int(BoolLeafNodeType::DIM);
+        const int log2dim = int(BoolLeafNodeType::LOG2DIM);
+
+        const int faceY = UpWind ? dim - 1 : 0; // local y of the face being scanned
+        const int step = UpWind ? 1 : -1;       // direction out of the leaf
+
+        const Coord& origin = node.origin();
+        const Index yPos = faceY << log2dim;
+
+        for (int x = 0; x < dim; ++x) {
+            const Index xPos = yPos + (x << (2 * log2dim));
+
+            for (int z = 0; z < dim; ++z) {
+                const Index pos = xPos + z;
+
+                if (valueMask[pos] != 0 || !node.isValueOn(pos)) continue;
+
+                // global coordinate of the neighbor just outside this leaf
+                const Coord nijk = origin.offsetBy(x, faceY + step, z);
+
+                if (!maskAcc.isValueOn(nijk) && distAcc.getValue(nijk) > mIsovalue) {
+                    valueMask[pos] = 1;
+                }
+            }
+        }
+    }
+
+    // Examines one z-facing side of the leaf: the Z+ face when UpWind is true,
+    // otherwise the Z- face. An active, not yet flagged voxel on that face is
+    // flagged when the voxel one step beyond the face is inactive in the fill
+    // mask and its distance value exceeds mIsovalue.
+    template<bool UpWind>
+    void evalExternalNeighborsZ(char* valueMask, const BoolLeafNodeType& node,
+        const tree::ValueAccessor<const BoolTreeType>& maskAcc,
+        const tree::ValueAccessor<const TreeType>& distAcc) const {
+
+        const int dim = int(BoolLeafNodeType::DIM);
+        const int log2dim = int(BoolLeafNodeType::LOG2DIM);
+
+        const int faceZ = UpWind ? dim - 1 : 0; // local z of the face being scanned
+        const int step = UpWind ? 1 : -1;       // direction out of the leaf
+
+        const Coord& origin = node.origin();
+
+        for (int x = 0; x < dim; ++x) {
+            const Index xPos = x << (2 * log2dim);
+
+            for (int y = 0; y < dim; ++y) {
+                const Index pos = faceZ + xPos + (y << log2dim);
+
+                if (valueMask[pos] != 0 || !node.isValueOn(pos)) continue;
+
+                // global coordinate of the neighbor just outside this leaf
+                const Coord nijk = origin.offsetBy(x, y, faceZ + step);
+
+                if (!maskAcc.isValueOn(nijk) && distAcc.getValue(nijk) > mIsovalue) {
+                    valueMask[pos] = 1;
+                }
+            }
+        }
+    }
+
+    //////////
+
+    TreeType                    const * const mTree;
+    BoolTreeType                const * const mFillMask;
+    BoolLeafNodeType    const * const * const mFillNodes;
+    BoolLeafNodeType                 ** const mNewNodes;
+    ValueType                           const mIsovalue;
+}; // FillMaskBoundary
+
+
+/// @brief Constructs a memory light char tree that represents the exterior region with @c +1
+///        and the interior regions with @c -1.
+///
+/// @param tree      tree whose leaf nodes are labeled by @c LabelBoundaryVoxels.
+/// @param isovalue  threshold forwarded to @c LabelBoundaryVoxels and @c FillMaskBoundary.
+/// @param fillMask  optional bool tree; when non-null, its leaf nodes are processed by
+///                  @c FillMaskBoundary and every resulting boundary voxel whose mask
+///                  value is nonzero is overwritten with @c -1 before the exterior trace.
+///
+/// @return the newly allocated char mask tree (background value 1).
+///
+/// @note Trees without leaf nodes (and fill masks without leaf nodes) are handled
+///       without indexing into an empty vector; the worker functors then receive a
+///       null node array together with an empty range.
+template <class TreeType>
+inline typename TreeType::template ValueConverter<char>::Type::Ptr
+computeEnclosedRegionMask(const TreeType& tree, typename TreeType::ValueType isovalue,
+    const typename TreeType::template ValueConverter<bool>::Type* fillMask)
+{
+    typedef typename TreeType::LeafNodeType                                         LeafNodeType;
+    typedef typename TreeType::RootNodeType                                         RootNodeType;
+    typedef typename RootNodeType::NodeChainType                                    NodeChainType;
+    typedef typename boost::mpl::at<NodeChainType, boost::mpl::int_<1> >::type      InternalNodeType;
+
+    typedef typename TreeType::template ValueConverter<char>::Type                  CharTreeType;
+    typedef typename CharTreeType::LeafNodeType                                     CharLeafNodeType;
+
+    typedef typename TreeType::template ValueConverter<bool>::Type                  BoolTreeType;
+    typedef typename BoolTreeType::LeafNodeType                                     BoolLeafNodeType;
+
+    /////
+
+    size_t numLeafNodes = 0, numInternalNodes = 0;
+
+    std::vector<const LeafNodeType*> nodes;
+    std::vector<size_t> leafnodeCount;
+
+    {
+        // compute the prefix sum of the leafnode count in each internal node.
+        std::vector<const InternalNodeType*> internalNodes;
+        tree.getNodes(internalNodes);
+
+        numInternalNodes = internalNodes.size();
+
+        leafnodeCount.push_back(0);
+        for (size_t n = 0; n < numInternalNodes; ++n) {
+            leafnodeCount.push_back(leafnodeCount.back() + internalNodes[n]->leafCount());
+        }
+
+        numLeafNodes = leafnodeCount.back();
+
+        // extract all leafnodes
+        nodes.reserve(numLeafNodes);
+
+        for (size_t n = 0; n < numInternalNodes; ++n) {
+            internalNodes[n]->getNodes(nodes);
+        }
+    }
+
+    // Taking &nodes[0] on an empty vector is undefined behaviour, so pass a null
+    // array in that case; the matching range is empty and never dereferences it.
+    const LeafNodeType** nodeArray = NULL;
+    if (!nodes.empty()) nodeArray = &nodes.front();
+
+    // create mask leafnodes
+    boost::scoped_array<CharLeafNodeType*> maskNodes(new CharLeafNodeType*[numLeafNodes]);
+
+    tbb::parallel_for(tbb::blocked_range<size_t>(0, numLeafNodes),
+        LabelBoundaryVoxels<LeafNodeType>(isovalue, nodeArray, maskNodes.get()));
+
+    // create mask grid
+    typename CharTreeType::Ptr maskTree(new CharTreeType(1));
+
+    // leafnodeCount always holds at least the leading 0, so &leafnodeCount[0] is valid.
+    PopulateTree<CharTreeType> populate(*maskTree, maskNodes.get(), &leafnodeCount[0], 1);
+    tbb::parallel_reduce(tbb::blocked_range<size_t>(0, numInternalNodes), populate);
+
+    // optionally evaluate the fill mask
+
+    std::vector<CharLeafNodeType*> extraMaskNodes;
+
+    if (fillMask) {
+
+        std::vector<const BoolLeafNodeType*> fillMaskNodes;
+        fillMask->getNodes(fillMaskNodes);
+
+        // Same empty-vector guard as for the distance leaf nodes above.
+        const BoolLeafNodeType** fillNodeArray = NULL;
+        if (!fillMaskNodes.empty()) fillNodeArray = &fillMaskNodes.front();
+
+        boost::scoped_array<BoolLeafNodeType*> boundaryMaskNodes(new BoolLeafNodeType*[fillMaskNodes.size()]);
+
+        tbb::parallel_for(tbb::blocked_range<size_t>(0, fillMaskNodes.size()),
+            FillMaskBoundary<TreeType>(tree, isovalue, *fillMask, fillNodeArray, boundaryMaskNodes.get()));
+
+        tree::ValueAccessor<CharTreeType> maskAcc(*maskTree);
+
+        for (size_t n = 0, N = fillMaskNodes.size(); n < N; ++n) {
+
+            if (boundaryMaskNodes[n] == NULL) continue;
+
+            const BoolLeafNodeType& boundaryNode = *boundaryMaskNodes[n];
+            const Coord& origin = boundaryNode.origin();
+
+            CharLeafNodeType* maskNodePt = maskAcc.probeLeaf(origin);
+
+            if (!maskNodePt) {
+                // Leaf did not exist in the mask yet; remember it so its sign is
+                // flipped together with the regular mask nodes further below.
+                maskNodePt = maskAcc.touchLeaf(origin);
+                extraMaskNodes.push_back(maskNodePt);
+            }
+
+            char* data = maskNodePt->buffer().data();
+
+            typename BoolLeafNodeType::ValueOnCIter it = boundaryNode.cbeginValueOn();
+            for (; it; ++it) {
+                if (data[it.pos()] != 0) data[it.pos()] = -1;
+            }
+
+            // The temporary boundary node is owned by this function.
+            delete boundaryMaskNodes[n];
+        }
+    }
+
+    // eliminate enclosed regions
+    tools::traceExteriorBoundaries(*maskTree);
+
+    // flip voxel sign to negative inside and positive outside.
+    tbb::parallel_for(tbb::blocked_range<size_t>(0, numLeafNodes),
+        FlipRegionSign<CharLeafNodeType>(maskNodes.get()));
+
+    if (!extraMaskNodes.empty()) {
+        tbb::parallel_for(tbb::blocked_range<size_t>(0, extraMaskNodes.size()),
+            FlipRegionSign<CharLeafNodeType>(&extraMaskNodes[0]));
+    }
+
+    // propagate sign information into tile region
+    tools::signedFloodFill(*maskTree);
+
+    return maskTree;
+} // computeEnclosedRegionMask()
+
+
+/// @brief Builds a bool mask tree from @a tree and the threshold @a iso.
+///
+/// Leaf-level masks are produced by @c MaskInteriorVoxels, assembled into a tree by
+/// @c PopulateTree, and tile values are evaluated by @c MaskInteriorTiles. Finally,
+/// every value visited by the depth-limited iterator over the mask tree whose
+/// coordinate samples below @a iso in @a tree is set to @c true and made active.
+///
+/// @param tree  input tree to classify.
+/// @param iso   threshold; values strictly below it are treated as interior.
+/// @return the newly allocated bool mask tree (background value @c false).
+///
+/// @note Inputs without leaf or internal nodes are handled without indexing into an
+///       empty vector; the worker functors then receive a null node array together
+///       with an empty range.
+template <class TreeType>
+inline typename TreeType::template ValueConverter<bool>::Type::Ptr
+computeInteriorMask(const TreeType& tree, typename TreeType::ValueType iso)
+{
+    typedef typename TreeType::LeafNodeType                                         LeafNodeType;
+    typedef typename TreeType::RootNodeType                                         RootNodeType;
+    typedef typename RootNodeType::NodeChainType                                    NodeChainType;
+    typedef typename boost::mpl::at<NodeChainType, boost::mpl::int_<1> >::type      InternalNodeType;
+
+    typedef typename TreeType::template ValueConverter<bool>::Type                  BoolTreeType;
+    typedef typename BoolTreeType::LeafNodeType                                     BoolLeafNodeType;
+    typedef typename BoolTreeType::RootNodeType                                     BoolRootNodeType;
+    typedef typename BoolRootNodeType::NodeChainType                                BoolNodeChainType;
+    typedef typename boost::mpl::at<BoolNodeChainType, boost::mpl::int_<1> >::type  BoolInternalNodeType;
+
+    /////
+    size_t numLeafNodes = 0, numInternalNodes = 0;
+
+    std::vector<const LeafNodeType*> nodes;
+    std::vector<size_t> leafnodeCount;
+
+    {
+        // compute the prefix sum of the leafnode count in each internal node.
+        std::vector<const InternalNodeType*> internalNodes;
+        tree.getNodes(internalNodes);
+
+        numInternalNodes = internalNodes.size();
+
+        leafnodeCount.push_back(0);
+        for (size_t n = 0; n < numInternalNodes; ++n) {
+            leafnodeCount.push_back(leafnodeCount.back() + internalNodes[n]->leafCount());
+        }
+
+        numLeafNodes = leafnodeCount.back();
+
+        // extract all leafnodes
+        nodes.reserve(numLeafNodes);
+
+        for (size_t n = 0; n < numInternalNodes; ++n) {
+            internalNodes[n]->getNodes(nodes);
+        }
+    }
+
+    // Taking &nodes[0] on an empty vector is undefined behaviour, so pass a null
+    // array in that case; the matching range is empty and never dereferences it.
+    const LeafNodeType** nodeArray = NULL;
+    if (!nodes.empty()) nodeArray = &nodes.front();
+
+    // create mask leafnodes
+    boost::scoped_array<BoolLeafNodeType*> maskNodes(new BoolLeafNodeType*[numLeafNodes]);
+
+    tbb::parallel_for(tbb::blocked_range<size_t>(0, numLeafNodes),
+        MaskInteriorVoxels<LeafNodeType>(iso, nodeArray, maskNodes.get()));
+
+
+    // create mask grid
+    typename BoolTreeType::Ptr maskTree(new BoolTreeType(false));
+
+    // leafnodeCount always holds at least the leading 0, so &leafnodeCount[0] is valid.
+    PopulateTree<BoolTreeType> populate(*maskTree, maskNodes.get(), &leafnodeCount[0], false);
+    tbb::parallel_reduce(tbb::blocked_range<size_t>(0, numInternalNodes), populate);
+
+
+    // evaluate tile values
+    std::vector<BoolInternalNodeType*> internalMaskNodes;
+    maskTree->getNodes(internalMaskNodes);
+
+    // Same empty-vector guard for the internal mask nodes.
+    BoolInternalNodeType** internalMaskArray = NULL;
+    if (!internalMaskNodes.empty()) internalMaskArray = &internalMaskNodes.front();
+
+    tbb::parallel_for(tbb::blocked_range<size_t>(0, internalMaskNodes.size()),
+        MaskInteriorTiles<TreeType, BoolInternalNodeType>(iso, tree, internalMaskArray));
+
+    tree::ValueAccessor<const TreeType> acc(tree);
+
+    // Visit the values above the levels handled so far (iterator depth is capped
+    // two levels above the leaf depth) and mark those that sample below iso.
+    typename BoolTreeType::ValueAllIter it(*maskTree);
+    it.setMaxDepth(BoolTreeType::ValueAllIter::LEAF_DEPTH - 2);
+
+    for ( ; it; ++it) {
+        if (acc.getValue(it.getCoord()) < iso) {
+            it.setValue(true);
+            it.setActiveState(true);
+        }
+    }
+
+    return maskTree;
+} // computeInteriorMask()
+
+
+/// TBB reduction body that marks, in a bool tree, every active input voxel for which
+/// at least one of its six face neighbors lies on the other side of the isovalue
+/// (comparison is "value < iso"). Split copies accumulate into their own local tree,
+/// which is merged into the parent's target tree in join().
+template<typename InputTreeType>
+struct MaskIsovalueCrossingVoxels
+{
+    typedef typename InputTreeType::ValueType                               InputValueType;
+    typedef typename InputTreeType::LeafNodeType                            InputLeafNodeType;
+    typedef typename InputTreeType::template ValueConverter<bool>::Type     BoolTreeType;
+    typedef typename BoolTreeType::LeafNodeType                             BoolLeafNodeType;
+
+    /// Root body: writes directly into @a maskTree; the local tree stays unused.
+    MaskIsovalueCrossingVoxels(
+        const InputTreeType& inputTree,
+        const std::vector<const InputLeafNodeType*>& inputLeafNodes,
+        BoolTreeType& maskTree,
+        InputValueType iso)
+        : mInputAccessor(inputTree)
+        , mInputNodes(inputLeafNodes.empty() ? NULL : &inputLeafNodes.front())
+        , mMaskTree(false)
+        , mMaskAccessor(maskTree)
+        , mIsovalue(iso)
+    {
+    }
+
+    /// Splitting constructor: the copy writes into its own local mask tree.
+    MaskIsovalueCrossingVoxels(MaskIsovalueCrossingVoxels& rhs, tbb::split)
+        : mInputAccessor(rhs.mInputAccessor.tree())
+        , mInputNodes(rhs.mInputNodes)
+        , mMaskTree(false)
+        , mMaskAccessor(mMaskTree)
+        , mIsovalue(rhs.mIsovalue)
+    {
+    }
+
+    /// Processes the input leaf nodes with indices in @a range.
+    void operator()(const tbb::blocked_range<size_t>& range) {
+
+        // Nothing to do when constructed from an empty node list.
+        if (!mInputNodes) return;
+
+        const InputValueType iso = mIsovalue;
+
+        // Scratch leaf that is recycled until it actually receives data.
+        BoolLeafNodeType* scratch = NULL;
+
+        for (size_t n = range.begin(), N = range.end(); n != N; ++n) {
+
+            const InputLeafNodeType& node = *mInputNodes[n];
+
+            if (scratch) {
+                scratch->setOrigin(node.origin());
+            } else {
+                scratch = new BoolLeafNodeType(node.origin(), false);
+            }
+
+            bool collectedData = false;
+
+            for (typename InputLeafNodeType::ValueOnCIter it = node.cbeginValueOn(); it; ++it) {
+
+                const bool isUnder = *it < iso;
+                const Coord ijk = it.getCoord();
+
+                // Neighbors are tested in the order +z, -z, +y, -y, +x, -x and the
+                // chain stops at the first one found on the other side of iso.
+                const bool signChange =
+                    isUnder != (mInputAccessor.getValue(ijk.offsetBy( 0,  0,  1)) < iso) ||
+                    isUnder != (mInputAccessor.getValue(ijk.offsetBy( 0,  0, -1)) < iso) ||
+                    isUnder != (mInputAccessor.getValue(ijk.offsetBy( 0,  1,  0)) < iso) ||
+                    isUnder != (mInputAccessor.getValue(ijk.offsetBy( 0, -1,  0)) < iso) ||
+                    isUnder != (mInputAccessor.getValue(ijk.offsetBy( 1,  0,  0)) < iso) ||
+                    isUnder != (mInputAccessor.getValue(ijk.offsetBy(-1,  0,  0)) < iso);
+
+                if (signChange) {
+                    collectedData = true;
+                    scratch->setValueOn(it.pos(), true);
+                }
+            }
+
+            if (collectedData) {
+                // Hand the filled leaf over to the tree and start a fresh one next time.
+                mMaskAccessor.addLeaf(scratch);
+                scratch = NULL;
+            }
+        }
+
+        // Discard a scratch leaf that never received data (no-op when NULL).
+        delete scratch;
+    }
+
+    /// Merges the mask collected by @a rhs into this body's target tree.
+    void join(MaskIsovalueCrossingVoxels& rhs) {
+        mMaskAccessor.tree().merge(rhs.mMaskAccessor.tree());
+    }
+
+private:
+    tree::ValueAccessor<const InputTreeType>    mInputAccessor;
+    InputLeafNodeType const * const * const     mInputNodes;
+
+    // Declared before mMaskAccessor: the splitting constructor binds the accessor to it.
+    BoolTreeType                                mMaskTree;
+    tree::ValueAccessor<BoolTreeType>           mMaskAccessor;
+
+    InputValueType                              mIsovalue;
+}; // MaskIsovalueCrossingVoxels
+
+
+////////////////////////////////////////
+
+
+/// One group of voxels within a node, as produced by nodeMaskSegmentation(),
+/// together with raw links to other segments.
+template<typename NodeType>
+struct NodeMaskSegment
+{
+    typedef boost::shared_ptr<NodeMaskSegment>  Ptr;
+    typedef typename NodeType::NodeMaskType     NodeMaskType;
+
+    /// Creates an empty, unvisited segment with an all-off mask at the origin.
+    NodeMaskSegment()
+        : connections()
+        , mask(false)
+        , origin(0, 0, 0)
+        , visited(false)
+    {
+    }
+
+    std::vector<NodeMaskSegment*>   connections; // non-owning links to other segments
+    NodeMaskType                    mask;        // voxels belonging to this segment
+    Coord                           origin;      // origin of the node the segment came from
+    bool                            visited;     // flag, initially false
+}; // struct NodeMaskSegment
+
+
+/// Partitions the active voxels of @a node into face-connected (6-neighborhood)
+/// groups and appends one NodeMaskSegment per group to @a segments. Connectivity is
+/// only followed inside the node; each segment stores the node origin and the mask
+/// of its voxels.
+template<typename NodeType>
+inline void
+nodeMaskSegmentation(const NodeType& node,
+    std::vector<typename NodeMaskSegment<NodeType>::Ptr>& segments)
+{
+    typedef typename NodeType::NodeMaskType     NodeMaskType;
+    typedef NodeMaskSegment<NodeType>           NodeMaskSegmentType;
+    typedef typename NodeMaskSegmentType::Ptr   NodeMaskSegmentTypePtr;
+
+    // Active voxels that have not been queued for any segment yet.
+    NodeMaskType unvisited(node.getValueMask());
+
+    // Used as a LIFO stack of linear voxel offsets awaiting expansion.
+    std::deque<Index> pending;
+
+    while (!unvisited.isOff()) {
+
+        NodeMaskSegmentTypePtr segment(new NodeMaskSegmentType());
+        segment->origin = node.origin();
+
+        NodeMaskType& mask = segment->mask;
+
+        // Seed the flood fill with the first remaining active voxel.
+        pending.push_back(unvisited.findFirstOn());
+        unvisited.setOff(pending.back()); // mark as visited
+
+        Coord ijk(0, 0, 0);
+
+        while (!pending.empty()) {
+
+            const Index pos = pending.back();
+            pending.pop_back();
+
+            if (mask.isOn(pos)) continue;
+            mask.setOn(pos);
+
+            ijk = NodeType::offsetToLocalCoord(pos);
+
+            // Each neighbor is considered only if it lies inside the node
+            // (boundary test first) and is still unvisited.
+
+            if (ijk[2] != 0) { // -z
+                const Index npos = pos - 1;
+                if (unvisited.isOn(npos)) {
+                    unvisited.setOff(npos);
+                    pending.push_back(npos);
+                }
+            }
+
+            if (ijk[2] != (NodeType::DIM - 1)) { // +z
+                const Index npos = pos + 1;
+                if (unvisited.isOn(npos)) {
+                    unvisited.setOff(npos);
+                    pending.push_back(npos);
+                }
+            }
+
+            if (ijk[1] != 0) { // -y
+                const Index npos = pos - NodeType::DIM;
+                if (unvisited.isOn(npos)) {
+                    unvisited.setOff(npos);
+                    pending.push_back(npos);
+                }
+            }
+
+            if (ijk[1] != (NodeType::DIM - 1)) { // +y
+                const Index npos = pos + NodeType::DIM;
+                if (unvisited.isOn(npos)) {
+                    unvisited.setOff(npos);
+                    pending.push_back(npos);
+                }
+            }
+
+            if (ijk[0] != 0) { // -x
+                const Index npos = pos - NodeType::DIM * NodeType::DIM;
+                if (unvisited.isOn(npos)) {
+                    unvisited.setOff(npos);
+                    pending.push_back(npos);
+                }
+            }
+
+            if (ijk[0] != (NodeType::DIM - 1)) { // +x
+                const Index npos = pos + NodeType::DIM * NodeType::DIM;
+                if (unvisited.isOn(npos)) {
+                    unvisited.setOff(npos);
+                    pending.push_back(npos);
+                }
+            }
+        }
+
+        segments.push_back(segment);
+    }
+}
+
+
+/// TBB body that runs nodeMaskSegmentation() on each node of a list, storing the
+/// segments for node @c n in slot @c n of the caller-provided array.
+///
+/// @warning As a side effect the x component of every processed node's origin is
+///          overwritten with the node's array index.
+template<typename NodeType>
+struct SegmentNodeMask
+{
+    typedef NodeMaskSegment<NodeType>                       NodeMaskSegmentType;
+    typedef typename NodeMaskSegmentType::Ptr               NodeMaskSegmentTypePtr;
+    typedef typename std::vector<NodeMaskSegmentTypePtr>    NodeMaskSegmentVector;
+
+    /// @param nodes          nodes to segment (may be empty).
+    /// @param nodeMaskArray  output array, indexed like @a nodes.
+    SegmentNodeMask(std::vector<NodeType*>& nodes, NodeMaskSegmentVector* nodeMaskArray)
+        : mNodes(nodes.empty() ? NULL : &nodes.front())
+        , mNodeMaskArray(nodeMaskArray)
+    {
+    }
+
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+        const size_t last = range.end();
+        for (size_t idx = range.begin(); idx < last; ++idx) {
+            NodeType& current = *mNodes[idx];
+            nodeMaskSegmentation(current, mNodeMaskArray[idx]);
+
+            // hack origin data to store array offset
+            const_cast<Coord&>(current.origin())[0] = static_cast<int>(idx);
+        }
+    }
+
+    NodeType                * const * const mNodes;
+    NodeMaskSegmentVector           * const mNodeMaskArray;
+}; // struct SegmentNodeMask
+
+
+/// @brief Functor that, for every node slot in a range, records which mask
+/// segments stored for the six face-adjacent nodes touch each of the node's
+/// own mask segments.
+///
+/// @details For each active voxel that lies on a face of the node, the voxel
+/// directly across that face in the neighbouring node is examined. If that
+/// voxel is active, the neighbouring segment whose mask contains it is recorded
+/// as a connection of the local segment whose mask contains the original voxel.
+/// The collected (de-duplicated) connections are appended to the
+/// @c connections list of every local segment.
+///
+/// @note @c getNodeOffset() uses the x component of a node's origin as an index
+/// into the segment array. NOTE(review): this presumably relies on an earlier
+/// pass having stored each node's array index there -- confirm against the caller.
+template<typename TreeType, typename NodeType>
+struct ConnectNodeMaskSegments
+{
+    typedef typename NodeType::NodeMaskType                 NodeMaskType;
+    typedef NodeMaskSegment<NodeType>                       NodeMaskSegmentType;
+    typedef typename NodeMaskSegmentType::Ptr               NodeMaskSegmentTypePtr;
+    typedef typename std::vector<NodeMaskSegmentTypePtr>    NodeMaskSegmentVector;
+
+    /// @param tree           tree that is probed for the nodes and their neighbours
+    /// @param nodeMaskArray  array of per-node segment lists, indexed by node slot
+    ConnectNodeMaskSegments(const TreeType& tree, NodeMaskSegmentVector* nodeMaskArray)
+        : mTree(&tree)
+        , mNodeMaskArray(nodeMaskArray)
+    {
+    }
+
+    /// Process the node slots in @a range (suitable as a tbb::parallel_for body).
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+
+        tree::ValueAccessor<const TreeType> acc(*mTree);
+
+        for (size_t n = range.begin(), N = range.end(); n < N; ++n) {
+
+            NodeMaskSegmentVector& segments = mNodeMaskArray[n];
+            if (segments.empty()) continue;
+
+            // One set of neighbouring segments per local segment; the set
+            // removes duplicates produced by multiple voxels on the same face.
+            std::vector<std::set<NodeMaskSegmentType*> > connections(segments.size());
+
+            Coord ijk = segments[0]->origin;
+
+            const NodeType* node = acc.template probeConstNode<NodeType>(ijk);
+            if (!node) continue;
+
+            // get neighbour nodes (ijk is restored after each axis)
+
+            ijk[2] += NodeType::DIM;
+            const NodeType* nodeZUp = acc.template probeConstNode<NodeType>(ijk);
+            ijk[2] -= (NodeType::DIM + NodeType::DIM);
+            const NodeType* nodeZDown = acc.template probeConstNode<NodeType>(ijk);
+            ijk[2] += NodeType::DIM;
+
+            ijk[1] += NodeType::DIM;
+            const NodeType* nodeYUp = acc.template probeConstNode<NodeType>(ijk);
+            ijk[1] -= (NodeType::DIM + NodeType::DIM);
+            const NodeType* nodeYDown = acc.template probeConstNode<NodeType>(ijk);
+            ijk[1] += NodeType::DIM;
+
+            ijk[0] += NodeType::DIM;
+            const NodeType* nodeXUp = acc.template probeConstNode<NodeType>(ijk);
+            ijk[0] -= (NodeType::DIM + NodeType::DIM);
+            const NodeType* nodeXDown = acc.template probeConstNode<NodeType>(ijk);
+            ijk[0] += NodeType::DIM;
+
+            const Index startPos = node->getValueMask().findFirstOn();
+            for (Index pos = startPos; pos < NodeMaskType::SIZE; ++pos) {
+
+                if (!node->isValueOn(pos)) continue;
+
+                ijk = NodeType::offsetToLocalCoord(pos);
+
+                // For a voxel on a node face, the second index passed below is the
+                // linear offset of the voxel on the opposite face of the neighbour.
+
+                if (ijk[2] == 0) {
+                    connectToNeighbour(nodeZDown, pos,
+                        pos + (NodeType::DIM - 1), segments, connections);
+                } else if (ijk[2] == (NodeType::DIM - 1)) {
+                    connectToNeighbour(nodeZUp, pos,
+                        pos - (NodeType::DIM - 1), segments, connections);
+                }
+
+                if (ijk[1] == 0) {
+                    connectToNeighbour(nodeYDown, pos,
+                        pos + (NodeType::DIM - 1) * NodeType::DIM, segments, connections);
+                } else if (ijk[1] == (NodeType::DIM - 1)) {
+                    connectToNeighbour(nodeYUp, pos,
+                        pos - (NodeType::DIM - 1) * NodeType::DIM, segments, connections);
+                }
+
+                if (ijk[0] == 0) {
+                    connectToNeighbour(nodeXDown, pos,
+                        pos + (NodeType::DIM - 1) * NodeType::DIM * NodeType::DIM,
+                        segments, connections);
+                } else if (ijk[0] == (NodeType::DIM - 1)) {
+                    connectToNeighbour(nodeXUp, pos,
+                        pos - (NodeType::DIM - 1) * NodeType::DIM * NodeType::DIM,
+                        segments, connections);
+                }
+            }
+
+            // Append the collected connections to each segment. A range insert
+            // grows the vector by exactly the number of connections of segment i
+            // (the previous code reserved by the number of segments instead).
+            for (size_t i = 0, I = connections.size(); i < I; ++i) {
+                std::vector<NodeMaskSegmentType*>& segmentConnections = segments[i]->connections;
+                segmentConnections.insert(segmentConnections.end(),
+                    connections[i].begin(), connections[i].end());
+            }
+        } // end range loop
+    }
+
+private:
+
+    /// Index into @c mNodeMaskArray for @a node (the x component of its origin).
+    static inline size_t getNodeOffset(const NodeType& node) {
+        return static_cast<size_t>(node.origin()[0]);
+    }
+
+    /// Return the first segment in @a segments whose mask has @a pos on,
+    /// or NULL if there is none.
+    static inline NodeMaskSegmentType*
+    findNodeMaskSegment(NodeMaskSegmentVector& segments, Index pos)
+    {
+        NodeMaskSegmentType* segment = NULL;
+
+        for (size_t n = 0, N = segments.size(); n < N; ++n) {
+            if (segments[n]->mask.isOn(pos)) {
+                segment = segments[n].get();
+                break;
+            }
+        }
+
+        return segment;
+    }
+
+    /// Return the index of the first segment in @a segments whose mask has
+    /// @a pos on, or Index(-1) if there is none.
+    static inline Index
+    findNodeMaskSegmentIndex(NodeMaskSegmentVector& segments, Index pos)
+    {
+        for (Index n = 0, N = Index(segments.size()); n < N; ++n) {
+            if (segments[n]->mask.isOn(pos)) return n;
+        }
+        return Index(-1);
+    }
+
+    /// If @a neighbour exists and its voxel @a npos is active, record the
+    /// neighbouring segment containing @a npos as a connection of the local
+    /// segment containing @a pos.
+    inline void
+    connectToNeighbour(const NodeType* neighbour, Index pos, Index npos,
+        NodeMaskSegmentVector& segments,
+        std::vector<std::set<NodeMaskSegmentType*> >& connections) const
+    {
+        if (neighbour && neighbour->isValueOn(npos)) {
+            NodeMaskSegmentType* nsegment =
+                findNodeMaskSegment(mNodeMaskArray[getNodeOffset(*neighbour)], npos);
+            const Index idx = findNodeMaskSegmentIndex(segments, pos);
+            connections[idx].insert(nsegment);
+        }
+    }
+
+    TreeType                const * const mTree;
+    NodeMaskSegmentVector         * const mNodeMaskArray;
+}; // struct ConnectNodeMaskSegments
+
+
+/// @brief Reduction body that ORs the masks of a list of node mask segments
+/// into the leaf nodes of a newly allocated tree.
+///
+/// @details Each instance (including every instance created by a split) owns
+/// its own output tree, constructed with a background of @c false; @c join()
+/// merges the right-hand tree into this one.
+template<typename TreeType>
+struct MaskSegmentGroup
+{
+    typedef typename TreeType::LeafNodeType     LeafNodeType;
+    typedef typename TreeType::Ptr              TreeTypePtr;
+    typedef NodeMaskSegment<LeafNodeType>       NodeMaskSegmentType;
+
+    /// @param segments  segments to accumulate; the vector must outlive this object
+    MaskSegmentGroup(const std::vector<NodeMaskSegmentType*>& segments)
+        : mSegments(segments.empty() ? NULL : &segments.front())
+        , mTree(new TreeType(false))
+    {
+    }
+
+    /// Splitting constructor: shares the segment list, starts with an empty tree.
+    MaskSegmentGroup(const MaskSegmentGroup& rhs, tbb::split)
+        : mSegments(rhs.mSegments)
+        , mTree(new TreeType(false))
+    {
+    }
+
+    /// Access the accumulated output tree.
+    TreeTypePtr& mask() { return mTree; }
+
+    /// Merge the tree accumulated by @a rhs into this instance's tree.
+    void join(MaskSegmentGroup& rhs) { mTree->merge(*rhs.mTree); }
+
+    /// Accumulate the segments with indices in @a range.
+    void operator()(const tbb::blocked_range<size_t>& range) {
+
+        tree::ValueAccessor<TreeType> accessor(*mTree);
+
+        const size_t last = range.end();
+        for (size_t idx = range.begin(); idx < last; ++idx) {
+            NodeMaskSegmentType* const segment = mSegments[idx];
+            // touchLeaf returns the leaf at the segment origin, creating it if needed.
+            LeafNodeType* leaf = accessor.touchLeaf(segment->origin);
+            leaf->getValueMask() |= segment->mask;
+        }
+    }
+
+private:
+    NodeMaskSegmentType * const * const mSegments;
+    TreeTypePtr                         mTree;
+}; // struct MaskSegmentGroup
+
+
+////////////////////////////////////////
+
+
+/// @brief Reduction body that finds voxels in face-adjacent leaf nodes into
+/// which a mask region can grow.
+///
+/// @details For every active voxel of a mask leaf node that lies on a face of
+/// the node, the voxel directly across that face is considered. It is switched
+/// on in the new mask tree if (a) the distance tree has a leaf node there,
+/// (b) the voxel is active in that leaf but not active in the corresponding
+/// leaf of the mask tree, and (c) its absolute distance value is greater than
+/// the absolute distance value of the voxel being expanded from.
+/// Each instance (including split instances) accumulates into its own tree,
+/// and @c join() merges them.
+template<typename TreeType>
+struct ExpandLeafNodeRegion
+{
+    typedef typename TreeType::ValueType                            ValueType;
+    typedef typename TreeType::LeafNodeType                         LeafNodeType;
+    typedef typename LeafNodeType::NodeMaskType                     NodeMaskType;
+
+    typedef typename TreeType::template ValueConverter<bool>::Type  BoolTreeType;
+    typedef typename BoolTreeType::LeafNodeType                     BoolLeafNodeType;
+
+    /////
+
+    /// @param distTree   tree holding the distance values
+    /// @param maskTree   current mask; voxels already on in it are not re-added
+    /// @param maskNodes  mask leaf nodes to expand from; must outlive this object
+    ExpandLeafNodeRegion(const TreeType& distTree, BoolTreeType& maskTree, std::vector<BoolLeafNodeType*>& maskNodes)
+        : mDistTree(&distTree)
+        , mMaskTree(&maskTree)
+        , mMaskNodes(!maskNodes.empty() ? &maskNodes.front() : NULL)
+        , mNewMaskTree(false)
+    {
+    }
+
+    /// Splitting constructor: shares the inputs, starts with an empty output tree.
+    ExpandLeafNodeRegion(const ExpandLeafNodeRegion& rhs, tbb::split)
+        : mDistTree(rhs.mDistTree)
+        , mMaskTree(rhs.mMaskTree)
+        , mMaskNodes(rhs.mMaskNodes)
+        , mNewMaskTree(false)
+    {
+    }
+
+    /// Access the tree of newly found voxels.
+    BoolTreeType& newMaskTree() { return mNewMaskTree; }
+
+    /// Merge the voxels found by @a rhs into this instance's output tree.
+    void join(ExpandLeafNodeRegion& rhs) { mNewMaskTree.merge(rhs.mNewMaskTree); }
+
+    /// Process the mask leaf nodes with indices in @a range.
+    void operator()(const tbb::blocked_range<size_t>& range) {
+
+        typedef LeafNodeType    NodeType;
+
+        tree::ValueAccessor<const TreeType>         distAcc(*mDistTree);
+        tree::ValueAccessor<const BoolTreeType>     maskAcc(*mMaskTree);
+        tree::ValueAccessor<BoolTreeType>           newMaskAcc(mNewMaskTree);
+
+        // Scratch masks, reused across iterations. Each one is only meaningful
+        // while the matching data pointer below is non-NULL for the current node.
+        NodeMaskType maskZUp, maskZDown, maskYUp, maskYDown, maskXUp, maskXDown;
+
+        for (size_t n = range.begin(), N = range.end(); n < N; ++n) {
+
+            BoolLeafNodeType& maskNode = *mMaskNodes[n];
+            if (maskNode.isEmpty()) continue;
+
+            Coord ijk = maskNode.origin(), nijk;
+
+            // Skip mask nodes that have no matching leaf in the distance tree.
+            const LeafNodeType* distNode = distAcc.probeConstLeaf(ijk);
+            if (!distNode) continue;
+
+            // Value buffers of the six face-adjacent distance leaf nodes
+            // (left NULL where no such leaf exists).
+            const ValueType *dataZUp = NULL, *dataZDown = NULL,
+                            *dataYUp = NULL, *dataYDown = NULL,
+                            *dataXUp = NULL, *dataXDown = NULL;
+
+            // Step ijk to each neighbour's origin in turn, restoring it after each axis.
+            ijk[2] += NodeType::DIM;
+            getData(ijk, distAcc, maskAcc, maskZUp, dataZUp);
+            ijk[2] -= (NodeType::DIM + NodeType::DIM);
+            getData(ijk, distAcc, maskAcc, maskZDown, dataZDown);
+            ijk[2] += NodeType::DIM;
+
+            ijk[1] += NodeType::DIM;
+            getData(ijk, distAcc, maskAcc, maskYUp, dataYUp);
+            ijk[1] -= (NodeType::DIM + NodeType::DIM);
+            getData(ijk, distAcc, maskAcc, maskYDown, dataYDown);
+            ijk[1] += NodeType::DIM;
+
+            ijk[0] += NodeType::DIM;
+            getData(ijk, distAcc, maskAcc, maskXUp, dataXUp);
+            ijk[0] -= (NodeType::DIM + NodeType::DIM);
+            getData(ijk, distAcc, maskAcc, maskXDown, dataXDown);
+            ijk[0] += NodeType::DIM;
+
+            for (typename BoolLeafNodeType::ValueOnIter it = maskNode.beginValueOn(); it; ++it) {
+
+                const Index pos = it.pos();
+                const ValueType val = std::abs(distNode->getValue(pos));
+
+                // ijk: coordinate local to the node; nijk: the same voxel in global index space.
+                ijk = BoolLeafNodeType::offsetToLocalCoord(pos);
+                nijk = ijk + maskNode.origin();
+
+                // In each branch npos is the linear offset, inside the neighbouring
+                // leaf, of the voxel on its opposite face (same other two coordinates).
+
+                if (dataZUp && ijk[2] == (BoolLeafNodeType::DIM - 1)) {
+                    const Index npos = pos - (NodeType::DIM - 1);
+                    if (maskZUp.isOn(npos) && std::abs(dataZUp[npos]) > val) {
+                        newMaskAcc.setValueOn(nijk.offsetBy(0, 0, 1));
+                    }
+                } else if (dataZDown && ijk[2] == 0) {
+                    const Index npos = pos + (NodeType::DIM - 1);
+                    if (maskZDown.isOn(npos) && std::abs(dataZDown[npos]) > val) {
+                        newMaskAcc.setValueOn(nijk.offsetBy(0, 0, -1));
+                    }
+                }
+
+                if (dataYUp && ijk[1] == (BoolLeafNodeType::DIM - 1)) {
+                    const Index npos = pos - (NodeType::DIM - 1) * NodeType::DIM;
+                    if (maskYUp.isOn(npos) && std::abs(dataYUp[npos]) > val) {
+                        newMaskAcc.setValueOn(nijk.offsetBy(0, 1, 0));
+                    }
+                } else if (dataYDown && ijk[1] == 0) {
+                    const Index npos = pos + (NodeType::DIM - 1) * NodeType::DIM;
+                    if (maskYDown.isOn(npos) && std::abs(dataYDown[npos]) > val) {
+                        newMaskAcc.setValueOn(nijk.offsetBy(0, -1, 0));
+                    }
+                }
+
+                if (dataXUp && ijk[0] == (BoolLeafNodeType::DIM - 1)) {
+                    const Index npos = pos - (NodeType::DIM - 1) * NodeType::DIM * NodeType::DIM;
+                    if (maskXUp.isOn(npos) && std::abs(dataXUp[npos]) > val) {
+                        newMaskAcc.setValueOn(nijk.offsetBy(1, 0, 0));
+                    }
+                } else if (dataXDown && ijk[0] == 0) {
+                    const Index npos = pos + (NodeType::DIM - 1) * NodeType::DIM * NodeType::DIM;
+                    if (maskXDown.isOn(npos) && std::abs(dataXDown[npos]) > val) {
+                        newMaskAcc.setValueOn(nijk.offsetBy(-1, 0, 0));
+                    }
+                }
+
+            } // end value on loop
+        } // end range loop
+    }
+
+private:
+
+    /// If the distance tree has a leaf node at @a ijk, set @a data to its value
+    /// buffer and @a mask to its value mask minus the value mask of the mask
+    /// tree's leaf at @a ijk (if any). Otherwise @a mask and @a data are left
+    /// untouched.
+    static inline void
+    getData(const Coord& ijk, tree::ValueAccessor<const TreeType>& distAcc,
+        tree::ValueAccessor<const BoolTreeType>& maskAcc, NodeMaskType& mask, const ValueType*& data)
+    {
+        const LeafNodeType* node = distAcc.probeConstLeaf(ijk);
+        if (node) {
+            data = node->buffer().data();
+            mask = node->getValueMask();
+            const BoolLeafNodeType* maskNodePt = maskAcc.probeConstLeaf(ijk);
+            if (maskNodePt) mask -= maskNodePt->getValueMask();
+        }
+    }
+
+    TreeType        const * const mDistTree;
+    BoolTreeType          * const mMaskTree;
+    BoolLeafNodeType     ** const mMaskNodes;
+
+    // Output: voxels found by this instance.
+    BoolTreeType mNewMaskTree;
+}; // struct ExpandLeafNodeRegion
+
+
+/// @brief Functor that flood fills each mask leaf node, within the bounds of
+/// that node, across voxels of increasing absolute distance value.
+///
+/// @details The voxels that are on in a mask node on entry are used as seeds.
+/// From each visited voxel the fill steps to a face-adjacent voxel of the same
+/// leaf if that voxel is active in the distance leaf, has not been claimed yet,
+/// and has an absolute distance value greater than the current voxel's. On exit
+/// the mask node's value mask holds the visited voxels. Mask nodes with no
+/// matching distance leaf are left unchanged.
+template<typename TreeType>
+struct FillLeafNodeVoxels
+{
+    typedef typename TreeType::ValueType                    ValueType;
+    typedef typename TreeType::LeafNodeType                 LeafNodeType;
+    typedef typename LeafNodeType::NodeMaskType             NodeMaskType;
+    typedef tree::LeafNode<bool, LeafNodeType::LOG2DIM>     BoolLeafNodeType;
+
+    /// @param tree       tree holding the distance values
+    /// @param maskNodes  mask leaf nodes to fill in place; must outlive this object
+    FillLeafNodeVoxels(const TreeType& tree, std::vector<BoolLeafNodeType*>& maskNodes)
+        : mTree(&tree), mMaskNodes(maskNodes.empty() ? NULL : &maskNodes.front())
+    {
+    }
+
+    /// Fill the mask leaf nodes with indices in @a range.
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+
+        tree::ValueAccessor<const TreeType> distAcc(*mTree);
+
+        // Work stack of linear voxel offsets; always empty between nodes.
+        std::vector<Index> pending;
+        pending.reserve(NodeMaskType::SIZE);
+
+        // Linear offset strides along y and x (the stride along z is 1).
+        const Index lastCoord = LeafNodeType::DIM - 1;
+        const Index strideY = LeafNodeType::DIM;
+        const Index strideX = LeafNodeType::DIM * LeafNodeType::DIM;
+
+        for (size_t n = range.begin(), N = range.end(); n < N; ++n) {
+
+            BoolLeafNodeType& maskNode = *mMaskNodes[n];
+
+            const LeafNodeType * distNode = distAcc.probeConstLeaf(maskNode.origin());
+            if (!distNode) continue;
+
+            NodeMaskType candidates(distNode->getValueMask());
+            NodeMaskType& band = maskNode.getValueMask();
+
+            // Seed the stack with every voxel currently on in the mask node.
+            for (Index pos = band.findFirstOn(); pos < NodeMaskType::SIZE; ++pos) {
+                if (band.isOn(pos)) pending.push_back(pos);
+            }
+
+            candidates -= band; // bitwise difference
+            band.setOff();      // rebuilt below as voxels are visited
+
+            const ValueType* data = distNode->buffer().data();
+
+            while (!pending.empty()) {
+
+                const Index pos = pending.back();
+                pending.pop_back();
+
+                if (band.isOn(pos)) continue;
+                band.setOn(pos);
+
+                const ValueType dist = std::abs(data[pos]);
+                const Coord ijk = LeafNodeType::offsetToLocalCoord(pos);
+
+                // Visit order: -z, +z, -y, +y, -x, +x.
+                visitNeighbour(ijk[2] != 0,         pos - 1,       dist, data, candidates, pending);
+                visitNeighbour(ijk[2] != lastCoord, pos + 1,       dist, data, candidates, pending);
+                visitNeighbour(ijk[1] != 0,         pos - strideY, dist, data, candidates, pending);
+                visitNeighbour(ijk[1] != lastCoord, pos + strideY, dist, data, candidates, pending);
+                visitNeighbour(ijk[0] != 0,         pos - strideX, dist, data, candidates, pending);
+                visitNeighbour(ijk[0] != lastCoord, pos + strideX, dist, data, candidates, pending);
+            } // end flood fill loop
+        } // end range loop
+    }
+
+    TreeType            const * const mTree;
+    BoolLeafNodeType         ** const mMaskNodes;
+
+private:
+
+    /// If @a insideNode is true, voxel @a npos is still a candidate and its
+    /// absolute value exceeds @a dist, claim the voxel and push it on @a pending.
+    /// (@a npos is only used as an index when @a insideNode is true.)
+    static inline void
+    visitNeighbour(bool insideNode, Index npos, const ValueType& dist,
+        const ValueType* data, NodeMaskType& candidates, std::vector<Index>& pending)
+    {
+        if (insideNode && candidates.isOn(npos) && std::abs(data[npos]) > dist) {
+            candidates.setOff(npos);
+            pending.push_back(npos);
+        }
+    }
+}; // FillLeafNodeVoxels
+
+
+/// @brief Functor that grows each segment mask in place by alternating
+/// @c FillLeafNodeVoxels (fill inside leaf nodes) and @c ExpandLeafNodeRegion
+/// (step across leaf node faces) until no new voxels are found.
+template<typename TreeType>
+struct ExpandNarrowbandMask
+{
+    typedef typename TreeType::template ValueConverter<bool>::Type  BoolTreeType;
+    typedef typename BoolTreeType::LeafNodeType                     BoolLeafNodeType;
+    typedef typename BoolTreeType::Ptr                              BoolTreeTypePtr;
+
+    /// @param tree      tree holding the distance values
+    /// @param segments  masks to expand in place; the vector must outlive this object
+    ExpandNarrowbandMask(const TreeType& tree, std::vector<BoolTreeTypePtr>& segments)
+        : mTree(&tree), mSegments(segments.empty() ? NULL : &segments.front())
+    {
+    }
+
+    /// Expand the masks with indices in @a range.
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+
+        std::vector<BoolLeafNodeType*> leafNodes;
+
+        const size_t last = range.end();
+        for (size_t idx = range.begin(); idx < last; ++idx) {
+
+            BoolTreeType& bandMask = *mSegments[idx];
+
+            // The first front is a topology copy of the mask itself; later
+            // fronts hold only the voxels found by the previous expansion step.
+            BoolTreeType front(bandMask, false, TopologyCopy());
+
+            for (;;) {
+
+                leafNodes.clear();
+                front.getNodes(leafNodes);
+                if (leafNodes.empty()) break;
+
+                const tbb::blocked_range<size_t> leafRange(0, leafNodes.size());
+
+                // Fill within each leaf node of the front, then fold the
+                // filled front into the mask.
+                tbb::parallel_for(leafRange, FillLeafNodeVoxels<TreeType>(*mTree, leafNodes));
+                bandMask.topologyUnion(front);
+
+                // Look for voxels across leaf node faces that are not in the mask yet.
+                ExpandLeafNodeRegion<TreeType> expandOp(*mTree, bandMask, leafNodes);
+                tbb::parallel_reduce(leafRange, expandOp);
+
+                if (expandOp.newMaskTree().empty()) break;
+
+                front.clear();
+                front.merge(expandOp.newMaskTree());
+            } // end expand loop
+        } // end range loop
+    }
+
+    TreeType            const * const mTree;
+    BoolTreeTypePtr           * const mSegments;
+}; // ExpandNarrowbandMask
+
+
+/// @brief Functor that runs @c tools::signedFloodFillWithValues on each segment
+/// tree, using the absolute background of the source tree as the exterior
+/// value and minus the absolute minimum value found in the source tree as the
+/// interior value.
+template<typename TreeType>
+struct FloodFillSign
+{
+    typedef typename TreeType::Ptr                                              TreeTypePtr;
+    typedef typename TreeType::ValueType                                        ValueType;
+    typedef typename TreeType::LeafNodeType                                     LeafNodeType;
+    typedef typename TreeType::RootNodeType                                     RootNodeType;
+    typedef typename RootNodeType::NodeChainType                                NodeChainType;
+    typedef typename boost::mpl::at<NodeChainType, boost::mpl::int_<1> >::type  InternalNodeType;
+
+    /// @param tree      source tree; scanned here for its minimum value
+    /// @param segments  trees to flood fill; the vector must outlive this object
+    FloodFillSign(const TreeType& tree, std::vector<TreeTypePtr>& segments)
+        : mTree(&tree)
+        , mSegments(segments.empty() ? NULL : &segments.front())
+        , mMinValue(ValueType(0.0))
+    {
+        ValueType smallest = std::numeric_limits<ValueType>::max();
+
+        // First pass: minimum over the tile values of the internal nodes.
+        {
+            std::vector<const InternalNodeType*> internalNodes;
+            tree.getNodes(internalNodes);
+
+            if (!internalNodes.empty()) {
+                FindMinTileValue<InternalNodeType> tileOp(&internalNodes.front());
+                tbb::parallel_reduce(
+                    tbb::blocked_range<size_t>(0, internalNodes.size()), tileOp);
+                if (tileOp.minValue < smallest) smallest = tileOp.minValue;
+            }
+        }
+
+        // Second pass over the voxels, only needed while the minimum is still positive.
+        if (smallest > ValueType(0.0)) {
+            std::vector<const LeafNodeType*> leafNodes;
+            tree.getNodes(leafNodes);
+
+            if (!leafNodes.empty()) {
+                FindMinVoxelValue<LeafNodeType> voxelOp(&leafNodes.front());
+                tbb::parallel_reduce(
+                    tbb::blocked_range<size_t>(0, leafNodes.size()), voxelOp);
+                if (voxelOp.minValue < smallest) smallest = voxelOp.minValue;
+            }
+        }
+
+        mMinValue = smallest;
+    }
+
+    /// Flood fill the segments with indices in @a range.
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+        const ValueType outside = std::abs(mTree->background());
+        const ValueType inside = -std::abs(mMinValue);
+
+        const size_t last = range.end();
+        for (size_t idx = range.begin(); idx < last; ++idx) {
+            tools::signedFloodFillWithValues(*mSegments[idx], outside, inside);
+        }
+    }
+
+private:
+
+    TreeType    const * const mTree;
+    TreeTypePtr       * const mSegments;
+    ValueType                 mMinValue;
+}; // FloodFillSign
+
+
+/// @brief Functor that builds, for each mask tree, a new tree holding the
+/// values of the input tree at the mask's active voxels, and stores it in the
+/// matching slot of the segment array.
+template<typename TreeType>
+struct MaskedCopy
+{
+    typedef typename TreeType::Ptr                                  TreeTypePtr;
+    typedef typename TreeType::ValueType                            ValueType;
+    typedef typename TreeType::LeafNodeType                         LeafNodeType;
+
+    typedef typename TreeType::template ValueConverter<bool>::Type  BoolTreeType;
+    typedef typename BoolTreeType::Ptr                              BoolTreeTypePtr;
+    typedef typename BoolTreeType::LeafNodeType                     BoolLeafNodeType;
+
+    /// @param tree      input tree to copy values from
+    /// @param segments  output slots, one per mask; must outlive this object
+    /// @param masks     mask trees selecting what to copy; must outlive this object
+    MaskedCopy(const TreeType& tree, std::vector<TreeTypePtr>& segments, std::vector<BoolTreeTypePtr>& masks)
+        : mTree(&tree)
+        , mSegments(segments.empty() ? NULL : &segments.front())
+        , mMasks(masks.empty() ? NULL : &masks.front())
+    {
+    }
+
+    /// Build the output trees for the masks with indices in @a range.
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+
+        std::vector<const BoolLeafNodeType*> maskLeafs;
+
+        const size_t last = range.end();
+        for (size_t idx = range.begin(); idx < last; ++idx) {
+
+            maskLeafs.clear();
+            mMasks[idx]->getNodes(maskLeafs);
+
+            Copy copyOp(*mTree, maskLeafs);
+            tbb::parallel_reduce(tbb::blocked_range<size_t>(0, maskLeafs.size()), copyOp);
+            mSegments[idx] = copyOp.outputTree();
+        }
+    }
+
+private:
+
+    /// Reduction body that copies the masked content of the input tree, one
+    /// mask leaf node at a time, into an output tree owned by the instance.
+    struct Copy {
+        Copy(const TreeType& inputTree, std::vector<const BoolLeafNodeType*>& maskNodes)
+            : mInputTree(&inputTree)
+            , mMaskNodes(maskNodes.empty() ? NULL : &maskNodes.front())
+            , mOutputTreePtr(new TreeType(inputTree.background()))
+        {
+        }
+
+        /// Splitting constructor: shares the inputs, starts with an empty
+        /// output tree that has the input tree's background.
+        Copy(const Copy& rhs, tbb::split)
+            : mInputTree(rhs.mInputTree)
+            , mMaskNodes(rhs.mMaskNodes)
+            , mOutputTreePtr(new TreeType(mInputTree->background()))
+        {
+        }
+
+        TreeTypePtr& outputTree() { return mOutputTreePtr; }
+
+        /// Merge the output tree of @a rhs into this instance's output tree.
+        void join(Copy& rhs) { mOutputTreePtr->merge(*rhs.mOutputTreePtr); }
+
+        void operator()(const tbb::blocked_range<size_t>& range) {
+
+            tree::ValueAccessor<const TreeType> inputAcc(*mInputTree);
+            tree::ValueAccessor<TreeType>       outputAcc(*mOutputTreePtr);
+
+            const size_t last = range.end();
+            for (size_t idx = range.begin(); idx < last; ++idx) {
+
+                const BoolLeafNodeType& maskNode = *mMaskNodes[idx];
+                if (maskNode.isEmpty()) continue;
+
+                const Coord& origin = maskNode.origin();
+                const LeafNodeType* inputNode = inputAcc.probeConstLeaf(origin);
+
+                if (!inputNode) {
+                    // The input has no leaf here: reproduce the value found at
+                    // the origin as an active tile at the corresponding level.
+                    const int valueDepth = inputAcc.getValueDepth(origin);
+                    if (valueDepth >= 0) {
+                        outputAcc.addTile(TreeType::RootNodeType::LEVEL - valueDepth,
+                            origin, inputAcc.getValue(origin), true);
+                    }
+                    continue;
+                }
+
+                // Copy the value of every voxel that is on in the mask node.
+                LeafNodeType* outputNode = outputAcc.touchLeaf(origin);
+                typename BoolLeafNodeType::ValueOnCIter it = maskNode.cbeginValueOn();
+                for ( ; it; ++it) {
+                    const Index offset = it.pos();
+                    outputNode->setValueOn(offset, inputNode->getValue(offset));
+                }
+            }
+        }
+
+    private:
+        TreeType                 const * const mInputTree;
+        BoolLeafNodeType const * const * const mMaskNodes;
+        TreeTypePtr                            mOutputTreePtr;
+    }; // struct Copy
+
+    TreeType            const * const mTree;
+    TreeTypePtr               * const mSegments;
+    BoolTreeTypePtr           * const mMasks;
+}; // MaskedCopy
+
+
+////////////////////////////////////////
+
+
+/// @brief Functor that writes the active voxel count of each segment into the
+/// slot of @c countArray with the same index.
+template<typename VolumePtrType>
+struct ComputeActiveVoxelCount
+{
+    /// @param segments    volumes to count; the vector must outlive this object
+    /// @param countArray  output array with at least one slot per segment
+    ComputeActiveVoxelCount(std::vector<VolumePtrType>& segments, size_t *countArray)
+        : mSegments(segments.empty() ? NULL : &segments.front())
+        , mCountArray(countArray)
+    {
+    }
+
+    /// Count the segments with indices in @a range.
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+        const size_t last = range.end();
+        for (size_t idx = range.begin(); idx < last; ++idx) {
+            VolumePtrType& segment = mSegments[idx];
+            mCountArray[idx] = segment->activeVoxelCount();
+        }
+    }
+
+    VolumePtrType   * const mSegments;
+    size_t          * const mCountArray;
+};
+
+
+/// @brief Comparison functor over indices: orders index @c lhs before index
+/// @c rhs when the count stored for @c lhs is strictly greater than the count
+/// stored for @c rhs.
+struct GreaterCount
+{
+    /// @param countArray  counts indexed by the values being compared; not owned
+    GreaterCount(const size_t *countArray) : mCountArray(countArray) {}
+
+    inline bool operator() (const size_t& lhs, const size_t& rhs) const
+    {
+        const size_t lhsCount = mCountArray[lhs];
+        const size_t rhsCount = mCountArray[rhs];
+        return rhsCount < lhsCount;
+    }
+
+    size_t const * const mCountArray;
+};
+
+////////////////////////////////////////
+
+
+/// @brief Adapter used to return results in the same form (tree or grid) as
+/// the input volume. This primary template handles tree inputs: the given
+/// tree pointer is returned as is and the reference volume is ignored.
+template<typename TreeType>
+struct GridOrTreeConstructor
+{
+    typedef typename TreeType::Ptr                                  TreeTypePtr;
+    typedef typename TreeType::template ValueConverter<bool>::Type  BoolTreeType;
+    typedef typename BoolTreeType::Ptr                              BoolTreePtrType;
+
+    /// Return @a maskTree unchanged.
+    static BoolTreePtrType constructMask(const TreeType&, BoolTreePtrType& maskTree)
+    {
+        return maskTree;
+    }
+
+    /// Return @a tree unchanged.
+    static TreeTypePtr construct(const TreeType&, TreeTypePtr& tree)
+    {
+        return tree;
+    }
+};
+
+
+/// @brief Specialization for grid inputs: wraps the given tree in a new grid
+/// that receives a copy of the reference grid's transform.
+template<typename TreeType>
+struct GridOrTreeConstructor<Grid<TreeType> >
+{
+    typedef Grid<TreeType>                                          GridType;
+    typedef typename Grid<TreeType>::Ptr                             GridTypePtr;
+    typedef typename TreeType::Ptr TreeTypePtr;
+
+    typedef typename TreeType::template ValueConverter<bool>::Type  BoolTreeType;
+    typedef typename BoolTreeType::Ptr                              BoolTreePtrType;
+    typedef Grid<BoolTreeType>                                      BoolGridType;
+    typedef typename BoolGridType::Ptr                              BoolGridPtrType;
+
+    /// Create a bool grid around @a maskTree with a copy of @a grid's transform.
+    static BoolGridPtrType constructMask(const GridType& grid, BoolTreePtrType& maskTree) {
+        BoolGridPtrType result = BoolGridType::create(maskTree);
+        result->setTransform(grid.transform().copy());
+        return result;
+    }
+
+    /// Create a grid around @a maskTree with a copy of @a grid's transform,
+    /// then insert @a grid's metadata into it.
+    static GridTypePtr construct(const GridType& grid, TreeTypePtr& maskTree) {
+        GridTypePtr result = GridType::create(maskTree);
+        result->setTransform(grid.transform().copy());
+        result->insertMeta(grid);
+        return result;
+    }
+};
+
+
+} // namespace level_set_util_internal
+
+
+////////////////////////////////////////
+
+
+/// @brief Convert a signed distance field grid, in place, into a fog volume.
+///
+/// @details The leaf nodes are stolen from the grid's tree, their voxel values
+/// are remapped, and the leaf nodes that remain are used to populate a new
+/// tree with a background of zero. Negative tiles of the original tree are
+/// re-created in the new tree as active tiles with value one. Finally the new
+/// tree replaces the grid's tree and the grid class is set to
+/// @c GRID_FOG_VOLUME.
+///
+/// @param grid            grid to convert in place
+/// @param cutoffDistance  distance used for the remapping; only its magnitude
+///                        matters (it is negated), and it is clamped so that it
+///                        is not below the minimum value found in the tree
+///
+/// @note Safe to call on a grid whose tree has no internal nodes or no leaf
+/// nodes: the node arrays are only dereferenced when they are non-empty.
+template <class GridType>
+inline void
+sdfToFogVolume(GridType& grid, typename GridType::ValueType cutoffDistance)
+{
+    typedef typename GridType::ValueType                                            ValueType;
+    typedef typename GridType::TreeType                                             TreeType;
+    typedef typename TreeType::LeafNodeType                                         LeafNodeType;
+    typedef typename TreeType::RootNodeType                                         RootNodeType;
+    typedef typename RootNodeType::NodeChainType                                    NodeChainType;
+    typedef typename boost::mpl::at<NodeChainType, boost::mpl::int_<1> >::type      InternalNodeType;
+
+    //////////
+
+    TreeType& tree = grid.tree();
+
+    size_t numLeafNodes = 0, numInternalNodes = 0;
+
+    std::vector<LeafNodeType*> nodes;
+    std::vector<size_t> leafnodeCount;
+
+    {
+        // Compute the prefix sum of the leafnode count in each internal node.
+        std::vector<InternalNodeType*> internalNodes;
+        tree.getNodes(internalNodes);
+
+        numInternalNodes = internalNodes.size();
+
+        leafnodeCount.push_back(0);
+        for (size_t n = 0; n < numInternalNodes; ++n) {
+            leafnodeCount.push_back(leafnodeCount.back() + internalNodes[n]->leafCount());
+        }
+
+        numLeafNodes = leafnodeCount.back();
+
+        // Steal all leafnodes (Removes them from the tree and transfers ownership.)
+        nodes.reserve(numLeafNodes);
+
+        for (size_t n = 0; n < numInternalNodes; ++n) {
+            internalNodes[n]->stealNodes(nodes, tree.background(), false);
+        }
+
+        // Clamp cutoffDistance to min sdf value
+        ValueType minSDFValue = std::numeric_limits<ValueType>::max();
+
+        // operator[] on an empty vector is undefined, so only take the address
+        // of the first element when there is one.
+        if (!internalNodes.empty()) {
+            level_set_util_internal::FindMinTileValue<InternalNodeType> minOp(&internalNodes[0]);
+            tbb::parallel_reduce(tbb::blocked_range<size_t>(0, internalNodes.size()), minOp);
+            minSDFValue = std::min(minSDFValue, minOp.minValue);
+        }
+
+        if (minSDFValue > ValueType(0.0) && !nodes.empty()) {
+            level_set_util_internal::FindMinVoxelValue<LeafNodeType> minOp(&nodes[0]);
+            tbb::parallel_reduce(tbb::blocked_range<size_t>(0, nodes.size()), minOp);
+            minSDFValue = std::min(minSDFValue, minOp.minValue);
+        }
+
+        cutoffDistance = -std::abs(cutoffDistance);
+        cutoffDistance = minSDFValue > cutoffDistance ? minSDFValue : cutoffDistance;
+    }
+
+    // Transform voxel values and delete leafnodes that are uniformly zero after the transformation.
+    // (Positive values are set to zero with inactive state and negative values are remapped
+    // from zero to one with active state.)
+    if (!nodes.empty()) {
+        tbb::parallel_for(tbb::blocked_range<size_t>(0, nodes.size()),
+            level_set_util_internal::SDFVoxelsToFogVolume<LeafNodeType>(&nodes[0], cutoffDistance));
+    }
+
+    // Populate a new tree with the remaining leafnodes
+    typename TreeType::Ptr newTree(new TreeType(ValueType(0.0)));
+
+    // leafnodeCount always holds at least the leading zero pushed above; the
+    // leaf node array may be empty, in which case a null pointer is passed.
+    level_set_util_internal::PopulateTree<TreeType> populate(
+        *newTree, nodes.empty() ? NULL : &nodes[0], &leafnodeCount[0], 0);
+    tbb::parallel_reduce(tbb::blocked_range<size_t>(0, numInternalNodes), populate);
+
+    // Transform tile values (Negative valued tiles are set to 1.0 with active state.)
+    std::vector<InternalNodeType*> internalNodes;
+    newTree->getNodes(internalNodes);
+
+    if (!internalNodes.empty()) {
+        tbb::parallel_for(tbb::blocked_range<size_t>(0, internalNodes.size()),
+            level_set_util_internal::SDFTilesToFogVolume<TreeType, InternalNodeType>(tree, &internalNodes[0]));
+    }
+
+    {
+        tree::ValueAccessor<const TreeType> acc(tree);
+
+        // Visit the values of the new tree down to two levels above the leaf
+        // level and activate those that are negative in the original tree.
+        typename TreeType::ValueAllIter it(*newTree);
+        it.setMaxDepth(TreeType::ValueAllIter::LEAF_DEPTH - 2);
+
+        for ( ; it; ++it) {
+            if (acc.getValue(it.getCoord()) < ValueType(0.0)) {
+                it.setValue(ValueType(1.0));
+                it.setActiveState(true);
+            }
+        }
+    }
+
+    // Insert missing root level tiles. (The new tree is constructed from the remaining leafnodes
+    // and will therefore not contain any root level tiles that may exist in the original tree.)
+    {
+        typename TreeType::ValueAllIter it(tree);
+        it.setMaxDepth(TreeType::ValueAllIter::ROOT_DEPTH);
+        for ( ; it; ++it) {
+            if (it.getValue() <  ValueType(0.0)) {
+                newTree->addTile(TreeType::ValueAllIter::ROOT_LEVEL, it.getCoord(), ValueType(1.0), true);
+            }
+        }
+    }
+
+    grid.setTree(newTree);
+    grid.setGridClass(GRID_FOG_VOLUME);
+}
+
+
+////////////////////////////////////////
+
+
+/// @brief Compute the interior mask of @a volume for the given @a isovalue
+/// (via @c computeInteriorMask) and return it in the same form, tree or grid,
+/// as the input.
+template <class GridOrTreeType>
+inline typename GridOrTreeType::template ValueConverter<bool>::Type::Ptr
+sdfInteriorMask(const GridOrTreeType& volume, typename GridOrTreeType::ValueType isovalue)
+{
+    typedef TreeAdapter<GridOrTreeType>                                     Adapter;
+    typedef typename Adapter::TreeType                                      TreeType;
+    typedef typename TreeType::template ValueConverter<bool>::Type::Ptr     BoolTreePtrType;
+    typedef level_set_util_internal::GridOrTreeConstructor<GridOrTreeType>  Constructor;
+
+    const TreeType& sdfTree = Adapter::tree(volume);
+
+    BoolTreePtrType interior = level_set_util_internal::computeInteriorMask(sdfTree, isovalue);
+
+    return Constructor::constructMask(volume, interior);
+}
+
+
+/// @brief Build a mask from the region mask computed by
+/// @c computeEnclosedRegionMask for @a volume, and return it in the same form,
+/// tree or grid, as the input.
+///
+/// @param volume    input tree or grid
+/// @param isovalue  isovalue forwarded to @c computeEnclosedRegionMask
+/// @param fillMask  optional mask forwarded to @c computeEnclosedRegionMask
+template<typename GridOrTreeType>
+inline typename GridOrTreeType::template ValueConverter<bool>::Type::Ptr
+extractEnclosedRegion(const GridOrTreeType& volume,
+    typename GridOrTreeType::ValueType isovalue,
+    const typename TreeAdapter<GridOrTreeType>::TreeType::template ValueConverter<bool>::Type* fillMask)
+{
+    typedef TreeAdapter<GridOrTreeType>                                     Adapter;
+    typedef typename Adapter::TreeType                                      TreeType;
+    typedef typename TreeType::template ValueConverter<char>::Type::Ptr     CharTreePtrType;
+    typedef typename TreeType::template ValueConverter<bool>::Type::Ptr     BoolTreePtrType;
+    typedef level_set_util_internal::GridOrTreeConstructor<GridOrTreeType>  Constructor;
+
+    const TreeType& sdfTree = Adapter::tree(volume);
+
+    // Step 1: char-valued region mask of the input.
+    CharTreePtrType regions =
+        level_set_util_internal::computeEnclosedRegionMask(sdfTree, isovalue, fillMask);
+
+    // Step 2: interior mask of that region mask, with an isovalue of zero.
+    BoolTreePtrType enclosed = level_set_util_internal::computeInteriorMask(*regions, 0);
+
+    return Constructor::constructMask(volume, enclosed);
+}
+
+
+////////////////////////////////////////
+
+
+template<typename GridOrTreeType>
+inline typename GridOrTreeType::template ValueConverter<bool>::Type::Ptr
+extractIsosurfaceMask(const GridOrTreeType& volume, typename GridOrTreeType::ValueType isovalue)
+{
+    typedef typename TreeAdapter<GridOrTreeType>::TreeType                 InputTreeType;
+    typedef typename InputTreeType::LeafNodeType                           InputLeafNodeType;
+    typedef typename InputTreeType::template ValueConverter<bool>::Type    BoolTreeType;
+    const InputTreeType& inputTree = TreeAdapter<GridOrTreeType>::tree(volume);
+
+    // Gather the input tree's leaf nodes; the masking operator below is reduced over this list.
+    std::vector<const InputLeafNodeType*> leafList;
+    inputTree.getNodes(leafList);
+    // Output mask tree, constructed from the value false.
+    typename BoolTreeType::Ptr crossingMask(new BoolTreeType(false));
+    level_set_util_internal::MaskIsovalueCrossingVoxels<InputTreeType> maskOp(inputTree, leafList, *crossingMask, isovalue);
+    tbb::parallel_reduce(tbb::blocked_range<size_t>(0, leafList.size()), maskOp);
+    return level_set_util_internal::GridOrTreeConstructor<GridOrTreeType>::constructMask(volume, crossingMask);
+}
+
+
+////////////////////////////////////////
+
+
+// Appends one bool mask per connected group of active voxels, then sorts masks by voxel count.
+template<typename GridOrTreeType>
+inline void
+extractActiveVoxelSegmentMasks(const GridOrTreeType& volume,
+    std::vector<typename GridOrTreeType::template ValueConverter<bool>::Type::Ptr>& masks)
+{
+    typedef typename TreeAdapter<GridOrTreeType>::TreeType              TreeType;
+    typedef typename TreeType::template ValueConverter<bool>::Type      BoolTreeType;
+    typedef typename BoolTreeType::Ptr                                  BoolTreePtrType;
+    typedef typename BoolTreeType::LeafNodeType                         BoolLeafNodeType;
+
+    typedef level_set_util_internal::NodeMaskSegment<BoolLeafNodeType>  NodeMaskSegmentType;
+    typedef typename NodeMaskSegmentType::Ptr                           NodeMaskSegmentPtrType;
+    typedef typename std::vector<NodeMaskSegmentPtrType>                NodeMaskSegmentPtrVector;
+    typedef typename std::vector<NodeMaskSegmentType*>                  NodeMaskSegmentRawPtrVector;
+
+    /////
+
+    const TreeType& tree = TreeAdapter<GridOrTreeType>::tree(volume);
+    // Bool-valued copy of the input topology; any active tiles are voxelized before segmentation.
+    BoolTreeType topologyMask(tree, false, TopologyCopy());
+
+    if (topologyMask.hasActiveTiles()) {
+        topologyMask.voxelizeActiveTiles();
+    }
+
+    std::vector<BoolLeafNodeType*> leafnodes;
+    topologyMask.getNodes(leafnodes);
+
+    if (leafnodes.empty()) return;
+
+    // 1. Split node masks into disjoint segments
+    // Note: The LeafNode origin coord is modified to record the 'leafnodes' array offset.
+
+    boost::scoped_array<NodeMaskSegmentPtrVector> nodeSegmentArray(new NodeMaskSegmentPtrVector[leafnodes.size()]);
+
+    tbb::parallel_for(tbb::blocked_range<size_t>(0, leafnodes.size()),
+        level_set_util_internal::SegmentNodeMask<BoolLeafNodeType>(leafnodes, nodeSegmentArray.get()));
+
+    // 2. Compute segment connectivity
+
+    tbb::parallel_for(tbb::blocked_range<size_t>(0, leafnodes.size()),
+        level_set_util_internal::ConnectNodeMaskSegments<BoolTreeType, BoolLeafNodeType>(
+            topologyMask, nodeSegmentArray.get()));
+
+    topologyMask.clear();
+
+    // 3. Group connected segments
+
+    std::deque<NodeMaskSegmentRawPtrVector> nodeSegmentGroups;
+
+    // Seed the traversal with the first existing segment. nodeSegmentArray[0] is not
+    // guaranteed to be non-empty, so it must not be indexed unconditionally.
+    NodeMaskSegmentType* nextSegment = NULL;
+    for (size_t n = 0, N = leafnodes.size(); n < N && !nextSegment; ++n) {
+        if (!nodeSegmentArray[n].empty()) nextSegment = nodeSegmentArray[n][0].get();
+    }
+    while (nextSegment) {
+
+        nodeSegmentGroups.push_back(NodeMaskSegmentRawPtrVector());
+
+        // The group grows on demand; reserving the total segment count per group over-allocates.
+        std::vector<NodeMaskSegmentType*>& segmentGroup = nodeSegmentGroups.back();
+
+        std::deque<NodeMaskSegmentType*> segmentQueue;
+        segmentQueue.push_back(nextSegment);
+        nextSegment = NULL;
+
+        while (!segmentQueue.empty()) {
+
+            NodeMaskSegmentType* segment = segmentQueue.back();
+            segmentQueue.pop_back();
+
+            if (segment->visited) continue;
+            segment->visited = true;
+
+            segmentGroup.push_back(segment);
+
+            // queue connected segments
+            std::vector<NodeMaskSegmentType*>& connections = segment->connections;
+            for (size_t n = 0, N = connections.size(); n < N; ++n) {
+                if (!connections[n]->visited) segmentQueue.push_back(connections[n]);
+            }
+        }
+
+        // pick an unvisited segment to seed the next group (no early exit: the last one found is used)
+        for (size_t n = 0, N = leafnodes.size(); n < N; ++n) {
+            NodeMaskSegmentPtrVector& nodeSegments = nodeSegmentArray[n];
+            for (size_t i = 0, I = nodeSegments.size(); i < I; ++i) {
+                if (!nodeSegments[i]->visited) nextSegment = nodeSegments[i].get();
+            }
+        }
+    }
+
+    // 4. Mask segment groups
+
+    if (nodeSegmentGroups.size() == 1) {
+
+        BoolTreePtrType mask(new BoolTreeType(tree, false, TopologyCopy()));
+
+        if (mask->hasActiveTiles()) {
+            mask->voxelizeActiveTiles();
+        }
+
+        masks.push_back(
+            level_set_util_internal::GridOrTreeConstructor<GridOrTreeType>::constructMask(volume, mask));
+
+    } else if (nodeSegmentGroups.size() > 1) {
+
+        for (size_t n = 0, N = nodeSegmentGroups.size(); n < N; ++n) {
+
+            NodeMaskSegmentRawPtrVector& segmentGroup = nodeSegmentGroups[n];
+
+            level_set_util_internal::MaskSegmentGroup<BoolTreeType> op(segmentGroup);
+            tbb::parallel_reduce(tbb::blocked_range<size_t>(0, segmentGroup.size()), op);
+
+            masks.push_back(
+                level_set_util_internal::GridOrTreeConstructor<GridOrTreeType>::constructMask(volume, op.mask()));
+        }
+    }
+
+    // 5. Sort segments in descending order based on the active voxel count.
+
+    if (masks.size() > 1) {
+        const size_t segmentCount = masks.size();
+
+        boost::scoped_array<size_t> segmentOrderArray(new size_t[segmentCount]);
+        boost::scoped_array<size_t> voxelCountArray(new size_t[segmentCount]);
+
+        for (size_t n = 0; n < segmentCount; ++n) {
+            segmentOrderArray[n] = n;
+        }
+
+        tbb::parallel_for(tbb::blocked_range<size_t>(0, segmentCount),
+            level_set_util_internal::ComputeActiveVoxelCount<BoolTreePtrType>(masks, voxelCountArray.get()));
+
+        size_t *begin = segmentOrderArray.get();
+        tbb::parallel_sort(begin, begin + masks.size(), level_set_util_internal::GreaterCount(voxelCountArray.get()));
+
+        std::vector<BoolTreePtrType> orderedMasks;
+        orderedMasks.reserve(masks.size());
+
+        for (size_t n = 0; n < segmentCount; ++n) {
+            orderedMasks.push_back(masks[segmentOrderArray[n]]);
+        }
+
+        masks.swap(orderedMasks);
+    }
+
+} // extractActiveVoxelSegmentMasks()
+
+
+template<typename GridOrTreeType>
+inline void
+segmentActiveVoxels(const GridOrTreeType& volume, std::vector<typename GridOrTreeType::Ptr>& segments)
+{
+    typedef typename TreeAdapter<GridOrTreeType>::TreeType                  TreeType;
+    typedef typename TreeType::Ptr                                          TreePtrType;
+    typedef typename TreeType::template ValueConverter<bool>::Type          BoolTreeType;
+    typedef typename BoolTreeType::Ptr                                      BoolTreePtrType;
+    typedef level_set_util_internal::GridOrTreeConstructor<GridOrTreeType>  OutputBuilder;
+
+    const TreeType& sourceTree = TreeAdapter<GridOrTreeType>::tree(volume);
+
+    // Step 1: split the active topology of the source tree into per-segment bool masks.
+    std::vector<BoolTreePtrType> segmentMasks;
+    extractActiveVoxelSegmentMasks(sourceTree, segmentMasks);
+
+    const size_t segmentCount = segmentMasks.size();
+
+    // Fewer than two masks: nothing to separate, so emit one copy of the source tree.
+    if (segmentCount < 2) {
+        TreePtrType wholeCopy(new TreeType(sourceTree));
+        segments.push_back(OutputBuilder::construct(volume, wholeCopy));
+        return;
+    }
+
+    const tbb::blocked_range<size_t> allSegments(0, segmentCount);
+
+    // Step 2: export one tree per mask (MaskedCopy runs in parallel over the segment range).
+    std::vector<TreePtrType> segmentTrees(segmentCount);
+
+    tbb::parallel_for(allSegments,
+        level_set_util_internal::MaskedCopy<TreeType>(sourceTree, segmentTrees, segmentMasks));
+
+    // Wrap every exported tree to match the input container type and append it to the output.
+    for (size_t idx = 0; idx != segmentCount; ++idx) {
+        segments.push_back(OutputBuilder::construct(volume, segmentTrees[idx]));
+    }
+}
+
+
+template<typename GridOrTreeType>
+inline void
+segmentSDF(const GridOrTreeType& volume, std::vector<typename GridOrTreeType::Ptr>& segments)
+{
+    typedef typename TreeAdapter<GridOrTreeType>::TreeType                  TreeType;
+    typedef typename TreeType::Ptr                                          TreePtrType;
+    typedef typename TreeType::template ValueConverter<bool>::Type          BoolTreeType;
+    typedef typename BoolTreeType::Ptr                                      BoolTreePtrType;
+    typedef level_set_util_internal::GridOrTreeConstructor<GridOrTreeType>  OutputBuilder;
+
+    const TreeType& sdfTree = TreeAdapter<GridOrTreeType>::tree(volume);
+
+    // Step 1: build a bool mask of the voxels that cross the zero isovalue.
+    BoolTreePtrType crossingMask = extractIsosurfaceMask(sdfTree, lsutilGridZero<GridOrTreeType>());
+
+    // Step 2: split that mask into one mask per group of connected active voxels.
+    std::vector<BoolTreePtrType> segmentMasks;
+    extractActiveVoxelSegmentMasks(*crossingMask, segmentMasks);
+
+    const size_t segmentCount = segmentMasks.size();
+
+    // Fewer than two masks: nothing to separate, so emit one copy of the input tree.
+    if (segmentCount < 2) {
+        TreePtrType wholeCopy(new TreeType(sdfTree));
+        segments.push_back(
+            OutputBuilder::construct(volume, wholeCopy));
+        return;
+    }
+
+    const tbb::blocked_range<size_t> allSegments(0, segmentCount);
+
+    // Step 3: expand each zero-crossing mask so that it captures the SDF narrow band.
+    tbb::parallel_for(allSegments,
+        level_set_util_internal::ExpandNarrowbandMask<TreeType>(sdfTree, segmentMasks));
+
+    // Step 4: export one SDF tree per mask ...
+    std::vector<TreePtrType> segmentTrees(segmentCount);
+
+    tbb::parallel_for(allSegments,
+        level_set_util_internal::MaskedCopy<TreeType>(sdfTree, segmentTrees, segmentMasks));
+
+    // ... and run the sign flood fill over every exported tree.
+    tbb::parallel_for(allSegments,
+        level_set_util_internal::FloodFillSign<TreeType>(sdfTree, segmentTrees));
+
+    // Wrap every exported tree to match the input container type and append it to the output.
+    for (size_t idx = 0; idx != segmentCount; ++idx) {
+        segments.push_back(OutputBuilder::construct(volume, segmentTrees[idx]));
+    }
+}
+
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_LEVEL_SET_UTIL_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+
diff --git a/nuparu/include/openvdb_new/tools/MaskToLevelSet.h b/nuparu/include/openvdb_new/tools/MaskToLevelSet.h
new file mode 100644
index 00000000..391883fe
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/MaskToLevelSet.h
@@ -0,0 +1,199 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file    MaskToLevelSet.h
+///
+/// @brief   This tool generates a narrow band level set from the
+///          interface between the active and inactive voxels of an
+///          input grid.
+///
+/// @par Example:
+/// Combine with @c tools::createPointMaskGrid for fast point cloud to level set conversion.
+///
+/// @author FX R&D OpenVDB team
+
+#ifndef OPENVDB_TOOLS_MASK_TO_LEVELSET_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_MASK_TO_LEVELSET_HAS_BEEN_INCLUDED
+
+#include <tbb/task_group.h>
+#include <openvdb/Grid.h>
+#include <openvdb/util/CpuTimer.h>
+#include <openvdb/Types.h>
+#include <openvdb/util/NullInterrupter.h>
+#include <openvdb/math/Math.h> // for isNegative
+#include <openvdb/tree/LeafManager.h>
+#include "LevelSetFilter.h"
+#include "Morphology.h" // for erodeVoxels and dilateActiveValues
+#include "SignedFloodFill.h" // for signedFloodFill
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+/// @brief Compute the narrow band level set to the interface
+/// between active and inactive voxels in the input grid.
+///
+/// @return a shared pointer to a new narrow band level set (SDF) of type @c float
+/// @throw  ValueError if @a halfWidth is not positive or the grid has non-uniform voxels
+/// @param grid       an incoming grid of arbitrary data type whose active voxels
+///                   are used in constructing the narrow band level set
+/// @param halfWidth  half of the narrow band level set width in voxel units
+/// @param dilation   amount of dilation in voxel units (applied first, to a mask of the topology)
+/// @param erosion    amount of erosion in voxel units (applied to the mask after the dilation)
+/// @param interrupt  optional interrupter (see NullInterrupter.h); defaults to NULL
+///
+template<typename GridT,
+         math::BiasedGradientScheme Scheme,
+         typename InterrupterT>
+inline typename Grid<typename GridT::TreeType::template ValueConverter<float>::Type>::Ptr
+maskToLevelSet(const GridT& grid,
+               int halfWidth = 3,
+               int dilation = 1,
+               int erosion = 1,
+               InterrupterT* interrupt = NULL);       
+    
+/// @brief Convenience overload: compute the narrow band level set to the interface
+/// between active and inactive voxels in the input grid.
+///
+/// @return a shared pointer to a new narrow band level set (SDF) of type @c float
+///
+/// @param grid       an incoming grid of arbitrary data type whose active voxels
+///                   are used in constructing the narrow band level set
+/// @param halfWidth  half of the narrow band level set width in voxel units
+/// @param dilation   amount of dilation in voxel units
+/// @param erosion    amount of erosion in voxel units
+///
+/// @note This overload fixes the spatial scheme to first order upwinding
+/// (math::FIRST_BIAS) and uses util::NullInterrupter. It exists because C++03
+/// compilers do not allow function templates to have default template arguments.
+///
+template<typename GridT>
+inline typename Grid<typename GridT::TreeType::template ValueConverter<float>::Type>::Ptr
+maskToLevelSet(const GridT& grid,
+               int halfWidth = 3,
+               int dilation = 1,
+               int erosion = 1)
+{
+    // Forward to the general version; its interrupter pointer argument keeps its default value.
+    return maskToLevelSet<GridT, math::FIRST_BIAS, util::NullInterrupter>(grid, halfWidth, dilation, erosion);
+}
+    
+// Implementation details of maskToLevelSet. A named namespace replaces the unnamed one so that
+// all translation units including this header refer to the same helper types.
+namespace mask_to_level_set_internal {
+/// Task functor that runs dilateActiveValues on the referenced tree with the stored size.
+template<typename TreeT>
+struct DilateOp
+{
+    DilateOp(TreeT& t, int n) : tree(&t), size(n) {}
+    void operator()() const {
+        dilateActiveValues( *tree, size, tools::NN_FACE, tools::IGNORE_TILES);
+    }
+    TreeT* tree;
+    const int size;
+};
+
+/// Task functor that runs erodeVoxels on the referenced tree with the stored size.
+template<typename TreeT>
+struct ErodeOp
+{
+    ErodeOp(TreeT& t, int n) : tree(&t), size(n) {}
+    void operator()() const { erodeVoxels( *tree, size); }
+    TreeT* tree;
+    const int size;
+};
+
+} // namespace mask_to_level_set_internal
+
+template<typename GridT, math::BiasedGradientScheme Scheme, typename InterrupterT>
+inline typename Grid<typename GridT::TreeType::template ValueConverter<float>::Type>::Ptr
+maskToLevelSet(const GridT& grid, int halfWidth, int dilation, int erosion, InterrupterT* interrupt)
+{
+    typedef typename GridT::TreeType::template ValueConverter<ValueMask>::Type MaskTreeT;
+    typedef typename GridT::TreeType::template ValueConverter<float>::Type     FloatTreeT;
+    typedef Grid<FloatTreeT>                                                   FloatGridT;
+
+    // Check input: zero and negative half-widths are both rejected.
+    if ( halfWidth <= 0 ) {
+        OPENVDB_THROW(ValueError, "Narrow band width must be non-zero!");
+    }
+    if ( !grid.hasUniformVoxels() ) {
+        OPENVDB_THROW(ValueError, "Non-uniform voxels are not supported!");
+    }
+
+    // background value = outside value
+    const float outside = static_cast<float>(grid.voxelSize()[0]) * halfWidth;
+
+    // Copy the topology into a MaskGrid.
+    MaskTreeT maskTree( grid.tree(), false/*background*/, openvdb::TopologyCopy() );
+
+    // Morphological closing operation.
+    dilateActiveValues( maskTree, dilation, tools::NN_FACE, tools::IGNORE_TILES);
+    erodeVoxels(  maskTree, erosion);
+
+    // Generate a volume with an implicit zero crossing at the boundary
+    // between active and inactive values in the input grid.
+    typename FloatTreeT::Ptr lsTree( new FloatTreeT(maskTree,
+                                                    outside,// = inactive
+                                                    -outside,//= active
+                                                    openvdb::TopologyCopy()) );
+
+    tbb::task_group pool;
+    pool.run( mask_to_level_set_internal::ErodeOp< MaskTreeT >( maskTree, halfWidth ) );
+    pool.run( mask_to_level_set_internal::DilateOp<FloatTreeT>( *lsTree , halfWidth ) );
+    pool.wait();// wait for both tasks to complete
+
+    lsTree->topologyDifference( maskTree );
+    tools::pruneLevelSet( *lsTree,  /*threading=*/true);
+
+    // Create a level set grid from the tree
+    typename FloatGridT::Ptr lsGrid = FloatGridT::create( lsTree );
+    lsGrid->setTransform( grid.transform().copy() );
+    lsGrid->setGridClass( openvdb::GRID_LEVEL_SET );
+
+    // Normalize and prune level set
+    LevelSetTracker<FloatGridT, InterrupterT> tracker( *lsGrid, interrupt );
+    tracker.setSpatialScheme( Scheme );
+    tracker.setNormCount( 3 * halfWidth );
+    tracker.normalize();
+    tracker.prune();
+    return lsGrid;
+}
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif //OPENVDB_TOOLS_MASK_TO_LEVELSET_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/MeshToVolume.h b/nuparu/include/openvdb_new/tools/MeshToVolume.h
new file mode 100644
index 00000000..8872fad2
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/MeshToVolume.h
@@ -0,0 +1,4032 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file   MeshToVolume.h
+///
+/// @brief  Convert polygonal meshes that consist of quads and/or triangles
+///         into signed or unsigned distance field volumes.
+///
+/// @note   The signed distance field conversion requires a closed surface
+///         but not necessarily a manifold surface. Supports surfaces with
+///         self intersections and degenerate faces and is independent of
+///         mesh surface normals / polygon orientation.
+///
+/// @author Mihai Alden
+
+
+#ifndef OPENVDB_TOOLS_MESH_TO_VOLUME_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_MESH_TO_VOLUME_HAS_BEEN_INCLUDED
+
+#include <openvdb/Types.h>
+#include <openvdb/math/FiniteDifference.h> // for GodunovsNormSqrd
+#include <openvdb/math/Proximity.h> // for closestPointOnTriangleToPoint
+#include <openvdb/util/NullInterrupter.h>
+#include <openvdb/util/Util.h>
+
+#include "ChangeBackground.h"
+#include "Prune.h" // for pruneInactive and pruneLevelSet
+#include "SignedFloodFill.h" // for signedFloodFillWithValues
+
+#include <tbb/blocked_range.h>
+#include <tbb/enumerable_thread_specific.h>
+#include <tbb/parallel_for.h>
+#include <tbb/parallel_reduce.h>
+#include <tbb/partitioner.h>
+#include <tbb/task_group.h>
+#include <tbb/task_scheduler_init.h>
+
+#include <boost/integer_traits.hpp> // const_max
+#include <boost/math/special_functions/fpclassify.hpp> // for isfinite
+#include <boost/scoped_array.hpp>
+
+#include <algorithm> // for std::sort
+#include <deque>
+#include <limits>
+#include <sstream>
+#include <vector>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+
+////////////////////////////////////////
+
+
+/// @brief Mesh to volume conversion flags (each enumerator is a distinct bit value)
+enum MeshToVolumeFlags {
+
+    /// Switch from the default signed distance field conversion that classifies
+    /// regions as either inside or outside the mesh boundary to an unsigned distance
+    /// field conversion that only computes distance values. This conversion type
+    /// does not require a closed watertight mesh.
+    UNSIGNED_DISTANCE_FIELD = 0x1,
+
+    /// Disable the cleanup step that removes voxels created by self-intersecting
+    /// portions of the mesh.
+    DISABLE_INTERSECTING_VOXEL_REMOVAL = 0x2,
+
+    /// Disable the distance renormalization step that smooths out bumps caused
+    /// by self-intersecting or overlapping portions of the mesh.
+    DISABLE_RENORMALIZATION = 0x4,
+
+    /// Disable the cleanup step that removes active voxels that exceed the
+    /// narrow band limits. (Only relevant for small limits.)
+    DISABLE_NARROW_BAND_TRIMMING = 0x8
+};
+
+
+/// @brief  Convert polygonal meshes that consist of quads and/or triangles into
+///         signed or unsigned distance field volumes.
+///
+/// @note   Signed conversion requires a closed (not necessarily manifold) surface.
+///         Supports surfaces with self intersections and degenerate faces
+///         and is independent of mesh surface normals.
+///
+/// @interface MeshDataAdapter
+/// Expected interface for the MeshDataAdapter class
+/// @code
+/// struct MeshDataAdapter {
+///   size_t polygonCount() const;        // Total number of polygons
+///   size_t pointCount() const;          // Total number of points
+///   size_t vertexCount(size_t n) const; // Vertex count for polygon n
+///
+///   // Return position pos in local grid index space for polygon n and vertex v
+///   void getIndexSpacePoint(size_t n, size_t v, openvdb::Vec3d& pos) const;
+/// };
+/// @endcode
+///
+/// @param mesh               mesh data access class that conforms to the MeshDataAdapter
+///                           interface
+/// @param transform          world-to-index-space transform
+/// @param exteriorBandWidth  exterior narrow band width in voxel units
+/// @param interiorBandWidth  interior narrow band width in voxel units
+///                           (set to std::numeric_limits<float>::max() to fill object
+///                           interior with distance values)
+/// @param flags              optional conversion flags (values from @c MeshToVolumeFlags)
+/// @param polygonIndexGrid   optional grid output (NULL by default) that will contain the
+///                           closest-polygon index for each voxel in the narrow band region
+template <typename GridType, typename MeshDataAdapter>
+inline typename GridType::Ptr
+meshToVolume(
+  const MeshDataAdapter& mesh,
+  const math::Transform& transform,
+  float exteriorBandWidth = 3.0f,
+  float interiorBandWidth = 3.0f,
+  int flags = 0,
+  typename GridType::template ValueConverter<Int32>::Type * polygonIndexGrid = NULL);
+
+
+/// @brief  Convert polygonal meshes that consist of quads and/or triangles into
+///         signed or unsigned distance field volumes (interruptible variant).
+///
+/// @param interrupter        a callback to interrupt the conversion process that conforms
+///                           to the util::NullInterrupter interface
+/// @param mesh               mesh data access class that conforms to the MeshDataAdapter
+///                           interface
+/// @param transform          world-to-index-space transform
+/// @param exteriorBandWidth  exterior narrow band width in voxel units
+/// @param interiorBandWidth  interior narrow band width in voxel units (set this value to
+///                           std::numeric_limits<float>::max() to fill interior regions
+///                           with distance values)
+/// @param flags              optional conversion flags (values from @c MeshToVolumeFlags)
+/// @param polygonIndexGrid   optional grid output (NULL by default) that will contain the
+///                           closest-polygon index for each voxel in the active narrow band region
+template <typename GridType, typename MeshDataAdapter, typename Interrupter>
+inline typename GridType::Ptr
+meshToVolume(
+    Interrupter& interrupter,
+    const MeshDataAdapter& mesh,
+    const math::Transform& transform,
+    float exteriorBandWidth = 3.0f,
+    float interiorBandWidth = 3.0f,
+    int flags = 0,
+    typename GridType::template ValueConverter<Int32>::Type * polygonIndexGrid = NULL);
+
+
+////////////////////////////////////////
+
+
+/// @brief    Mesh data adapter for contiguous arrays of points and of quads and/or triangles
+///
+/// @details  PointType and PolygonType must provide element access
+///           through the square brackets operator.
+/// @details  Points are assumed to be in local grid index space.
+/// @details  The PolygonType tuple can have either three or four components;
+///           this property must be specified in a static member variable
+///           named @c size, similar to the math::Tuple class.
+/// @details  A four-component tuple represents a quad, or a triangle
+///           if its fourth component is set to @c util::INVALID_IDX.
+template<typename PointType, typename PolygonType>
+struct QuadAndTriangleDataAdapter {
+
+    /// @brief  Stores pointers into the given vectors (no copy is made; NULL if a vector is empty).
+    QuadAndTriangleDataAdapter(const std::vector<PointType>& points,
+        const std::vector<PolygonType>& polygons)
+        : mPointArray(points.empty() ? NULL : &points[0])
+        , mPointArraySize(points.size())
+        , mPolygonArray(polygons.empty() ? NULL : &polygons[0])
+        , mPolygonArraySize(polygons.size()) {}
+
+    /// @brief  Stores the given raw array pointers and element counts as they are.
+    QuadAndTriangleDataAdapter(const PointType * pointArray, size_t pointArraySize,
+        const PolygonType* polygonArray, size_t polygonArraySize)
+        : mPointArray(pointArray)
+        , mPointArraySize(pointArraySize)
+        , mPolygonArray(polygonArray)
+        , mPolygonArraySize(polygonArraySize) {}
+
+    // Element counts of the two wrapped arrays.
+    size_t polygonCount() const { return mPolygonArraySize; }
+    size_t pointCount() const { return mPointArraySize; }
+
+    /// @brief  Vertex count for polygon @a n
+    size_t vertexCount(size_t n) const {
+        if (PolygonType::size == 3) return 3; // three-component tuples are always triangles
+        return (mPolygonArray[n][3] == util::INVALID_IDX) ? 3 : 4;
+    }
+
+    /// @brief  Returns position @a pos in local grid index space
+    ///         for polygon @a n and vertex @a v
+    void getIndexSpacePoint(size_t n, size_t v, Vec3d& pos) const {
+        const PolygonType& polygon = mPolygonArray[n];
+        const PointType& vertex = mPointArray[polygon[int(v)]];
+        for (int axis = 0; axis < 3; ++axis) pos[axis] = double(vertex[axis]);
+    }
+
+private:
+    PointType     const * const mPointArray;       // first point, or NULL
+    size_t                const mPointArraySize;   // number of points
+    PolygonType   const * const mPolygonArray;     // first polygon, or NULL
+    size_t                const mPolygonArraySize; // number of polygons
+}; // struct QuadAndTriangleDataAdapter
+
+
+////////////////////////////////////////
+
+
+// Wrapper functions for the mesh to volume converter
+
+
+/// @brief Convert a triangle mesh to a level set volume.
+///
+/// @return a grid of type @c GridType containing a narrow-band level set
+///         representation of the input mesh.
+///
+/// @throw  TypeError if @c GridType is not scalar or not floating-point
+///
+/// @note   Requires a closed surface but not necessarily a manifold surface.
+///         Supports surfaces with self intersections and degenerate faces
+///         and is independent of mesh surface normals.
+///
+/// @param xform        transform for the output grid
+/// @param points       list of world space point positions
+/// @param triangles    triangle index list
+/// @param halfWidth    half the width of the narrow band, in voxel units (default: LEVEL_SET_HALF_WIDTH)
+template<typename GridType>
+inline typename GridType::Ptr
+meshToLevelSet(
+    const openvdb::math::Transform& xform,
+    const std::vector<Vec3s>& points,
+    const std::vector<Vec3I>& triangles,
+    float halfWidth = float(LEVEL_SET_HALF_WIDTH));
+
+
+/// @brief Convert a quad mesh to a level set volume.
+///
+/// @return a grid of type @c GridType containing a narrow-band level set
+///         representation of the input mesh.
+///
+/// @throw  TypeError if @c GridType is not scalar or not floating-point
+///
+/// @note   Requires a closed surface but not necessarily a manifold surface.
+///         Supports surfaces with self intersections and degenerate faces
+///         and is independent of mesh surface normals.
+///
+/// @param xform        transform for the output grid
+/// @param points       list of world space point positions
+/// @param quads        quad index list
+/// @param halfWidth    half the width of the narrow band, in voxel units (default: LEVEL_SET_HALF_WIDTH)
+template<typename GridType>
+inline typename GridType::Ptr
+meshToLevelSet(
+    const openvdb::math::Transform& xform,
+    const std::vector<Vec3s>& points,
+    const std::vector<Vec4I>& quads,
+    float halfWidth = float(LEVEL_SET_HALF_WIDTH));
+
+
+/// @brief Convert a triangle and quad mesh to a level set volume.
+///
+/// @return a grid of type @c GridType containing a narrow-band level set
+///         representation of the input mesh.
+///
+/// @throw  TypeError if @c GridType is not scalar or not floating-point
+///
+/// @note   Requires a closed surface but not necessarily a manifold surface.
+///         Supports surfaces with self intersections and degenerate faces
+///         and is independent of mesh surface normals.
+///
+/// @param xform        transform for the output grid
+/// @param points       list of world space point positions
+/// @param triangles    triangle index list
+/// @param quads        quad index list
+/// @param halfWidth    half the width of the narrow band, in voxel units
+template<typename GridType>
+inline typename GridType::Ptr
+meshToLevelSet(
+    const openvdb::math::Transform& xform,
+    const std::vector<Vec3s>& points,
+    const std::vector<Vec3I>& triangles,
+    const std::vector<Vec4I>& quads,
+    float halfWidth = float(LEVEL_SET_HALF_WIDTH));
+
+
+/// @brief Convert a triangle and quad mesh to a signed distance field
+///        with an asymmetrical narrow band.
+///
+/// @return a grid of type @c GridType containing a narrow-band signed
+///         distance field representation of the input mesh.
+///
+/// @throw  TypeError if @c GridType is not scalar or not floating-point
+///
+/// @note   Requires a closed surface but not necessarily a manifold surface.
+///         Supports surfaces with self intersections and degenerate faces
+///         and is independent of mesh surface normals.
+///
+/// @param xform        transform for the output grid
+/// @param points       list of world space point positions
+/// @param triangles    triangle index list
+/// @param quads        quad index list
+/// @param exBandWidth  the exterior narrow-band width in voxel units
+/// @param inBandWidth  the interior narrow-band width in voxel units
+template<typename GridType>
+inline typename GridType::Ptr
+meshToSignedDistanceField(
+    const openvdb::math::Transform& xform,
+    const std::vector<Vec3s>& points,
+    const std::vector<Vec3I>& triangles,
+    const std::vector<Vec4I>& quads,
+    float exBandWidth,
+    float inBandWidth);
+
+
+/// @brief Convert a triangle and quad mesh to an unsigned distance field.
+///
+/// @return a grid of type @c GridType containing a narrow-band unsigned
+///         distance field representation of the input mesh.
+///
+/// @throw  TypeError if @c GridType is not scalar or not floating-point
+///
+/// @note   Does not require a closed surface.
+///
+/// @param xform        transform for the output grid
+/// @param points       list of world space point positions
+/// @param triangles    triangle index list
+/// @param quads        quad index list
+/// @param bandWidth    the width of the narrow band, in voxel units
+template<typename GridType>
+inline typename GridType::Ptr
+meshToUnsignedDistanceField(
+    const openvdb::math::Transform& xform,
+    const std::vector<Vec3s>& points,
+    const std::vector<Vec3I>& triangles,
+    const std::vector<Vec4I>& quads,
+    float bandWidth);
+
+
+////////////////////////////////////////
+
+
+/// @brief  Return a grid of type @c GridType containing a narrow-band level set
+///         representation of a box.
+///
+/// @param bbox       a bounding box in world units
+/// @param xform      world-to-index-space transform
+/// @param halfWidth  half the width of the narrow band, in voxel units
+template<typename GridType, typename VecType>
+inline typename GridType::Ptr
+createLevelSetBox(const math::BBox<VecType>& bbox,
+    const openvdb::math::Transform& xform,
+    typename VecType::ValueType halfWidth = LEVEL_SET_HALF_WIDTH);
+
+
+////////////////////////////////////////
+
+
+/// @brief  Traces the exterior voxel boundary of closed objects in the input
+///         volume @a tree. Exterior voxels are marked with a negative sign,
+///         voxels with a value below @c 0.75 are left unchanged and act as
+///         the boundary layer.
+///
+/// @note   Does not propagate sign information into tile regions.
+template <typename FloatTreeT>
+inline void
+traceExteriorBoundaries(FloatTreeT& tree);
+
+
+////////////////////////////////////////
+
+
+/// @brief  Extracts and stores voxel edge intersection data from a mesh.
+///
+/// @details The data is kept in an internal tree of @c EdgeData values
+///          (@c TreeType) that callers read through the accessor returned
+///          by @c getAccessor().
+class MeshToVoxelEdgeData
+{
+public:
+
+    //////////
+
+    ///@brief Internal edge data type.
+    ///
+    /// Holds one distance and one primitive index for each of the
+    /// X, Y and Z directions.
+    struct EdgeData {
+        /// @param dist  initial value for all three distance members; the
+        ///              three primitive indices start as @c util::INVALID_IDX.
+        EdgeData(float dist = 1.0)
+            : mXDist(dist), mYDist(dist), mZDist(dist)
+            , mXPrim(util::INVALID_IDX)
+            , mYPrim(util::INVALID_IDX)
+            , mZPrim(util::INVALID_IDX)
+        {
+        }
+
+        //@{
+        /// Required by several of the tree nodes
+        /// @note These methods don't perform meaningful operations.
+        ///       The comparisons always return @c false and the arithmetic
+        ///       operators return an unmodified copy of this object.
+        bool operator< (const EdgeData&) const { return false; }
+        bool operator> (const EdgeData&) const { return false; }
+        template<class T> EdgeData operator+(const T&) const { return *this; }
+        template<class T> EdgeData operator-(const T&) const { return *this; }
+        EdgeData operator-() const { return *this; }
+        //@}
+
+        /// @brief  Equality is decided by the three primitive indices only;
+        ///         the distance members are not compared.
+        bool operator==(const EdgeData& rhs) const
+        {
+            return mXPrim == rhs.mXPrim && mYPrim == rhs.mYPrim && mZPrim == rhs.mZPrim;
+        }
+
+        // Per-direction distance values (all set to the constructor argument).
+        float mXDist, mYDist, mZDist;
+        // Per-direction primitive indices (util::INVALID_IDX until assigned).
+        Index32 mXPrim, mYPrim, mZPrim;
+    };
+
+    // Tree that stores one EdgeData per voxel, and the accessor used to read it.
+    typedef tree::Tree4<EdgeData, 5, 4, 3>::Type    TreeType;
+    typedef tree::ValueAccessor<TreeType>           Accessor;
+
+
+    //////////
+
+
+    MeshToVoxelEdgeData();
+
+
+    /// @brief  Threaded method to extract voxel edge data, the closest
+    ///         intersection point and corresponding primitive index,
+    ///         from the given mesh.
+    ///
+    /// @param pointList    List of points in grid index space, preferably unique
+    ///                     and shared by different polygons.
+    /// @param polygonList  List of triangles and/or quads.
+    void convert(const std::vector<Vec3s>& pointList, const std::vector<Vec4I>& polygonList);
+
+
+    /// @brief  Returns intersection points with corresponding primitive
+    ///         indices for the given @c ijk voxel.
+    ///
+    /// @param acc         accessor used for the lookup (presumably one obtained
+    ///                    from @c getAccessor() -- confirm against the definition)
+    /// @param ijk         voxel coordinate to query
+    /// @param points      output list of intersection points
+    /// @param primitives  output list of primitive indices
+    void getEdgeData(Accessor& acc, const Coord& ijk,
+        std::vector<Vec3d>& points, std::vector<Index32>& primitives);
+
+    /// @return An accessor of @c MeshToVoxelEdgeData::Accessor type that
+    ///         provides random read access to the internal tree.
+    Accessor getAccessor() { return Accessor(mTree); }
+
+private:
+    // Private so that code outside this class cannot assign instances;
+    // the body intentionally does nothing.
+    void operator=(const MeshToVoxelEdgeData&) {}
+    // Storage for the extracted edge data.
+    TreeType mTree;
+    // Nested helper type, declared here and defined elsewhere.
+    class GenEdgeData;
+};
+
+
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+
+
+// Internal utility objects and implementation details
+
+namespace mesh_to_volume_internal {
+
+/// @brief  TBB body that maps a range of world space points into the index
+///         space of a transform.
+///
+/// @details Each input point is widened to double precision, passed through
+///          @c math::Transform::worldToIndex() and written to the output
+///          array at the same position, converted back to the point's own
+///          component type.
+template<typename PointType>
+struct TransformPoints {
+
+    /// @param pointsIn   world space input points
+    /// @param pointsOut  destination array, indexed like @a pointsIn
+    /// @param xform      transform that supplies the world-to-index mapping
+    TransformPoints(const PointType* pointsIn, PointType* pointsOut,
+        const math::Transform& xform)
+        : mPointsIn(pointsIn), mPointsOut(pointsOut), mXform(&xform)
+    {
+    }
+
+    /// @brief Transform the points whose array positions lie in @a range.
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+
+        typedef typename PointType::value_type ComponentType;
+
+        const size_t last = range.end();
+
+        for (size_t idx = range.begin(); idx < last; ++idx) {
+
+            const PointType& worldPoint = mPointsIn[idx];
+
+            // Do the transformation in double precision.
+            const Vec3d indexPoint = mXform->worldToIndex(
+                Vec3d(double(worldPoint[0]), double(worldPoint[1]), double(worldPoint[2])));
+
+            PointType& result = mPointsOut[idx];
+            result[0] = ComponentType(indexPoint[0]);
+            result[1] = ComponentType(indexPoint[1]);
+            result[2] = ComponentType(indexPoint[2]);
+        }
+    }
+
+    PointType        const * const mPointsIn;
+    PointType              * const mPointsOut;
+    math::Transform  const * const mXform;
+}; // TransformPoints
+
+
+/// @brief  Tolerance constants expressed in the value type @c ValueType.
+template<typename ValueType>
+struct Tolerance
+{
+    /// @return the constant @c 1e-7 converted to @c ValueType
+    static ValueType epsilon()
+    {
+        return ValueType(1e-7);
+    }
+
+    /// @return the constant @c 1.0 + 1e-6 converted to @c ValueType
+    static ValueType minNarrowBandWidth()
+    {
+        return ValueType(1.0 + 1e-6);
+    }
+};
+
+
+////////////////////////////////////////
+
+
+/// @brief  TBB body that merges detached distance / primitive-index leaf node
+///         pairs into the leaf nodes with the same origin in a destination
+///         tree pair.
+///
+/// @details For every voxel of a source node that carries a valid primitive
+///          index, the destination keeps the smaller distance together with
+///          its index. When both distances are exactly equal the smaller of
+///          the two indices is stored.
+///
+/// @note   Each processed source node is deleted. The destination leaf nodes
+///         are dereferenced without a null check, so both destination trees
+///         must already contain a leaf at every source node origin.
+template<typename TreeType>
+class CombineLeafNodes
+{
+public:
+
+    typedef typename TreeType::template ValueConverter<Int32>::Type     Int32TreeType;
+
+    typedef typename TreeType::LeafNodeType         LeafNodeType;
+    typedef typename Int32TreeType::LeafNodeType    Int32LeafNodeType;
+
+    /// @param lhsDistTree   destination distance tree
+    /// @param lhsIdxTree    destination primitive index tree
+    /// @param rhsDistNodes  source distance leaf nodes (deleted once merged)
+    /// @param rhsIdxNodes   source index leaf nodes, parallel to @a rhsDistNodes
+    ///                      (deleted once merged)
+    CombineLeafNodes(TreeType& lhsDistTree, Int32TreeType& lhsIdxTree,
+        LeafNodeType ** rhsDistNodes, Int32LeafNodeType ** rhsIdxNodes)
+        : mDistTree(&lhsDistTree)
+        , mIdxTree(&lhsIdxTree)
+        , mRhsDistNodes(rhsDistNodes)
+        , mRhsIdxNodes(rhsIdxNodes)
+    {
+    }
+
+    /// @brief Merge the source node pairs whose array positions lie in @a range.
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+
+        typedef typename LeafNodeType::ValueType        DistType;
+        typedef typename Int32LeafNodeType::ValueType   PrimType;
+
+        tree::ValueAccessor<TreeType> distAccessor(*mDistTree);
+        tree::ValueAccessor<Int32TreeType> idxAccessor(*mIdxTree);
+
+        const size_t last = range.end();
+
+        for (size_t i = range.begin(); i < last; ++i) {
+
+            LeafNodeType* const srcDistNode = mRhsDistNodes[i];
+            Int32LeafNodeType* const srcIdxNode = mRhsIdxNodes[i];
+
+            // Locate the destination leaf pair that shares the source origin.
+            const Coord& origin = srcDistNode->origin();
+            LeafNodeType* dstDistNode = distAccessor.probeLeaf(origin);
+            Int32LeafNodeType* dstIdxNode = idxAccessor.probeLeaf(origin);
+
+            DistType* dstDist = dstDistNode->buffer().data();
+            PrimType* dstPrim = dstIdxNode->buffer().data();
+
+            const DistType* srcDist = srcDistNode->buffer().data();
+            const PrimType* srcPrim = srcIdxNode->buffer().data();
+
+            for (Index32 k = 0; k < LeafNodeType::SIZE; ++k) {
+
+                // Source voxels without a primitive contribute nothing.
+                const PrimType candidatePrim = srcPrim[k];
+                if (candidatePrim == Int32(util::INVALID_IDX)) continue;
+
+                const DistType candidate = srcDist[k];
+                const DistType current = dstDist[k];
+
+                if (candidate < current) {
+                    dstDistNode->setValueOn(k, candidate);
+                    dstIdxNode->setValueOn(k, candidatePrim);
+                } else if (math::isExactlyEqual(candidate, current)) {
+                    // Tie on distance: keep the smaller primitive index.
+                    dstIdxNode->setValueOn(k, std::min(dstPrim[k], candidatePrim));
+                }
+            }
+
+            // The source nodes have been consumed.
+            delete srcDistNode;
+            delete srcIdxNode;
+        }
+    }
+
+private:
+
+    TreeType * const mDistTree;
+    Int32TreeType * const mIdxTree;
+
+    LeafNodeType ** const mRhsDistNodes;
+    Int32LeafNodeType ** const mRhsIdxNodes;
+}; // class CombineLeafNodes
+
+
+////////////////////////////////////////
+
+
+/// @brief  TBB body that copies each leaf node's origin into a side array and
+///         then overwrites the first origin component with the node's
+///         position in the node array.
+///
+/// @note   The nodes are left with a modified origin; only the saved copy in
+///         @c coordinates still holds the real value.
+template<typename TreeType>
+struct StashOriginAndStoreOffset
+{
+    typedef typename TreeType::LeafNodeType LeafNodeType;
+
+    /// @param nodes        leaf nodes to modify
+    /// @param coordinates  output array with one slot per entry of @a nodes
+    StashOriginAndStoreOffset(std::vector<LeafNodeType*>& nodes, Coord* coordinates)
+        : mNodes(nodes.empty() ? NULL : &nodes[0]), mCoordinates(coordinates)
+    {
+    }
+
+    /// @brief Process the nodes whose array positions lie in @a range.
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+        size_t idx = range.begin();
+        const size_t last = range.end();
+        while (idx < last) {
+            // origin() only hands out a const reference, hence the cast.
+            Coord& nodeOrigin = const_cast<Coord&>(mNodes[idx]->origin());
+            mCoordinates[idx] = nodeOrigin;            // save the full origin
+            nodeOrigin[0] = static_cast<int>(idx);     // stash the array position
+            ++idx;
+        }
+    }
+
+    LeafNodeType ** const mNodes;
+    Coord * const mCoordinates;
+};
+
+
+/// @brief  TBB body that writes the first component of each saved coordinate
+///         back into the origin of the corresponding leaf node.
+///
+/// @note   Only component 0 is written; the other two origin components are
+///         left as they are.
+template<typename TreeType>
+struct RestoreOrigin
+{
+    typedef typename TreeType::LeafNodeType LeafNodeType;
+
+    /// @param nodes        leaf nodes to modify
+    /// @param coordinates  saved origins, one per entry of @a nodes
+    RestoreOrigin(std::vector<LeafNodeType*>& nodes, const Coord* coordinates)
+        : mNodes(nodes.empty() ? NULL : &nodes[0]), mCoordinates(coordinates)
+    {
+    }
+
+    /// @brief Process the nodes whose array positions lie in @a range.
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+        size_t idx = range.begin();
+        const size_t last = range.end();
+        while (idx < last) {
+            // origin() only hands out a const reference, hence the cast.
+            Coord& nodeOrigin = const_cast<Coord&>(mNodes[idx]->origin());
+            nodeOrigin[0] = mCoordinates[idx][0];
+            ++idx;
+        }
+    }
+
+    LeafNodeType         ** const mNodes;
+    Coord           const * const mCoordinates;
+};
+
+
+/// @brief  TBB body that records, for every leaf node, the array position of
+///         the nearest leaf node in each of the six axis directions.
+///
+/// @details The neighbour's array position is read from the first component
+///          of its origin (see @c findNeighbourNode()), so the leaf nodes of
+///          the tree must carry their array position there while this body
+///          runs. The real origins are taken from @a coordinates instead.
+///
+///          The output array is treated as six consecutive segments of
+///          @a numNodes entries each, in the order +X, -X, +Y, -Y, +Z, -Z.
+///          A direction without a neighbour inside the bounding box is stored
+///          as @c boost::integer_traits<size_t>::const_max.
+template<typename TreeType>
+class ComputeNodeConnectivity
+{
+public:
+    typedef typename TreeType::LeafNodeType LeafNodeType;
+
+    /// @param tree         tree whose leaf nodes are searched
+    /// @param coordinates  real origin of each leaf node, by array position
+    /// @param offsets      output array with room for @c 6 * numNodes entries
+    /// @param numNodes     number of leaf nodes (length of one output segment)
+    /// @param bbox         index space region that bounds the neighbour search
+    ComputeNodeConnectivity(const TreeType& tree, const Coord* coordinates,
+        size_t* offsets, size_t numNodes, const CoordBBox& bbox)
+        : mTree(&tree)
+        , mCoordinates(coordinates)
+        , mOffsets(offsets)
+        , mNumNodes(numNodes)
+        , mBBox(bbox)
+    {
+    }
+
+    /// @brief Fill in the six neighbour entries for the nodes in @a range.
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+
+        size_t* offsetsNextX = mOffsets;
+        size_t* offsetsPrevX = mOffsets + mNumNodes;
+        size_t* offsetsNextY = mOffsets + mNumNodes * 2;
+        size_t* offsetsPrevY = mOffsets + mNumNodes * 3;
+        size_t* offsetsNextZ = mOffsets + mNumNodes * 4;
+        size_t* offsetsPrevZ = mOffsets + mNumNodes * 5;
+
+        tree::ValueAccessor<const TreeType> acc(*mTree);
+
+        // The leaf dimension is an unsigned constant. Take a signed copy so
+        // that the negative steps are formed by signed negation rather than
+        // by unsigned wrap-around followed by a narrowing conversion.
+        const Int32 dim = static_cast<Int32>(LeafNodeType::DIM);
+
+        for (size_t n = range.begin(), N = range.end(); n < N; ++n) {
+            const Coord& origin = mCoordinates[n];
+            offsetsNextX[n] = findNeighbourNode(acc, origin, Coord(dim, 0, 0));
+            offsetsPrevX[n] = findNeighbourNode(acc, origin, Coord(-dim, 0, 0));
+            offsetsNextY[n] = findNeighbourNode(acc, origin, Coord(0, dim, 0));
+            offsetsPrevY[n] = findNeighbourNode(acc, origin, Coord(0, -dim, 0));
+            offsetsNextZ[n] = findNeighbourNode(acc, origin, Coord(0, 0, dim));
+            offsetsPrevZ[n] = findNeighbourNode(acc, origin, Coord(0, 0, -dim));
+        }
+    }
+
+    /// @brief  Walk from @a start in increments of @a step until a leaf node
+    ///         is found or the bounding box is left.
+    ///
+    /// @param acc    accessor used to probe the tree
+    /// @param start  coordinate to start from (the first probe is at
+    ///               @c start + step)
+    /// @param step   increment applied between probes
+    ///
+    /// @return the first origin component of the leaf node that was found,
+    ///         converted to @c size_t, or
+    ///         @c boost::integer_traits<size_t>::const_max if the walk left
+    ///         the bounding box without finding one.
+    size_t findNeighbourNode(tree::ValueAccessor<const TreeType>& acc, const Coord& start, const Coord& step) const {
+
+        Coord ijk = start + step;
+
+        while (mBBox.isInside(ijk)) {
+            const LeafNodeType* node = acc.probeConstLeaf(ijk);
+            if (node) return static_cast<size_t>(node->origin()[0]);
+            ijk += step;
+        }
+
+        return boost::integer_traits<size_t>::const_max;
+    }
+
+
+private:
+    // Disallow assignment
+    ComputeNodeConnectivity& operator=(const ComputeNodeConnectivity&);
+
+    TreeType    const * const mTree;
+    Coord       const * const mCoordinates;
+    size_t            * const mOffsets;
+
+    const size_t    mNumNodes;
+    const CoordBBox mBBox;
+}; // class ComputeNodeConnectivity
+
+
+/// @brief  Collects the leaf nodes of a tree and builds a table that gives,
+///         for each node and each of the six axis directions, the array
+///         position of the neighbouring leaf node.
+///
+/// @details The table is one array of @c 6 * size() entries, split into six
+///          segments of @c size() entries each in the order
+///          +X, -X, +Y, -Y, +Z, -Z. Entries are filled in by
+///          @c ComputeNodeConnectivity.
+template<typename TreeType>
+struct LeafNodeConnectivityTable {
+
+    // Marker for "no neighbour"; same value that
+    // ComputeNodeConnectivity::findNeighbourNode() returns on failure.
+    enum { INVALID_OFFSET = boost::integer_traits<size_t>::const_max };
+
+    typedef typename TreeType::LeafNodeType LeafNodeType;
+
+    /// @brief  Gather the leaf nodes of @a tree and compute the table.
+    ///
+    /// @note   While the table is being built the first origin component of
+    ///         every leaf node is temporarily replaced by the node's array
+    ///         position, and is restored before the constructor returns.
+    ///         If the tree has no leaf nodes the table is left unallocated.
+    LeafNodeConnectivityTable(TreeType& tree)
+        : mLeafNodes()
+        , mOffsets(NULL)
+    {
+        mLeafNodes.reserve(tree.leafCount());
+        tree.getNodes(mLeafNodes);
+
+        // Nothing to connect; mOffsets stays NULL.
+        if (mLeafNodes.empty()) return;
+
+        // Region used to bound the neighbour search.
+        CoordBBox bbox;
+        tree.evalLeafBoundingBox(bbox);
+
+        const tbb::blocked_range<size_t> range(0, mLeafNodes.size());
+
+        // stash the leafnode origin coordinate and temporarily store the
+        // linear offset in the origin.x variable.
+        boost::scoped_array<Coord> coordinates(new Coord[mLeafNodes.size()]);
+        tbb::parallel_for(range, StashOriginAndStoreOffset<TreeType>(mLeafNodes, coordinates.get()));
+
+        // build the leafnode offset table
+        // (six segments of mLeafNodes.size() entries each)
+        mOffsets.reset(new size_t[mLeafNodes.size() * 6]);
+
+
+        // Must run while the origins still hold the stashed array positions:
+        // the neighbour lookup reads them back from origin()[0].
+        tbb::parallel_for(range,
+            ComputeNodeConnectivity<TreeType>(tree, coordinates.get(), mOffsets.get(), mLeafNodes.size(), bbox));
+
+        // restore the leafnode origin coordinate
+        tbb::parallel_for(range, RestoreOrigin<TreeType>(mLeafNodes, coordinates.get()));
+    }
+
+    /// @return the number of leaf nodes in the table
+    size_t size() const { return mLeafNodes.size(); }
+
+    /// @return the leaf node array; table entries are positions in this array
+    std::vector<LeafNodeType*>& nodes() { return mLeafNodes; }
+    const std::vector<LeafNodeType*>& nodes() const { return mLeafNodes; }
+
+
+    // Each accessor below returns the start of one size()-entry segment of
+    // the table. Entry n is the array position of node n's neighbour in that
+    // direction, or the no-neighbour marker.
+    const size_t* offsetsNextX() const { return mOffsets.get(); }
+    const size_t* offsetsPrevX() const { return mOffsets.get() + mLeafNodes.size(); }
+
+    const size_t* offsetsNextY() const { return mOffsets.get() + mLeafNodes.size() * 2; }
+    const size_t* offsetsPrevY() const { return mOffsets.get() + mLeafNodes.size() * 3; }
+
+    const size_t* offsetsNextZ() const { return mOffsets.get() + mLeafNodes.size() * 4; }
+    const size_t* offsetsPrevZ() const { return mOffsets.get() + mLeafNodes.size() * 5; }
+
+private:
+    std::vector<LeafNodeType*> mLeafNodes;
+    boost::scoped_array<size_t> mOffsets;
+}; // struct LeafNodeConnectivityTable
+
+
+/// @brief  TBB body that sweeps along one axis through chains of connected
+///         leaf nodes and negates the values of voxels that are reached
+///         before a boundary voxel is hit.
+///
+/// @details For each start node and each of the @c DIM x @c DIM voxel lines
+///          parallel to the sweep axis, the line is traced forwards through
+///          the chain of next-neighbours and then backwards from the last
+///          node of the chain through the chain of previous-neighbours.
+template<typename TreeType>
+class SweepExteriorSign
+{
+public:
+
+    enum Axis { X_AXIS = 0, Y_AXIS = 1, Z_AXIS = 2 };
+
+    typedef typename TreeType::ValueType            ValueType;
+    typedef typename TreeType::LeafNodeType         LeafNodeType;
+    typedef LeafNodeConnectivityTable<TreeType>     ConnectivityTable;
+
+    /// @param axis              axis to sweep along
+    /// @param startNodeIndices  array positions (in @a connectivity) of the
+    ///                          nodes at which the sweeps start
+    /// @param connectivity      leaf node array and neighbour table
+    SweepExteriorSign(Axis axis, const std::vector<size_t>& startNodeIndices, ConnectivityTable& connectivity)
+        : mStartNodeIndices(startNodeIndices.empty() ? NULL : &startNodeIndices[0])
+        , mConnectivity(&connectivity)
+        , mAxis(axis)
+    {
+    }
+
+    /// @brief Run the sweeps for the start nodes whose positions in the
+    ///        start index array lie in @a range.
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+
+        std::vector<LeafNodeType*>& nodes = mConnectivity->nodes();
+
+        // Defaults below are for a sweep along Z: the lines are enumerated by
+        // the (x, y) local coordinates and consecutive voxels on a line are
+        // one buffer element apart.
+        // Z Axis
+        size_t idxA = 0, idxB = 1;
+        Index step = 1;
+
+        const size_t* nextOffsets = mConnectivity->offsetsNextZ();
+        const size_t* prevOffsets = mConnectivity->offsetsPrevZ();
+
+        if (mAxis == Y_AXIS) {
+
+            // Lines enumerated by (x, z); voxel stride along y is DIM.
+            idxA = 0;
+            idxB = 2;
+            step = LeafNodeType::DIM;
+
+            nextOffsets = mConnectivity->offsetsNextY();
+            prevOffsets = mConnectivity->offsetsPrevY();
+
+        } else if (mAxis == X_AXIS) {
+
+            // Lines enumerated by (y, z); voxel stride along x is DIM * DIM.
+            idxA = 1;
+            idxB = 2;
+            step = LeafNodeType::DIM * LeafNodeType::DIM;
+
+            nextOffsets = mConnectivity->offsetsNextX();
+            prevOffsets = mConnectivity->offsetsPrevX();
+        }
+
+        Coord ijk(0, 0, 0);
+
+        // a and b alias the two components of ijk that select a line, so the
+        // loops below update ijk directly. The component along the sweep axis
+        // stays 0, i.e. ijk addresses the first voxel of the line.
+        int& a = ijk[idxA];
+        int& b = ijk[idxB];
+
+        for (size_t n = range.begin(), N = range.end(); n < N; ++n) {
+
+            size_t startOffset = mStartNodeIndices[n];
+            // Not reset per line: it keeps the furthest node known so far on
+            // this chain and is only used as the starting point for locating
+            // the end of the chain.
+            size_t lastOffset = startOffset;
+
+            Index pos(0);
+
+            for (a = 0; a < int(LeafNodeType::DIM); ++a) {
+                for (b = 0; b < int(LeafNodeType::DIM); ++b) {
+
+                    pos =  LeafNodeType::coordToOffset(ijk);
+                    size_t offset = startOffset;
+
+                    // sweep in +axis direction until a boundary voxel is hit.
+                    while ( offset != ConnectivityTable::INVALID_OFFSET &&
+                            traceVoxelLine(*nodes[offset], pos, step) ) {
+
+                        lastOffset = offset;
+                        offset = nextOffsets[offset];
+                    }
+
+                    // find last leafnode in +axis direction
+                    offset = lastOffset;
+                    while (offset != ConnectivityTable::INVALID_OFFSET) {
+                        lastOffset = offset;
+                        offset = nextOffsets[offset];
+                    }
+
+                    // sweep in -axis direction until a boundary voxel is hit.
+                    offset = lastOffset;
+                    // Move to the last voxel of the line, then walk backwards.
+                    // NOTE(review): Index appears to be an unsigned type, in
+                    // which case -step relies on modular wrap-around -- confirm.
+                    pos += step * (LeafNodeType::DIM - 1);
+                    while ( offset != ConnectivityTable::INVALID_OFFSET &&
+                            traceVoxelLine(*nodes[offset], pos, -step)) {
+                        offset = prevOffsets[offset];
+                    }
+                }
+            }
+        }
+    }
+
+
+    /// @brief  Visit @c DIM voxels of @a node, starting at buffer position
+    ///         @a pos and advancing by @a step, and negate the values that
+    ///         are reached while in the "outside" state.
+    ///
+    /// @details The state starts as outside. A negative value (re-)enters the
+    ///          outside state. A non-negative value that is not greater than
+    ///          @c 0.75 is a boundary voxel and leaves it. Any other
+    ///          non-negative value is negated if the state is outside and is
+    ///          otherwise left unchanged.
+    ///
+    /// @return @c true if the state is still outside after the last voxel.
+    bool traceVoxelLine(LeafNodeType& node, Index pos, Index step) const {
+
+        ValueType* data = node.buffer().data();
+
+        bool isOutside = true;
+
+        for (Index i = 0; i < LeafNodeType::DIM; ++i) {
+
+            ValueType& dist = data[pos];
+
+            if (dist < ValueType(0.0)) {
+                isOutside = true;
+            } else {
+                // Boundary voxel check. (Voxel that intersects the surface)
+                if (!(dist > ValueType(0.75))) isOutside = false;
+
+                if (isOutside) dist = ValueType(-dist);
+            }
+
+            pos += step;
+        }
+
+        return isOutside;
+    }
+
+
+private:
+    size_t              const * const mStartNodeIndices;
+    ConnectivityTable         * const mConnectivity;
+
+    const Axis mAxis;
+}; // class SweepExteriorSign
+
+
+/// @brief  Flood fill the negative sign inside a single leaf node.
+///
+/// @details Every voxel that is negative on entry is a seed. The negative
+///          sign is spread from the seeds to face-adjacent voxels of the same
+///          node whose value is greater than @c 0.75; voxels at or below that
+///          threshold stop the fill. The node is left untouched if it has no
+///          negative voxel.
+template<typename LeafNodeType>
+inline void
+seedFill(LeafNodeType& node)
+{
+    typedef typename LeafNodeType::ValueType ValueType;
+    typedef std::deque<Index> IndexStack;
+
+    // Buffer strides of the three local axes (the z stride is 1).
+    const Index strideX = LeafNodeType::DIM * LeafNodeType::DIM;
+    const Index strideY = LeafNodeType::DIM;
+    const int lastCoord = int(LeafNodeType::DIM) - 1;
+
+    ValueType* values = node.buffer().data();
+
+    // Gather the seeds.
+    IndexStack pending;
+    for (Index n = 0; n < LeafNodeType::SIZE; ++n) {
+        if (values[n] < 0.0) pending.push_back(n);
+    }
+
+    if (pending.empty()) return;
+
+    // Make the seeds positive so that the fill loop treats them like any
+    // other unvisited voxel and flips them back itself.
+    for (IndexStack::const_iterator it = pending.begin(), end = pending.end(); it != end; ++it) {
+        values[*it] = -values[*it];
+    }
+
+    // Depth-first fill; a negative value doubles as the "visited" marker.
+    while (!pending.empty()) {
+
+        const Index current = pending.back();
+        pending.pop_back();
+
+        ValueType& value = values[current];
+        if (value < ValueType(0.0)) continue;
+
+        value = -value;
+
+        const Coord local = LeafNodeType::offsetToLocalCoord(current);
+
+        // -x / +x neighbours
+        if (local[0] != 0 && values[current - strideX] > ValueType(0.75)) {
+            pending.push_back(current - strideX);
+        }
+        if (local[0] != lastCoord && values[current + strideX] > ValueType(0.75)) {
+            pending.push_back(current + strideX);
+        }
+
+        // -y / +y neighbours
+        if (local[1] != 0 && values[current - strideY] > ValueType(0.75)) {
+            pending.push_back(current - strideY);
+        }
+        if (local[1] != lastCoord && values[current + strideY] > ValueType(0.75)) {
+            pending.push_back(current + strideY);
+        }
+
+        // -z / +z neighbours
+        if (local[2] != 0 && values[current - 1] > ValueType(0.75)) {
+            pending.push_back(current - 1);
+        }
+        if (local[2] != lastCoord && values[current + 1] > ValueType(0.75)) {
+            pending.push_back(current + 1);
+        }
+    }
+} // seedFill()
+
+
+/// @brief  Spread the negative sign inside a single leaf node by repeated
+///         linear scans.
+///
+/// @details On each pass, every voxel whose value is greater than @c 0.75 and
+///          that has a negative face neighbour within the same node is
+///          negated. Changes take effect immediately, so they are visible to
+///          voxels visited later in the same pass. Passes repeat until one
+///          completes without a change.
+///
+/// @return @c true if at least one voxel was negated.
+template<typename LeafNodeType>
+inline bool
+scanFill(LeafNodeType& node)
+{
+    typedef typename LeafNodeType::ValueType ValueType;
+
+    // Buffer strides of the three local axes (the z stride is 1).
+    const Index strideX = LeafNodeType::DIM * LeafNodeType::DIM;
+    const Index strideY = LeafNodeType::DIM;
+    const int lastCoord = int(LeafNodeType::DIM) - 1;
+
+    ValueType* values = node.buffer().data();
+
+    bool anyChange = false;
+    bool changedThisPass = false;
+
+    do {
+        changedThisPass = false;
+
+        for (Index n = 0; n < LeafNodeType::SIZE; ++n) {
+
+            ValueType& value = values[n];
+
+            // Only non-negative values above the boundary threshold can flip.
+            if (value < ValueType(0.0)) continue;
+            if (!(value > ValueType(0.75))) continue;
+
+            const Coord local = LeafNodeType::offsetToLocalCoord(n);
+
+            // Checked in the order -z, +z, -y, +y, -x, +x; the first negative
+            // neighbour found settles the question.
+            const bool hasNegativeNeighbour =
+                   (local[2] != 0         && values[n - 1]       < ValueType(0.0))
+                || (local[2] != lastCoord && values[n + 1]       < ValueType(0.0))
+                || (local[1] != 0         && values[n - strideY] < ValueType(0.0))
+                || (local[1] != lastCoord && values[n + strideY] < ValueType(0.0))
+                || (local[0] != 0         && values[n - strideX] < ValueType(0.0))
+                || (local[0] != lastCoord && values[n + strideX] < ValueType(0.0));
+
+            if (hasNegativeNeighbour) {
+                changedThisPass = true;
+                value = ValueType(-value);
+            }
+        }
+
+        anyChange |= changedThisPass;
+
+    } while (changedThisPass);
+
+    return anyChange;
+} // scanFill()
+
+
+/// @brief  TBB body that runs @c scanFill() on every leaf node that is
+///         flagged in a per-node mask and stores the result back in the mask.
+///
+/// @details After the call, a flagged node's mask entry is @c true only if
+///          @c scanFill() reported a change for it. Entries of unflagged
+///          nodes are not touched.
+template<typename TreeType>
+class SeedFillExteriorSign
+{
+public:
+    typedef typename TreeType::ValueType            ValueType;
+    typedef typename TreeType::LeafNodeType         LeafNodeType;
+
+    /// @param nodes            leaf nodes to process
+    /// @param changedNodeMask  one flag per entry of @a nodes (read and updated)
+    SeedFillExteriorSign(std::vector<LeafNodeType*>& nodes, bool* changedNodeMask)
+        : mNodes(nodes.empty() ? NULL : &nodes[0])
+        , mChangedNodeMask(changedNodeMask)
+    {
+    }
+
+    /// @brief Process the nodes whose array positions lie in @a range.
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+        const size_t last = range.end();
+        for (size_t idx = range.begin(); idx < last; ++idx) {
+            if (!mChangedNodeMask[idx]) continue;
+            mChangedNodeMask[idx] = scanFill(*mNodes[idx]);
+        }
+    }
+
+    LeafNodeType    ** const mNodes;
+    bool             * const mChangedNodeMask;
+};
+
+
+/// @brief  TBB body that assigns one value to a range of array elements.
+template<typename ValueType>
+struct FillArray
+{
+    /// @param array  destination array
+    /// @param v      value to assign
+    FillArray(ValueType* array, const ValueType v) : mArray(array), mValue(v) { }
+
+    /// @brief Assign the value to the elements whose positions lie in @a range.
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+        const ValueType fill = mValue;
+        ValueType* cursor = mArray + range.begin();
+        ValueType* const stop = mArray + range.end();
+        for (; cursor < stop; ++cursor) {
+            *cursor = fill;
+        }
+    }
+
+    ValueType * const mArray;
+    const ValueType mValue;
+};
+
+
+/// @brief  Assign @a val to the first @a length elements of @a array using a
+///         parallel loop.
+///
+/// @param array   destination array; must hold at least @a length elements
+/// @param val     value copied into every element
+/// @param length  number of elements to assign
+template<typename ValueType>
+inline void
+fillArray(ValueType* array, const ValueType val, const size_t length)
+{
+    // Aim for roughly one chunk per default worker thread. The quotient is
+    // zero whenever length is smaller than the thread count, and
+    // tbb::blocked_range requires a positive grain size, so clamp it to 1.
+    size_t grainSize = length / tbb::task_scheduler_init::default_num_threads();
+    if (grainSize == 0) grainSize = 1;
+
+    const tbb::blocked_range<size_t> range(0, length, grainSize);
+    tbb::parallel_for(range, FillArray<ValueType>(array, val), tbb::simple_partitioner());
+}
+
+
+/// @brief  TBB body that applies pending sign flips recorded in a per-voxel
+///         mask to the leaf nodes that are flagged in a per-node mask.
+///
+/// @details For every flagged node, each voxel whose mask entry is set has
+///          its value negated and its mask entry cleared. The voxel mask
+///          holds @c LeafNodeType::SIZE entries per node, stored node after
+///          node. Unflagged nodes and their mask entries are not touched.
+template<typename TreeType>
+class SyncVoxelMask
+{
+public:
+    typedef typename TreeType::ValueType            ValueType;
+    typedef typename TreeType::LeafNodeType         LeafNodeType;
+
+    /// @param nodes             leaf nodes to update
+    /// @param changedNodeMask   one flag per entry of @a nodes (read only)
+    /// @param changedVoxelMask  one flag per voxel of every node (read and cleared)
+    SyncVoxelMask(std::vector<LeafNodeType*>& nodes, const bool* changedNodeMask,  bool* changedVoxelMask)
+        : mNodes(nodes.empty() ? NULL : &nodes[0])
+        , mChangedNodeMask(changedNodeMask)
+        , mChangedVoxelMask(changedVoxelMask)
+    {
+    }
+
+    /// @brief Process the nodes whose array positions lie in @a range.
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+        const size_t last = range.end();
+        for (size_t idx = range.begin(); idx < last; ++idx) {
+
+            if (!mChangedNodeMask[idx]) continue;
+
+            // Slice of the voxel mask that belongs to this node.
+            bool* flags = mChangedVoxelMask + idx * LeafNodeType::SIZE;
+            ValueType* values = mNodes[idx]->buffer().data();
+
+            for (Index k = 0; k < LeafNodeType::SIZE; ++k) {
+                if (!flags[k]) continue;
+                values[k] = ValueType(-values[k]);
+                flags[k] = false;
+            }
+        }
+    }
+
+    LeafNodeType      ** const mNodes;
+    bool         const * const mChangedNodeMask;
+    bool          * const mChangedVoxelMask;
+};
+
+
+template<typename TreeType>
+class SeedPoints
+{
+public:
+    typedef typename TreeType::ValueType            ValueType;
+    typedef typename TreeType::LeafNodeType         LeafNodeType;
+    typedef LeafNodeConnectivityTable<TreeType>     ConnectivityTable;
+
+    SeedPoints(ConnectivityTable& connectivity, bool* changedNodeMask, bool* nodeMask, bool* changedVoxelMask)
+        : mConnectivity(&connectivity)
+        , mChangedNodeMask(changedNodeMask)
+        , mNodeMask(nodeMask)
+        , mChangedVoxelMask(changedVoxelMask)
+    {
+    }
+
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+
+        for (size_t n = range.begin(), N = range.end(); n < N; ++n) {
+
+            if (!mChangedNodeMask[n]) {
+
+                bool changedValue = false;
+
+                changedValue |= processZ(n, /*firstFace=*/true);
+                changedValue |= processZ(n, /*firstFace=*/false);
+
+                changedValue |= processY(n, /*firstFace=*/true);
+                changedValue |= processY(n, /*firstFace=*/false);
+
+                changedValue |= processX(n, /*firstFace=*/true);
+                changedValue |= processX(n, /*firstFace=*/false);
+
+                mNodeMask[n] = changedValue;
+            }
+        }
+    }
+
+
+    bool processZ(const size_t n, bool firstFace) const
+    {
+        const size_t offset = firstFace ? mConnectivity->offsetsPrevZ()[n] : mConnectivity->offsetsNextZ()[n];
+        if (offset != ConnectivityTable::INVALID_OFFSET && mChangedNodeMask[offset]) {
+
+            bool* mask = &mChangedVoxelMask[n * LeafNodeType::SIZE];
+
+            const ValueType* lhsData = mConnectivity->nodes()[n]->buffer().data();
+            const ValueType* rhsData = mConnectivity->nodes()[offset]->buffer().data();
+
+            const Index lastOffset = LeafNodeType::DIM - 1;
+            const Index lhsOffset = firstFace ? 0 : lastOffset, rhsOffset = firstFace ? lastOffset : 0;
+
+            Index tmpPos(0), pos(0);
+            bool changedValue = false;
+
+            for (Index x = 0; x < LeafNodeType::DIM; ++x) {
+                tmpPos = x << (2 * LeafNodeType::LOG2DIM);
+                for (Index y = 0; y < LeafNodeType::DIM; ++y) {
+                    pos = tmpPos + (y << LeafNodeType::LOG2DIM);
+
+                    if (lhsData[pos + lhsOffset] > ValueType(0.75)) {
+                        if (rhsData[pos + rhsOffset] < ValueType(0.0)) {
+                            changedValue = true;
+                            mask[pos + lhsOffset] = true;
+                        }
+                    }
+                }
+            }
+
+            return changedValue;
+        }
+
+        return false;
+    }
+
+    /// @brief Compare one Y boundary face of leaf @a n with the facing voxels of its
+    /// Y neighbour and flag the voxels of @a n that satisfy the sign-change test.
+    ///
+    /// @param n          index of the leaf node in the connectivity table
+    /// @param firstFace  true: test the y = 0 face against the previous-Y neighbour,
+    ///                   false: test the y = DIM-1 face against the next-Y neighbour
+    /// @return true if at least one voxel was flagged in the changed-voxel mask
+    ///
+    /// @note Nothing happens unless the neighbour exists and is set in mChangedNodeMask.
+    bool processY(const size_t n, bool firstFace) const
+    {
+        const size_t neighbour =
+            firstFace ? mConnectivity->offsetsPrevY()[n] : mConnectivity->offsetsNextY()[n];
+
+        if (neighbour == ConnectivityTable::INVALID_OFFSET || !mChangedNodeMask[neighbour]) {
+            return false;
+        }
+
+        bool* voxelMask = &mChangedVoxelMask[n * LeafNodeType::SIZE];
+
+        const ValueType* nodeData = mConnectivity->nodes()[n]->buffer().data();
+        const ValueType* neighbourData = mConnectivity->nodes()[neighbour]->buffer().data();
+
+        // Linear offset of the y = DIM-1 plane relative to the y = 0 plane.
+        const Index farFace = LeafNodeType::DIM * (LeafNodeType::DIM - 1);
+        const Index nodeFace = firstFace ? 0 : farFace;
+        const Index neighbourFace = firstFace ? farFace : 0;
+
+        bool flagged = false;
+
+        for (Index x = 0; x < LeafNodeType::DIM; ++x) {
+            const Index xBase = x << (2 * LeafNodeType::LOG2DIM);
+            for (Index z = 0; z < LeafNodeType::DIM; ++z) {
+                const Index idx = xBase + z;
+
+                // Flag voxels above 0.75 that face a negative voxel across the boundary.
+                if (nodeData[idx + nodeFace] > ValueType(0.75) &&
+                    neighbourData[idx + neighbourFace] < ValueType(0.0)) {
+                    flagged = true;
+                    voxelMask[idx + nodeFace] = true;
+                }
+            }
+        }
+
+        return flagged;
+    }
+
+    /// @brief Compare one X boundary face of leaf @a n with the facing voxels of its
+    /// X neighbour and flag the voxels of @a n that satisfy the sign-change test.
+    ///
+    /// @param n          index of the leaf node in the connectivity table
+    /// @param firstFace  true: test the x = 0 face against the previous-X neighbour,
+    ///                   false: test the x = DIM-1 face against the next-X neighbour
+    /// @return true if at least one voxel was flagged in the changed-voxel mask
+    ///
+    /// @note Nothing happens unless the neighbour exists and is set in mChangedNodeMask.
+    bool processX(const size_t n, bool firstFace) const
+    {
+        const size_t neighbour =
+            firstFace ? mConnectivity->offsetsPrevX()[n] : mConnectivity->offsetsNextX()[n];
+
+        if (neighbour == ConnectivityTable::INVALID_OFFSET || !mChangedNodeMask[neighbour]) {
+            return false;
+        }
+
+        bool* voxelMask = &mChangedVoxelMask[n * LeafNodeType::SIZE];
+
+        const ValueType* nodeData = mConnectivity->nodes()[n]->buffer().data();
+        const ValueType* neighbourData = mConnectivity->nodes()[neighbour]->buffer().data();
+
+        // Linear offset of the x = DIM-1 plane relative to the x = 0 plane.
+        const Index farFace = LeafNodeType::DIM * LeafNodeType::DIM * (LeafNodeType::DIM - 1);
+        const Index nodeFace = firstFace ? 0 : farFace;
+        const Index neighbourFace = firstFace ? farFace : 0;
+
+        bool flagged = false;
+
+        for (Index y = 0; y < LeafNodeType::DIM; ++y) {
+            const Index yBase = y << LeafNodeType::LOG2DIM;
+            for (Index z = 0; z < LeafNodeType::DIM; ++z) {
+                const Index idx = yBase + z;
+
+                // Flag voxels above 0.75 that face a negative voxel across the boundary.
+                if (nodeData[idx + nodeFace] > ValueType(0.75) &&
+                    neighbourData[idx + neighbourFace] < ValueType(0.0)) {
+                    flagged = true;
+                    voxelMask[idx + nodeFace] = true;
+                }
+            }
+        }
+
+        return flagged;
+    }
+
+    ConnectivityTable   * const mConnectivity;
+    bool                * const mChangedNodeMask;
+    bool                * const mNodeMask;
+    bool                * const mChangedVoxelMask;
+};
+
+
+////////////////////////////////////////
+
+/// @brief TBB range body that resolves the sign of voxels lying close to the surface.
+///
+/// For every active voxel whose distance value is in [0, 0.75], neighbouring voxels
+/// with a value below -0.75 are inspected. For such a neighbour the closest point on
+/// its associated polygon is computed; if the unit direction from that point to the
+/// neighbour and the unit direction from that point to the voxel have a positive dot
+/// product, the voxel's distance is negated.
+template<typename TreeType, typename MeshDataAdapter>
+struct ComputeIntersectingVoxelSign
+{
+    typedef typename TreeType::ValueType                            ValueType;
+    typedef typename TreeType::LeafNodeType                         LeafNodeType;
+    typedef typename TreeType::template ValueConverter<Int32>::Type Int32TreeType;
+    typedef typename Int32TreeType::LeafNodeType                    Int32LeafNodeType;
+
+    // Scratch buffers owned by one thread:
+    //  first  - 2 * LeafNodeType::SIZE vectors; entry [2*pos] is a closest surface point,
+    //           entry [2*pos + 1] the unit direction from that point to voxel 'pos'
+    //  second - LeafNodeType::SIZE flags marking which entries of 'first' are valid
+    typedef std::pair<boost::shared_array<Vec3d>, boost::shared_array<bool> >   LocalData;
+    typedef tbb::enumerable_thread_specific<LocalData>                          LocalDataTable;
+
+    /// @param distNodes  leaf nodes of the distance tree to process (indexed by the range)
+    /// @param distTree   distance tree, read through an accessor for out-of-node neighbours
+    /// @param indexTree  tree holding the polygon index associated with each voxel
+    /// @param mesh       adapter used to fetch polygon vertices in index space
+    ComputeIntersectingVoxelSign(
+        std::vector<LeafNodeType*>& distNodes,
+        const TreeType& distTree,
+        const Int32TreeType& indexTree,
+        const MeshDataAdapter& mesh)
+        : mDistNodes(distNodes.empty() ? NULL : &distNodes[0])
+        , mDistTree(&distTree)
+        , mIndexTree(&indexTree)
+        , mMesh(&mesh)
+        , mLocalDataTable(new LocalDataTable())
+    {
+    }
+
+
+    /// @brief Process the leaf nodes with indices in @a range.
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+
+        tree::ValueAccessor<const TreeType> distAcc(*mDistTree);
+        tree::ValueAccessor<const Int32TreeType> idxAcc(*mIndexTree);
+
+        ValueType nval;
+        CoordBBox bbox;
+        Index xPos(0), yPos(0);
+        Coord ijk, nijk, nodeMin, nodeMax;
+        Vec3d cp, xyz, nxyz, dir1, dir2;
+
+        // Fetch this thread's scratch buffers and allocate them on first use.
+        LocalData& localData = mLocalDataTable->local();
+
+        boost::shared_array<Vec3d>& points = localData.first;
+        if (!points) points.reset(new Vec3d[LeafNodeType::SIZE * 2]);
+
+        boost::shared_array<bool>& mask = localData.second;
+        if (!mask) mask.reset(new bool[LeafNodeType::SIZE]);
+
+
+        typename LeafNodeType::ValueOnCIter it;
+
+        for (size_t n = range.begin(), N = range.end(); n < N; ++n) {
+
+            LeafNodeType& node = *mDistNodes[n];
+            ValueType* data = node.buffer().data();
+
+            // NOTE(review): idxNode is dereferenced without a null check; this assumes the
+            // index tree has a leaf at the origin of every distance leaf - confirm.
+            const Int32LeafNodeType* idxNode = idxAcc.probeConstLeaf(node.origin());
+            const Int32* idxData = idxNode->buffer().data();
+
+            nodeMin = node.origin();
+            nodeMax = nodeMin.offsetBy(LeafNodeType::DIM - 1);
+
+            // reset computed voxel mask.
+            memset(mask.get(), 0, sizeof(bool) * LeafNodeType::SIZE);
+
+            for (it = node.cbeginValueOn(); it; ++it) {
+                Index pos = it.pos();
+
+                // Only voxels with 0 <= dist <= 0.75 are candidates for a sign flip.
+                ValueType& dist = data[pos];
+                if (dist < 0.0 || dist > 0.75) continue;
+
+                ijk = node.offsetToGlobalCoord(pos);
+
+                xyz[0] = double(ijk[0]);
+                xyz[1] = double(ijk[1]);
+                xyz[2] = double(ijk[2]);
+
+
+                // 3x3x3 neighbourhood around ijk, clamped to the bounds of this leaf node.
+                bbox.min() = Coord::maxComponent(ijk.offsetBy(-1), nodeMin);
+                bbox.max() = Coord::minComponent(ijk.offsetBy(1), nodeMax);
+
+                bool flipSign = false;
+
+                // First pass: neighbours inside this node, read directly from the buffers.
+                // 'pos' is reused below for the neighbour's linear offset
+                // ((x << 2*LOG2DIM) + (y << LOG2DIM) + z); 'dist' still refers to the
+                // voxel being tested.
+                for (nijk[0] = bbox.min()[0]; nijk[0] <= bbox.max()[0] && !flipSign; ++nijk[0]) {
+                    xPos = (nijk[0] & (LeafNodeType::DIM - 1u)) << (2 * LeafNodeType::LOG2DIM);
+                    for (nijk[1] = bbox.min()[1]; nijk[1] <= bbox.max()[1] && !flipSign; ++nijk[1]) {
+                        yPos = xPos + ((nijk[1] & (LeafNodeType::DIM - 1u)) << LeafNodeType::LOG2DIM);
+                        for (nijk[2] = bbox.min()[2]; nijk[2] <= bbox.max()[2]; ++nijk[2]) {
+                            pos = yPos + (nijk[2] & (LeafNodeType::DIM - 1u));
+
+                            const Int32& polyIdx = idxData[pos];
+
+                            // Skip neighbours without a polygon or with a value not below -0.75.
+                            if (polyIdx == Int32(util::INVALID_IDX) || !(data[pos] < -0.75)) continue;
+
+                            const Index pointIndex = pos * 2;
+
+                            // Compute the neighbour's closest point and direction once per
+                            // node and reuse them for the other voxels that look at it.
+                            if (!mask[pos]) {
+
+                                mask[pos] = true;
+
+                                nxyz[0] = double(nijk[0]);
+                                nxyz[1] = double(nijk[1]);
+                                nxyz[2] = double(nijk[2]);
+
+                                Vec3d& point = points[pointIndex];
+
+                                point = closestPoint(nxyz, polyIdx);
+
+                                Vec3d& direction = points[pointIndex + 1];
+                                direction = nxyz - point;
+                                direction.normalize();
+                            }
+
+                            dir1 = xyz - points[pointIndex];
+                            dir1.normalize();
+
+                            // Positive dot product: flip the voxel's sign; the outer loops
+                            // terminate through their '!flipSign' conditions.
+                            if (points[pointIndex + 1].dot(dir1) > 0.0) {
+                                flipSign = true;
+                                break;
+                            }
+                        }
+                    }
+                }
+
+                if (flipSign) {
+                    dist = -dist;
+                } else {
+                    // Second pass: the 26-neighbours that fall outside the clamped bbox,
+                    // read through the tree accessors (no caching of closest points here).
+                    for (Int32 m = 0; m < 26; ++m) {
+                        nijk = ijk + util::COORD_OFFSETS[m];
+
+                        if (!bbox.isInside(nijk) && distAcc.probeValue(nijk, nval) && nval < -0.75) {
+                            nxyz[0] = double(nijk[0]);
+                            nxyz[1] = double(nijk[1]);
+                            nxyz[2] = double(nijk[2]);
+
+                            cp = closestPoint(nxyz, idxAcc.getValue(nijk));
+
+                            dir1 = xyz - cp;
+                            dir1.normalize();
+
+                            dir2 = nxyz - cp;
+                            dir2.normalize();
+
+                            if (dir2.dot(dir1) > 0.0) {
+                                dist = -dist;
+                                break;
+                            }
+                        }
+                    }
+                }
+
+            } // active voxel loop
+        } // leaf node loop
+    }
+
+private:
+
+    /// @brief Return the point on polygon @a polyIdx that is closest to @a center
+    /// (index space).
+    /// @note A polygon with four vertices is evaluated as two triangles,
+    /// (v0, v2, v1) and (v0, v3, v2); the nearer of the two results is returned.
+    Vec3d closestPoint(const Vec3d& center, Int32 polyIdx) const
+    {
+        Vec3d a, b, c, cp, uvw;
+
+        const size_t polygon = size_t(polyIdx);
+        mMesh->getIndexSpacePoint(polygon, 0, a);
+        mMesh->getIndexSpacePoint(polygon, 1, b);
+        mMesh->getIndexSpacePoint(polygon, 2, c);
+
+        cp = closestPointOnTriangleToPoint(a, c, b, center, uvw);
+
+        if (4 == mMesh->vertexCount(polygon)) {
+
+            // 'b' now holds the fourth vertex; 'c' is overwritten with the candidate point.
+            mMesh->getIndexSpacePoint(polygon, 3, b);
+
+            c = closestPointOnTriangleToPoint(a, b, c, center, uvw);
+
+            if ((center - c).lengthSqr() < (center - cp).lengthSqr()) {
+                cp = c;
+            }
+        }
+
+        return cp;
+    }
+
+
+    LeafNodeType         ** const mDistNodes;
+    TreeType        const * const mDistTree;
+    Int32TreeType   const * const mIndexTree;
+    MeshDataAdapter const * const mMesh;
+
+    // Held through a shared pointer so copies of this body refer to the same table.
+    boost::shared_ptr<LocalDataTable> mLocalDataTable;
+}; // ComputeIntersectingVoxelSign
+
+
+////////////////////////////////////////
+
+
+/// @brief For the voxel at linear offset @a pos, record which of its 26 neighbours
+/// lie inside the same leaf node.
+///
+/// @param pos   linear offset of the voxel within the leaf node
+/// @param mask  output; mask[m] is true when neighbour m is inside the node
+///
+/// @note Entries 0-5 are the face neighbours, 6-17 the edge neighbours and 18-25 the
+/// corner neighbours. The ordering is assumed to match util::COORD_OFFSETS, which the
+/// accessor based checkNeighbours() indexes with the same m.
+template<typename LeafNodeType>
+inline void
+maskNodeInternalNeighbours(const Index pos, bool (&mask)[26])
+{
+    typedef LeafNodeType NodeT;
+
+    const Coord ijk = NodeT::offsetToLocalCoord(pos);
+
+    // Can we step one voxel along each axis direction without leaving the node?
+    const bool xNext = ijk[0] != (NodeT::DIM - 1);
+    const bool xPrev = ijk[0] != 0;
+    const bool yNext = ijk[1] != (NodeT::DIM - 1);
+    const bool yPrev = ijk[1] != 0;
+    const bool zNext = ijk[2] != (NodeT::DIM - 1);
+    const bool zPrev = ijk[2] != 0;
+
+    // Face adjacent neighbours
+    mask[0] = xNext;                        // i+1, j,   k
+    mask[1] = xPrev;                        // i-1, j,   k
+    mask[2] = yNext;                        // i,   j+1, k
+    mask[3] = yPrev;                        // i,   j-1, k
+    mask[4] = zNext;                        // i,   j,   k+1
+    mask[5] = zPrev;                        // i,   j,   k-1
+
+    // Edge adjacent neighbours
+    mask[6]  = xNext && zPrev;              // i+1, j,   k-1
+    mask[7]  = xPrev && zPrev;              // i-1, j,   k-1
+    mask[8]  = xNext && zNext;              // i+1, j,   k+1
+    mask[9]  = xPrev && zNext;              // i-1, j,   k+1
+    mask[10] = xNext && yNext;              // i+1, j+1, k
+    mask[11] = xPrev && yNext;              // i-1, j+1, k
+    mask[12] = xNext && yPrev;              // i+1, j-1, k
+    mask[13] = xPrev && yPrev;              // i-1, j-1, k
+    mask[14] = yPrev && zNext;              // i,   j-1, k+1
+    mask[15] = yPrev && zPrev;              // i,   j-1, k-1
+    mask[16] = yNext && zNext;              // i,   j+1, k+1
+    mask[17] = yNext && zPrev;              // i,   j+1, k-1
+
+    // Corner adjacent neighbours
+    mask[18] = xPrev && yPrev && zPrev;     // i-1, j-1, k-1
+    mask[19] = xPrev && yPrev && zNext;     // i-1, j-1, k+1
+    mask[20] = xNext && yPrev && zNext;     // i+1, j-1, k+1
+    mask[21] = xNext && yPrev && zPrev;     // i+1, j-1, k-1
+    mask[22] = xPrev && yNext && zPrev;     // i-1, j+1, k-1
+    mask[23] = xPrev && yNext && zNext;     // i-1, j+1, k+1
+    mask[24] = xNext && yNext && zNext;     // i+1, j+1, k+1
+    mask[25] = xNext && yNext && zPrev;     // i+1, j+1, k-1
+}
+
+
+/// @brief Test the neighbours of a voxel that live in the same leaf buffer.
+///
+/// @param pos   linear offset of the voxel inside the leaf buffer
+/// @param data  pointer to the first value of the leaf buffer
+/// @param mask  per-neighbour flags (see maskNodeInternalNeighbours()); only neighbours
+///              whose flag is true are read, which keeps every access inside the buffer
+/// @return true as soon as @c Compare::check() accepts one of the flagged neighbours,
+///         false if none does
+///
+/// @note Offsets follow the layout used throughout this file: a step in i moves by
+/// DIM * DIM, a step in j by DIM and a step in k by 1.
+template<typename Compare, typename LeafNodeType>
+inline bool
+checkNeighbours(const Index pos, const typename LeafNodeType::ValueType * data, bool (&mask)[26])
+{
+    typedef LeafNodeType NodeT;
+
+    // i, j, k - 1
+    if (mask[5] && Compare::check(data[pos - 1]))                                         return true;
+    // i, j, k + 1
+    if (mask[4] && Compare::check(data[pos + 1]))                                         return true;
+    // i, j - 1, k
+    if (mask[3] && Compare::check(data[pos - NodeT::DIM]))                                return true;
+    // i, j + 1, k
+    if (mask[2] && Compare::check(data[pos + NodeT::DIM]))                                return true;
+    // i - 1, j, k
+    if (mask[1] && Compare::check(data[pos - NodeT::DIM * NodeT::DIM]))                   return true;
+    // i + 1, j, k
+    if (mask[0] && Compare::check(data[pos + NodeT::DIM * NodeT::DIM]))                   return true;
+    // i+1, j, k-1 (the "- 1" selects k-1; without it this re-tested the i+1, j, k voxel)
+    if (mask[6] && Compare::check(data[pos + NodeT::DIM * NodeT::DIM - 1]))               return true;
+    // i-1, j, k-1
+    if (mask[7] && Compare::check(data[pos - NodeT::DIM * NodeT::DIM - 1]))               return true;
+    // i+1, j, k+1
+    if (mask[8] && Compare::check(data[pos + NodeT::DIM * NodeT::DIM + 1]))               return true;
+    // i-1, j, k+1
+    if (mask[9] && Compare::check(data[pos - NodeT::DIM * NodeT::DIM + 1]))               return true;
+    // i+1, j+1, k
+    if (mask[10] && Compare::check(data[pos + NodeT::DIM * NodeT::DIM + NodeT::DIM]))     return true;
+    // i-1, j+1, k
+    if (mask[11] && Compare::check(data[pos - NodeT::DIM * NodeT::DIM + NodeT::DIM]))     return true;
+    // i+1, j-1, k
+    if (mask[12] && Compare::check(data[pos + NodeT::DIM * NodeT::DIM - NodeT::DIM]))     return true;
+    // i-1, j-1, k
+    if (mask[13] && Compare::check(data[pos - NodeT::DIM * NodeT::DIM - NodeT::DIM]))     return true;
+    // i, j-1, k+1
+    if (mask[14] && Compare::check(data[pos - NodeT::DIM + 1]))                           return true;
+    // i, j-1, k-1
+    if (mask[15] && Compare::check(data[pos - NodeT::DIM - 1]))                           return true;
+    // i, j+1, k+1
+    if (mask[16] && Compare::check(data[pos + NodeT::DIM + 1]))                           return true;
+    // i, j+1, k-1
+    if (mask[17] && Compare::check(data[pos + NodeT::DIM - 1]))                           return true;
+    // i-1, j-1, k-1
+    if (mask[18] && Compare::check(data[pos - NodeT::DIM * NodeT::DIM - NodeT::DIM - 1])) return true;
+    // i-1, j-1, k+1
+    if (mask[19] && Compare::check(data[pos - NodeT::DIM * NodeT::DIM - NodeT::DIM + 1])) return true;
+    // i+1, j-1, k+1
+    if (mask[20] && Compare::check(data[pos + NodeT::DIM * NodeT::DIM - NodeT::DIM + 1])) return true;
+    // i+1, j-1, k-1
+    if (mask[21] && Compare::check(data[pos + NodeT::DIM * NodeT::DIM - NodeT::DIM - 1])) return true;
+    // i-1, j+1, k-1
+    if (mask[22] && Compare::check(data[pos - NodeT::DIM * NodeT::DIM + NodeT::DIM - 1])) return true;
+    // i-1, j+1, k+1
+    if (mask[23] && Compare::check(data[pos - NodeT::DIM * NodeT::DIM + NodeT::DIM + 1])) return true;
+    // i+1, j+1, k+1
+    if (mask[24] && Compare::check(data[pos + NodeT::DIM * NodeT::DIM + NodeT::DIM + 1])) return true;
+    // i+1, j+1, k-1
+    if (mask[25] && Compare::check(data[pos + NodeT::DIM * NodeT::DIM + NodeT::DIM - 1])) return true;
+
+    return false;
+}
+
+
+/// @brief Test the neighbours of voxel @a ijk that were NOT flagged in @a mask,
+/// reading their values through the accessor @a acc.
+///
+/// @param ijk   global coordinate of the voxel
+/// @param acc   accessor used to read the neighbour values
+/// @param mask  per-neighbour flags; flagged neighbours are skipped
+/// @return true as soon as @c Compare::check() accepts one of the unflagged neighbours
+template<typename Compare, typename AccessorType>
+inline bool
+checkNeighbours(const Coord& ijk, AccessorType& acc, bool (&mask)[26])
+{
+    for (Int32 m = 0; m < 26; ++m) {
+        if (mask[m]) continue;
+
+        const Coord neighbour = ijk + util::COORD_OFFSETS[m];
+        if (Compare::check(acc.getValue(neighbour))) return true;
+    }
+
+    return false;
+}
+
+
+/// @brief TBB range body that inspects every active voxel whose distance lies in
+/// [0, 0.75] and, when none of its 26 neighbours holds a negative value, raises the
+/// voxel's distance to just above 0.75.
+template<typename TreeType>
+struct ValidateIntersectingVoxels
+{
+    typedef typename TreeType::ValueType            ValueType;
+    typedef typename TreeType::LeafNodeType         LeafNodeType;
+
+    /// Neighbour predicate: accepts strictly negative values.
+    struct IsNegative {
+        static bool check(const ValueType v) { return v < ValueType(0.0); }
+    };
+
+    /// @param tree   tree read through an accessor for neighbours outside a leaf
+    /// @param nodes  leaf nodes to process (indexed by the range)
+    ValidateIntersectingVoxels(TreeType& tree, std::vector<LeafNodeType*>& nodes)
+        : mTree(&tree)
+        , mNodes(nodes.empty() ? NULL : &nodes[0])
+    {
+    }
+
+    void operator()(const tbb::blocked_range<size_t>& range) const
+    {
+        typedef typename LeafNodeType::ValueOnCIter ValueOnCIter;
+
+        tree::ValueAccessor<const TreeType> acc(*mTree);
+        bool insideNode[26];
+
+        for (size_t n = range.begin(); n < range.end(); ++n) {
+
+            LeafNodeType& leaf = *mNodes[n];
+            ValueType* values = leaf.buffer().data();
+
+            for (ValueOnCIter it = leaf.cbeginValueOn(); it; ++it) {
+
+                const Index pos = it.pos();
+                ValueType& dist = values[pos];
+
+                // Only voxels with 0 <= dist <= 0.75 are examined.
+                if (dist < 0.0 || dist > 0.75) continue;
+
+                // Flag the neighbours that share this leaf's buffer.
+                maskNodeInternalNeighbours<LeafNodeType>(pos, insideNode);
+
+                // In-node neighbours first, then the remaining ones through the accessor.
+                if (checkNeighbours<IsNegative, LeafNodeType>(pos, values, insideNode)) continue;
+                if (checkNeighbours<IsNegative>(leaf.offsetToGlobalCoord(pos), acc, insideNode)) continue;
+
+                // No negative neighbour: push the distance over the 0.75 boundary.
+                dist = ValueType(0.75) + Tolerance<ValueType>::epsilon();
+            }
+        }
+    }
+
+    TreeType         * const mTree;
+    LeafNodeType    ** const mNodes;
+}; // ValidateIntersectingVoxels
+
+
+/// @brief TBB range body that deactivates, in both the distance and the index tree,
+/// every active voxel with a distance above 0.75 that has no neighbour (out of 26)
+/// whose value is not above 0.75.
+template<typename TreeType>
+struct RemoveSelfIntersectingSurface
+{
+    typedef typename TreeType::ValueType            ValueType;
+    typedef typename TreeType::LeafNodeType         LeafNodeType;
+    typedef typename TreeType::template ValueConverter<Int32>::Type Int32TreeType;
+
+    /// Neighbour predicate: accepts values that are not greater than 0.75.
+    struct Comp {
+        static bool check(const ValueType v) { return !(v > ValueType(0.75)); }
+    };
+
+    /// @param nodes      distance leaf nodes to process (indexed by the range)
+    /// @param distTree   distance tree, read for neighbours outside a leaf
+    /// @param indexTree  index tree whose matching voxels are deactivated as well
+    RemoveSelfIntersectingSurface(std::vector<LeafNodeType*>& nodes,
+        TreeType& distTree, Int32TreeType& indexTree)
+        : mNodes(nodes.empty() ? NULL : &nodes[0])
+        , mDistTree(&distTree)
+        , mIndexTree(&indexTree)
+    {
+    }
+
+    void operator()(const tbb::blocked_range<size_t>& range) const
+    {
+        typedef typename Int32TreeType::LeafNodeType    IndexLeafNodeType;
+        typedef typename LeafNodeType::ValueOnCIter     ValueOnCIter;
+
+        tree::ValueAccessor<const TreeType> distAcc(*mDistTree);
+        tree::ValueAccessor<Int32TreeType> idxAcc(*mIndexTree);
+        bool insideNode[26];
+
+        for (size_t n = range.begin(); n < range.end(); ++n) {
+
+            LeafNodeType& distLeaf = *mNodes[n];
+            ValueType* values = distLeaf.buffer().data();
+
+            // NOTE(review): used below without a null check; assumes the index tree
+            // has a leaf at the same origin - confirm.
+            IndexLeafNodeType* idxLeaf = idxAcc.probeLeaf(distLeaf.origin());
+
+            for (ValueOnCIter it = distLeaf.cbeginValueOn(); it; ++it) {
+
+                const Index pos = it.pos();
+
+                // Only voxels with a distance above 0.75 are examined.
+                if (!(values[pos] > 0.75)) continue;
+
+                // Flag the neighbours that share this leaf's buffer.
+                maskNodeInternalNeighbours<LeafNodeType>(pos, insideNode);
+
+                // In-node neighbours first, then the remaining ones through the accessor.
+                if (checkNeighbours<Comp, LeafNodeType>(pos, values, insideNode)) continue;
+                if (checkNeighbours<Comp>(distLeaf.offsetToGlobalCoord(pos), distAcc, insideNode)) continue;
+
+                distLeaf.setValueOff(pos);
+                idxLeaf->setValueOff(pos);
+            }
+        }
+    }
+
+    LeafNodeType   * * const mNodes;
+    TreeType         * const mDistTree;
+    Int32TreeType    * const mIndexTree;
+}; // RemoveSelfIntersectingSurface
+
+
+////////////////////////////////////////
+
+
+/// @brief TBB range body that switches off the entire child mask of each node in
+/// the range.
+/// @note Only the mask is modified (through a const_cast of getChildMask()); nothing
+/// is deleted here.
+template<typename NodeType>
+struct ReleaseChildNodes
+{
+    ReleaseChildNodes(NodeType ** nodes) : mNodes(nodes) {}
+
+    void operator()(const tbb::blocked_range<size_t>& range) const
+    {
+        typedef typename NodeType::NodeMaskType NodeMaskType;
+
+        size_t n = range.begin();
+        const size_t last = range.end();
+
+        while (n < last) {
+            NodeMaskType& childMask = const_cast<NodeMaskType&>(mNodes[n]->getChildMask());
+            childMask.setOff();
+            ++n;
+        }
+    }
+
+    NodeType ** const mNodes;
+};
+
+
+/// @brief Clear, in parallel, the child masks of all nodes of the type found at
+/// position 1 of the tree's node chain.
+/// @note Presumably position 1 is the internal node level directly above the leaf
+/// nodes - confirm against the ordering of RootNodeType::NodeChainType.
+template<typename TreeType>
+inline void
+releaseLeafNodes(TreeType& tree)
+{
+    typedef typename TreeType::RootNodeType::NodeChainType                      NodeChainType;
+    typedef typename boost::mpl::at<NodeChainType, boost::mpl::int_<1> >::type  ParentNodeType;
+
+    std::vector<ParentNodeType*> parents;
+    tree.getNodes(parents);
+
+    ParentNodeType** const first = parents.empty() ? NULL : &parents[0];
+    const tbb::blocked_range<size_t> all(0, parents.size());
+
+    tbb::parallel_for(all, ReleaseChildNodes<ParentNodeType>(first));
+}
+
+
+/// @brief Task body that moves leaf nodes from one tree into another.
+///
+/// All leaf nodes are stolen from the rhs tree. A node whose origin is not yet
+/// occupied by a leaf in the lhs tree is added to the lhs tree; every other node is
+/// appended to the caller supplied list of overlapping nodes instead.
+template<typename TreeType>
+struct StealUniqueLeafNodes
+{
+    typedef typename TreeType::LeafNodeType LeafNodeType;
+
+    /// @param lhsTree           tree that receives the non-overlapping leaf nodes
+    /// @param rhsTree           tree whose leaf nodes are stolen
+    /// @param overlappingNodes  output list for stolen nodes that overlap an lhs leaf
+    StealUniqueLeafNodes(TreeType& lhsTree, TreeType& rhsTree,
+        std::vector<LeafNodeType*>& overlappingNodes)
+        : mLhsTree(&lhsTree)
+        , mRhsTree(&rhsTree)
+        , mNodes(&overlappingNodes)
+    {
+    }
+
+    void operator()() const
+    {
+        typedef typename std::vector<LeafNodeType*>::const_iterator NodeIter;
+
+        std::vector<LeafNodeType*> stolen;
+        stolen.reserve(mRhsTree->leafCount());
+        mRhsTree->stealNodes(stolen);
+
+        tree::ValueAccessor<TreeType> lhsAcc(*mLhsTree);
+
+        for (NodeIter it = stolen.begin(), end = stolen.end(); it != end; ++it) {
+            LeafNodeType* leaf = *it;
+
+            if (lhsAcc.probeLeaf(leaf->origin())) {
+                // lhs already has a leaf here: hand the node back to the caller.
+                mNodes->push_back(leaf);
+            } else {
+                lhsAcc.addLeaf(leaf);
+            }
+        }
+    }
+
+private:
+    TreeType * const mLhsTree;
+    TreeType * const mRhsTree;
+    std::vector<LeafNodeType*> * const mNodes;
+};
+
+
+/// @brief Merge the rhs distance/index trees into the lhs distance/index trees.
+///
+/// Leaf nodes are stolen from both rhs trees by two concurrent tasks; nodes that do
+/// not overlap an lhs leaf are inserted into the lhs trees directly. The remaining,
+/// overlapping nodes are combined in parallel by CombineLeafNodes.
+///
+/// @note The parallel range is sized by the overlapping distance nodes while both
+/// node arrays are passed along, i.e. the two lists are assumed to line up - confirm.
+template<typename DistTreeType, typename IndexTreeType>
+inline void
+combineData(DistTreeType& lhsDist, IndexTreeType& lhsIdx,
+    DistTreeType& rhsDist, IndexTreeType& rhsIdx)
+{
+    typedef typename DistTreeType::LeafNodeType     DistLeafNodeType;
+    typedef typename IndexTreeType::LeafNodeType    IndexLeafNodeType;
+
+    std::vector<DistLeafNodeType*>  overlappingDistNodes;
+    std::vector<IndexLeafNodeType*> overlappingIdxNodes;
+
+    // Steal unique leafnodes (one task per tree)
+    tbb::task_group tasks;
+    tasks.run(StealUniqueLeafNodes<DistTreeType>(lhsDist, rhsDist, overlappingDistNodes));
+    tasks.run(StealUniqueLeafNodes<IndexTreeType>(lhsIdx, rhsIdx, overlappingIdxNodes));
+    tasks.wait();
+
+    // Nothing left to combine unless both lists contain overlapping nodes.
+    if (overlappingDistNodes.empty() || overlappingIdxNodes.empty()) return;
+
+    // Combine overlapping leaf nodes
+    const tbb::blocked_range<size_t> overlapRange(0, overlappingDistNodes.size());
+    tbb::parallel_for(overlapRange,
+        CombineLeafNodes<DistTreeType>(lhsDist, lhsIdx, &overlappingDistNodes[0], &overlappingIdxNodes[0]));
+}
+
+/// @brief Working set used while voxelizing a mesh of triangles and/or quads: a
+/// (squared) distance tree, a closest primitive index tree and a primitive id tree
+/// that marks the voxels already visited for the primitive currently being
+/// rasterized, each paired with a value accessor.
+/// @note VoxelizePolygons keeps one instance per thread. Its flood fill only expands
+/// from voxels within 0.75 (squared distance) of the primitive, so only a narrow
+/// band of voxels in proximity to the mesh's surface receives distance values and
+/// primitive indices.
+template<typename TreeType>
+struct VoxelizationData
+{
+    typedef boost::scoped_ptr<VoxelizationData>                         Ptr;
+    typedef typename TreeType::ValueType                                ValueType;
+
+    typedef typename TreeType::template ValueConverter<Int32>::Type         Int32TreeType;
+    typedef typename TreeType::template ValueConverter<unsigned char>::Type UCharTreeType;
+
+    typedef tree::ValueAccessor<TreeType>       FloatTreeAcc;
+    typedef tree::ValueAccessor<Int32TreeType>  Int32TreeAcc;
+    typedef tree::ValueAccessor<UCharTreeType>  UCharTreeAcc;
+
+
+    /// Backgrounds: largest ValueType for distances, util::INVALID_IDX for primitive
+    /// indices and MaxPrimId for primitive ids.
+    VoxelizationData()
+        : distTree(std::numeric_limits<ValueType>::max())
+        , distAcc(distTree)
+        , indexTree(Int32(util::INVALID_IDX))
+        , indexAcc(indexTree)
+        , primIdTree(MaxPrimId)
+        , primIdAcc(primIdTree)
+        , mPrimCount(0)
+    {
+    }
+
+    // Each accessor is declared right after its tree so that the tree is
+    // constructed before the accessor that refers to it.
+    TreeType        distTree;
+    FloatTreeAcc    distAcc;
+
+    Int32TreeType   indexTree;
+    Int32TreeAcc    indexAcc;
+
+    UCharTreeType   primIdTree;
+    UCharTreeAcc    primIdAcc;
+
+    /// @brief Return the next primitive id.
+    /// @note When the counter has reached MaxPrimId, or the id tree holds more than
+    /// 1000 leaf nodes, the id tree is cleared and numbering restarts at zero.
+    unsigned char getNewPrimId()
+    {
+        const bool exhausted = (mPrimCount == MaxPrimId);
+
+        if (exhausted || primIdTree.leafCount() > 1000) {
+            primIdTree.clear();
+            mPrimCount = 0;
+        }
+
+        const unsigned char id = mPrimCount;
+        ++mPrimCount;
+        return id;
+    }
+
+private:
+
+    enum { MaxPrimId = 100 };
+
+    unsigned char mPrimCount;
+};
+
+
+/// @brief TBB range body that rasterizes the triangles and quads of a mesh into
+/// thread-local VoxelizationData (squared distance, closest primitive index).
+///
+/// Polygons with a vertex count other than 3 or 4 are ignored. Quads are processed
+/// as two triangles. When the mesh has fewer than SubTask::POLYGON_LIMIT polygons,
+/// large triangles are recursively split into four and voxelized by separate tasks.
+template<typename TreeType, typename MeshDataAdapter, typename Interrupter = util::NullInterrupter>
+class VoxelizePolygons
+{
+public:
+
+    typedef VoxelizationData<TreeType>                                          VoxelizationDataType;
+    typedef tbb::enumerable_thread_specific<typename VoxelizationDataType::Ptr> DataTable;
+
+    /// @param dataTable    thread-local storage that receives the voxelization results
+    /// @param mesh         adapter providing polygon count, vertex count and vertices
+    /// @param interrupter  optional; polled once per polygon, may be NULL
+    VoxelizePolygons(DataTable& dataTable,
+        const MeshDataAdapter& mesh,
+        Interrupter* interrupter = NULL)
+        : mDataTable(&dataTable)
+        , mMesh(&mesh)
+        , mInterrupter(interrupter)
+    {
+    }
+
+    /// @brief Voxelize the polygons with indices in @a range.
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+
+        // Fetch this thread's data and allocate it on first use.
+        typename VoxelizationDataType::Ptr& dataPtr = mDataTable->local();
+        if (!dataPtr) dataPtr.reset(new VoxelizationDataType());
+
+        Triangle prim;
+
+        for (size_t n = range.begin(), N = range.end(); n < N; ++n) {
+
+            // On interruption cancel the enclosing task group and stop this range.
+            if (this->wasInterrupted()) {
+                tbb::task::self().cancel_group_execution();
+                break;
+            }
+
+            const size_t numVerts = mMesh->vertexCount(n);
+
+            // rasterize triangles and quads.
+            if (numVerts == 3 || numVerts == 4) {
+
+                prim.index = Int32(n);
+
+                mMesh->getIndexSpacePoint(n, 0, prim.a);
+                mMesh->getIndexSpacePoint(n, 1, prim.b);
+                mMesh->getIndexSpacePoint(n, 2, prim.c);
+
+                evalTriangle(prim, *dataPtr);
+
+                // Second half of a quad: vertex 3 replaces vertex 1, giving (v0, v3, v2).
+                if (numVerts == 4) {
+                    mMesh->getIndexSpacePoint(n, 3, prim.b);
+                    evalTriangle(prim, *dataPtr);
+                }
+            }
+        }
+    }
+
+private:
+
+    /// True when an interrupter was supplied and reports an interruption.
+    bool wasInterrupted() const { return mInterrupter && mInterrupter->wasInterrupted(); }
+
+    /// Triangle in index space together with the index of the polygon it belongs to.
+    struct Triangle { Vec3d a, b, c; Int32 index; };
+
+    /// @brief Task that either voxelizes its triangle or subdivides it further.
+    struct SubTask
+    {
+        // Subdivision is only used below this polygon count (see evalTriangle) and
+        // stops once the running count, multiplied by 4 per level, reaches it.
+        enum { POLYGON_LIMIT = 1000 };
+
+        SubTask(const Triangle& prim, DataTable& dataTable, int subdivisionCount, size_t polygonCount)
+            : mLocalDataTable(&dataTable)
+            , mPrim(prim)
+            , mSubdivisionCount(subdivisionCount)
+            , mPolygonCount(polygonCount)
+        {
+        }
+
+        void operator()() const
+        {
+            if (mSubdivisionCount <= 0 || mPolygonCount >= POLYGON_LIMIT) {
+
+                // Voxelize into the data of whichever thread executes this task.
+                typename VoxelizationDataType::Ptr& dataPtr = mLocalDataTable->local();
+                if (!dataPtr) dataPtr.reset(new VoxelizationDataType());
+
+                voxelizeTriangle(mPrim, *dataPtr);
+
+            } else {
+                spawnTasks(mPrim, *mLocalDataTable, mSubdivisionCount, mPolygonCount);
+            }
+        }
+
+        DataTable * const mLocalDataTable;
+        Triangle    const mPrim;
+        int         const mSubdivisionCount;
+        size_t      const mPolygonCount;
+    }; // struct SubTask
+
+    /// @brief Number of subdivision levels for @a prim: the largest axis-aligned
+    /// extent of the triangle divided by twice the leaf node dimension, truncated.
+    inline static int evalSubdivisionCount(const Triangle& prim)
+    {
+        const double ax = prim.a[0], bx = prim.b[0], cx = prim.c[0];
+        const double dx = std::max(ax, std::max(bx, cx)) - std::min(ax, std::min(bx, cx));
+
+        const double ay = prim.a[1], by = prim.b[1], cy = prim.c[1];
+        const double dy = std::max(ay, std::max(by, cy)) - std::min(ay, std::min(by, cy));
+
+        const double az = prim.a[2], bz = prim.b[2], cz = prim.c[2];
+        const double dz = std::max(az, std::max(bz, cz)) - std::min(az, std::min(bz, cz));
+
+        return int(std::max(dx, std::max(dy, dz)) / double(TreeType::LeafNodeType::DIM * 2));
+    }
+
+    /// @brief Voxelize @a prim directly, or split it into sub-tasks when the mesh is
+    /// small (fewer than POLYGON_LIMIT polygons) and the triangle is large.
+    void evalTriangle(const Triangle& prim, VoxelizationDataType& data) const
+    {
+        const size_t polygonCount = mMesh->polygonCount();
+        const int subdivisionCount = polygonCount < SubTask::POLYGON_LIMIT ? evalSubdivisionCount(prim) : 0;
+
+        if (subdivisionCount <= 0) {
+            voxelizeTriangle(prim, data);
+        } else {
+            spawnTasks(prim, *mDataTable, subdivisionCount, polygonCount);
+        }
+    }
+
+    /// @brief Split @a mainPrim at its edge midpoints into four triangles, run one
+    /// SubTask per triangle and wait for all of them.
+    /// @note The sub-tasks receive @a subdivisionCount - 1 and @a polygonCount * 4.
+    static void spawnTasks(
+        const Triangle& mainPrim, DataTable& dataTable, int subdivisionCount, size_t polygonCount)
+    {
+        subdivisionCount -= 1;
+        polygonCount *= 4;
+
+        tbb::task_group tasks;
+
+        // Edge midpoints.
+        const Vec3d ac = (mainPrim.a + mainPrim.c) * 0.5;
+        const Vec3d bc = (mainPrim.b + mainPrim.c) * 0.5;
+        const Vec3d ab = (mainPrim.a + mainPrim.b) * 0.5;
+
+        // All sub-triangles keep the index of the original polygon.
+        Triangle prim;
+        prim.index = mainPrim.index;
+
+        prim.a = mainPrim.a;
+        prim.b = ab;
+        prim.c = ac;
+        tasks.run(SubTask(prim, dataTable, subdivisionCount, polygonCount));
+
+        prim.a = ab;
+        prim.b = bc;
+        prim.c = ac;
+        tasks.run(SubTask(prim, dataTable, subdivisionCount, polygonCount));
+
+        prim.a = ab;
+        prim.b = mainPrim.b;
+        prim.c = bc;
+        tasks.run(SubTask(prim, dataTable, subdivisionCount, polygonCount));
+
+        prim.a = ac;
+        prim.b = bc;
+        prim.c = mainPrim.c;
+        tasks.run(SubTask(prim, dataTable, subdivisionCount, polygonCount));
+
+        tasks.wait();
+    }
+
+    /// @brief Flood fill over 26-connected voxels, starting at the voxel containing
+    /// vertex a, updating distance and index data for every voxel that is reached.
+    /// @note The primitive id tree marks voxels already visited for this primitive;
+    /// the fill only continues from voxels for which computeDistance() returns true.
+    static void voxelizeTriangle(const Triangle& prim, VoxelizationDataType& data)
+    {
+        std::deque<Coord> coordList;
+        Coord ijk, nijk;
+
+        ijk = Coord::floor(prim.a);
+        coordList.push_back(ijk);
+
+        computeDistance(ijk, prim, data);
+
+        unsigned char primId = data.getNewPrimId();
+        data.primIdAcc.setValueOnly(ijk, primId);
+
+        while (!coordList.empty()) {
+            ijk = coordList.back();
+            coordList.pop_back();
+
+            for (Int32 i = 0; i < 26; ++i) {
+                nijk = ijk + util::COORD_OFFSETS[i];
+                if (primId != data.primIdAcc.getValue(nijk)) {
+                    data.primIdAcc.setValueOnly(nijk, primId);
+                    if(computeDistance(nijk, prim, data)) coordList.push_back(nijk);
+                }
+            }
+        }
+    }
+
+    /// @brief Compute the squared distance from voxel @a ijk to @a prim and store it,
+    /// together with the primitive index, if it is smaller than the stored value.
+    /// @return true unless the squared distance is greater than 0.75
+    static bool computeDistance(const Coord& ijk, const Triangle& prim, VoxelizationDataType& data)
+    {
+        Vec3d uvw, voxelCenter(ijk[0], ijk[1], ijk[2]);
+
+        typedef typename TreeType::ValueType ValueType;
+
+        const ValueType dist = ValueType((voxelCenter -
+                closestPointOnTriangleToPoint(prim.a, prim.c, prim.b, voxelCenter, uvw)).lengthSqr());
+
+        const ValueType oldDist = data.distAcc.getValue(ijk);
+
+        if (dist < oldDist) {
+            data.distAcc.setValue(ijk, dist);
+            data.indexAcc.setValue(ijk, prim.index);
+        } else if (math::isExactlyEqual(dist, oldDist)) {
+            // makes reduction deterministic when different polygons
+            // produce the same distance value.
+            data.indexAcc.setValueOnly(ijk, std::min(prim.index, data.indexAcc.getValue(ijk)));
+        }
+
+        return !(dist > 0.75); // true if the primitive intersects the voxel.
+    }
+
+    DataTable                 * const mDataTable;
+    MeshDataAdapter     const * const mMesh;
+    Interrupter               * const mInterrupter;
+}; // VoxelizePolygons
+
+
+////////////////////////////////////////
+
+
+/// @brief TBB range body that, for each boolean leaf node in the range, applies
+/// topologyDifference() with the leaf found at the same origin in the rhs tree.
+/// Leaf nodes without a counterpart in the rhs tree are left untouched.
+template<typename TreeType>
+struct DiffLeafNodeMask
+{
+    typedef typename tree::ValueAccessor<TreeType>  AccessorType;
+    typedef typename TreeType::LeafNodeType         LeafNodeType;
+
+    typedef typename TreeType::template ValueConverter<bool>::Type  BoolTreeType;
+    typedef typename BoolTreeType::LeafNodeType                     BoolLeafNodeType;
+
+    /// @param rhsTree   tree whose leaf topology is subtracted
+    /// @param lhsNodes  boolean leaf nodes to modify (indexed by the range)
+    DiffLeafNodeMask(const TreeType& rhsTree,
+        std::vector<BoolLeafNodeType*>& lhsNodes)
+        : mRhsTree(&rhsTree), mLhsNodes(lhsNodes.empty() ? NULL : &lhsNodes[0])
+    {
+    }
+
+    void operator()(const tbb::blocked_range<size_t>& range) const
+    {
+        tree::ValueAccessor<const TreeType> rhsAcc(*mRhsTree);
+
+        for (size_t n = range.begin(); n < range.end(); ++n) {
+
+            BoolLeafNodeType& lhsNode = *mLhsNodes[n];
+
+            const LeafNodeType* rhsNode = rhsAcc.probeConstLeaf(lhsNode.origin());
+            if (!rhsNode) continue;
+
+            lhsNode.topologyDifference(*rhsNode, false);
+        }
+    }
+
+private:
+    TreeType            const * const mRhsTree;
+    BoolLeafNodeType         ** const mLhsNodes;
+};
+
+
+/// @brief  TBB body that pairs up two equally indexed leaf node lists and calls
+///         @c topologyUnion() on each node of the first list with the node at
+///         the same index in the second list.
+template<typename LeafNodeTypeA, typename LeafNodeTypeB>
+struct UnionValueMasks
+{
+    /// @param nodesA  nodes that are modified (may be empty)
+    /// @param nodesB  nodes that are only read; indexed in lockstep with @a nodesA
+    UnionValueMasks(std::vector<LeafNodeTypeA*>& nodesA, std::vector<LeafNodeTypeB*>& nodesB)
+        : mNodesA(nodesA.empty() ? NULL : &nodesA[0])
+        , mNodesB(nodesB.empty() ? NULL : &nodesB[0])
+    {
+    }
+
+    /// Process the node pairs whose indices fall inside @a range.
+    void operator()(const tbb::blocked_range<size_t>& range) const
+    {
+        for (size_t idx = range.begin(); idx < range.end(); ++idx) {
+            LeafNodeTypeA& target = *mNodesA[idx];
+            const LeafNodeTypeB& source = *mNodesB[idx];
+            target.topologyUnion(source);
+        }
+    }
+
+private:
+    LeafNodeTypeA ** const  mNodesA;
+    LeafNodeTypeB ** const  mNodesB;
+};
+
+
+/// @brief  TBB reduction body that marks, in a boolean mask tree, every inactive
+///         face-adjacent neighbour of each active voxel found in a list of leaf nodes.
+///
+/// @details The instance built with the public constructor writes directly into
+///          the caller supplied mask tree. Instances created through the splitting
+///          constructor write into their own local tree, which join() merges back.
+template<typename TreeType>
+struct ConstructVoxelMask
+{
+    typedef typename TreeType::LeafNodeType                         LeafNodeType;
+
+    typedef typename TreeType::template ValueConverter<bool>::Type  BoolTreeType;
+    typedef typename BoolTreeType::LeafNodeType                     BoolLeafNodeType;
+
+    /// @param maskTree  tree that receives the mask
+    /// @param tree      tree queried for the active state of voxels that lie
+    ///                  outside the leaf node currently being visited
+    /// @param nodes     leaf nodes whose active voxels are visited (may be empty)
+    ConstructVoxelMask(BoolTreeType& maskTree, const TreeType& tree, std::vector<LeafNodeType*>& nodes)
+        : mTree(&tree)
+        , mNodes(nodes.empty() ? NULL : &nodes[0])
+        , mLocalMaskTree(false)
+        , mMaskTree(&maskTree)
+    {
+    }
+
+    /// Splitting constructor: shares the inputs of @a rhs but accumulates into
+    /// a private, initially empty mask tree.
+    ConstructVoxelMask(ConstructVoxelMask& rhs, tbb::split)
+        : mTree(rhs.mTree)
+        , mNodes(rhs.mNodes)
+        , mLocalMaskTree(false)
+        , mMaskTree(&mLocalMaskTree)
+    {
+    }
+
+    /// Process the leaf nodes whose indices fall inside @a range.
+    void operator()(const tbb::blocked_range<size_t>& range)
+    {
+        typedef typename LeafNodeType::ValueOnCIter Iterator;
+
+        tree::ValueAccessor<const TreeType> acc(*mTree);
+        tree::ValueAccessor<BoolTreeType> maskAcc(*mMaskTree);
+
+        // Linear-offset stride of a unit step along x, y and z inside a leaf node.
+        const Index stride[3] = {
+            LeafNodeType::DIM * LeafNodeType::DIM, LeafNodeType::DIM, 1 };
+
+        // Largest local coordinate inside a leaf node.
+        const int lastLocal = int(LeafNodeType::DIM - 1);
+
+        Coord ijk, nijk, localCoord;
+        Index pos, npos;
+
+        for (size_t n = range.begin(); n != range.end(); ++n) {
+
+            LeafNodeType& node = *mNodes[n];
+
+            BoolLeafNodeType& maskNode = *maskAcc.touchLeaf(node.origin());
+
+            for (Iterator it = node.cbeginValueOn(); it; ++it) {
+                ijk = it.getCoord();
+                pos = it.pos();
+
+                localCoord = LeafNodeType::offsetToLocalCoord(pos);
+
+                // Visit the positive and negative neighbour along z, then y, then x.
+                // Neighbours inside this leaf are tested and marked through the leaf
+                // nodes directly; neighbours in adjacent leaves go through the accessors.
+                for (int axis = 2; axis >= 0; --axis) {
+
+                    if (localCoord[axis] < lastLocal) {
+                        npos = pos + stride[axis];
+                        if (!node.isValueOn(npos)) maskNode.setValueOn(npos);
+                    } else {
+                        nijk = ijk;
+                        nijk[axis] += 1;
+                        if (!acc.isValueOn(nijk)) maskAcc.setValueOn(nijk);
+                    }
+
+                    if (localCoord[axis] > 0) {
+                        npos = pos - stride[axis];
+                        if (!node.isValueOn(npos)) maskNode.setValueOn(npos);
+                    } else {
+                        nijk = ijk;
+                        nijk[axis] -= 1;
+                        if (!acc.isValueOn(nijk)) maskAcc.setValueOn(nijk);
+                    }
+                }
+            }
+        }
+    }
+
+    /// Merge the mask accumulated by @a rhs into this instance's mask tree.
+    void join(ConstructVoxelMask& rhs) { mMaskTree->merge(*rhs.mMaskTree); }
+
+private:
+    TreeType        const   * const mTree;
+    LeafNodeType           ** const mNodes;
+
+    BoolTreeType         mLocalMaskTree;    // only written to by split copies
+    BoolTreeType * const mMaskTree;         // points at the caller's tree or at mLocalMaskTree
+};
+
+
+/// @brief  TBB reduction body that, for every voxel flagged in a list of mask leaf
+///         nodes, recomputes the distance to the closest mesh primitive referenced
+///         by nearby voxels and writes it (plus the primitive index) into the
+///         distance and index leaf nodes when it falls inside the band width.
+///
+/// @details Leaf nodes that do not exist yet in the distance / index trees are
+///          allocated here and collected in newDistNodes() / newIndexNodes();
+///          the caller is responsible for inserting them into the trees.
+///          Already existing leaf nodes that were modified are collected in
+///          updatedDistNodes() / updatedIndexNodes(). Voxels that should be
+///          visited by the next expansion step are recorded in newMaskTree().
+///
+/// @note The interior and exterior widths should be in world space units and squared.
+/// @note NOTE(review): computeDistance() returns a square-rooted distance scaled
+///       by the voxel size, and updateVoxel() compares that value directly with
+///       the band widths, so the "squared" wording above looks stale - confirm
+///       against the caller.
+template<typename TreeType, typename MeshDataAdapter>
+struct ExpandNarrowband
+{
+    typedef typename TreeType::ValueType                            ValueType;
+    typedef typename TreeType::LeafNodeType                         LeafNodeType;
+    typedef typename LeafNodeType::NodeMaskType                     NodeMaskType;
+    typedef typename TreeType::template ValueConverter<Int32>::Type Int32TreeType;
+    typedef typename Int32TreeType::LeafNodeType                    Int32LeafNodeType;
+    typedef typename TreeType::template ValueConverter<bool>::Type  BoolTreeType;
+    typedef typename BoolTreeType::LeafNodeType                     BoolLeafNodeType;
+
+    /// Record of one active voxel gathered from the index / distance trees:
+    /// the primitive index stored at the voxel, the voxel coordinate and the
+    /// absolute value of the stored distance.
+    struct Fragment
+    {
+        Int32 idx, x, y, z;
+        ValueType dist;
+
+        Fragment() : idx(0), x(0), y(0), z(0), dist(0.0) {}
+
+        Fragment(Int32 idx_, Int32 x_, Int32 y_, Int32 z_, ValueType dist_)
+            : idx(idx_), x(x_), y(y_), z(z_), dist(dist_)
+        {
+        }
+
+        /// Orders fragments by primitive index only.
+        bool operator<(const Fragment& rhs) const { return idx < rhs.idx; }
+    }; // struct Fragment
+
+    ////////////////////
+
+    /// @param maskNodes          leaf nodes of @a maskTree to process (may be empty)
+    /// @param maskTree           mask tree the nodes belong to (stored, not read here)
+    /// @param distTree           distance tree that is read and updated
+    /// @param indexTree          primitive index tree that is read and updated
+    /// @param mesh               adapter used to fetch index-space primitive vertices
+    /// @param exteriorBandWidth  upper limit for distances of non-negative voxels
+    /// @param interiorBandWidth  upper limit for distances of negative voxels
+    /// @param voxelSize          scale applied to the computed index-space distance
+    ExpandNarrowband(
+        std::vector<BoolLeafNodeType*>& maskNodes,
+        BoolTreeType& maskTree,
+        TreeType& distTree,
+        Int32TreeType& indexTree,
+        const MeshDataAdapter& mesh,
+        ValueType exteriorBandWidth,
+        ValueType interiorBandWidth,
+        ValueType voxelSize)
+        : mMaskNodes(maskNodes.empty() ? NULL : &maskNodes[0])
+        , mMaskTree(&maskTree)
+        , mDistTree(&distTree)
+        , mIndexTree(&indexTree)
+        , mMesh(&mesh)
+        , mNewMaskTree(false)
+        , mDistNodes()
+        , mUpdatedDistNodes()
+        , mIndexNodes()
+        , mUpdatedIndexNodes()
+        , mExteriorBandWidth(exteriorBandWidth)
+        , mInteriorBandWidth(interiorBandWidth)
+        , mVoxelSize(voxelSize)
+    {
+    }
+
+    /// Splitting constructor: shares the inputs of @a rhs and starts with empty
+    /// output lists and an empty new-mask tree.
+    ExpandNarrowband(const ExpandNarrowband& rhs, tbb::split)
+        : mMaskNodes(rhs.mMaskNodes)
+        , mMaskTree(rhs.mMaskTree)
+        , mDistTree(rhs.mDistTree)
+        , mIndexTree(rhs.mIndexTree)
+        , mMesh(rhs.mMesh)
+        , mNewMaskTree(false)
+        , mDistNodes()
+        , mUpdatedDistNodes()
+        , mIndexNodes()
+        , mUpdatedIndexNodes()
+        , mExteriorBandWidth(rhs.mExteriorBandWidth)
+        , mInteriorBandWidth(rhs.mInteriorBandWidth)
+        , mVoxelSize(rhs.mVoxelSize)
+    {
+    }
+
+    /// Append the output lists of @a rhs to this instance and merge its new-mask tree.
+    void join(ExpandNarrowband& rhs)
+    {
+        mDistNodes.insert(mDistNodes.end(), rhs.mDistNodes.begin(), rhs.mDistNodes.end());
+        mIndexNodes.insert(mIndexNodes.end(), rhs.mIndexNodes.begin(), rhs.mIndexNodes.end());
+
+        mUpdatedDistNodes.insert(mUpdatedDistNodes.end(),
+            rhs.mUpdatedDistNodes.begin(), rhs.mUpdatedDistNodes.end());
+
+        mUpdatedIndexNodes.insert(mUpdatedIndexNodes.end(),
+            rhs.mUpdatedIndexNodes.begin(), rhs.mUpdatedIndexNodes.end());
+
+        mNewMaskTree.merge(rhs.mNewMaskTree);
+    }
+
+    /// Process the mask leaf nodes whose indices fall inside @a range.
+    void operator()(const tbb::blocked_range<size_t>& range)
+    {
+        tree::ValueAccessor<BoolTreeType>   newMaskAcc(mNewMaskTree);
+        tree::ValueAccessor<TreeType>       distAcc(*mDistTree);
+        tree::ValueAccessor<Int32TreeType>  indexAcc(*mIndexTree);
+
+        std::vector<Fragment> fragments;
+        fragments.reserve(256);
+
+        // Scratch leaf nodes used when the trees have no leaf at the mask node's
+        // origin. They are reused across iterations until one of them is actually
+        // written to and exported.
+        LeafNodeType      * newDistNodePt = NULL;
+        Int32LeafNodeType * newIndexNodePt = NULL;
+
+        for (size_t n = range.begin(), N = range.end(); n < N; ++n) {
+
+            BoolLeafNodeType& maskNode = *mMaskNodes[n];
+            if (maskNode.isEmpty()) continue;
+
+            // Setup local caches
+
+            const Coord& origin = maskNode.origin();
+
+            LeafNodeType      * distNodePt = distAcc.probeLeaf(origin);
+            Int32LeafNodeType * indexNodePt = indexAcc.probeLeaf(origin);
+
+            // The distance and index trees are expected to have matching leaf topology.
+            assert(!distNodePt == !indexNodePt);
+
+            bool usingNewNodes = false;
+
+            if (!distNodePt && !indexNodePt) {
+
+                const ValueType backgroundDist = distAcc.getValue(origin);
+
+                if (!newDistNodePt && !newIndexNodePt) {
+                    newDistNodePt = new LeafNodeType(origin, backgroundDist);
+                    newIndexNodePt = new Int32LeafNodeType(origin, indexAcc.getValue(origin));
+                } else {
+
+                    // Reusing the scratch nodes: refill the distance buffer only when
+                    // the sign of the value at this origin differs from the one the
+                    // scratch node currently holds.
+                    if ((backgroundDist < ValueType(0.0)) !=
+                            (newDistNodePt->getValue(0) < ValueType(0.0))) {
+                        newDistNodePt->buffer().fill(backgroundDist);
+                    }
+
+                    newDistNodePt->setOrigin(origin);
+                    newIndexNodePt->setOrigin(origin);
+                }
+
+                distNodePt = newDistNodePt;
+                indexNodePt = newIndexNodePt;
+
+                usingNewNodes = true;
+            }
+
+
+            // Gather neighbour information
+
+            CoordBBox bbox(Coord::max(), Coord::min());
+            for (typename BoolLeafNodeType::ValueOnIter it = maskNode.beginValueOn(); it; ++it) {
+                bbox.expand(it.getCoord());
+            }
+
+            bbox.expand(1);
+
+            gatherFragments(fragments, bbox, distAcc, indexAcc);
+
+
+            // Compute first voxel layer
+
+            bbox = maskNode.getNodeBoundingBox();
+            NodeMaskType mask;
+            bool updatedLeafNodes = false;
+
+            for (typename BoolLeafNodeType::ValueOnIter it = maskNode.beginValueOn(); it; ++it) {
+
+                const Coord ijk = it.getCoord();
+
+                if (updateVoxel(ijk, 5, fragments, *distNodePt, *indexNodePt, &updatedLeafNodes)) {
+
+                    // First six offsets: neighbours inside this leaf go into the local
+                    // second-layer mask, the others into the mask for the next step.
+                    for (Int32 i = 0; i < 6; ++i) {
+                        const Coord nijk = ijk + util::COORD_OFFSETS[i];
+                        if (bbox.isInside(nijk)) {
+                            mask.setOn(BoolLeafNodeType::coordToOffset(nijk));
+                        } else  {
+                            newMaskAcc.setValueOn(nijk);
+                        }
+                    }
+
+                    // Remaining offsets: only neighbours inside this leaf are recorded.
+                    for (Int32 i = 6; i < 26; ++i) {
+                        const Coord nijk = ijk + util::COORD_OFFSETS[i];
+                        if (bbox.isInside(nijk)) {
+                            mask.setOn(BoolLeafNodeType::coordToOffset(nijk));
+                        }
+                    }
+                }
+            }
+
+            if (updatedLeafNodes) {
+
+                // Compute second voxel layer
+                // (skip voxels that already carry an active primitive index)
+                mask -= indexNodePt->getValueMask();
+
+                for (typename NodeMaskType::OnIterator it = mask.beginOn(); it; ++it) {
+
+                    const Index pos = it.pos();
+                    const Coord ijk = maskNode.origin() + LeafNodeType::offsetToLocalCoord(pos);
+
+                    if (updateVoxel(ijk, 6, fragments, *distNodePt, *indexNodePt)) {
+                        for (Int32 i = 0; i < 6; ++i) {
+                            newMaskAcc.setValueOn(ijk + util::COORD_OFFSETS[i]);
+                        }
+                    }
+                }
+
+                // Export new distance values
+                if (usingNewNodes) {
+                    distNodePt->topologyUnion(*indexNodePt);
+
+                    mDistNodes.push_back(distNodePt);
+                    mIndexNodes.push_back(indexNodePt);
+
+                    // Ownership moved to the output lists; force a fresh allocation
+                    // the next time scratch nodes are needed.
+                    newDistNodePt = NULL;
+                    newIndexNodePt = NULL;
+                } else {
+                    mUpdatedDistNodes.push_back(distNodePt);
+                    mUpdatedIndexNodes.push_back(indexNodePt);
+                }
+            }
+        } // end leafnode loop
+
+        // Scratch nodes that were allocated but never exported are still owned
+        // here; release them so they do not leak (deleting NULL is a no-op).
+        delete newDistNodePt;
+        delete newIndexNodePt;
+    }
+
+    //////////
+
+    /// Mask of the voxels flagged for the next expansion step.
+    BoolTreeType& newMaskTree() { return mNewMaskTree; }
+
+    /// Newly allocated distance leaf nodes (not yet part of the distance tree).
+    std::vector<LeafNodeType*>& newDistNodes() { return mDistNodes; }
+    /// Existing distance leaf nodes that were modified.
+    std::vector<LeafNodeType*>& updatedDistNodes() { return mUpdatedDistNodes; }
+
+    /// Newly allocated index leaf nodes (not yet part of the index tree).
+    std::vector<Int32LeafNodeType*>& newIndexNodes() { return mIndexNodes; }
+    /// Existing index leaf nodes that were modified.
+    std::vector<Int32LeafNodeType*>& updatedIndexNodes() { return mUpdatedIndexNodes; }
+
+private:
+
+    /// @brief  Collect a fragment for every active voxel inside @a bbox, visiting
+    ///         each leaf node of the distance tree that overlaps the box.
+    /// @note   The output fragment list is ordered by the primitive index
+    void
+    gatherFragments(std::vector<Fragment>& fragments, const CoordBBox& bbox,
+        tree::ValueAccessor<TreeType>& distAcc, tree::ValueAccessor<Int32TreeType>& indexAcc)
+    {
+        fragments.clear();
+
+        // Snap the box corners to leaf node origins.
+        const Coord nodeMin = bbox.min() & ~(LeafNodeType::DIM - 1);
+        const Coord nodeMax = bbox.max() & ~(LeafNodeType::DIM - 1);
+
+        CoordBBox region;
+        Coord ijk;
+
+        for (ijk[0] = nodeMin[0]; ijk[0] <= nodeMax[0]; ijk[0] += LeafNodeType::DIM) {
+            for (ijk[1] = nodeMin[1]; ijk[1] <= nodeMax[1]; ijk[1] += LeafNodeType::DIM) {
+                for (ijk[2] = nodeMin[2]; ijk[2] <= nodeMax[2]; ijk[2] += LeafNodeType::DIM) {
+                    if (LeafNodeType* distleaf = distAcc.probeLeaf(ijk)) {
+                        // Clip the query box to this leaf node.
+                        region.min() = Coord::maxComponent(bbox.min(), ijk);
+                        region.max() = Coord::minComponent(bbox.max(),
+                            ijk.offsetBy(LeafNodeType::DIM - 1));
+                        // The index tree is dereferenced without a null check, i.e. it
+                        // is assumed to have a leaf wherever the distance tree has one.
+                        gatherFragments(fragments, region, *distleaf, *indexAcc.probeLeaf(ijk));
+                    }
+                }
+            }
+        }
+
+        std::sort(fragments.begin(), fragments.end());
+    }
+
+    /// @brief  Append a fragment for every voxel of @a distLeaf that is active and
+    ///         lies inside @a bbox, reading the primitive index from @a idxLeaf.
+    void
+    gatherFragments(std::vector<Fragment>& fragments, const CoordBBox& bbox,
+        const LeafNodeType& distLeaf, const Int32LeafNodeType& idxLeaf) const
+    {
+        const typename LeafNodeType::NodeMaskType& mask = distLeaf.getValueMask();
+        const ValueType* distData = distLeaf.buffer().data();
+        const Int32* idxData = idxLeaf.buffer().data();
+
+        for (int x = bbox.min()[0]; x <= bbox.max()[0]; ++x) {
+            const Index xPos = (x & (LeafNodeType::DIM - 1u)) << (2 * LeafNodeType::LOG2DIM);
+            for (int y = bbox.min()[1]; y <= bbox.max()[1]; ++y) {
+                const Index yPos = xPos + ((y & (LeafNodeType::DIM - 1u)) << LeafNodeType::LOG2DIM);
+                for (int z = bbox.min()[2]; z <= bbox.max()[2]; ++z) {
+                    const Index pos = yPos + (z & (LeafNodeType::DIM - 1u));
+                    if (mask.isOn(pos)) {
+                        fragments.push_back(Fragment(idxData[pos],x,y,z, std::abs(distData[pos])));
+                    }
+                }
+            }
+        }
+    }
+
+    /// @brief  Return the distance from voxel @a ijk to the closest primitive among
+    ///         the fragments within @a manhattanLimit voxels (Manhattan distance),
+    ///         square-rooted and scaled by the voxel size.
+    /// @param[out] closestPrimIdx  set to the index of the closest primitive; left
+    ///                             unchanged when no fragment qualifies.
+    /// @note   This method expects the fragment list to be ordered by the primitive index
+    ///         to avoid redundant distance computations.
+    ValueType
+    computeDistance(const Coord& ijk, const Int32 manhattanLimit,
+        const std::vector<Fragment>& fragments, Int32& closestPrimIdx) const
+    {
+        Vec3d a, b, c, uvw, voxelCenter(ijk[0], ijk[1], ijk[2]);
+        double primDist, tmpDist, dist = std::numeric_limits<double>::max();
+        Int32 lastIdx = Int32(util::INVALID_IDX);
+
+        for (size_t n = 0, N = fragments.size(); n < N; ++n) {
+
+            const Fragment& fragment = fragments[n];
+            // Same primitive as the one evaluated last: nothing new to compute.
+            if (lastIdx == fragment.idx) continue;
+
+            const Int32 dx = std::abs(fragment.x - ijk[0]);
+            const Int32 dy = std::abs(fragment.y - ijk[1]);
+            const Int32 dz = std::abs(fragment.z - ijk[2]);
+
+            const Int32 manhattan = dx + dy + dz;
+            if (manhattan > manhattanLimit) continue;
+
+            lastIdx = fragment.idx;
+
+            const size_t polygon = size_t(lastIdx);
+
+            mMesh->getIndexSpacePoint(polygon, 0, a);
+            mMesh->getIndexSpacePoint(polygon, 1, b);
+            mMesh->getIndexSpacePoint(polygon, 2, c);
+
+            primDist = (voxelCenter -
+                closestPointOnTriangleToPoint(a, c, b, voxelCenter, uvw)).lengthSqr();
+
+            // Split quad into a second triangle
+            if (4 == mMesh->vertexCount(polygon)) {
+
+                mMesh->getIndexSpacePoint(polygon, 3, b);
+
+                tmpDist = (voxelCenter - closestPointOnTriangleToPoint(
+                    a, b, c, voxelCenter, uvw)).lengthSqr();
+
+                if (tmpDist < primDist) primDist = tmpDist;
+            }
+
+            if (primDist < dist) {
+                dist = primDist;
+                closestPrimIdx = lastIdx;
+            }
+        }
+
+        return ValueType(std::sqrt(dist)) * mVoxelSize;
+    }
+
+    /// @brief  Recompute the distance at voxel @a ijk and, when it is below the
+    ///         band width matching the sign currently stored in @a distLeaf, store
+    ///         it (keeping that sign) and activate the voxel in @a idxLeaf with the
+    ///         closest primitive index.
+    /// @param[out] updatedLeafNodes  optional; set to true when the voxel was written.
+    /// @note   Returns true if the current voxel was updated and neighbouring
+    ///         voxels need to be evaluated.
+    bool
+    updateVoxel(const Coord& ijk, const Int32 manhattanLimit,
+        const std::vector<Fragment>& fragments,
+        LeafNodeType& distLeaf, Int32LeafNodeType& idxLeaf, bool* updatedLeafNodes = NULL)
+    {
+        Int32 closestPrimIdx = 0;
+        const ValueType distance = computeDistance(ijk, manhattanLimit, fragments, closestPrimIdx);
+
+        const Index pos = LeafNodeType::coordToOffset(ijk);
+        const bool inside = distLeaf.getValue(pos) < ValueType(0.0);
+
+        bool activateNeighbourVoxels = false;
+
+        if (!inside && distance < mExteriorBandWidth) {
+            if (updatedLeafNodes) *updatedLeafNodes = true;
+            // Neighbours are only worth visiting if one more voxel step stays in the band.
+            activateNeighbourVoxels = (distance + mVoxelSize) < mExteriorBandWidth;
+            distLeaf.setValueOnly(pos, distance);
+            idxLeaf.setValueOn(pos, closestPrimIdx);
+        } else if (inside && distance < mInteriorBandWidth) {
+            if (updatedLeafNodes) *updatedLeafNodes = true;
+            activateNeighbourVoxels = (distance + mVoxelSize) < mInteriorBandWidth;
+            distLeaf.setValueOnly(pos, -distance);
+            idxLeaf.setValueOn(pos, closestPrimIdx);
+        }
+
+        return activateNeighbourVoxels;
+    }
+
+    //////////
+
+    BoolLeafNodeType     ** const mMaskNodes;
+    BoolTreeType          * const mMaskTree;
+    TreeType              * const mDistTree;
+    Int32TreeType         * const mIndexTree;
+
+    MeshDataAdapter const * const mMesh;
+
+    BoolTreeType mNewMaskTree;
+
+    std::vector<LeafNodeType*> mDistNodes, mUpdatedDistNodes;
+    std::vector<Int32LeafNodeType*> mIndexNodes, mUpdatedIndexNodes;
+
+    const ValueType mExteriorBandWidth, mInteriorBandWidth, mVoxelSize;
+}; // struct ExpandNarrowband
+
+
+/// @brief  Nullary task functor that inserts every leaf node of a list into a
+///         tree by calling @c addLeaf() through a value accessor.
+template<typename TreeType>
+struct AddNodes {
+    typedef typename TreeType::LeafNodeType LeafNodeType;
+
+    /// @param tree   tree that receives the leaf nodes
+    /// @param nodes  leaf nodes to add; the vector must outlive this functor
+    AddNodes(TreeType& tree, std::vector<LeafNodeType*>& nodes)
+        : mTree(&tree)
+        , mNodes(&nodes)
+    {
+    }
+
+    /// Add all nodes to the tree, in list order.
+    void operator()() const
+    {
+        typedef typename std::vector<LeafNodeType*>::const_iterator NodeIter;
+
+        tree::ValueAccessor<TreeType> treeAcc(*mTree);
+
+        for (NodeIter it = mNodes->begin(), last = mNodes->end(); it != last; ++it) {
+            treeAcc.addLeaf(*it);
+        }
+    }
+
+    TreeType                   * const mTree;
+    std::vector<LeafNodeType*> * const mNodes;
+}; // AddNodes
+
+
+/// @brief  Run one narrow-band expansion step.
+///
+/// @details Executes ExpandNarrowband over all @a maskNodes, unions the value
+///          masks of the updated distance nodes with their index nodes, adds the
+///          newly allocated leaf nodes to @a distTree and @a indexTree (the two
+///          insertions run as separate tasks), and finally replaces the content
+///          of @a maskTree with the mask produced for the next step.
+template<typename TreeType, typename Int32TreeType, typename BoolTreeType, typename MeshDataAdapter>
+inline void
+expandNarrowband(
+    TreeType& distTree,
+    Int32TreeType& indexTree,
+    BoolTreeType& maskTree,
+    std::vector<typename BoolTreeType::LeafNodeType*>& maskNodes,
+    const MeshDataAdapter& mesh,
+    typename TreeType::ValueType exteriorBandWidth,
+    typename TreeType::ValueType interiorBandWidth,
+    typename TreeType::ValueType voxelSize)
+{
+    typedef typename TreeType::LeafNodeType                 DistLeafNodeType;
+    typedef typename Int32TreeType::LeafNodeType            IndexLeafNodeType;
+    typedef ExpandNarrowband<TreeType, MeshDataAdapter>     ExpandOpType;
+    typedef UnionValueMasks<DistLeafNodeType, IndexLeafNodeType> UnionOpType;
+
+    // Recompute distances for the masked voxels.
+    ExpandOpType op(maskNodes, maskTree, distTree, indexTree,
+        mesh, exteriorBandWidth, interiorBandWidth, voxelSize);
+
+    const tbb::blocked_range<size_t> maskRange(0, maskNodes.size());
+    tbb::parallel_reduce(maskRange, op);
+
+    // Bring the value masks of the modified, pre-existing nodes back in sync.
+    const tbb::blocked_range<size_t> updatedRange(0, op.updatedIndexNodes().size());
+    tbb::parallel_for(updatedRange,
+        UnionOpType(op.updatedDistNodes(), op.updatedIndexNodes()));
+
+    // Insert the freshly allocated leaf nodes; one task per tree.
+    tbb::task_group insertTasks;
+    insertTasks.run(AddNodes<TreeType>(distTree, op.newDistNodes()));
+    insertTasks.run(AddNodes<Int32TreeType>(indexTree, op.newIndexNodes()));
+    insertTasks.wait();
+
+    // Swap in the mask for the next expansion step.
+    maskTree.clear();
+    maskTree.merge(op.newMaskTree());
+}
+
+
+////////////////////////////////////////
+
+
+// Transform values (sqrt, world space scaling and sign flip if sdf)
+//
+// Every active value v of the given leaf nodes is replaced in place by
+// s * voxelSize * sqrt(|v|), where s is +1 when the unsigned flag is set or
+// v is negative, and -1 otherwise.
+template<typename TreeType>
+struct TransformValues
+{
+    typedef typename TreeType::LeafNodeType   LeafNodeType;
+    typedef typename TreeType::ValueType      ValueType;
+
+    /// @param nodes         leaf nodes to transform in place (may be empty)
+    /// @param voxelSize     scale factor applied to the square-rooted values
+    /// @param unsignedDist  if true, all results are non-negative
+    TransformValues(std::vector<LeafNodeType*>& nodes,
+        ValueType voxelSize, bool unsignedDist)
+        // Guard against an empty vector: indexing element 0 of an empty
+        // std::vector is undefined behaviour.
+        : mNodes(nodes.empty() ? NULL : &nodes[0])
+        , mVoxelSize(voxelSize)
+        , mUnsigned(unsignedDist)
+    {
+    }
+
+    /// Process the leaf nodes whose indices fall inside @a range.
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+
+        typename LeafNodeType::ValueOnIter iter;
+
+        const bool udf = mUnsigned;
+        // Indexed by a bool: w[0] is the negative scale, w[1] the positive one.
+        const ValueType w[2] = { -mVoxelSize, mVoxelSize };
+
+        for (size_t n = range.begin(), N = range.end(); n < N; ++n) {
+
+            for (iter = mNodes[n]->beginValueOn(); iter; ++iter) {
+                // The iterator only exposes a const reference; the value is
+                // overwritten in place through a const_cast.
+                ValueType& val = const_cast<ValueType&>(iter.getValue());
+                val = w[udf || (val < ValueType(0.0))] * std::sqrt(std::abs(val));
+            }
+        }
+    }
+
+private:
+    LeafNodeType * * const  mNodes;
+    const ValueType         mVoxelSize;
+    const bool              mUnsigned;
+};
+
+
+// Inactivate values outside the (exBandWidth, inBandWidth) range.
+//
+// Active negative values that are not greater than -inBandWidth are set to
+// -inBandWidth and deactivated; active non-negative values that are not less
+// than exBandWidth are set to exBandWidth and deactivated.
+template<typename TreeType>
+struct InactivateValues
+{
+    typedef typename TreeType::LeafNodeType   LeafNodeType;
+    typedef typename TreeType::ValueType      ValueType;
+
+    /// @param nodes        leaf nodes to process in place (may be empty)
+    /// @param exBandWidth  limit applied to non-negative values
+    /// @param inBandWidth  limit (given as a magnitude) applied to negative values
+    InactivateValues(std::vector<LeafNodeType*>& nodes,
+        ValueType exBandWidth, ValueType inBandWidth)
+        : mNodes(nodes.empty() ? NULL : &nodes[0])
+        , mExBandWidth(exBandWidth)
+        , mInBandWidth(inBandWidth)
+    {
+    }
+
+    /// Process the leaf nodes whose indices fall inside @a range.
+    void operator()(const tbb::blocked_range<size_t>& range) const
+    {
+        typedef typename LeafNodeType::ValueOnIter ActiveIter;
+
+        const ValueType outerLimit = mExBandWidth;
+        const ValueType innerLimit = -mInBandWidth;
+
+        for (size_t idx = range.begin(); idx < range.end(); ++idx) {
+
+            for (ActiveIter it = mNodes[idx]->beginValueOn(); it; ++it) {
+
+                // Values are clamped in place through the iterator's const reference.
+                ValueType& value = const_cast<ValueType&>(it.getValue());
+
+                if (value < ValueType(0.0)) {
+                    // Negated comparison kept on purpose (matches the original test).
+                    if (!(value > innerLimit)) {
+                        value = innerLimit;
+                        it.setValueOff();
+                    }
+                } else if (!(value < outerLimit)) {
+                    value = outerLimit;
+                    it.setValueOff();
+                }
+            }
+        }
+    }
+
+private:
+    LeafNodeType * * const mNodes;
+    const ValueType mExBandWidth, mInBandWidth;
+};
+
+
+/// @brief  TBB body that adds a constant offset to every active value of the
+///         given leaf nodes, in place.
+template<typename TreeType>
+struct OffsetValues
+{
+    typedef typename TreeType::LeafNodeType   LeafNodeType;
+    typedef typename TreeType::ValueType      ValueType;
+
+    /// @param nodes   leaf nodes to modify (may be empty)
+    /// @param offset  amount added to each active value
+    OffsetValues(std::vector<LeafNodeType*>& nodes, ValueType offset)
+        : mNodes(nodes.empty() ? NULL : &nodes[0])
+        , mOffset(offset)
+    {
+    }
+
+    /// Process the leaf nodes whose indices fall inside @a range.
+    void operator()(const tbb::blocked_range<size_t>& range) const
+    {
+        typedef typename LeafNodeType::ValueOnIter ActiveIter;
+
+        const ValueType delta = mOffset;
+
+        for (size_t idx = range.begin(); idx < range.end(); ++idx) {
+            for (ActiveIter it = mNodes[idx]->beginValueOn(); it; ++it) {
+                // Written through the iterator's const reference.
+                const_cast<ValueType&>(it.getValue()) += delta;
+            }
+        }
+    }
+
+private:
+    LeafNodeType * * const mNodes;
+    const ValueType mOffset;
+};
+
+
+/// @brief  TBB body that computes, for every active voxel of the given leaf
+///         nodes, a corrected value from one-sided differences to its six face
+///         neighbours and stores it in an external buffer.
+///
+/// @details The tree itself is only read. The result for voxel offset @c p of
+///          node @c n is written to <tt>buffer[n * LeafNodeType::SIZE + p]</tt>;
+///          buffer entries of inactive voxels are not touched.
+template<typename TreeType>
+struct Renormalize
+{
+    typedef typename TreeType::LeafNodeType     LeafNodeType;
+    typedef typename TreeType::ValueType        ValueType;
+
+    /// @param tree       tree queried for neighbouring values
+    /// @param nodes      leaf nodes whose active voxels are evaluated (may be empty)
+    /// @param buffer     output array, indexed as described above
+    /// @param voxelSize  grid spacing used to scale the gradient magnitude
+    Renormalize(const TreeType& tree, const std::vector<LeafNodeType*>& nodes, ValueType* buffer, ValueType voxelSize)
+        : mTree(&tree)
+        , mNodes(nodes.empty() ? NULL : &nodes[0])
+        , mBuffer(buffer)
+        , mVoxelSize(voxelSize)
+    {
+    }
+
+    /// Process the leaf nodes whose indices fall inside @a range.
+    void operator()(const tbb::blocked_range<size_t>& range) const
+    {
+        typedef math::Vec3<ValueType>                   Vec3Type;
+        typedef typename LeafNodeType::ValueOnCIter     ActiveIter;
+
+        tree::ValueAccessor<const TreeType> acc(*mTree);
+
+        const ValueType dx = mVoxelSize;
+        const ValueType invDx = ValueType(1.0) / mVoxelSize;
+
+        for (size_t idx = range.begin(); idx < range.end(); ++idx) {
+
+            // Slice of the output buffer that belongs to this leaf node.
+            ValueType* const nodeOut = mBuffer + idx * LeafNodeType::SIZE;
+
+            for (ActiveIter it = mNodes[idx]->cbeginValueOn(); it; ++it) {
+
+                const ValueType phi0 = *it;
+                const Coord ijk = it.getCoord();
+
+                // Forward differences.
+                Vec3Type up;
+                up[0] = acc.getValue(ijk.offsetBy(1, 0, 0)) - phi0;
+                up[1] = acc.getValue(ijk.offsetBy(0, 1, 0)) - phi0;
+                up[2] = acc.getValue(ijk.offsetBy(0, 0, 1)) - phi0;
+
+                // Backward differences.
+                Vec3Type down;
+                down[0] = phi0 - acc.getValue(ijk.offsetBy(-1, 0, 0));
+                down[1] = phi0 - acc.getValue(ijk.offsetBy(0, -1, 0));
+                down[2] = phi0 - acc.getValue(ijk.offsetBy(0, 0, -1));
+
+                const ValueType normSqGradPhi = math::GodunovsNormSqrd(phi0 > 0.0, down, up);
+
+                const ValueType diff = math::Sqrt(normSqGradPhi) * invDx - ValueType(1.0);
+                const ValueType S = phi0 / (math::Sqrt(math::Pow2(phi0) + normSqGradPhi));
+
+                nodeOut[it.pos()] = phi0 - dx * S * diff;
+            }
+        }
+    }
+
+private:
+    TreeType             const * const mTree;
+    LeafNodeType const * const * const mNodes;
+    ValueType                  * const mBuffer;
+
+    const ValueType mVoxelSize;
+};
+
+
+/// @brief  TBB body that replaces every active value of the given leaf nodes
+///         with the minimum of that value and the matching entry of an external
+///         buffer.
+///
+/// @details The buffer entry for voxel offset @c p of node @c n is read from
+///          <tt>buffer[n * LeafNodeType::SIZE + p]</tt>.
+template<typename TreeType>
+struct MinCombine
+{
+    typedef typename TreeType::LeafNodeType   LeafNodeType;
+    typedef typename TreeType::ValueType      ValueType;
+
+    /// @param nodes   leaf nodes to modify in place (may be empty)
+    /// @param buffer  read-only array of candidate values
+    MinCombine(std::vector<LeafNodeType*>& nodes, const ValueType* buffer)
+        : mNodes(nodes.empty() ? NULL : &nodes[0])
+        , mBuffer(buffer)
+    {
+    }
+
+    /// Process the leaf nodes whose indices fall inside @a range.
+    void operator()(const tbb::blocked_range<size_t>& range) const
+    {
+        typedef typename LeafNodeType::ValueOnIter ActiveIter;
+
+        for (size_t idx = range.begin(); idx < range.end(); ++idx) {
+
+            // Slice of the buffer that belongs to this leaf node.
+            const ValueType* const candidates = mBuffer + idx * LeafNodeType::SIZE;
+
+            for (ActiveIter it = mNodes[idx]->beginValueOn(); it; ++it) {
+                // Written through the iterator's const reference.
+                ValueType& current = const_cast<ValueType&>(it.getValue());
+                current = std::min(current, candidates[it.pos()]);
+            }
+        }
+    }
+
+private:
+    LeafNodeType * * const mNodes;
+    ValueType const * const mBuffer;
+};
+
+
+} // mesh_to_volume_internal namespace
+
+
+////////////////////////////////////////
+
+// Utility method implementation
+
+
+/// @brief  Sweep the leaf nodes of @a tree along the z, y and x axes with
+///         SweepExteriorSign, then alternate SeedFillExteriorSign and SeedPoints
+///         passes until no leaf node is reported as changed.
+template <typename FloatTreeT>
+inline void
+traceExteriorBoundaries(FloatTreeT& tree)
+{
+    typedef mesh_to_volume_internal::LeafNodeConnectivityTable<FloatTreeT> ConnectivityTable;
+    typedef mesh_to_volume_internal::SweepExteriorSign<FloatTreeT> SweepingOp;
+    typedef tbb::blocked_range<size_t> IndexRange;
+
+    ConnectivityTable nodeConnectivity(tree);
+
+    const size_t numLeafNodes = nodeConnectivity.size();
+    const size_t numVoxels = numLeafNodes * FloatTreeT::LeafNodeType::SIZE;
+
+    // A sweep along an axis starts at every node that has no predecessor on that axis.
+    std::vector<size_t> zStartNodes, yStartNodes, xStartNodes;
+
+    for (size_t n = 0; n < numLeafNodes; ++n) {
+        if (nodeConnectivity.offsetsPrevX()[n] == ConnectivityTable::INVALID_OFFSET) {
+            xStartNodes.push_back(n);
+        }
+        if (nodeConnectivity.offsetsPrevY()[n] == ConnectivityTable::INVALID_OFFSET) {
+            yStartNodes.push_back(n);
+        }
+        if (nodeConnectivity.offsetsPrevZ()[n] == ConnectivityTable::INVALID_OFFSET) {
+            zStartNodes.push_back(n);
+        }
+    }
+
+    // Axis sweeps, in z, y, x order.
+    tbb::parallel_for(IndexRange(0, zStartNodes.size()),
+        SweepingOp(SweepingOp::Z_AXIS, zStartNodes, nodeConnectivity));
+
+    tbb::parallel_for(IndexRange(0, yStartNodes.size()),
+        SweepingOp(SweepingOp::Y_AXIS, yStartNodes, nodeConnectivity));
+
+    tbb::parallel_for(IndexRange(0, xStartNodes.size()),
+        SweepingOp(SweepingOp::X_AXIS, xStartNodes, nodeConnectivity));
+
+    // Per-node and per-voxel change flags for the iterative passes.
+    boost::scoped_array<bool> changedNodeMaskA(new bool[numLeafNodes]);
+    boost::scoped_array<bool> changedNodeMaskB(new bool[numLeafNodes]);
+    boost::scoped_array<bool> changedVoxelMask(new bool[numVoxels]);
+
+    // Initially every node counts as changed and no voxel does.
+    memset(changedNodeMaskA.get(), 1, sizeof(bool) * numLeafNodes);
+    mesh_to_volume_internal::fillArray(changedVoxelMask.get(), false, numVoxels);
+
+    const IndexRange nodeRange(0, numLeafNodes);
+
+    // Starts out true so that the passes run at least once.
+    bool nodesUpdated = true;
+    while (nodesUpdated) {
+
+        tbb::parallel_for(nodeRange, mesh_to_volume_internal::SeedFillExteriorSign<FloatTreeT>(
+            nodeConnectivity.nodes(), changedNodeMaskA.get()));
+
+        tbb::parallel_for(nodeRange, mesh_to_volume_internal::SeedPoints<FloatTreeT>(nodeConnectivity,
+            changedNodeMaskA.get(), changedNodeMaskB.get(), changedVoxelMask.get()));
+
+        // The flags written by this pass become the input of the next one.
+        changedNodeMaskA.swap(changedNodeMaskB);
+
+        // Did any node change during this pass?
+        nodesUpdated = false;
+        for (size_t n = 0; n < numLeafNodes; ++n) {
+            if (changedNodeMaskA[n]) {
+                nodesUpdated = true;
+                break;
+            }
+        }
+
+        if (nodesUpdated) {
+            tbb::parallel_for(nodeRange, mesh_to_volume_internal::SyncVoxelMask<FloatTreeT>(
+                nodeConnectivity.nodes(), changedNodeMaskA.get(), changedVoxelMask.get()));
+        }
+    }
+
+} // void traceExteriorBoundaries()
+
+
+////////////////////////////////////////
+/// Converts the polygons exposed by @a mesh into a distance grid that uses @a transform
+/// (signed unless UNSIGNED_DISTANCE_FIELD is set); returns the grid as-is if interrupted.
+template <typename GridType, typename MeshDataAdapter, typename Interrupter>
+inline typename GridType::Ptr
+meshToVolume(
+  Interrupter& interrupter, // polled through wasInterrupted(percent) between stages
+  const MeshDataAdapter& mesh,
+  const math::Transform& transform,
+  float exteriorBandWidth, // in voxel units; scaled by the voxel size below
+  float interiorBandWidth, // in voxel units; inf/max requests a filled interior
+  int flags, // option bits, decoded into the four booleans below
+  typename GridType::template ValueConverter<Int32>::Type * polygonIndexGrid)
+{
+    typedef typename GridType::Ptr              GridTypePtr;
+    typedef typename GridType::TreeType         TreeType;
+    typedef typename TreeType::LeafNodeType     LeafNodeType;
+    typedef typename GridType::ValueType        ValueType;
+
+    typedef typename GridType::template ValueConverter<Int32>::Type  Int32GridType;
+    typedef typename Int32GridType::TreeType                         Int32TreeType;
+
+    typedef typename TreeType::template ValueConverter<bool>::Type   BoolTreeType;
+
+    //////////
+
+    // Setup
+
+    GridTypePtr distGrid(new GridType(std::numeric_limits<ValueType>::max()));
+    distGrid->setTransform(transform.copy());
+
+    ValueType exteriorWidth = ValueType(exteriorBandWidth);
+    ValueType interiorWidth = ValueType(interiorBandWidth);
+
+    // Note: inf interior width is all right, this value makes the converter fill
+    // interior regions with distance values.
+    if (!boost::math::isfinite(exteriorWidth) || boost::math::isnan(interiorWidth)) {
+        std::stringstream msg;
+        msg << "Illegal narrow band width: exterior = " << exteriorWidth
+            << ", interior = " << interiorWidth;
+        OPENVDB_LOG_DEBUG(msg.str());
+        return distGrid;
+    }
+
+    const ValueType voxelSize = ValueType(transform.voxelSize()[0]);
+
+    if (!boost::math::isfinite(voxelSize) || math::isZero(voxelSize)) {
+        std::stringstream msg;
+        msg << "Illegal transform, voxel size = " << voxelSize;
+        OPENVDB_LOG_DEBUG(msg.str());
+        return distGrid;
+    }
+
+    // Convert narrow band width from voxel units to world space units.
+    exteriorWidth *= voxelSize;
+    // Avoid the unit conversion if the interior band width is set to
+    // inf or std::numeric_limits<float>::max().
+    if (interiorWidth < std::numeric_limits<ValueType>::max()) {
+        interiorWidth *= voxelSize;
+    }
+    // Decode the option bits: every step below is enabled while its flag bit is clear.
+    const bool computeSignedDistanceField = (flags & UNSIGNED_DISTANCE_FIELD) == 0;
+    const bool removeIntersectingVoxels = (flags & DISABLE_INTERSECTING_VOXEL_REMOVAL) == 0;
+    const bool renormalizeValues = (flags & DISABLE_RENORMALIZATION) == 0;
+    const bool trimNarrowBand = (flags & DISABLE_NARROW_BAND_TRIMMING) == 0;
+
+    Int32GridType* indexGrid = NULL;
+    // Owns the fallback index grid that is used when polygonIndexGrid is NULL.
+    typename Int32GridType::Ptr temporaryIndexGrid;
+
+    if (polygonIndexGrid) {
+        indexGrid = polygonIndexGrid;
+    } else {
+        temporaryIndexGrid.reset(new Int32GridType(Int32(util::INVALID_IDX)));
+        indexGrid = temporaryIndexGrid.get();
+    }
+    // Note: newTree() and setTransform() are applied to a caller-supplied grid as well.
+    indexGrid->newTree();
+    indexGrid->setTransform(transform.copy());
+
+    if (computeSignedDistanceField) {
+        distGrid->setGridClass(GRID_LEVEL_SET);
+    } else {
+        distGrid->setGridClass(GRID_UNKNOWN);
+        interiorWidth = ValueType(0.0);
+    }
+
+    TreeType& distTree = distGrid->tree();
+    Int32TreeType& indexTree = indexGrid->tree();
+
+
+    //////////
+
+    // Voxelize mesh
+
+    {
+        typedef mesh_to_volume_internal::VoxelizationData<TreeType> VoxelizationDataType;
+        typedef tbb::enumerable_thread_specific<typename VoxelizationDataType::Ptr> DataTable;
+
+        DataTable data;
+        typedef mesh_to_volume_internal::VoxelizePolygons<TreeType, MeshDataAdapter, Interrupter> Voxelizer;
+
+        const tbb::blocked_range<size_t> polygonRange(0, mesh.polygonCount());
+
+        tbb::parallel_for(polygonRange, Voxelizer(data, mesh, &interrupter));
+        // Fold every thread-local result held in the table into distTree / indexTree.
+        for (typename DataTable::iterator i = data.begin(); i != data.end(); ++i) {
+            VoxelizationDataType& dataItem = **i;
+            mesh_to_volume_internal::combineData(
+                distTree, indexTree, dataItem.distTree, dataItem.indexTree);
+        }
+    }
+
+    // The progress estimates are based on the observed average time for a few different
+    // test cases and is only intended to provide some rough progression feedback to the user.
+    if (interrupter.wasInterrupted(30)) return distGrid;
+
+
+    //////////
+
+    // Classify interior and exterior regions
+
+    if (computeSignedDistanceField) {
+
+        // Determines the inside/outside state for the narrow band of voxels.
+        traceExteriorBoundaries(distTree);
+
+        std::vector<LeafNodeType*> nodes;
+        nodes.reserve(distTree.leafCount());
+        distTree.getNodes(nodes);
+
+        const tbb::blocked_range<size_t> nodeRange(0, nodes.size());
+
+        typedef mesh_to_volume_internal::ComputeIntersectingVoxelSign<TreeType, MeshDataAdapter> SignOp;
+
+        tbb::parallel_for(nodeRange, SignOp(nodes, distTree, indexTree, mesh));
+
+        if (interrupter.wasInterrupted(45)) return distGrid;
+
+        // Remove voxels created by self intersecting portions of the mesh.
+        if (removeIntersectingVoxels) {
+
+            tbb::parallel_for(nodeRange,
+                mesh_to_volume_internal::ValidateIntersectingVoxels<TreeType>(distTree, nodes));
+
+            tbb::parallel_for(nodeRange,
+                mesh_to_volume_internal::RemoveSelfIntersectingSurface<TreeType>(
+                    nodes, distTree, indexTree));
+
+            tools::pruneInactive(distTree,  /*threading=*/true);
+            tools::pruneInactive(indexTree, /*threading=*/true);
+        }
+    }
+
+    if (interrupter.wasInterrupted(50)) return distGrid;
+    // No active voxels: return a new grid with background 0 (no transform is set on it here).
+    if (distTree.activeVoxelCount() == 0) {
+        distGrid.reset((new GridType(ValueType(0.0))));
+        return distGrid;
+    }
+
+    // Transform values (world space scaling etc.).
+    {
+        std::vector<LeafNodeType*> nodes;
+        nodes.reserve(distTree.leafCount());
+        distTree.getNodes(nodes);
+
+        tbb::parallel_for(tbb::blocked_range<size_t>(0, nodes.size()),
+            mesh_to_volume_internal::TransformValues<TreeType>(
+                nodes, voxelSize, !computeSignedDistanceField));
+    }
+
+    // Propagate sign information into tile regions.
+    if (computeSignedDistanceField) {
+        distTree.root().setBackground(exteriorWidth, /*updateChildNodes=*/false);
+        tools::signedFloodFillWithValues(distTree, exteriorWidth, -interiorWidth);
+    } else {
+        tools::changeBackground(distTree, exteriorWidth);
+    }
+
+    if (interrupter.wasInterrupted(54)) return distGrid;
+
+
+    //////////
+
+    // Expand the narrow band region
+
+    const ValueType minBandWidth = voxelSize * ValueType(2.0);
+
+    if (interiorWidth > minBandWidth || exteriorWidth > minBandWidth) {
+
+        // Create the initial voxel mask.
+        BoolTreeType maskTree(false);
+
+        {
+            std::vector<LeafNodeType*> nodes;
+            nodes.reserve(distTree.leafCount());
+            distTree.getNodes(nodes);
+
+            mesh_to_volume_internal::ConstructVoxelMask<TreeType> op(maskTree, distTree, nodes);
+            tbb::parallel_reduce(tbb::blocked_range<size_t>(0, nodes.size()), op);
+        }
+
+        // Progress estimation: progress starts at 54 and grows by `step` per loop iteration
+        // (40 points spread over maxIterations; step stays 0 when no finite estimate exists).
+        unsigned maxIterations = std::numeric_limits<unsigned>::max();
+
+        float progress = 54.0f, step = 0.0f;
+        double estimated =
+            2.0 * std::ceil((std::max(interiorWidth, exteriorWidth) - minBandWidth) / voxelSize);
+
+        if (estimated < double(maxIterations)) {
+            maxIterations = unsigned(estimated);
+            step = 40.0f / float(maxIterations);
+        }
+
+        std::vector<typename BoolTreeType::LeafNodeType*> maskNodes;
+
+        unsigned count = 0;
+        while (true) {
+
+            if (interrupter.wasInterrupted(int(progress))) return distGrid;
+
+            const size_t maskNodeCount = maskTree.leafCount();
+            if (maskNodeCount == 0) break;
+
+            maskNodes.clear();
+            maskNodes.reserve(maskNodeCount);
+            maskTree.getNodes(maskNodes);
+
+            const tbb::blocked_range<size_t> range(0, maskNodes.size());
+
+            tbb::parallel_for(range,
+                mesh_to_volume_internal::DiffLeafNodeMask<TreeType>(distTree, maskNodes));
+
+            mesh_to_volume_internal::expandNarrowband(distTree, indexTree, maskTree, maskNodes,
+                mesh, exteriorWidth, interiorWidth, voxelSize);
+
+            if ((++count) >= maxIterations) break;
+            progress += step;
+        }
+    }
+
+    if (interrupter.wasInterrupted(94)) return distGrid;
+    // Only the internally created index grid is emptied; indexTree is not referenced below.
+    if (!polygonIndexGrid) indexGrid->clear();
+
+
+    /////////
+
+    // Renormalize distances to smooth out bumps caused by self intersecting
+    // and overlapping portions of the mesh and renormalize the level set.
+
+    if (computeSignedDistanceField && renormalizeValues) {
+
+        std::vector<LeafNodeType*> nodes;
+        nodes.reserve(distTree.leafCount());
+        distTree.getNodes(nodes);
+
+        boost::scoped_array<ValueType> buffer(new ValueType[LeafNodeType::SIZE * nodes.size()]);
+
+        const ValueType offset = ValueType(0.8 * voxelSize);
+
+        tbb::parallel_for(tbb::blocked_range<size_t>(0, nodes.size()),
+            mesh_to_volume_internal::OffsetValues<TreeType>(nodes, -offset));
+
+        tbb::parallel_for(tbb::blocked_range<size_t>(0, nodes.size()),
+            mesh_to_volume_internal::Renormalize<TreeType>(
+                distTree, nodes, buffer.get(), voxelSize));
+
+        tbb::parallel_for(tbb::blocked_range<size_t>(0, nodes.size()),
+            mesh_to_volume_internal::MinCombine<TreeType>(nodes, buffer.get()));
+
+        tbb::parallel_for(tbb::blocked_range<size_t>(0, nodes.size()),
+            mesh_to_volume_internal::OffsetValues<TreeType>(
+                nodes, offset - mesh_to_volume_internal::Tolerance<ValueType>::epsilon()));
+    }
+
+    if (interrupter.wasInterrupted(99)) return distGrid;
+
+
+    /////////
+
+    // Remove active voxels that exceed the narrow band limits
+    // (only attempted when the narrower of the two bands is under four voxels wide).
+    if (trimNarrowBand && std::min(interiorWidth, exteriorWidth) < voxelSize * ValueType(4.0)) {
+
+        std::vector<LeafNodeType*> nodes;
+        nodes.reserve(distTree.leafCount());
+        distTree.getNodes(nodes);
+
+        tbb::parallel_for(tbb::blocked_range<size_t>(0, nodes.size()),
+            mesh_to_volume_internal::InactivateValues<TreeType>(
+                nodes, exteriorWidth, computeSignedDistanceField ? interiorWidth : exteriorWidth));
+
+        tools::pruneLevelSet(
+            distTree, exteriorWidth, computeSignedDistanceField ? -interiorWidth : -exteriorWidth);
+    }
+
+    return distGrid;
+}
+
+
+/// Convenience overload of meshToVolume() for callers that do not need interruption:
+/// every argument is forwarded unchanged together with a util::NullInterrupter.
+template <typename GridType, typename MeshDataAdapter>
+inline typename GridType::Ptr
+meshToVolume(
+  const MeshDataAdapter& mesh,
+  const math::Transform& transform,
+  float exteriorBandWidth,
+  float interiorBandWidth,
+  int flags,
+  typename GridType::template ValueConverter<Int32>::Type * polygonIndexGrid)
+{
+    util::NullInterrupter noInterrupt;
+
+    typename GridType::Ptr grid = meshToVolume<GridType>(
+        noInterrupt, mesh, transform,
+        exteriorBandWidth, interiorBandWidth,
+        flags, polygonIndexGrid);
+
+    return grid;
+}
+
+
+////////////////////////////////////////
+
+
+/// @internal This overload is enabled only for grids with a scalar, floating-point ValueType.
+///
+/// Transforms @a points into the index space of @a xform, wraps the primitives in a
+/// QuadAndTriangleDataAdapter and runs meshToVolume() on it.
+///
+/// @param xform                  transform of the output grid
+/// @param points                 mesh vertices (transformed with @a xform before voxelization)
+/// @param triangles              triangle index list, may be empty
+/// @param quads                  quad index list, may be empty
+/// @param exBandWidth            exterior band width forwarded to meshToVolume()
+/// @param inBandWidth            interior band width forwarded to meshToVolume()
+/// @param unsignedDistanceField  if true, UNSIGNED_DISTANCE_FIELD is passed as conversion flag
+///
+/// @return the converted grid; if @a points is empty, a new grid whose background
+///         value is @a exBandWidth.
+template<typename GridType>
+inline typename boost::enable_if<boost::is_floating_point<typename GridType::ValueType>,
+typename GridType::Ptr>::type
+doMeshConversion(
+    const openvdb::math::Transform& xform,
+    const std::vector<Vec3s>& points,
+    const std::vector<Vec3I>& triangles,
+    const std::vector<Vec4I>& quads,
+    float exBandWidth,
+    float inBandWidth,
+    bool unsignedDistanceField = false)
+{
+    if (points.empty()) {
+        return typename GridType::Ptr(new GridType(typename GridType::ValueType(exBandWidth)));
+    }
+
+    const size_t numPoints = points.size();
+    boost::scoped_array<Vec3s> indexSpacePoints(new Vec3s[numPoints]);
+
+    // transform points to local grid index space
+    tbb::parallel_for(tbb::blocked_range<size_t>(0, numPoints),
+        mesh_to_volume_internal::TransformPoints<Vec3s>(
+                &points[0], indexSpacePoints.get(), xform));
+
+    const int conversionFlags = unsignedDistanceField ? UNSIGNED_DISTANCE_FIELD : 0;
+
+    if (quads.empty()) {
+
+        // Both primitive lists may be empty here. Indexing element 0 of an empty
+        // vector is undefined behaviour, so hand the adapter a null array together
+        // with a primitive count of zero in that case.
+        const Vec3I* triangleArray = triangles.empty() ? NULL : &triangles[0];
+
+        QuadAndTriangleDataAdapter<Vec3s, Vec3I>
+            mesh(indexSpacePoints.get(), numPoints, triangleArray, triangles.size());
+
+        return meshToVolume<GridType>(mesh, xform, exBandWidth, inBandWidth, conversionFlags);
+
+    } else if (triangles.empty()) {
+
+        // quads is known to be non-empty in this branch, so &quads[0] is valid.
+        QuadAndTriangleDataAdapter<Vec3s, Vec4I>
+            mesh(indexSpacePoints.get(), numPoints, &quads[0], quads.size());
+
+        return meshToVolume<GridType>(mesh, xform, exBandWidth, inBandWidth, conversionFlags);
+    }
+
+    // pack primitives: triangles first (fourth index set to INVALID_IDX), then the quads
+
+    const size_t numPrimitives = triangles.size() + quads.size();
+    boost::scoped_array<Vec4I> prims(new Vec4I[numPrimitives]);
+
+    for (size_t n = 0, N = triangles.size(); n < N; ++n) {
+        const Vec3I& triangle = triangles[n];
+        Vec4I& prim = prims[n];
+        prim[0] = triangle[0];
+        prim[1] = triangle[1];
+        prim[2] = triangle[2];
+        prim[3] = util::INVALID_IDX;
+    }
+
+    const size_t offset = triangles.size();
+    for (size_t n = 0, N = quads.size(); n < N; ++n) {
+        prims[offset + n] = quads[n];
+    }
+
+    QuadAndTriangleDataAdapter<Vec3s, Vec4I>
+        mesh(indexSpacePoints.get(), numPoints, prims.get(), numPrimitives);
+
+    return meshToVolume<GridType>(mesh, xform, exBandWidth, inBandWidth, conversionFlags);
+}
+
+
+/// @internal Fallback overload, selected through boost::disable_if for every grid
+/// whose ValueType is not a floating-point scalar. All arguments are ignored and
+/// the call is rejected through OPENVDB_THROW(TypeError, ...).
+template<typename GridType>
+inline
+typename boost::disable_if<
+    boost::is_floating_point<typename GridType::ValueType>,
+    typename GridType::Ptr>::type
+doMeshConversion(
+    const math::Transform&     /*xform*/,
+    const std::vector<Vec3s>&  /*points*/,
+    const std::vector<Vec3I>&  /*triangles*/,
+    const std::vector<Vec4I>&  /*quads*/,
+    float                      /*exBandWidth*/,
+    float                      /*inBandWidth*/,
+    bool                       /*unsignedDistanceField*/ = false)
+{
+    OPENVDB_THROW(TypeError,
+        "mesh to volume conversion is supported only for scalar floating-point grids");
+}
+
+
+////////////////////////////////////////
+
+
+/// Triangle-only variant: converts the mesh with doMeshConversion(), using
+/// @a halfWidth for both the exterior and the interior band width.
+template<typename GridType>
+inline typename GridType::Ptr
+meshToLevelSet(
+    const openvdb::math::Transform& xform,
+    const std::vector<Vec3s>& points,
+    const std::vector<Vec3I>& triangles,
+    float halfWidth)
+{
+    // This overload has no quads, so an empty list is passed in their place.
+    const std::vector<Vec4I> noQuads;
+
+    return doMeshConversion<GridType>(
+        xform, points, triangles, noQuads,
+        /*exBandWidth=*/halfWidth, /*inBandWidth=*/halfWidth);
+}
+
+
+/// Quad-only variant: converts the mesh with doMeshConversion(), using
+/// @a halfWidth for both the exterior and the interior band width.
+template<typename GridType>
+inline typename GridType::Ptr
+meshToLevelSet(
+    const openvdb::math::Transform& xform,
+    const std::vector<Vec3s>& points,
+    const std::vector<Vec4I>& quads,
+    float halfWidth)
+{
+    // This overload has no triangles, so an empty list is passed in their place.
+    const std::vector<Vec3I> noTriangles;
+
+    return doMeshConversion<GridType>(
+        xform, points, noTriangles, quads,
+        /*exBandWidth=*/halfWidth, /*inBandWidth=*/halfWidth);
+}
+
+
+/// Mixed triangle/quad variant: converts the mesh with doMeshConversion(), using
+/// @a halfWidth for both the exterior and the interior band width.
+template<typename GridType>
+inline typename GridType::Ptr
+meshToLevelSet(
+    const openvdb::math::Transform& xform,
+    const std::vector<Vec3s>& points,
+    const std::vector<Vec3I>& triangles,
+    const std::vector<Vec4I>& quads,
+    float halfWidth)
+{
+    return doMeshConversion<GridType>(
+        xform, points, triangles, quads,
+        /*exBandWidth=*/halfWidth, /*inBandWidth=*/halfWidth);
+}
+
+
+/// Converts a triangle/quad mesh with doMeshConversion(), passing the exterior and
+/// interior band widths through separately (the unsigned option keeps its default).
+template<typename GridType>
+inline typename GridType::Ptr
+meshToSignedDistanceField(
+    const openvdb::math::Transform& xform,
+    const std::vector<Vec3s>& points,
+    const std::vector<Vec3I>& triangles,
+    const std::vector<Vec4I>& quads,
+    float exBandWidth,
+    float inBandWidth)
+{
+    return doMeshConversion<GridType>(
+        xform, points, triangles, quads,
+        exBandWidth, inBandWidth);
+}
+
+
+/// Converts a triangle/quad mesh with doMeshConversion() in unsigned mode;
+/// @a bandWidth is used for both band width arguments.
+template<typename GridType>
+inline typename GridType::Ptr
+meshToUnsignedDistanceField(
+    const openvdb::math::Transform& xform,
+    const std::vector<Vec3s>& points,
+    const std::vector<Vec3I>& triangles,
+    const std::vector<Vec4I>& quads,
+    float bandWidth)
+{
+    return doMeshConversion<GridType>(
+        xform, points, triangles, quads,
+        /*exBandWidth=*/bandWidth, /*inBandWidth=*/bandWidth,
+        /*unsignedDistanceField=*/true);
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+// Stream insertion for EdgeData (required by several of the tree nodes).
+// Writes the primitive/distance pair of the x, y and z record, in that order.
+inline std::ostream&
+operator<<(std::ostream& ostr, const MeshToVoxelEdgeData::EdgeData& rhs)
+{
+    return ostr
+        << "{[ " << rhs.mXPrim << ", " << rhs.mXDist << "]"
+        << " [ " << rhs.mYPrim << ", " << rhs.mYDist << "]"
+        << " [ " << rhs.mZPrim << ", " << rhs.mZDist << "]}";
+}
+
+// Abs overload for EdgeData (required by math::Abs).
+// The argument is handed back as an unmodified copy.
+inline MeshToVoxelEdgeData::EdgeData
+Abs(const MeshToVoxelEdgeData::EdgeData& x)
+{
+    return x;
+}
+
+
+////////////////////////////////////////
+
+
+/// Functor that fills an edge-data tree from a point list and a polygon list.
+/// It offers the splitting constructor, operator() and join() that run() hands
+/// to tbb::parallel_reduce.
+class MeshToVoxelEdgeData::GenEdgeData
+{
+public:
+
+    /// Binds the functor to the caller's lists (stored by reference).
+    GenEdgeData(
+        const std::vector<Vec3s>& pointList,
+        const std::vector<Vec4I>& polygonList);
+
+    /// Splitting constructor: shares the lists of @a rhs, starts with fresh trees.
+    GenEdgeData(GenEdgeData& rhs, tbb::split);
+
+    /// Processes every polygon, through tbb::parallel_reduce unless @a threaded is false.
+    void run(bool threaded = true);
+
+    /// Voxelizes the polygons whose indices lie in @a range.
+    inline void operator() (const tbb::blocked_range<size_t> &range);
+
+    /// Merges the tree built by @a rhs into this functor's tree.
+    inline void join(GenEdgeData& rhs);
+
+    /// Access to the tree built so far.
+    inline TreeType& tree() { return mTree; }
+
+private:
+    // Assignment is private and does nothing.
+    void operator=(const GenEdgeData&) {}
+
+    // One polygon: up to four corner positions plus its index in the polygon list.
+    struct Primitive { Vec3d a, b, c, d; Int32 index; };
+
+    template<bool IsQuad>
+    inline bool evalPrimitive(const Coord&, const Primitive&);
+
+    template<bool IsQuad>
+    inline void voxelize(const Primitive&);
+
+    inline bool rayTriangleIntersection(
+        const Vec3d& origin, const Vec3d& dir,
+        const Vec3d& a, const Vec3d& b, const Vec3d& c,
+        double& t);
+
+
+    // Note: the constructors initialise the members in this declaration order
+    // (each accessor is built from the tree declared right before it).
+    TreeType mTree;
+    Accessor mAccessor;
+
+    const std::vector<Vec3s>& mPointList;
+    const std::vector<Vec4I>& mPolygonList;
+
+    // Used internally for acceleration: records, per voxel, the index of the
+    // last primitive that was evaluated there.
+    typedef TreeType::ValueConverter<Int32>::Type IntTreeT;
+    IntTreeT mLastPrimTree;
+    tree::ValueAccessor<IntTreeT> mLastPrimAccessor;
+}; // class MeshToVoxelEdgeData::GenEdgeData
+
+
+/// Primary constructor: stores references to the caller's lists and creates the
+/// edge tree (background EdgeData()) and the scratch tree (background INVALID_IDX),
+/// each with an accessor attached.
+inline
+MeshToVoxelEdgeData::GenEdgeData::GenEdgeData(
+    const std::vector<Vec3s>& pointList,
+    const std::vector<Vec4I>& polygonList)
+    : mTree(EdgeData()),
+      mAccessor(mTree),
+      mPointList(pointList),
+      mPolygonList(polygonList),
+      mLastPrimTree(Int32(util::INVALID_IDX)),
+      mLastPrimAccessor(mLastPrimTree)
+{
+}
+
+
+/// Splitting constructor: refers to the same point and polygon lists as @a rhs,
+/// but starts with its own newly created trees and accessors.
+inline
+MeshToVoxelEdgeData::GenEdgeData::GenEdgeData(GenEdgeData& rhs, tbb::split)
+    : mTree(EdgeData()),
+      mAccessor(mTree),
+      mPointList(rhs.mPointList),
+      mPolygonList(rhs.mPolygonList),
+      mLastPrimTree(Int32(util::INVALID_IDX)),
+      mLastPrimAccessor(mLastPrimTree)
+{
+}
+
+
+/// Runs the functor over the whole polygon list, either through
+/// tbb::parallel_reduce or, when @a threaded is false, with a direct call.
+inline void
+MeshToVoxelEdgeData::GenEdgeData::run(bool threaded)
+{
+    const tbb::blocked_range<size_t> allPolygons(0, mPolygonList.size());
+
+    if (!threaded) {
+        (*this)(allPolygons);
+        return;
+    }
+
+    tbb::parallel_reduce(allPolygons, *this);
+}
+// Merges the edge data held by rhs into this functor (handed to tbb::parallel_reduce by run()).
+// Leaves absent here are added from rhs; overlapping voxels keep the smaller distance per axis.
+inline void
+MeshToVoxelEdgeData::GenEdgeData::join(GenEdgeData& rhs)
+{
+    typedef TreeType::RootNodeType       RootNodeType;
+    typedef RootNodeType::NodeChainType  NodeChainType;
+    BOOST_STATIC_ASSERT(boost::mpl::size<NodeChainType>::value > 1);
+    typedef boost::mpl::at<NodeChainType, boost::mpl::int_<1> >::type InternalNodeType;
+
+    Coord ijk;
+    Index offset;
+
+    rhs.mTree.clearAllAccessors();
+
+    TreeType::LeafIter leafIt = rhs.mTree.beginLeaf();
+    for ( ; leafIt; ++leafIt) {
+        ijk = leafIt->origin();
+
+        TreeType::LeafNodeType* lhsLeafPt = mTree.probeLeaf(ijk);
+
+        if (!lhsLeafPt) {
+            // No leaf here yet: add the rhs leaf to this tree, then call stealNode on its rhs parent (presumably to release rhs's ownership - TODO confirm).
+            mAccessor.addLeaf(rhs.mAccessor.probeLeaf(ijk));
+            InternalNodeType* node = rhs.mAccessor.getNode<InternalNodeType>();
+            node->stealNode<TreeType::LeafNodeType>(ijk, EdgeData(), false);
+            rhs.mAccessor.clear();
+
+        } else {
+            // Both trees hold a leaf at this origin: merge the active rhs voxels one at a time.
+            TreeType::LeafNodeType::ValueOnCIter it = leafIt->cbeginValueOn();
+            for ( ; it; ++it) {
+
+                offset = it.pos();
+                const EdgeData& rhsValue = it.getValue();
+
+                if (!lhsLeafPt->isValueOn(offset)) {
+                    lhsLeafPt->setValueOn(offset, rhsValue);
+                } else {
+                    // Active on both sides: per axis, keep the smaller distance and its primitive.
+                    EdgeData& lhsValue = const_cast<EdgeData&>(lhsLeafPt->getValue(offset));
+
+                    if (rhsValue.mXDist < lhsValue.mXDist) {
+                        lhsValue.mXDist = rhsValue.mXDist;
+                        lhsValue.mXPrim = rhsValue.mXPrim;
+                    }
+
+                    if (rhsValue.mYDist < lhsValue.mYDist) {
+                        lhsValue.mYDist = rhsValue.mYDist;
+                        lhsValue.mYPrim = rhsValue.mYPrim;
+                    }
+
+                    if (rhsValue.mZDist < lhsValue.mZDist) {
+                        lhsValue.mZDist = rhsValue.mZDist;
+                        lhsValue.mZPrim = rhsValue.mZPrim;
+                    }
+
+                }
+            } // end value iteration
+        }
+    } // end leaf iteration
+}
+
+
+/// Voxelizes every polygon whose index lies in @a range. A polygon whose fourth
+/// vertex index differs from util::INVALID_IDX is handled as a quad, any other
+/// polygon as a triangle.
+inline void
+MeshToVoxelEdgeData::GenEdgeData::operator()(const tbb::blocked_range<size_t> &range)
+{
+    Primitive prim;
+
+    const size_t last = range.end();
+    for (size_t polyIdx = range.begin(); polyIdx < last; ++polyIdx) {
+
+        const Vec4I& verts = mPolygonList[polyIdx];
+        const bool isQuad = (util::INVALID_IDX != verts[3]);
+
+        prim.index = Int32(polyIdx);
+        prim.a = Vec3d(mPointList[verts[0]]);
+        prim.b = Vec3d(mPointList[verts[1]]);
+        prim.c = Vec3d(mPointList[verts[2]]);
+
+        if (!isQuad) {
+            voxelize<false>(prim);
+            continue;
+        }
+
+        // Only quads carry a usable fourth corner.
+        prim.d = Vec3d(mPointList[verts[3]]);
+        voxelize<true>(prim);
+    }
+}
+
+
+/// Flood-fill style traversal that starts at the voxel containing corner a of
+/// @a prim. Each of the 26 neighbours of a popped voxel is evaluated at most once
+/// per primitive (tracked through mLastPrimAccessor) and is expanded further only
+/// if evalPrimitive() returns true for it.
+template<bool IsQuad>
+inline void
+MeshToVoxelEdgeData::GenEdgeData::voxelize(const Primitive& prim)
+{
+    std::deque<Coord> pending;
+
+    const Coord seed = Coord::floor(prim.a);
+    pending.push_back(seed);
+
+    evalPrimitive<IsQuad>(seed, prim);
+
+    while (!pending.empty()) {
+
+        // Take the most recently added voxel (copy it before it is popped).
+        const Coord center = pending.back();
+        pending.pop_back();
+
+        for (Int32 i = 0; i < 26; ++i) {
+            const Coord neighbor = center + util::COORD_OFFSETS[i];
+
+            // Already handled for this primitive.
+            if (mLastPrimAccessor.getValue(neighbor) == prim.index) continue;
+
+            mLastPrimAccessor.setValue(neighbor, prim.index);
+            if (evalPrimitive<IsQuad>(neighbor, prim)) {
+                pending.push_back(neighbor);
+            }
+        }
+    }
+}
+
+
+/// Evaluates @a prim at voxel @a ijk. Rays are cast from the voxel coordinate along
+/// +x, +y and +z against the triangle (a, c, b) and, for quads, also (a, d, c).
+/// Whenever a hit is closer than the stored distance for that axis, the record is
+/// updated, and the voxel value is written back if anything changed.
+///
+/// @return true if the smallest squared distance from the voxel coordinate to the
+///         primitive's triangles is below 0.86602540378443861.
+template<bool IsQuad>
+inline bool
+MeshToVoxelEdgeData::GenEdgeData::evalPrimitive(const Coord& ijk, const Primitive& prim)
+{
+    const Vec3d xAxis(1.0, 0.0, 0.0);
+    const Vec3d yAxis(0.0, 1.0, 0.0);
+    const Vec3d zAxis(0.0, 0.0, 1.0);
+
+    Vec3d uvw, org(ijk[0], ijk[1], ijk[2]);
+    bool updated = false;
+    double t;
+
+    EdgeData edgeData;
+    mAccessor.probeValue(ijk, edgeData);
+
+    // First triangle: (a, c, b)
+    double dist = (org -
+        closestPointOnTriangleToPoint(prim.a, prim.c, prim.b, org, uvw)).lengthSqr();
+
+    if (rayTriangleIntersection(org, xAxis, prim.a, prim.c, prim.b, t)
+        && t < edgeData.mXDist)
+    {
+        edgeData.mXDist = float(t);
+        edgeData.mXPrim = prim.index;
+        updated = true;
+    }
+
+    if (rayTriangleIntersection(org, yAxis, prim.a, prim.c, prim.b, t)
+        && t < edgeData.mYDist)
+    {
+        edgeData.mYDist = float(t);
+        edgeData.mYPrim = prim.index;
+        updated = true;
+    }
+
+    if (rayTriangleIntersection(org, zAxis, prim.a, prim.c, prim.b, t)
+        && t < edgeData.mZDist)
+    {
+        edgeData.mZDist = float(t);
+        edgeData.mZPrim = prim.index;
+        updated = true;
+    }
+
+    if (IsQuad) {
+        // Second triangle of the quad: (a, d, c)
+        const double quadDist = (org -
+            closestPointOnTriangleToPoint(prim.a, prim.d, prim.c, org, uvw)).lengthSqr();
+
+        if (quadDist < dist) dist = quadDist;
+
+        if (rayTriangleIntersection(org, xAxis, prim.a, prim.d, prim.c, t)
+            && t < edgeData.mXDist)
+        {
+            edgeData.mXDist = float(t);
+            edgeData.mXPrim = prim.index;
+            updated = true;
+        }
+
+        if (rayTriangleIntersection(org, yAxis, prim.a, prim.d, prim.c, t)
+            && t < edgeData.mYDist)
+        {
+            edgeData.mYDist = float(t);
+            edgeData.mYPrim = prim.index;
+            updated = true;
+        }
+
+        if (rayTriangleIntersection(org, zAxis, prim.a, prim.d, prim.c, t)
+            && t < edgeData.mZDist)
+        {
+            edgeData.mZDist = float(t);
+            edgeData.mZPrim = prim.index;
+            updated = true;
+        }
+    }
+
+    if (updated) mAccessor.setValue(ijk, edgeData);
+
+    return (dist < 0.86602540378443861);
+}
+
+
+/// Intersects the ray origin + t * dir with the triangle (a, b, c).
+///
+/// @param[out] t  ray parameter of the hit; written only once both barycentric
+///                tests have passed.
+/// @return false if the determinant is not strictly non-zero, if a barycentric
+///         coordinate falls outside the triangle, or if @a t is negative.
+inline bool
+MeshToVoxelEdgeData::GenEdgeData::rayTriangleIntersection(
+    const Vec3d& origin, const Vec3d& dir,
+    const Vec3d& a, const Vec3d& b, const Vec3d& c,
+    double& t)
+{
+    const Vec3d edgeAB = b - a;
+    const Vec3d edgeAC = c - a;
+    const Vec3d pvec = dir.cross(edgeAC);
+
+    // A determinant that is not strictly non-zero means the ray is parallel
+    // to the triangle (the negated test also rejects NaN).
+    const double det = pvec.dot(edgeAB);
+    if (!(std::abs(det) > 0.0)) return false;
+
+    const double invDet = 1.0 / det;
+
+    // First barycentric coordinate
+    const Vec3d fromA = origin - a;
+    const double b1 = fromA.dot(pvec) * invDet;
+    if (b1 < 0.0 || b1 > 1.0) return false;
+
+    // Second barycentric coordinate
+    const Vec3d qvec = fromA.cross(edgeAB);
+    const double b2 = dir.dot(qvec) * invDet;
+    if (b2 < 0.0 || (b1 + b2) > 1.0) return false;
+
+    // Distance along the ray; hits behind the origin are rejected.
+    t = edgeAC.dot(qvec) * invDet;
+    return !(t < 0.0);
+}
+
+
+////////////////////////////////////////
+
+
+/// Default constructor: the tree is created with a default-constructed EdgeData
+/// as its background value.
+inline
+MeshToVoxelEdgeData::MeshToVoxelEdgeData()
+    : mTree(EdgeData())
+{}
+
+
+/// Builds the edge data for the given mesh with a GenEdgeData functor, then
+/// clears mTree and merges the functor's tree into it.
+inline void
+MeshToVoxelEdgeData::convert(
+    const std::vector<Vec3s>& pointList,
+    const std::vector<Vec4I>& polygonList)
+{
+    GenEdgeData builder(pointList, polygonList);
+    builder.run();
+
+    // Drop the previous contents before taking over the new result.
+    mTree.clear();
+    mTree.merge(builder.tree());
+}
+
+
+/// Appends to @a points / @a primitives the edge records found on seven voxels:
+/// @a ijk and six of its neighbours with offsets of 0 or 1 per axis. Which of a
+/// voxel's x, y and z records are reported depends on the voxel (see the table);
+/// a record is reported only if its primitive index differs from util::INVALID_IDX.
+/// Each reported point is the voxel coordinate moved along the record's axis by
+/// the stored distance.
+inline void
+MeshToVoxelEdgeData::getEdgeData(
+    Accessor& acc,
+    const Coord& ijk,
+    std::vector<Vec3d>& points,
+    std::vector<Index32>& primitives)
+{
+    // One row per visited voxel, in visiting order: the offset from ijk followed
+    // by flags selecting which of that voxel's x, y and z records are reported.
+    static const int visits[7][6] = {
+        // di dj dk    x  y  z
+        {  0, 0, 0,    1, 1, 1 },
+        {  1, 0, 0,    0, 1, 1 },
+        {  1, 0, 1,    0, 1, 0 },
+        {  0, 0, 1,    1, 1, 0 },
+        {  0, 1, 1,    1, 0, 0 },
+        {  0, 1, 0,    1, 0, 1 },
+        {  1, 1, 0,    0, 0, 1 }
+    };
+
+    EdgeData data;
+
+    for (int v = 0; v < 7; ++v) {
+
+        const int* row = visits[v];
+
+        Coord coord = ijk;
+        coord[0] += row[0];
+        coord[1] += row[1];
+        coord[2] += row[2];
+
+        if (!acc.probeValue(coord, data)) continue;
+
+        const double x = double(coord[0]);
+        const double y = double(coord[1]);
+        const double z = double(coord[2]);
+
+        if (row[3] && data.mXPrim != util::INVALID_IDX) {
+            points.push_back(Vec3d(x + data.mXDist, y, z));
+            primitives.push_back(data.mXPrim);
+        }
+
+        if (row[4] && data.mYPrim != util::INVALID_IDX) {
+            points.push_back(Vec3d(x, y + data.mYDist, z));
+            primitives.push_back(data.mYPrim);
+        }
+
+        if (row[5] && data.mZPrim != util::INVALID_IDX) {
+            points.push_back(Vec3d(x, y, z + data.mZDist));
+            primitives.push_back(data.mZPrim);
+        }
+    }
+}
+
+
+/// Builds a six-quad box mesh from the corners of @a bbox (converted to index
+/// space with @a xform) and converts it with meshToVolume(), passing
+/// @a halfWidth as both band widths.
+template<typename GridType, typename VecType>
+inline typename GridType::Ptr
+createLevelSetBox(const math::BBox<VecType>& bbox,
+    const openvdb::math::Transform& xform,
+    typename VecType::ValueType halfWidth)
+{
+    const Vec3s lo = Vec3s(xform.worldToIndex(bbox.min()));
+    const Vec3s hi = Vec3s(xform.worldToIndex(bbox.max()));
+
+    // Corners 0-3 lie on the y = lo[1] face, corners 4-7 on the y = hi[1] face.
+    Vec3s corners[8] = {
+        Vec3s(lo[0], lo[1], lo[2]),
+        Vec3s(lo[0], lo[1], hi[2]),
+        Vec3s(hi[0], lo[1], hi[2]),
+        Vec3s(hi[0], lo[1], lo[2]),
+        Vec3s(lo[0], hi[1], lo[2]),
+        Vec3s(lo[0], hi[1], hi[2]),
+        Vec3s(hi[0], hi[1], hi[2]),
+        Vec3s(hi[0], hi[1], lo[2])
+    };
+
+    Vec4I quads[6] = {
+        Vec4I(0, 1, 2, 3), // bottom
+        Vec4I(7, 6, 5, 4), // top
+        Vec4I(4, 5, 1, 0), // front
+        Vec4I(6, 7, 3, 2), // back
+        Vec4I(0, 3, 7, 4), // left
+        Vec4I(1, 5, 6, 2)  // right
+    };
+
+    QuadAndTriangleDataAdapter<Vec3s, Vec4I> boxMesh(corners, 8, quads, 6);
+
+    return meshToVolume<GridType>(boxMesh, xform, halfWidth, halfWidth);
+}
+
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_MESH_TO_VOLUME_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/Morphology.h b/nuparu/include/openvdb_new/tools/Morphology.h
new file mode 100644
index 00000000..9fac07cb
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/Morphology.h
@@ -0,0 +1,1097 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file   Morphology.h
+///
+/// @brief  Implementation of morphological dilation and erosion.
+///
+/// @note   By design the morphological operations only change the
+///         state of voxels, not their values. If one desires to
+///         change the values of voxels that change state an efficient
+///         technique is to construct a boolean mask by performing a
+///         topology difference between the original and final grids.
+///
+/// @todo   Extend erosion with 18 and 26 neighbors (coming soon!)
+///
+/// @author Ken Museth
+///
+
+#ifndef OPENVDB_TOOLS_MORPHOLOGY_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_MORPHOLOGY_HAS_BEEN_INCLUDED
+
+#include <tbb/tbb_thread.h>
+#include <tbb/task_scheduler_init.h>
+#include <tbb/enumerable_thread_specific.h>
+#include <tbb/parallel_for.h>
+#include <openvdb/Types.h>
+#include <openvdb/Grid.h>
+#include <openvdb/math/Math.h> // for isApproxEqual()
+#include <openvdb/tree/TreeIterator.h>
+#include <openvdb/tree/ValueAccessor.h>
+#include <openvdb/tree/LeafManager.h>
+#include <boost/scoped_array.hpp>
+#include <boost/bind.hpp>
+#include <boost/utility/enable_if.hpp>
+#include <boost/type_traits/is_same.hpp> 
+#include "Prune.h"// for pruneLevelSet
+#include "ValueTransformer.h" // for foreach()
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {       
+
+/// @brief Voxel topology of nearest neighbors
+/// @details
+/// <dl>
+/// <dt><b>NN_FACE</b>
+/// <dd>face adjacency (6 nearest neighbors, defined as all neighbor
+/// voxels connected along one of the primary axes)
+///
+/// <dt><b>NN_FACE_EDGE</b>
+/// <dd>face and edge adjacency (18 nearest neighbors, defined as all
+/// neighbor voxels connected along either one or two of the primary axes)
+///
+/// <dt><b>NN_FACE_EDGE_VERTEX</b>
+/// <dd>face, edge and vertex adjacency (26 nearest neighbors, defined
+/// as all neighbor voxels connected along either one, two or all
+/// three of the primary axes)
+/// </dl>
+enum NearestNeighbors { NN_FACE = 6, NN_FACE_EDGE = 18, NN_FACE_EDGE_VERTEX = 26 };
+
+/// @brief Different policies when dilating trees with active tiles
+/// @details
+/// <dl>
+/// <dt><b>IGNORE_TILES</b>
+/// <dd>Active tiles are ignored, i.e. only active voxels are dilated.
+///
+/// <dt><b>EXPAND_TILES</b>
+/// <dd>Active tiles are expanded into active voxels and then dilated.
+///
+/// <dt><b>PRESERVE_TILES</b>
+/// <dd>Active tiles remain unchanged but they still contribute to the
+/// dilation as if they were active voxels.
+/// </dl>    
+enum TilePolicy { IGNORE_TILES, EXPAND_TILES, PRESERVE_TILES };
+    
+/// @brief Topologically dilate all active values (i.e. both voxels
+/// and tiles) in a tree using one of three nearest neighbor
+/// connectivity patterns.
+/// @note This method is fully multi-threaded and supports active tiles!
+///
+/// @param tree          tree to be dilated
+/// @param iterations    number of iterations to apply the dilation
+/// @param nn            connectivity pattern of the dilation: either
+///     face-adjacent (6 nearest neighbors), face- and edge-adjacent
+///     (18 nearest neighbors) or face-, edge- and vertex-adjacent (26
+///     nearest neighbors).
+/// @param mode          Defines the policy for handling active tiles
+///                      (see above for details)      
+///
+/// @note The values of any voxels are unchanged.
+template<typename TreeType> OPENVDB_STATIC_SPECIALIZATION
+inline void dilateActiveValues(TreeType& tree,
+                               int iterations = 1,
+                               NearestNeighbors nn = NN_FACE,
+                               TilePolicy mode = PRESERVE_TILES);
+
+/// @brief Topologically dilate all active values (i.e. both voxels
+/// and tiles) in a tree using one of three nearest neighbor
+/// connectivity patterns.
+///
+/// @warning Unlike the method above this one takes a LeafManager,
+/// however (unlike the dilateVoxels method below) it offers no performance
+/// advantage over the one that takes a tree. It is merely included for
+/// API compatibility. The leaf nodes in the manager are updated
+/// after the dilation, which incurs a (very small) overhead.
+///    
+/// @note This method is fully multi-threaded and supports active tiles!
+///
+/// @param manager       Leaf node manager for the tree to be dilated
+/// @param iterations    number of iterations to apply the dilation
+/// @param nn            connectivity pattern of the dilation: either
+///     face-adjacent (6 nearest neighbors), face- and edge-adjacent
+///     (18 nearest neighbors) or face-, edge- and vertex-adjacent (26
+///     nearest neighbors).
+/// @param mode          Defines the policy for handling active tiles
+///                      (see above for details)      
+///
+/// @note The values of any voxels are unchanged.
+template<typename TreeType> OPENVDB_STATIC_SPECIALIZATION
+inline void dilateActiveValues(tree::LeafManager<TreeType>& manager,
+                               int iterations = 1,
+                               NearestNeighbors nn = NN_FACE,
+                               TilePolicy mode = PRESERVE_TILES);
+    
+
+/// @brief Topologically dilate all leaf-level active voxels in a tree
+/// using one of three nearest neighbor connectivity patterns.
+/// @warning This method is NOT multi-threaded and ignores active tiles!    
+///
+/// @param tree          tree to be dilated
+/// @param iterations    number of iterations to apply the dilation
+/// @param nn            connectivity pattern of the dilation: either
+///     face-adjacent (6 nearest neighbors), face- and edge-adjacent
+///     (18 nearest neighbors) or face-, edge- and vertex-adjacent (26
+///     nearest neighbors).
+///
+/// @note The values of any voxels are unchanged.
+template<typename TreeType> OPENVDB_STATIC_SPECIALIZATION
+inline void dilateVoxels(TreeType& tree,
+                         int iterations = 1,
+                         NearestNeighbors nn = NN_FACE);
+
+/// @brief Topologically dilate all leaf-level active voxels in a tree
+/// using one of three nearest neighbor connectivity patterns.
+/// @warning This method is NOT multi-threaded and ignores active tiles!    
+///
+/// @param manager       LeafManager containing the tree to be dilated.
+/// @param iterations    number of iterations to apply the dilation
+/// @param nn           connectivity pattern of the dilation: either
+///     face-adjacent (6 nearest neighbors), face- and edge-adjacent
+///     (18 nearest neighbors) or face-, edge- and vertex-adjacent (26
+///     nearest neighbors).
+///
+/// @note The values of any voxels are unchanged.
+template<typename TreeType> OPENVDB_STATIC_SPECIALIZATION
+inline void dilateVoxels(tree::LeafManager<TreeType>& manager,
+                         int iterations = 1,
+                         NearestNeighbors nn = NN_FACE);
+
+
+//@{
+/// @brief Topologically erode all leaf-level active voxels in the given tree.
+/// @details That is, shrink the set of active voxels by @a iterations voxels
+/// in the +x, -x, +y, -y, +z and -z directions, but don't change the values
+/// of any voxels, only their active states.
+/// @todo Currently operates only on leaf voxels; need to extend to tiles.
+template<typename TreeType> OPENVDB_STATIC_SPECIALIZATION
+inline void erodeVoxels(TreeType& tree,
+                        int iterations=1,
+                        NearestNeighbors nn = NN_FACE);
+
+template<typename TreeType> OPENVDB_STATIC_SPECIALIZATION
+inline void erodeVoxels(tree::LeafManager<TreeType>& manager,
+                        int iterations = 1,
+                        NearestNeighbors nn = NN_FACE);
+//@}
+
+
+/// @brief Mark as active any inactive tiles or voxels in the given grid or tree
+/// whose values are equal to @a value (optionally to within the given @a tolerance).
+template<typename GridOrTree>
+inline void activate(
+    GridOrTree&,
+    const typename GridOrTree::ValueType& value,
+    const typename GridOrTree::ValueType& tolerance = zeroVal<typename GridOrTree::ValueType>()
+);
+
+
+/// @brief Mark as inactive any active tiles or voxels in the given grid or tree
+/// whose values are equal to @a value (optionally to within the given @a tolerance).
+template<typename GridOrTree>
+inline void deactivate(
+    GridOrTree&,
+    const typename GridOrTree::ValueType& value,
+    const typename GridOrTree::ValueType& tolerance = zeroVal<typename GridOrTree::ValueType>()
+);
+
+
+////////////////////////////////////////
+
+
+/// Mapping from a Log2Dim to an unsigned integer type that is 2^Log2Dim bits wide
+template<Index Log2Dim> struct DimToWord {}; // no Type member: other Log2Dim values fail to compile
+template<> struct DimToWord<3> { typedef uint8_t Type; };  //  8-bit word
+template<> struct DimToWord<4> { typedef uint16_t Type; }; // 16-bit word
+template<> struct DimToWord<5> { typedef uint32_t Type; }; // 32-bit word
+template<> struct DimToWord<6> { typedef uint64_t Type; }; // 64-bit word
+
+
+////////////////////////////////////////
+
+
+template<typename TreeType>
+class Morphology
+{
+public:
+    typedef tree::LeafManager<TreeType> ManagerType;
+
+    Morphology(TreeType& tree): // allocates its own LeafManager for tree (released in the destructor)
+        mOwnsManager(true), mManager(new ManagerType(tree)), mAcc(tree), mSteps(1) {}
+    Morphology(ManagerType* mgr): // borrows mgr; it must be non-null because mgr->tree() is called
+        mOwnsManager(false), mManager(mgr), mAcc(mgr->tree()), mSteps(1) {}
+    virtual ~Morphology() { if (mOwnsManager) delete mManager; } // NOTE(review): no copy control; copying an owning instance would double-delete
+
+    /// @brief Face-adjacent dilation pattern
+    void dilateVoxels6();
+    /// @brief Face- and edge-adjacent dilation pattern.
+    void dilateVoxels18();
+    /// @brief Face-, edge- and vertex-adjacent dilation pattern.
+    void dilateVoxels26();
+    void dilateVoxels(int iterations = 1, NearestNeighbors nn = NN_FACE);
+
+    /// @brief Face-adjacent erosion pattern (a single pass).
+    void erodeVoxels6()  { mSteps = 1; this->doErosion(NN_FACE); }
+    /// @brief Face- and edge-adjacent erosion pattern (kernel not implemented: it throws NotImplementedError).
+    void erodeVoxels18() { mSteps = 1; this->doErosion(NN_FACE_EDGE); }
+    /// @brief Face-, edge- and vertex-adjacent erosion pattern (kernel not implemented: it throws NotImplementedError).
+    void erodeVoxels26() { mSteps = 1; this->doErosion(NN_FACE_EDGE_VERTEX); }
+    void erodeVoxels(int iterations = 1, NearestNeighbors nn = NN_FACE)
+    {
+        mSteps = iterations; // pass count read by the loop in doErosion(); values <= 0 run no pass
+        this->doErosion(nn);
+    }
+
+protected:
+
+    void doErosion(NearestNeighbors nn);
+
+    typedef typename TreeType::LeafNodeType LeafType;
+    typedef typename LeafType::NodeMaskType MaskType;
+    typedef tree::ValueAccessor<TreeType>   AccessorType;
+
+    const bool   mOwnsManager;
+    ManagerType* mManager;
+    AccessorType mAcc;
+    int mSteps;
+
+    static const int LEAF_DIM     = LeafType::DIM;
+    static const int LEAF_LOG2DIM = LeafType::LOG2DIM;
+    typedef typename DimToWord<LEAF_LOG2DIM>::Type Word;
+
+    struct Neighbor { // lazily resolved handle to one neighboring leaf node (or tile)
+        LeafType* leaf;//null if a tile
+        bool      init;//true if initialization is required
+        bool      isOn;//true if an active tile (always false while leaf is non-null)
+        Neighbor() : leaf(NULL), init(true), isOn(false) {}
+        inline void clear() { leaf = NULL; init = true; isOn = false; }
+        template<int DX, int DY, int DZ>
+        void scatter(AccessorType& acc, const Coord &xyz, int indx, Word mask)
+        {
+            if (init) { // first use: look up the neighbor offset by (DX,DY,DZ) leaf widths from xyz
+                init = false;
+                Coord orig = xyz.offsetBy(DX*LEAF_DIM, DY*LEAF_DIM, DZ*LEAF_DIM);
+                leaf = acc.probeLeaf(orig);
+                if (leaf==NULL && !(isOn = acc.isValueOn(orig))) leaf = acc.touchLeaf(orig); // record tile state so a later gather() never reads a stale isOn
+            }
+#ifndef _MSC_VER // Visual C++ doesn't guarantee thread-safe initialization of local statics
+            static
+#endif
+            const int N = (LEAF_DIM - 1)*(DY + DX*LEAF_DIM); // word-index shift from a boundary row to the facing row of the neighbor
+            if (leaf) leaf->getValueMask().template getWord<Word>(indx-N) |= mask; // active tiles (leaf == NULL) are left untouched
+        }
+
+        template<int DX, int DY, int DZ>
+        Word gather(AccessorType& acc, const Coord &xyz, int indx)
+        {
+            if (init) { // first use: look up the neighbor, but never create a leaf
+                init = false;
+                Coord orig = xyz.offsetBy(DX*LEAF_DIM, DY*LEAF_DIM, DZ*LEAF_DIM);
+                leaf = acc.probeLeaf(orig);
+                isOn = leaf ? false : acc.isValueOn(orig);
+            }
+#ifndef _MSC_VER // Visual C++ doesn't guarantee thread-safe initialization of local statics
+            static
+#endif
+            const int N = (LEAF_DIM -1 )*(DY + DX*LEAF_DIM); // same index shift as in scatter()
+            return leaf ? leaf->getValueMask().template getWord<Word>(indx-N)
+                : isOn ? ~Word(0) : Word(0); // a tile reads as an all-on or all-off row
+        }
+    };// Neighbor
+
+    struct LeafCache // table of leaf pointers for one center leaf and its neighbors, filled on demand
+    {
+        LeafCache(size_t n, TreeType& tree) : size(n), leafs(new LeafType*[n]), acc(tree)
+        {
+            onTile.setValuesOn(); // onTile stands in for neighbors that are active tiles
+            this->clear();
+        }
+        ~LeafCache() { delete [] leafs; } // NOTE(review): no copy control is declared; a copy would double-delete
+        LeafType*& operator[](int offset) { return leafs[offset]; }
+        inline void clear() { for (size_t i=0; i<size; ++i) leafs[i]=NULL; } // marks every slot unresolved
+        inline void setOrigin(const Coord& xyz) { origin = &xyz; } // stores the address only: xyz must stay alive while used
+        inline void scatter(int n, int indx) // ORs the current mask into word indx of the leaf in slot n
+        {
+            assert(leafs[n]);
+            leafs[n]->getValueMask().template getWord<Word>(indx) |= mask;
+        }
+        template<int DX, int DY, int DZ>
+        inline void scatter(int n, int indx) // as above, for the neighbor (DX,DY,DZ) leaf widths from origin
+        {
+            if (!leafs[n]) { // slot not resolved yet
+                const Coord xyz = origin->offsetBy(DX*LEAF_DIM, DY*LEAF_DIM, DZ*LEAF_DIM);
+                leafs[n] = acc.probeLeaf(xyz);
+                if (!leafs[n]) leafs[n] = acc.isValueOn(xyz) ? &onTile : acc.touchLeaf(xyz); // active tile -> stand-in, otherwise touchLeaf
+            }
+            this->scatter(n, indx - (LEAF_DIM - 1)*(DY + DX*LEAF_DIM)); // shift a boundary row to the facing row of the neighbor
+        }
+        inline Word gather(int n, int indx) // reads word indx from the leaf in slot n
+        {
+            assert(leafs[n]);
+            return leafs[n]->getValueMask().template getWord<Word>(indx);
+        }
+        template<int DX, int DY, int DZ>
+        inline Word gather(int n, int indx) // as above, for the neighbor (DX,DY,DZ) leaf widths from origin
+        {
+            if (!leafs[n]) { // slot not resolved yet; gathering never creates leaf nodes
+                const Coord xyz = origin->offsetBy(DX*LEAF_DIM, DY*LEAF_DIM, DZ*LEAF_DIM);
+                leafs[n] = acc.probeLeaf(xyz);
+                if (!leafs[n]) leafs[n] = acc.isValueOn(xyz) ? &onTile : &offTile; // tiles are read through the stand-ins
+            }
+            return this->gather(n, indx - (LEAF_DIM -1 )*(DY + DX*LEAF_DIM)); // same index shift as the templated scatter
+        }
+        // Scatters in the xy face-directions relative to leaf i1
+        void scatterFacesXY(int x, int y, int i1, int n, int i2);
+
+        // Scatters in the xy edge-directions relative to leaf i1
+        void scatterEdgesXY(int x, int y, int i1, int n, int i2);
+
+        Word gatherFacesXY(int x, int y, int i1, int n, int i2); // AND of the four xy face-neighbor rows
+
+        Word gatherEdgesXY(int x, int y, int i1, int n, int i2); // AND of the four xy diagonal-neighbor rows
+
+        const Coord* origin; // set by setOrigin(); not initialized by the constructor
+        size_t size; // number of slots in leafs
+        LeafType** leafs; // slot table; NULL means not yet resolved
+        LeafType onTile, offTile; // onTile has all values on; offTile is default-constructed (presumably all off -- confirm)
+        AccessorType acc;
+        Word mask; // bits written by scatter(); assigned by the caller before each scatter
+    };// LeafCache
+
+    struct ErodeVoxelsOp { // body functor for tbb::parallel_for; erodes the buffered masks, not the tree
+        typedef tbb::blocked_range<size_t> RangeT;
+        ErodeVoxelsOp(std::vector<MaskType>& masks, ManagerType& manager)
+            : mTask(0), mSavedMasks(masks) , mManager(manager) {} // mTask starts empty; runParallel() binds it
+        void runParallel(NearestNeighbors nn);
+        void operator()(const RangeT& r) const {mTask(const_cast<ErodeVoxelsOp*>(this), r);} // forwards the sub-range to the bound kernel
+        void erode6( const RangeT&) const;
+        void erode18(const RangeT&) const; // only throws NotImplementedError
+        void erode26(const RangeT&) const; // only throws NotImplementedError
+    private:
+        typedef typename boost::function<void (ErodeVoxelsOp*, const RangeT&)> FuncT; // NOTE(review): <boost/function.hpp> is not among this header's direct includes
+        FuncT                  mTask;
+        std::vector<MaskType>& mSavedMasks; // one mask per leaf index; the kernels modify these in place
+        ManagerType&           mManager;
+    };// ErodeVoxelsOp
+
+    struct MaskManager {
+        MaskManager(std::vector<MaskType>& masks, ManagerType& manager)
+            : mMasks(masks) , mManager(manager), mSaveMasks(true) {}
+        // save(): leaf value masks -> buffer.  update(): buffer -> leaf value masks.
+        void save() { mSaveMasks = true; tbb::parallel_for(mManager.getRange(), *this); }
+        void update() { mSaveMasks = false; tbb::parallel_for(mManager.getRange(), *this); }
+        void operator()(const tbb::blocked_range<size_t>& range) const
+        {
+            // The copy direction is the flag that save()/update() set just before launching.
+            const size_t last = range.end();
+            for (size_t pos = range.begin(); pos < last; ++pos) {
+                if (!mSaveMasks) {
+                    mManager.leaf(pos).setValueMask(mMasks[pos]);
+                } else {
+                    mMasks[pos] = mManager.leaf(pos).getValueMask();
+                }
+            }
+        }
+    private:
+        std::vector<MaskType>& mMasks;
+        ManagerType& mManager;
+        bool mSaveMasks;
+    };// MaskManager
+
+    struct UpdateMasks { // parallel_for body that writes buffered value masks back into the leaf nodes
+        UpdateMasks(const std::vector<MaskType>& masks, ManagerType& manager)
+            : mMasks(masks), mManager(manager) {} // keeps references only; nothing is copied here
+        void update() { tbb::parallel_for(mManager.getRange(), *this); }
+        void operator()(const tbb::blocked_range<size_t>& r) const {
+            for (size_t i=r.begin(); i<r.end(); ++i) mManager.leaf(i).setValueMask(mMasks[i]); // mMasks is indexed by leaf position
+        }
+        const std::vector<MaskType>& mMasks;
+        ManagerType& mManager;
+    };
+    struct CopyMasks { // parallel_for body that copies every leaf's value mask into a buffer
+        CopyMasks(std::vector<MaskType>& masks, const ManagerType& manager)
+            : mMasks(masks), mManager(manager) {} // masks is indexed up to the manager's range, so it must already be that large
+        void copy() { tbb::parallel_for(mManager.getRange(), *this); }
+        void operator()(const tbb::blocked_range<size_t>& r) const {
+            for (size_t i=r.begin(); i<r.end(); ++i) mMasks[i]=mManager.leaf(i).getValueMask(); // mMasks is indexed by leaf position
+        }
+        std::vector<MaskType>& mMasks;
+        const ManagerType& mManager;
+    };
+    void copyMasks(std::vector<MaskType>& a, const ManagerType& b) {CopyMasks c(a, b); c.copy();} // a[i] = value mask of b.leaf(i)
+};// Morphology
+
+
+template<typename TreeType>
+inline void
+Morphology<TreeType>::dilateVoxels(int iterations, NearestNeighbors nn)
+{
+    // Resolve the single-pass routine once; any value other than the two below means NN_FACE.
+    void (Morphology::*step)() = &Morphology::dilateVoxels6;
+    if (nn == NN_FACE_EDGE) {
+        step = &Morphology::dilateVoxels18;
+    } else if (nn == NN_FACE_EDGE_VERTEX) {
+        step = &Morphology::dilateVoxels26;
+    }
+
+    // Run the selected pass 'iterations' times (not at all when iterations <= 0).
+    for (int pass = 0; pass < iterations; ++pass) {
+        (this->*step)();
+    }
+}
+
+
+template<typename TreeType>
+inline void
+Morphology<TreeType>::dilateVoxels6()
+{
+    /// @todo Currently operates only on leaf voxels; need to extend to tiles.
+    const int leafCount = static_cast<int>(mManager->leafCount());
+
+    // Save the value masks of all leaf nodes; the loop reads these copies while writing to the tree.
+    std::vector<MaskType> savedMasks(leafCount);
+    this->copyMasks(savedMasks, *mManager);
+    LeafCache cache(7, mManager->tree()); // slots: 0 center, 1 -z, 2 +z, 3 -x, 4 +x, 5 -y, 6 +y
+    for (int leafIdx = 0; leafIdx < leafCount; ++leafIdx) {
+        const MaskType& oldMask = savedMasks[leafIdx];//original bit-mask of current leaf node
+        cache[0] = &mManager->leaf(leafIdx);
+        cache.setOrigin(cache[0]->origin());
+        for (int x = 0; x < LEAF_DIM; ++x ) {
+            for (int y = 0, n = (x << LEAF_LOG2DIM); y < LEAF_DIM; ++y, ++n) { // n = x*LEAF_DIM + y
+                // Extract the portion of the original mask that corresponds to a row in z.
+                if (const Word w = oldMask.template getWord<Word>(n)) {
+
+                    // Dilate the current leaf in the +z and -z direction
+                    cache.mask = Word(w | (w>>1) | (w<<1)); cache.scatter(0, n);
+
+                    // Dilate into neighbor leaf in the -z direction (lowest bit of w becomes the highest bit)
+                    if ( (cache.mask = Word(w<<(LEAF_DIM-1))) ) {
+                        cache.template scatter< 0, 0,-1>(1, n);
+                    }
+                    // Dilate into neighbor leaf in the +z direction (highest bit of w becomes the lowest bit)
+                    if ( (cache.mask = Word(w>>(LEAF_DIM-1))) ) {
+                        cache.template scatter< 0, 0, 1>(2, n);
+                    }
+                    // Dilate in the xy-face directions relative to the center leaf (mask reset to the plain row)
+                    cache.mask = w; cache.scatterFacesXY(x, y, 0, n, 3);
+                }
+            }// loop over y
+        }//loop over x
+        cache.clear(); // neighbor slots belong to this leaf only
+    }//loop over leafs
+
+    mManager->rebuildLeafArray(); // scattering may have added leaf nodes via touchLeaf()
+}//dilateVoxels6
+
+
+template<typename TreeType>
+inline void
+Morphology<TreeType>::dilateVoxels18()
+{
+    /// @todo Currently operates only on leaf voxels; need to extend to tiles.
+    const int leafCount = static_cast<int>(mManager->leafCount());
+
+    // Save the value masks of all leaf nodes; the loop reads these copies while writing to the tree.
+    std::vector<MaskType> savedMasks(leafCount);
+    this->copyMasks(savedMasks, *mManager);
+    LeafCache cache(19, mManager->tree()); // slots: 0 center, 1 -z, 2 +z, 3-10 xy neighbors of center, 11-14 / 15-18 xy faces of the -z / +z leaf
+    Coord orig_mz, orig_pz;//origins of neighbor leaf nodes in the -z and +z directions
+    for (int leafIdx = 0; leafIdx < leafCount; ++leafIdx) {
+        const MaskType& oldMask = savedMasks[leafIdx];//original bit-mask of current leaf node
+        cache[0] = &mManager->leaf(leafIdx);
+        orig_mz = cache[0]->origin().offsetBy(0, 0, -LEAF_DIM);
+        orig_pz = cache[0]->origin().offsetBy(0, 0,  LEAF_DIM);
+        for (int x = 0; x < LEAF_DIM; ++x ) {
+            for (int y = 0, n = (x << LEAF_LOG2DIM); y < LEAF_DIM; ++y, ++n) { // n = x*LEAF_DIM + y
+                if (const Word w = oldMask.template getWord<Word>(n)) {
+                    {
+                        cache.mask = Word(w | (w>>1) | (w<<1)); // row dilated along z
+                        cache.setOrigin(cache[0]->origin());
+                        cache.scatter(0, n);
+                        cache.scatterFacesXY(x, y, 0, n, 3); // z-dilated row into the xy faces: covers the xz and yz edges
+                        cache.mask = w; // plain row for the xy diagonals, so no vertex neighbors are set
+                        cache.scatterEdgesXY(x, y, 0, n, 3);
+                    }
+                    if ( (cache.mask = Word(w<<(LEAF_DIM-1))) ) { // bits that spill into the -z leaf
+                        cache.setOrigin(cache[0]->origin());
+                        cache.template scatter< 0, 0,-1>(1, n);
+                        cache.setOrigin(orig_mz); // neighbor offsets below are taken relative to the -z leaf
+                        cache.scatterFacesXY(x, y, 1, n, 11);
+                    }
+                    if ( (cache.mask = Word(w>>(LEAF_DIM-1))) ) { // bits that spill into the +z leaf
+                        cache.setOrigin(cache[0]->origin());
+                        cache.template scatter< 0, 0, 1>(2, n);
+                        cache.setOrigin(orig_pz); // neighbor offsets below are taken relative to the +z leaf
+                        cache.scatterFacesXY(x, y, 2, n, 15);
+                    }
+                }
+            }// loop over y
+        }//loop over x
+        cache.clear(); // neighbor slots belong to this leaf only
+    }//loop over leafs
+
+    mManager->rebuildLeafArray(); // scattering may have added leaf nodes via touchLeaf()
+}// dilateVoxels18
+
+
+template<typename TreeType>
+inline void
+Morphology<TreeType>::dilateVoxels26()
+{
+    const int leafCount = static_cast<int>(mManager->leafCount());
+
+    // Save the value masks of all leaf nodes; the loop reads these copies while writing to the tree.
+    std::vector<MaskType> savedMasks(leafCount);
+    this->copyMasks(savedMasks, *mManager);
+    LeafCache cache(27, mManager->tree()); // slots: 0 center, 1 -z, 2 +z, 3-10 / 11-18 / 19-26 xy neighbors of the center / -z / +z leaf
+    Coord orig_mz, orig_pz;//origins of neighbor leaf nodes in the -z and +z directions
+    for (int leafIdx = 0; leafIdx < leafCount; ++leafIdx) {
+        const MaskType& oldMask = savedMasks[leafIdx];//original bit-mask of current leaf node
+        cache[0] = &mManager->leaf(leafIdx);
+        orig_mz = cache[0]->origin().offsetBy(0, 0, -LEAF_DIM);
+        orig_pz = cache[0]->origin().offsetBy(0, 0,  LEAF_DIM);
+        for (int x = 0; x < LEAF_DIM; ++x ) {
+            for (int y = 0, n = (x << LEAF_LOG2DIM); y < LEAF_DIM; ++y, ++n) { // n = x*LEAF_DIM + y
+                if (const Word w = oldMask.template getWord<Word>(n)) {
+                    {
+                        cache.mask = Word(w | (w>>1) | (w<<1)); // row dilated along z, used for faces and diagonals alike
+                        cache.setOrigin(cache[0]->origin());
+                        cache.scatter(0, n);
+                        cache.scatterFacesXY(x, y, 0, n, 3);
+                        cache.scatterEdgesXY(x, y, 0, n, 3); // z-dilated row into the xy diagonals: covers the vertex neighbors
+                    }
+                    if ( (cache.mask = Word(w<<(LEAF_DIM-1))) ) { // bits that spill into the -z leaf
+                        cache.setOrigin(cache[0]->origin());
+                        cache.template scatter< 0, 0,-1>(1, n);
+                        cache.setOrigin(orig_mz); // neighbor offsets below are taken relative to the -z leaf
+                        cache.scatterFacesXY(x, y, 1, n, 11);
+                        cache.scatterEdgesXY(x, y, 1, n, 11);
+                    }
+                    if ( (cache.mask = Word(w>>(LEAF_DIM-1))) ) { // bits that spill into the +z leaf
+                        cache.setOrigin(cache[0]->origin());
+                        cache.template scatter< 0, 0, 1>(2, n);
+                        cache.setOrigin(orig_pz); // neighbor offsets below are taken relative to the +z leaf
+                        cache.scatterFacesXY(x, y, 2, n, 19);
+                        cache.scatterEdgesXY(x, y, 2, n, 19);
+                    }
+                }
+            }// loop over y
+        }//loop over x
+        cache.clear(); // neighbor slots belong to this leaf only
+    }//loop over leafs
+
+    mManager->rebuildLeafArray(); // scattering may have added leaf nodes via touchLeaf()
+}// dilateVoxels26
+
+
+template<typename TreeType>
+inline void
+Morphology<TreeType>::LeafCache::scatterFacesXY(int x, int y, int i1, int n, int i2)
+{
+    // -x: a row on the low-x boundary goes to the neighbor in slot i2, any other row stays in slot i1
+    if (x <= 0) {
+        this->template scatter<-1, 0, 0>(i2, n);
+    } else {
+        this->scatter(i1, n-LEAF_DIM);
+    }
+    // +x: a row on the high-x boundary goes to the neighbor in slot i2+1
+    if (x >= LEAF_DIM-1) {
+        this->template scatter< 1, 0, 0>(i2+1, n);
+    } else {
+        this->scatter(i1, n+LEAF_DIM);
+    }
+    // -y: a row on the low-y boundary goes to the neighbor in slot i2+2
+    if (y <= 0) {
+        this->template scatter< 0,-1, 0>(i2+2, n);
+    } else {
+        this->scatter(i1, n-1);
+    }
+    // +y: a row on the high-y boundary goes to the neighbor in slot i2+3
+    if (y >= LEAF_DIM-1) {
+        this->template scatter< 0, 1, 0>(i2+3, n);
+    } else {
+        this->scatter(i1, n+1);
+    }
+}
+
+
+template<typename TreeType>
+inline void
+Morphology<TreeType>::LeafCache::scatterEdgesXY(int x, int y, int i1, int n, int i2)
+{
+    if (x > 0) { // the two diagonals on the -x side, row x-1 still inside leaf i1
+        if (y > 0) {
+            this->scatter(i1, n-LEAF_DIM-1); // (x-1, y-1) in the same leaf
+        } else {
+            this->template scatter< 0,-1, 0>(i2+2, n-LEAF_DIM); // crosses only the -y boundary
+        }
+        if (y < LEAF_DIM-1) {
+            this->scatter(i1, n-LEAF_DIM+1); // (x-1, y+1) in the same leaf
+        } else {
+            this->template scatter< 0, 1, 0>(i2+3, n-LEAF_DIM); // crosses only the +y boundary
+        }
+    } else { // x is on the low-x boundary: both -x diagonals land in other leafs
+        if (y < LEAF_DIM-1) {
+            this->template scatter<-1, 0, 0>(i2  , n+1); // crosses only the -x boundary
+        } else {
+            this->template scatter<-1, 1, 0>(i2+7, n  ); // corner: the (-x,+y) diagonal leaf
+        }
+        if (y > 0) {
+            this->template scatter<-1, 0, 0>(i2  , n-1); // crosses only the -x boundary
+        } else {
+            this->template scatter<-1,-1, 0>(i2+4, n  ); // corner: the (-x,-y) diagonal leaf
+        }
+    }
+    if (x < LEAF_DIM-1) { // the two diagonals on the +x side, row x+1 still inside leaf i1
+        if (y > 0) {
+            this->scatter(i1, n+LEAF_DIM-1); // (x+1, y-1) in the same leaf
+        } else {
+            this->template scatter< 0,-1, 0>(i2+2, n+LEAF_DIM); // crosses only the -y boundary
+        }
+        if (y < LEAF_DIM-1) {
+            this->scatter(i1, n+LEAF_DIM+1); // (x+1, y+1) in the same leaf
+        } else {
+            this->template scatter< 0, 1, 0>(i2+3, n+LEAF_DIM); // crosses only the +y boundary
+        }
+    } else { // x is on the high-x boundary: both +x diagonals land in other leafs
+        if (y > 0) {
+            this->template scatter< 1, 0, 0>(i2+1, n-1); // crosses only the +x boundary
+        } else {
+            this->template scatter< 1,-1, 0>(i2+6, n  ); // corner: the (+x,-y) diagonal leaf
+        }
+        if (y < LEAF_DIM-1) {
+            this->template scatter< 1, 0, 0>(i2+1, n+1); // crosses only the +x boundary
+        } else {
+            this->template scatter< 1, 1, 0>(i2+5, n  ); // corner: the (+x,+y) diagonal leaf
+        }
+    }
+}
+
+
+template<typename TreeType>
+inline void
+Morphology<TreeType>::ErodeVoxelsOp::runParallel(NearestNeighbors nn)
+{
+    // Bind the kernel for the requested connectivity; anything that is not
+    // NN_FACE_EDGE or NN_FACE_EDGE_VERTEX selects the face-adjacent kernel.
+    if (nn == NN_FACE_EDGE) {
+        mTask = boost::bind(&ErodeVoxelsOp::erode18, _1, _2);
+    } else if (nn == NN_FACE_EDGE_VERTEX) {
+        mTask = boost::bind(&ErodeVoxelsOp::erode26, _1, _2);
+    } else {
+        mTask = boost::bind(&ErodeVoxelsOp::erode6, _1, _2);
+    }
+    // operator() hands each sub-range of the manager's leaf range to mTask.
+    tbb::parallel_for(mManager.getRange(), *this);
+}
+
+
+template<typename TreeType>
+inline typename Morphology<TreeType>::Word
+Morphology<TreeType>::LeafCache::gatherFacesXY(int x, int y, int i1, int n, int i2)
+{
+    // Fetch the z-row next to row n in each of -x, +x, -y and +y, in that order.
+    // A row inside the leaf is read from slot i1; a row across the leaf boundary
+    // is read from the matching neighbor slot (i2 .. i2+3), resolved on first use.
+    const int last = LEAF_DIM - 1;
+
+    const Word negX = (x > 0)    ? this->gather(i1, n-LEAF_DIM) : this->template gather<-1, 0, 0>(i2,   n);
+    const Word posX = (x < last) ? this->gather(i1, n+LEAF_DIM) : this->template gather< 1, 0, 0>(i2+1, n);
+    const Word negY = (y > 0)    ? this->gather(i1, n-1)        : this->template gather< 0,-1, 0>(i2+2, n);
+    const Word posY = (y < last) ? this->gather(i1, n+1)        : this->template gather< 0, 1, 0>(i2+3, n);
+
+    // A result bit is set only when it is set in all four rows; the Word cast
+    // undoes the integer promotion performed by operator&.
+    return Word(negX & posX & negY & posY);
+}
+
+
+template<typename TreeType>
+inline typename Morphology<TreeType>::Word
+Morphology<TreeType>::LeafCache::gatherEdgesXY(int x, int y, int i1, int n, int i2)
+{
+    Word w = ~Word(0); // start all-on, then AND in the four xy diagonal-neighbor rows
+
+    if (x > 0) { // -x diagonals, row x-1 still inside leaf i1
+        w &= y > 0 ?          this->gather(i1, n-LEAF_DIM-1) :
+                              this->template gather< 0,-1, 0>(i2+2, n-LEAF_DIM);
+        w &= y < LEAF_DIM-1 ? this->gather(i1, n-LEAF_DIM+1) :
+                              this->template gather< 0, 1, 0>(i2+3, n-LEAF_DIM);
+    } else { // x on the low-x boundary: read from the -x leaf or a corner leaf
+        w &= y < LEAF_DIM-1 ? this->template gather<-1, 0, 0>(i2  , n+1):
+                              this->template gather<-1, 1, 0>(i2+7, n  );
+        w &= y > 0 ?          this->template gather<-1, 0, 0>(i2  , n-1):
+                              this->template gather<-1,-1, 0>(i2+4, n  );
+    }
+    if (x < LEAF_DIM-1) { // +x diagonals, row x+1 still inside leaf i1
+        w &= y > 0 ?          this->gather(i1, n+LEAF_DIM-1) :
+                              this->template gather< 0,-1, 0>(i2+2, n+LEAF_DIM);
+        w &= y < LEAF_DIM-1 ? this->gather(i1, n+LEAF_DIM+1) :
+                              this->template gather< 0, 1, 0>(i2+3, n+LEAF_DIM);
+    } else { // x on the high-x boundary: read from the +x leaf or a corner leaf
+        w &= y > 0          ? this->template gather< 1, 0, 0>(i2+1, n-1):
+                              this->template gather< 1,-1, 0>(i2+6, n  );
+        w &= y < LEAF_DIM-1 ? this->template gather< 1, 0, 0>(i2+1, n+1):
+                              this->template gather< 1, 1, 0>(i2+5, n  );
+    }
+
+    return w; // bit set only where all four diagonal rows have it set
+}
+
+
+template <typename TreeType>
+inline void
+Morphology<TreeType>::ErodeVoxelsOp::erode6(const RangeT& range) const
+{
+    LeafCache cache(7, mManager.tree()); // slots: 0 center, 1 -z, 2 +z, 3 -x, 4 +x, 5 -y, 6 +y
+    for (size_t leafIdx = range.begin(); leafIdx < range.end(); ++leafIdx) {
+        cache[0] = &mManager.leaf(leafIdx);
+        if (cache[0]->isEmpty()) continue; // nothing to erode; neighbor slots are still clear at this point
+        cache.setOrigin(cache[0]->origin());
+        MaskType& newMask = mSavedMasks[leafIdx];//buffered mask of the current leaf, eroded in place
+        for (int x = 0; x < LEAF_DIM; ++x ) {
+            for (int y = 0, n = (x << LEAF_LOG2DIM); y < LEAF_DIM; ++y, ++n) { // n = x*LEAF_DIM + y
+                // Extract the row in z from the buffered mask; neighbor rows are read from the tree's leafs.
+                if (Word& w = newMask.template getWord<Word>(n)) {
+
+                    // erode in two z directions (this is first since it uses the original w)
+                    w = Word(w &
+                        (Word(w<<1 | (cache.template gather<0,0,-1>(1, n)>>(LEAF_DIM-1))) & // -z: bit 0 comes from the top bit of the -z leaf
+                         Word(w>>1 | (cache.template gather<0,0, 1>(2, n)<<(LEAF_DIM-1))))); // +z: top bit comes from bit 0 of the +z leaf
+
+                    w = Word(w & cache.gatherFacesXY(x, y, 0, n, 3)); // keep bits whose four xy face neighbors are on
+                }
+            }// loop over y
+        }//loop over x
+        cache.clear(); // neighbor slots belong to this leaf only
+    }//loop over leafs
+}
+
+
+/// @brief Placeholder for the erode18 scheme, which is not implemented.
+/// @throw NotImplementedError unconditionally; the range argument is ignored.
+template <typename TreeType>
+inline void
+Morphology<TreeType>::ErodeVoxelsOp::erode18(const RangeT&) const
+{
+    OPENVDB_THROW(NotImplementedError, "tools::erode18 is not implemented yet!");
+}
+
+
+/// @brief Placeholder for the erode26 scheme, which is not implemented.
+/// @throw NotImplementedError unconditionally; the range argument is ignored.
+template <typename TreeType>
+inline void
+Morphology<TreeType>::ErodeVoxelsOp::erode26(const RangeT&) const
+{
+    OPENVDB_THROW(NotImplementedError, "tools::erode26 is not implemented yet!");
+}
+
+
+/// @brief Run mSteps erosion passes over the leaf nodes of mManager using
+/// the neighbor scheme @a nn, then prune the tree as a level set.
+template<typename TreeType>
+inline void
+Morphology<TreeType>::doErosion(NearestNeighbors nn)
+{
+    /// @todo Currently operates only on leaf voxels; need to extend to tiles.
+
+    // Snapshot of every leaf node's value mask; one entry per leaf.
+    std::vector<MaskType> maskSnapshot(mManager->leafCount());
+    this->copyMasks(maskSnapshot, *mManager);
+
+    // Both operators work on the same snapshot and leaf manager.
+    UpdateMasks maskUpdater(maskSnapshot, *mManager);
+    ErodeVoxelsOp erosionPass(maskSnapshot, *mManager);
+
+    for (int pass = 0; pass < mSteps; ++pass) {
+        erosionPass.runParallel(nn);
+        maskUpdater.update();
+    }
+
+    tools::pruneLevelSet(mManager->tree());
+}
+
+
+////////////////////////////////////////
+
+
+/// @brief Dilate the active voxels managed by @a manager @a iterations times
+/// using the neighbor scheme @a nn. Does nothing unless @a iterations > 0.
+template<typename TreeType>
+OPENVDB_STATIC_SPECIALIZATION inline void
+dilateVoxels(tree::LeafManager<TreeType>& manager, int iterations, NearestNeighbors nn)
+{
+    if (iterations <= 0) return;
+
+    Morphology<TreeType> morph(&manager);
+    morph.dilateVoxels(iterations, nn);
+}
+
+/// @brief Dilate the active voxels of @a tree @a iterations times using the
+/// neighbor scheme @a nn. Does nothing unless @a iterations > 0.
+template<typename TreeType>
+OPENVDB_STATIC_SPECIALIZATION inline void
+dilateVoxels(TreeType& tree, int iterations, NearestNeighbors nn)
+{
+    if (iterations <= 0) return;
+
+    Morphology<TreeType> morph(tree);
+    morph.dilateVoxels(iterations, nn);
+}
+
+/// @brief Erode the active voxels managed by @a manager @a iterations times
+/// using the neighbor scheme @a nn. Does nothing unless @a iterations > 0.
+template<typename TreeType>
+OPENVDB_STATIC_SPECIALIZATION inline void
+erodeVoxels(tree::LeafManager<TreeType>& manager, int iterations, NearestNeighbors nn)
+{
+    if (iterations <= 0) return;
+
+    Morphology<TreeType> morph(&manager);
+    morph.erodeVoxels(iterations, nn);
+}
+
+/// @brief Erode the active voxels of @a tree @a iterations times using the
+/// neighbor scheme @a nn. Does nothing unless @a iterations > 0.
+template<typename TreeType>
+OPENVDB_STATIC_SPECIALIZATION inline void
+erodeVoxels(TreeType& tree, int iterations, NearestNeighbors nn)
+{
+    if (iterations <= 0) return;
+
+    Morphology<TreeType> morph(tree);
+    morph.erodeVoxels(iterations, nn);
+}
+
+
+////////////////////////////////////////
+
+
+namespace activation {
+
+/// @brief Functor that flips the active state of values that are equal,
+/// within a tolerance, to a given value.
+/// @details The value-iterator overloads always act in one direction
+/// (on-iterator: turn off, off-iterator: turn on); only the leaf-iterator
+/// overload consults the activate/deactivate flag given at construction.
+template<typename TreeType>
+class ActivationOp
+{
+public:
+    typedef typename TreeType::ValueType ValueT;
+
+    /// @param state  true to activate matching values, false to deactivate them
+    /// @param val    value to compare against
+    /// @param tol    tolerance passed to math::isApproxEqual
+    ActivationOp(bool state, const ValueT& val, const ValueT& tol)
+        : mActivate(state), mValue(val), mTolerance(tol)
+    {
+    }
+
+    /// Turn off the active value referenced by @a it if it matches.
+    void operator()(const typename TreeType::ValueOnIter& it) const
+    {
+        if (this->isMatch(*it)) it.setValueOff();
+    }
+
+    /// Turn on the inactive value referenced by @a it if it matches.
+    void operator()(const typename TreeType::ValueOffIter& it) const
+    {
+        if (this->isMatch(*it)) it.setActiveState(/*on=*/true);
+    }
+
+    /// Visit one leaf node and flip every matching voxel in the direction
+    /// selected at construction.
+    void operator()(const typename TreeType::LeafIter& lit) const
+    {
+        typedef typename TreeType::LeafNodeType LeafT;
+        LeafT& node = *lit;
+
+        if (!mActivate) {
+            // Deactivation: scan active voxels and switch matches off.
+            for (typename LeafT::ValueOnIter v = node.beginValueOn(); v; ++v) {
+                if (this->isMatch(*v)) node.setValueOff(v.pos());
+            }
+            return;
+        }
+
+        // Activation: scan inactive voxels and switch matches on.
+        for (typename LeafT::ValueOffIter v = node.beginValueOff(); v; ++v) {
+            if (this->isMatch(*v)) node.setValueOn(v.pos());
+        }
+    }
+
+private:
+    /// Return true if @a v equals the target value within the tolerance.
+    bool isMatch(const ValueT& v) const
+    {
+        return math::isApproxEqual(v, mValue, mTolerance);
+    }
+
+    bool mActivate;
+    const ValueT mValue, mTolerance;
+}; // class ActivationOp
+
+} // namespace activation
+
+
+/// @brief Mark as active every inactive value of @a gridOrTree that equals
+/// @a value to within @a tolerance.
+template<typename GridOrTree>
+inline void
+activate(GridOrTree& gridOrTree, const typename GridOrTree::ValueType& value,
+    const typename GridOrTree::ValueType& tolerance)
+{
+    typedef typename TreeAdapter<GridOrTree>::TreeType TreeT;
+
+    TreeT& targetTree = TreeAdapter<GridOrTree>::tree(gridOrTree);
+    activation::ActivationOp<TreeT> activator(/*activate=*/true, value, tolerance);
+
+    // Leaf nodes first; these are processed in parallel.
+    foreach(targetTree.beginLeaf(), activator);
+
+    // The remaining inactive values are visited serially, because changing
+    // active states is not thread-safe unless no two threads modify the
+    // same node. The depth limit keeps the iterator above the leaf level.
+    typename TreeT::ValueOffIter offIter = targetTree.beginValueOff();
+    offIter.setMaxDepth(targetTree.treeDepth() - 2);
+    foreach(offIter, activator, /*threaded=*/false);
+}
+
+
+/// @brief Mark as inactive every active value of @a gridOrTree that equals
+/// @a value to within @a tolerance.
+template<typename GridOrTree>
+inline void
+deactivate(GridOrTree& gridOrTree, const typename GridOrTree::ValueType& value,
+    const typename GridOrTree::ValueType& tolerance)
+{
+    typedef typename TreeAdapter<GridOrTree>::TreeType TreeT;
+
+    TreeT& targetTree = TreeAdapter<GridOrTree>::tree(gridOrTree);
+    activation::ActivationOp<TreeT> deactivator(/*activate=*/false, value, tolerance);
+
+    // Leaf nodes first; these are processed in parallel.
+    foreach(targetTree.beginLeaf(), deactivator);
+
+    // The remaining active values are visited serially, because changing
+    // active states is not thread-safe unless no two threads modify the
+    // same node. The depth limit keeps the iterator above the leaf level.
+    typename TreeT::ValueOnIter onIter = targetTree.beginValueOn();
+    onIter.setMaxDepth(targetTree.treeDepth() - 2);
+    foreach(onIter, deactivator, /*threaded=*/false);
+}
+
+/// @brief Class that performs multi-threaded dilation with support for active tiles.
+/// @warning Don't use this class directly; instead call the function dilateActiveValues!
+///
+/// @details All work is done by the constructor: it builds a mask tree from
+/// the input, flattens that mask into an array of leaf pointers, dilates
+/// sub-ranges of the array via tbb::parallel_for into thread-local mask
+/// trees, merges those masks and finally applies the result to the input
+/// tree with topologyUnion.
+template<typename TreeT>
+class DilationOp
+{
+    typedef typename TreeT::template ValueConverter<ValueMask>::Type MaskT;
+    typedef tbb::enumerable_thread_specific<MaskT>                   PoolT;
+    typedef typename MaskT::LeafNodeType                             LeafT;
+
+    // Very light-weight member data
+    const int mIter;// number of iterations
+    const tools::NearestNeighbors mNN;//enum to specify the dilation scheme
+    PoolT  *mPool;// pointer to the thread-local pool of mask trees
+    LeafT **mLeafs;// raw array of pointers to leaf nodes
+
+public:
+    
+    /// @brief Dilate the active values of @a tree in place.
+    /// @param tree       tree whose active topology is dilated
+    /// @param iterations number of dilation iterations
+    /// @param nn         neighbor scheme forwarded to tools::dilateVoxels
+    /// @param mode       tile policy; controls whether active tiles are
+    ///                   voxelized up front and whether the mask is pruned
+    ///
+    /// NOTE(review): mPool points at a local of this constructor and mLeafs is
+    /// deleted without being reset, so both members dangle once the
+    /// constructor returns. If tbb::parallel_for throws, the delete [] below
+    /// is skipped.
+    DilationOp(TreeT &tree, int iterations, NearestNeighbors nn, TilePolicy mode)
+        : mIter(iterations), mNN(nn), mPool(NULL), mLeafs(NULL)
+    {
+        // init() allocates and fills mLeafs and returns the number of entries.
+        const size_t numLeafs = this->init( tree, mode );
+        const size_t numThreads = size_t(tbb::task_scheduler_init::default_num_threads());
+        // Grain size of numLeafs/(2*numThreads), but never less than one.
+        const size_t grainSize = math::Max(size_t(1), numLeafs/(2*numThreads));
+
+        MaskT mask;
+        PoolT pool(mask);// Scoped thread-local storage of mask trees
+        mPool = &pool;
+
+        // *this is the loop body; operator() below handles each sub-range.
+        tbb::parallel_for(tbb::blocked_range<LeafT**>(mLeafs, mLeafs+numLeafs, grainSize), *this);
+        
+        delete [] mLeafs;// no more need for the array of leaf node pointers
+        
+        // Collect the per-thread results into the single mask tree.
+        typedef typename PoolT::iterator IterT;
+        for (IterT it=pool.begin(); it!=pool.end(); ++it) mask.merge(*it);// fast stealing
+
+        if (mode == PRESERVE_TILES) tools::prune(mask);//multithreaded
+
+        tree.topologyUnion(mask);//multithreaded
+    }
+
+    // This is required by tbb and should never be called directly
+    // It adds the leaf nodes of range r to a temporary mask tree, dilates
+    // that tree and merges it into the calling thread's entry of the pool.
+    void operator()(const tbb::blocked_range<LeafT**> &r) const
+    {
+        MaskT mask;// thread-local temporary mask tree
+        for (LeafT** it=r.begin(); it!=r.end(); ++it) mask.addLeaf( **it );
+        tree::LeafManager<MaskT> manager(mask, r.begin(), r.end());
+        tools::dilateVoxels(manager, mIter, mNN);// serial dilation of active voxels
+        mPool->local().merge(mask, MERGE_ACTIVE_STATES);
+    }
+private:
+
+    // Simple wrapper of a raw double-pointer to mimic a std container
+    // push_back performs no bounds checking; the caller must pre-size the array.
+    struct MyArray {
+        typedef LeafT* value_type;//required by Tree::stealNodes
+        value_type* ptr;
+        MyArray(value_type* array) : ptr(array) {}
+        void push_back(value_type leaf) { *ptr++ = leaf; }//required by Tree::stealNodes
+    };
+
+    // Convert active tiles to leafs and de-construct the tree into a linear array of leafs.
+    // Allocates mLeafs with new[] and returns the number of leaf pointers stored in it.
+    size_t linearize(MaskT& mask, TilePolicy mode)
+    {
+        if (mode != IGNORE_TILES) mask.voxelizeActiveTiles();// light-weight since this is a mask tree
+        const size_t numLeafs = mask.leafCount();
+        mLeafs = new LeafT*[numLeafs];// fast pre-allocation
+        MyArray tmp(mLeafs);
+        mask.stealNodes(tmp);// serializes the mask tree and leaves it empty
+        return numLeafs;
+    }
+    
+    // Overload selected when the input tree already is a mask tree (T == MaskT):
+    // the input itself is linearized, i.e. its leaf nodes are stolen.
+    template <typename T>
+    typename boost::enable_if<boost::is_same<T,MaskT>,size_t>::type init(T& tree, TilePolicy mode)
+    {
+        return this->linearize(tree, mode);
+    }
+    
+    // Overload selected for every other tree type: a mask tree is built from
+    // the input by topology copy and that temporary mask is linearized.
+    template <typename T>
+    typename boost::disable_if<boost::is_same<T,MaskT>,size_t>::type init(const T& tree, TilePolicy mode)
+    {
+        MaskT mask(tree, false, true, TopologyCopy());
+        return this->linearize(mask, mode);
+    }
+    
+};// DilationOp
+
+/// @brief Dilate the active values of @a tree @a iterations times.
+/// Does nothing unless @a iterations > 0.
+template<typename TreeType>
+OPENVDB_STATIC_SPECIALIZATION inline void
+dilateActiveValues(TreeType& tree, int iterations, NearestNeighbors nn, TilePolicy mode)
+{
+    if (iterations <= 0) return;
+
+    // Constructing the operator performs the whole dilation.
+    DilationOp<TreeType> op(tree, iterations, nn, mode);
+}
+
+/// @brief Dilate the active values of the tree behind @a manager
+/// @a iterations times, then rebuild the manager's leaf array.
+/// Does nothing unless @a iterations > 0.
+template<typename TreeType>
+OPENVDB_STATIC_SPECIALIZATION inline void
+dilateActiveValues(tree::LeafManager<TreeType>& manager,
+                   int iterations,
+                   NearestNeighbors nn,
+                   TilePolicy mode)
+{
+    if (iterations <= 0) return;
+
+    // Constructing the operator performs the whole dilation.
+    DilationOp<TreeType> op(manager.tree(), iterations, nn, mode);
+    manager.rebuildLeafArray();
+}
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_MORPHOLOGY_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/MultiResGrid.h b/nuparu/include/openvdb_new/tools/MultiResGrid.h
new file mode 100644
index 00000000..09f2c70a
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/MultiResGrid.h
@@ -0,0 +1,968 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////
+///
+/// @file MultiResGrid.h
+///
+/// @author Ken Museth
+///
+/// @warning This class is fairly new and as such has not seen a lot of
+/// use in production. Please report any issues or request for new
+/// features directly to ken.museth@dreamworks.com.
+///
+/// @brief Multi-resolution grid that contains LoD sequences of trees
+/// with powers of two refinements.
+///
+/// @note While this class can arguably be used to implement a sparse
+/// Multi-Grid solver it is currently intended as a means to
+/// efficiently compute LoD levels for applications like rendering
+///
+/// @note Prolongation means interpolation from coarse -> fine
+/// @note Restriction means interpolation (or remapping) from fine -> coarse
+///
+
+#ifndef OPENVDB_TOOLS_MULTIRESGRID_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_MULTIRESGRID_HAS_BEEN_INCLUDED
+
+#include <openvdb/Grid.h>
+#include <openvdb/math/FiniteDifference.h>
+#include <openvdb/math/Math.h>
+#include <openvdb/math/Operators.h>
+#include <openvdb/math/Stencils.h>
+#include <openvdb/metadata/StringMetadata.h>
+#include <openvdb/tools/Interpolation.h>
+#include <openvdb/tools/Morphology.h>
+#include <openvdb/tools/Prune.h>
+#include <openvdb/tools/SignedFloodFill.h>
+#include <openvdb/tools/ValueTransformer.h>
+#include <openvdb/tree/LeafManager.h>
+#include <openvdb/tree/NodeManager.h>
+
+#include <tbb/enumerable_thread_specific.h>
+#include <tbb/task_scheduler_init.h>
+#include <tbb/tbb_thread.h>
+
+#include <boost/type_traits/is_floating_point.hpp>
+#include <boost/utility/enable_if.hpp>
+
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+template<typename TreeType>
+class MultiResGrid: public MetaMap
+{
+public:
+
+    typedef boost::shared_ptr<MultiResGrid>         Ptr;
+    typedef boost::shared_ptr<const MultiResGrid>   ConstPtr;
+
+    typedef typename TreeType::ValueType            ValueType;
+    typedef typename TreeType::ValueOnCIter         ValueOnCIter;
+    typedef typename TreeType::ValueOnIter          ValueOnIter;
+    typedef typename TreeType::Ptr                  TreePtr;
+    typedef typename TreeType::ConstPtr             ConstTreePtr;
+    typedef typename Grid<TreeType>::Ptr            GridPtr;
+    typedef typename Grid<TreeType>::ConstPtr       ConstGridPtr;
+
+    //////////////////////////////////////////////////////////////////////
+
+    /// @brief Constructor of empty grids
+    /// @param levels The number of trees in this MultiResGrid
+    /// @param background Background value
+    /// @param voxelSize Size of a (uniform voxel). Defaults to one.
+    /// @note The multiple grids are all empty.
+    MultiResGrid(size_t levels, ValueType background, double voxelSize = 1.0);
+
+    /// @brief Given an initial high-resolution grid this constructor
+    /// generates all the coarser grids by means of restriction.
+    /// @param levels The number of trees in this MultiResGrid
+    /// @param grid High-resolution input grid
+    /// @param useInjection Use restriction by injection, vs
+    /// full-weighting. It defaults to false and should rarely be used.
+    /// @note This constructor will perform a deep copy of the input
+    /// grid and use it as the highest level grid.
+    MultiResGrid(size_t levels, const Grid<TreeType> &grid, bool useInjection = false);
+
+    /// @brief Given an initial high-resolution grid this constructor
+    /// generates all the coarser grids by means of restriction.
+    /// @param levels The number of trees in this MultiResGrid
+    /// @param grid High-resolution input grid
+    /// @param useInjection Use restriction by injection, vs
+    /// full-weighting. It defaults to false and should rarely be used.
+    /// @note This constructor will steal the tree of the input
+    /// grid and use it as the highest level grid. On output the input
+    /// grid is empty.
+    MultiResGrid(size_t levels, GridPtr grid, bool useInjection = false);
+
+    //////////////////////////////////////////////////////////////////////
+
+    /// @brief Return the number of levels, i.e. trees, in this MultiResGrid
+    /// @note level 0 is the finest level and numLevels()-1 is the coarsest
+    /// level.
+    size_t numLevels() const { return mTrees.size(); }
+
+    /// @brief Return the level of the finest grid (always 0)
+    static size_t finestLevel() { return 0; }
+
+    /// @brief Return the level of the coarsest grid, i.e. numLevels()-1
+    size_t coarsestLevel() const { return mTrees.size()-1; }
+
+    //////////////////////////////////////////////////////////////////////
+
+    /// @brief Return a reference to the tree at the specified level
+    /// @param level The level of the tree to be returned
+    /// @note Level 0 is by definition the finest tree.
+    TreeType& tree(size_t level);
+
+    /// @brief Return a const reference to the tree at the specified level
+    /// @param level The level of the tree to be returned
+    /// @note Level 0 is by definition the finest tree.
+    const TreeType& constTree(size_t level) const;
+
+    /// @brief Return a shared pointer to the tree at the specified level
+    /// @param level The level of the tree to be returned
+    /// @note Level 0 is by definition the finest tree.
+    TreePtr treePtr(size_t level);
+
+    /// @brief Return a const shared pointer to the tree at the specified level
+    /// @param level The level of the tree to be returned
+    /// @note Level 0 is by definition the finest tree.
+    ConstTreePtr constTreePtr(size_t level) const;
+
+    /// @brief Return a reference to the tree at the finest level
+    TreeType& finestTree() { return *mTrees.front(); }
+
+    /// @brief Return a const reference to the tree at the finest level
+    const TreeType& finestConstTree() const { return *mTrees.front(); }
+
+    /// @brief Return a shared pointer to the tree at the finest level
+    TreePtr finestTreePtr() { return mTrees.front(); }
+
+    /// @brief Return a const shared pointer to the tree at the finest level
+    ConstTreePtr finestConstTreePtr() const { return mTrees.front(); }
+
+    /// @brief Return a reference to the tree at the coarsest level
+    TreeType& coarsestTree() { return *mTrees.back(); }
+
+    /// @brief Return a const reference to the tree at the coarsest level
+    const TreeType& coarsestConstTree() const { return *mTrees.back(); }
+
+    /// @brief Return a shared pointer to the tree at the coarsest level
+    TreePtr coarsestTreePtr() { return mTrees.back(); }
+
+    /// @brief Return a const shared pointer to the tree at the coarsest level
+    ConstTreePtr coarsestConstTreePtr() const { return mTrees.back(); }
+
+    //////////////////////////////////////////////////////////////////////
+
+    /// @brief Return a shared pointer to the grid at the specified integer level
+    /// @param level Integer level of the grid to be returned
+    /// @note Level 0 is by definition the finest grid.
+    GridPtr grid(size_t level);
+
+    /// @brief Return a const shared pointer to the grid at the specified level
+    /// @param level The level of the grid to be returned
+    /// @note Level 0 is by definition the finest grid.
+    ConstGridPtr grid(size_t level) const;
+
+    /// @brief Return a shared pointer to a new grid at the specified
+    /// floating-point level.
+    /// @param level Floating-point level of the grid to be returned
+    /// @param grainSize Grain size for the multi-threading
+    /// @details Interpolation of the specified order is performed
+    /// between the bracketing integer levels.
+    /// @note Level 0 is by definition the finest grid.
+    template<Index Order>
+    GridPtr createGrid(float level, size_t grainSize = 1) const;
+
+    /// @brief Return a shared pointer to a vector of all the base
+    /// grids in this instance of the MultiResGrid.
+    /// @note This method is useful for I/O
+    GridPtrVecPtr grids();
+
+    /// @brief Return a const shared pointer to a vector of all the base
+    /// grids in this instance of the MultiResGrid.
+    /// @note This method is useful for I/O
+    GridCPtrVecPtr grids() const;
+
+    //////////////////////////////////////////////////////////////////////
+
+    //@{
+    /// @brief Return a reference to the finest grid's transform, which might be
+    /// shared with other grids.
+    /// @note Calling setTransform() on this grid invalidates all references
+    /// previously returned by this method.
+    /// @warning The transform is relative to the finest level (=0) grid!
+    math::Transform& transform() { return *mTransform; }
+    const math::Transform& transform() const { return *mTransform; }
+    const math::Transform& constTransform() const { return *mTransform; }
+    //@}
+
+    //////////////////////////////////////////////////////////////////////
+
+    //@{
+    /// @brief Return the floating-point index coordinate at out_level given
+    /// the index coordinate in_xyz at in_level.
+    static Vec3R xyz(const Coord& in_ijk, size_t in_level, size_t out_level);
+    static Vec3R xyz(const Vec3R& in_xyz, size_t in_level, size_t out_level);
+    static Vec3R xyz(const Vec3R& in_xyz, double in_level, double out_level);
+    //@}
+
+    //////////////////////////////////////////////////////////////////////
+
+
+
+    //@{
+    /// @brief Return the value at the specified  coordinate position using
+    /// interpolation of the specified order into the tree at the out_level.
+    ///
+    /// @details First in_ijk is mapped from index space at in_level to
+    /// out_level, and then a value is interpolated from the tree at out_level.
+    ///
+    /// @param in_ijk Index coordinate position relative to tree at in_level
+    /// @param in_level Integer level of the input coordinate in_ijk
+    /// @param out_level Integer level of the interpolated value
+    template<Index Order>
+    ValueType sampleValue(const Coord& in_ijk, size_t in_level, size_t out_level) const;
+    template<Index Order>
+    ValueType sampleValue(const Vec3R& in_ijk, size_t in_level, size_t out_level) const;
+    //@}
+
+    /// @brief Return the value at the specified integer coordinate position
+    /// and level using interpolation of the specified order.
+    /// @param ijk Integer coordinate position relative to the highest level (=0) grid
+    /// @param level Floating-point level from which to interpolate the value.
+    /// @brief Non-integer values of the level will use linear-interpolation
+    /// between the neighboring integer levels.
+    template<Index Order>
+    ValueType sampleValue(const Coord& ijk, double level) const;
+
+    /// @brief Return the value at the specified floating-point coordinate position
+    /// and level using interpolation of the specified order.
+    /// @param xyz Floating-point coordinate position relative to the highest level grid
+    /// @param level Floating-point level from which to interpolate
+    /// the value.
+    /// @brief Non-integer values of the level will use linear-interpolation
+    /// between the neighboring integer levels.
+    template<Index Order>
+    ValueType sampleValue(const Vec3R& xyz, double level) const;
+
+    //////////////////////////////////////////////////////////////////////
+
+    /// @brief Return the value at coordinate location in @a level tree
+    /// from the coarser tree at @a level+1 using trilinear interpolation
+    /// @param coords input coords relative to the fine tree at level
+    /// @param level The fine level to receive values from the coarser
+    /// level+1
+    /// @note Prolongation means interpolation from coarse -> fine
+    ValueType prolongateVoxel(const Coord& coords, const size_t level) const;
+
+
+    /// (coarse->fine) Populates all the active voxel values in a fine (@a level) tree
+    /// from the coarse (@a level+1) tree using linear interpolation
+    /// This transforms multiple values of the tree in parallel
+    void prolongateActiveVoxels(size_t destlevel, size_t grainSize = 1);
+
+    //////////////////////////////////////////////////////////////////////
+
+    /// Populate a coordinate location in @a level (coarse) tree
+    /// from the @a level-1 (fine) tree using trilinear interpolation
+    /// input coords are relative to the mTree[level] (coarse)
+    /// @note Restriction means remapping from fine -> coarse
+    ValueType restrictVoxel(Coord ijk, const size_t level, bool useInjection = false) const;
+
+    /// (fine->coarse) Populates all the active voxel values in the coarse (@a level) tree
+    /// from the fine (@a level-1) tree using trilinear interpolation.
+    /// For cell-centered data, this is equivalent to an average
+    /// For vertex-centered data this is equivalent to transferring the data
+    /// from the fine vertex directly above the coarse vertex.
+    /// This transforms multiple values of the tree in parallel
+    void restrictActiveVoxels(size_t destlevel, size_t grainSize = 1);
+
+    /// Output a human-readable description of this MultiResGrid
+    void print(std::ostream& = std::cout, int verboseLevel = 1) const;
+
+    /// @brief Return a string with the name of this MultiResGrid
+    std::string getName() const
+    {
+        if (Metadata::ConstPtr meta = (*this)[GridBase::META_GRID_NAME]) return meta->str();
+        return "";
+    }
+
+    /// @brief Set the name of this MultiResGrid
+    void setName(const std::string& name)
+    {
+        this->removeMeta(GridBase::META_GRID_NAME);
+        this->insertMeta(GridBase::META_GRID_NAME, StringMetadata(name));
+    }
+
+    /// Return the class of volumetric data (level set, fog volume, etc.) stored in this grid.
+    GridClass getGridClass() const
+    {
+        typename StringMetadata::ConstPtr s =
+            this->getMetadata<StringMetadata>(GridBase::META_GRID_CLASS);
+        return s ? GridBase::stringToGridClass(s->value()) : GRID_UNKNOWN;
+    }
+
+    /// Specify the class of volumetric data (level set, fog volume, etc.) stored in this grid.
+    void setGridClass(GridClass cls)
+    {
+        this->insertMeta(GridBase::META_GRID_CLASS, StringMetadata(GridBase::gridClassToString(cls)));
+    }
+
+    /// Remove the setting specifying the class of this grid's volumetric data.
+    void clearGridClass() { this->removeMeta(GridBase::META_GRID_CLASS); }
+
+private:
+
+    MultiResGrid(const MultiResGrid& other);//disallow copy construction
+    MultiResGrid& operator=(const MultiResGrid& other);//disallow copy assignment
+
+    // For optimal performance we disable registration of the ValueAccessor
+    typedef tree::ValueAccessor<TreeType, false>       Accessor;
+    typedef tree::ValueAccessor<const TreeType, false> ConstAccessor;
+
+    void topDownRestrict(bool useInjection);
+
+    inline void initMeta();
+
+    // Private struct that concurrently creates a mask of active voxel
+    // in a coarse tree from the active voxels in a fine tree
+    struct MaskOp;
+
+    /// Private struct that performs multi-threaded restriction
+    struct RestrictOp;
+
+    /// Private struct that performs multi-threaded prolongation
+    struct ProlongateOp;
+
+    // Private struct that performs multi-threaded computation of grids at fractional levels
+    template<Index Order>
+    struct FractionOp;
+
+    /// Private template struct that performs the actual multi-threading
+    template<typename OpType> struct CookOp;
+
+    // Array of shared pointer to trees, level 0 has the highest resolution.
+    std::vector<TreePtr> mTrees;
+    // Shared pointer to a transform associated with the finest level grid
+    typename math::Transform::Ptr mTransform;
+};// MultiResGrid
+
+/// Construct @a levels empty trees that all share the given background
+/// value, with a linear transform of the given voxel size.
+template<typename TreeType>
+MultiResGrid<TreeType>::
+MultiResGrid(size_t levels, ValueType background, double voxelSize)
+    : mTrees(levels)
+    , mTransform(math::Transform::createLinearTransform( voxelSize ))
+{
+    this->initMeta();
+    // Every slot of the pre-sized vector receives its own new tree.
+    for (size_t lvl = 0; lvl < levels; ++lvl) {
+        mTrees[lvl] = TreePtr(new TreeType(background));
+    }
+}
+
+/// Construct from a deep copy of @a grid: the copy becomes level 0 (with its
+/// active tiles voxelized) and topDownRestrict() is then run with
+/// @a useInjection. The metadata and transform of @a grid are copied as well.
+/// @throw ValueError if @a levels is zero, since level 0 must exist to hold
+/// the copy of the input tree.
+template<typename TreeType>
+MultiResGrid<TreeType>::
+MultiResGrid(size_t levels, const Grid<TreeType> &grid, bool useInjection)
+    : MetaMap(grid)
+    , mTrees(levels)
+    , mTransform( grid.transform().copy() )
+{
+    // mTrees[0] is accessed unconditionally below; with zero levels that
+    // would index an empty vector, so reject the request explicitly.
+    if (levels == 0) {
+        OPENVDB_THROW(ValueError, "MultiResGrid requires at least one level");
+    }
+    this->initMeta();
+    mTrees[0].reset( new TreeType( grid.tree() ) );// deep copy input tree
+    mTrees[0]->voxelizeActiveTiles();
+    this->topDownRestrict(useInjection);
+}
+
+/// Construct by taking over the tree of @a grid: that tree becomes level 0
+/// (with its active tiles voxelized), @a grid is given a new tree, and
+/// topDownRestrict() is then run with @a useInjection. The metadata and
+/// transform of @a grid are copied.
+/// @throw ValueError if @a levels is zero; in that case @a grid is left
+/// untouched.
+/// @note @a grid must not be null: it is dereferenced in the initializer list.
+template<typename TreeType>
+MultiResGrid<TreeType>::
+MultiResGrid(size_t levels, GridPtr grid, bool useInjection)
+    : MetaMap(*grid)
+    , mTrees(levels)
+    , mTransform( grid->transform().copy() )
+{
+    // mTrees[0] is accessed unconditionally below; with zero levels that
+    // would index an empty vector. Check before the input tree is stolen.
+    if (levels == 0) {
+        OPENVDB_THROW(ValueError, "MultiResGrid requires at least one level");
+    }
+    this->initMeta();
+    mTrees[0] = grid->treePtr();// steal tree from input grid
+    mTrees[0]->voxelizeActiveTiles();
+    grid->newTree();
+    this->topDownRestrict(useInjection);
+}
+
+/// Return a mutable reference to the tree stored at @a level.
+/// The level is only range-checked by an assert.
+template<typename TreeType>
+inline TreeType& MultiResGrid<TreeType>::
+tree(size_t level)
+{
+    assert( level < this->numLevels() );
+    TreeType& levelTree = *mTrees[level];
+    return levelTree;
+}
+
+/// Return a read-only reference to the tree stored at @a level.
+/// The level is only range-checked by an assert.
+template<typename TreeType>
+inline const TreeType& MultiResGrid<TreeType>::
+constTree(size_t level) const
+{
+    assert( level < this->numLevels() );
+    const TreeType& levelTree = *mTrees[level];
+    return levelTree;
+}
+
+/// Return the shared pointer to the tree stored at @a level.
+/// The level is only range-checked by an assert.
+template<typename TreeType>
+inline typename TreeType::Ptr MultiResGrid<TreeType>::
+treePtr(size_t level)
+{
+    assert( level < this->numLevels() );
+    return mTrees[level];
+}
+
+/// Return a shared pointer-to-const for the tree stored at @a level.
+/// The level is only range-checked by an assert.
+template<typename TreeType>
+inline typename TreeType::ConstPtr MultiResGrid<TreeType>::
+constTreePtr(size_t level) const
+{
+    assert( level < this->numLevels() );
+    return mTrees[level];
+}
+
+/// Wrap the tree at @a level in a new Grid. The grid gets a copy of this
+/// object's transform (pre-scaled by 2^level for level > 0), a copy of its
+/// metadata, a "MultiResGrid_Level" entry and the name "<name>_level_<level>".
+template<typename TreeType>
+typename Grid<TreeType>::Ptr MultiResGrid<TreeType>::
+grid(size_t level)
+{
+    typedef Grid<TreeType> GridT;
+    typename GridT::Ptr levelGrid = GridT::create(this->treePtr(level));
+
+    // Transform: copy of the finest-level transform, scaled for coarser levels.
+    math::Transform::Ptr levelXform = mTransform->copy();
+    if (level != 0) levelXform->preScale( Real(1 << level) );
+    levelGrid->setTransform( levelXform );
+
+    // Metadata: inherited entries first, then the level marker.
+    levelGrid->insertMeta( *this->copyMeta() );
+    levelGrid->insertMeta( "MultiResGrid_Level", Int64Metadata(level));
+
+    // The name is set last so that it replaces the inherited one.
+    std::stringstream label;
+    label << this->getName() << "_level_" << level;
+    levelGrid->setName( label.str() );
+
+    return levelGrid;
+}
+
+/// Const overload: forwards to the non-const grid(level) through a
+/// const_cast and returns the result as a pointer-to-const grid.
+template<typename TreeType>
+inline typename Grid<TreeType>::ConstPtr MultiResGrid<TreeType>::
+grid(size_t level) const
+{
+    MultiResGrid* self = const_cast<MultiResGrid*>(this);
+    return self->grid(level);
+}
+
+/// Build a new grid for the floating-point @a level. When @a level is a
+/// whole number the corresponding tree is copied; otherwise the tree is
+/// filled in by FractionOp<Order>, and for level sets it is additionally
+/// flood-filled and pruned.
+template<typename TreeType>
+template<Index Order>
+typename Grid<TreeType>::Ptr MultiResGrid<TreeType>::
+createGrid(float level, size_t grainSize) const
+{
+    assert( level >= 0.0f && level <= float(mTrees.size()-1) );
+
+    typedef Grid<TreeType> GridT;
+    typename GridT::Ptr result(new GridT(this->constTree(0).background()));
+
+    // Transform: copy of the finest-level transform, scaled by 2^level.
+    math::Transform::Ptr scaledXform = mTransform->copy();
+    scaledXform->preScale( math::Pow(2.0f, level) );
+    result->setTransform( scaledXform );
+
+    // Metadata: inherited entries first, then the level marker.
+    result->insertMeta( *(this->copyMeta()) );
+    result->insertMeta( "MultiResGrid_Level", FloatMetadata(level) );
+
+    // The name is set after the metadata copy so that it replaces the inherited one.
+    std::stringstream label;
+    label << this->getName() << "_level_" << level;
+    result->setName( label.str() );
+
+    const size_t lowerLevel = size_t(floorf(level));
+    const size_t upperLevel = size_t(ceilf(level));
+    if ( lowerLevel != upperLevel ) {
+        // Fractional level: constructing the operator fills the new tree.
+        FractionOp<Order> op(*this, result->tree(), level, grainSize);
+        if ( result->getGridClass() == GRID_LEVEL_SET ) {
+            signedFloodFill( result->tree() );
+            pruneLevelSet( result->tree() );//only creates inactive tiles
+        }
+    } else {
+        // Whole-number level: a copy of the existing tree is all that is needed.
+        result->setTree( this->constTree(lowerLevel).copy() );
+    }
+
+    return result;
+}
+
+template<typename TreeType>
+GridPtrVecPtr MultiResGrid<TreeType>::
+grids()
+{// Collect one grid per level, ordered from level 0 upward
+    GridPtrVecPtr result( new GridPtrVec );
+    for (size_t n = 0; n != mTrees.size(); ++n) result->push_back( this->grid(n) );
+    return result;
+}
+
+template<typename TreeType>
+GridCPtrVecPtr MultiResGrid<TreeType>::
+grids() const
+{// Collect one const grid per level, ordered from level 0 upward
+    GridCPtrVecPtr result( new GridCPtrVec );
+    for (size_t n = 0; n != mTrees.size(); ++n) result->push_back( this->grid(n) );
+    return result;
+}
+
+template<typename TreeType>
+Vec3R MultiResGrid<TreeType>::
+xyz(const Coord& in_ijk, size_t in_level, size_t out_level)
+{// Rescale index coords by 2^in_level / 2^out_level; size_t shifts avoid int overflow at >= 31
+    return Vec3R( in_ijk.data() ) * Real(size_t(1) << in_level) / Real(size_t(1) << out_level);
+}
+
+template<typename TreeType>
+Vec3R MultiResGrid<TreeType>::
+xyz(const Vec3R& in_xyz, size_t in_level, size_t out_level)
+{// Rescale coordinates by 2^in_level / 2^out_level; size_t shifts avoid int overflow at >= 31
+    return in_xyz * Real(size_t(1) << in_level) / Real(size_t(1) << out_level);
+}
+
+template<typename TreeType>
+Vec3R MultiResGrid<TreeType>::
+xyz(const Vec3R& in_xyz, double in_level, double out_level)
+{
+    const double scale = math::Pow(2.0, in_level - out_level);// 2^(in_level - out_level)
+    return in_xyz * scale;
+}
+
+template<typename TreeType>
+template<Index Order>
+typename TreeType::ValueType MultiResGrid<TreeType>::
+sampleValue(const Coord& in_ijk, size_t in_level, size_t out_level) const
+{// Sample the out_level tree via tools::Sampler<Order> at in_ijk mapped from in_level
+    assert( in_level  >= 0 && in_level  < mTrees.size() );// ">= 0" always holds for size_t
+    assert( out_level >= 0 && out_level < mTrees.size() );
+    const ConstAccessor acc(*mTrees[out_level]);// has disabled registration!
+    return tools::Sampler<Order>::sample( acc, this->xyz(in_ijk, in_level, out_level) );
+}
+
+template<typename TreeType>
+template<Index Order>
+typename TreeType::ValueType MultiResGrid<TreeType>::
+sampleValue(const Vec3R& in_xyz, size_t in_level, size_t out_level) const
+{// Sample the out_level tree via tools::Sampler<Order> at in_xyz mapped from in_level
+    assert( in_level  >= 0 && in_level  < mTrees.size() );// ">= 0" always holds for size_t
+    assert( out_level >= 0 && out_level < mTrees.size() );
+    const ConstAccessor acc(*mTrees[out_level]);// has disabled registration!
+    return tools::Sampler<Order>::sample( acc, this->xyz(in_xyz, in_level, out_level) );
+}
+
+template<typename TreeType>
+template<Index Order>
+typename TreeType::ValueType MultiResGrid<TreeType>::
+sampleValue(const Coord& ijk, double level) const
+{// Linearly blend samples from the two integer levels that bracket the fractional @a level
+    assert( level >= 0.0 && level <= double(mTrees.size()-1) );
+    const size_t level0 = size_t(floor(level)), level1 = size_t(ceil(level));
+    const ValueType v0 = this->template sampleValue<Order>( ijk, 0, level0 );// ijk is at level 0
+    if ( level0 == level1 ) return v0;// integer level: nothing to blend
+    assert( level1 - level0 == 1 );
+    const ValueType v1 = this->template sampleValue<Order>( ijk, 0, level1 );
+    const ValueType a = ValueType(level1 - level);// weight of the level0 sample
+    return a * v0 + (ValueType(1) - a) * v1;
+}
+
+template<typename TreeType>
+template<Index Order>
+typename TreeType::ValueType MultiResGrid<TreeType>::
+sampleValue(const Vec3R& xyz, double level) const
+{// Linearly blend samples from the two integer levels that bracket the fractional @a level
+    assert( level >= 0.0 && level <= double(mTrees.size()-1) );
+    const size_t level0 = size_t(floor(level)), level1 = size_t(ceil(level));
+    const ValueType v0 = this->template sampleValue<Order>( xyz, 0, level0 );// xyz is at level 0
+    if ( level0 == level1 ) return v0;// integer level: nothing to blend
+    assert( level1 - level0 == 1 );
+    const ValueType v1 = this->template sampleValue<Order>( xyz, 0, level1 );
+    const ValueType a = ValueType(level1 - level);// weight of the level0 sample
+    return a * v0 + (ValueType(1) - a) * v1;
+}
+
+template<typename TreeType>
+typename TreeType::ValueType MultiResGrid<TreeType>::
+prolongateVoxel(const Coord& ijk, const size_t level) const
+{// Interpolate the value at ijk (index space of @a level) from the tree at level + 1
+    assert( level+1 < mTrees.size() );
+    const ConstAccessor acc(*mTrees[level + 1]);// has disabled registration!
+    return ProlongateOp::run(ijk, acc);
+}
+
+template<typename TreeType>
+void MultiResGrid<TreeType>::
+prolongateActiveVoxels(size_t destlevel, size_t grainSize)
+{// Overwrite the active voxels of tree destlevel with values prolongated from destlevel + 1
+    assert( destlevel < mTrees.size()-1 );
+    TreeType &fineTree = *mTrees[ destlevel ];
+    const TreeType &coarseTree = *mTrees[ destlevel+1 ];
+    CookOp<ProlongateOp> tmp( coarseTree, fineTree, grainSize );// all work runs in the constructor
+}
+
+template<typename TreeType>
+typename TreeType::ValueType MultiResGrid<TreeType>::
+restrictVoxel(Coord ijk, const size_t destlevel, bool useInjection) const
+{// Compute the value at ijk (index space of destlevel) from the tree at destlevel - 1
+    assert( destlevel > 0 && destlevel < mTrees.size() );
+    const TreeType &fineTree = *mTrees[ destlevel-1 ];
+    if ( useInjection ) return fineTree.getValue(ijk<<1);// injection: read the voxel at 2*ijk
+    const ConstAccessor acc( fineTree );// has disabled registration!
+    return RestrictOp::run( ijk, acc);
+}
+
+template<typename TreeType>
+void MultiResGrid<TreeType>::
+restrictActiveVoxels(size_t destlevel, size_t grainSize)
+{// Overwrite the active voxels of tree destlevel with values restricted from destlevel - 1
+    assert( destlevel > 0 && destlevel < mTrees.size() );
+    const TreeType &fineTree = *mTrees[ destlevel-1 ];
+    TreeType &coarseTree = *mTrees[ destlevel ];
+    CookOp<RestrictOp> tmp( fineTree, coarseTree, grainSize );// all work runs in the constructor
+}
+
+template<typename TreeType>
+void MultiResGrid<TreeType>::
+print(std::ostream& os, int verboseLevel) const
+{// Write the level count, each level's tree, any metadata, and the transform to @a os
+    os << "MultiResGrid with " << mTrees.size() << " levels\n";
+    for (size_t i=0; i<mTrees.size(); ++i) {
+        os << "Level " << i << ": ";
+        mTrees[i]->print(os, verboseLevel);// verboseLevel is passed through to the tree
+    }
+
+    if ( MetaMap::metaCount() > 0) {
+        os << "Additional metadata:" << std::endl;
+        for (ConstMetaIterator it = beginMeta(), end = endMeta(); it != end; ++it) {
+            os << "  " << it->first;
+            if (it->second) {// entries with a null value print their name only
+                const std::string value = it->second->str();
+                if (!value.empty()) os << ": " << value;
+            }
+            os << "\n";
+        }
+    }
+
+    os << "Transform:" << std::endl;
+    transform().print(os, /*indent=*/"  ");
+    os << std::endl;
+}
+
+template<typename TreeType>
+void MultiResGrid<TreeType>::
+initMeta()
+{// Record the level count as "MultiResGrid_Levels" metadata; throws ValueError below 2 levels
+    const size_t levels = this->numLevels();
+    if (levels < 2) {
+        OPENVDB_THROW(ValueError, "MultiResGrid: at least two levels are required");
+    }
+    this->insertMeta("MultiResGrid_Levels", Int64Metadata( levels ) );
+}
+
+template<typename TreeType>
+void MultiResGrid<TreeType>::
+topDownRestrict(bool useInjection)
+{// Rebuild every tree above level 0 by restricting the tree one level below it
+    const bool isLevelSet = this->getGridClass() == GRID_LEVEL_SET;
+    for (size_t n=1; n<mTrees.size(); ++n) {
+        const TreeType &fineTree = *mTrees[n-1];
+        mTrees[n] = TreePtr(new TreeType( fineTree.background() ) );// empty tree
+        TreeType &coarseTree = *mTrees[n];
+        if (useInjection) {// Restriction by injection
+            for (ValueOnCIter it = fineTree.cbeginValueOn(); it; ++it) {
+                const Coord ijk = it.getCoord();
+                if ( (ijk[0] & 1) || (ijk[1] & 1) || (ijk[2] & 1) ) continue;// skip odd coords
+                coarseTree.setValue( ijk >> 1, *it );
+            }
+        } else {// Restriction by full-weighting
+            MaskOp tmp(fineTree, coarseTree, 128);// first activate the coarse topology
+            this->restrictActiveVoxels(n, 64);// then compute a value for each active voxel
+        }
+        if ( isLevelSet ) {
+            tools::signedFloodFill( coarseTree );
+            tools::pruneLevelSet( coarseTree );//only creates inactive tiles
+        }
+    }// loop over grid levels
+}
+
+template<typename TreeType>
+struct MultiResGrid<TreeType>::MaskOp
+{// Activates coarse-tree voxels from the dilated fine-tree topology; runs in its constructor
+    typedef typename TreeType::template ValueConverter<ValueMask>::Type MaskT;
+    typedef tbb::enumerable_thread_specific<TreeType> PoolType;
+    typedef tree::LeafManager<const MaskT>  ManagerT;
+    typedef typename ManagerT::LeafRange RangeT;
+    typedef typename ManagerT::LeafNodeType::ValueOnCIter VoxelIterT;
+
+    MaskOp(const TreeType& fineTree, TreeType& coarseTree, size_t grainSize = 1)
+        : mPool(new PoolType( coarseTree ) )// empty coarse tree acts as exemplar
+    {
+        assert( coarseTree.empty() );
+
+        // Create mask of the fine tree on which the restriction is performed
+        MaskT mask(fineTree, false, true, TopologyCopy() );
+
+        // Multi-threaded dilation which also linearizes the tree to leaf nodes
+        tools::dilateActiveValues(mask, 1, NN_FACE_EDGE_VERTEX, EXPAND_TILES);
+
+        // Restriction by injection using thread-local storage of coarse tree masks
+        ManagerT leafs( mask );
+        tbb::parallel_for(leafs.leafRange( grainSize ), *this);
+
+        // union (serial loop over the pool) of thread-local coarse tree masks with the coarse tree
+        typedef typename PoolType::const_iterator IterT;
+        for (IterT it=mPool->begin(); it!=mPool->end(); ++it) coarseTree.topologyUnion( *it );
+        delete mPool;// mPool dangles from here on; it is not used again in this constructor
+    }
+    void operator()(const RangeT& range) const
+    {
+        Accessor coarseAcc( mPool->local() );// disabled registration
+        for (typename RangeT::Iterator leafIter = range.begin(); leafIter; ++leafIter) {
+            for (VoxelIterT voxelIter = leafIter->cbeginValueOn(); voxelIter; ++voxelIter) {
+                Coord ijk = voxelIter.getCoord();
+                if ( (ijk[2] & 1) || (ijk[1] & 1) || (ijk[0] & 1) ) continue;//no overlap
+                coarseAcc.setValueOn( ijk >> 1 );//injection from fine to coarse level
+            }//loop over active voxels in the fine tree
+        }// loop over leaf nodes in the fine tree
+    }
+    PoolType* mPool;// owned raw pointer: allocated and deleted inside the constructor
+};// MaskOp
+
+template<typename TreeType>
+template<Index Order>
+struct MultiResGrid<TreeType>::FractionOp
+{// Fills an empty mid-level tree for a fractional level; all work runs in the constructor
+    typedef typename TreeType::template ValueConverter<ValueMask>::Type MaskT;
+    typedef tbb::enumerable_thread_specific<MaskT>        PoolType;
+    typedef typename PoolType::iterator                   PoolIterT;
+    typedef tree::LeafManager<const TreeType>             Manager1;
+    typedef tree::LeafManager<TreeType>                   Manager2;
+    typedef typename Manager1::LeafRange                  Range1;
+    typedef typename Manager2::LeafRange                  Range2;
+
+    FractionOp(const MultiResGrid& parent,
+               TreeType& midTree,
+               float level,
+               size_t grainSize = 1)
+        : mLevel( level )
+        , mPool( NULL )
+        , mTree0( &*(parent.mTrees[size_t(floorf(level))]) )//high-resolution
+        , mTree1( &*(parent.mTrees[size_t(ceilf(level))]) ) //low-resolution
+    {
+        assert( midTree.empty() );
+        assert( mTree0 != mTree1 );// i.e. level must not be an integer
+
+        // Create a pool of thread-local masks
+        MaskT examplar( false );
+        mPool = new PoolType( examplar );
+
+        {// create mask from re-mapping coarse tree to mid-level tree
+            tree::LeafManager<const TreeType> manager( *mTree1 );
+            tbb::parallel_for( manager.leafRange(grainSize), *this );
+        }
+
+        // Multi-threaded dilation of mask
+        tbb::parallel_for(tbb::blocked_range<PoolIterT>(mPool->begin(),mPool->end(),1), *this);
+
+        // Union thread-local masks into the mid-level tree
+        for (PoolIterT it=mPool->begin(); it!=mPool->end(); ++it) midTree.topologyUnion( *it );
+        delete mPool;// the pool is not touched by the interpolation pass below
+
+        {// Interpolate values into the static mid level tree
+            Manager2 manager( midTree );
+            tbb::parallel_for(manager.leafRange(grainSize), *this);
+        }
+    }
+    void operator()(const Range1& range) const
+    {
+        typedef typename Manager1::LeafNodeType::ValueOnCIter VoxelIter;
+        // Let mLevel = level + frac, where
+        // level is integer part of mLevel and frac is the fractional part
+        // low-res voxel size in world units = dx1 = 2^(level + 1)
+        // mid-res voxel size in world units = dx  = 2^(mLevel) = 2^(level + frac)
+        // low-res index -> world: ijk * dx1
+        // world -> mid-res index: world / dx
+        // low-res index -> mid-res index: (ijk * dx1) / dx = ijk * scale where
+        // scale = dx1/dx = 2^(level+1)/2^(level+frac) = 2^(1-frac)
+        const float scale = math::Pow(2.0f, 1.0f - math::FractionalPart(mLevel));
+        tree::ValueAccessor<MaskT, false>  acc( mPool->local() );// disabled registration
+        for (typename Range1::Iterator leafIter = range.begin(); leafIter; ++leafIter) {
+            for (VoxelIter voxelIter = leafIter->cbeginValueOn(); voxelIter; ++voxelIter) {
+                Coord ijk = voxelIter.getCoord();
+                ijk[0] = int(math::Round(ijk[0] * scale));
+                ijk[1] = int(math::Round(ijk[1] * scale));
+                ijk[2] = int(math::Round(ijk[2] * scale));
+                acc.setValueOn( ijk );
+            }//loop over active voxels in the low-resolution (coarse) tree
+        }// loop over leaf nodes in the low-resolution (coarse) tree
+    }
+    void operator()(const tbb::blocked_range<PoolIterT>& range) const
+    {
+        for (PoolIterT it=range.begin(); it!=range.end(); ++it) {
+            tools::dilateVoxels( *it, 1, NN_FACE_EDGE_VERTEX);// dilate each thread-local mask once
+        }
+    }
+    void operator()(const Range2 &r) const
+    {
+        typedef typename TreeType::LeafNodeType::ValueOnIter VoxelIter;
+        // Let mLevel = level + frac, where
+        // level is integer part of mLevel and frac is the fractional part
+        // high-res voxel size in world units = dx0 = 2^(level)
+        // low-res voxel size in world units = dx1 = 2^(level+1)
+        // mid-res voxel size in world units = dx  = 2^(mLevel) = 2^(level + frac)
+        // mid-res index -> world: ijk * dx
+        // world -> high-res index: world / dx0
+        // world -> low-res index: world / dx1
+        // mid-res index -> high-res index: (ijk * dx) / dx0 = ijk * scale0 where
+        // scale0 = dx/dx0 = 2^(level+frac)/2^(level) = 2^(frac)
+        // mid-res index -> low-res index: (ijk * dx) / dx1 = ijk * scale1 where
+        // scale1 = dx/dx1 = 2^(level+frac)/2^(level+1) = 2^(frac-1)
+        const float b = math::FractionalPart(mLevel), a = 1.0f - b;
+        const float scale0 = math::Pow( 2.0f, b );
+        const float scale1 = math::Pow( 2.0f,-a );
+        ConstAccessor acc0( *mTree0 ), acc1( *mTree1 );
+        for (typename Range2::Iterator leafIter = r.begin(); leafIter; ++leafIter) {
+            for (VoxelIter voxelIter = leafIter->beginValueOn(); voxelIter; ++voxelIter) {
+                const Vec3R xyz =  Vec3R( voxelIter.getCoord().data() );// mid level coord
+                const ValueType v0 = tools::Sampler<Order>::sample( acc0, xyz * scale0 );
+                const ValueType v1 = tools::Sampler<Order>::sample( acc1, xyz * scale1 );
+                voxelIter.setValue( ValueType(a*v0 + b*v1) );// blend: a weights high-res, b low-res
+            }
+        }
+    }
+    const float mLevel;
+    PoolType* mPool;// owned raw pointer: allocated and deleted inside the constructor
+    const TreeType *mTree0, *mTree1;
+};// FractionOp
+
+template<typename TreeType>
+template<typename OperatorType>
+struct MultiResGrid<TreeType>::CookOp
+{// Sets each active voxel of dstTree to OperatorType::run(coord, accessor into srcTree)
+  typedef tree::LeafManager<TreeType>  ManagerT;
+  typedef typename ManagerT::LeafRange RangeT;
+  CookOp(const TreeType& srcTree, TreeType& dstTree, size_t grainSize) : acc( srcTree )
+  {
+      ManagerT leafs( dstTree );
+      tbb::parallel_for( leafs.leafRange( grainSize ), *this );// work runs in the constructor
+  }
+  CookOp(const CookOp &other) : acc( other.acc.tree() ) {}// each copy gets a fresh accessor
+  void operator()(const RangeT& range) const
+  {
+      typedef typename RangeT::Iterator LeafIterT;
+      typedef typename ManagerT::LeafNodeType::ValueOnIter VoxelIterT;
+      for (LeafIterT leaf = range.begin(); leaf; ++leaf) {
+          ValueType* phi = leaf.buffer(0).data();// avoids small overhead of out-of-core
+          for (VoxelIterT voxel = leaf->beginValueOn(); voxel; ++voxel) {
+              phi[ voxel.pos() ] = OperatorType::run(voxel.getCoord(), acc);
+          }
+      }
+  }
+  const ConstAccessor acc;// accessor into the source tree
+};// CookOp
+
+template<typename TreeType>
+struct MultiResGrid<TreeType>::RestrictOp
+{
+    /// @brief Static method that performs restriction by full weighting
+    /// @param ijk Coordinate location on the coarse tree
+    /// @param acc ValueAccessor to the fine tree
+    static ValueType run(Coord ijk, const ConstAccessor &acc)
+    {
+        ijk <<= 1;// coarse coordinate -> overlapping fine coordinate
+        // Overlapping grid point
+        ValueType v = 8*acc.getValue(ijk);
+        // neighbors in one axial direction
+        v += 4*(acc.getValue(ijk.offsetBy(-1, 0, 0)) + acc.getValue(ijk.offsetBy( 1, 0, 0)) +// x
+                acc.getValue(ijk.offsetBy( 0,-1, 0)) + acc.getValue(ijk.offsetBy( 0, 1, 0)) +// y
+                acc.getValue(ijk.offsetBy( 0, 0,-1)) + acc.getValue(ijk.offsetBy( 0, 0, 1)));// z
+        // neighbors in two axial directions
+        v += 2*(acc.getValue(ijk.offsetBy(-1,-1, 0)) + acc.getValue(ijk.offsetBy(-1, 1, 0)) +// xy
+                acc.getValue(ijk.offsetBy( 1,-1, 0)) + acc.getValue(ijk.offsetBy( 1, 1, 0)) +// xy
+                acc.getValue(ijk.offsetBy(-1, 0,-1)) + acc.getValue(ijk.offsetBy(-1, 0, 1)) +// xz
+                acc.getValue(ijk.offsetBy( 1, 0,-1)) + acc.getValue(ijk.offsetBy( 1, 0, 1)) +// xz
+                acc.getValue(ijk.offsetBy( 0,-1,-1)) + acc.getValue(ijk.offsetBy( 0,-1, 1)) +// yz
+                acc.getValue(ijk.offsetBy( 0, 1,-1)) + acc.getValue(ijk.offsetBy( 0, 1, 1)));// yz
+        // neighbors in three axial directions
+        for (int i=-1; i<=1; i+=2) {
+            for (int j=-1; j<=1; j+=2) {
+                for (int k=-1; k<=1; k+=2) v += acc.getValue(ijk.offsetBy(i,j,k));// xyz
+            }
+        }
+        v *= ValueType(1.0f/64.0f);// the weights sum to 64: 8 + 6*4 + 12*2 + 8*1
+        return v;
+    }
+};// RestrictOp
+
+template<typename TreeType>
+struct MultiResGrid<TreeType>::ProlongateOp
+{
+    /// @brief Interpolate values from a coarse grid (acc) into the index space (ijk) of a fine grid
+    /// @param ijk Coordinate location on the fine tree
+    /// @param acc ValueAccessor to the coarse tree
+    static ValueType run(const Coord& ijk, const ConstAccessor &acc)
+    {
+        switch ( (ijk[0] & 1) | ((ijk[1] & 1) << 1) | ((ijk[2] & 1) << 2) ) {// odd-axis bit mask
+        case 0:// all even
+            return acc.getValue(ijk>>1);
+        case 1:// x is odd
+            return ValueType(0.5)*(acc.getValue(ijk.offsetBy(-1,0,0)>>1) +
+                                   acc.getValue(ijk.offsetBy( 1,0,0)>>1));
+        case 2:// y is odd
+            return ValueType(0.5)*(acc.getValue(ijk.offsetBy(0,-1,0)>>1) +
+                                   acc.getValue(ijk.offsetBy(0, 1,0)>>1));
+        case 3:// x&y are odd
+            return ValueType(0.25)*(acc.getValue(ijk.offsetBy(-1,-1,0)>>1) +
+                                    acc.getValue(ijk.offsetBy(-1, 1,0)>>1) +
+                                    acc.getValue(ijk.offsetBy( 1,-1,0)>>1) +
+                                    acc.getValue(ijk.offsetBy( 1, 1,0)>>1));
+        case 4:// z is odd
+            return ValueType(0.5)*(acc.getValue(ijk.offsetBy(0,0,-1)>>1) +
+                                   acc.getValue(ijk.offsetBy(0,0, 1)>>1));
+        case 5:// x&z are odd
+            return ValueType(0.25)*(acc.getValue(ijk.offsetBy(-1,0,-1)>>1) +
+                                    acc.getValue(ijk.offsetBy(-1,0, 1)>>1) +
+                                    acc.getValue(ijk.offsetBy( 1,0,-1)>>1) +
+                                    acc.getValue(ijk.offsetBy( 1,0, 1)>>1));
+        case 6:// y&z are odd
+            return ValueType(0.25)*(acc.getValue(ijk.offsetBy(0,-1,-1)>>1) +
+                                    acc.getValue(ijk.offsetBy(0,-1, 1)>>1) +
+                                    acc.getValue(ijk.offsetBy(0, 1,-1)>>1) +
+                                    acc.getValue(ijk.offsetBy(0, 1, 1)>>1));
+        }
+        // all are odd (mask value 7): average the eight diagonal neighbors
+        ValueType v = zeroVal<ValueType>();
+        for (int i=-1; i<=1; i+=2) {
+            for (int j=-1; j<=1; j+=2) {
+                for (int k=-1; k<=1; k+=2) v += acc.getValue(ijk.offsetBy(i,j,k)>>1);// xyz
+            }
+        }
+        return ValueType(0.125) * v;
+    }
+};// ProlongateOp
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_MULTIRESGRID_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/ParticleAtlas.h b/nuparu/include/openvdb_new/tools/ParticleAtlas.h
new file mode 100644
index 00000000..d98e0b54
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/ParticleAtlas.h
@@ -0,0 +1,1052 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file     ParticleAtlas.h
+///
+/// @brief    Space-partitioning acceleration structure for particles, points with
+///           radius. Partitions particle indices into voxels to accelerate range
+///           and nearest neighbor searches.
+///
+/// @note     This acceleration structure only stores integer offsets into an external particle
+///           data structure that conforms to the ParticleArray interface. 
+///
+/// @details  Constructs and maintains a sequence of @c PointIndexGrids each of progressively
+///           lower resolution. Particles are uniquely assigned to a particular resolution
+///           level based on their radius. This strategy has proven efficient for accelerating
+///           spatial queries on particle data sets with varying radii.
+///
+/// @details  The data structure automatically detects and adapts to particle data sets with
+///           uniform radii. The construction is simplified and spatial queries pre-cache the
+///           uniform particle radius to avoid redundant access calls to the
+///           ParticleArray::getRadius method.
+///
+/// @author   Mihai Alden
+
+
+#ifndef OPENVDB_TOOLS_PARTICLE_ATLAS_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_PARTICLE_ATLAS_HAS_BEEN_INCLUDED
+
+#include "PointIndexGrid.h"
+
+#include <openvdb/Grid.h>
+#include <openvdb/Types.h>
+#include <openvdb/math/Transform.h>
+#include <openvdb/tree/Tree.h>
+#include <openvdb/tree/LeafNode.h>
+
+#include <boost/scoped_array.hpp>
+#include <tbb/blocked_range.h>
+#include <tbb/parallel_for.h>
+#include <tbb/parallel_reduce.h>
+#include <deque>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+
+////////////////////////////////////////
+
+
+/// @brief  Partitions particles and performs range and nearest-neighbor searches.
+///
+/// @interface ParticleArray
+/// Expected interface for the ParticleArray container:
+/// @code
+/// template<typename VectorType>
+/// struct ParticleArray
+/// {
+///     // The type used to represent world-space positions
+///     typedef VectorType                      PosType;
+///     typedef typename PosType::value_type    ScalarType;
+///
+///     // Return the number of particles in the array
+///     size_t size() const;
+///
+///     // Return the world-space position for the nth particle.
+///     void getPos(size_t n, PosType& xyz) const;
+///
+///     // Return the world-space radius for the nth particle.
+///     void getRadius(size_t n, ScalarType& radius) const;
+/// };
+/// @endcode
+///
+/// @details    Constructs a collection of @c PointIndexGrids of different resolutions
+///             to accelerate spatial searches for particles with varying radius.
+template<typename PointIndexGridType = PointIndexGrid>
+struct ParticleAtlas
+{
+    typedef boost::shared_ptr<ParticleAtlas>            Ptr;
+    typedef boost::shared_ptr<const ParticleAtlas>      ConstPtr;
+
+    typedef typename PointIndexGridType::Ptr            PointIndexGridPtr;
+    typedef typename PointIndexGridType::ValueType      IndexType;
+
+    struct Iterator;// query iterator, defined after this struct
+
+    //////////
+
+    ParticleAtlas() : mIndexGridArray(), mMinRadiusArray(), mMaxRadiusArray() {}// empty atlas
+
+    /// @brief Partitions particle indices
+    ///
+    /// @param particles        container conforming to the ParticleArray interface
+    /// @param minVoxelSize     minimum voxel size limit
+    /// @param maxLevels        maximum number of resolution levels
+    template<typename ParticleArrayType>
+    void construct(const ParticleArrayType& particles, double minVoxelSize, size_t maxLevels = 50);
+
+    /// @brief Create a new @c ParticleAtlas from the given @a particles.
+    ///
+    /// @param particles        container conforming to the ParticleArray interface
+    /// @param minVoxelSize     minimum voxel size limit
+    /// @param maxLevels        maximum number of resolution levels
+    template<typename ParticleArrayType>
+    static Ptr create(const ParticleArrayType& particles, double minVoxelSize, size_t maxLevels = 50);
+
+    /// @brief Returns the number of resolution levels.
+    size_t levels() const { return mIndexGridArray.size(); }
+    /// @brief true if the container size is 0, false otherwise.
+    bool empty() const { return mIndexGridArray.empty(); }
+
+    /// @brief Returns minimum particle radius for level @a n (@a n is not range-checked).
+    double minRadius(size_t n) const { return mMinRadiusArray[n]; }
+    /// @brief Returns maximum particle radius for level @a n (@a n is not range-checked).
+    double maxRadius(size_t n) const { return mMaxRadiusArray[n]; }
+
+    /// @brief Returns the @c PointIndexGrid that represents the given level @a n.
+    PointIndexGridType& pointIndexGrid(size_t n) { return *mIndexGridArray[n]; }
+    /// @brief Returns the @c PointIndexGrid that represents the given level @a n.
+    const PointIndexGridType& pointIndexGrid(size_t n) const { return *mIndexGridArray[n]; }
+
+private:
+    // Disallow copying
+    ParticleAtlas(const ParticleAtlas&);
+    ParticleAtlas& operator=(const ParticleAtlas&);
+
+    std::vector<PointIndexGridPtr>  mIndexGridArray;// one grid per resolution level
+    std::vector<double> mMinRadiusArray, mMaxRadiusArray;// indexed by level, see min/maxRadius()
+}; // struct ParticleAtlas
+
+
+typedef ParticleAtlas<PointIndexGrid> ParticleIndexAtlas;
+
+
+////////////////////////////////////////
+
+
+/// @brief Provides accelerated range and nearest-neighbor searches for
+///        particles that are partitioned using the ParticleAtlas.
+///
+/// @note  Prefer to construct the iterator object once and reuse it
+///        for subsequent queries.
+template<typename PointIndexGridType>
+struct ParticleAtlas<PointIndexGridType>::Iterator
+{
+    typedef typename PointIndexGridType::TreeType   TreeType;
+    typedef tree::ValueAccessor<const TreeType>     ConstAccessor;
+    typedef boost::scoped_ptr<ConstAccessor>        ConstAccessorPtr;
+
+    /////
+
+    /// @brief Construct an iterator from the given @a atlas.
+    explicit Iterator(const ParticleAtlas& atlas);
+
+    /// @brief Clear the iterator and update it with the result of the given
+    ///        world-space radial query.
+    /// @param center    world-space center
+    /// @param radius    world-space search radius
+    /// @param particles container conforming to the ParticleArray interface
+    template<typename ParticleArrayType>
+    void worldSpaceSearchAndUpdate(const Vec3d& center, double radius, const ParticleArrayType& particles);
+
+    /// @brief Clear the iterator and update it with the result of the given
+    ///        world-space bounding-box query.
+    /// @param bbox      world-space bounding box
+    /// @param particles container conforming to the ParticleArray interface
+    template<typename ParticleArrayType>
+    void worldSpaceSearchAndUpdate(const BBoxd& bbox, const ParticleArrayType& particles);
+
+    /// @brief Returns the total number of resolution levels.
+    size_t levels() const { return mAtlas->levels(); }
+
+    /// @brief Clear the iterator and update it with all particles that reside
+    ///        at the given resolution @a level.
+    void updateFromLevel(size_t level);
+
+    /// Reset the iterator to point to the first item.
+    void reset();
+
+    /// Return a const reference to the item to which this iterator is pointing.
+    const IndexType& operator*() const { return *mRange.first; }
+
+    /// @{
+    /// @brief  Return @c true if this iterator is not yet exhausted, i.e. the current
+    ///         range still has items or further ranges remain in the range list.
+    bool test() const { return mRange.first < mRange.second || mIter != mRangeList.end(); }
+    operator bool() const { return this->test(); }
+    /// @}
+
+    /// Advance iterator to next item.
+    void increment();
+
+    /// Advance iterator to next item.
+    void operator++() { this->increment(); }
+
+    /// @brief Advance iterator to next item.
+    /// @return @c true if this iterator is not yet exhausted.
+    bool next();
+
+    /// Return the number of point indices in the iterator range.
+    size_t size() const;
+
+    /// Return @c true if both iterators point to the same element.
+    bool operator==(const Iterator& p) const { return mRange.first == p.mRange.first; }
+    bool operator!=(const Iterator& p) const { return !this->operator==(p); }
+
+private:
+    Iterator(const Iterator& rhs);// private: copying is not available to users
+    Iterator& operator=(const Iterator& rhs);
+
+    void clear();
+
+    typedef std::pair<const IndexType*, const IndexType*> Range;// [first, second) index pointers
+    typedef std::deque<Range>                             RangeDeque;
+    typedef typename RangeDeque::const_iterator           RangeDequeCIter;
+    typedef boost::scoped_array<IndexType>                IndexArray;
+
+    ParticleAtlas const * const mAtlas;// the atlas being queried; levels() forwards to it
+    boost::scoped_array<ConstAccessorPtr> mAccessorList;
+
+    // Primary index collection
+    Range           mRange;
+    RangeDeque      mRangeList;
+    RangeDequeCIter mIter;
+    // Secondary index collection
+    IndexArray      mIndexArray;
+    size_t          mIndexArraySize, mAccessorListSize;
+}; // struct ParticleAtlas::Iterator
+
+
+////////////////////////////////////////
+
+// Internal operators and implementation details
+
+
+namespace particle_atlas_internal {
+
+
+template<typename ParticleArrayT>
+struct ComputeExtremas {// reduction body (splitting constructor + join): min/max particle radius
+    typedef typename ParticleArrayT::PosType    PosType;
+    typedef typename PosType::value_type        ScalarType;
+
+    ComputeExtremas(const ParticleArrayT& particles)
+        : particleArray(&particles)
+        , minRadius(std::numeric_limits<ScalarType>::max())// start above any radius
+        , maxRadius(-std::numeric_limits<ScalarType>::max())// start below any radius
+    {
+    }
+
+    ComputeExtremas(ComputeExtremas& rhs, tbb::split)// split copy restarts with empty extrema
+        : particleArray(rhs.particleArray)
+        , minRadius(std::numeric_limits<ScalarType>::max())
+        , maxRadius(-std::numeric_limits<ScalarType>::max())
+    {
+    }
+
+    void operator()(const tbb::blocked_range<size_t>& range) {
+
+        ScalarType radius, tmpMin = minRadius, tmpMax = maxRadius;// radius is set by getRadius()
+
+        for (size_t n = range.begin(), N = range.end(); n != N; ++n) {
+            particleArray->getRadius(n, radius);
+            tmpMin = std::min(radius, tmpMin);
+            tmpMax = std::max(radius, tmpMax);
+        }
+
+        minRadius = std::min(minRadius, tmpMin);// fold this range's extrema into the members
+        maxRadius = std::max(maxRadius, tmpMax);
+    }
+
+    void join(const ComputeExtremas& rhs) {// merge the extrema found by another body
+        minRadius = std::min(minRadius, rhs.minRadius);
+        maxRadius = std::max(maxRadius, rhs.maxRadius);
+    }
+
+    ParticleArrayT const * const particleArray;// not owned
+    ScalarType minRadius, maxRadius;// the results, read after the reduction
+}; // struct ComputeExtremas
+
+
+/// @brief View of a particle array from which the large-radius particles can be split off
+/// into a new view (see split()). View-local indices map to indices of the wrapped array.
+template<typename ParticleArrayT, typename PointIndex>
+struct SplittableParticleArray
+{
+    typedef boost::shared_ptr<SplittableParticleArray>          Ptr;
+    typedef boost::shared_ptr<const SplittableParticleArray>    ConstPtr;
+    typedef ParticleArrayT                                      ParticleArray;
+
+    typedef typename ParticleArray::PosType                     PosType;
+    typedef typename PosType::value_type                        ScalarType;
+
+    /// Wrap every particle of @a particles and compute the radius extremas.
+    SplittableParticleArray(const ParticleArrayT& particles)
+        : mIndexMap(), mParticleArray(&particles), mSize(particles.size())
+    {
+        updateExtremas();
+    }
+
+    /// Wrap every particle of @a particles, using the caller-supplied radius extremas.
+    SplittableParticleArray(const ParticleArrayT& particles, double minR, double maxR)
+        : mIndexMap(), mParticleArray(&particles), mSize(particles.size())
+        , mMinRadius(ScalarType(minR)), mMaxRadius(ScalarType(maxR))
+    {}
+
+    /// The wrapped particle array.
+    const ParticleArrayT& particleArray() const { return *mParticleArray; }
+
+    /// Number of particles referenced by this view.
+    size_t size() const { return mSize; }
+
+    /// Position and radius of the @a n'th particle of this view.
+    void getPos(size_t n, PosType& xyz) const { mParticleArray->getPos(getGlobalIndex(n), xyz); }
+    void getRadius(size_t n, ScalarType& radius) const { mParticleArray->getRadius(getGlobalIndex(n), radius); }
+
+    ScalarType minRadius() const { return mMinRadius; }
+    ScalarType maxRadius() const { return mMaxRadius; }
+
+    /// Translate a view-local index into an index of the wrapped particle array.
+    size_t getGlobalIndex(size_t n) const {
+        return (mIndexMap.get() == NULL) ? n : size_t(mIndexMap[n]);
+    }
+
+    /// Move all particle indices that have a radius larger or equal to @a maxRadiusLimit
+    /// into a separate container.
+    /// @return the new container, or a null pointer if no particle was moved.
+    Ptr split(ScalarType maxRadiusLimit) {
+        if (mMaxRadius < maxRadiusLimit) return Ptr();
+
+        // Flag, in parallel, every particle that has to move to the new container.
+        boost::scoped_array<bool> moveMask(new bool[mSize]);
+        tbb::parallel_for(tbb::blocked_range<size_t>(0, mSize),
+            MaskParticles(*this, moveMask, maxRadiusLimit));
+
+        Ptr largeParticles(new SplittableParticleArray(*this, moveMask));
+        if (largeParticles->size() == 0) return Ptr();
+
+        // Every particle that was not moved stays in this view.
+        const size_t remaining = mSize - largeParticles->size();
+        boost::scoped_array<PointIndex> keptIndices(new PointIndex[remaining]);
+
+        // setIndexMap() iterates over the old mSize, so call it before mSize changes.
+        setIndexMap(keptIndices, moveMask, false);
+        mSize = remaining;
+        mIndexMap.swap(keptIndices);
+        updateExtremas();
+
+        return largeParticles;
+    }
+
+private:
+    // Disallow copying
+    SplittableParticleArray(const SplittableParticleArray&);
+    SplittableParticleArray& operator=(const SplittableParticleArray&);
+
+    // Masked copy constructor: references the particles of @a other whose mask entry is true.
+    SplittableParticleArray(const SplittableParticleArray& other, const boost::scoped_array<bool>& mask)
+        : mIndexMap(), mParticleArray(&other.particleArray()), mSize(0)
+    {
+        const size_t otherSize = other.size();
+        for (size_t i = 0; i < otherSize; ++i) {
+            if (mask[i]) ++mSize;
+        }
+        if (mSize != 0) {
+            mIndexMap.reset(new PointIndex[mSize]);
+            other.setIndexMap(mIndexMap, mask, true);
+        }
+
+        updateExtremas();
+    }
+
+    // Parallel body that sets mask[n] for every particle whose radius is not below the limit.
+    struct MaskParticles {
+        MaskParticles(const SplittableParticleArray& particles,
+            const boost::scoped_array<bool>& mask, ScalarType radius)
+            : particleArray(&particles), particleMask(mask.get()), radiusLimit(radius)
+        {}
+
+        void operator()(const tbb::blocked_range<size_t>& range) const {
+            ScalarType r;
+            for (size_t i = range.begin(); i != range.end(); ++i) {
+                particleArray->getRadius(i, r);
+                particleMask[i] = !(r < radiusLimit);
+            }
+        }
+
+        SplittableParticleArray const * const particleArray;
+        bool                          * const particleMask;
+        ScalarType                      const radiusLimit;
+    }; // struct MaskParticles
+
+    // Recompute mMinRadius / mMaxRadius over the particles currently in this view.
+    inline void updateExtremas() {
+        ComputeExtremas<SplittableParticleArray> extremas(*this);
+        tbb::parallel_reduce(tbb::blocked_range<size_t>(0, mSize), extremas);
+        mMinRadius = extremas.minRadius;
+        mMaxRadius = extremas.maxRadius;
+    }
+
+    // Write to @a newIndexMap, in order, the global index of every view-local
+    // particle n for which mask[n] == maskValue.
+    void setIndexMap(boost::scoped_array<PointIndex>& newIndexMap,
+        const boost::scoped_array<bool>& mask, bool maskValue) const
+    {
+        const PointIndex* const indices = mIndexMap.get();
+        size_t out = 0;
+        for (size_t i = 0; i < mSize; ++i) {
+            if (mask[i] != maskValue) continue;
+            // Without an index map, view-local and global indices coincide.
+            newIndexMap[out++] = (indices != NULL) ? indices[i] : PointIndex(i);
+        }
+    }
+
+    //////////
+
+    boost::scoped_array<PointIndex> mIndexMap;
+    ParticleArrayT const * const    mParticleArray;
+    size_t                          mSize;
+    ScalarType                      mMinRadius, mMaxRadius;
+}; // struct SplittableParticleArray
+
+
+/// @brief Parallel body that rewrites the point indices stored in a set of leaf nodes,
+/// replacing each stored index with @c particles.getGlobalIndex(index).
+template<typename ParticleArrayType, typename PointIndexLeafNodeType>
+struct RemapIndices {
+
+    /// @note @a nodes is taken by reference (not by value): only a pointer to its first
+    /// element is stored, so the vector must outlive this object and all of its copies.
+    RemapIndices(const ParticleArrayType& particles,
+        const std::vector<PointIndexLeafNodeType*>& nodes)
+        : mParticles(&particles)
+        , mNodes(nodes.empty() ? NULL : &nodes.front())
+    {}
+
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+        typedef typename PointIndexLeafNodeType::ValueType PointIndexType;
+        for (size_t n = range.begin(), N = range.end(); n != N; ++n) {
+            PointIndexLeafNodeType& node = *mNodes[n];
+            const size_t numIndices = node.indices().size();
+            if (numIndices == 0) continue; // front() must not be called on an empty container
+
+            PointIndexType* begin = &node.indices().front();
+            const PointIndexType* end = begin + numIndices;
+            for ( ; begin < end; ++begin) {
+                *begin = PointIndexType(mParticles->getGlobalIndex(*begin));
+            }
+        }
+    }
+
+    ParticleArrayType         const * const mParticles;
+    PointIndexLeafNodeType  * const * const mNodes;
+}; // struct RemapIndices
+
+
+/// @brief Search filter that accepts the particles whose sphere overlaps a query sphere.
+/// Accepted indices are appended to the @c IndexDeque passed to the constructor.
+template<typename ParticleArrayType, typename IndexT>
+struct RadialRangeFilter
+{
+    typedef typename ParticleArrayType::PosType             PosType;
+    typedef typename PosType::value_type                    ScalarType;
+
+    typedef std::pair<const IndexT*, const IndexT*>         Range;
+    typedef std::deque<Range>                               RangeDeque;
+    typedef std::deque<IndexT>                              IndexDeque;
+
+    /// @param ranges            stored but not used by the methods of this filter
+    /// @param indices           receives the indices of the accepted particles
+    /// @param xyz               center of the query sphere
+    /// @param radius            radius of the query sphere
+    /// @param particles         source of particle positions and radii
+    /// @param hasUniformRadius  if true, the radius of particle 0 is used for all particles
+    // NOTE(review): assumes particle 0 is representative of the level searched - confirm.
+    RadialRangeFilter(RangeDeque& ranges, IndexDeque& indices, const PosType& xyz,
+        ScalarType radius, const ParticleArrayType& particles, bool hasUniformRadius = false)
+        : mRanges(ranges)
+        , mIndices(indices)
+        , mCenter(xyz)
+        , mRadius(radius)
+        , mParticles(&particles)
+        , mHasUniformRadius(hasUniformRadius)
+    {
+        if (!mHasUniformRadius) return;
+        // Precompute the squared search radius once; filterVoxel() relies on this.
+        ScalarType uniformRadius;
+        mParticles->getRadius(0, uniformRadius);
+        mRadius += uniformRadius;
+        mRadius *= mRadius;
+    }
+
+    /// Test all indices stored in @a leaf; a leaf without indices is skipped.
+    template <typename LeafNodeType>
+    void filterLeafNode(const LeafNodeType& leaf)
+    {
+        const size_t numIndices = leaf.indices().size();
+        if (numIndices == 0) return;
+        const IndexT* const first = &leaf.indices().front();
+        filterVoxel(leaf.origin(), first, first + numIndices);
+    }
+
+    /// Append every index in [@a begin, @a end) whose particle center is strictly closer
+    /// to the query center than the sum of the query radius and the particle radius.
+    void filterVoxel(const Coord&, const IndexT* begin, const IndexT* end)
+    {
+        PosType pos;
+
+        if (mHasUniformRadius) {
+            const ScalarType searchRadiusSqr = mRadius; // already squared by the constructor
+            for ( ; begin < end; ++begin) {
+                mParticles->getPos(size_t(*begin), pos);
+                const ScalarType distSqr = (mCenter - pos).lengthSqr();
+                if (distSqr < searchRadiusSqr) mIndices.push_back(*begin);
+            }
+            return;
+        }
+
+        ScalarType particleRadius;
+        for ( ; begin < end; ++begin) {
+            const size_t idx = size_t(*begin);
+            mParticles->getPos(idx, pos);
+            mParticles->getRadius(idx, particleRadius);
+
+            const ScalarType reach = mRadius + particleRadius;
+            const ScalarType reachSqr = reach * reach;
+            const ScalarType distSqr = (mCenter - pos).lengthSqr();
+            if (distSqr < reachSqr) mIndices.push_back(*begin);
+        }
+    }
+
+private:
+    RadialRangeFilter(const RadialRangeFilter&);
+    RadialRangeFilter& operator=(const RadialRangeFilter&);
+
+    RangeDeque&                     mRanges;
+    IndexDeque&                     mIndices;
+    PosType                   const mCenter;
+    ScalarType                      mRadius;
+    ParticleArrayType const * const mParticles;
+    bool                      const mHasUniformRadius;
+}; // struct RadialRangeFilter
+
+
+/// @brief Search filter that accepts the particles whose sphere touches or overlaps
+/// a bounding box. Accepted indices are appended to the @c IndexDeque given at construction.
+template<typename ParticleArrayType, typename IndexT>
+struct BBoxFilter
+{
+    typedef typename ParticleArrayType::PosType     PosType;
+    typedef typename PosType::value_type            ScalarType;
+
+    typedef std::pair<const IndexT*, const IndexT*> Range;
+    typedef std::deque<Range>                       RangeDeque;
+    typedef std::deque<IndexT>                      IndexDeque;
+
+    /// @param ranges            stored but not used by the methods of this filter
+    /// @param indices           receives the indices of the accepted particles
+    /// @param bbox              query box, converted to the particles' position type
+    /// @param particles         source of particle positions and radii
+    /// @param hasUniformRadius  if true, the radius of particle 0 is used for all particles
+    // NOTE(review): assumes particle 0 is representative of the level searched - confirm.
+    BBoxFilter(RangeDeque& ranges, IndexDeque& indices,
+        const BBoxd& bbox, const ParticleArrayType& particles, bool hasUniformRadius = false)
+        : mRanges(ranges)
+        , mIndices(indices)
+        , mBBox(PosType(bbox.min()), PosType(bbox.max()))
+        , mCenter(mBBox.getCenter())
+        , mParticles(&particles)
+        , mHasUniformRadius(hasUniformRadius)
+        , mUniformRadiusSqr(ScalarType(0.0))
+    {
+        if (!mHasUniformRadius) return;
+        // Fetch the shared radius once and keep its square for the distance tests.
+        mParticles->getRadius(0, mUniformRadiusSqr);
+        mUniformRadiusSqr *= mUniformRadiusSqr;
+    }
+
+    /// Test all indices stored in @a leaf; a leaf without indices is skipped.
+    template <typename LeafNodeType>
+    void filterLeafNode(const LeafNodeType& leaf)
+    {
+        const size_t numIndices = leaf.indices().size();
+        if (numIndices == 0) return;
+        const IndexT* const first = &leaf.indices().front();
+        filterVoxel(leaf.origin(), first, first + numIndices);
+    }
+
+    /// Append every index in [@a begin, @a end) whose particle center is inside the box,
+    /// or whose squared distance to the box does not exceed its squared radius.
+    void filterVoxel(const Coord&, const IndexT* begin, const IndexT* end)
+    {
+        PosType pos;
+
+        if (mHasUniformRadius) {
+            const ScalarType radiusSqr = mUniformRadiusSqr;
+            for ( ; begin < end; ++begin) {
+                mParticles->getPos(size_t(*begin), pos);
+                bool accept = mBBox.isInside(pos);
+                if (!accept) {
+                    // Outside the box: accept if the sphere still reaches it.
+                    const ScalarType distSqr = pointToBBoxDistSqr(pos);
+                    accept = !(distSqr > radiusSqr);
+                }
+                if (accept) mIndices.push_back(*begin);
+            }
+            return;
+        }
+
+        for ( ; begin < end; ++begin) {
+            const size_t idx = size_t(*begin);
+            mParticles->getPos(idx, pos);
+
+            bool accept = mBBox.isInside(pos);
+            if (!accept) {
+                // Outside the box: accept if the particle's sphere still reaches it.
+                ScalarType radius;
+                mParticles->getRadius(idx, radius);
+                const ScalarType distSqr = pointToBBoxDistSqr(pos);
+                accept = !(distSqr > (radius * radius));
+            }
+            if (accept) mIndices.push_back(*begin);
+        }
+    }
+
+private:
+    BBoxFilter(const BBoxFilter&);
+    BBoxFilter& operator=(const BBoxFilter&);
+
+    /// Squared distance from @a pos to the box; zero along every axis where
+    /// @a pos lies between the box minimum and maximum.
+    ScalarType pointToBBoxDistSqr(const PosType& pos) const
+    {
+        ScalarType distSqr = ScalarType(0.0);
+
+        for (int axis = 0; axis < 3; ++axis) {
+            const ScalarType p = pos[axis];
+            const ScalarType lo = mBBox.min()[axis];
+            const ScalarType hi = mBBox.max()[axis];
+
+            if (p < lo) {
+                const ScalarType delta = lo - p;
+                distSqr += delta * delta;
+            }
+            if (p > hi) {
+                const ScalarType delta = p - hi;
+                distSqr += delta * delta;
+            }
+        }
+
+        return distSqr;
+    }
+
+    RangeDeque&                     mRanges;
+    IndexDeque&                     mIndices;
+    math::BBox<PosType>       const mBBox;
+    PosType                   const mCenter;
+    ParticleArrayType const * const mParticles;
+    bool                      const mHasUniformRadius;
+    ScalarType                      mUniformRadiusSqr;
+}; // struct BBoxFilter
+
+
+} // namespace particle_atlas_internal
+
+
+////////////////////////////////////////
+
+
+/// @brief Build the atlas: bin the particles by radius into at most @a maxLevels levels
+/// and create one point-index grid per level.
+/// @note The voxel size of the first level is the larger of @a minVoxelSize and the smallest radius.
+template<typename PointIndexGridType>
+template<typename ParticleArrayType>
+inline void
+ParticleAtlas<PointIndexGridType>::construct(
+    const ParticleArrayType& particles, double minVoxelSize, size_t maxLevels)
+{
+    typedef typename particle_atlas_internal::
+        SplittableParticleArray<ParticleArrayType, IndexType>   SplittableParticleArray;
+    typedef typename SplittableParticleArray::Ptr               SplittableParticleArrayPtr;
+    typedef typename ParticleArrayType::ScalarType              ScalarType;
+
+    // Radius range of the whole particle set.
+    particle_atlas_internal::ComputeExtremas<ParticleArrayType> extremas(particles);
+    tbb::parallel_reduce(tbb::blocked_range<size_t>(0, particles.size()), extremas);
+    const double firstMin = extremas.minRadius;
+    const double firstMax = extremas.maxRadius;
+    const double firstVoxelSize = std::max(minVoxelSize, firstMin);
+
+    if (!(firstMax < (firstVoxelSize * double(2.0))) && maxLevels > 1) {
+        // Multi-level case: level 0 starts out holding every particle.
+        std::vector<SplittableParticleArrayPtr> levels;
+        levels.push_back(SplittableParticleArrayPtr(
+                new SplittableParticleArray(particles, firstMin, firstMax)));
+
+        std::vector<double> voxelSizeArray;
+        voxelSizeArray.push_back(firstVoxelSize);
+
+        // Repeatedly move the particles that are too large for the current level
+        // (radius >= 2 x voxel size) into a new, coarser level. The first level counts
+        // towards maxLevels, so the total number of levels never exceeds that limit.
+        while (levels.size() < maxLevels) {
+            const double maxParticleRadius = double(levels.back()->maxRadius());
+            const double particleRadiusLimit = voxelSizeArray.back() * double(2.0);
+            if (maxParticleRadius < particleRadiusLimit) break;
+
+            SplittableParticleArrayPtr newLevel = levels.back()->split(ScalarType(particleRadiusLimit));
+            if (!newLevel) break;
+
+            levels.push_back(newLevel);
+            voxelSizeArray.push_back(double(newLevel->minRadius()));
+        }
+
+        typedef typename PointIndexGridType::TreeType       PointIndexTreeType;
+        typedef typename PointIndexTreeType::LeafNodeType   PointIndexLeafNodeType;
+
+        std::vector<PointIndexLeafNodeType*> nodes;
+
+        for (size_t n = 0, N = levels.size(); n < N; ++n) {
+            const SplittableParticleArray& particleArray = *levels[n];
+
+            mMinRadiusArray.push_back(double(particleArray.minRadius()));
+            mMaxRadiusArray.push_back(double(particleArray.maxRadius()));
+
+            PointIndexGridPtr grid = createPointIndexGrid<PointIndexGridType>(particleArray, voxelSizeArray[n]);
+
+            // The grid stores level-local indices; convert them to indices into @a particles.
+            nodes.clear();
+            grid->tree().getNodes(nodes);
+            tbb::parallel_for(tbb::blocked_range<size_t>(0, nodes.size()),
+                particle_atlas_internal::RemapIndices<SplittableParticleArray, PointIndexLeafNodeType>
+                (particleArray, nodes));
+
+            mIndexGridArray.push_back(grid);
+        }
+
+    } else {
+        // Single-level case: one grid over the unsplit particle array.
+        mMinRadiusArray.push_back(firstMin);
+        mMaxRadiusArray.push_back(firstMax);
+        mIndexGridArray.push_back(
+            createPointIndexGrid<PointIndexGridType>(particles, firstVoxelSize));
+    }
+}
+
+
+/// @brief Allocate a new atlas, populate it from @a particles via construct() and return it.
+template<typename PointIndexGridType> template<typename ParticleArrayType>
+inline typename ParticleAtlas<PointIndexGridType>::Ptr
+ParticleAtlas<PointIndexGridType>::create(
+    const ParticleArrayType& particles, double minVoxelSize, size_t maxLevels)
+{
+    Ptr atlas(new ParticleAtlas());
+    atlas->construct(particles, minVoxelSize, maxLevels);
+    return atlas;
+}
+
+
+////////////////////////////////////////
+
+// ParticleAtlas::Iterator implementation
+
+/// Create an iterator with no search results and one tree accessor per atlas level.
+template<typename PointIndexGridType>
+inline
+ParticleAtlas<PointIndexGridType>::Iterator::Iterator(const ParticleAtlas& atlas)
+    : mAtlas(&atlas)
+    , mAccessorList()
+    , mRange(static_cast<IndexType*>(NULL), static_cast<IndexType*>(NULL))
+    , mRangeList()
+    , mIter(mRangeList.begin())
+    , mIndexArray()
+    , mIndexArraySize(0)
+    , mAccessorListSize(atlas.levels())
+{
+    if (mAccessorListSize == 0) return;
+    mAccessorList.reset(new ConstAccessorPtr[mAccessorListSize]);
+    for (size_t level = 0; level < mAccessorListSize; ++level) {
+        mAccessorList[level].reset(new ConstAccessor(atlas.pointIndexGrid(level).tree()));
+    }
+}
+
+
+/// Rewind to the first index: the range list comes first, then the filtered index array.
+template<typename PointIndexGridType>
+inline void
+ParticleAtlas<PointIndexGridType>::Iterator::reset()
+{
+    mIter = mRangeList.begin();
+    if (!mRangeList.empty()) {
+        mRange = mRangeList.front();
+    } else {
+        // No ranges: use the index array, or an empty (null) range if there is none.
+        const IndexType* const first = mIndexArray.get();
+        mRange.first = first;
+        mRange.second = first ? first + mIndexArraySize : first;
+    }
+}
+
+
+/// Advance by one index, moving on to the next range (and finally the index array) when needed.
+template<typename PointIndexGridType>
+inline void
+ParticleAtlas<PointIndexGridType>::Iterator::increment()
+{
+    ++mRange.first;
+    // Nothing more to do while inside the current range, or once the range list is used up.
+    if (mRange.first < mRange.second || mIter == mRangeList.end()) return;
+
+    if (++mIter != mRangeList.end()) {
+        mRange = *mIter;
+    } else if (mIndexArray) {
+        mRange = Range(mIndexArray.get(), mIndexArray.get() + mIndexArraySize);
+    }
+}
+
+
+/// Advance to the next index; @return false if test() fails before or after incrementing.
+template<typename PointIndexGridType>
+inline bool
+ParticleAtlas<PointIndexGridType>::Iterator::next()
+{
+    if (this->test()) { this->increment(); return this->test(); }
+    return false;
+}
+
+
+/// Total number of indices: the lengths of all ranges plus the size of the index array.
+template<typename PointIndexGridType>
+inline size_t
+ParticleAtlas<PointIndexGridType>::Iterator::size() const
+{
+    // Indices held in the secondary index collection ...
+    size_t total = mIndexArraySize;
+
+    // ... plus the length of every range in the primary index collection.
+    for (RangeDequeCIter r = mRangeList.begin(); r != mRangeList.end(); ++r) {
+        total += r->second - r->first;
+    }
+    return total;
+}
+
+
+/// Drop all search results, leaving the iterator with an empty (null) range.
+template<typename PointIndexGridType>
+inline void
+ParticleAtlas<PointIndexGridType>::Iterator::clear()
+{
+    mRangeList.clear();
+    mIter = mRangeList.end();
+    mIndexArray.reset();
+    mIndexArraySize = 0;
+    mRange = Range(static_cast<const IndexType*>(NULL), static_cast<const IndexType*>(NULL));
+}
+
+
+/// Make the iterator visit every index stored on one level of the atlas.
+/// @param level  level to visit; values beyond the last level select the last level.
+template<typename PointIndexGridType>
+inline void
+ParticleAtlas<PointIndexGridType>::Iterator::updateFromLevel(size_t level)
+{
+    typedef typename PointIndexGridType::TreeType   TreeType;
+    typedef typename TreeType::LeafNodeType         LeafNodeType;
+
+    this->clear();
+
+    if (mAccessorListSize > 0) {
+        const size_t lastLevel = mAccessorListSize - 1;
+        const size_t levelIdx = (level < lastLevel) ? level : lastLevel;
+        const TreeType& tree = mAtlas->pointIndexGrid(levelIdx).tree();
+
+        std::vector<const LeafNodeType*> leafNodes;
+        tree.getNodes(leafNodes);
+
+        // Record one [begin, end) pointer range for every leaf node that stores indices.
+        for (size_t i = 0; i < leafNodes.size(); ++i) {
+            const LeafNodeType& leaf = *leafNodes[i];
+            const size_t numIndices = leaf.indices().size();
+            if (numIndices == 0) continue;
+
+            const IndexType* const first = &leaf.indices().front();
+            mRangeList.push_back(Range(first, first + numIndices));
+        }
+    }
+
+    this->reset();
+}
+
+
+/// @brief Collect the indices of all particles that overlap the world-space sphere
+/// given by @a center and @a radius, searching every level of the atlas.
+template<typename PointIndexGridType>
+template<typename ParticleArrayType>
+inline void
+ParticleAtlas<PointIndexGridType>::Iterator::worldSpaceSearchAndUpdate(
+    const Vec3d& center, double radius, const ParticleArrayType& particles)
+{
+    typedef typename ParticleArrayType::PosType     PosType;
+    typedef typename ParticleArrayType::ScalarType  ScalarType;
+
+    this->clear();
+
+    std::deque<IndexType> filteredIndices;
+    std::vector<CoordBBox> searchRegions;
+
+    const double iRadius = radius * double(1.0 / std::sqrt(3.0));
+    // Corners of the axis-aligned cube inscribed in the sphere (half-width radius / sqrt(3)).
+    const Vec3d ibMin(center[0] - iRadius, center[1] - iRadius, center[2] - iRadius);
+    const Vec3d ibMax(center[0] + iRadius, center[1] + iRadius, center[2] + iRadius);
+    // Corners of the axis-aligned bounding box of the sphere.
+    const Vec3d bMin(center[0] - radius, center[1] - radius, center[2] - radius);
+    const Vec3d bMax(center[0] + radius, center[1] + radius, center[2] + radius);
+
+    const PosType pos = PosType(center);
+    const ScalarType dist = ScalarType(radius);
+
+    for (size_t n = 0, N = mAccessorListSize; n < N; ++n) {
+        // A level is flagged as uniform when its min and max radius are approximately equal.
+        const double maxRadius = mAtlas->maxRadius(n);
+        const bool uniformRadius = math::isApproxEqual(mAtlas->minRadius(n), maxRadius);
+
+        const openvdb::math::Transform& xform = mAtlas->pointIndexGrid(n).transform();
+
+        ConstAccessor& acc = *mAccessorList[n];
+
+        openvdb::CoordBBox inscribedRegion(
+            xform.worldToIndexCellCentered(ibMin),
+            xform.worldToIndexCellCentered(ibMax));
+
+        inscribedRegion.expand(-1); // erode by one voxel
+
+        // collect indices that don't need to be tested
+        point_index_grid_internal::pointIndexSearch(mRangeList, acc, inscribedRegion);
+
+        searchRegions.clear();
+        // Index-space bounds of the sphere's bounding box, padded by this level's largest radius.
+        const openvdb::CoordBBox region(
+            xform.worldToIndexCellCentered(bMin - maxRadius),
+            xform.worldToIndexCellCentered(bMax + maxRadius));
+        // Undo the erosion; presumably `region` minus `inscribedRegion` is then split into boxes - TODO confirm.
+        inscribedRegion.expand(1);
+        point_index_grid_internal::constructExclusiveRegions(searchRegions, region, inscribedRegion);
+        // Particles found in the remaining regions are tested one by one by the filter.
+        typedef particle_atlas_internal::RadialRangeFilter<ParticleArrayType, IndexType> FilterType;
+        FilterType filter(mRangeList, filteredIndices, pos, dist, particles, uniformRadius);
+
+        for (size_t i = 0, I = searchRegions.size(); i < I; ++i) {
+            point_index_grid_internal::filteredPointIndexSearch(filter, acc, searchRegions[i]);
+        }
+    }
+    // Presumably copies the individually accepted indices into mIndexArray / mIndexArraySize - confirm.
+    point_index_grid_internal::dequeToArray(filteredIndices, mIndexArray, mIndexArraySize);
+
+    this->reset();
+}
+
+
+/// @brief Collect the indices of all particles that touch or overlap the world-space
+/// bounding box @a bbox, searching every level of the atlas.
+template<typename PointIndexGridType>
+template<typename ParticleArrayType>
+inline void
+ParticleAtlas<PointIndexGridType>::Iterator::worldSpaceSearchAndUpdate(
+    const BBoxd& bbox, const ParticleArrayType& particles)
+{
+    typedef typename ParticleArrayType::PosType     PosType;
+    typedef typename ParticleArrayType::ScalarType  ScalarType;
+
+    this->clear();
+
+    std::deque<IndexType> filteredIndices;
+    std::vector<CoordBBox> searchRegions;
+
+    for (size_t n = 0, N = mAccessorListSize; n < N; ++n) {
+        // A level is flagged as uniform when its min and max radius are approximately equal.
+        const double maxRadius = mAtlas->maxRadius(n);
+        const bool uniformRadius = math::isApproxEqual(mAtlas->minRadius(n), maxRadius);
+        const openvdb::math::Transform& xform = mAtlas->pointIndexGrid(n).transform();
+
+        ConstAccessor& acc = *mAccessorList[n];
+
+        openvdb::CoordBBox inscribedRegion(
+            xform.worldToIndexCellCentered(bbox.min()),
+            xform.worldToIndexCellCentered(bbox.max()));
+
+        inscribedRegion.expand(-1); // erode by one voxel
+
+        // collect indices that don't need to be tested
+        point_index_grid_internal::pointIndexSearch(mRangeList, acc, inscribedRegion);
+
+        searchRegions.clear();
+        // Index-space bounds of the query box, padded by this level's largest particle radius.
+        const openvdb::CoordBBox region(
+            xform.worldToIndexCellCentered(bbox.min() - maxRadius),
+            xform.worldToIndexCellCentered(bbox.max() + maxRadius));
+        // Undo the erosion; presumably `region` minus `inscribedRegion` is then split into boxes - TODO confirm.
+        inscribedRegion.expand(1);
+        point_index_grid_internal::constructExclusiveRegions(searchRegions, region, inscribedRegion);
+        // Particles found in the remaining regions are tested one by one by the filter.
+        typedef particle_atlas_internal::BBoxFilter<ParticleArrayType, IndexType> FilterType;
+        FilterType filter(mRangeList, filteredIndices, bbox, particles, uniformRadius);
+
+        for (size_t i = 0, I = searchRegions.size(); i < I; ++i) {
+            point_index_grid_internal::filteredPointIndexSearch(filter, acc, searchRegions[i]);
+        }
+    }
+    // Presumably copies the individually accepted indices into mIndexArray / mIndexArraySize - confirm.
+    point_index_grid_internal::dequeToArray(filteredIndices, mIndexArray, mIndexArraySize);
+
+    this->reset();
+}
+
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_PARTICLE_ATLAS_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/ParticlesToLevelSet.h b/nuparu/include/openvdb_new/tools/ParticlesToLevelSet.h
new file mode 100644
index 00000000..5570f19a
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/ParticlesToLevelSet.h
@@ -0,0 +1,924 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @author Ken Museth
+///
+/// @file ParticlesToLevelSet.h
+///
+/// @brief This tool converts particles (with position, radius and velocity)
+/// into a signed distance field encoded as a narrow band level set.
+/// Optionally, arbitrary attributes on the particles can be transferred
+/// resulting in an additional attribute grid with the same topology
+/// as the level set grid.
+///
+/// @note This fast particle to level set converter is always intended
+/// to be combined with some kind of surface post processing,
+/// i.e. tools::Filter. Without such post processing the generated
+/// surface is typically too noisy and blobby. However it serves as a
+/// great and fast starting point for subsequent level set surface
+/// processing and convolution.
+///
+/// The @c ParticleListT template argument below refers to any class
+/// with the following interface (see unittest/TestParticlesToLevelSet.cc
+/// and SOP_DW_OpenVDBParticleVoxelizer for practical examples):
+/// @code
+///
+/// class ParticleList {
+///   ...
+/// public:
+///   typedef openvdb::Vec3R    PosType;
+///
+///   // Return the total number of particles in list.
+///   // Always required!
+///   size_t         size()          const;
+///
+///   // Get the world space position of the nth particle.
+///   // Required by ParticlesToLevelSet::rasterizeSpheres(*this,radius).
+///   void getPos(size_t n, Vec3R& xyz) const;
+///
+///   // Get the world space position and radius of the nth particle.
+///   // Required by ParticlesToLevelSet::rasterizeSpheres(*this).
+///   void getPosRad(size_t n, Vec3R& xyz, Real& rad) const;
+///
+///   // Get the world space position, radius and velocity of the nth particle.
+///   // Required by ParticlesToLevelSet::rasterizeTrails(*this).
+///   void getPosRadVel(size_t n, Vec3R& xyz, Real& rad, Vec3R& vel) const;
+///
+///   // Get the attribute of the nth particle. AttributeType is user-defined!
+///   // Only required if attribute transfer is enabled in ParticlesToLevelSet.
+///   void getAtt(size_t n, AttributeType& att) const;
+/// };
+/// @endcode
+///
+/// @note See unittest/TestParticlesToLevelSet.cc for an example.
+///
+/// The @c InterruptT template argument below refers to any class
+/// with the following interface:
+/// @code
+/// class Interrupter {
+///   ...
+/// public:
+///   void start(const char* name = NULL)// called when computations begin
+///   void end()                         // called when computations end
+///   bool wasInterrupted(int percent=-1)// return true to break computation
+/// };
+/// @endcode
+///
+/// @note If no template argument is provided for this InterruptT
+/// the util::NullInterrupter is used which implies that all
+/// interrupter calls are no-ops (i.e. incurs no computational overhead).
+
+#ifndef OPENVDB_TOOLS_PARTICLES_TO_LEVELSET_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_PARTICLES_TO_LEVELSET_HAS_BEEN_INCLUDED
+
+#include <tbb/parallel_reduce.h>
+#include <tbb/blocked_range.h>
+#include <tbb/task_group.h>
+#include <boost/bind.hpp>
+#include <boost/function.hpp>
+#include <boost/type_traits/is_floating_point.hpp>
+#include <boost/utility/enable_if.hpp>
+#include <boost/mpl/if.hpp>
+#include <openvdb/Types.h>
+#include <openvdb/Grid.h>
+#include <openvdb/math/Math.h>
+#include <openvdb/math/Transform.h>
+#include <openvdb/util/NullInterrupter.h>
+#include "Composite.h" // for csgUnion()
+#include "PointMaskGrid.h"
+#include "PointPartitioner.h"
+#include "Morphology.h" // for {dilate|erode}Voxels
+#include "Prune.h"
+#include "SignedFloodFill.h"
+#include "LevelSetTracker.h"
+#include "MaskToLevelSet.h"
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+namespace {
+// Value type pairing a signed distance (the "visible" part) with a particle
+// attribute (the "blind" part). It is the voxel type of the temporary grid
+// that ParticlesToLevelSet fills when attribute transfer is enabled.
+template<typename VisibleT, typename BlindT> class BlindData;
+}// unnamed namespace
+
+
+template<typename SdfGridT,
+         typename AttributeT = void,
+         typename InterrupterT = util::NullInterrupter>
+class ParticlesToLevelSet
+{
+public:
+    typedef typename boost::is_void<AttributeT>::type DisableT;
+    typedef InterrupterT                          InterrupterType;
+
+    typedef SdfGridT                              SdfGridType;
+    typedef typename SdfGridT::ValueType          SdfType;
+
+    typedef typename boost::mpl::if_<DisableT, size_t, AttributeT>::type  AttType;
+    typedef typename SdfGridT::template ValueConverter<AttType>::Type AttGridType;
+
+    BOOST_STATIC_ASSERT(boost::is_floating_point<SdfType>::value);
+
+    /// @brief Constructor using an existing signed distance,
+    /// i.e. narrow band level set, grid.
+    ///
+    /// @param grid      Level set grid in which particles are rasterized
+    /// @param interrupt Callback to interrupt a long-running process
+    ///
+    /// @note The input grid is assumed to be a valid level set and if
+    /// it already contains voxels (with SDF values) particles are unioned
+    /// onto the existing level set surface. However, if attribute transfer
+    /// is enabled, i.e. AttributeT != void, attributes are only
+    /// generated for voxels that overlap with particles, not the existing
+    /// voxels in the input grid (for which no attributes exist!).
+    ///
+    /// @details The width in voxel units of the generated narrow band level set is
+    /// given by 2*background/dx, where background is the background value
+    /// stored in the grid, and dx is the voxel size derived from the
+    /// transform also stored in the grid. Also note that -background
+    /// corresponds to the constant value inside the generated narrow
+    /// band level sets. Finally the default NullInterrupter should
+    /// compile out interruption checks during optimization, thus
+    /// incurring no run-time overhead.
+    explicit ParticlesToLevelSet(SdfGridT& grid, InterrupterT* interrupt = NULL);
+
+    /// Destructor (deletes the internal blind-data grid, if one was allocated)
+    ~ParticlesToLevelSet() { delete mBlindGrid; }
+
+    /// @brief This method syncs up the level set and attribute grids
+    /// and therefore needs to be called before any of these grids are
+    /// used and after the last call to any of the rasterizer methods.
+    ///
+    /// @note Avoid calling this method more than once and only after
+    /// all the particles have been rasterized. It has no effect or
+    /// overhead if attribute transfer is disabled, i.e. AttributeT =
+    /// void and prune is false.
+    void finalize(bool prune = false);
+
+    /// @brief Return a shared pointer to the grid containing the
+    /// (optional) attribute.
+    ///
+    /// @warning If attribute transfer was disabled, i.e. AttributeT =
+    /// void, or finalize() was not called the pointer is NULL!
+    typename AttGridType::Ptr attributeGrid() { return mAttGrid; }
+
+    /// @brief Return the size of a voxel in world units
+    Real getVoxelSize() const { return mDx; }
+
+    /// @brief Return the half-width of the narrow band in voxel units
+    Real getHalfWidth() const { return mHalfWidth; }
+
+    /// @brief Return the smallest radius allowed in voxel units
+    Real getRmin() const { return mRmin; }
+    /// @brief Return the largest radius allowed in voxel units
+    Real getRmax() const { return mRmax; }
+
+    /// @brief Return true if any particles were ignored due to their size
+    bool ignoredParticles() const { return mMinCount>0 || mMaxCount>0; }
+    /// @brief Return number of small particles that were ignored due to Rmin
+    size_t getMinCount() const { return mMinCount; }
+    /// @brief Return number of large particles that were ignored due to Rmax
+    size_t getMaxCount() const { return mMaxCount; }
+
+    /// @brief Set the smallest radius allowed in voxel units (clamped to be >= 0)
+    void setRmin(Real Rmin) { mRmin = math::Max(Real(0),Rmin); }
+    /// @brief Set the largest radius allowed in voxel units (clamped to be >= Rmin)
+    void setRmax(Real Rmax) { mRmax = math::Max(mRmin,Rmax); }
+
+    /// @brief Returns the grain-size used for multi-threading
+    int  getGrainSize() const { return mGrainSize; }
+    /// @brief Set the grain-size used for multi-threading.
+    /// @note A grainsize of 0 or less disables multi-threading!
+    void setGrainSize(int grainSize) { mGrainSize = grainSize; }
+
+    /// @brief Very fast generation of a level set from the active
+    /// mask of an input grid, e.g. a MaskGrid generated from points.
+    ///
+    /// @param grid Grid whose active mask is converted into a level set.
+    /// @param dilationInVoxels Dilation in voxel units.
+    /// @param erosionInVoxels  Erosion in voxel units. It is
+    /// recommended that erosionInVoxels <= dilationInVoxels.
+    template <typename GridT>
+    void rasterizeMask(const GridT& grid,
+                       const int dilationInVoxels = 1,
+                       const int erosionInVoxels  = 1);
+
+    /// @brief Very fast generation of a level set from points
+    /// (e.g. particles without a radius). It employs a MaskGrid
+    /// and various bit-wise topology operations.
+    ///
+    /// @param points List of points (no radius required).
+    /// @param dilationInVoxels Dilation in voxel units.
+    /// @param erosionInVoxels  Erosion in voxel units. It is
+    /// recommended that erosionInVoxels <= dilationInVoxels.
+    template <typename PointListT>
+    void rasterizePoints(const PointListT& points,
+                         const int dilationInVoxels = 1,
+                         const int erosionInVoxels  = 1);
+
+    /// @brief Rasterize a sphere per particle derived from their
+    /// position and radius. All spheres are CSG unioned.
+    ///
+    /// @param pa Particles with position and radius.
+    template <typename ParticleListT>
+    void rasterizeSpheres(const ParticleListT& pa);
+
+    /// @brief Rasterize a sphere per particle derived from their
+    /// position and constant radius. All spheres are CSG unioned.
+    ///
+    /// @param pa Particles with position.
+    /// @param radius Constant particle radius in world units.
+    template <typename ParticleListT>
+    void rasterizeSpheres(const ParticleListT& pa, Real radius);
+
+    /// @brief Rasterize a trail per particle derived from their
+    /// position, radius and velocity. Each trail is generated
+    /// as CSG unions of sphere instances with decreasing radius.
+    ///
+    /// @param pa particles with position, radius and velocity.
+    /// @param delta controls distance between sphere instances
+    /// (default=1). Be careful not to use too small values since this
+    /// can lead to excessive computation per trail (which the
+    /// interrupter can't stop).
+    ///
+    /// @note The direction of a trail is inverse to the direction of
+    /// the velocity vector, and the length is given by |V|. The radius
+    /// at the head of the trail is given by the radius of the particle
+    /// and the radius at the tail of the trail is Rmin voxel units which
+    /// has a default value of 1.5 corresponding to the Nyquist
+    /// frequency!
+    template <typename ParticleListT>
+    void rasterizeTrails(const ParticleListT& pa, Real delta=1.0);
+
+private:
+    typedef BlindData<SdfType, AttType> BlindType;
+    typedef typename SdfGridT::template ValueConverter<BlindType>::Type BlindGridType;
+
+    /// Class with multi-threaded implementation of particle rasterization
+    template<typename ParticleListT, typename GridT> struct Raster;
+
+    SdfGridType*   mSdfGrid;
+    typename AttGridType::Ptr   mAttGrid;
+    BlindGridType* mBlindGrid;
+    InterrupterT*  mInterrupter;
+    Real           mDx, mHalfWidth;
+    Real           mRmin, mRmax;//ignore particles outside this range of radii in voxel
+    size_t         mMinCount, mMaxCount;//counters for ignored particles!
+    int            mGrainSize;
+
+};//end of ParticlesToLevelSet class
+
+template<typename SdfGridT, typename AttributeT, typename InterrupterT>
+inline ParticlesToLevelSet<SdfGridT, AttributeT, InterrupterT>::
+ParticlesToLevelSet(SdfGridT& grid, InterrupterT* interrupter)
+    : mSdfGrid(&grid)
+    , mBlindGrid(NULL)
+    , mInterrupter(interrupter)
+    , mDx(grid.voxelSize()[0])
+    , mHalfWidth(grid.background()/mDx) // background value expressed in voxel units
+    , mRmin(1.5)   // corresponds to the Nyquist grid sampling frequency
+    , mRmax(100.0) // corresponds to a huge particle (probably too large!)
+    , mMinCount(0)
+    , mMaxCount(0)
+    , mGrainSize(1)
+{
+    // Reject grids this tool cannot rasterize into
+    if (!grid.hasUniformVoxels()) {
+        OPENVDB_THROW(RuntimeError, "ParticlesToLevelSet only supports uniform voxels!");
+    }
+    if (GRID_LEVEL_SET != grid.getGridClass()) {
+        OPENVDB_THROW(RuntimeError,
+                      "ParticlesToLevelSet only supports level sets!"
+                      "\nUse Grid::setGridClass(openvdb::GRID_LEVEL_SET)");
+    }
+    // The blind-data grid is only needed when an attribute type was supplied
+    if (DisableT::value) return;
+
+    mBlindGrid = new BlindGridType(BlindType(grid.background()));
+    mBlindGrid->setTransform(grid.transform().copy());
+}
+
+namespace {
+// Nullary functors that dilate / erode the voxels of a tree by a fixed amount.
+template<typename TreeT>
+struct DilationHandler {
+    DilationHandler(TreeT& target, int amount) : tree(&target), size(amount) {}
+    void operator()() const { dilateVoxels(*tree, size); }
+    TreeT* tree;    // tree that is modified in place
+    const int size; // amount forwarded to dilateVoxels
+};
+template<typename TreeT>
+struct ErosionHandler {
+    ErosionHandler(TreeT& target, int amount) : tree(&target), size(amount) {}
+    void operator()() const { erodeVoxels(*tree, size); }
+    TreeT* tree;    // tree that is modified in place
+    const int size; // amount forwarded to erodeVoxels
+};
+
+}
+
+template<typename SdfGridT, typename AttributeT, typename InterrupterT>
+template <typename MaskGrid>
+inline void ParticlesToLevelSet<SdfGridT, AttributeT, InterrupterT>::
+rasterizeMask(const MaskGrid& maskGrid, int dilation, int erosion)
+{
+    const int halfWidth = int(mHalfWidth); // band half-width truncated to whole voxels
+    mSdfGrid->setTree(maskToLevelSet<MaskGrid, math::FIRST_BIAS, InterrupterT>(
+        maskGrid, halfWidth, dilation, erosion, mInterrupter)->treePtr());
+}
+
+template<typename SdfGridT, typename AttributeT, typename InterrupterT>
+template <typename PointListT>
+inline void ParticlesToLevelSet<SdfGridT, AttributeT, InterrupterT>::
+rasterizePoints(const PointListT& points, int dilation, int erosion)
+{
+    typedef typename SdfGridT::template ValueConverter<ValueMask>::Type MaskGridT;
+    typedef typename MaskGridT::TreeType                                MaskTreeT;
+    typedef typename MaskTreeT::Ptr                                     MaskTreePtrT;
+
+    // Start from a topology copy of the current SDF tree, then add the points to the mask
+    MaskGridT mask(MaskTreePtrT(new MaskTreeT(mSdfGrid->tree(), false, TopologyCopy())));
+    mask.setTransform(mSdfGrid->transform().copy());
+    PointMaskGrid<MaskGridT, InterrupterT> pointMask(mask, mInterrupter);
+    pointMask.addPoints(points);
+    // Turn the populated mask into the output level set
+    this->rasterizeMask(mask, dilation, erosion);
+}
+
+template<typename SdfGridT, typename AttributeT, typename InterrupterT>
+template <typename ParticleListT>
+inline void ParticlesToLevelSet<SdfGridT, AttributeT, InterrupterT>::
+rasterizeSpheres(const ParticleListT& pa)
+{
+    if (!DisableT::value) {// attribute transfer: rasterize into the blind-data grid
+        Raster<ParticleListT, BlindGridType> raster(*this, mBlindGrid, pa);
+        raster.rasterizeSpheres();
+        return;
+    }
+    Raster<ParticleListT, SdfGridT> raster(*this, mSdfGrid, pa);
+    raster.rasterizeSpheres();
+}
+
+template<typename SdfGridT, typename AttributeT, typename InterrupterT>
+template <typename ParticleListT>
+inline void ParticlesToLevelSet<SdfGridT, AttributeT, InterrupterT>::
+rasterizeSpheres(const ParticleListT& pa, Real radius)
+{
+    const Real radiusInVoxels = radius/mDx; // world units -> voxel units
+    if (!DisableT::value) {// attribute transfer: rasterize into the blind-data grid
+        Raster<ParticleListT, BlindGridType> raster(*this, mBlindGrid, pa);
+        raster.rasterizeSpheres(radiusInVoxels);
+        return;
+    }
+    Raster<ParticleListT, SdfGridT> raster(*this, mSdfGrid, pa);
+    raster.rasterizeSpheres(radiusInVoxels);
+}
+
+template<typename SdfGridT, typename AttributeT, typename InterrupterT>
+template <typename ParticleListT>
+inline void ParticlesToLevelSet<SdfGridT, AttributeT, InterrupterT>::
+rasterizeTrails(const ParticleListT& pa, Real delta)
+{
+    if (!DisableT::value) {// attribute transfer: rasterize into the blind-data grid
+        Raster<ParticleListT, BlindGridType> raster(*this, mBlindGrid, pa);
+        raster.rasterizeTrails(delta);
+        return;
+    }
+    Raster<ParticleListT, SdfGridT> raster(*this, mSdfGrid, pa);
+    raster.rasterizeTrails(delta);
+}
+
+template<typename SdfGridT, typename AttributeT, typename InterrupterT>
+inline void
+ParticlesToLevelSet<SdfGridT, AttributeT, InterrupterT>::finalize(bool prune)
+{
+    if (mBlindGrid==NULL) {// no blind-data grid was allocated, so there is nothing to sync
+        if (prune) tools::pruneLevelSet(mSdfGrid->tree());
+        return;
+    } else {
+        if (prune) tools::prune(mBlindGrid->tree());
+    }
+
+    typedef typename SdfGridType::TreeType   SdfTreeT;
+    typedef typename AttGridType::TreeType   AttTreeT;
+    typedef typename BlindGridType::TreeType BlindTreeT;
+    // Use topology copy constructors since output grids have the same topology as mBlindGrid
+    const BlindTreeT& tree = mBlindGrid->tree();
+
+    // New level set tree
+    typename SdfTreeT::Ptr sdfTree(new SdfTreeT(
+        tree, tree.background().visible(), openvdb::TopologyCopy()));
+
+    // Note this overwrites any existing attribute grids!
+    typename AttTreeT::Ptr attTree(new AttTreeT(
+        tree, tree.background().blind(), openvdb::TopologyCopy()));
+    mAttGrid = typename AttGridType::Ptr(new AttGridType(attTree));
+    mAttGrid->setTransform(mBlindGrid->transform().copy());
+
+    // Extract the level set and attributes from mBlindGrid. We will
+    // exploit the fact that by design active values always live
+    // at the leaf node level, i.e. level sets have no active tiles!
+    typedef typename BlindTreeT::LeafCIter    LeafIterT;
+    typedef typename BlindTreeT::LeafNodeType LeafT;
+    typedef typename SdfTreeT::LeafNodeType   SdfLeafT;
+    typedef typename AttTreeT::LeafNodeType   AttLeafT;
+    for (LeafIterT n = tree.cbeginLeaf(); n; ++n) {
+        const LeafT& leaf = *n;
+        const openvdb::Coord xyz = leaf.origin();
+        // Get leafnodes that were allocated during topology construction!
+        SdfLeafT* sdfLeaf = sdfTree->probeLeaf(xyz);
+        AttLeafT* attLeaf = attTree->probeLeaf(xyz);
+        // Use linear offset (vs coordinate) access for better performance!
+        typename LeafT::ValueOnCIter m=leaf.cbeginValueOn();
+        if (!m) {//no active values in leaf node so copy everything
+            for (openvdb::Index k = 0; k!=LeafT::SIZE; ++k) {
+                const BlindType& v = leaf.getValue(k);
+                sdfLeaf->setValueOnly(k, v.visible());
+                attLeaf->setValueOnly(k, v.blind());
+            }
+        } else {//only copy active values (using flood fill for the inactive values)
+            for(; m; ++m) {
+                const openvdb::Index k = m.pos();
+                const BlindType& v = *m;
+                sdfLeaf->setValueOnly(k, v.visible());
+                attLeaf->setValueOnly(k, v.blind());
+            }
+        }
+    }
+
+    tools::signedFloodFill(*sdfTree);//required since we only transferred active voxels!
+
+    if (mSdfGrid->empty()) {// nothing to merge with: adopt the new tree directly
+        mSdfGrid->setTree(sdfTree);
+    } else {// union the new surface onto the level set already stored in the grid
+        tools::csgUnion(mSdfGrid->tree(), *sdfTree, /*prune=*/true);
+    }
+}
+
+///////////////////////////////////////////////////////////
+
+template<typename SdfGridT, typename AttributeT, typename InterrupterT>
+template<typename ParticleListT, typename GridT>
+struct ParticlesToLevelSet<SdfGridT, AttributeT, InterrupterT>::Raster
+{
+    typedef typename boost::is_void<AttributeT>::type DisableT;
+    typedef ParticlesToLevelSet<SdfGridT, AttributeT, InterrupterT> ParticlesToLevelSetT;
+    typedef typename ParticlesToLevelSetT::SdfType   SdfT;//type of signed distance values
+    typedef typename ParticlesToLevelSetT::AttType   AttT;//type of particle attribute
+    typedef typename GridT::ValueType                ValueT;
+    typedef typename GridT::Accessor                 AccessorT;
+    typedef typename GridT::TreeType                 TreeT;
+    typedef typename TreeT::LeafNodeType             LeafNodeT;
+    typedef PointPartitioner<Index32, LeafNodeT::LOG2DIM> PointPartitionerT;
+
+
+    /// @brief Main constructor: binds the output grid and partitions the particles into buckets
+    Raster(ParticlesToLevelSetT& parent, GridT* grid, const ParticleListT& particles)
+        : mParent(parent)
+        , mParticles(particles)
+        , mGrid(grid)
+        , mMap(*(mGrid->transform().baseMap()))
+        , mMinCount(0)
+        , mMaxCount(0)
+        , mIsCopy(false) // the original deletes mPointPartitioner in its destructor
+    {
+        mPointPartitioner = new PointPartitionerT();
+        mPointPartitioner->construct(particles, mGrid->transform());
+    }
+
+    /// @brief Splitting constructor (tbb::split overload) used when tbb threads split the body
+    Raster(Raster& other, tbb::split)
+        : mParent(other.mParent)
+        , mParticles(other.mParticles)
+        , mGrid(new GridT(*other.mGrid, openvdb::ShallowCopy()))
+        , mMap(other.mMap)
+        , mMinCount(0)
+        , mMaxCount(0)
+        , mTask(other.mTask)
+        , mIsCopy(true) // this body deletes its own mGrid in the destructor
+        , mPointPartitioner(other.mPointPartitioner) // shared with, and owned by, the original
+    {
+        mGrid->newTree(); // presumably gives this body its own empty tree - TODO confirm
+    }
+
+    virtual ~Raster() {
+
+        // A split copy deletes the temporary grid it allocated, whereas only
+        // the original deletes the point partitioner (the bucket array)
+        if (mIsCopy) {
+            delete mGrid;
+        } else {
+            delete mPointPartitioner;
+        }
+    }
+
+    /// @brief Rasterize one sphere per particle, using each particle's own
+    /// position and radius. The spheres are combined by CSG union.
+    void rasterizeSpheres()
+    {
+        InterrupterT* const interrupter = mParent.mInterrupter;
+        mMinCount = 0;
+        mMaxCount = 0;
+        if (interrupter) interrupter->start("Rasterizing particles to level set using spheres");
+        mTask = boost::bind(&Raster::rasterSpheres, _1, _2);
+        this->cook();
+        if (interrupter) interrupter->end();
+    }
+    /// @brief Rasterize one sphere of identical radius per particle (CSG union).
+    /// If the radius lies outside the parent's [Rmin, Rmax] every particle is skipped.
+    /// @param radius constant radius of all particles in voxel units.
+    void rasterizeSpheres(Real radius)
+    {
+        mMinCount = (radius < mParent.mRmin) ? mParticles.size() : 0;
+        mMaxCount = (radius > mParent.mRmax) ? mParticles.size() : 0;
+        if (mMinCount != 0 || mMaxCount != 0) {// skipping all particles: only report counts
+            mParent.mMinCount = mMinCount;
+            mParent.mMaxCount = mMaxCount;
+            return;
+        }
+        InterrupterT* const interrupter = mParent.mInterrupter;
+        if (interrupter) {
+            interrupter->start("Rasterizing particles to level set using const spheres");
+        }
+        mTask = boost::bind(&Raster::rasterFixedSpheres, _1, _2, SdfT(radius));
+        this->cook();
+        if (interrupter) interrupter->end();
+    }
+    /// @brief Rasterize one trail per particle, built from the particle's
+    /// position, radius and velocity. A trail is the CSG union of a
+    /// sequence of sphere instances whose radius decreases along the trail.
+    ///
+    /// @param delta controls the spacing between consecutive sphere
+    /// instances (default=1). Very small values can make a single
+    /// trail excessively expensive, and the interrupter cannot stop
+    /// the work done for one trail.
+    ///
+    /// @note A trail points opposite to the velocity vector and its
+    /// length is |V|. The head of the trail uses the particle's own
+    /// radius, while the tail shrinks to Rmin voxel units, which
+    /// defaults to 1.5 (corresponding to the Nyquist frequency).
+    /// Particle radii are reset-counted per call via mMinCount/mMaxCount.
+    void rasterizeTrails(Real delta=1.0)
+    {
+        InterrupterT* const interrupter = mParent.mInterrupter;
+        mMinCount = 0;
+        mMaxCount = 0;
+        if (interrupter) interrupter->start("Rasterizing particles to level set using trails");
+        mTask = boost::bind(&Raster::rasterTrails, _1, _2, SdfT(delta));
+        this->cook();
+        if (interrupter) interrupter->end();
+    }
+
+    /// @brief Run the bound task on bucket range r (called by cook(), serially or through tbb)
+    void operator()(const tbb::blocked_range<size_t>& r)
+    {
+        assert(mTask);
+        mTask(this, r);
+        mParent.mMinCount = mMinCount;// NOTE(review): only this body's count so far; split
+        mParent.mMaxCount = mMaxCount;// bodies appear to write the shared parent unsynchronized
+    }
+
+    /// @brief Required by tbb::parallel_reduce: CSG-unions other's grid into this one
+    void join(Raster& other)
+    {
+        tools::csgUnion(*mGrid, *other.mGrid, /*prune=*/true);
+        mMinCount += other.mMinCount;// accumulate the ignored-particle counters as well
+        mMaxCount += other.mMaxCount;
+    }
+private:
+    /// Private no-op assignment: disallowed for callers since some of the members are references
+    Raster& operator=(const Raster&) { return *this; }
+
+    /// @brief Check radius R against the parent's Rmin/Rmax limits and count a
+    /// rejection in mMinCount or mMaxCount. @return true if R is out of range
+    bool ignoreParticle(SdfT R)
+    {
+        const bool tooSmall = R < mParent.mRmin;              // below the cutoff radius
+        const bool tooLarge = !tooSmall && R > mParent.mRmax; // above the cutoff radius
+        if (tooSmall) {
+            ++mMinCount;
+        } else if (tooLarge) {
+            ++mMaxCount;
+        }
+        return tooSmall || tooLarge;
+    }
+    /// @brief Task bound to mTask by rasterizeSpheres() for multithreaded
+    /// rasterization of particles as spheres with variable radius
+    ///
+    /// @param r range of point-partitioner bucket indices to process
+    void rasterSpheres(const tbb::blocked_range<size_t>& r)
+    {
+        AccessorT acc = mGrid->getAccessor(); // local accessor
+        bool run = true; // set to the result of makeSphere (false if it was interrupted)
+        const SdfT invDx = SdfT(1/mParent.mDx);
+        AttT att;
+        Vec3R pos;
+        Real rad;
+
+        // Loop over buckets
+        for (size_t n = r.begin(), N = r.end(); n < N; ++n) {
+            // Loop over particles in bucket n.
+            typename PointPartitionerT::IndexIterator iter = mPointPartitioner->indices(n);
+            for ( ; run && iter; ++iter) {
+                const Index32& id = *iter;
+                mParticles.getPosRad(id, pos, rad);
+                const SdfT R = SdfT(invDx * rad);// in voxel units
+                if (this->ignoreParticle(R)) continue;
+                const Vec3R P = mMap.applyInverseMap(pos);
+                this->getAtt<DisableT>(id, att);
+                run = this->makeSphere(P, R, att, acc);
+            }//end loop over particles
+        }//end loop over buckets
+    }
+
+    /// @brief Task bound to mTask by rasterizeSpheres(Real) for multithreaded
+    /// rasterization of particles as spheres with a fixed radius
+    /// @param r range of point-partitioner bucket indices to process
+    /// @param R constant radius of all particles in voxel units
+    void rasterFixedSpheres(const tbb::blocked_range<size_t>& r, SdfT R)
+    {
+        const SdfT
+            dx = static_cast<SdfT>(mParent.mDx),
+            w = static_cast<SdfT>(mParent.mHalfWidth); // in voxel units
+        AccessorT acc = mGrid->getAccessor(); // local accessor
+        const ValueT inside = -mGrid->background();
+        const SdfT max = R + w;// maximum distance in voxel units
+        const SdfT max2 = math::Pow2(max);//square of maximum distance in voxel units
+        const SdfT min2 = math::Pow2(math::Max(SdfT(0), R - w));//square of minimum distance
+        ValueT v;
+        size_t count = 0;
+        AttT att;
+        Vec3R pos;
+
+        // Loop over buckets
+        for (size_t n = r.begin(), N = r.end(); n < N; ++n) {
+            // Loop over particles in bucket n.
+            typename PointPartitionerT::IndexIterator iter = mPointPartitioner->indices(n);
+            for ( ; iter; ++iter) {
+                const Index32& id = *iter;
+                this->getAtt<DisableT>(id, att);
+                mParticles.getPos(id, pos);
+                const Vec3R P = mMap.applyInverseMap(pos);
+                const Coord a(math::Floor(P[0]-max),math::Floor(P[1]-max),math::Floor(P[2]-max));
+                const Coord b(math::Ceil( P[0]+max),math::Ceil( P[1]+max),math::Ceil( P[2]+max));
+                for (Coord c = a; c.x() <= b.x(); ++c.x()) {
+                    //only check interrupter every 32'th scan in x
+                    if (!(count++ & ((1<<5)-1)) && util::wasInterrupted(mParent.mInterrupter)) {
+                        tbb::task::self().cancel_group_execution();
+                        return;
+                    }
+                    SdfT x2 = static_cast<SdfT>(math::Pow2(c.x() - P[0]));
+                    for (c.y() = a.y(); c.y() <= b.y(); ++c.y()) {
+                        SdfT x2y2 = static_cast<SdfT>(x2 + math::Pow2(c.y() - P[1]));
+                        for (c.z() = a.z(); c.z() <= b.z(); ++c.z()) {
+                            SdfT x2y2z2 = static_cast<SdfT>(
+                                x2y2 + math::Pow2(c.z()- P[2])); // square distance from c to P
+                            if (x2y2z2 >= max2 || (!acc.probeValue(c,v) && v<ValueT(0)))
+                                continue;//outside narrow band of particle or inside existing level set
+                            if (x2y2z2 <= min2) {//within distance R-w of the center, i.e. interior
+                                acc.setValueOff(c, inside);
+                                continue;
+                            }
+                            // convert signed distance from voxel units to world units
+                            const ValueT d=Merge(dx*(math::Sqrt(x2y2z2) - R), att);
+                            if (d < v) acc.setValue(c, d);//CSG union
+                        }//end loop over z
+                    }//end loop over y
+                }//end loop over x
+            }//end loop over particles
+        }// end loop over buckets
+    }
+
+    /// @brief Task bound to mTask by rasterizeTrails() for multithreaded
+    /// rasterization of particles as spheres with velocity blurring
+    /// @param r range of point-partitioner bucket indices to process
+    /// @param delta controls the spacing between the sphere instances of a trail
+    void rasterTrails(const tbb::blocked_range<size_t>& r, SdfT delta)
+    {
+        AccessorT acc = mGrid->getAccessor(); // local accessor
+        bool run = true; // set to the result of makeSphere (false if it was interrupted)
+        AttT att;
+        Vec3R pos, vel;
+        Real rad;
+        const Vec3R origin = mMap.applyInverseMap(Vec3R(0,0,0));
+        const SdfT Rmin = SdfT(mParent.mRmin), invDx = SdfT(1/mParent.mDx);
+
+        // Loop over buckets
+        for (size_t n = r.begin(), N = r.end(); n < N; ++n) {
+            // Loop over particles in bucket n.
+            typename PointPartitionerT::IndexIterator iter = mPointPartitioner->indices(n);
+            for ( ; run && iter; ++iter) {
+                const Index32& id = *iter;
+                mParticles.getPosRadVel(id, pos, rad, vel);
+                const SdfT R0 = SdfT(invDx*rad);
+                if (this->ignoreParticle(R0)) continue;
+                this->getAtt<DisableT>(id, att);
+                const Vec3R P0 = mMap.applyInverseMap(pos);
+                const Vec3R V  = mMap.applyInverseMap(vel) - origin;//exclude translation
+                const SdfT speed = SdfT(V.length()), inv_speed = SdfT(1.0/speed);
+                const Vec3R Nrml = -V*inv_speed;// inverse direction; NOTE(review): check speed==0
+                Vec3R P = P0;// local position of instance
+                SdfT R = R0, d=0;// local radius and length of trail
+                for (size_t m=0; run && d <= speed ; ++m) {
+                    run = this->makeSphere(P, R, att, acc);
+                    P += 0.5*delta*R*Nrml;// adaptive offset along inverse velocity direction
+                    d  = SdfT((P-P0).length());// current length of trail
+                    R  = R0-(R0-Rmin)*d*inv_speed;// R = R0 -> mRmin(e.g. 1.5)
+                }//end loop over sphere instances
+            }//end loop over particles
+        }//end loop over buckets
+    }
+
+    /// @brief Run this functor over all buckets of the point partitioner: through
+    /// tbb::parallel_reduce if the parent's grain size is positive, else serially.
+    void cook()
+    {
+        typedef tbb::blocked_range<size_t> BucketRangeT;
+        const Index32 numBuckets = Index32(mPointPartitioner->size());
+        if (!(mParent.mGrainSize>0)) {// serial: invoke operator() directly on the full range
+            (*this)(BucketRangeT(0, numBuckets));
+            return;
+        }
+        tbb::parallel_reduce(BucketRangeT(0, numBuckets, mParent.mGrainSize), *this);
+    }
+
+    /// @brief Rasterize sphere at position P and radius R into a
+    /// narrow-band level set with half-width, mHalfWidth.
+    /// @return false if it was interrupted
+    ///
+    /// @param P   coordinates of the particle position in voxel units
+    /// @param R   radius of particle in voxel units
+    /// @param att particle attribute that Merge combines with each written distance
+    /// @param acc grid accessor with a private copy of the grid
+    ///
+    /// @note For best performance all computations are performed in
+    /// voxel-space with the important exception of the final level set
+    /// value that is converted to world units (e.g. the grid stores
+    /// the closest Euclidean signed distances measured in world
+    /// units). Also note we use the convention of positive distances
+    /// outside the surface and negative distances inside the surface.
+    bool makeSphere(const Vec3R &P, SdfT R, const AttT& att, AccessorT& acc)
+    {
+        const ValueT inside = -mGrid->background();// stored (inactive) deep inside the sphere
+        const SdfT dx = SdfT(mParent.mDx), w = SdfT(mParent.mHalfWidth);
+        const SdfT max = R + w;// maximum distance in voxel units
+        const Coord a(math::Floor(P[0]-max),math::Floor(P[1]-max),math::Floor(P[2]-max));
+        const Coord b(math::Ceil( P[0]+max),math::Ceil( P[1]+max),math::Ceil( P[2]+max));
+        const SdfT max2 = math::Pow2(max);//square of maximum distance in voxel units
+        const SdfT min2 = math::Pow2(math::Max(SdfT(0), R - w));//square of minimum distance
+        ValueT v;// value currently stored at c, filled in by probeValue below
+        size_t count = 0;// number of x-scans started so far
+        for ( Coord c = a; c.x() <= b.x(); ++c.x() ) {
+            //only check interrupter every 32'th scan in x
+            if (!(count++ & ((1<<5)-1)) && util::wasInterrupted(mParent.mInterrupter)) {
+                tbb::task::self().cancel_group_execution();
+                return false;
+            }
+            SdfT x2 = SdfT(math::Pow2(c.x() - P[0]));
+            for (c.y() = a.y(); c.y() <= b.y(); ++c.y()) {
+                SdfT x2y2 = SdfT(x2 + math::Pow2(c.y() - P[1]));
+                for (c.z() = a.z(); c.z() <= b.z(); ++c.z()) {
+                    SdfT x2y2z2 = SdfT(x2y2 + math::Pow2(c.z()-P[2]));//square distance from c to P
+                    if (x2y2z2 >= max2 || (!acc.probeValue(c,v) && v<ValueT(0)))
+                        continue;//outside narrow band of the particle or inside existing level set
+                    if (x2y2z2 <= min2) {//closer to P than R-w: interior, beyond the narrow band
+                        acc.setValueOff(c, inside);
+                        continue;
+                    }
+                    // convert signed distance from voxel units to world units
+                    // Merge returns the distance itself, or ValueT(distance, att) if the types differ
+                    const ValueT d=Merge(dx*(math::Sqrt(x2y2z2) - R), att);
+                    if (d < v) acc.setValue(c, d);//CSG union
+                }//end loop over z
+            }//end loop over y
+        }//end loop over x
+        return true;
+    }
+    typedef typename boost::function<void (Raster*, const tbb::blocked_range<size_t>&)> FuncType;
+
+    template <typename DisableFlagT>// no-op overload: chosen when DisableFlagT::value is true
+    typename boost::enable_if<DisableFlagT>::type
+    getAtt(size_t, AttT&) const {}
+    // Active overload: chosen when DisableFlagT::value is false; reads from mParticles.
+    template <typename DisableFlagT>
+    typename boost::disable_if<DisableFlagT>::type
+    getAtt(size_t index, AttT& attribute) const { mParticles.getAtt(index, attribute); }
+
+    template <typename DistT>// DistT == ValueT: the distance is returned, attribute ignored
+    typename boost::enable_if<boost::is_same<DistT,ValueT>, ValueT>::type
+    Merge(DistT dist, const AttT&) const { return dist; }
+    // DistT != ValueT: bundle the distance and the attribute into one ValueT.
+    template <typename DistT>
+    typename boost::disable_if<boost::is_same<DistT,ValueT>, ValueT>::type
+    Merge(DistT dist, const AttT& attribute) const { return ValueT(dist,attribute); }
+
+    ParticlesToLevelSetT& mParent;// supplies mDx, mHalfWidth, mRmin, mGrainSize, mInterrupter
+    const ParticleListT&  mParticles;//list of particles
+    GridT*                mGrid;// grid that the spheres are rasterized into
+    const math::MapBase&  mMap;// its inverse map takes particle positions to voxel units
+    size_t                mMinCount, mMaxCount;//counters for ignored particles!
+    FuncType              mTask;
+    const bool            mIsCopy;
+    PointPartitionerT*    mPointPartitioner;// groups the particle indices into buckets
+};//end of Raster struct
+
+
+///////////////////// YOU CAN SAFELY IGNORE THIS SECTION /////////////////////
+
+namespace {
+
+// This is a simple type that combines a distance value and a particle
+// attribute. It's required for attribute transfer which is defined in the
+// Raster class above.
+template<typename VisibleT, typename BlindT>
+class BlindData
+{
+public:
+    typedef VisibleT VisibleType;
+    typedef BlindT   BlindType;
+    typedef VisibleT type;
+
+    BlindData() {}// members are default-initialized
+    explicit BlindData(VisibleT vis) : mVisible(vis), mBlind(zeroVal<BlindType>()) {}
+    BlindData(VisibleT vis, BlindT bld) : mVisible(vis), mBlind(bld) {}
+    // Copy both the visible and the blind value.
+    BlindData& operator=(const BlindData& other)
+    { mVisible = other.mVisible; mBlind = other.mBlind; return *this; }
+    const VisibleT& visible() const { return mVisible; }
+    const BlindT&   blind()   const { return mBlind; }
+    // Comparisons look at the visible value only.
+    OPENVDB_NO_FP_EQUALITY_WARNING_BEGIN
+    bool operator==(const BlindData& other) const { return mVisible == other.mVisible; }
+    OPENVDB_NO_FP_EQUALITY_WARNING_END
+    bool operator< (const BlindData& other) const { return mVisible <  other.mVisible; }
+    bool operator> (const BlindData& other) const { return mVisible >  other.mVisible; }
+    // Binary arithmetic combines the visible values; the result's blind value is
+    // zeroVal<BlindType>(). Unary minus negates the visible value and keeps the blind one.
+    BlindData operator+(const BlindData& other) const { return BlindData(mVisible + other.mVisible); }
+    BlindData operator+(const VisibleT&  other) const { return BlindData(mVisible + other); }
+    BlindData operator-(const BlindData& other) const { return BlindData(mVisible - other.mVisible); }
+    BlindData operator-() const { return BlindData(-mVisible, mBlind); }
+
+protected:
+    VisibleT mVisible;
+    BlindT   mBlind;
+};
+
+// Required by several of the tree nodes; only the visible value is written.
+template<typename VisibleT, typename BlindT>
+inline std::ostream& operator<<(std::ostream& os, const BlindData<VisibleT, BlindT>& data)
+{
+    os << data.visible();
+    return os;
+}
+
+// Required by math::Abs: absolute value of the visible part, blind part carried over.
+template<typename VisibleT, typename BlindT>
+inline BlindData<VisibleT, BlindT> Abs(const BlindData<VisibleT, BlindT>& data)
+{
+    return BlindData<VisibleT, BlindT>(math::Abs(data.visible()), data.blind());
+}
+
+} // unnamed namespace
+
+//////////////////////////////////////////////////////////////////////////////
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_PARTICLES_TO_LEVELSET_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/PointAdvect.h b/nuparu/include/openvdb_new/tools/PointAdvect.h
new file mode 100644
index 00000000..80df623c
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/PointAdvect.h
@@ -0,0 +1,424 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @author Ken Museth, D.J. Hill (openvdb port, added staggered grid support)
+/// @file PointAdvect.h
+///
+/// @brief Class PointAdvect advects points (with position) in a static velocity field
+
+#ifndef OPENVDB_TOOLS_POINT_ADVECT_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_POINT_ADVECT_HAS_BEEN_INCLUDED
+
+#include <openvdb/openvdb.h>
+#include <openvdb/math/Math.h>             // min
+#include <openvdb/Types.h>                 // Vec3 types and version number
+#include <openvdb/Grid.h>                  // grid
+#include <openvdb/util/NullInterrupter.h>
+#include "Interpolation.h"                 // sampling
+#include "VelocityFields.h"                // VelocityIntegrator
+
+#include <boost/static_assert.hpp>
+#include <tbb/blocked_range.h>             // threading
+#include <tbb/parallel_for.h>              // threading
+#include <tbb/task.h>                      // for cancel
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+/// Class that holds a Vec3 grid, to be interpreted as the closest point to a constraint
+/// surface.  Supports a method to allow a point to be projected onto the closest point
+/// on the constraint surface.  Uses Caching.
+template<typename CptGridT = Vec3fGrid>
+class ClosestPointProjector
+{
+public:
+    typedef CptGridT                            CptGridType;
+    typedef typename CptGridType::ConstAccessor CptAccessor;
+    typedef typename CptGridType::ValueType     CptValueType;
+
+    /// Construct a projector with no grid attached and zero iterations.
+    ClosestPointProjector():
+        mCptGrid(NULL), mCptIterations(0)
+    {}
+    /// Construct a projector that samples @a cptGrid @a n times per projection.
+    ClosestPointProjector(const CptGridType& cptGrid, int n):
+        mCptGrid(&cptGrid),
+        mCptAccessor(cptGrid.getAccessor()),
+        mCptIterations(n)
+    {}
+    /// Copy the grid pointer and iteration count; a new accessor is obtained from the grid.
+    ClosestPointProjector(const ClosestPointProjector &other):
+        mCptGrid(other.mCptGrid),
+        mCptAccessor(mCptGrid->getAccessor()),
+        mCptIterations(other.mCptIterations)
+    {}
+    void setConstraintIterations(unsigned int cptIterations) { mCptIterations = cptIterations; }
+    unsigned int numIterations() const { return mCptIterations; }
+
+    /// Project the world-space point @a W, in place, onto the constraint surface.
+    template <typename LocationType>
+    inline void projectToConstraintSurface(LocationType& W) const
+    {
+        /// Entries in the CPT tree are the closest point to the constraint surface.
+        /// The interpolation step in sample introduces error so that the result
+        /// of a single sample may not lie exactly on the surface.  The iterations
+        /// in the loop exist to minimize this error.
+        CptValueType result(W[0], W[1],W[2]);
+        for (unsigned int i = 0; i < mCptIterations; ++i) {
+            const Vec3R location = mCptGrid->worldToIndex(Vec3R(result[0], result[1], result[2]));
+            BoxSampler::sample<CptAccessor>(mCptAccessor, location, result);
+        }
+        W[0] = result[0];
+        W[1] = result[1];
+        W[2] = result[2];
+    }
+
+private:
+    const CptGridType*  mCptGrid; // Closest-Point-Transform vector field
+    CptAccessor         mCptAccessor;
+    unsigned int        mCptIterations;
+};// end of ClosestPointProjector class
+
+////////////////////////////////////////
+
+
+/// Performs passive or constrained advection of points in a velocity field
+/// represented by an OpenVDB grid and an optional closest-point-transform (CPT)
+/// represented in another OpenVDB grid.  Note the CPT is assumed to be
+/// in world coordinates and NOT index coordinates!
+/// Supports both collocated velocity grids and staggered velocity grids
+///
+/// The @c PointListT template argument refers to any class with the following
+/// interface (e.g., std::vector<openvdb::Vec3f>):
+/// @code
+/// class PointList {
+///     ...
+/// public:
+///     typedef internal_vector3_type value_type; // must support [] component access
+///     openvdb::Index size() const;              // number of points in list
+///     value_type& operator[](int n);            // world space position of nth point
+/// };
+/// @endcode
+///
+/// @note All methods (except size) are assumed to be thread-safe and
+/// the positions are returned as non-const references since the
+/// advection method needs to modify them!
+template<typename GridT = Vec3fGrid,
+         typename PointListT = std::vector<typename GridT::ValueType>,
+         bool StaggeredVelocity = false,
+         typename InterrupterType = util::NullInterrupter>
+class PointAdvect
+{
+public:
+    typedef GridT                                        GridType;
+    typedef PointListT                                   PointListType;
+    typedef typename PointListT::value_type              LocationType;
+    typedef VelocityIntegrator<GridT, StaggeredVelocity> VelocityFieldIntegrator;
+
+    /// @brief Construct an advector for the velocity field @a velGrid.
+    /// @param velGrid      velocity grid; only its address is stored, so the grid
+    ///                     must outlive this object
+    /// @param interrupter  optional interrupter, polled at the start of each range
+    /// @note Defaults: first-order integration, threading enabled. The time step
+    /// and iteration count are zero until advect() assigns them.
+    PointAdvect(const GridT& velGrid, InterrupterType* interrupter=NULL) :
+        mVelGrid(&velGrid),
+        mPoints(NULL),
+        mDt(0.0f),
+        mAdvIterations(0),
+        mIntegrationOrder(1),
+        mThreaded(true),
+        mInterrupter(interrupter)
+    {
+    }
+    /// Member-wise copy; the grid and the point list are shared, not duplicated.
+    PointAdvect(const PointAdvect &other) :
+        mVelGrid(other.mVelGrid),
+        mPoints(other.mPoints),
+        mDt(other.mDt),
+        mAdvIterations(other.mAdvIterations),
+        mIntegrationOrder(other.mIntegrationOrder),
+        mThreaded(other.mThreaded),
+        mInterrupter(other.mInterrupter)
+    {
+    }
+    /// Nothing is owned by this class, so there is nothing to release.
+    virtual ~PointAdvect()
+    {
+    }
+    /// If the order of the integration is set to zero no advection is performed
+    bool earlyOut() const { return (mIntegrationOrder==0);}
+    /// Enable or disable multithreading in advect().
+    void setThreaded(bool threaded) { mThreaded = threaded; }
+    /// Return @c true if advect() runs multithreaded.
+    bool getThreaded() const { return mThreaded; }
+    /// Select the Runge-Kutta order; operator() handles 1-4 and does nothing otherwise.
+    void setIntegrationOrder(unsigned int order) {mIntegrationOrder = order;}
+
+    /// @brief Advect a list of points over a time = dt * advIterations.
+    /// @param points         positions, updated in place
+    /// @param dt             time step handed to every Runge-Kutta step
+    /// @param advIterations  number of steps applied to each point
+    /// @note Returns immediately when the integration order is zero.
+    void advect(PointListT& points, float dt, unsigned int advIterations = 1)
+    {
+        if (this->earlyOut()) return; // nothing to do!
+        // operator() reads its inputs from these members
+        mPoints        = &points;
+        mDt            = dt;
+        mAdvIterations = advIterations;
+
+        if (mInterrupter) mInterrupter->start("Advecting points by OpenVDB velocity field: ");
+        if (mThreaded) {
+            tbb::parallel_for(tbb::blocked_range<size_t>(0, mPoints->size()), *this);
+        } else {
+            (*this)(tbb::blocked_range<size_t>(0, mPoints->size()));
+        }
+        if (mInterrupter) mInterrupter->end();
+    }
+
+    /// Never call this method directly - it is used by TBB and has to be public!
+    /// @param range  indices of the points to advect in this invocation
+    /// @note If the interrupter has fired, cancellation of the task group is
+    /// requested, but the points of @a range are still processed.
+    void operator() (const tbb::blocked_range<size_t> &range) const
+    {
+        if (mInterrupter && mInterrupter->wasInterrupted()) {
+            tbb::task::self().cancel_group_execution();
+        }
+
+        // a fresh integrator for every invocation of this functor
+        VelocityFieldIntegrator  velField(*mVelGrid);
+        // The order is a template argument of rungeKutta, hence the runtime dispatch.
+        switch (mIntegrationOrder) {
+        case 1: advectRange<1>(range, velField); break;
+        case 2: advectRange<2>(range, velField); break;
+        case 3: advectRange<3>(range, velField); break;
+        case 4: advectRange<4>(range, velField); break;
+        }
+    }
+
+private:
+    /// @brief Advance every point in @a range by mAdvIterations Runge-Kutta steps.
+    /// Shared by all cases of the dispatch in operator().
+    /// @tparam Order     order of the Runge-Kutta scheme
+    /// @param range      indices into *mPoints to process
+    /// @param velField   integrator built from the velocity grid
+    template<unsigned int Order>
+    void advectRange(const tbb::blocked_range<size_t>& range,
+        VelocityFieldIntegrator& velField) const
+    {
+        for (size_t n = range.begin(); n != range.end(); ++n) {
+            LocationType& X0 = (*mPoints)[n];
+            // loop over number of time steps
+            for (unsigned int i = 0; i < mAdvIterations; ++i) {
+                velField.template rungeKutta<Order>(mDt, X0);
+            }
+        }
+    }
+
+    // the velocity field
+    const GridType*        mVelGrid;
+
+    // vertex list of all the points
+    PointListT*            mPoints;
+
+    // time integration parameters
+    float                  mDt;                // time step
+    unsigned int           mAdvIterations;     // number of time steps
+    unsigned int           mIntegrationOrder;
+
+    // operational parameters
+    bool                   mThreaded;
+    InterrupterType*       mInterrupter;
+
+};//end of PointAdvect class
+
+
+template<typename GridT = Vec3fGrid,
+         typename PointListT = std::vector<typename GridT::ValueType>,
+         bool StaggeredVelocity = false,
+         typename CptGridType = GridT,
+         typename InterrupterType = util::NullInterrupter>
+class ConstrainedPointAdvect
+{
+public:
+    typedef GridT                                        GridType;
+    typedef typename PointListT::value_type              LocationType;
+    typedef VelocityIntegrator<GridT, StaggeredVelocity> VelocityIntegratorType;
+    typedef ClosestPointProjector<CptGridType>           ClosestPointProjectorType;
+    typedef PointListT PointListType;
+
+    /// @brief Construct a constrained advector.
+    /// @param velGrid      velocity grid; only its address is stored
+    /// @param cptGrid      closest-point-transform grid; only its address is stored
+    /// @param cptn         number of projection iterations per advection step
+    /// @param interrupter  optional interrupter, polled at the start of each range
+    /// @note Defaults match PointAdvect: first-order integration, threading enabled.
+    /// Every member is initialized so that advect() never reads an indeterminate value.
+    ConstrainedPointAdvect(const GridType& velGrid,
+        const GridType& cptGrid, int cptn, InterrupterType* interrupter = NULL):
+        mVelGrid(&velGrid),
+        mCptGrid(&cptGrid),
+        mCptIter(cptn),
+        mPoints(NULL),
+        mDt(0.0f),
+        mAdvIterations(0),
+        mIntegrationOrder(1),
+        mThreaded(true),
+        mInterrupter(interrupter)
+    {
+    }
+    /// Member-wise copy; the grids and the point list are shared, not duplicated.
+    ConstrainedPointAdvect(const ConstrainedPointAdvect& other):
+        mVelGrid(other.mVelGrid),
+        mCptGrid(other.mCptGrid),
+        mCptIter(other.mCptIter),
+        mPoints(other.mPoints),
+        mDt(other.mDt),
+        mAdvIterations(other.mAdvIterations),
+        mIntegrationOrder(other.mIntegrationOrder),
+        mThreaded(other.mThreaded),
+        mInterrupter(other.mInterrupter)
+    {
+    }
+    virtual ~ConstrainedPointAdvect(){}
+
+    /// Set the number of closest-point projection iterations.
+    void setConstraintIterations(unsigned int cptIter) {mCptIter = cptIter;}
+    /// Select the Runge-Kutta order (1-4); zero means projection only.
+    void setIntegrationOrder(unsigned int order) {mIntegrationOrder = order;}
+
+    /// Enable or disable multithreading in advect().
+    void setThreaded(bool threaded) { mThreaded = threaded; }
+    /// Return @c true if advect() runs multithreaded.
+    bool getThreaded() const { return mThreaded; }
+
+    /// @brief Constrained advection of a list of points over a time = dt * advIterations.
+    /// @param points         positions, updated in place
+    /// @param dt             time step handed to every Runge-Kutta step
+    /// @param advIterations  number of advect-then-project steps per point
+    /// @note With an integration order of zero each point is projected once, and
+    /// nothing happens at all if the constraint iteration count is zero as well.
+    void advect(PointListT& points, float dt, unsigned int advIterations = 1)
+    {
+        mPoints = &points;
+        mDt     = dt;
+
+        if (mIntegrationOrder==0 && mCptIter == 0) {
+            return; // nothing to do!
+        }
+        mAdvIterations = (mIntegrationOrder>0) ? advIterations : 1;
+
+        if (mInterrupter) mInterrupter->start("Advecting points by OpenVDB velocity field: ");
+        const size_t N = mPoints->size();
+
+        if (mThreaded) {
+            tbb::parallel_for(tbb::blocked_range<size_t>(0, N), *this);
+        } else {
+            (*this)(tbb::blocked_range<size_t>(0, N));
+        }
+        if (mInterrupter) mInterrupter->end();
+    }
+
+    /// Never call this method directly - it is used by TBB and has to be public!
+    void operator() (const tbb::blocked_range<size_t> &range) const
+    {
+        if (mInterrupter && mInterrupter->wasInterrupted()) {
+            tbb::task::self().cancel_group_execution();
+        }
+
+        VelocityIntegratorType velField(*mVelGrid);
+        ClosestPointProjectorType cptField(*mCptGrid, mCptIter);
+        switch (mIntegrationOrder) {
+        case 0://pure CPT projection
+            {
+                for (size_t n = range.begin(); n != range.end(); ++n) {
+                    LocationType& X0 = (*mPoints)[n];
+                    for (unsigned int i = 0; i < mAdvIterations; ++i) {
+                        cptField.projectToConstraintSurface(X0);
+                    }
+                }
+            }
+            break;
+        // 1st to 4th order advection, each step followed by a CPT projection
+        case 1: advectAndProject<1>(range, velField, cptField); break;
+        case 2: advectAndProject<2>(range, velField, cptField); break;
+        case 3: advectAndProject<3>(range, velField, cptField); break;
+        case 4: advectAndProject<4>(range, velField, cptField); break;
+        }
+    }
+
+private:
+    /// @brief For every point in @a range, perform mAdvIterations steps, each a
+    /// Runge-Kutta update followed by a projection onto the constraint surface.
+    /// @tparam Order     order of the Runge-Kutta scheme
+    /// @param range      indices into *mPoints to process
+    /// @param velField   integrator built from the velocity grid
+    /// @param cptField   projector built from the closest-point-transform grid
+    template<unsigned int Order>
+    void advectAndProject(const tbb::blocked_range<size_t>& range,
+        VelocityIntegratorType& velField, ClosestPointProjectorType& cptField) const
+    {
+        for (size_t n = range.begin(); n != range.end(); ++n) {
+            LocationType& X0 = (*mPoints)[n];
+            for (unsigned int i = 0; i < mAdvIterations; ++i) {
+                velField.template rungeKutta<Order>(mDt, X0);
+                cptField.projectToConstraintSurface(X0);
+            }
+        }
+    }
+
+    const GridType*         mVelGrid;           // the velocity field
+    const GridType*         mCptGrid;
+    int                     mCptIter;
+    PointListT*             mPoints;            // vertex list of all the points
+
+    // time integration parameters
+    float                   mDt;                // time step
+    unsigned int            mAdvIterations;     // number of time steps
+    unsigned int            mIntegrationOrder;  // order of Runge-Kutta integration
+    // operational parameters
+    bool                    mThreaded;
+    InterrupterType*        mInterrupter;
+};// end of ConstrainedPointAdvect class
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_POINT_ADVECT_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/PointIndexGrid.h b/nuparu/include/openvdb_new/tools/PointIndexGrid.h
new file mode 100644
index 00000000..c903017b
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/PointIndexGrid.h
@@ -0,0 +1,1822 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file   PointIndexGrid.h
+///
+/// @brief  Space-partitioning acceleration structure for points. Partitions
+///         the points into voxels to accelerate range and nearest neighbor
+///         searches.
+///
+/// @note   Leaf nodes store a single point-index array and the voxels are only
+///         integer offsets into that array. The actual points are never stored
+///         in the acceleration structure, only offsets into an external array.
+///
+/// @author Mihai Alden
+
+#ifndef OPENVDB_TOOLS_POINT_INDEX_GRID_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_POINT_INDEX_GRID_HAS_BEEN_INCLUDED
+
+
+#include <openvdb/Grid.h>
+#include <openvdb/Types.h>
+#include <openvdb/math/Transform.h>
+#include <openvdb/tree/Tree.h>
+#include <openvdb/tree/LeafNode.h>
+#include <openvdb/tree/LeafManager.h>
+#include "PointPartitioner.h"
+
+#include <boost/scoped_array.hpp>
+#include <tbb/blocked_range.h>
+#include <tbb/parallel_for.h>
+#include <tbb/atomic.h>
+#include <iostream>
+#include <deque>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+
+namespace tree {
+template<Index, typename> struct SameLeafConfig; // forward declaration
+}
+
+namespace tools {
+
+template<typename T, Index Log2Dim> struct PointIndexLeafNode; // forward declaration
+
+/// Point index tree configured to match the default OpenVDB tree configuration
+typedef tree::Tree<tree::RootNode<tree::InternalNode<tree::InternalNode
+    <PointIndexLeafNode<PointIndex32, 3>, 4>, 5> > > PointIndexTree;
+
+/// Point index grid
+typedef Grid<PointIndexTree> PointIndexGrid;
+
+
+////////////////////////////////////////
+
+
+/// @interface PointArray
+/// Expected interface for the PointArray container:
+/// @code
+/// template<typename VectorType>
+/// struct PointArray
+/// {
+///     // The type used to represent world-space point positions
+///     typedef VectorType  PosType;
+///
+///     // Return the number of points in the array
+///     size_t size() const;
+///
+///     // Return the world-space position of the nth point in the array.
+///     void getPos(size_t n, PosType& xyz) const;
+/// };
+/// @endcode
+
+
+////////////////////////////////////////
+
+
+/// @brief  Partition points into a point index grid to accelerate range and
+///         nearest-neighbor searches.
+///
+/// @param points       world-space point array conforming to the PointArray interface
+/// @param voxelSize    voxel size in world units
+template<typename GridT, typename PointArrayT>
+inline typename GridT::Ptr
+createPointIndexGrid(const PointArrayT& points, double voxelSize);
+
+
+/// @brief  Partition points into a point index grid to accelerate range and
+///         nearest-neighbor searches.
+///
+/// @param points   world-space point array conforming to the PointArray interface
+/// @param xform    world-to-index-space transform
+template<typename GridT, typename PointArrayT>
+inline typename GridT::Ptr
+createPointIndexGrid(const PointArrayT& points, const math::Transform& xform);
+
+
+/// @brief  Return @c true if the given point index grid represents a valid partitioning
+///         of the given point array.
+///
+/// @param points   world-space point array conforming to the PointArray interface
+/// @param grid     point index grid to validate
+template<typename PointArrayT, typename GridT>
+inline bool
+isValidPartition(const PointArrayT& points, const GridT& grid);
+
+
+/// Repartition the @a points if needed, otherwise return the input @a grid.
+template<typename GridT, typename PointArrayT>
+inline typename GridT::ConstPtr
+getValidPointIndexGrid(const PointArrayT& points, const typename GridT::ConstPtr& grid);
+
+/// Repartition the @a points if needed, otherwise return the input @a grid.
+template<typename GridT, typename PointArrayT>
+inline typename GridT::Ptr
+getValidPointIndexGrid(const PointArrayT& points, const typename GridT::Ptr& grid);
+
+
+////////////////////////////////////////
+
+
+/// Accelerated range and nearest-neighbor searches for point index grids
+template<typename TreeType = PointIndexTree>
+struct PointIndexIterator
+{
+    typedef tree::ValueAccessor<const TreeType> ConstAccessor;
+    typedef typename TreeType::LeafNodeType     LeafNodeType;
+    typedef typename TreeType::ValueType        ValueType;
+
+    /// Construct an empty iterator that holds no point indices.
+    PointIndexIterator();
+    PointIndexIterator(const PointIndexIterator& rhs); // deep-copies the secondary index array
+    PointIndexIterator& operator=(const PointIndexIterator& rhs); // deep-copies it as well
+
+
+    /// @brief Construct an iterator over the indices of the points contained in voxel (i, j, k).
+    /// @param ijk  the voxel containing the points over which to iterate
+    /// @param acc  an accessor for the grid or tree that holds the point indices
+    PointIndexIterator(const Coord& ijk, ConstAccessor& acc);
+
+
+    /// @brief Construct an iterator over the indices of the points contained in
+    ///        the given bounding box.
+    /// @param bbox  the bounding box of the voxels containing the points over which to iterate
+    /// @param acc   an accessor for the grid or tree that holds the point indices
+    /// @note  The range of the @a bbox is inclusive. Thus, a bounding box with
+    ///        min = max is not empty but rather encloses a single voxel.
+    PointIndexIterator(const CoordBBox& bbox, ConstAccessor& acc);
+
+
+    /// @brief Clear the iterator and update it with the result of the given voxel query.
+    /// @param ijk  the voxel containing the points over which to iterate
+    /// @param acc  an accessor for the grid or tree that holds the point indices
+    void searchAndUpdate(const Coord& ijk, ConstAccessor& acc);
+
+
+    /// @brief Clear the iterator and update it with the result of the given voxel region query.
+    /// @param bbox  the bounding box of the voxels containing the points over which to iterate
+    /// @param acc   an accessor for the grid or tree that holds the point indices
+    /// @note  The range of the @a bbox is inclusive. Thus, a bounding box with
+    ///        min = max is not empty but rather encloses a single voxel.
+    void searchAndUpdate(const CoordBBox& bbox, ConstAccessor& acc);
+
+
+    /// @brief Clear the iterator and update it with the result of the given
+    ///        index-space bounding box query.
+    /// @param bbox     index-space bounding box
+    /// @param acc      an accessor for the grid or tree that holds the point indices
+    /// @param points   world-space point array conforming to the PointArray interface
+    /// @param xform    linear, uniform-scale transform (i.e., cubical voxels)
+    template<typename PointArray>
+    void searchAndUpdate(const BBoxd& bbox, ConstAccessor& acc,
+        const PointArray& points, const math::Transform& xform);
+
+
+    /// @brief Clear the iterator and update it with the result of the given
+    ///        index-space radial query.
+    /// @param center   index-space center
+    /// @param radius   index-space radius
+    /// @param acc      an accessor for the grid or tree that holds the point indices
+    /// @param points   world-space point array conforming to the PointArray interface
+    /// @param xform    linear, uniform-scale transform (i.e., cubical voxels)
+    /// @param subvoxelAccuracy  if true, check individual points against the search region,
+    ///                 otherwise return all points that reside in voxels that are inside
+    ///                 or intersect the search region
+    template<typename PointArray>
+    void searchAndUpdate(const Vec3d& center, double radius, ConstAccessor& acc,
+        const PointArray& points, const math::Transform& xform, bool subvoxelAccuracy = true);
+
+
+    /// @brief Clear the iterator and update it with the result of the given
+    ///        world-space bounding box query.
+    /// @param bbox     world-space bounding box
+    /// @param acc      an accessor for the grid or tree that holds the point indices
+    /// @param points   world-space point array conforming to the PointArray interface
+    /// @param xform    linear, uniform-scale transform (i.e., cubical voxels)
+    template<typename PointArray>
+    void worldSpaceSearchAndUpdate(const BBoxd& bbox, ConstAccessor& acc,
+        const PointArray& points, const math::Transform& xform);
+
+
+    /// @brief Clear the iterator and update it with the result of the given
+    ///        world-space radial query.
+    /// @param center   world-space center
+    /// @param radius   world-space radius
+    /// @param acc      an accessor for the grid or tree that holds the point indices
+    /// @param points   world-space point array conforming to the PointArray interface
+    /// @param xform    linear, uniform-scale transform (i.e., cubical voxels)
+    /// @param subvoxelAccuracy  if true, check individual points against the search region,
+    ///                 otherwise return all points that reside in voxels that are inside
+    ///                 or intersect the search region
+    template<typename PointArray>
+    void worldSpaceSearchAndUpdate(const Vec3d& center, double radius, ConstAccessor& acc,
+        const PointArray& points, const math::Transform& xform, bool subvoxelAccuracy = true);
+
+
+    /// Reset the iterator to point to the first item.
+    void reset();
+
+    /// Return a const reference to the item to which this iterator is pointing.
+    const ValueType& operator*() const { return *mRange.first; }
+
+    /// @{
+    /// @brief  Return @c true if this iterator is not yet exhausted.
+    bool test() const { return mRange.first < mRange.second || mIter != mRangeList.end(); }
+    operator bool() const { return this->test(); }
+    /// @}
+
+    /// Advance iterator to next item.
+    void increment();
+
+    /// Advance iterator to next item.
+    void operator++() { this->increment(); }
+
+
+    /// @brief Advance iterator to next item.
+    /// @return @c true if this iterator is not yet exhausted.
+    bool next();
+
+    /// Return the total number of point indices held (all ranges plus the secondary array).
+    size_t size() const;
+
+    /// Return @c true if both iterators point to the same element (compared by address).
+    bool operator==(const PointIndexIterator& p) const { return mRange.first == p.mRange.first; }
+    bool operator!=(const PointIndexIterator& p) const { return !this->operator==(p); }
+
+
+private:
+    typedef std::pair<const ValueType*, const ValueType*> Range;
+    typedef std::deque<Range>                             RangeDeque;
+    typedef typename RangeDeque::const_iterator           RangeDequeCIter;
+    typedef boost::scoped_array<ValueType>                IndexArray;
+    /// Empty the iterator: drop all ranges and release the secondary index array.
+    void clear();
+
+    // Primary index collection: [first, second) pointer ranges; mRange is the one being traversed
+    Range           mRange;
+    RangeDeque      mRangeList;
+    RangeDequeCIter mIter;
+    // Secondary index collection: owned array of indices, traversed after the range list
+    IndexArray      mIndexArray;
+    size_t          mIndexArraySize;
+}; // struct PointIndexIterator
+
+
+/// @brief Selectively extract and filter point data using a custom filter operator.
+///
+/// @par FilterType example:
+/// @interface FilterType
+/// @code
+/// template<typename T>
+/// struct WeightedAverageAccumulator {
+///   typedef T ValueType;
+///
+///   WeightedAverageAccumulator(T const * const array, const T radius)
+///     : mValues(array), mInvRadius(1.0/radius), mWeightSum(0.0), mValueSum(0.0) {}
+///
+///   void reset() { mWeightSum = mValueSum = T(0.0); }
+///
+///   // the following method is invoked by the PointIndexFilter
+///   void operator()(const T distSqr, const size_t pointIndex) {
+///     const T weight = T(1.0) - openvdb::math::Sqrt(distSqr) * mInvRadius;
+///     mWeightSum += weight;
+///     mValueSum += weight * mValues[pointIndex];
+///   }
+///
+///   T result() const { return mWeightSum > T(0.0) ? mValueSum / mWeightSum : T(0.0); }
+///
+/// private:
+///   T const * const mValues;
+///   const T mInvRadius;
+///   T mWeightSum, mValueSum;
+/// }; // struct WeightedAverageAccumulator
+/// @endcode
+template<typename PointArray, typename TreeType = PointIndexTree>
+struct PointIndexFilter
+{
+    typedef typename PointArray::PosType        PosType;
+    typedef typename PosType::value_type        ScalarType;
+    typedef tree::ValueAccessor<const TreeType> ConstAccessor;
+
+    /// @brief Constructor
+    /// @param points   world-space point array conforming to the PointArray interface
+    /// @param tree     a point index tree
+    /// @param xform    linear, uniform-scale transform (i.e., cubical voxels)
+    PointIndexFilter(const PointArray& points, const TreeType& tree, const math::Transform& xform);
+
+    /// Thread safe copy constructor
+    PointIndexFilter(const PointIndexFilter& rhs);
+
+    /// @brief  Perform a radial search query and apply the given filter
+    ///         operator to the selected points.
+    /// @param center  world-space center
+    /// @param radius  world-space radius
+    /// @param op      custom filter operator (see the FilterType example for interface details)
+    template<typename FilterType>
+    void searchAndApply(const PosType& center, ScalarType radius, FilterType& op);
+
+private:
+    PointArray const * const mPoints; // raw pointer; presumably the constructor's array (not owned)
+    ConstAccessor mAcc;               // accessor over a const tree
+    const math::Transform mXform;     // stored by value
+    const ScalarType mInvVoxelSize;   // NOTE(review): presumably 1 / voxel size of mXform -- confirm
+    PointIndexIterator<TreeType> mIter; // per-instance search iterator
+}; // struct PointIndexFilter
+
+
+////////////////////////////////////////
+
+// Internal operators and implementation details
+
+
+namespace point_index_grid_internal {
+
+template<typename PointArrayT>
+struct ValidPartitioningOp
+{
+    /// @param hasChanged  shared flag that is raised as soon as a misplaced point is found
+    /// @param points      point array whose positions are checked
+    /// @param xform       transform that maps a position to its cell-centered voxel coordinate
+    ValidPartitioningOp(tbb::atomic<bool>& hasChanged,
+        const PointArrayT& points, const math::Transform& xform)
+        : mPoints(&points), mTransform(&xform), mHasChanged(&hasChanged)
+    {
+    }
+
+    /// @brief Verify that every point indexed by @a leaf still maps to the voxel that lists it.
+    /// @details The shared flag is raised on the first mismatch. If the flag is already
+    /// raised on entry, the enclosing TBB task group is cancelled and no work is done.
+    template <typename LeafT>
+    void operator()(LeafT &leaf, size_t /*leafIndex*/) const
+    {
+        typedef typename LeafT::IndexArray          IndexArrayT;
+        typedef typename IndexArrayT::value_type    IndexT;
+        typedef typename PointArrayT::PosType       PosType;
+
+        if (*mHasChanged) {
+            tbb::task::self().cancel_group_execution();
+            return;
+        }
+
+        PosType pos;
+        const IndexT *idx = static_cast<IndexT*>(NULL), *idxEnd = static_cast<IndexT*>(NULL);
+
+        for (typename LeafT::ValueOnCIter voxel = leaf.cbeginValueOn(); voxel; ++voxel) {
+
+            if (*mHasChanged) break; // a mismatch has already been reported
+
+            const Coord ijk = voxel.getCoord();
+            leaf.getIndices(voxel.pos(), idx, idxEnd);
+
+            // compare each listed point against the coordinate of the voxel that lists it
+            for ( ; idx < idxEnd; ++idx) {
+                mPoints->getPos(*idx, pos);
+                if (ijk != mTransform->worldToIndexCellCentered(pos)) {
+                    mHasChanged->fetch_and_store(true);
+                    break;
+                }
+            }
+        }
+    }
+
+private:
+    // Non-owning pointers to state supplied by the caller.
+    PointArrayT         const * const mPoints;
+    math::Transform     const * const mTransform;
+    tbb::atomic<bool>         * const mHasChanged;
+};
+
+/// TBB range body that builds one populated leaf node per partitioner bucket.
+template<typename LeafNodeT>
+struct PopulateLeafNodesOp
+{
+    typedef uint32_t IndexT;
+    typedef PointPartitioner<IndexT, LeafNodeT::LOG2DIM> Partitioner;
+
+    PopulateLeafNodesOp(boost::scoped_array<LeafNodeT*>& leafNodes,
+        const Partitioner& partitioner)
+        : mLeafNodes(leafNodes.get())
+        , mPartitioner(&partitioner)
+    {
+    }
+    /// Build a leaf node for every bucket n in @a range and store it in slot n of the output array.
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+
+        typedef typename Partitioner::VoxelOffsetType VoxelOffsetT;
+        // find the largest bucket in this range so that one scratch buffer fits all of them
+        size_t maxPointCount = 0;
+        for (size_t n = range.begin(), N = range.end(); n != N; ++n) {
+            maxPointCount = std::max(maxPointCount, mPartitioner->indices(n).size());
+        }
+
+        const IndexT voxelCount = LeafNodeT::SIZE;
+
+        // allocate scratch buffers: per-point voxel offsets and a per-voxel histogram
+        boost::scoped_array<VoxelOffsetT> offsets(new VoxelOffsetT[maxPointCount]);
+        boost::scoped_array<IndexT> histogram(new IndexT[voxelCount]);
+
+        VoxelOffsetT const * const voxelOffsets = mPartitioner->voxelOffsets().get();
+
+        for (size_t n = range.begin(), N = range.end(); n != N; ++n) {
+
+            LeafNodeT* node = new LeafNodeT();
+            node->setOrigin(mPartitioner->origin(n));
+
+            typename Partitioner::IndexIterator it = mPartitioner->indices(n);
+
+            const size_t pointCount = it.size();
+            IndexT const * const indices = &*it;
+
+            // local copy of voxel offsets, gathered in bucket order.
+            for (IndexT i = 0; i < pointCount; ++i) {
+                offsets[i] = voxelOffsets[ indices[i] ];
+            }
+
+            // compute voxel-offset histogram (number of points per voxel)
+            memset(&histogram[0], 0, voxelCount * sizeof(IndexT));
+            for (IndexT i = 0; i < pointCount; ++i) {
+                ++histogram[ offsets[i] ];
+            }
+
+            typename LeafNodeT::NodeMaskType& mask = node->getValueMask();
+            typename LeafNodeT::Buffer& buffer = node->buffer();
+
+            // scan histogram (all-prefix-sums): histogram[i] becomes the start offset of
+            IndexT count = 0, startOffset; // voxel i, the buffer value its running end offset
+            for (int i = 0; i < int(voxelCount); ++i) {
+                if (histogram[i] > 0) {
+                    startOffset = count;
+                    count += histogram[i];
+                    histogram[i] = startOffset;
+                    mask.setOn(i);
+                }
+                buffer.setValue(i, count);
+            }
+
+            // allocate point-index array
+            node->indices().resize(pointCount);
+            typename LeafNodeT::ValueType * const orderedIndices = node->indices().data();
+
+            // rank and permute: write each index to the next free slot of its voxel
+            for (IndexT i = 0; i < pointCount; ++i) {
+                orderedIndices[ histogram[ offsets[i] ]++ ] = indices[i];
+            }
+
+            mLeafNodes[n] = node; // published as a raw pointer; this functor never deletes it
+        }
+    }
+
+    // Non-owning pointers to the output leaf array and the source partitioner.
+
+    LeafNodeT*        * const mLeafNodes;
+    Partitioner const * const mPartitioner;
+};
+
+
+/// Construct a @c PointIndexTree by bucketing @a points into leaf nodes and inserting them.
+template<typename TreeType, typename PointArray>
+inline void
+constructPointTree(TreeType& tree, const math::Transform& xform, const PointArray& points)
+{
+    typedef typename TreeType::LeafNodeType             LeafT;
+    typedef PointPartitioner<uint32_t, LeafT::LOG2DIM>  PartitionerT;
+
+    size_t numLeafs = 0;
+    boost::scoped_array<LeafT*> leafs;
+
+    { // the partitioner is only needed while the leaf nodes are being populated
+        PartitionerT buckets;
+        buckets.construct(points, xform, /*voxelOrder=*/false, /*recordVoxelOffsets=*/true);
+
+        numLeafs = buckets.size();
+        leafs.reset(new LeafT*[numLeafs]);
+
+        // populate the leaf nodes in parallel, one per bucket
+        tbb::parallel_for(tbb::blocked_range<size_t>(0, numLeafs),
+            PopulateLeafNodesOp<LeafT>(leafs, buckets));
+    }
+
+    tree::ValueAccessor<TreeType> inserter(tree); // serial insertion of the finished nodes
+    for (size_t n = 0; n != numLeafs; ++n) inserter.addLeaf(leafs[n]);
+}
+
+
+////////////////////////////////////////
+
+
+template<typename T>
+inline void
+dequeToArray(const std::deque<T>& d, boost::scoped_array<T>& a, size_t& size)
+{
+    // Flatten @a d into a freshly allocated array of exactly d.size() elements.
+    size = d.size();
+    a.reset(new T[size]);
+    T* const out = a.get();
+    for (size_t n = 0; n < size; ++n) out[n] = d[n];
+}
+
+/// Fill @a regions with six boxes cut from @a bbox along the faces of @a ibox (two per axis).
+inline void
+constructExclusiveRegions(std::vector<CoordBBox>& regions,
+    const CoordBBox& bbox, const CoordBBox& ibox)
+{
+    regions.clear();
+    regions.reserve(6); // no reallocation below: lastRegion must stay valid across push_back
+    Coord cmin = ibox.min();
+    Coord cmax = ibox.max();
+
+    // region 0: bbox clipped to z <= ibox.min().z()
+    regions.push_back(bbox);
+    regions.back().max().z() = cmin.z();
+
+    // region 1: bbox clipped to z >= ibox.max().z()
+    regions.push_back(bbox);
+    regions.back().min().z() = cmax.z();
+
+    --cmax.z(); // accounting for cell centered bucketing.
+    ++cmin.z();
+
+    // region 2: z strictly between regions 0 and 1, clipped to x <= ibox.min().x()
+    regions.push_back(bbox);
+    CoordBBox* lastRegion = &regions.back();
+    lastRegion->min().z() = cmin.z();
+    lastRegion->max().z() = cmax.z();
+    lastRegion->max().x() = cmin.x();
+
+    // region 3: same z extent, x from ibox.max().x() to bbox.max().x()
+    regions.push_back(*lastRegion);
+    lastRegion = &regions.back();
+    lastRegion->min().x() = cmax.x();
+    lastRegion->max().x() = bbox.max().x();
+
+    --cmax.x();
+    ++cmin.x();
+
+    // region 4: x strictly between regions 2 and 3, clipped to y <= ibox.min().y()
+    regions.push_back(*lastRegion);
+    lastRegion = &regions.back();
+    lastRegion->min().x() = cmin.x();
+    lastRegion->max().x() = cmax.x();
+    lastRegion->max().y() = cmin.y();
+
+    // region 5: same x and z extent, y from ibox.max().y() to bbox.max().y()
+    regions.push_back(*lastRegion);
+    lastRegion = &regions.back();
+    lastRegion->min().y() = cmax.y();
+    lastRegion->max().y() = bbox.max().y();
+}
+
+
+template<typename PointArray, typename IndexT>
+struct BBoxFilter
+{
+    typedef typename PointArray::PosType            PosType;
+    typedef typename PosType::value_type            ScalarType;
+    typedef std::pair<const IndexT*, const IndexT*> Range;
+    typedef std::deque<Range>                       RangeDeque;
+    typedef std::deque<IndexT>                      IndexDeque;
+
+    /// @param ranges   range list (stored, but never modified by this filter)
+    /// @param indices  receives the index of every point that passes the test
+    /// @param bbox     box the points are tested against after the inverse map is applied
+    /// @param points   point array queried through getPos()
+    /// @param xform    transform whose base map is used to convert point positions
+    BBoxFilter(RangeDeque& ranges, IndexDeque& indices, const BBoxd& bbox,
+        const PointArray& points, const math::Transform& xform)
+        : mRanges(ranges), mIndices(indices), mRegion(bbox)
+        , mPoints(points), mMap(*xform.baseMap()) {}
+
+    /// Test the points of every active voxel in @a leaf.
+    template <typename LeafNodeType>
+    void filterLeafNode(const LeafNodeType& leaf)
+    {
+        const IndexT *first = static_cast<IndexT*>(NULL), *last = static_cast<IndexT*>(NULL);
+        typename LeafNodeType::ValueOnCIter voxel = leaf.cbeginValueOn();
+        while (voxel) {
+            leaf.getIndices(voxel.pos(), first, last);
+            filterVoxel(voxel.getCoord(), first, last);
+            ++voxel;
+        }
+    }
+
+    /// Append to the index list each point in [begin, end) that lies inside the region.
+    void filterVoxel(const Coord&, const IndexT* begin, const IndexT* end)
+    {
+        PosType pos;
+        for (const IndexT* it = begin; it < end; ++it) {
+            mPoints.getPos(*it, pos);
+            if (!mRegion.isInside(mMap.applyInverseMap(pos))) continue;
+            mIndices.push_back(*it);
+        }
+    }
+
+private:
+    RangeDeque& mRanges;
+    IndexDeque& mIndices;
+    const BBoxd mRegion;
+    const PointArray& mPoints;
+    const math::MapBase& mMap;
+};
+
+/// Radial filter: takes whole leaf nodes or voxels as ranges where possible, else tests points.
+template<typename PointArray, typename IndexT>
+struct RadialRangeFilter
+{
+    typedef typename PointArray::PosType            PosType;
+    typedef typename PosType::value_type            ScalarType;
+    typedef std::pair<const IndexT*, const IndexT*> Range;
+    typedef std::deque<Range>                       RangeDeque;
+    typedef std::deque<IndexT>                      IndexDeque;
+
+    RadialRangeFilter(RangeDeque& ranges, IndexDeque& indices, const Vec3d& xyz, double radius,
+        const PointArray& points, const math::Transform& xform,
+        const double leafNodeDim, const bool subvoxelAccuracy)
+        : mRanges(ranges)
+        , mIndices(indices)
+        , mCenter(xyz)
+        , mWSCenter(xform.indexToWorld(xyz))
+        , mVoxelDist1(ScalarType(0.0))
+        , mVoxelDist2(ScalarType(0.0))
+        , mLeafNodeDist1(ScalarType(0.0))
+        , mLeafNodeDist2(ScalarType(0.0))
+        , mWSRadiusSqr(ScalarType(radius * xform.voxelSize()[0]))
+        , mPoints(points)
+        , mSubvoxelAccuracy(subvoxelAccuracy)
+    {
+        const ScalarType voxelRadius = ScalarType(std::sqrt(3.0) * 0.5); // half a unit-cube diagonal
+        mVoxelDist1 = voxelRadius + ScalarType(radius);
+        mVoxelDist1 *= mVoxelDist1; // squared; voxels farther away than this are skipped
+
+        if (radius > voxelRadius) {
+            mVoxelDist2 = ScalarType(radius) - voxelRadius;
+            mVoxelDist2 *= mVoxelDist2; // squared; voxels nearer than this are taken whole
+        }
+
+        const ScalarType leafNodeRadius = ScalarType(leafNodeDim * std::sqrt(3.0) * 0.5);
+        mLeafNodeDist1 = leafNodeRadius + ScalarType(radius);
+        mLeafNodeDist1 *= mLeafNodeDist1; // squared; leaf nodes farther away than this are skipped
+
+        if (radius > leafNodeRadius) {
+            mLeafNodeDist2 = ScalarType(radius) - leafNodeRadius;
+            mLeafNodeDist2 *= mLeafNodeDist2; // squared; leaf nodes nearer than this are taken whole
+        }
+
+        mWSRadiusSqr *= mWSRadiusSqr; // (radius * voxelSize[0])^2, used for the per-point test
+    }
+
+    template <typename LeafNodeType>
+    void filterLeafNode(const LeafNodeType& leaf)
+    {
+        {
+            const Coord& ijk = leaf.origin();
+            PosType vec;
+            vec[0] = ScalarType(ijk[0]);
+            vec[1] = ScalarType(ijk[1]);
+            vec[2] = ScalarType(ijk[2]);
+            vec += ScalarType(LeafNodeType::DIM - 1) * 0.5; // move from the leaf origin to its center
+            vec -= mCenter;
+
+            const ScalarType dist = vec.lengthSqr(); // squared distance, like the thresholds
+            if (dist > mLeafNodeDist1) return;
+
+            if (mLeafNodeDist2 > 0.0 && dist < mLeafNodeDist2) { // take every index in the leaf
+                const IndexT* begin = &leaf.indices().front();
+                mRanges.push_back(Range(begin, begin + leaf.indices().size()));
+                return;
+            }
+        }
+
+        typename LeafNodeType::ValueOnCIter iter;
+        const IndexT *begin = static_cast<IndexT*>(NULL), *end = static_cast<IndexT*>(NULL);
+        for (iter = leaf.cbeginValueOn(); iter; ++iter) {
+            leaf.getIndices(iter.pos(), begin, end);
+            filterVoxel(iter.getCoord(), begin, end);
+        }
+    }
+
+    void filterVoxel(const Coord& ijk, const IndexT* begin, const IndexT* end)
+    {
+        PosType vec;
+
+        {
+            vec[0] = mCenter[0] - ScalarType(ijk[0]);
+            vec[1] = mCenter[1] - ScalarType(ijk[1]);
+            vec[2] = mCenter[2] - ScalarType(ijk[2]);
+
+            const ScalarType dist = vec.lengthSqr(); // squared distance, like the thresholds
+            if (dist > mVoxelDist1) return;
+
+            if (!mSubvoxelAccuracy || (mVoxelDist2 > 0.0 && dist < mVoxelDist2)) {
+                if (!mRanges.empty() && mRanges.back().second == begin) { // contiguous: extend
+                    mRanges.back().second = end;
+                } else {
+                    mRanges.push_back(Range(begin, end));
+                }
+                return;
+            }
+        }
+
+        // otherwise test each point individually against the world-space radius
+        while (begin < end) {
+            mPoints.getPos(*begin, vec);
+            vec = mWSCenter - vec;
+
+            if (vec.lengthSqr() < mWSRadiusSqr) {
+                mIndices.push_back(*begin);
+            }
+            ++begin;
+        }
+    }
+
+private:
+    RangeDeque& mRanges;
+    IndexDeque& mIndices;
+    const PosType mCenter, mWSCenter;
+    ScalarType mVoxelDist1, mVoxelDist2, mLeafNodeDist1, mLeafNodeDist2, mWSRadiusSqr;
+    const PointArray& mPoints;
+    const bool mSubvoxelAccuracy;
+}; // struct RadialRangeFilter
+
+
+////////////////////////////////////////
+
+
+template<typename RangeFilterType, typename LeafNodeType>
+inline void
+filteredPointIndexSearchVoxels(RangeFilterType& filter,
+    const LeafNodeType& leaf, const Coord& min, const Coord& max)
+{
+    // Hand the index range of each non-empty voxel of @a leaf within the inclusive
+    // coordinate range [min, max] to the filter.
+    typedef typename LeafNodeType::ValueType PointIndexT;
+
+    const PointIndexT* const indexData = &leaf.indices().front();
+    Coord xyz(0);
+
+    for (xyz[0] = min[0]; xyz[0] <= max[0]; ++xyz[0]) {
+        const Index xOffset = (xyz[0] & (LeafNodeType::DIM - 1u)) << (2 * LeafNodeType::LOG2DIM);
+        for (xyz[1] = min[1]; xyz[1] <= max[1]; ++xyz[1]) {
+            const Index xyOffset =
+                xOffset + ((xyz[1] & (LeafNodeType::DIM - 1u)) << LeafNodeType::LOG2DIM);
+            for (xyz[2] = min[2]; xyz[2] <= max[2]; ++xyz[2]) {
+                const Index n = xyOffset + (xyz[2] & (LeafNodeType::DIM - 1u));
+                // a voxel's value is its end offset; its start is the previous voxel's value
+                const PointIndexT first = (n == 0 ? PointIndexT(0) : leaf.getValue(n - 1));
+                const PointIndexT last = leaf.getValue(n);
+                if (last > first) {
+                    filter.filterVoxel(xyz, indexData + first, indexData + last);
+                }
+            }
+        }
+    }
+}
+
+
+template<typename RangeFilterType, typename ConstAccessor>
+inline void
+filteredPointIndexSearch(RangeFilterType& filter, ConstAccessor& acc, const CoordBBox& bbox)
+{
+    // Run the filter over every leaf node that overlaps the inclusive region @a bbox.
+    typedef typename ConstAccessor::TreeType::LeafNodeType LeafNodeType;
+
+    const Coord lower = bbox.min() & ~(LeafNodeType::DIM - 1);
+    const Coord upper = bbox.max() & ~(LeafNodeType::DIM - 1);
+    Coord org(0), corner(0), clipMin(0), clipMax(0);
+
+    for (org[0] = lower[0]; org[0] <= upper[0]; org[0] += LeafNodeType::DIM) {
+        for (org[1] = lower[1]; org[1] <= upper[1]; org[1] += LeafNodeType::DIM) {
+            for (org[2] = lower[2]; org[2] <= upper[2]; org[2] += LeafNodeType::DIM) {
+                const LeafNodeType* leaf = acc.probeConstLeaf(org);
+                if (!leaf) continue;
+                corner = org;
+                corner.offset(LeafNodeType::DIM - 1);
+
+                // clip the search region to the bounds of this leaf
+                clipMin = Coord::maxComponent(bbox.min(), org);
+                clipMax = Coord::minComponent(bbox.max(), corner);
+                if (clipMin == org && clipMax == corner) { // the region covers the whole leaf
+                    filter.filterLeafNode(*leaf);
+                } else {
+                    filteredPointIndexSearchVoxels(filter, *leaf, clipMin, clipMax);
+                }
+            }
+        }
+    }
+}
+
+
+////////////////////////////////////////
+
+/// Append to @a rangeList the index ranges of the voxels of @a leaf inside [min, max] (inclusive).
+template<typename RangeDeque, typename LeafNodeType>
+inline void
+pointIndexSearchVoxels(RangeDeque& rangeList,
+    const LeafNodeType& leaf, const Coord& min, const Coord& max)
+{
+    typedef typename LeafNodeType::ValueType PointIndexT;
+    typedef typename PointIndexT::IntType    IntT;
+    typedef typename RangeDeque::value_type  Range;
+
+    Index xPos(0), pos(0), zStride = Index(max[2] - min[2]); // z-run length minus one
+    const PointIndexT* dataPtr = &leaf.indices().front();
+    PointIndexT beginOffset(0), endOffset(0),
+        previousOffset(static_cast<IntT>(leaf.indices().size() + 1u)); // sentinel: size + 1
+    Coord ijk(0);
+
+    for (ijk[0] = min[0]; ijk[0] <= max[0]; ++ijk[0]) {
+        xPos = (ijk[0] & (LeafNodeType::DIM - 1u)) << (2 * LeafNodeType::LOG2DIM);
+
+        for (ijk[1] = min[1]; ijk[1] <= max[1]; ++ijk[1]) {
+            pos = xPos + ((ijk[1] & (LeafNodeType::DIM - 1u)) << LeafNodeType::LOG2DIM);
+            pos += (min[2] & (LeafNodeType::DIM - 1u)); // first voxel of the z-run
+            // offsets of the first and the one-past-the-last index of the whole z-run
+            beginOffset = (pos == 0 ? PointIndexT(0) : leaf.getValue(pos - 1));
+            endOffset = leaf.getValue(pos+zStride);
+
+            if (endOffset > beginOffset) {
+                // a run that starts where the previous one ended is merged into it
+                if (beginOffset == previousOffset) {
+                    rangeList.back().second = dataPtr + endOffset;
+                } else {
+                    rangeList.push_back(Range(dataPtr + beginOffset, dataPtr + endOffset));
+                }
+
+                previousOffset = endOffset;
+            }
+        }
+    }
+}
+
+
+template<typename RangeDeque, typename ConstAccessor>
+inline void
+pointIndexSearch(RangeDeque& rangeList, ConstAccessor& acc, const CoordBBox& bbox)
+{
+    // Collect, as pointer ranges, the indices of all points in voxels inside @a bbox (inclusive).
+    typedef typename ConstAccessor::TreeType::LeafNodeType LeafNodeType;
+    typedef typename LeafNodeType::ValueType PointIndexT;
+    typedef typename RangeDeque::value_type  Range;
+
+    const Coord lower = bbox.min() & ~(LeafNodeType::DIM - 1);
+    const Coord upper = bbox.max() & ~(LeafNodeType::DIM - 1);
+    Coord org(0), corner(0), clipMin(0), clipMax(0);
+
+    for (org[0] = lower[0]; org[0] <= upper[0]; org[0] += LeafNodeType::DIM) {
+        for (org[1] = lower[1]; org[1] <= upper[1]; org[1] += LeafNodeType::DIM) {
+            for (org[2] = lower[2]; org[2] <= upper[2]; org[2] += LeafNodeType::DIM) {
+                const LeafNodeType* leaf = acc.probeConstLeaf(org);
+                if (!leaf) continue;
+                corner = org;
+                corner.offset(LeafNodeType::DIM - 1);
+
+                // clip the search region to the bounds of this leaf
+                clipMin = Coord::maxComponent(bbox.min(), org);
+                clipMax = Coord::minComponent(bbox.max(), corner);
+
+                if (clipMin == org && clipMax == corner) {
+                    // the region covers the whole leaf: add all of its indices as one range
+                    const PointIndexT* first = &leaf->indices().front();
+                    rangeList.push_back(Range(first, first + leaf->indices().size()));
+                } else {
+                    pointIndexSearchVoxels(rangeList, *leaf, clipMin, clipMax);
+                }
+            }
+        }
+    }
+}
+
+
+} // namespace point_index_grid_internal
+
+
+// PointIndexIterator implementation
+/// Construct an empty iterator: a null current range, no ranges and no index array.
+template<typename TreeType>
+inline
+PointIndexIterator<TreeType>::PointIndexIterator()
+    : mRange(static_cast<ValueType*>(NULL), static_cast<ValueType*>(NULL))
+    , mRangeList()
+    , mIter(mRangeList.begin())
+    , mIndexArray()
+    , mIndexArraySize(0)
+{
+}
+
+/// Copy constructor: the copy owns its index array and resumes at the same position as @a rhs.
+template<typename TreeType>
+inline
+PointIndexIterator<TreeType>::PointIndexIterator(const PointIndexIterator& rhs)
+    : mRange(rhs.mRange), mRangeList(rhs.mRangeList)
+    , mIter(mRangeList.begin() + (rhs.mIter - rhs.mRangeList.begin())) // same position as rhs
+    , mIndexArray(), mIndexArraySize(rhs.mIndexArraySize)
+{
+    if (rhs.mIndexArray.get() == NULL) return;
+    mIndexArray.reset(new ValueType[mIndexArraySize]);
+    memcpy(mIndexArray.get(), rhs.mIndexArray.get(), mIndexArraySize * sizeof(ValueType));
+    if (rhs.mIter != rhs.mRangeList.end()) return; // rhs still walks the (shared) leaf ranges
+    mRange.first = mIndexArray.get() + (rhs.mRange.first - rhs.mIndexArray.get()); // don't alias rhs
+    mRange.second = mIndexArray.get() + mIndexArraySize;
+}
+
+/// Assign from @a rhs: deep-copies the index array and resumes at the same position as @a rhs.
+template<typename TreeType>
+inline PointIndexIterator<TreeType>&
+PointIndexIterator<TreeType>::operator=(const PointIndexIterator& rhs)
+{
+    if (&rhs == this) return *this;
+    mRange = rhs.mRange;
+    mRangeList = rhs.mRangeList;
+    mIter = mRangeList.begin() + (rhs.mIter - rhs.mRangeList.begin()); // same position as rhs
+    mIndexArraySize = rhs.mIndexArraySize;
+    mIndexArray.reset(rhs.mIndexArray ? new ValueType[mIndexArraySize] : NULL);
+    if (mIndexArray.get() == NULL) return *this;
+    memcpy(mIndexArray.get(), rhs.mIndexArray.get(), mIndexArraySize * sizeof(ValueType));
+    if (rhs.mIter == rhs.mRangeList.end()) { // rhs is inside its own array: use our copy instead
+        mRange.first = mIndexArray.get() + (rhs.mRange.first - rhs.mIndexArray.get());
+        mRange.second = mIndexArray.get() + mIndexArraySize;
+    }
+    return *this;
+}
+
+/// Construct an iterator over the point indices stored for voxel @a ijk.
+template<typename TreeType>
+inline
+PointIndexIterator<TreeType>::PointIndexIterator(const Coord& ijk, ConstAccessor& acc)
+    : mRange(static_cast<ValueType*>(NULL), static_cast<ValueType*>(NULL))
+    , mRangeList()
+    , mIter(mRangeList.begin())
+    , mIndexArray()
+    , mIndexArraySize(0)
+{
+    // Nothing is recorded when there is no leaf at ijk or getIndices() reports failure.
+    const LeafNodeType* const node = acc.probeConstLeaf(ijk);
+    if (!node || !node->getIndices(ijk, mRange.first, mRange.second)) return;
+    mRangeList.push_back(mRange);
+    mIter = mRangeList.begin();
+}
+
+/// Construct an iterator over the point indices of all voxels inside @a bbox (inclusive).
+template<typename TreeType>
+inline
+PointIndexIterator<TreeType>::PointIndexIterator(const CoordBBox& bbox, ConstAccessor& acc)
+    : mRange(static_cast<ValueType*>(NULL), static_cast<ValueType*>(NULL))
+    , mRangeList()
+    , mIter(mRangeList.begin())
+    , mIndexArray()
+    , mIndexArraySize(0)
+{
+    point_index_grid_internal::pointIndexSearch(mRangeList, acc, bbox);
+
+    if (mRangeList.empty()) return; // the search found nothing
+
+    mRange = mRangeList.front();
+    mIter = mRangeList.begin();
+}
+
+/// Rewind to the first item: the first range, else the index array, else an empty range.
+template<typename TreeType>
+inline void
+PointIndexIterator<TreeType>::reset()
+{
+    mIter = mRangeList.begin();
+    if (!mRangeList.empty()) { // start with the first primary range
+        mRange = *mIter;
+        return;
+    }
+    // no primary ranges: fall back to the secondary array, or to a null range
+    const ValueType* first = static_cast<ValueType*>(NULL);
+    size_t count = 0;
+    if (mIndexArray) { first = mIndexArray.get(); count = mIndexArraySize; }
+    mRange = Range(first, first + count);
+}
+
+/// Step to the next index, moving on to the next range or the index array when needed.
+template<typename TreeType>
+inline void
+PointIndexIterator<TreeType>::increment()
+{
+    ++mRange.first;
+
+    // still inside the current range, or the range list has already been used up
+    if (mRange.first < mRange.second || mIter == mRangeList.end()) return;
+
+    if (++mIter != mRangeList.end()) {
+        mRange = *mIter; // next primary range
+    } else if (mIndexArray) {
+        mRange = Range(mIndexArray.get(), mIndexArray.get() + mIndexArraySize); // secondary array
+    }
+}
+
+/// Advance if not yet exhausted; return @c true if an item is available afterwards.
+template<typename TreeType>
+inline bool
+PointIndexIterator<TreeType>::next()
+{
+    const bool advance = this->test();
+    if (advance) this->increment();
+    return advance && this->test();
+}
+
+/// Return the number of indices held: the length of every range plus the index array size.
+template<typename TreeType>
+inline size_t
+PointIndexIterator<TreeType>::size() const
+{
+    size_t total = mIndexArraySize; // secondary array
+
+    // plus the length of every primary range
+    for (size_t n = 0, N = mRangeList.size(); n < N; ++n) {
+        const Range& r = mRangeList[n];
+        total += r.second - r.first;
+    }
+    return total;
+}
+
+/// Return the iterator to the empty state: no ranges, no index array, null current range.
+template<typename TreeType>
+inline void
+PointIndexIterator<TreeType>::clear()
+{
+    mIndexArraySize = 0;
+    mIndexArray.reset();
+
+    mRangeList.clear();
+    mIter = mRangeList.end();
+    mRange = Range(static_cast<ValueType*>(NULL), static_cast<ValueType*>(NULL));
+}
+
+/// Clear the iterator, then record the index range stored for voxel @a ijk, if any.
+template<typename TreeType>
+inline void
+PointIndexIterator<TreeType>::searchAndUpdate(const Coord& ijk, ConstAccessor& acc)
+{
+    this->clear();
+
+    const LeafNodeType* const node = acc.probeConstLeaf(ijk);
+    if (!node || !node->getIndices(ijk, mRange.first, mRange.second)) return;
+    mRangeList.push_back(mRange);
+    mIter = mRangeList.begin();
+}
+
+
+/// @brief Clear the iterator and collect the index ranges that
+/// pointIndexSearch() reports for the index-space box @a bbox.
+template<typename TreeType>
+inline void
+PointIndexIterator<TreeType>::searchAndUpdate(const CoordBBox& bbox, ConstAccessor& acc)
+{
+    this->clear();
+    point_index_grid_internal::pointIndexSearch(mRangeList, acc, bbox);
+
+    // With no ranges found, the iterator stays in its cleared state.
+    if (mRangeList.empty()) return;
+
+    mIter = mRangeList.begin();
+    mRange = mRangeList.front();
+}
+
+
+/// @brief Clear the iterator and search the box @a bbox, combining an unfiltered
+/// search of the box interior with a filtered search of the remaining regions.
+/// @param bbox    search box; its corners are rounded to voxel coordinates, so it
+///                is presumably expressed in index space (the world-space overload
+///                converts before calling this method)
+/// @param acc     accessor used to look up the leaf nodes
+/// @param points  point array handed to the bounding-box filter
+/// @param xform   transform handed to the bounding-box filter
+/// @details Results end up in @c mRangeList (unfiltered ranges) and in
+/// @c mIndexArray (indices collected through the filter); reset() is then called
+/// to position the iterator at the first result.
+template<typename TreeType>
+template<typename PointArray>
+inline void
+PointIndexIterator<TreeType>::searchAndUpdate(const BBoxd& bbox, ConstAccessor& acc,
+    const PointArray& points, const math::Transform& xform)
+{
+    this->clear();
+
+    std::vector<CoordBBox> searchRegions;
+    CoordBBox region(Coord::round(bbox.min()), Coord::round(bbox.max()));
+
+    const Coord dim = region.dim();
+    const int minExtent = std::min(dim[0], std::min(dim[1], dim[2]));
+
+    // Only split off an interior box when the region spans more than two voxels
+    // along every axis; otherwise the whole region goes through the filter.
+    if (minExtent > 2) {
+        // collect indices that don't need to be tested
+        CoordBBox ibox = region;
+        ibox.expand(-1);
+
+        point_index_grid_internal::pointIndexSearch(mRangeList, acc, ibox);
+
+        // define regions for the filtered search
+        // (the box is first restored to its original extent)
+        ibox.expand(1);
+        point_index_grid_internal::constructExclusiveRegions(searchRegions, region, ibox);
+    } else {
+        searchRegions.push_back(region);
+    }
+
+    // filtered search
+    std::deque<ValueType> filteredIndices;
+    point_index_grid_internal::BBoxFilter<PointArray, ValueType>
+        filter(mRangeList, filteredIndices, bbox, points, xform);
+
+    for (size_t n = 0, N = searchRegions.size(); n < N; ++n) {
+        point_index_grid_internal::filteredPointIndexSearch(filter, acc, searchRegions[n]);
+    }
+
+    // Move the individually collected indices into the contiguous index array.
+    point_index_grid_internal::dequeToArray(filteredIndices, mIndexArray, mIndexArraySize);
+
+    this->reset();
+}
+
+
+/// @brief Clear the iterator and perform a radial search around @a center.
+/// @param center            center of the search sphere; the bounds are rounded to
+///                          voxel coordinates, so presumably index space
+/// @param radius            search radius, in the same units as @a center
+/// @param acc               accessor used to look up the leaf nodes
+/// @param points            point array handed to the radial filter
+/// @param xform             transform handed to the radial filter
+/// @param subvoxelAccuracy  forwarded unchanged to the radial filter
+/// @details Indices inside a box inscribed in the sphere are collected without
+/// per-point tests; the remaining regions of the sphere's bounding box are
+/// searched through a RadialRangeFilter. reset() is called at the end to position
+/// the iterator at the first result.
+template<typename TreeType>
+template<typename PointArray>
+inline void
+PointIndexIterator<TreeType>::searchAndUpdate(const Vec3d& center, double radius,
+    ConstAccessor& acc, const PointArray& points, const math::Transform& xform,
+    bool subvoxelAccuracy)
+{
+    this->clear();
+    std::vector<CoordBBox> searchRegions;
+
+    // bounding box
+    // (of the sphere, padded by one voxel on every side)
+    CoordBBox bbox(
+        Coord::round(Vec3d(center[0] - radius, center[1] - radius, center[2] - radius)),
+        Coord::round(Vec3d(center[0] + radius, center[1] + radius, center[2] + radius)));
+    bbox.expand(1);
+
+    // Half-width of the cube inscribed in the sphere: radius / sqrt(3).
+    const double iRadius = radius * double(1.0 / std::sqrt(3.0));
+    if (iRadius > 2.0) {
+        // inscribed box
+        // (shrunk by one voxel before the unfiltered search)
+        CoordBBox ibox(
+            Coord::round(Vec3d(center[0] - iRadius, center[1] - iRadius, center[2] - iRadius)),
+            Coord::round(Vec3d(center[0] + iRadius, center[1] + iRadius, center[2] + iRadius)));
+        ibox.expand(-1);
+
+        // collect indices that don't need to be tested
+        point_index_grid_internal::pointIndexSearch(mRangeList, acc, ibox);
+
+        // Restore the inscribed box and derive the regions that still need filtering.
+        ibox.expand(1);
+        point_index_grid_internal::constructExclusiveRegions(searchRegions, bbox, ibox);
+    } else {
+        searchRegions.push_back(bbox);
+    }
+
+    // filtered search
+    std::deque<ValueType> filteredIndices;
+    const double leafNodeDim = double(TreeType::LeafNodeType::DIM);
+
+    typedef point_index_grid_internal::RadialRangeFilter<PointArray, ValueType> FilterT;
+
+    FilterT filter(mRangeList, filteredIndices,
+        center, radius, points, xform, leafNodeDim, subvoxelAccuracy);
+
+    for (size_t n = 0, N = searchRegions.size(); n < N; ++n) {
+        point_index_grid_internal::filteredPointIndexSearch(filter, acc, searchRegions[n]);
+    }
+
+    // Move the individually collected indices into the contiguous index array.
+    point_index_grid_internal::dequeToArray(filteredIndices, mIndexArray, mIndexArraySize);
+
+    this->reset();
+}
+
+
+/// @brief World-space variant of the bounding-box search.
+/// @details The minimum and maximum corners of @a bbox are mapped through
+/// @c xform.worldToIndex() and the resulting box is passed to searchAndUpdate().
+template<typename TreeType>
+template<typename PointArray>
+inline void
+PointIndexIterator<TreeType>::worldSpaceSearchAndUpdate(const BBoxd& bbox, ConstAccessor& acc,
+    const PointArray& points, const math::Transform& xform)
+{
+    const BBoxd indexBBox(xform.worldToIndex(bbox.min()), xform.worldToIndex(bbox.max()));
+    this->searchAndUpdate(indexBBox, acc, points, xform);
+}
+
+
+/// @brief World-space variant of the radial search.
+/// @details The center is mapped through @c xform.worldToIndex() and the radius
+/// is divided by the first component of @c xform.voxelSize() before the call
+/// is forwarded to searchAndUpdate().
+template<typename TreeType>
+template<typename PointArray>
+inline void
+PointIndexIterator<TreeType>::worldSpaceSearchAndUpdate(const Vec3d& center, double radius,
+    ConstAccessor& acc, const PointArray& points, const math::Transform& xform,
+    bool subvoxelAccuracy)
+{
+    const Vec3d indexCenter = xform.worldToIndex(center);
+    const double indexRadius = radius / xform.voxelSize()[0];
+
+    this->searchAndUpdate(indexCenter, indexRadius, acc, points, xform, subvoxelAccuracy);
+}
+
+
+////////////////////////////////////////
+
+// PointIndexFilter implementation
+
+/// @brief Construct a filter over @a points, @a tree and @a xform.
+/// @note Only the address of @a points is stored. The inverse voxel size is
+/// computed from the first component of @c xform.voxelSize().
+template<typename PointArray, typename TreeType>
+inline
+PointIndexFilter<PointArray, TreeType>::PointIndexFilter(
+    const PointArray& points, const TreeType& tree, const math::Transform& xform)
+    : mPoints(&points)
+    , mAcc(tree)
+    , mXform(xform)
+    , mInvVoxelSize(1.0/xform.voxelSize()[0])
+{
+}
+
+
+/// @brief Copy constructor.
+/// @details The point-array pointer, transform and inverse voxel size are copied
+/// from @a rhs. The accessor is not copied: a new one is constructed on the tree
+/// that @a rhs's accessor refers to. @c mIter does not appear in the initializer
+/// list, so the search iterator state of @a rhs is not carried over.
+template<typename PointArray, typename TreeType>
+inline
+PointIndexFilter<PointArray, TreeType>::PointIndexFilter(const PointIndexFilter& rhs)
+    : mPoints(rhs.mPoints)
+    , mAcc(rhs.mAcc.tree())
+    , mXform(rhs.mXform)
+    , mInvVoxelSize(rhs.mInvVoxelSize)
+{
+}
+
+
+/// @brief Find the points around @a center and call @a op for every point whose
+/// squared distance to @a center is strictly less than @a radius squared.
+/// @param center  search center, in the space of the stored point positions
+/// @param radius  search radius
+/// @param op      functor invoked as <tt>op(distSqr, pointIndex)</tt>
+/// @details When the radius covers fewer than eight voxels, every index in the
+/// enclosing index-space box is gathered; otherwise a radial world-space search
+/// is used. In both cases each candidate is distance-tested before @a op is called.
+template<typename PointArray, typename TreeType>
+template<typename FilterType>
+inline void
+PointIndexFilter<PointArray, TreeType>::searchAndApply(
+    const PosType& center, ScalarType radius, FilterType& op)
+{
+    if (radius * mInvVoxelSize < ScalarType(8.0)) {
+        const openvdb::CoordBBox searchBox(
+            mXform.worldToIndexCellCentered(center - radius),
+            mXform.worldToIndexCellCentered(center + radius));
+        mIter.searchAndUpdate(searchBox, mAcc);
+    } else {
+        mIter.worldSpaceSearchAndUpdate(
+            center, radius, mAcc, *mPoints, mXform, /*subvoxelAccuracy=*/false);
+    }
+
+    const ScalarType radiusSqr = radius * radius;
+    PosType pos;
+
+    while (mIter) {
+        // Offset of the candidate point from the search center.
+        mPoints->getPos(*mIter, pos);
+        pos -= center;
+
+        ScalarType distSqr = pos.lengthSqr();
+        if (distSqr < radiusSqr) {
+            op(distSqr, *mIter);
+        }
+
+        ++mIter;
+    }
+}
+
+
+////////////////////////////////////////
+
+
+/// @brief Create a point index grid for @a points using a copy of @a xform.
+/// @details The grid is created with a background value of zero. The point tree
+/// is only constructed when @a points is non-empty; otherwise the grid is
+/// returned with the tree it was created with.
+template<typename GridT, typename PointArrayT>
+inline typename GridT::Ptr
+createPointIndexGrid(const PointArrayT& points, const math::Transform& xform)
+{
+    typename GridT::Ptr grid = GridT::create(typename GridT::ValueType(0));
+    grid->setTransform(xform.copy());
+
+    // Nothing to partition for an empty point array.
+    if (!(points.size() > 0)) return grid;
+
+    point_index_grid_internal::constructPointTree(
+        grid->tree(), grid->transform(), points);
+
+    return grid;
+}
+
+
+/// @brief Create a point index grid for @a points using a linear transform
+/// built from @a voxelSize.
+template<typename GridT, typename PointArrayT>
+inline typename GridT::Ptr
+createPointIndexGrid(const PointArrayT& points, double voxelSize)
+{
+    // The temporary transform only needs to outlive the call below,
+    // which stores its own copy in the grid.
+    return createPointIndexGrid<GridT>(
+        points, *math::Transform::createLinearTransform(voxelSize));
+}
+
+
+/// @brief Check whether @a grid is a valid partitioning of @a points.
+/// @return @c false if the number of indices stored in the leaf nodes differs
+/// from @c points.size(), or if ValidPartitioningOp raises its flag for any
+/// leaf node; @c true otherwise.
+template<typename PointArrayT, typename GridT>
+inline bool
+isValidPartition(const PointArrayT& points, const GridT& grid)
+{
+    typedef tree::LeafManager<const typename GridT::TreeType> LeafManagerT;
+    LeafManagerT leafManager(grid.tree());
+
+    // Tally the indices stored across all leaf nodes.
+    size_t indexedPoints = 0;
+    const size_t leafCount = leafManager.leafCount();
+    for (size_t i = 0; i < leafCount; ++i) {
+        indexedPoints += leafManager.leaf(i).indices().size();
+    }
+
+    if (points.size() != indexedPoints) return false;
+
+    // Flag shared with the per-leaf operator.
+    tbb::atomic<bool> changed;
+    changed = false;
+
+    point_index_grid_internal::ValidPartitioningOp<PointArrayT>
+        op(changed, points, grid.transform());
+    leafManager.foreach(op);
+
+    return !bool(changed);
+}
+
+
+/// @brief Return @a grid if it is a valid partitioning of @a points, otherwise
+/// a newly created point index grid that uses the transform of @a grid.
+template<typename GridT, typename PointArrayT>
+inline typename GridT::ConstPtr
+getValidPointIndexGrid(const PointArrayT& points, const typename GridT::ConstPtr& grid)
+{
+    if (!isValidPartition(points, *grid)) {
+        // Rebuild from scratch with the same transform.
+        return createPointIndexGrid<GridT>(points, grid->transform());
+    }
+
+    return grid;
+}
+
+
+/// @brief Return @a grid if it is a valid partitioning of @a points, otherwise
+/// a newly created point index grid that uses the transform of @a grid.
+template<typename GridT, typename PointArrayT>
+inline typename GridT::Ptr
+getValidPointIndexGrid(const PointArrayT& points, const typename GridT::Ptr& grid)
+{
+    if (!isValidPartition(points, *grid)) {
+        // Rebuild from scratch with the same transform.
+        return createPointIndexGrid<GridT>(points, grid->transform());
+    }
+
+    return grid;
+}
+
+
+////////////////////////////////////////
+
+
+template<typename T, Index Log2Dim>
+struct PointIndexLeafNode : public tree::LeafNode<T, Log2Dim>
+{
+    typedef PointIndexLeafNode<T, Log2Dim>          LeafNodeType;
+    typedef boost::shared_ptr<PointIndexLeafNode>   Ptr;
+
+    typedef T                       ValueType;
+    typedef std::vector<ValueType>  IndexArray;
+
+
+    IndexArray& indices() { return mIndices; }
+    const IndexArray& indices() const { return mIndices; }
+
+    bool getIndices(const Coord& ijk, const ValueType*& begin, const ValueType*& end) const;
+    bool getIndices(Index offset, const ValueType*& begin, const ValueType*& end) const;
+
+    void setOffsetOn(Index offset, const ValueType& val);
+    void setOffsetOnly(Index offset, const ValueType& val);
+
+    bool isEmpty(const CoordBBox& bbox) const;
+
+private:
+    IndexArray mIndices;
+
+    ////////////////////////////////////////
+
+    // The following methods had to be copied from the LeafNode class
+    // to make the derived PointIndexLeafNode class compatible with the tree structure.
+
+public:
+    typedef tree::LeafNode<T, Log2Dim>  BaseLeaf;
+    typedef util::NodeMask<Log2Dim>     NodeMaskType;
+
+    using BaseLeaf::LOG2DIM;
+    using BaseLeaf::TOTAL;
+    using BaseLeaf::DIM;
+    using BaseLeaf::NUM_VALUES;
+    using BaseLeaf::NUM_VOXELS;
+    using BaseLeaf::SIZE;
+    using BaseLeaf::LEVEL;
+
+    /// Default constructor
+    PointIndexLeafNode() : BaseLeaf(), mIndices() {}
+
+    explicit
+    PointIndexLeafNode(const Coord& coords, const T& value = zeroVal<T>(), bool active = false)
+        : BaseLeaf(coords, value, active)
+        , mIndices()
+    {
+    }
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    PointIndexLeafNode(PartialCreate, const Coord& coords,
+        const T& value = zeroVal<T>(), bool active = false)
+        : BaseLeaf(PartialCreate(), coords, value, active)
+        , mIndices()
+    {
+    }
+#endif
+
+    /// Deep copy constructor
+    PointIndexLeafNode(const PointIndexLeafNode& rhs) : BaseLeaf(rhs), mIndices(rhs.mIndices) {}
+
+    /// @brief Return @c true if the given node (which may have a different @c ValueType
+    /// than this node) has the same active value topology as this node.
+    template<typename OtherType, Index OtherLog2Dim>
+    bool hasSameTopology(const PointIndexLeafNode<OtherType, OtherLog2Dim>* other) const {
+        return BaseLeaf::hasSameTopology(other);
+    }
+
+    /// Check for buffer, state and origin equivalence.
+    bool operator==(const PointIndexLeafNode& other) const { return BaseLeaf::operator==(other); }
+
+    bool operator!=(const PointIndexLeafNode& other) const { return !(other == *this); }
+
+    /// @brief Merge the leaf node @a rhs into this node according to @c Policy
+    /// by forwarding to the base-class implementation.
+    /// @note Only the base-class state is merged; @c mIndices is not modified here.
+    template<MergePolicy Policy> void merge(const PointIndexLeafNode& rhs) {
+        // BaseLeaf is a dependent type, so the "template" disambiguator is needed
+        // for merge<Policy> to be parsed as a member-template call.
+        BaseLeaf::template merge<Policy>(rhs);
+    }
+    template<MergePolicy Policy> void merge(const ValueType& tileValue, bool tileActive) {
+         BaseLeaf::template merge<Policy>(tileValue, tileActive);
+    }
+
+    template<MergePolicy Policy>
+    void merge(const PointIndexLeafNode& other,
+        const ValueType& /*bg*/, const ValueType& /*otherBG*/)
+    {
+         BaseLeaf::template merge<Policy>(other);
+    }
+
+    void addLeaf(PointIndexLeafNode*) {}
+    template<typename AccessorT>
+    void addLeafAndCache(PointIndexLeafNode*, AccessorT&) {}
+
+    //@{
+    /// @brief Return a pointer to this node.
+    PointIndexLeafNode* touchLeaf(const Coord&) { return this; }
+    template<typename AccessorT>
+    PointIndexLeafNode* touchLeafAndCache(const Coord&, AccessorT&) { return this; }
+
+    template<typename NodeT, typename AccessorT>
+    NodeT* probeNodeAndCache(const Coord&, AccessorT&)
+    {
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if (!(boost::is_same<NodeT,PointIndexLeafNode>::value)) return NULL;
+        return reinterpret_cast<NodeT*>(this);
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+    PointIndexLeafNode* probeLeaf(const Coord&) { return this; }
+    template<typename AccessorT>
+    PointIndexLeafNode* probeLeafAndCache(const Coord&, AccessorT&) { return this; }
+    //@}
+
+    //@{
+    /// @brief Return a @const pointer to this node.
+    const PointIndexLeafNode* probeConstLeaf(const Coord&) const { return this; }
+    template<typename AccessorT>
+    const PointIndexLeafNode* probeConstLeafAndCache(const Coord&, AccessorT&) const {return this;}
+    template<typename AccessorT>
+    const PointIndexLeafNode* probeLeafAndCache(const Coord&, AccessorT&) const { return this; }
+    const PointIndexLeafNode* probeLeaf(const Coord&) const { return this; }
+    template<typename NodeT, typename AccessorT>
+    const NodeT* probeConstNodeAndCache(const Coord&, AccessorT&) const
+    {
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if (!(boost::is_same<NodeT,PointIndexLeafNode>::value)) return NULL;
+        return reinterpret_cast<const NodeT*>(this);
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+    //@}
+
+
+    // I/O methods
+
+    void readBuffers(std::istream& is, bool fromHalf = false);
+    void readBuffers(std::istream& is, const CoordBBox&, bool fromHalf = false);
+    void writeBuffers(std::ostream& os, bool toHalf = false) const;
+
+
+    Index64 memUsage() const;
+
+
+    ////////////////////////////////////////
+
+    // Disable all write methods to avoid unintentional changes
+    // to the point-array offsets.
+
+    /// @brief Unconditionally trip an assertion; called by every disabled write method.
+    /// @note The check relies on @c assert, so (assuming the standard macro) it is
+    /// compiled out when @c NDEBUG is defined and the disabled methods then do nothing.
+    void assertNonmodifiable() {
+        assert(false && "Cannot modify voxel values in a PointIndexTree.");
+    }
+
+    void setActiveState(const Coord&, bool) { assertNonmodifiable(); }
+    void setActiveState(Index, bool) { assertNonmodifiable(); }
+
+    void setValueOnly(const Coord&, const ValueType&) { assertNonmodifiable(); }
+    void setValueOnly(Index, const ValueType&) { assertNonmodifiable(); }
+
+    void setValueOff(const Coord&) { assertNonmodifiable(); }
+    void setValueOff(Index) { assertNonmodifiable(); }
+
+    void setValueOff(const Coord&, const ValueType&) { assertNonmodifiable(); }
+    void setValueOff(Index, const ValueType&) { assertNonmodifiable(); }
+
+    void setValueOn(const Coord&) { assertNonmodifiable(); }
+    void setValueOn(Index) { assertNonmodifiable(); }
+
+    void setValueOn(const Coord&, const ValueType&) { assertNonmodifiable(); }
+    void setValueOn(Index, const ValueType&) { assertNonmodifiable(); }
+
+    void setValue(const Coord&, const ValueType&) { assertNonmodifiable(); }
+
+    void setValuesOn() { assertNonmodifiable(); }
+    void setValuesOff() { assertNonmodifiable(); }
+
+    template<typename ModifyOp>
+    void modifyValue(Index, const ModifyOp&) { assertNonmodifiable(); }
+
+    template<typename ModifyOp>
+    void modifyValue(const Coord&, const ModifyOp&) { assertNonmodifiable(); }
+
+    template<typename ModifyOp>
+    void modifyValueAndActiveState(const Coord&, const ModifyOp&) { assertNonmodifiable(); }
+
+    void clip(const CoordBBox&, const ValueType&) { assertNonmodifiable(); }
+
+    void fill(const CoordBBox&, const ValueType&, bool) { assertNonmodifiable(); }
+    void fill(const ValueType&) {}
+    void fill(const ValueType&, bool) { assertNonmodifiable(); }
+
+    template<typename AccessorT>
+    void setValueOnlyAndCache(const Coord&, const ValueType&, AccessorT&) {assertNonmodifiable();}
+
+    template<typename ModifyOp, typename AccessorT>
+    void modifyValueAndActiveStateAndCache(const Coord&, const ModifyOp&, AccessorT&) {
+        assertNonmodifiable();
+    }
+
+    template<typename AccessorT>
+    void setValueOffAndCache(const Coord&, const ValueType&, AccessorT&) { assertNonmodifiable(); }
+
+    template<typename AccessorT>
+    void setActiveStateAndCache(const Coord&, bool, AccessorT&) { assertNonmodifiable(); }
+
+    void resetBackground(const ValueType&, const ValueType&) { assertNonmodifiable(); }
+
+    void signedFloodFill(const ValueType&) { assertNonmodifiable(); }
+    void signedFloodFill(const ValueType&, const ValueType&) { assertNonmodifiable(); }
+
+    void negate() { assertNonmodifiable(); }
+
+protected:
+    typedef typename BaseLeaf::ValueOn ValueOn;
+    typedef typename BaseLeaf::ValueOff ValueOff;
+    typedef typename BaseLeaf::ValueAll ValueAll;
+    typedef typename BaseLeaf::ChildOn ChildOn;
+    typedef typename BaseLeaf::ChildOff ChildOff;
+    typedef typename BaseLeaf::ChildAll ChildAll;
+
+    typedef typename NodeMaskType::OnIterator    MaskOnIterator;
+    typedef typename NodeMaskType::OffIterator   MaskOffIterator;
+    typedef typename NodeMaskType::DenseIterator MaskDenseIterator;
+
+    // During topology-only construction, access is needed
+    // to protected/private members of other template instances.
+    template<typename, Index> friend struct PointIndexLeafNode;
+
+    friend class tree::IteratorBase<MaskOnIterator, PointIndexLeafNode>;
+    friend class tree::IteratorBase<MaskOffIterator, PointIndexLeafNode>;
+    friend class tree::IteratorBase<MaskDenseIterator, PointIndexLeafNode>;
+
+public:
+
+
+    typedef typename BaseLeaf::template ValueIter<
+        MaskOnIterator, PointIndexLeafNode, const ValueType, ValueOn> ValueOnIter;
+    typedef typename BaseLeaf::template ValueIter<
+        MaskOnIterator, const PointIndexLeafNode, const ValueType, ValueOn> ValueOnCIter;
+    typedef typename BaseLeaf::template ValueIter<
+        MaskOffIterator, PointIndexLeafNode, const ValueType, ValueOff> ValueOffIter;
+    typedef typename BaseLeaf::template ValueIter<
+        MaskOffIterator,const PointIndexLeafNode,const ValueType,ValueOff> ValueOffCIter;
+    typedef typename BaseLeaf::template ValueIter<
+        MaskDenseIterator, PointIndexLeafNode, const ValueType, ValueAll> ValueAllIter;
+    typedef typename BaseLeaf::template ValueIter<
+        MaskDenseIterator,const PointIndexLeafNode,const ValueType,ValueAll> ValueAllCIter;
+    typedef typename BaseLeaf::template ChildIter<
+        MaskOnIterator, PointIndexLeafNode, ChildOn> ChildOnIter;
+    typedef typename BaseLeaf::template ChildIter<
+        MaskOnIterator, const PointIndexLeafNode, ChildOn> ChildOnCIter;
+    typedef typename BaseLeaf::template ChildIter<
+        MaskOffIterator, PointIndexLeafNode, ChildOff> ChildOffIter;
+    typedef typename BaseLeaf::template ChildIter<
+        MaskOffIterator, const PointIndexLeafNode, ChildOff> ChildOffCIter;
+    typedef typename BaseLeaf::template DenseIter<
+        PointIndexLeafNode, ValueType, ChildAll> ChildAllIter;
+    typedef typename BaseLeaf::template DenseIter<
+        const PointIndexLeafNode, const ValueType, ChildAll> ChildAllCIter;
+
+#define VMASK_ this->getValueMask()
+    ValueOnCIter  cbeginValueOn() const  { return ValueOnCIter(VMASK_.beginOn(), this); }
+    ValueOnCIter   beginValueOn() const  { return ValueOnCIter(VMASK_.beginOn(), this); }
+    ValueOnIter    beginValueOn()        { return ValueOnIter(VMASK_.beginOn(), this); }
+    ValueOffCIter cbeginValueOff() const { return ValueOffCIter(VMASK_.beginOff(), this); }
+    ValueOffCIter  beginValueOff() const { return ValueOffCIter(VMASK_.beginOff(), this); }
+    ValueOffIter   beginValueOff()       { return ValueOffIter(VMASK_.beginOff(), this); }
+    ValueAllCIter cbeginValueAll() const { return ValueAllCIter(VMASK_.beginDense(), this); }
+    ValueAllCIter  beginValueAll() const { return ValueAllCIter(VMASK_.beginDense(), this); }
+    ValueAllIter   beginValueAll()       { return ValueAllIter(VMASK_.beginDense(), this); }
+
+    ValueOnCIter  cendValueOn() const    { return ValueOnCIter(VMASK_.endOn(), this); }
+    ValueOnCIter   endValueOn() const    { return ValueOnCIter(VMASK_.endOn(), this); }
+    ValueOnIter    endValueOn()          { return ValueOnIter(VMASK_.endOn(), this); }
+    ValueOffCIter cendValueOff() const   { return ValueOffCIter(VMASK_.endOff(), this); }
+    ValueOffCIter  endValueOff() const   { return ValueOffCIter(VMASK_.endOff(), this); }
+    ValueOffIter   endValueOff()         { return ValueOffIter(VMASK_.endOff(), this); }
+    ValueAllCIter cendValueAll() const   { return ValueAllCIter(VMASK_.endDense(), this); }
+    ValueAllCIter  endValueAll() const   { return ValueAllCIter(VMASK_.endDense(), this); }
+    ValueAllIter   endValueAll()         { return ValueAllIter(VMASK_.endDense(), this); }
+
+    ChildOnCIter  cbeginChildOn() const  { return ChildOnCIter(VMASK_.endOn(), this); }
+    ChildOnCIter   beginChildOn() const  { return ChildOnCIter(VMASK_.endOn(), this); }
+    ChildOnIter    beginChildOn()        { return ChildOnIter(VMASK_.endOn(), this); }
+    ChildOffCIter cbeginChildOff() const { return ChildOffCIter(VMASK_.endOff(), this); }
+    ChildOffCIter  beginChildOff() const { return ChildOffCIter(VMASK_.endOff(), this); }
+    ChildOffIter   beginChildOff()       { return ChildOffIter(VMASK_.endOff(), this); }
+    ChildAllCIter cbeginChildAll() const { return ChildAllCIter(VMASK_.beginDense(), this); }
+    ChildAllCIter  beginChildAll() const { return ChildAllCIter(VMASK_.beginDense(), this); }
+    ChildAllIter   beginChildAll()       { return ChildAllIter(VMASK_.beginDense(), this); }
+
+    ChildOnCIter  cendChildOn() const    { return ChildOnCIter(VMASK_.endOn(), this); }
+    ChildOnCIter   endChildOn() const    { return ChildOnCIter(VMASK_.endOn(), this); }
+    ChildOnIter    endChildOn()          { return ChildOnIter(VMASK_.endOn(), this); }
+    ChildOffCIter cendChildOff() const   { return ChildOffCIter(VMASK_.endOff(), this); }
+    ChildOffCIter  endChildOff() const   { return ChildOffCIter(VMASK_.endOff(), this); }
+    ChildOffIter   endChildOff()         { return ChildOffIter(VMASK_.endOff(), this); }
+    ChildAllCIter cendChildAll() const   { return ChildAllCIter(VMASK_.endDense(), this); }
+    ChildAllCIter  endChildAll() const   { return ChildAllCIter(VMASK_.endDense(), this); }
+    ChildAllIter   endChildAll()         { return ChildAllIter(VMASK_.endDense(), this); }
+#undef VMASK_
+}; // struct PointIndexLeafNode
+
+
+/// @brief Look up the range of point indices stored for the voxel at @a ijk.
+/// @details The coordinate is converted to a linear offset and the call is
+/// forwarded to the offset-based overload.
+template<typename T, Index Log2Dim>
+inline bool
+PointIndexLeafNode<T, Log2Dim>::getIndices(const Coord& ijk,
+    const ValueType*& begin, const ValueType*& end) const
+{
+    const Index offset = LeafNodeType::coordToOffset(ijk);
+    return this->getIndices(offset, begin, end);
+}
+
+
+/// @brief Look up the range of point indices stored for the voxel at @a offset.
+/// @param offset  linear voxel offset within this leaf node
+/// @param begin   set to the first index of the voxel on success
+/// @param end     set to one past the last index of the voxel on success
+/// @return @c true if the voxel's value mask bit is on; otherwise @c false,
+/// with @a begin and @a end left untouched.
+template<typename T, Index Log2Dim>
+inline bool
+PointIndexLeafNode<T, Log2Dim>::getIndices(Index offset,
+    const ValueType*& begin, const ValueType*& end) const
+{
+    if (!this->isValueMaskOn(offset)) return false;
+
+    // The buffer entry of a voxel is the end position of its indices within
+    // mIndices; the entry of the preceding voxel (or zero) is the start position.
+    const ValueType* const base = &mIndices.front();
+    begin = base + (offset == 0 ? ValueType(0) : this->buffer()[offset - 1]);
+    end = base + this->buffer()[offset];
+    return true;
+}
+
+
+/// @brief Store @a val in the voxel buffer at @a offset and switch the
+/// voxel's value mask bit on.
+template<typename T, Index Log2Dim>
+inline void
+PointIndexLeafNode<T, Log2Dim>::setOffsetOn(Index offset, const ValueType& val)
+{
+    this->setOffsetOnly(offset, val);
+    this->setValueMaskOn(offset);
+}
+
+
+/// @brief Store @a val in the voxel buffer at @a offset without changing
+/// the value mask.
+template<typename T, Index Log2Dim>
+inline void
+PointIndexLeafNode<T, Log2Dim>::setOffsetOnly(Index offset, const ValueType& val)
+{
+    this->buffer().setValue(offset, val);
+}
+
+
+/// @brief Return @c true if no voxel column inside @a bbox shows an increase
+/// of the buffer value across its z-extent.
+/// @details For every (x, y) pair in @a bbox, the buffer value at the column's
+/// last z offset is compared with the value immediately preceding the column's
+/// first z offset (zero for offset 0). Given that getIndices() treats buffer
+/// entries as end positions into the index array, a larger value presumably
+/// means at least one point index falls inside the column.
+/// @note The coordinates are masked with <tt>DIM - 1</tt>; @a bbox is assumed to
+/// lie within this leaf node's bounds -- TODO confirm against callers.
+template<typename T, Index Log2Dim>
+inline bool
+PointIndexLeafNode<T, Log2Dim>::isEmpty(const CoordBBox& bbox) const
+{
+    // zStride is the number of voxels between the first and last z offset of a column.
+    Index xPos, pos, zStride = Index(bbox.max()[2] - bbox.min()[2]);
+    Coord ijk;
+
+    for (ijk[0] = bbox.min()[0]; ijk[0] <= bbox.max()[0]; ++ijk[0]) {
+        // Linear offset contribution of the local x coordinate.
+        xPos = (ijk[0] & (DIM - 1u)) << (2 * LOG2DIM);
+
+        for (ijk[1] = bbox.min()[1]; ijk[1] <= bbox.max()[1]; ++ijk[1]) {
+            // Linear offset of the first voxel of this column inside bbox.
+            pos = xPos + ((ijk[1] & (DIM - 1u)) << LOG2DIM);
+            pos += (bbox.min()[2] & (DIM - 1u));
+
+            // Compare the column's last entry with the entry just before its first.
+            if (this->buffer()[pos+zStride] > (pos == 0 ? T(0) : this->buffer()[pos - 1])) {
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
+
+
+/// @brief Read this leaf node's voxel buffers followed by its point index array.
+/// @param is        input stream positioned at the start of this leaf's data
+/// @param fromHalf  forwarded to the base-class reader
+/// @details The layout matches what writeBuffers() emits: the base-class leaf
+/// data, a 64-bit index count, the raw index array and a 64-bit size of an
+/// auxiliary data block that is reserved for future use. The size field (and any
+/// auxiliary bytes) must be consumed here as well, otherwise the stream is left
+/// misaligned for whatever is read next.
+/// @note NOTE(review): streams produced by a writer that omits the trailing size
+/// field would not be readable with this layout -- confirm that none exist.
+template<typename T, Index Log2Dim>
+inline void
+PointIndexLeafNode<T, Log2Dim>::readBuffers(std::istream& is, bool fromHalf)
+{
+    BaseLeaf::readBuffers(is, fromHalf);
+
+    Index64 numIndices = Index64(0);
+    is.read(reinterpret_cast<char*>(&numIndices), sizeof(Index64));
+
+    mIndices.resize(size_t(numIndices));
+    is.read(reinterpret_cast<char*>(mIndices.data()), numIndices * sizeof(T));
+
+    // Reserved for future use
+    Index64 auxDataBytes = Index64(0);
+    is.read(reinterpret_cast<char*>(&auxDataBytes), sizeof(Index64));
+    if (auxDataBytes > 0) {
+        // For now, read and discard any auxiliary data.
+        boost::scoped_array<char> auxData(new char[auxDataBytes]);
+        is.read(auxData.get(), auxDataBytes);
+    }
+}
+
+
+/// @brief Read this leaf node's buffers, clipping the voxel values to @a bbox.
+/// @param is        input stream positioned at the start of this leaf's data
+/// @param bbox      clipping region forwarded to the base-class reader
+/// @param fromHalf  forwarded to the base-class reader
+/// @details The point indices are kept only when @a bbox overlaps this node's
+/// bounding box; otherwise they are read and thrown away so that the stream
+/// position still advances past them. The trailing auxiliary data block is
+/// always consumed and discarded.
+template<typename T, Index Log2Dim>
+inline void
+PointIndexLeafNode<T, Log2Dim>::readBuffers(std::istream& is, const CoordBBox& bbox, bool fromHalf)
+{
+    // Read and clip voxel values.
+    BaseLeaf::readBuffers(is, bbox, fromHalf);
+
+    Index64 numIndices = Index64(0);
+    is.read(reinterpret_cast<char*>(&numIndices), sizeof(Index64));
+
+    const Index64 numBytes = numIndices * sizeof(T);
+
+    if (bbox.hasOverlap(this->getNodeBoundingBox())) {
+        mIndices.resize(size_t(numIndices));
+        is.read(reinterpret_cast<char*>(mIndices.data()), numBytes);
+
+        /// @todo If any voxels were deactivated as a result of clipping in the call to
+        /// BaseLeaf::readBuffers(), the point index list will need to be regenerated.
+    } else {
+        // Read and discard the point indices.
+        boost::scoped_array<char> buf(new char[numBytes]);
+        is.read(buf.get(), numBytes);
+    }
+
+    // Reserved for future use
+    Index64 auxDataBytes = Index64(0);
+    is.read(reinterpret_cast<char*>(&auxDataBytes), sizeof(Index64));
+    if (auxDataBytes > 0) {
+        // For now, read and discard any auxiliary data.
+        boost::scoped_array<char> auxData(new char[auxDataBytes]);
+        is.read(auxData.get(), auxDataBytes);
+    }
+}
+
+
+/// @brief Write this leaf node's voxel buffers followed by its point index array.
+/// @details After the base-class data, the stream receives a 64-bit index count,
+/// the raw index array and a 64-bit auxiliary data size, which is always zero.
+template<typename T, Index Log2Dim>
+inline void
+PointIndexLeafNode<T, Log2Dim>::writeBuffers(std::ostream& os, bool toHalf) const
+{
+    BaseLeaf::writeBuffers(os, toHalf);
+
+    // Index count followed by the raw index array.
+    const Index64 indexCount = Index64(mIndices.size());
+    const Index64 indexBytes = indexCount * sizeof(T);
+    os.write(reinterpret_cast<const char*>(&indexCount), sizeof(Index64));
+    os.write(reinterpret_cast<const char*>(mIndices.data()), indexBytes);
+
+    // Reserved for future use
+    const Index64 reservedBytes = Index64(0);
+    os.write(reinterpret_cast<const char*>(&reservedBytes), sizeof(Index64));
+}
+
+
+/// @brief Return the memory footprint of this node in bytes: the base-class
+/// usage plus the capacity of the index array and the size of the array object.
+template<typename T, Index Log2Dim>
+inline Index64
+PointIndexLeafNode<T, Log2Dim>::memUsage() const
+{
+    const size_t indexArrayBytes = (sizeof(T)*mIndices.capacity()) + sizeof(mIndices);
+    return BaseLeaf::memUsage() + Index64(indexArrayBytes);
+}
+
+} // namespace tools
+
+
+////////////////////////////////////////
+
+
+namespace tree {
+
+/// Helper metafunction used to implement LeafNode::SameConfiguration
+/// (which, as an inner class, can't be independently specialized)
+/// @details This partial specialization yields @c true for any
+/// PointIndexLeafNode whose @c Log2Dim equals @c Dim1, whatever its value type.
+template<Index Dim1, typename T2>
+struct SameLeafConfig<Dim1, openvdb::tools::PointIndexLeafNode<T2, Dim1> >
+{
+    static const bool value = true;
+};
+
+} // namespace tree
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_POINT_INDEX_GRID_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/PointMaskGrid.h b/nuparu/include/openvdb_new/tools/PointMaskGrid.h
new file mode 100644
index 00000000..bd896f2c
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/PointMaskGrid.h
@@ -0,0 +1,283 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @author Ken Museth
+///
+/// @file PointMaskGrid.h
+///
+/// @brief This tool produces a grid where every voxel that contains a
+/// point is active. It employs thread-local storage for best performance.
+///
+/// The @c PointListT template argument below refers to any class
+/// with the following interface (see unittest/TestPointMaskGrid.cc
+/// and SOP_OpenVDB_From_Particles.cc for practical examples):
+/// @code
+///
+/// class PointList {
+///   ...
+/// public:
+///
+///   // Return the total number of particles in list.
+///   size_t size() const;
+///
+///   // Get the world space position of the nth particle.
+///   void getPos(size_t n, Vec3R& xyz) const;
+/// };
+/// @endcode
+///
+/// @note See unittest/TestPointMaskGrid.cc for an example.
+///
+/// The @c InterruptT template argument below refers to any class
+/// with the following interface:
+/// @code
+/// class Interrupter {
+///   ...
+/// public:
+///   void start(const char* name = NULL)// called when computations begin
+///   void end()                         // called when computations end
+///   bool wasInterrupted(int percent=-1)// return true to break computation
+/// };
+/// @endcode
+///
+/// @note If no template argument is provided for this InterruptT
+/// the util::NullInterrupter is used which implies that all
+/// interrupter calls are no-ops (i.e. incurs no computational overhead).
+
+#ifndef OPENVDB_TOOLS_POINT_MASK_GRID_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_POINT_MASK_GRID_HAS_BEEN_INCLUDED
+
+#include <tbb/tbb_thread.h>
+#include <tbb/task_scheduler_init.h>
+#include <tbb/enumerable_thread_specific.h>
+#include <tbb/parallel_for.h>
+#include <tbb/parallel_reduce.h>
+#include <tbb/blocked_range.h>
+
+#include <openvdb/openvdb.h>
+#include <openvdb/Grid.h>
+#include <openvdb/Types.h>
+#include <openvdb/util/NullInterrupter.h>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+// Forward declaration of main class
+template<typename GridT = MaskGrid, typename InterrupterT = util::NullInterrupter>
+class PointMaskGrid;        
+
+/// @brief Activates every voxel of @c grid that contains a point.
+///
+/// @param points     points that activate the voxels of @c grid
+/// @param grid       on output, its voxels that contain points are active
+template<typename PointListT, typename GridT>
+inline void
+pointMaskGrid(const PointListT& points, GridT& grid)
+{
+    typedef PointMaskGrid<GridT, util::NullInterrupter> MaskerT; // no interrupter is supplied
+    MaskerT(grid, NULL).addPoints(points);
+}
+
+/// @brief Return a MaskGrid where each binary voxel value
+/// is on if the voxel contains one (or more) points (i.e.
+/// the 3D position of a point is closer to this voxel than
+/// to any other voxel).
+///
+/// @param points     points that activate the voxels in the returned grid.
+/// @param xform      transform from world space to voxels in grid space.
+template<typename PointListT>
+inline MaskGrid::Ptr
+createPointMaskGrid(const PointListT& points, const math::Transform& xform)
+{
+    MaskGrid::Ptr maskGrid = createGrid<MaskGrid>(false);
+    maskGrid->setTransform(xform.copy()); // the grid is given a copy of the transform
+    pointMaskGrid(points, *maskGrid);
+    return maskGrid;
+}
+
+////////////////////////////////////////
+
+/// @brief Makes every voxel of a grid active if it contains a point.
+template<typename GridT, typename InterrupterT>
+class PointMaskGrid
+{
+public:
+    typedef typename GridT::ValueType ValueT;
+
+    /// @brief Constructor from a grid and optional interrupter
+    ///
+    /// @param grid        Grid whose voxels will have their state activated by points.
+    /// @param interrupter Optional interrupter to prematurely terminate execution.
+    explicit PointMaskGrid(GridT& grid, InterrupterT* interrupter = NULL)
+        : mGrid(&grid)
+        , mInterrupter(interrupter)
+    {
+    }
+
+    /// @brief Activates the state of any voxel in the input grid that contains a point.
+    ///
+    /// @param points    List of points that activate the voxels in the input grid.
+    /// @param grainSize Set the grain-size used for multi-threading. A value of 0
+    ///                  disables multi-threading!
+    template<typename PointListT>
+    void addPoints(const PointListT& points, size_t grainSize = 1024)
+    {
+        if (mInterrupter) mInterrupter->start("PointMaskGrid: adding points");
+        if (grainSize>0) {
+            typename GridT::Ptr examplar = mGrid->copy( CP_NEW );
+            PoolType pool( *examplar );//thread local storage pool of grids
+            AddPoints<PointListT> tmp(points, pool, grainSize, *this );
+            // If interrupted, skip the merge but still fall through to end() below.
+            if ( !this->interrupt() ) { ReducePool reducePool(pool, mGrid, size_t(0)); }
+        } else {
+            const math::Transform& xform = mGrid->transform();
+            typename GridT::Accessor acc = mGrid->getAccessor();
+            Vec3R wPos;
+            for (size_t i = 0, n = points.size(); i < n; ++i) {
+                if ( this->interrupt() ) break;
+                points.getPos(i, wPos);
+                acc.setValueOn( xform.worldToIndexCellCentered( wPos ) );
+            }
+        }
+        if (mInterrupter) mInterrupter->end();
+    }
+
+private:
+    // Disallow copy construction and copy by assignment!
+    PointMaskGrid(const PointMaskGrid&);// not implemented
+    PointMaskGrid& operator=(const PointMaskGrid&);// not implemented
+
+    bool interrupt() const // true (after cancelling the TBB task group) once interrupted
+    {
+        if (mInterrupter && util::wasInterrupted(mInterrupter)) {
+            tbb::task::self().cancel_group_execution();
+            return true;
+        }
+        return false;
+    }
+
+    // Private struct that implements concurrent thread-local
+    // insertion of points into a grid
+    typedef tbb::enumerable_thread_specific<GridT> PoolType;
+    template<typename PointListT> struct AddPoints;
+
+    // Private class that implements concurrent reduction of a thread-local pool
+    struct ReducePool;
+
+    GridT*        mGrid;        // grid whose voxels are activated; never deleted by this class
+    InterrupterT* mInterrupter; // optional interrupter, may be NULL; never deleted by this class
+};// PointMaskGrid
+
+// Private member class that inserts points concurrently into
+// thread-local grids taken from the pool
+template<typename GridT, typename InterrupterT>
+template<typename PointListT>
+struct PointMaskGrid<GridT, InterrupterT>::AddPoints
+{
+    AddPoints(const PointListT& points,
+              PoolType& pool,
+              size_t grainSize,
+              const PointMaskGrid& parent)
+        : mPoints(&points)
+        , mParent(&parent)
+        , mPool(&pool)
+    {   // the constructor itself launches the parallel loop, with this object as the body
+        tbb::parallel_for(tbb::blocked_range<size_t>(0, mPoints->size(), grainSize), *this);
+    }
+    void operator()(const tbb::blocked_range<size_t>& range) const
+    {
+        if (mParent->interrupt()) return;
+        GridT& localGrid = mPool->local();// the pool's grid for the calling thread
+        typename GridT::Accessor accessor = localGrid.getAccessor();
+        const math::Transform& transform = localGrid.transform();
+        Vec3R worldPos;
+        for (size_t idx = range.begin(); idx != range.end(); ++idx) {
+            mPoints->getPos(idx, worldPos);
+            accessor.setValueOn(transform.worldToIndexCellCentered(worldPos));
+        }
+    }
+    const PointListT*    mPoints;
+    const PointMaskGrid* mParent;
+    PoolType*            mPool;
+
+};// end of private member class AddPoints
+
+// Private member class that merges the grids of a thread-local pool into a single grid
+template<typename GridT, typename InterrupterT>
+struct PointMaskGrid<GridT, InterrupterT>::ReducePool
+{
+    typedef std::vector<GridT*>       VecT;
+    typedef typename VecT::iterator   IterT;
+    typedef tbb::blocked_range<IterT> RangeT;
+
+    ReducePool(PoolType& pool, GridT* grid, size_t grainSize = 1)
+        : mOwnsGrid(false)
+        , mGrid(grid)
+    {
+        if ( grainSize != 0 ) {// parallel reduction over pointers to the pooled grids
+            VecT gridPtrs( pool.size() );
+            typename PoolType::iterator poolIt = pool.begin();
+            for (size_t n = 0; n != gridPtrs.size(); ++n, ++poolIt) gridPtrs[n] = &(*poolIt);
+            tbb::parallel_reduce( RangeT( gridPtrs.begin(), gridPtrs.end(), grainSize ), *this );
+        } else {// serial merge of every pooled grid into the target grid
+            typedef typename PoolType::const_iterator PoolCIterT;
+            for (PoolCIterT it = pool.begin(); it != pool.end(); ++it) mGrid->topologyUnion( *it );
+        }
+    }
+
+    ReducePool(const ReducePool&, tbb::split)// splitting constructor: starts from a new, owned grid
+        : mOwnsGrid(true)
+        , mGrid(new GridT())
+    {
+    }
+
+    ~ReducePool() { if (mOwnsGrid) delete mGrid; }
+
+    void operator()(const RangeT& r)
+    {
+        for (IterT g = r.begin(); g != r.end(); ++g) mGrid->topologyUnion( **g );
+    }
+
+    void join(ReducePool& other) { mGrid->topologyUnion(*other.mGrid); }
+
+    const bool mOwnsGrid;// true only for instances created by the splitting constructor
+    GridT*     mGrid;
+};// end of private member class ReducePool
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif //OPENVDB_TOOLS_POINT_MASK_GRID_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/PointPartitioner.h b/nuparu/include/openvdb_new/tools/PointPartitioner.h
new file mode 100644
index 00000000..9ada5210
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/PointPartitioner.h
@@ -0,0 +1,1038 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file    PointPartitioner.h
+///
+/// @brief   Spatially partitions points using a parallel radix-based
+///          sorting algorithm.
+///
+/// @details Performs a stable deterministic sort; partitioning the same
+///          point sequence will produce the same result each time.
+/// @details The algorithm is unbounded meaning that points may be
+///          distributed anywhere in index space.
+/// @details The actual points are never stored in the tool, only
+///          offsets into an external array.
+///
+/// @author  Mihai Alden
+
+#ifndef OPENVDB_TOOLS_POINT_PARTITIONER_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_POINT_PARTITIONER_HAS_BEEN_INCLUDED
+
+
+#include <openvdb/Types.h>
+#include <openvdb/math/Transform.h>
+
+#include <deque>
+#include <map>
+#include <set>
+#include <utility> // std::pair
+#include <vector>
+
+#include <boost/integer.hpp> // boost::int_t<N>::least
+#include <boost/scoped_array.hpp>
+#include <boost/shared_ptr.hpp>
+#include <boost/math/special_functions/fpclassify.hpp> // boost::math::isfinite
+
+#include <tbb/blocked_range.h>
+#include <tbb/parallel_for.h>
+#include <tbb/task_scheduler_init.h>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+
+////////////////////////////////////////
+
+
+/// @brief   Partitions points into @c BucketLog2Dim aligned buckets
+///          using a parallel radix-based sorting algorithm.
+///
+/// @interface PointArray
+/// Expected interface for the PointArray container:
+/// @code
+/// template<typename VectorType>
+/// struct PointArray
+/// {
+///     // The type used to represent world-space point positions
+///     typedef VectorType  PosType;
+///
+///     // Return the number of points in the array
+///     size_t size() const;
+///
+///     // Return the world-space position of the nth point in the array.
+///     void getPos(size_t n, PosType& xyz) const;
+/// };
+/// @endcode
+///
+/// @details Performs a stable deterministic sort; partitioning the same
+///          point sequence will produce the same result each time.
+/// @details The algorithm is unbounded meaning that points may be
+///          distributed anywhere in index space.
+/// @details The actual points are never stored in the tool, only
+///          offsets into an external array.
+/// @details @c BucketLog2Dim defines the bucket coordinate dimensions,
+///          i.e. BucketLog2Dim = 3 corresponds to a bucket that spans
+///          a (2^3)^3 = 8^3 voxel region.
+template<typename PointIndexType = uint32_t, Index BucketLog2Dim = 3>
+class PointPartitioner
+{
+public:
+    enum { LOG2DIM = BucketLog2Dim }; // exposes the BucketLog2Dim template argument
+
+    typedef boost::shared_ptr<PointPartitioner>                     Ptr;
+    typedef boost::shared_ptr<const PointPartitioner>               ConstPtr;
+
+    typedef PointIndexType                                          IndexType;
+    typedef typename boost::int_t<1 + (3 * BucketLog2Dim)>::least   VoxelOffsetType;
+    typedef boost::scoped_array<VoxelOffsetType>                    VoxelOffsetArray;
+
+    class IndexIterator; // iterator type returned by indices()
+
+    //////////
+
+    PointPartitioner();
+
+    /// @brief  Partitions point indices into @c BucketLog2Dim aligned buckets.
+    ///
+    /// @param points               list of world space points.
+    /// @param xform                world to index space transform.
+    /// @param voxelOrder           sort point indices by local voxel offsets.
+    /// @param recordVoxelOffsets   construct local voxel offsets
+    template<typename PointArray>
+    void construct(const PointArray& points, const math::Transform& xform,
+        bool voxelOrder = false, bool recordVoxelOffsets = false);
+
+
+    /// @brief  Partitions point indices into @c BucketLog2Dim aligned buckets
+    ///         and returns the result as a shared pointer (see the @c Ptr typedef).
+    /// @param points               list of world space points.
+    /// @param xform                world to index space transform.
+    /// @param voxelOrder           sort point indices by local voxel offsets.
+    /// @param recordVoxelOffsets   construct local voxel offsets
+    template<typename PointArray>
+    static Ptr create(const PointArray& points, const math::Transform& xform,
+        bool voxelOrder = false, bool recordVoxelOffsets = false);
+
+
+    /// @brief Returns the number of buckets.
+    size_t size() const { return mPageCount; }
+
+    /// @brief Returns @c true if the container holds no buckets, @c false otherwise.
+    bool empty() const { return mPageCount == 0; }
+
+    /// @brief Removes all data and frees up memory.
+    void clear();
+
+    /// @brief Exchanges the content of this container with that of another.
+    void swap(PointPartitioner&);
+
+    /// @brief Returns the point indices for bucket @a n
+    IndexIterator indices(size_t n) const;
+
+    /// @brief Returns the coordinate-aligned bounding box for bucket @a n
+    CoordBBox getBBox(size_t n) const {
+        return CoordBBox::createCube(mPageCoordinates[n], (1u << BucketLog2Dim));
+    }
+
+    /// @brief Returns the origin coordinate for bucket @a n
+    const Coord& origin(size_t n) const  { return mPageCoordinates[n]; }
+
+    /// @brief  Returns a list of @c LeafNode voxel offsets for the points.
+    /// @note   The list is optionally constructed.
+    const VoxelOffsetArray&  voxelOffsets() const { return mVoxelOffsets; }
+
+private:
+    // Disallow copying
+    PointPartitioner(const PointPartitioner&);
+    PointPartitioner& operator=(const PointPartitioner&);
+
+    boost::scoped_array<IndexType>  mPointIndices;
+    VoxelOffsetArray                mVoxelOffsets; // returned by voxelOffsets()
+
+    boost::scoped_array<IndexType>  mPageOffsets;
+    boost::scoped_array<Coord>      mPageCoordinates; // per-bucket origin, see origin() and getBBox()
+    IndexType mPageCount; // number of buckets, as reported by size()
+}; // class PointPartitioner
+
+
+typedef PointPartitioner<uint32_t, 3> UInt32PointPartitioner;
+
+
+template<typename PointIndexType, Index BucketLog2Dim>
+class PointPartitioner<PointIndexType, BucketLog2Dim>::IndexIterator
+{
+public:
+    typedef PointIndexType     IndexType;
+
+    IndexIterator(IndexType* begin = NULL, IndexType* end = NULL)
+        : mBegin(begin), mEnd(end), mItem(begin) {}
+
+    /// @brief  Move back to the first item of the range.
+    void reset() { mItem = mBegin; }
+
+    /// @brief  Total number of point indices in the range (independent of the position).
+    size_t size() const { return size_t(mEnd - mBegin); }
+
+    /// @brief  Access the item at the current position.
+    IndexType& operator*() { assert(mItem != NULL); return *mItem; }
+    const IndexType& operator*() const { assert(mItem != NULL); return *mItem; }
+
+    /// @brief  Return @c true while the current position is before the end of the range.
+    operator bool() const { return this->test(); }
+    bool test() const { return mItem < mEnd; }
+
+    /// @brief  Pre-increment: step to the next item (asserts that the range is not exhausted).
+    IndexIterator& operator++() { assert(this->test()); ++mItem; return *this; }
+
+    /// @brief  Step to the next item and return @c true if the new position is still valid.
+    bool next() { ++(*this); return this->test(); }
+    bool increment() { return this->next(); }
+
+    /// @brief  Two iterators compare equal when they point to the same item.
+    bool operator==(const IndexIterator& other) const { return mItem == other.mItem; }
+    bool operator!=(const IndexIterator& other) const { return mItem != other.mItem; }
+
+private:
+    IndexType * const mBegin, * const mEnd;
+    IndexType * mItem;
+}; // class PointPartitioner::IndexIterator
+
+
+////////////////////////////////////////
+////////////////////////////////////////
+
+// Implementation details
+
+
+namespace point_partitioner_internal {
+
+
+template<typename PointIndexType>
+struct ComputePointOrderOp
+{
+    ComputePointOrderOp(PointIndexType* pointOrder,
+        const PointIndexType* bucketCounters, const PointIndexType* bucketOffsets)
+        : mPointOrder(pointOrder)
+        , mBucketCounters(bucketCounters)
+        , mBucketOffsets(bucketOffsets)
+    {
+    }
+    // Adds to each point's order value the counter selected by its entry in mBucketOffsets.
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+        for (size_t pointIdx = range.begin(); pointIdx != range.end(); ++pointIdx) {
+            const PointIndexType bucket = mBucketOffsets[pointIdx];
+            mPointOrder[pointIdx] += mBucketCounters[bucket];
+        }
+    }
+    PointIndexType       * const mPointOrder;
+    PointIndexType const * const mBucketCounters;
+    PointIndexType const * const mBucketOffsets;
+}; // struct ComputePointOrderOp
+
+
+template<typename PointIndexType>
+struct CreateOrderedPointIndexArrayOp
+{
+    CreateOrderedPointIndexArrayOp(PointIndexType* orderedIndexArray,
+        const PointIndexType* pointOrder, const PointIndexType* indices)
+        : mOrderedIndexArray(orderedIndexArray)
+        , mPointOrder(pointOrder)
+        , mIndices(indices)
+    {
+    }
+    // Scatter: writes each input index to the output slot given by its entry in mPointOrder.
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+        for (size_t src = range.begin(); src != range.end(); ++src) {
+            const PointIndexType dst = mPointOrder[src];
+            mOrderedIndexArray[dst] = mIndices[src];
+        }
+    }
+    PointIndexType       * const mOrderedIndexArray;
+    PointIndexType const * const mPointOrder;
+    PointIndexType const * const mIndices;
+}; // struct CreateOrderedPointIndexArrayOp
+
+
+template<typename PointIndexType, Index BucketLog2Dim>
+struct VoxelOrderOp // counting sort of each page's point indices by their voxel offset
+{
+    typedef typename boost::int_t<1 + (3 * BucketLog2Dim)>::least     VoxelOffsetType;
+    typedef boost::scoped_array<VoxelOffsetType>                VoxelOffsetArray;
+    typedef boost::scoped_array<PointIndexType>                 IndexArray;
+
+    VoxelOrderOp(IndexArray& indices, const IndexArray& pages,const VoxelOffsetArray& offsets)
+        : mIndices(indices.get())
+        , mPages(pages.get())
+        , mVoxelOffsets(offsets.get())
+    {
+    }
+
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+
+        PointIndexType pointCount = 0; // largest page size in this range (sizes the scratch buffers)
+        for (size_t n(range.begin()), N(range.end()); n != N; ++n) {
+            pointCount = std::max(pointCount, (mPages[n + 1] - mPages[n]));
+        }
+
+        const PointIndexType voxelCount = 1 << (3 * BucketLog2Dim); // number of histogram bins
+
+        // allocate scratch buffers once per range; they are reused for every page
+        boost::scoped_array<VoxelOffsetType> offsets(new VoxelOffsetType[pointCount]);
+        boost::scoped_array<PointIndexType> sortedIndices(new PointIndexType[pointCount]);
+        boost::scoped_array<PointIndexType> histogram(new PointIndexType[voxelCount]);
+
+        for (size_t n(range.begin()), N(range.end()); n != N; ++n) {
+
+            PointIndexType * const indices = mIndices + mPages[n]; // first index of page n
+            pointCount = mPages[n + 1] - mPages[n]; // number of indices in page n
+
+            // local copy of the voxel offset of each point in this page
+            for (PointIndexType i = 0; i < pointCount; ++i) {
+                offsets[i] = mVoxelOffsets[ indices[i] ];
+            }
+
+            // reset histogram
+            memset(&histogram[0], 0, voxelCount * sizeof(PointIndexType));
+
+            // compute histogram: number of points per voxel offset
+            for (PointIndexType i = 0; i < pointCount; ++i) {
+                ++histogram[ offsets[i] ];
+            }
+
+            PointIndexType count = 0, startOffset; // convert counts into start positions
+            for (int i = 0; i < int(voxelCount); ++i) {
+                if (histogram[i] > 0) {
+                    startOffset = count;
+                    count += histogram[i];
+                    histogram[i] = startOffset;
+                }
+            }
+
+            // scatter indices into voxel-offset order; equal offsets keep their relative order
+            for (PointIndexType i = 0; i < pointCount; ++i) {
+                sortedIndices[ histogram[ offsets[i] ]++ ] = indices[i];
+            }
+
+            memcpy(&indices[0], &sortedIndices[0], sizeof(PointIndexType) * pointCount); // write back
+        }
+    }
+
+    PointIndexType        * const mIndices;
+    PointIndexType  const * const mPages;
+    VoxelOffsetType const * const mVoxelOffsets;
+}; // struct VoxelOrderOp
+
+
+template<typename PointArray, typename PointIndexType>
+struct LeafNodeOriginOp // stores, per page, the mask-aligned index coordinate of one of its points
+{
+    typedef boost::scoped_array<PointIndexType>     IndexArray;
+    typedef boost::scoped_array<Coord>              CoordArray;
+
+    LeafNodeOriginOp(CoordArray& coordinates,
+        const IndexArray& indices, const IndexArray& pages,
+        const PointArray& points, const math::Transform& m, int log2dim)
+        : mCoordinates(coordinates.get())
+        , mIndices(indices.get())
+        , mPages(pages.get())
+        , mPoints(&points)
+        , mXForm(m)
+        , mLog2Dim(log2dim)
+    {
+    }
+
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+
+        typedef typename PointArray::PosType  PosType;
+
+        const int mask = ~((1 << mLog2Dim) - 1); // clears the low mLog2Dim bits of a coordinate
+        Coord ijk;
+        PosType pos;
+
+        for (size_t n = range.begin(), N = range.end(); n != N; ++n) {
+
+            mPoints->getPos(mIndices[mPages[n]], pos); // presumably the first point of page n
+
+            if (boost::math::isfinite(pos[0]) && // non-finite positions leave mCoordinates[n] as is
+                boost::math::isfinite(pos[1]) &&
+                boost::math::isfinite(pos[2])) {
+
+                ijk = mXForm.worldToIndexCellCentered(pos);
+
+                ijk[0] &= mask;
+                ijk[1] &= mask;
+                ijk[2] &= mask;
+
+                mCoordinates[n] = ijk;
+            }
+        }
+    }
+
+    Coord                 * const mCoordinates;
+    PointIndexType  const * const mIndices;
+    PointIndexType  const * const mPages;
+    PointArray      const * const mPoints;
+    math::Transform         const mXForm; // held by value (copied from the constructor argument)
+    int                     const mLog2Dim;
+}; // struct LeafNodeOriginOp
+
+
+////////////////////////////////////////
+
+
+template<typename T>
+struct Array
+{
+    typedef boost::shared_ptr<Array> Ptr;
+
+    // Allocates storage for @a size elements with new T[size].
+    Array(size_t size) : mSize(size), mData(new T[size]) { }
+    // Number of elements currently held (zero after clear()).
+    size_t size() const { return mSize; }
+    // Raw pointer to the start of the element storage.
+    T* data() { return mData.get(); }
+    const T* data() const { return mData.get(); }
+    // Releases the storage and sets the size to zero.
+    void clear() { mData.reset(); mSize = 0; }
+private:
+    size_t                  mSize;
+    boost::scoped_array<T>  mData;
+}; // struct Array
+
+
+template<typename PointIndexType>
+struct MoveSegmentDataOp // copies each segment into its destination list, then releases the segment
+{
+    typedef Array<PointIndexType>   Segment;
+    typedef typename Segment::Ptr   SegmentPtr;
+
+    MoveSegmentDataOp(std::vector<PointIndexType*>& indexLists, SegmentPtr* segments)
+        : mIndexLists(&indexLists[0]), mSegments(segments)
+    {
+    }
+
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+        for (size_t n(range.begin()), N(range.end()); n != N; ++n) {
+            PointIndexType* indices = mIndexLists[n]; // destination for segment n
+            SegmentPtr& segment = mSegments[n];
+
+            tbb::parallel_for(tbb::blocked_range<size_t>(0, segment->size()),
+                CopyData(indices, segment->data()));
+
+            segment.reset(); // drop this reference to the segment once its data has been copied
+        }
+    }
+
+private:
+
+    struct CopyData // element-wise copy of mRhs[n] into mLhs[n] over a range
+    {
+        CopyData(PointIndexType* lhs, const PointIndexType* rhs) : mLhs(lhs), mRhs(rhs) { }
+
+        void operator()(const tbb::blocked_range<size_t>& range) const {
+            for (size_t n = range.begin(), N = range.end(); n != N; ++n) {
+                mLhs[n] = mRhs[n];
+            }
+        }
+
+        PointIndexType       * const mLhs;
+        PointIndexType const * const mRhs;
+    };
+
+    PointIndexType * const * const mIndexLists;
+    SegmentPtr             * const mSegments;
+}; // struct MoveSegmentDataOp
+
+
+template<typename PointIndexType>
+struct MergeBinsOp // per coordinate, flattens the matching index-pair lists of all bins into segments
+{
+    typedef Array<PointIndexType>                       Segment;
+    typedef typename Segment::Ptr                       SegmentPtr;
+
+    typedef std::pair<PointIndexType, PointIndexType>   IndexPair;
+    typedef std::deque<IndexPair>                       IndexPairList;
+    typedef boost::shared_ptr<IndexPairList>            IndexPairListPtr;
+    typedef std::map<Coord, IndexPairListPtr>           IndexPairListMap;
+    typedef boost::shared_ptr<IndexPairListMap>         IndexPairListMapPtr;
+
+    MergeBinsOp(IndexPairListMapPtr* bins,
+        SegmentPtr* indexSegments,
+        SegmentPtr* offsetSegments,
+        Coord* coords,
+        size_t numSegments)
+        : mBins(bins)
+        , mIndexSegments(indexSegments)
+        , mOffsetSegments(offsetSegments)
+        , mCoords(coords)
+        , mNumSegments(numSegments)
+    {
+    }
+
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+
+        std::vector<IndexPairListPtr*> data; // lists found for the current coordinate (reused)
+        std::vector<PointIndexType> arrayOffsets; // start position of each list in the merged arrays
+
+        for (size_t n = range.begin(), N = range.end(); n != N; ++n) {
+
+            const Coord& ijk = mCoords[n];
+            size_t numIndices = 0; // total number of pairs recorded for ijk across all bins
+
+            data.clear();
+
+            for (size_t i = 0, I = mNumSegments; i < I; ++i) {
+
+                IndexPairListMap& idxMap = *mBins[i];
+                typename IndexPairListMap::iterator iter = idxMap.find(ijk);
+
+                if (iter != idxMap.end() && iter->second) { // bin i has a non-null list for ijk
+                    IndexPairListPtr& idxListPtr = iter->second;
+
+                    data.push_back(&idxListPtr);
+                    numIndices += idxListPtr->size();
+                }
+            }
+
+            if (data.empty() || numIndices == 0) continue; // nothing recorded for this coordinate
+
+            SegmentPtr& indexSegment = mIndexSegments[n];
+            SegmentPtr& offsetSegment = mOffsetSegments[n];
+
+            indexSegment.reset(new Segment(numIndices));
+            offsetSegment.reset(new Segment(numIndices));
+
+            arrayOffsets.clear();
+            arrayOffsets.reserve(data.size());
+
+            for (size_t i = 0, count = 0, I = data.size(); i < I; ++i) { // running sum of list sizes
+                arrayOffsets.push_back(PointIndexType(count));
+                count += (*data[i])->size();
+            }
+
+            tbb::parallel_for(tbb::blocked_range<size_t>(0, data.size()),
+                CopyData(&data[0], &arrayOffsets[0], indexSegment->data(), offsetSegment->data()));
+        }
+    }
+
+private:
+
+    struct CopyData // copies each list's pairs into the two flat arrays at that list's offset
+    {
+        CopyData(IndexPairListPtr** indexLists,
+            const PointIndexType* arrayOffsets,
+            PointIndexType* indices,
+            PointIndexType* offsets)
+            : mIndexLists(indexLists)
+            , mArrayOffsets(arrayOffsets)
+            , mIndices(indices)
+            , mOffsets(offsets)
+        {
+        }
+
+        void operator()(const tbb::blocked_range<size_t>& range) const {
+
+            typedef typename IndexPairList::const_iterator CIter;
+
+            for (size_t n = range.begin(), N = range.end(); n != N; ++n) {
+
+                const PointIndexType arrayOffset = mArrayOffsets[n];
+                PointIndexType* indexPtr = &mIndices[arrayOffset];
+                PointIndexType* offsetPtr = &mOffsets[arrayOffset];
+
+                IndexPairListPtr& list = *mIndexLists[n];
+
+                for (CIter it = list->begin(), end = list->end(); it != end; ++it) {
+                    const IndexPair& data = *it;
+                    *indexPtr++ = data.first; // first member goes to the index array
+                    *offsetPtr++ = data.second; // second member goes to the offset array
+                }
+
+                list.reset(); // drop the list once its pairs have been copied
+            }
+        }
+
+        IndexPairListPtr * const * const mIndexLists;
+        PointIndexType     const * const mArrayOffsets;
+        PointIndexType           * const mIndices;
+        PointIndexType           * const mOffsets;
+    }; // struct CopyData
+
+    IndexPairListMapPtr       * const mBins;
+    SegmentPtr                * const mIndexSegments;
+    SegmentPtr                * const mOffsetSegments;
+    Coord               const * const mCoords;
+    size_t                      const mNumSegments;
+}; // struct MergeBinsOp
+
+
+template<typename PointArray, typename PointIndexType, typename VoxelOffsetType>
+struct BinPointIndicesOp
+{ // Functor that bins point indices, per task, into lists keyed by bin coordinate.
+    typedef typename PointArray::PosType                PosType;
+    typedef std::pair<PointIndexType, PointIndexType>   IndexPair; // (point index, bucket offset)
+    typedef std::deque<IndexPair>                       IndexPairList;
+    typedef boost::shared_ptr<IndexPairList>            IndexPairListPtr;
+    typedef std::map<Coord, IndexPairListPtr>           IndexPairListMap;
+    typedef boost::shared_ptr<IndexPairListMap>         IndexPairListMapPtr;
+    /// @param voxelOffsets  may be NULL; if set, element i receives the voxel offset of each finite point i.
+    BinPointIndicesOp(IndexPairListMapPtr* data,
+        const PointArray& points,
+        VoxelOffsetType* voxelOffsets,
+        const math::Transform& m,
+        Index binLog2Dim,
+        Index bucketLog2Dim,
+        size_t numSegments)
+        : mData(data)
+        , mPoints(&points)
+        , mVoxelOffsets(voxelOffsets)
+        , mXForm(m)
+        , mBinLog2Dim(binLog2Dim)
+        , mBucketLog2Dim(bucketLog2Dim)
+        , mNumSegments(numSegments)
+    {
+    }
+    /// Bins segment n of the point array into mData[n] for every n in @a range.
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+
+        const Index log2dim = mBucketLog2Dim;
+        const Index log2dim2 = 2 * log2dim;
+        const Index bucketMask = (1u << log2dim) - 1u; // low bits: voxel position inside a bucket
+
+        const Index binLog2dim = mBinLog2Dim;
+        const Index binLog2dim2 = 2 * binLog2dim;
+
+        const Index binMask = (1u << (log2dim + binLog2dim)) - 1u; // low bits: position inside a bin
+        const Index invBinMask = ~binMask; // high bits: the bin coordinate
+        // idxList/lastBinCoord cache the last bin used; (1, 2, 3) is presumably a start value no real bin has.
+        IndexPairList * idxList = NULL;
+        Coord ijk(0, 0, 0), loc(0, 0, 0), binCoord(0, 0, 0), lastBinCoord(1, 2, 3);
+        PosType pos;
+
+        PointIndexType bucketOffset = 0;
+        VoxelOffsetType voxelOffset = 0;
+        // Points are split evenly into mNumSegments runs; the last run also takes the remainder.
+        const size_t numPoints = mPoints->size();
+        const size_t segmentSize = numPoints / mNumSegments;
+
+        for (size_t n = range.begin(), N = range.end(); n != N; ++n) {
+            // NOTE(review): idxList/lastBinCoord carry over from the previous n, so a point may be appended to that segment's list - confirm intended.
+            IndexPairListMapPtr& dataPtr = mData[n];
+            if (!dataPtr) dataPtr.reset(new IndexPairListMap());
+            IndexPairListMap& idxMap = *dataPtr;
+
+            const bool isLastSegment = (n + 1) >= mNumSegments;
+
+            const size_t start = n * segmentSize;
+            const size_t end = isLastSegment ? numPoints : (start + segmentSize);
+
+            for (size_t i = start; i != end; ++i) {
+
+                mPoints->getPos(i, pos);
+                // Points with a non-finite coordinate are skipped entirely.
+                if (boost::math::isfinite(pos[0]) &&
+                    boost::math::isfinite(pos[1]) &&
+                    boost::math::isfinite(pos[2])) {
+
+                    ijk = mXForm.worldToIndexCellCentered(pos);
+
+                    if (mVoxelOffsets) {
+                        loc[0] = ijk[0] & bucketMask;
+                        loc[1] = ijk[1] & bucketMask;
+                        loc[2] = ijk[2] & bucketMask;
+                        voxelOffset = VoxelOffsetType((loc[0] << log2dim2) + (loc[1] << log2dim) + loc[2]);
+                    }
+                    // Split ijk into the bin coordinate (high bits) and the bucket position inside the bin.
+                    binCoord[0] = ijk[0] & invBinMask;
+                    binCoord[1] = ijk[1] & invBinMask;
+                    binCoord[2] = ijk[2] & invBinMask;
+
+                    ijk[0] &= binMask;
+                    ijk[1] &= binMask;
+                    ijk[2] &= binMask;
+
+                    ijk[0] >>= log2dim;
+                    ijk[1] >>= log2dim;
+                    ijk[2] >>= log2dim;
+
+                    bucketOffset = PointIndexType((ijk[0] << binLog2dim2) + (ijk[1] << binLog2dim) + ijk[2]);
+                    // Only look the list up in the map when the bin differs from the previous point's bin.
+                    if (lastBinCoord != binCoord) {
+                        lastBinCoord = binCoord;
+                        IndexPairListPtr& idxListPtr = idxMap[lastBinCoord];
+                        if (!idxListPtr) idxListPtr.reset(new IndexPairList());
+                        idxList = idxListPtr.get();
+                    }
+
+                    idxList->push_back(IndexPair(PointIndexType(i), bucketOffset));
+                    if (mVoxelOffsets) mVoxelOffsets[i] = voxelOffset;
+                }
+            }
+        }
+    }
+
+    IndexPairListMapPtr        * const mData;
+    PointArray           const * const mPoints;
+    VoxelOffsetType            * const mVoxelOffsets;
+    math::Transform              const mXForm;
+    Index                        const mBinLog2Dim;
+    Index                        const mBucketLog2Dim;
+    size_t                       const mNumSegments;
+}; // struct BinPointIndicesOp
+
+
+template<typename PointIndexType>
+struct OrderSegmentsOp
+{ // Functor that orders the points of each segment by bucket and records per-bucket point counts.
+    typedef boost::scoped_array<PointIndexType>     IndexArray;
+    typedef typename Array<PointIndexType>::Ptr     SegmentPtr;
+
+    OrderSegmentsOp(SegmentPtr* indexSegments, SegmentPtr* offestSegments,
+        IndexArray* pageOffsetArrays, Index binVolume)
+        : mIndexSegments(indexSegments)
+        , mOffsetSegments(offestSegments)
+        , mPageOffsetArrays(pageOffsetArrays)
+        , mBinVolume(binVolume)
+    {
+    }
+    /// Processes every segment n in @a range and allocates/fills mPageOffsetArrays[n].
+    void operator()(const tbb::blocked_range<size_t>& range) const {
+
+        const size_t bucketCountersSize = size_t(mBinVolume);
+        IndexArray bucketCounters(new PointIndexType[bucketCountersSize]);
+        // Size the scratch array for the largest segment in this range so it can be reused for all of them.
+        size_t maxSegmentSize = 0;
+        for (size_t n = range.begin(), N = range.end(); n != N; ++n) {
+            maxSegmentSize = std::max(maxSegmentSize, mIndexSegments[n]->size());
+        }
+
+        IndexArray bucketIndices(new PointIndexType[maxSegmentSize]);
+
+
+        for (size_t n = range.begin(), N = range.end(); n != N; ++n) {
+
+            memset(bucketCounters.get(), 0, sizeof(PointIndexType) * bucketCountersSize);
+
+            const size_t segmentSize = mOffsetSegments[n]->size();
+            PointIndexType* offsets = mOffsetSegments[n]->data();
+
+            // Count the number of points per bucket and assign a local bucket index
+            // to each point.
+            for (size_t i = 0; i < segmentSize; ++i) {
+                bucketIndices[i] = bucketCounters[offsets[i]]++;
+            }
+
+            PointIndexType nonemptyBucketCount = 0;
+            for (size_t i = 0; i < bucketCountersSize; ++i) {
+                nonemptyBucketCount += static_cast<PointIndexType>(bucketCounters[i] != 0);
+            }
+
+
+            IndexArray& pageOffsets = mPageOffsetArrays[n];
+            pageOffsets.reset(new PointIndexType[nonemptyBucketCount + 1]);
+            pageOffsets[0] = nonemptyBucketCount + 1; // stores array size in first element
+
+            // Compute bucket counter prefix sum; the per-bucket counts move to pageOffsets[1..].
+            PointIndexType count = 0, idx = 1;
+            for (size_t i = 0; i < bucketCountersSize; ++i) {
+                if (bucketCounters[i] != 0) {
+                    pageOffsets[idx] = bucketCounters[i];
+                    bucketCounters[i] = count;
+                    count += pageOffsets[idx];
+                    ++idx;
+                }
+            }
+
+            PointIndexType* indices = mIndexSegments[n]->data();
+            const tbb::blocked_range<size_t> segmentRange(0, segmentSize);
+
+            // Compute final point order by incrementing the local bucket point index
+            // with the prefix sum offset.
+            tbb::parallel_for(segmentRange, ComputePointOrderOp<PointIndexType>(
+                bucketIndices.get(), bucketCounters.get(), offsets));
+            // NOTE(review): presumably writes the ordered point indices into offsets, since indices is cleared below - confirm.
+            tbb::parallel_for(segmentRange, CreateOrderedPointIndexArrayOp<PointIndexType>(
+                offsets, bucketIndices.get(), indices));
+
+            mIndexSegments[n]->clear(); // clear data
+        }
+    }
+
+    SegmentPtr * const mIndexSegments;
+    SegmentPtr * const mOffsetSegments;
+    IndexArray * const mPageOffsetArrays;
+    Index        const mBinVolume;
+}; // struct OrderSegmentsOp
+
+
+////////////////////////////////////////
+
+
+/// @brief Segment points using one level of least significant digit radix bins.
+template<typename PointIndexType, typename VoxelOffsetType, typename PointArray>
+inline void binAndSegment(
+    const PointArray& points,
+    const math::Transform& xform,
+    boost::scoped_array<typename Array<PointIndexType>::Ptr>& indexSegments,  ///< [out] one array per bin
+    boost::scoped_array<typename Array<PointIndexType>::Ptr>& offsetSegments, ///< [out] one array per bin
+    size_t& segmentCount,                                                     ///< [out] number of bins
+    const Index binLog2Dim,
+    const Index bucketLog2Dim,
+    VoxelOffsetType* voxelOffsets = NULL) ///< optional per-point output, forwarded to the binning functor
+{
+    typedef std::pair<PointIndexType, PointIndexType>   IndexPair;
+    typedef std::deque<IndexPair>                       IndexPairList;
+    typedef boost::shared_ptr<IndexPairList>            IndexPairListPtr;
+    typedef std::map<Coord, IndexPairListPtr>           IndexPairListMap;
+    typedef boost::shared_ptr<IndexPairListMap>         IndexPairListMapPtr;
+    // Use up to two tasks per thread, but a single task when there are no more points than threads.
+    size_t numTasks = 1, numThreads = size_t(tbb::task_scheduler_init::default_num_threads());
+    if (points.size() > (numThreads * 2)) numTasks = numThreads * 2;
+    else if (points.size() > numThreads) numTasks = numThreads;
+
+    boost::scoped_array<IndexPairListMapPtr> bins(new IndexPairListMapPtr[numTasks]);
+
+    typedef BinPointIndicesOp<PointArray, PointIndexType, VoxelOffsetType> BinOp;
+    // Phase 1: every task bins its share of the points into its own map (bins[task]).
+    tbb::parallel_for(tbb::blocked_range<size_t>(0, numTasks),
+        BinOp(bins.get(), points, voxelOffsets, xform, binLog2Dim, bucketLog2Dim, numTasks));
+    // Phase 2: gather the sorted set of bin coordinates that occur in any task's map.
+    std::set<Coord> uniqueCoords;
+
+    for (size_t i = 0; i < numTasks; ++i) {
+        IndexPairListMap& idxMap = *bins[i];
+        for (typename IndexPairListMap::iterator it = idxMap.begin(); it != idxMap.end(); ++it) {
+            uniqueCoords.insert(it->first);
+        }
+    }
+
+    std::vector<Coord> coords(uniqueCoords.begin(), uniqueCoords.end());
+    uniqueCoords.clear();
+
+    segmentCount = coords.size();
+
+    typedef typename Array<PointIndexType>::Ptr SegmentPtr;
+
+    indexSegments.reset(new SegmentPtr[segmentCount]);
+    offsetSegments.reset(new SegmentPtr[segmentCount]);
+    // Phase 3: merge the per-task lists of each bin. With no bins, do not index the empty vector (undefined).
+    typedef MergeBinsOp<PointIndexType> MergeOp;
+    Coord* const coordArray = coords.empty() ? static_cast<Coord*>(NULL) : &coords[0];
+    tbb::parallel_for(tbb::blocked_range<size_t>(0, segmentCount),
+        MergeOp(bins.get(), indexSegments.get(), offsetSegments.get(), coordArray, numTasks));
+}
+
+
+template<typename PointIndexType, typename VoxelOffsetType, typename PointArray>
+inline void partition(
+    const PointArray& points,
+    const math::Transform& xform,
+    const Index bucketLog2Dim,
+    boost::scoped_array<PointIndexType>& pointIndices,
+    boost::scoped_array<PointIndexType>& pageOffsets,
+    PointIndexType& pageCount,
+    boost::scoped_array<VoxelOffsetType>& voxelOffsets,
+    bool recordVoxelOffsets)
+{ // Fills pointIndices, pageOffsets (pageCount + 1 entries), pageCount and, optionally, voxelOffsets.
+    if (recordVoxelOffsets) voxelOffsets.reset(new VoxelOffsetType[points.size()]);
+    else  voxelOffsets.reset();
+
+    const Index binLog2Dim = 5u;
+    // note: Bins span a (2^(binLog2Dim + bucketLog2Dim))^3 voxel region,
+    //       i.e. bucketLog2Dim = 3 and binLog2Dim = 5 corresponds to a
+    //       (2^8)^3 = 256^3 voxel region.
+
+
+    size_t numSegments = 0;
+
+    boost::scoped_array<typename Array<PointIndexType>::Ptr> indexSegments;
+    boost::scoped_array<typename Array<PointIndexType>::Ptr> offestSegments;
+
+    binAndSegment<PointIndexType, VoxelOffsetType, PointArray>(points, xform,
+        indexSegments, offestSegments, numSegments, binLog2Dim, bucketLog2Dim, voxelOffsets.get());
+
+    const tbb::blocked_range<size_t> segmentRange(0, numSegments);
+
+    typedef boost::scoped_array<PointIndexType> IndexArray;
+    boost::scoped_array<IndexArray> pageOffsetArrays(new IndexArray[numSegments]);
+
+    const Index binVolume = 1u << (3u * binLog2Dim);
+    // Order each segment by bucket; pageOffsetArrays[n] becomes { array size, point count per non-empty bucket... }.
+    tbb::parallel_for(segmentRange, OrderSegmentsOp<PointIndexType>
+        (indexSegments.get(), offestSegments.get(), pageOffsetArrays.get(), binVolume));
+
+    indexSegments.reset();
+    // Element 0 of each array holds its size, so size - 1 is the number of pages in that segment.
+    pageCount = 0;
+    for (size_t n = 0; n < numSegments; ++n) {
+        pageCount += pageOffsetArrays[n][0] - 1;
+    }
+
+    pageOffsets.reset(new PointIndexType[pageCount + 1]);
+    // Turn the per-page point counts into start offsets (running sum over all segments).
+    PointIndexType count = 0;
+    for (size_t n = 0, idx = 0; n < numSegments; ++n) {
+
+        PointIndexType* offsets = pageOffsetArrays[n].get();
+        size_t size = size_t(offsets[0]);
+
+        for (size_t i = 1; i < size; ++i) {
+            pageOffsets[idx++] = count;
+            count += offsets[i];
+        }
+    }
+
+    pageOffsets[pageCount] = count; // final entry: total of all per-page counts
+
+    pointIndices.reset(new PointIndexType[points.size()]);
+    // indexArray[n] is where segment n's data starts inside pointIndices (segments are laid out back to back).
+    std::vector<PointIndexType*> indexArray;
+    indexArray.reserve(numSegments);
+
+    PointIndexType* index = pointIndices.get();
+    for (size_t n = 0; n < numSegments; ++n) {
+        indexArray.push_back(index);
+        index += offestSegments[n]->size();
+    }
+    // NOTE(review): MoveSegmentDataOp presumably copies offestSegments[n] to indexArray[n] - confirm.
+    tbb::parallel_for(segmentRange, MoveSegmentDataOp<PointIndexType>(indexArray, offestSegments.get()));
+}
+
+
+} // namespace point_partitioner_internal
+
+
+////////////////////////////////////////
+
+
+template<typename PointIndexType, Index BucketLog2Dim>
+inline
+PointPartitioner<PointIndexType, BucketLog2Dim>::PointPartitioner()
+    : mPointIndices(NULL), mVoxelOffsets(NULL)
+    , mPageOffsets(NULL), mPageCoordinates(NULL)
+    , mPageCount(0)
+{
+    // Starts out empty: every array is null and the page count is zero.
+}
+
+
+template<typename PointIndexType, Index BucketLog2Dim>
+inline void PointPartitioner<PointIndexType, BucketLog2Dim>::clear()
+{
+    // Release every owned array, then zero the page count.
+    mPageCoordinates.reset();
+    mPageOffsets.reset();
+    mVoxelOffsets.reset();
+    mPointIndices.reset();
+    mPageCount = 0;
+}
+
+
+template<typename PointIndexType, Index BucketLog2Dim>
+inline void
+PointPartitioner<PointIndexType, BucketLog2Dim>::swap(PointPartitioner& rhs)
+{
+    // Exchange the owned arrays, then the page count (through a temporary).
+    mPageCoordinates.swap(rhs.mPageCoordinates);
+    mPageOffsets.swap(rhs.mPageOffsets);
+    mVoxelOffsets.swap(rhs.mVoxelOffsets);
+    mPointIndices.swap(rhs.mPointIndices);
+    const IndexType previousRhsPageCount = rhs.mPageCount;
+    rhs.mPageCount = mPageCount;
+    mPageCount = previousRhsPageCount;
+}
+
+
+template<typename PointIndexType, Index BucketLog2Dim>
+inline typename PointPartitioner<PointIndexType, BucketLog2Dim>::IndexIterator
+PointPartitioner<PointIndexType, BucketLog2Dim>::indices(size_t n) const
+{ // Returns an iterator over the point indices of page n; n itself is not range-checked here.
+    assert(bool(mPointIndices) && bool(mPageCount)); // the partitioner must not be empty
+    return IndexIterator(
+        mPointIndices.get() + mPageOffsets[n],      // first index of page n
+        mPointIndices.get() + mPageOffsets[n + 1]); // one past the last index of page n
+}
+
+
+template<typename PointIndexType, Index BucketLog2Dim>
+template<typename PointArray>
+inline void
+PointPartitioner<PointIndexType, BucketLog2Dim>::construct(const PointArray& points,
+    const math::Transform& xform, bool voxelOrder, bool recordVoxelOffsets)
+{ // Rebuilds all member arrays from points; voxel offsets are computed if either flag is set.
+    point_partitioner_internal::partition(points, xform, BucketLog2Dim,
+        mPointIndices, mPageOffsets, mPageCount, mVoxelOffsets, (voxelOrder || recordVoxelOffsets));
+
+    const tbb::blocked_range<size_t> pageRange(0, mPageCount);
+    mPageCoordinates.reset(new Coord[mPageCount]);
+    // NOTE(review): LeafNodeOriginOp presumably fills mPageCoordinates with one coordinate per page - confirm.
+    tbb::parallel_for(pageRange,
+        point_partitioner_internal::LeafNodeOriginOp<PointArray, IndexType>
+            (mPageCoordinates, mPointIndices, mPageOffsets, points, xform, BucketLog2Dim));
+    // NOTE(review): VoxelOrderOp presumably reorders the indices of each page by voxel offset - confirm.
+    if (mVoxelOffsets && voxelOrder) {
+        tbb::parallel_for(pageRange, point_partitioner_internal::VoxelOrderOp<
+            IndexType, BucketLog2Dim>(mPointIndices, mPageOffsets, mVoxelOffsets));
+    }
+    // Offsets computed only for voxel ordering are dropped unless the caller asked to keep them.
+    if (mVoxelOffsets && !recordVoxelOffsets) {
+        mVoxelOffsets.reset();
+    }
+}
+
+
+template<typename PointIndexType, Index BucketLog2Dim>
+template<typename PointArray>
+inline typename PointPartitioner<PointIndexType, BucketLog2Dim>::Ptr
+PointPartitioner<PointIndexType, BucketLog2Dim>::create(
+    const PointArray& points, const math::Transform& xform, bool voxelOrder, bool recordVoxelOffsets)
+{
+    Ptr partitioner(new PointPartitioner()); // starts empty; construct() fills it in
+    partitioner->construct(points, xform, voxelOrder, recordVoxelOffsets);
+    return partitioner;
+}
+
+
+////////////////////////////////////////
+
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+
+#endif // OPENVDB_TOOLS_POINT_PARTITIONER_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/PointScatter.h b/nuparu/include/openvdb_new/tools/PointScatter.h
new file mode 100644
index 00000000..6ba9e76b
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/PointScatter.h
@@ -0,0 +1,437 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @author Ken Museth
+///
+/// @file PointScatter.h
+///
+/// @brief We offer three different algorithms (each in its own class)
+///        for scattering points in active voxels:
+///
+/// 1) UniformPointScatter. Has two modes: Either randomly distributes
+///    a fixed number of points in the active voxels, or the user can
+///    specify a fixed probability of having a point per unit of volume.
+///
+/// 2) DenseUniformPointScatter. Randomly distributes points in active
+///    voxels using a fixed number of points per voxel.
+///
+/// 3) NonUniformPointScatter. Define the local probability of having
+///    a point in a voxel as the product of a global density and the
+///    value of the voxel itself.
+
+#ifndef OPENVDB_TOOLS_POINT_SCATTER_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_POINT_SCATTER_HAS_BEEN_INCLUDED
+
+#include <openvdb/Types.h>
+#include <openvdb/Grid.h>
+#include <openvdb/math/Math.h>
+#include <openvdb/util/NullInterrupter.h>
+#include <tbb/parallel_sort.h>
+#include <tbb/parallel_for.h>
+#include <boost/scoped_array.hpp>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+/// Forward declaration of base class
+template<typename PointAccessorType,
+         typename RandomGenerator,
+         typename InterruptType = util::NullInterrupter>
+class BasePointScatter;
+
+/// @brief The two point scatters UniformPointScatter and
+/// NonUniformPointScatter depend on the following two classes:
+///
+/// The @c PointAccessorType template argument below refers to any class
+/// with the following interface:
+/// @code
+/// class PointAccessor {
+///   ...
+/// public:
+///   void add(const openvdb::Vec3R &pos);// appends point with world positions pos
+/// };
+/// @endcode
+///
+///
+/// The @c InterruptType template argument below refers to any class
+/// with the following interface:
+/// @code
+/// class Interrupter {
+///   ...
+/// public:
+///   void start(const char* name = NULL)// called when computations begin
+///   void end()                         // called when computations end
+///   bool wasInterrupted(int percent=-1)// return true to break computation
+///};
+/// @endcode
+///
+/// @note If no template argument is provided for this InterruptType
+/// the util::NullInterrupter is used which implies that all
+/// interrupter calls are no-ops (i.e. incurs no computational overhead).
+
+
+/// @brief Uniformly scatters points in the active voxels.
+/// The point count is either explicitly defined or implicitly
+/// through the specification of a global density (=points-per-volume)
+///
+/// @note This uniform scattering technique assumes that the number of
+/// points is generally smaller than the number of active voxels
+/// (including virtual active voxels in active tiles).
+template<typename PointAccessorType,
+         typename RandomGenerator,
+         typename InterruptType = util::NullInterrupter>
+class UniformPointScatter : public BasePointScatter<PointAccessorType,
+                                                    RandomGenerator,
+                                                    InterruptType>
+{
+public:
+    typedef BasePointScatter<PointAccessorType, RandomGenerator, InterruptType> BaseT;
+    /// Count mode: scatter @a pointCount points in total (the density is derived in operator()).
+    UniformPointScatter(PointAccessorType& points,
+                        Index64 pointCount,
+                        RandomGenerator& randGen,
+                        InterruptType* interrupt = NULL)
+        : BaseT(points, randGen, interrupt)
+        , mTargetPointCount(pointCount)
+        , mPointsPerVolume(0.0f)
+    {
+    }
+    UniformPointScatter(PointAccessorType& points,
+                        float pointsPerVolume,
+                        RandomGenerator& randGen,
+                        InterruptType* interrupt = NULL)
+        : BaseT(points, randGen, interrupt)
+        , mTargetPointCount(0)
+        , mPointsPerVolume(pointsPerVolume)
+    { // Density mode: the total point count is derived in operator().
+    }
+
+    /// @brief Scatter the points into the active voxels and tiles of @a grid.
+    /// @return false if the grid has no active voxels, neither count nor density is positive, or on interrupt.
+    template<typename GridT>
+    bool operator()(const GridT& grid)
+    {
+        mVoxelCount = grid.activeVoxelCount();
+        if (mVoxelCount == 0) return false;
+        const Vec3d dim = grid.voxelSize();
+        const double voxelVolume = dim[0]*dim[1]*dim[2];
+        if (mPointsPerVolume>0) {
+            BaseT::start("Uniform scattering with fixed point density");
+            // Multiply by the voxel count before truncating, so densities below one point per voxel still yield points.
+            mTargetPointCount = Index64(mPointsPerVolume*voxelVolume*double(mVoxelCount));
+        } else if (mTargetPointCount>0) {
+            BaseT::start("Uniform scattering with fixed point count");
+            mPointsPerVolume = mTargetPointCount/float(voxelVolume * mVoxelCount);
+        } else { return false; }
+        // Draw one random voxel id per point and sort the ids, so the grid is traversed only once below.
+        boost::scoped_array<Index64> list(new Index64[mTargetPointCount]);
+        math::RandInt<Index64, RandomGenerator> rand(BaseT::mRand01.engine(), 0, mVoxelCount-1);
+        for (Index64 i=0; i<mTargetPointCount; ++i) list[i] = rand();
+        tbb::parallel_sort(list.get(), list.get() + mTargetPointCount);
+
+        CoordBBox bbox;
+        const Vec3R offset(0.5, 0.5, 0.5);
+        typename GridT::ValueOnCIter valueIter = grid.cbeginValueOn();
+        for (Index64 i=0, n=valueIter.getVoxelCount() ; i != mTargetPointCount; ++i) {
+            if (BaseT::interrupt()) return false;
+            const Index64 voxelId = list[i];
+            while ( n <= voxelId ) { // n = number of voxels covered up to and including valueIter
+                ++valueIter;
+                n += valueIter.getVoxelCount();
+            }
+            if (valueIter.isVoxelValue()) {// a majority is expected to be voxels
+                BaseT::addPoint(grid, valueIter.getCoord() - offset);
+            } else {// tiles contain multiple (virtual) voxels
+                valueIter.getBoundingBox(bbox);
+                BaseT::addPoint(grid, bbox.min() - offset, bbox.extents());
+            }
+        }//loop over all the active voxels and tiles
+
+        BaseT::end();
+        return true;
+    }
+
+    // The following methods should only be called after
+    // the operator() method was called
+    void print(const std::string &name, std::ostream& os = std::cout) const
+    {
+        os << "Uniformely scattered " << mPointCount << " points into " << mVoxelCount
+           << " active voxels in \"" << name << "\" corresponding to "
+           << mPointsPerVolume << " points per volume." << std::endl;
+    }
+
+    float   getPointsPerVolume()  const { return mPointsPerVolume; }
+    Index64 getTargetPointCount() const { return mTargetPointCount; }
+
+private:
+
+    using BaseT::mPointCount;
+    using BaseT::mVoxelCount;
+    Index64 mTargetPointCount;
+    float mPointsPerVolume;
+
+}; // class UniformPointScatter
+
+/// @brief Scatters a fixed (and integer) number of points in all
+/// active voxels and tiles.
+template<typename PointAccessorType,
+         typename RandomGenerator,
+         typename InterruptType = util::NullInterrupter>
+class DenseUniformPointScatter : public BasePointScatter<PointAccessorType,
+                                                         RandomGenerator,
+                                                         InterruptType>
+{
+public:
+    typedef BasePointScatter<PointAccessorType, RandomGenerator, InterruptType> BaseT;
+    /// @param pointsPerVoxel  may be fractional; the fractional part is applied via a random draw.
+    DenseUniformPointScatter(PointAccessorType& points,
+                             float pointsPerVoxel,
+                             RandomGenerator& randGen,
+                             InterruptType* interrupt = NULL)
+        : BaseT(points, randGen, interrupt)
+        , mPointsPerVoxel(pointsPerVoxel)
+    {
+    }
+
+    /// This is the main functor method implementing the actual scattering of points.
+    template<typename GridT>
+    bool operator()(const GridT& grid)
+    {
+        typedef typename GridT::ValueOnCIter ValueIter;
+        if (mPointsPerVoxel < 1.0e-6) return false; // effectively zero (or negative) density: nothing to do
+        mVoxelCount = grid.activeVoxelCount();
+        if (mVoxelCount == 0) return false;
+        BaseT::start("Dense uniform scattering with fixed point count");
+        CoordBBox bbox;
+        const Vec3R offset(0.5, 0.5, 0.5);
+        // Split the density into a whole number of points per voxel plus a fractional remainder.
+        const int ppv = math::Floor(mPointsPerVoxel);
+        const double delta = mPointsPerVoxel - ppv;
+        const bool fractional = !math::isApproxZero(delta, 1.0e-6);
+
+        for (ValueIter iter = grid.cbeginValueOn(); iter; ++iter) {
+            if (BaseT::interrupt()) return false;
+            if (iter.isVoxelValue()) {// a majority is expected to be voxels
+                const Vec3R dmin = iter.getCoord() - offset;
+                for (int n = 0; n != ppv; ++n) BaseT::addPoint(grid, dmin);
+                if (fractional && BaseT::getRand() < delta) BaseT::addPoint(grid, dmin); // extra point when the draw is below delta
+            } else {// tiles contain multiple (virtual) voxels
+                iter.getBoundingBox(bbox);
+                const Coord size(bbox.extents());
+                const Vec3R dmin = bbox.min() - offset;
+                const double d = mPointsPerVoxel * iter.getVoxelCount(); // point count for the whole tile
+                const int m = math::Floor(d);
+                for (int n = 0; n != m; ++n)  BaseT::addPoint(grid, dmin, size);
+                if (BaseT::getRand() < d - m) BaseT::addPoint(grid, dmin, size); // note: always consumes one random draw
+            }
+        }//loop over all the active voxels and tiles
+
+        BaseT::end();
+        return true;
+    }
+
+    // The following methods should only be called after
+    // the operator() method was called
+    void print(const std::string &name, std::ostream& os = std::cout) const
+    {
+        os << "Dense uniformly scattered " << mPointCount << " points into " << mVoxelCount
+           << " active voxels in \"" << name << "\" corresponding to "
+           << mPointsPerVoxel << " points per voxel." << std::endl;
+    }
+
+    float getPointsPerVoxel() const { return mPointsPerVoxel; }
+
+private:
+    using BaseT::mPointCount;
+    using BaseT::mVoxelCount;
+    float mPointsPerVoxel;
+}; // class DenseUniformPointScatter
+
+/// @brief Non-uniformly scatters points in the active voxels.
+/// The local point count is implicitly defined as the product
+/// of a global density (called pointsPerVolume) and the local voxel
+/// (or tile) value.
+///
+/// @note This scattering technique can be significantly slower
+/// than a uniform scattering since its computational complexity
+/// is proportional to the active voxel (and tile) count.
+template<typename PointAccessorType,
+         typename RandomGenerator,
+         typename InterruptType = util::NullInterrupter>
+class NonUniformPointScatter : public BasePointScatter<PointAccessorType,
+                                                       RandomGenerator,
+                                                       InterruptType>
+{
+public:
+    typedef BasePointScatter<PointAccessorType, RandomGenerator, InterruptType> BaseT;
+
+    NonUniformPointScatter(PointAccessorType& points,
+                           float pointsPerVolume,
+                           RandomGenerator& randGen,
+                           InterruptType* interrupt = NULL)
+        : BaseT(points, randGen, interrupt)
+        , mPointsPerVolume(pointsPerVolume)//note this is merely a
+                                           //multiplier for the local point density
+    {
+    }
+
+    /// This is the main functor method implementing the actual scattering of points.
+    template<typename GridT>
+    bool operator()(const GridT& grid)
+    {
+        if (mPointsPerVolume <= 0.0f) return false;
+        mVoxelCount = grid.activeVoxelCount();
+        if (mVoxelCount == 0) return false;
+        BaseT::start("Non-uniform scattering with local point density");
+        const Vec3d dim = grid.voxelSize();
+        const double volumePerVoxel = dim[0]*dim[1]*dim[2],
+                     pointsPerVoxel = mPointsPerVolume * volumePerVoxel;
+        CoordBBox bbox;
+        const Vec3R offset(0.5, 0.5, 0.5);
+        for (typename GridT::ValueOnCIter iter = grid.cbeginValueOn(); iter; ++iter) {
+            if (BaseT::interrupt()) return false;
+            const double d = (*iter) * pointsPerVoxel * iter.getVoxelCount(); // point count for this voxel/tile, scaled by its value
+            const int n = int(d); // whole points; the remainder (d - n) is applied via a random draw below
+            if (iter.isVoxelValue()) { // a majority is expected to be voxels
+                const Vec3R dmin =iter.getCoord() - offset;
+                for (int i = 0; i < n; ++i) BaseT::addPoint(grid, dmin);
+                if (BaseT::getRand() < (d - n)) BaseT::addPoint(grid, dmin);
+            } else { // tiles contain multiple (virtual) voxels
+                iter.getBoundingBox(bbox);
+                const Coord size(bbox.extents());
+                const Vec3R dmin = bbox.min() - offset;
+                for (int i = 0; i < n; ++i) BaseT::addPoint(grid, dmin, size);
+                if (BaseT::getRand() < (d - n)) BaseT::addPoint(grid, dmin, size);
+            }
+        }//loop over all the active voxels and tiles
+        BaseT::end();
+        return true;
+    }
+
+    // The following methods should only be called after
+    // the operator() method was called
+    void print(const std::string &name, std::ostream& os = std::cout) const
+    {
+        os << "Non-uniformly scattered " << mPointCount << " points into " << mVoxelCount
+           << " active voxels in \"" << name << "\"." << std::endl;
+    }
+
+    float getPointPerVolume() const { return mPointsPerVolume; }
+
+private:
+    using BaseT::mPointCount;
+    using BaseT::mVoxelCount;
+    float mPointsPerVolume;
+
+}; // class NonUniformPointScatter
+
+/// Base class of all the point scattering classes defined above
+template<typename PointAccessorType,
+         typename RandomGenerator,
+         typename InterruptType>
+class BasePointScatter
+{
+public:
+
+    Index64 getPointCount() const { return mPointCount; }
+    Index64 getVoxelCount() const { return mVoxelCount; }
+
+protected:
+
+    /// This is a base class so the constructor is protected
+    BasePointScatter(PointAccessorType& points,
+                     RandomGenerator& randGen,
+                     InterruptType* interrupt = NULL)
+        : mPoints(points)
+        , mInterrupter(interrupt)
+        , mPointCount(0)
+        , mVoxelCount(0)
+        , mInterruptCount(0)
+        , mRand01(randGen)
+    {
+    }
+    // mPoints is held by reference and mInterrupter by raw pointer (may be NULL, see start()/end()).
+    PointAccessorType&        mPoints;
+    InterruptType*            mInterrupter;
+    Index64                   mPointCount;
+    Index64                   mVoxelCount;
+    Index64                   mInterruptCount;
+    math::Rand01<double, RandomGenerator> mRand01;
+
+    inline void start(const char* name)
+    {
+        if (mInterrupter) mInterrupter->start(name);
+    }
+
+    inline void end()
+    {
+        if (mInterrupter) mInterrupter->end();
+    }
+
+    inline bool interrupt()
+    {
+        //only check interrupter for every 32'th call (the call counter is incremented on every call)
+        return !(mInterruptCount++ & ((1<<5)-1)) && util::wasInterrupted(mInterrupter);
+    }
+
+    inline double getRand() { return mRand01(); }
+    /// Adds one point at @a dmin plus one getRand() draw per axis (index space), converted to world space.
+    template <typename GridT>
+    inline void addPoint(const GridT &grid, const Vec3R &dmin)
+    { // NOTE(review): the three getRand() calls are constructor arguments, so their evaluation order is unspecified.
+        const Vec3R pos(dmin[0] + this->getRand(),
+                        dmin[1] + this->getRand(),
+                        dmin[2] + this->getRand());
+        mPoints.add(grid.indexToWorld(pos));
+        ++mPointCount;
+    }
+    /// Same, but each axis draw is scaled by the matching component of @a size (callers pass a tile's extents).
+    template <typename GridT>
+    inline void addPoint(const GridT &grid, const Vec3R &dmin, const Coord &size)
+    {
+        const Vec3R pos(dmin[0] + size[0]*this->getRand(),
+                        dmin[1] + size[1]*this->getRand(),
+                        dmin[2] + size[2]*this->getRand());
+        mPoints.add(grid.indexToWorld(pos));
+        ++mPointCount;
+    }
+};// class BasePointScatter
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_POINT_SCATTER_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/PoissonSolver.h b/nuparu/include/openvdb_new/tools/PoissonSolver.h
new file mode 100644
index 00000000..60a1f6d6
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/PoissonSolver.h
@@ -0,0 +1,774 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file PoissonSolver.h
+///
+/// @authors D.J. Hill, Peter Cucka
+///
+/// @brief Solve Poisson's equation &nabla;<sup><small>2</small></sup><i>x</i> = <i>b</i>
+/// for <i>x</i>, where @e b is a vector comprising the values of all of the active voxels
+/// in a grid.
+///
+/// @par Example:
+/// Solve for the pressure in a cubic tank of liquid, assuming uniform boundary conditions:
+/// @code
+/// FloatTree source(/*background=*/0.0f);
+/// // Activate voxels to indicate that they contain liquid.
+/// source.fill(CoordBBox(Coord(0, -10, 0), Coord(10, 0, 10)), /*value=*/0.0f);
+///
+/// math::pcg::State state = math::pcg::terminationDefaults<float>();
+/// FloatTree::Ptr solution = tools::poisson::solve(source, state);
+/// @endcode
+///
+/// @par Example:
+/// Solve for the pressure, <i>P</i>, in a cubic tank of liquid that is open at the top.
+/// Boundary conditions are <i>P</i>&nbsp;=&nbsp;0 at the top,
+/// &part;<i>P</i>/&part;<i>y</i>&nbsp;=&nbsp;&minus;1 at the bottom
+/// and &part;<i>P</i>/&part;<i>x</i>&nbsp;=&nbsp;0 at the sides:
+/// <pre>
+///                P = 0
+///             +--------+ (N,0,N)
+///            /|       /|
+///   (0,0,0) +--------+ |
+///           | |      | | dP/dx = 0
+/// dP/dx = 0 | +------|-+
+///           |/       |/
+///  (0,-N,0) +--------+ (N,-N,N)
+///           dP/dy = -1
+/// </pre>
+/// @code
+/// const int N = 10;
+/// DoubleTree source(/*background=*/0.0);
+/// // Activate voxels to indicate that they contain liquid.
+/// source.fill(CoordBBox(Coord(0, -N, 0), Coord(N, 0, N)), /*value=*/0.0);
+///
+/// // C++11
+/// auto boundary = [](const openvdb::Coord& ijk, const openvdb::Coord& neighbor,
+///     double& source, double& diagonal)
+/// {
+///     if (neighbor.x() == ijk.x() && neighbor.z() == ijk.z()) {
+///         if (neighbor.y() < ijk.y()) source -= 1.0;
+///         else diagonal -= 1.0;
+///     }
+/// };
+///
+/// math::pcg::State state = math::pcg::terminationDefaults<double>();
+/// util::NullInterrupter interrupter;
+///
+/// DoubleTree::Ptr solution = tools::poisson::solveWithBoundaryConditions(
+///     source, boundary, state, interrupter);
+/// @endcode
+
+#ifndef OPENVDB_TOOLS_POISSONSOLVER_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_POISSONSOLVER_HAS_BEEN_INCLUDED
+
+#include <openvdb/Types.h>
+#include <openvdb/math/ConjGradient.h>
+#include <openvdb/tree/LeafManager.h>
+#include <openvdb/tree/Tree.h>
+#include <openvdb/util/NullInterrupter.h>
+
+#include "Morphology.h" // for erodeVoxels
+
+#include <boost/scoped_array.hpp>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+namespace poisson {
+
+// This type should be at least as wide as math::pcg::SizeType.
+typedef Int32 VIndex;
+
+/// The type of a matrix used to represent a three-dimensional Laplacian operator
+typedef math::pcg::SparseStencilMatrix<double, 7> LaplacianMatrix;
+
+
+//@{
+/// @brief Solve &nabla;<sup><small>2</small></sup><i>x</i> = <i>b</i> for <i>x</i>,
+/// where @e b is a vector comprising the values of all of the active voxels
+/// in the input tree.
+/// @return a new tree, with the same active voxel topology as the input tree,
+/// whose voxel values are the elements of the solution vector <i>x</i>.
+/// @details On input, the State object should specify convergence criteria
+/// (minimum error and maximum number of iterations); on output, it gives
+/// the actual termination conditions.
+/// @details The solution is computed using the conjugate gradient method
+/// with (where possible) incomplete Cholesky preconditioning, falling back
+/// to Jacobi preconditioning.
+/// @sa solveWithBoundaryConditions
+template<typename TreeType>
+inline typename TreeType::Ptr
+solve(const TreeType&, math::pcg::State&);
+
+template<typename TreeType, typename Interrupter>
+inline typename TreeType::Ptr
+solve(const TreeType&, math::pcg::State&, Interrupter&);
+//@}
+
+
+//@{
+/// @brief Solve &nabla;<sup><small>2</small></sup><i>x</i> = <i>b</i> for <i>x</i>
+/// with user-specified boundary conditions, where @e b is a vector comprising
+/// the input tree's values at all of its active voxels (or those of the domain mask, if provided).
+/// @return a new tree, with the same active voxel topology as the input tree,
+/// whose voxel values are the elements of the solution vector <i>x</i>.
+/// @details On input, the State object should specify convergence criteria
+/// (minimum error and maximum number of iterations); on output, it gives
+/// the actual termination conditions.
+/// @details The solution is computed using the conjugate gradient method with
+/// the specified type of preconditioner (default: incomplete Cholesky),
+/// falling back to Jacobi preconditioning if necessary.
+/// @details Each thread gets its own copy of the BoundaryOp, which should be
+/// a functor of the form
+/// @code
+/// struct BoundaryOp {
+///     typedef LaplacianMatrix::ValueType ValueType;
+///     void operator()(
+///         const Coord& ijk,          // coordinates of a boundary voxel
+///         const Coord& ijkNeighbor,  // coordinates of an exterior neighbor of ijk
+///         ValueType& source,         // element of b corresponding to ijk
+///         ValueType& diagonal        // element of Laplacian matrix corresponding to ijk
+///     ) const;
+/// };
+/// @endcode
+/// The functor is called for each of the exterior neighbors of each boundary voxel @a ijk,
+/// and it must specify a boundary condition for @a ijk by modifying one or both of two
+/// provided values: the entry in the source vector @e b corresponding to @a ijk and
+/// the weighting coefficient for @a ijk in the Laplacian operator matrix.
+///
+/// @sa solve
+template<typename TreeType, typename BoundaryOp, typename Interrupter>
+inline typename TreeType::Ptr
+solveWithBoundaryConditions(const TreeType&, const BoundaryOp&, math::pcg::State&, Interrupter&);
+
+template<typename PreconditionerType, typename TreeType, typename BoundaryOp, typename Interrupter>
+inline typename TreeType::Ptr
+solveWithBoundaryConditionsAndPreconditioner(const TreeType&, const BoundaryOp&,
+                                             math::pcg::State&, Interrupter&);
+
+template<typename PreconditionerType, typename TreeType, typename DomainTreeType, typename BoundaryOp, typename Interrupter>
+inline typename TreeType::Ptr
+solveWithBoundaryConditionsAndPreconditioner(const TreeType&, const DomainTreeType&, const BoundaryOp&,
+                                             math::pcg::State&, Interrupter&);
+//@}
+
+
+/// @name Low-level functions
+//@{
+// The following are low-level routines that can be used to assemble custom solvers.
+
+/// @brief Overwrite each active voxel in the given scalar tree
+/// with a sequential index, starting from zero.
+template<typename VIndexTreeType>
+inline void populateIndexTree(VIndexTreeType&);
+
+/// @brief Iterate over the active voxels of the input tree and for each one
+/// assign its index in the iteration sequence to the corresponding voxel
+/// of an integer-valued output tree.
+template<typename TreeType>
+inline typename TreeType::template ValueConverter<VIndex>::Type::Ptr
+createIndexTree(const TreeType&);
+
+
+/// @brief Return a vector of the active voxel values of the scalar-valued @a source tree.
+/// @details The <i>n</i>th element of the vector corresponds to the voxel whose value
+/// in the @a index tree is @e n.
+/// @param source  a tree with a scalar value type
+/// @param index   a tree of the same configuration as @a source but with
+///     value type VIndex that maps voxels to elements of the output vector
+template<typename VectorValueType, typename SourceTreeType>
+inline typename math::pcg::Vector<VectorValueType>::Ptr
+createVectorFromTree(
+    const SourceTreeType& source,
+    const typename SourceTreeType::template ValueConverter<VIndex>::Type& index);
+
+
+/// @brief Return a tree with the same active voxel topology as the @a index tree
+/// but whose voxel values are taken from the given vector.
+/// @details The voxel whose value in the @a index tree is @e n gets assigned
+/// the <i>n</i>th element of the vector.
+/// @param index   a tree with value type VIndex that maps voxels to elements of @a values
+/// @param values  a vector of values with which to populate the active voxels of the output tree
+/// @param background  the value for the inactive voxels of the output tree
+template<typename TreeValueType, typename VIndexTreeType, typename VectorValueType>
+inline typename VIndexTreeType::template ValueConverter<TreeValueType>::Type::Ptr
+createTreeFromVector(
+    const math::pcg::Vector<VectorValueType>& values,
+    const VIndexTreeType& index,
+    const TreeValueType& background);
+
+
+/// @brief Generate a sparse matrix of the index-space (&Delta;<i>x</i> = 1) Laplacian operator
+/// using second-order finite differences.
+/// @details This construction assumes homogeneous Dirichlet boundary conditions
+/// (exterior grid points are zero).
+template<typename BoolTreeType>
+inline LaplacianMatrix::Ptr
+createISLaplacian(
+    const typename BoolTreeType::template ValueConverter<VIndex>::Type& vectorIndexTree,
+    const BoolTreeType& interiorMask);
+
+
+/// @brief Generate a sparse matrix of the index-space (&Delta;<i>x</i> = 1) Laplacian operator
+/// with user-specified boundary conditions using second-order finite differences.
+/// @details Each thread gets its own copy of @a boundaryOp, which should be
+/// a functor of the form
+/// @code
+/// struct BoundaryOp {
+///     typedef LaplacianMatrix::ValueType ValueType;
+///     void operator()(
+///         const Coord& ijk,          // coordinates of a boundary voxel
+///         const Coord& ijkNeighbor,  // coordinates of an exterior neighbor of ijk
+///         ValueType& source,         // element of source vector corresponding to ijk
+///         ValueType& diagonal        // element of Laplacian matrix corresponding to ijk
+///     ) const;
+/// };
+/// @endcode
+/// The functor is called for each of the exterior neighbors of each boundary voxel @a ijk,
+/// and it must specify a boundary condition for @a ijk by modifying one or both of two
+/// provided values: an entry in the given @a source vector corresponding to @a ijk and
+/// the weighting coefficient for @a ijk in the Laplacian matrix.
+template<typename BoolTreeType, typename BoundaryOp>
+inline LaplacianMatrix::Ptr
+createISLaplacianWithBoundaryConditions(
+    const typename BoolTreeType::template ValueConverter<VIndex>::Type& vectorIndexTree,
+    const BoolTreeType& interiorMask,
+    const BoundaryOp& boundaryOp,
+    typename math::pcg::Vector<LaplacianMatrix::ValueType>& source);
+
+//@}
+
+
+////////////////////////////////////////
+
+
+namespace internal {
+
+/// @brief Functor for use with LeafManager::foreach() to populate an array
+/// with per-leaf active voxel counts
+template<typename LeafType>
+struct LeafCountOp {
+    VIndex* count; ///< destination array with one slot per leaf; not freed by this functor
+    LeafCountOp(VIndex* count_): count(count_) {}
+    void operator()(const LeafType& leaf, size_t leafIdx) const {
+        VIndex& slot = count[leafIdx]; // slot belonging to this leaf
+        slot = static_cast<VIndex>(leaf.onVoxelCount());
+    }
+};
+
+
+/// @brief Functor for use with LeafManager::foreach() to populate
+/// active leaf voxels with sequential indices
+template<typename LeafType>
+struct LeafIndexOp {
+    const VIndex* count; ///< count[i - 1] is the first index used in leaf i (for i > 0)
+    LeafIndexOp(const VIndex* count_): count(count_) {}
+    /// Number the active voxels of @a leaf consecutively; leaf 0 starts at zero.
+    void operator()(LeafType& leaf, size_t leafIdx) const {
+        VIndex next = 0;
+        if (leafIdx != 0) next = count[leafIdx - 1];
+        typename LeafType::ValueOnIter voxel = leaf.beginValueOn();
+        for ( ; voxel; ++voxel) voxel.setValue(next++);
+    }
+};
+
+} // namespace internal
+
+
+/// Number the active voxels held in the leaf nodes of @a result 0, 1, 2, ... in leaf order.
+template<typename VIndexTreeType>
+inline void
+populateIndexTree(VIndexTreeType& result)
+{
+    typedef typename VIndexTreeType::LeafNodeType       LeafT;
+    typedef typename tree::LeafManager<VIndexTreeType>  LeafMgrT;
+    // Linearize the tree.
+    LeafMgrT leafManager(result);
+    const size_t leafCount = leafManager.leafCount();
+    // No leaf nodes: nothing to index, and perLeafCount[leafCount-1] below would be out of bounds.
+    if (leafCount == 0) return;
+
+    // Count the number of active voxels in each leaf node.
+    boost::scoped_array<VIndex> perLeafCount(new VIndex[leafCount]);
+    VIndex* perLeafCountPtr = perLeafCount.get();
+    leafManager.foreach(internal::LeafCountOp<LeafT>(perLeafCountPtr));
+
+    // Accumulate: afterwards entry i holds the number of active voxels in leaves 0..i,
+    // which is the starting index for leaf i+1.
+    for (size_t i = 1; i < leafCount; ++i) {
+        perLeafCount[i] += perLeafCount[i - 1];
+    }
+    // The last accumulated value should be the total of all active voxels.
+    assert(Index64(perLeafCount[leafCount-1]) == result.activeVoxelCount());
+
+    // Store a unique index in each active voxel, one leaf node at a time.
+    leafManager.foreach(internal::LeafIndexOp<LeafT>(perLeafCountPtr));
+}
+
+
+template<typename TreeType>
+inline typename TreeType::template ValueConverter<VIndex>::Type::Ptr
+createIndexTree(const TreeType& tree)
+{
+    typedef typename TreeType::template ValueConverter<VIndex>::Type IndexTreeT;
+    typedef typename IndexTreeT::Ptr IndexTreePtrT;
+
+    // Copy only the active topology of the input; -1 serves as the background value.
+    const VIndex noIndex = -1;
+    IndexTreePtrT indexTree(new IndexTreeT(tree, /*background=*/noIndex, TopologyCopy()));
+    IndexTreeT& indices = *indexTree;
+
+    // Active tiles are voxelized so that the voxels they cover are indexed as well.
+    indices.voxelizeActiveTiles();
+    // Assign the sequential indices.
+    populateIndexTree(indices);
+    return indexTree;
+}
+
+
+////////////////////////////////////////
+
+
+namespace internal {
+
+/// @brief Functor for use with LeafManager::foreach() to populate a vector
+/// with the values of a tree's active voxels
+template<typename VectorValueType, typename SourceTreeType>
+struct CopyToVecOp
+{
+    typedef typename SourceTreeType::template ValueConverter<VIndex>::Type VIdxTreeT;
+    typedef typename VIdxTreeT::LeafNodeType             VIdxLeafT;
+    typedef typename SourceTreeType::LeafNodeType        LeafT;
+    typedef typename SourceTreeType::ValueType           TreeValueT;
+    typedef typename math::pcg::Vector<VectorValueType>  VectorT;
+
+    const SourceTreeType* tree; ///< tree whose values are read
+    VectorT* vector;            ///< destination, addressed by the indices stored in the index leaves
+
+    CopyToVecOp(const SourceTreeType& t, VectorT& v): tree(&t), vector(&v) {}
+
+    /// For each active voxel of @a idxLeaf, store the source tree's value for that
+    /// voxel in the vector element selected by the voxel's index.
+    void operator()(const VIdxLeafT& idxLeaf, size_t /*leafIdx*/) const
+    {
+        const Coord leafOrigin = idxLeaf.origin();
+        const LeafT* const srcLeaf = tree->probeLeaf(leafOrigin);
+        typename VIdxLeafT::ValueOnCIter voxel = idxLeaf.cbeginValueOn();
+        if (srcLeaf != NULL) {
+            // The source tree has a leaf at this origin: read its values
+            // voxel by voxel, using the iterator's linear offset.
+            for ( ; voxel; ++voxel) (*vector)[*voxel] = srcLeaf->getValue(voxel.pos());
+        } else {
+            // No such leaf in the source tree: every element gets the one value
+            // the source tree reports at the leaf origin.
+            const TreeValueT& uniform = tree->getValue(leafOrigin);
+            for ( ; voxel; ++voxel) (*vector)[*voxel] = uniform;
+        }
+    }
+};
+
+} // namespace internal
+
+
+template<typename VectorValueType, typename SourceTreeType>
+inline typename math::pcg::Vector<VectorValueType>::Ptr
+createVectorFromTree(const SourceTreeType& tree,
+    const typename SourceTreeType::template ValueConverter<VIndex>::Type& idxTree)
+{
+    typedef typename math::pcg::Vector<VectorValueType>  VecT;
+    typedef typename SourceTreeType::template ValueConverter<VIndex>::Type IndexTreeT;
+    typedef internal::CopyToVecOp<VectorValueType, SourceTreeType> CopyOpT;
+
+    // One vector element per active voxel of the index tree.
+    const size_t elementCount = idxTree.activeVoxelCount();
+    typename VecT::Ptr values(new VecT(static_cast<math::pcg::SizeType>(elementCount)));
+
+    // Visit the leaves of the index tree through a LeafManager and let each one
+    // copy the matching source values into the vector.
+    tree::LeafManager<const IndexTreeT> indexLeaves(idxTree);
+    indexLeaves.foreach(CopyOpT(tree, *values));
+
+    return values;
+}
+
+
+////////////////////////////////////////
+
+
+namespace internal {
+
+/// @brief Functor for use with LeafManager::foreach() to populate a tree
+/// with values from a vector
+template<typename TreeValueType, typename VIndexTreeType, typename VectorValueType>
+struct CopyFromVecOp
+{
+    typedef typename VIndexTreeType::template ValueConverter<TreeValueType>::Type OutTreeT;
+    typedef typename OutTreeT::LeafNodeType              OutLeafT;
+    typedef typename VIndexTreeType::LeafNodeType        VIdxLeafT;
+    typedef typename math::pcg::Vector<VectorValueType>  VectorT;
+
+    const VectorT* vector; ///< values to copy, addressed by voxel index
+    OutTreeT* tree;        ///< tree that receives the values
+    CopyFromVecOp(const VectorT& v, OutTreeT& t): vector(&v), tree(&t) {}
+
+    /// Copy, for each active voxel of @a idxLeaf, the indexed vector element into the output leaf.
+    void operator()(const VIdxLeafT& idxLeaf, size_t /*leafIdx*/) const
+    {
+        OutLeafT* const outLeaf = tree->probeLeaf(idxLeaf.origin());
+        assert(outLeaf != NULL);
+        typename VIdxLeafT::ValueOnCIter voxel = idxLeaf.cbeginValueOn();
+        for ( ; voxel; ++voxel) {
+            outLeaf->setValueOnly(voxel.pos(), static_cast<TreeValueType>((*vector)[*voxel]));
+        }
+    }
+};
+
+} // namespace internal
+
+
+template<typename TreeValueType, typename VIndexTreeType, typename VectorValueType>
+inline typename VIndexTreeType::template ValueConverter<TreeValueType>::Type::Ptr
+createTreeFromVector(
+    const math::pcg::Vector<VectorValueType>& vector,
+    const VIndexTreeType& idxTree,
+    const TreeValueType& background)
+{
+    typedef typename VIndexTreeType::template ValueConverter<TreeValueType>::Type ResultTreeT;
+    typedef internal::CopyFromVecOp<TreeValueType, VIndexTreeType, VectorValueType> CopyOpT;
+
+    // Start from a tree that shares the index tree's active topology and uses
+    // the requested background value.
+    typename ResultTreeT::Ptr output(new ResultTreeT(idxTree, background, TopologyCopy()));
+    ResultTreeT& out = *output;
+
+    // Walk the leaves of the index tree via a LeafManager; each leaf writes the vector
+    // elements it refers to into the corresponding output voxels.
+    tree::LeafManager<const VIndexTreeType> indexLeaves(idxTree);
+    indexLeaves.foreach(CopyOpT(vector, out));
+
+    return output;
+}
+
+
+////////////////////////////////////////
+
+
+namespace internal {
+
+/// Constant boundary condition functor
+template<typename ValueType>
+struct DirichletOp { // subtracts one from the diagonal per call; its other arguments are unused
+    inline void operator()(
+        const Coord& /*ijk*/, const Coord& /*neighbor*/, ValueType& /*source*/, ValueType& diag) const { diag -= 1; }
+};
+
+
+/// Functor for use with LeafManager::foreach() to populate a sparse Laplacian matrix
+template<typename BoolTreeType, typename BoundaryOp>
+struct ISLaplacianOp
+{
+    typedef typename BoolTreeType::template ValueConverter<VIndex>::Type VIdxTreeT;
+    typedef typename VIdxTreeT::LeafNodeType   VIdxLeafT;
+    typedef LaplacianMatrix::ValueType         ValueT;
+    typedef typename math::pcg::Vector<ValueT> VectorT;
+
+    LaplacianMatrix* laplacian;         // matrix being filled, one row per active index voxel
+    const VIdxTreeT* idxTree;           // maps voxel coordinates to row/column numbers
+    const BoolTreeType* interiorMask;   // active where a voxel is treated as interior
+    const BoundaryOp boundaryOp;        // held by value: a private copy of the caller's functor
+    VectorT* source;                    // entries of this vector are handed to boundaryOp
+
+    ISLaplacianOp(LaplacianMatrix& m, const VIdxTreeT& idx,
+        const BoolTreeType& mask, const BoundaryOp& op, VectorT& src):
+        laplacian(&m), idxTree(&idx), interiorMask(&mask), boundaryOp(op), source(&src) {}
+
+    void operator()(const VIdxLeafT& idxLeaf, size_t /*leafIdx*/) const
+    {
+        // Local accessors (constructed afresh on every call)
+        typename tree::ValueAccessor<const BoolTreeType> interior(*interiorMask);
+        typename tree::ValueAccessor<const VIdxTreeT> vectorIdx(*idxTree);
+
+        Coord ijk;
+        VIndex column;
+        const ValueT diagonal = -6.f, offDiagonal = 1.f; // stencil weights used for interior voxels
+
+        // Loop over active voxels in this leaf; the value stored in each one is its row number.
+        for (typename VIdxLeafT::ValueOnCIter it = idxLeaf.cbeginValueOn(); it; ++it) {
+            assert(it.getValue() > -1);
+            const math::pcg::SizeType rowNum = static_cast<math::pcg::SizeType>(it.getValue());
+
+            LaplacianMatrix::RowEditor row = laplacian->getRowEditor(rowNum);
+
+            ijk = it.getCoord();
+            if (interior.isValueOn(ijk)) {
+                // The current voxel is an interior voxel.
+                // All of its neighbors are in the solution domain.
+
+                // -x direction
+                row.setValue(vectorIdx.getValue(ijk.offsetBy(-1, 0, 0)), offDiagonal);
+                // -y direction
+                row.setValue(vectorIdx.getValue(ijk.offsetBy(0, -1, 0)), offDiagonal);
+                // -z direction
+                row.setValue(vectorIdx.getValue(ijk.offsetBy(0, 0, -1)), offDiagonal);
+                // diagonal
+                row.setValue(rowNum, diagonal);
+                // +z direction
+                row.setValue(vectorIdx.getValue(ijk.offsetBy(0, 0, 1)), offDiagonal);
+                // +y direction
+                row.setValue(vectorIdx.getValue(ijk.offsetBy(0, 1, 0)), offDiagonal);
+                // +x direction
+                row.setValue(vectorIdx.getValue(ijk.offsetBy(1, 0, 0)), offDiagonal);
+
+            } else {
+                // The current voxel is a boundary voxel.
+                // At least one of its neighbors is outside the solution domain.
+
+                ValueT modifiedDiagonal = 0.f; // minus one per in-domain neighbor; boundaryOp may adjust it
+
+                // -x direction
+                if (vectorIdx.probeValue(ijk.offsetBy(-1, 0, 0), column)) {
+                    row.setValue(column, offDiagonal);
+                    modifiedDiagonal -= 1;
+                } else {
+                    boundaryOp(ijk, ijk.offsetBy(-1, 0, 0), source->at(rowNum), modifiedDiagonal);
+                }
+                // -y direction
+                if (vectorIdx.probeValue(ijk.offsetBy(0, -1, 0), column)) {
+                    row.setValue(column, offDiagonal);
+                    modifiedDiagonal -= 1;
+                } else {
+                    boundaryOp(ijk, ijk.offsetBy(0, -1, 0), source->at(rowNum), modifiedDiagonal);
+                }
+                // -z direction
+                if (vectorIdx.probeValue(ijk.offsetBy(0, 0, -1), column)) {
+                    row.setValue(column, offDiagonal);
+                    modifiedDiagonal -= 1;
+                } else {
+                    boundaryOp(ijk, ijk.offsetBy(0, 0, -1), source->at(rowNum), modifiedDiagonal);
+                }
+                // +z direction
+                if (vectorIdx.probeValue(ijk.offsetBy(0, 0, 1), column)) {
+                    row.setValue(column, offDiagonal);
+                    modifiedDiagonal -= 1;
+                } else {
+                    boundaryOp(ijk, ijk.offsetBy(0, 0, 1), source->at(rowNum), modifiedDiagonal);
+                }
+                // +y direction
+                if (vectorIdx.probeValue(ijk.offsetBy(0, 1, 0), column)) {
+                    row.setValue(column, offDiagonal);
+                    modifiedDiagonal -= 1;
+                } else {
+                    boundaryOp(ijk, ijk.offsetBy(0, 1, 0), source->at(rowNum), modifiedDiagonal);
+                }
+                // +x direction
+                if (vectorIdx.probeValue(ijk.offsetBy(1, 0, 0), column)) {
+                    row.setValue(column, offDiagonal);
+                    modifiedDiagonal -= 1;
+                } else {
+                    boundaryOp(ijk, ijk.offsetBy(1, 0, 0), source->at(rowNum), modifiedDiagonal);
+                }
+                // diagonal (written last, once all six directions have contributed)
+                row.setValue(rowNum, modifiedDiagonal);
+            }
+        } // end loop over voxels
+    }
+};
+
+} // namespace internal
+
+
+template<typename BoolTreeType>
+inline LaplacianMatrix::Ptr
+createISLaplacian(const typename BoolTreeType::template ValueConverter<VIndex>::Type& idxTree,
+    const BoolTreeType& interiorMask)
+{
+    // DirichletOp ignores the source term, so this vector only satisfies the signature.
+    math::pcg::Vector<LaplacianMatrix::ValueType> scratchSource(
+        static_cast<math::pcg::SizeType>(idxTree.activeVoxelCount()));
+    return createISLaplacianWithBoundaryConditions(idxTree, interiorMask,
+        internal::DirichletOp<LaplacianMatrix::ValueType>(), scratchSource);
+}
+
+
+template<typename BoolTreeType, typename BoundaryOp>
+inline LaplacianMatrix::Ptr
+createISLaplacianWithBoundaryConditions(
+    const typename BoolTreeType::template ValueConverter<VIndex>::Type& idxTree,
+    const BoolTreeType& interiorMask,
+    const BoundaryOp& boundaryOp,
+    typename math::pcg::Vector<LaplacianMatrix::ValueType>& source)
+{
+    typedef typename BoolTreeType::template ValueConverter<VIndex>::Type IndexTreeT;
+    typedef internal::ISLaplacianOp<BoolTreeType, BoundaryOp>           AssembleOpT;
+
+    // Size the matrix by the number of active voxels in the index tree
+    // (each active voxel is one degree of freedom).
+    const Index64 dofCount = idxTree.activeVoxelCount();
+    LaplacianMatrix::Ptr matrix(
+        new LaplacianMatrix(static_cast<math::pcg::SizeType>(dofCount)));
+    LaplacianMatrix& laplacian = *matrix;
+
+    // Fill in the rows leaf by leaf using a second-order, 7-point CD stencil.
+    // boundaryOp is consulted for neighbors that lie outside the index tree and
+    // receives the matching entry of the source vector.
+    tree::LeafManager<const IndexTreeT> indexLeaves(idxTree);
+    indexLeaves.foreach(AssembleOpT(laplacian, idxTree, interiorMask, boundaryOp, source));
+
+    return matrix;
+}
+
+
+////////////////////////////////////////
+
+
+template<typename TreeType>
+inline typename TreeType::Ptr
+solve(const TreeType& inTree, math::pcg::State& state)
+{
+    util::NullInterrupter nullInterrupter; // stands in for the interrupter the caller omitted
+    return solve(inTree, state, nullInterrupter);
+}
+
+
+template<typename TreeType, typename Interrupter>
+inline typename TreeType::Ptr
+solve(const TreeType& inTree, math::pcg::State& state, Interrupter& interrupter)
+{
+    internal::DirichletOp<LaplacianMatrix::ValueType> dirichlet; // built-in boundary functor
+    return solveWithBoundaryConditions(inTree, dirichlet, state, interrupter);
+}
+
+
+template<typename TreeType, typename BoundaryOp, typename Interrupter>
+inline typename TreeType::Ptr
+solveWithBoundaryConditions(const TreeType& inTree, const BoundaryOp& boundaryOp,
+    math::pcg::State& state, Interrupter& interrupter)
+{
+    typedef math::pcg::IncompleteCholeskyPreconditioner<LaplacianMatrix> CholeskyT; // default
+    return solveWithBoundaryConditionsAndPreconditioner<CholeskyT>(
+        inTree, boundaryOp, state, interrupter);
+}
+
+
+template<typename PreconditionerType, typename TreeType, typename BoundaryOp, typename Interrupter>
+inline typename TreeType::Ptr
+solveWithBoundaryConditionsAndPreconditioner(const TreeType& inTree,
+    const BoundaryOp& boundaryOp, math::pcg::State& state, Interrupter& interrupter)
+{
+    // The input tree is passed twice: once as the source and once as the domain mask.
+    return solveWithBoundaryConditionsAndPreconditioner<PreconditionerType>(
+        /*source=*/inTree, /*domainMask=*/inTree, boundaryOp, state, interrupter);
+}
+
+template<typename PreconditionerType, typename TreeType, typename DomainTreeType, typename BoundaryOp, typename Interrupter>
+inline typename TreeType::Ptr
+solveWithBoundaryConditionsAndPreconditioner(const TreeType& inTree,
+                                             const DomainTreeType& domainMask,
+                                             const BoundaryOp& boundaryOp,
+                                             math::pcg::State& state, Interrupter& interrupter)
+{
+    typedef typename TreeType::ValueType                              OutT;
+    typedef LaplacianMatrix::ValueType                                ScalarT;
+    typedef typename math::pcg::Vector<ScalarT>                       VecT;
+    typedef typename TreeType::template ValueConverter<VIndex>::Type  IndexTreeT;
+    typedef typename TreeType::template ValueConverter<bool>::Type    BoolTreeT;
+
+    // 1. Number the active voxels of the domain mask; voxel n maps to vector element n.
+    typename IndexTreeT::ConstPtr indexTree = createIndexTree(domainMask);
+    const IndexTreeT& indices = *indexTree;
+
+    // 2. Gather the right-hand side from the input tree.
+    typename VecT::Ptr rhs = createVectorFromTree<ScalarT>(inTree, indices);
+
+    // 3. Interior voxels: the indexed voxels that survive one face-neighbor erosion.
+    typename BoolTreeT::Ptr interior(new BoolTreeT(indices, /*background=*/false, TopologyCopy()));
+    tools::erodeVoxels(*interior, /*iterations=*/1, tools::NN_FACE);
+
+    // 4. Assemble the Laplacian; boundaryOp may adjust entries of rhs along the way.
+    LaplacianMatrix::Ptr matrix =
+        createISLaplacianWithBoundaryConditions(indices, *interior, boundaryOp, *rhs);
+
+    // 5. The matrix is negative-definite, so solve (-M) x = (-b) instead.
+    matrix->scale(-1.0);
+    rhs->scale(-1.0);
+    typename VecT::Ptr solution(new VecT(rhs->size(), zeroVal<ScalarT>()));
+
+    // Use the requested preconditioner unless it reports itself invalid; then fall back to Jacobi.
+    typename math::pcg::Preconditioner<ScalarT>::Ptr precond(new PreconditionerType(*matrix));
+    if (!precond->isValid()) {
+        precond.reset(new math::pcg::JacobiPreconditioner<LaplacianMatrix>(*matrix));
+    }
+
+    state = math::pcg::solve(*matrix, *rhs, *solution, *precond, interrupter, state);
+
+    // 6. Scatter the solution back onto the indexed voxels.
+    /// @todo if (state.success) ... ?
+    return createTreeFromVector<OutT>(*solution, indices, /*background=*/zeroVal<OutT>());
+}
+
+} // namespace poisson
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_POISSONSOLVER_HAS_BEEN_INCLUDED
+
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
diff --git a/nuparu/include/openvdb_new/tools/Prune.h b/nuparu/include/openvdb_new/tools/Prune.h
new file mode 100644
index 00000000..1c6d48d8
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/Prune.h
@@ -0,0 +1,417 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file Prune.h
+///
+/// @brief Defines various multi-threaded utility functions for pruning trees
+///
+/// @author Ken Museth
+
+#ifndef OPENVDB_TOOLS_PRUNE_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_PRUNE_HAS_BEEN_INCLUDED
+
+#include <boost/utility/enable_if.hpp>
+#include <boost/static_assert.hpp>
+#include <boost/type_traits/is_floating_point.hpp>
+
+#include <openvdb/math/Math.h> // for isNegative and negative
+#include <openvdb/Types.h> // for Index typedef
+#include <openvdb/Types.h>
+#include <openvdb/tree/NodeManager.h>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+/// @brief Reduce the memory footprint of a @a tree by replacing with tiles
+/// any nodes whose values are all the same (optionally to within a tolerance)
+/// and have the same active state.
+///
+/// @note For trees with floating-point values, a child node with (approximately)
+/// constant values is replaced with a tile whose value is the average of the
+/// extrema values in that child node. Otherwise the first value encountered
+/// in the child node is used.
+///
+/// @details Implemented by applying a TolerancePruneOp to the nodes of the
+/// tree, bottom-up, through a tree::NodeManager.
+///
+/// @param tree       the tree to be pruned
+/// @param tolerance  tolerance within which values are considered to be equal
+///                   (defaults to the zero value of the tree's value type)
+/// @param threaded   enable or disable threading (threading is enabled by default)
+/// @param grainSize  used to control the threading granularity (default is 1)
+template<typename TreeT>
+inline void
+prune(TreeT& tree,
+      typename TreeT::ValueType tolerance = zeroVal<typename TreeT::ValueType>(),
+      bool threaded = true,
+      size_t grainSize = 1);
+
+
+/// @brief Reduce the memory footprint of a @a tree by replacing with tiles
+/// any non-leaf nodes whose values are all the same (optionally to within a tolerance)
+/// and have the same active state.
+///
+/// @details The implementation differs from prune() only in the number of
+/// levels handed to the tree::NodeManager (TreeT::DEPTH-3 instead of
+/// TreeT::DEPTH-2); the same TolerancePruneOp is applied bottom-up.
+///
+/// @param tree       the tree to be pruned
+/// @param tolerance  tolerance within which values are considered to be equal
+///                   (defaults to the zero value of the tree's value type)
+/// @param threaded   enable or disable threading (threading is enabled by default)
+/// @param grainSize  used to control the threading granularity (default is 1)
+template<typename TreeT>
+inline void
+pruneTiles(TreeT& tree,
+           typename TreeT::ValueType tolerance = zeroVal<typename TreeT::ValueType>(),
+           bool threaded = true,
+           size_t grainSize = 1);
+
+
+/// @brief Reduce the memory footprint of a @a tree by replacing with
+/// background tiles any nodes whose values are all inactive.
+///
+/// @details The replacement tiles are inactive and hold the background value
+/// of the @a tree. Use pruneInactiveWithValue() to assign a different value.
+///
+/// @param tree       the tree to be pruned
+/// @param threaded   enable or disable threading (threading is enabled by default)
+/// @param grainSize  used to control the threading granularity (default is 1)
+template<typename TreeT>
+inline void
+pruneInactive(TreeT& tree, bool threaded = true, size_t grainSize = 1);
+
+
+/// @brief Reduce the memory footprint of a @a tree by replacing any nodes
+/// whose values are all inactive with tiles of the given @a value.
+///
+/// @details The replacement tiles are inactive. This is the same operation
+/// as pruneInactive(), except that the tile value is supplied by the caller
+/// instead of being taken from the background of the @a tree.
+///
+/// @param tree       the tree to be pruned
+/// @param value      value assigned to inactive tiles created during pruning
+/// @param threaded   enable or disable threading (threading is enabled by default)
+/// @param grainSize  used to control the threading granularity (default is 1)
+template<typename TreeT>
+inline void
+pruneInactiveWithValue(
+    TreeT& tree,
+    const typename TreeT::ValueType& value,
+    bool threaded = true,
+    size_t grainSize = 1);
+
+
+/// @brief Reduce the memory footprint of a @a tree by replacing nodes
+/// whose values are all inactive with inactive tiles. The tile value is
+/// selected by the sign of the first value encountered in the (inactive)
+/// child: the negative of the background if that value is negative (as
+/// defined by math::isNegative), and the background itself otherwise.
+/// @details This method is faster than tolerance-based prune and
+/// useful for narrow-band level set applications where inactive
+/// values are limited to either an inside or an outside value.
+///
+/// @param tree       the tree to be pruned
+/// @param threaded   enable or disable threading (threading is enabled by default)
+/// @param grainSize  used to control the threading granularity (default is 1)
+///
+/// @throw ValueError if the background of the @a tree is negative (as defined by math::isNegative)
+template<typename TreeT>
+inline void
+pruneLevelSet(TreeT& tree,
+              bool threaded = true,
+              size_t grainSize = 1);
+
+
+/// @brief Reduce the memory footprint of a @a tree by replacing nodes whose voxel values
+/// are all inactive with inactive tiles having the value @a insideWidth
+/// if the first value of the node is negative and @a outsideWidth otherwise.
+///
+/// @details This method is faster than tolerance-based prune and
+/// useful for narrow-band level set applications where inactive
+/// values are limited to either an inside or an outside value.
+/// The two widths are used as given (no absolute value or negation is
+/// applied), which is why @a insideWidth must already be negative.
+///
+/// @param tree          the tree to be pruned
+/// @param outsideWidth  the width of the outside of the narrow band (must not be negative)
+/// @param insideWidth   the width of the inside of the narrow band (must be negative)
+/// @param threaded      enable or disable threading (threading is enabled by default)
+/// @param grainSize     used to control the threading granularity (default is 1)
+///
+/// @throw ValueError if @a outsideWidth is negative or @a insideWidth is
+/// not negative (as defined by math::isNegative).
+template<typename TreeT>
+inline void
+pruneLevelSet(TreeT& tree,
+              const typename TreeT::ValueType& outsideWidth,
+              const typename TreeT::ValueType& insideWidth,
+              bool threaded = true,
+              size_t grainSize = 1);
+
+
+////////////////////////////////////////////////
+
+
+/// @brief Node functor that replaces child nodes containing only inactive
+/// values with inactive tiles of a single, fixed value.
+///
+/// @details Nodes whose level is not above @c TerminationLevel are left
+/// untouched, as are leaf nodes.
+template<typename TreeT, Index TerminationLevel = 0>
+class InactivePruneOp
+{
+public:
+    typedef typename TreeT::ValueType    ValueT;
+    typedef typename TreeT::RootNodeType RootT;
+    typedef typename TreeT::LeafNodeType LeafT;
+    BOOST_STATIC_ASSERT(RootT::LEVEL > TerminationLevel);
+
+    /// Create tiles that hold the background value of @a tree.
+    InactivePruneOp(TreeT& tree) : mValue(tree.background())
+    {
+        // Reset the accessor caches, since they may refer to nodes that get pruned.
+        tree.clearAllAccessors();
+    }
+
+    /// Create tiles that hold the caller-supplied value @a v.
+    InactivePruneOp(TreeT& tree, const ValueT& v) : mValue(v)
+    {
+        // Reset the accessor caches, since they may refer to nodes that get pruned.
+        tree.clearAllAccessors();
+    }
+
+    /// Root node: prune its inactive children, then drop background tiles.
+    void operator()(RootT& root) const
+    {
+        typename RootT::ChildOnIter child = root.beginChildOn();
+        while (child) {
+            if (child->isInactive()) root.addTile(child.getCoord(), mValue, false);
+            ++child;
+        }
+        root.eraseBackgroundTiles();
+    }
+
+    /// Internal node: prune its inactive children.
+    template<typename NodeT>
+    void operator()(NodeT& node) const
+    {
+        // Nodes at or below the termination level are not processed.
+        if (!(NodeT::LEVEL > TerminationLevel)) return;
+
+        typename NodeT::ChildOnIter child = node.beginChildOn();
+        while (child) {
+            if (child->isInactive()) node.addTile(child.pos(), mValue, false);
+            ++child;
+        }
+    }
+
+    /// Leaf node: there is nothing to prune at this level.
+    void operator()(LeafT&) const {}
+
+private:
+
+    const ValueT mValue; // value assigned to every tile created by this functor
+};// InactivePruneOp
+
+
+/// @brief Node functor that replaces child nodes whose values are constant
+/// (to within a tolerance) with a tile of matching value and active state.
+///
+/// @details Nodes whose level is not above @c TerminationLevel are left
+/// untouched, as are leaf nodes.
+template<typename TreeT, Index TerminationLevel = 0>
+class TolerancePruneOp
+{
+public:
+    typedef typename TreeT::ValueType    ValueT;
+    typedef typename TreeT::RootNodeType RootT;
+    typedef typename TreeT::LeafNodeType LeafT;
+    BOOST_STATIC_ASSERT(RootT::LEVEL > TerminationLevel);
+
+    /// @param tree  the tree about to be pruned
+    /// @param t     tolerance forwarded to the nodes' isConstant() test
+    TolerancePruneOp(TreeT& tree, const ValueT& t) : mTolerance(t)
+    {
+        // Reset the accessor caches, since they may refer to nodes that get pruned.
+        tree.clearAllAccessors();
+    }
+
+    /// Leaf node: there is nothing to prune at this level.
+    inline void operator()(LeafT&) const {}
+
+    /// Internal node: replace its constant children with tiles.
+    template<typename NodeT>
+    inline void operator()(NodeT& node) const
+    {
+        // Nodes at or below the termination level are not processed.
+        if (!(NodeT::LEVEL > TerminationLevel)) return;
+
+        ValueT tileValue;
+        bool   tileState;
+        typename NodeT::ChildOnIter child = node.beginChildOn();
+        while (child) {
+            if (this->isConstant(*child, tileValue, tileState)) {
+                node.addTile(child.pos(), tileValue, tileState);
+            }
+            ++child;
+        }
+    }
+
+    /// Root node: replace its constant children with tiles, then drop background tiles.
+    inline void operator()(RootT& root) const
+    {
+        ValueT tileValue;
+        bool   tileState;
+        typename RootT::ChildOnIter child = root.beginChildOn();
+        while (child) {
+            if (this->isConstant(*child, tileValue, tileState)) {
+                root.addTile(child.getCoord(), tileValue, tileState);
+            }
+            ++child;
+        }
+        root.eraseBackgroundTiles();
+    }
+
+private:
+
+    // Floating-point value types: a constant node yields the mean of the
+    // two values reported by the node's four-argument isConstant().
+    template<typename NodeT>
+    inline
+    typename boost::enable_if<boost::is_floating_point<typename NodeT::ValueType>, bool>::type
+    isConstant(const NodeT& node, ValueT& value, bool& state) const
+    {
+        ValueT extremum;
+        if (!node.isConstant(value, extremum, state, mTolerance)) return false;
+        value = ValueT(0.5f)*(value + extremum);
+        return true;
+    }
+
+    // All other value types: a constant node yields the value reported by
+    // the node's three-argument isConstant() (the first value encountered).
+    template<typename NodeT>
+    inline
+    typename boost::disable_if<boost::is_floating_point<typename NodeT::ValueType>, bool>::type
+    isConstant(const NodeT& node, ValueT& value, bool& state) const
+    {
+        const bool constant = node.isConstant(value, state, mTolerance);
+        return constant;
+    }
+
+    const ValueT mTolerance; // tolerance within which values count as equal
+};// TolerancePruneOp
+
+
+/// @brief Node functor that replaces child nodes containing only inactive
+/// values with inactive tiles holding either an inside or an outside value.
+///
+/// @details The tile value is selected by the sign (math::isNegative) of the
+/// first value of the pruned child. Nodes whose level is not above
+/// @c TerminationLevel are left untouched, as are leaf nodes.
+template<typename TreeT, Index TerminationLevel = 0>
+class LevelSetPruneOp
+{
+public:
+    typedef typename TreeT::ValueType    ValueT;
+    typedef typename TreeT::RootNodeType RootT;
+    typedef typename TreeT::LeafNodeType LeafT;
+    BOOST_STATIC_ASSERT(RootT::LEVEL > TerminationLevel);
+
+    /// @brief Use the background of @a tree as the outside value and its
+    /// negative as the inside value.
+    /// @throw ValueError if the background of @a tree is negative
+    LevelSetPruneOp(TreeT& tree)
+        : mOutside(tree.background())
+        , mInside(math::negative(mOutside))
+    {
+        const bool negativeBackground = math::isNegative(mOutside);
+        if (negativeBackground) {
+            OPENVDB_THROW(ValueError,
+                          "LevelSetPruneOp: the background value cannot be negative!");
+        }
+        // Reset the accessor caches, since they may refer to nodes that get pruned.
+        tree.clearAllAccessors();
+    }
+
+    /// @brief Use explicitly given outside and inside values.
+    /// @throw ValueError if @a outside is negative or @a inside is not negative
+    LevelSetPruneOp(TreeT& tree, const ValueT& outside, const ValueT& inside)
+        : mOutside(outside)
+        , mInside(inside)
+    {
+        const bool negativeOutside = math::isNegative(mOutside);
+        if (negativeOutside) {
+            OPENVDB_THROW(ValueError,
+                          "LevelSetPruneOp: the outside value cannot be negative!");
+        }
+        const bool negativeInside = math::isNegative(mInside);
+        if (!negativeInside) {
+            OPENVDB_THROW(ValueError,
+                          "LevelSetPruneOp: the inside value must be negative!");
+        }
+        // Reset the accessor caches, since they may refer to nodes that get pruned.
+        tree.clearAllAccessors();
+    }
+
+    /// Root node: prune its inactive children, then drop background tiles.
+    void operator()(RootT& root) const
+    {
+        typename RootT::ChildOnIter child = root.beginChildOn();
+        while (child) {
+            if (child->isInactive()) {
+                root.addTile(child.getCoord(), this->getTileValue(child), false);
+            }
+            ++child;
+        }
+        root.eraseBackgroundTiles();
+    }
+
+    /// Internal node: prune its inactive children.
+    template<typename NodeT>
+    void operator()(NodeT& node) const
+    {
+        // Nodes at or below the termination level are not processed.
+        if (!(NodeT::LEVEL > TerminationLevel)) return;
+
+        typename NodeT::ChildOnIter child = node.beginChildOn();
+        while (child) {
+            if (child->isInactive()) {
+                node.addTile(child.pos(), this->getTileValue(child), false);
+            }
+            ++child;
+        }
+    }
+
+    /// Leaf node: there is nothing to prune at this level.
+    void operator()(LeafT&) const {}
+
+private:
+    // Pick the inside value when the first value of the child that the
+    // iterator refers to is negative, and the outside value otherwise.
+    template <typename IterT>
+    inline ValueT getTileValue(const IterT& iter) const
+    {
+        if (math::isNegative(iter->getFirstValue())) return mInside;
+        return mOutside;
+    }
+
+    // mOutside is declared first because the single-argument constructor
+    // initializes mInside from it.
+    const ValueT mOutside, mInside;
+};// LevelSetPruneOp
+
+
+/// Tolerance-based pruning: a TolerancePruneOp is applied bottom-up to the
+/// nodes collected by a tree::NodeManager spanning TreeT::DEPTH-2 levels.
+template<typename TreeT>
+inline void
+prune(TreeT& tree, typename TreeT::ValueType tol, bool threaded, size_t grainSize)
+{
+    typedef tree::NodeManager<TreeT, TreeT::DEPTH-2> NodeManagerT;
+    typedef TolerancePruneOp<TreeT>                  PruneOpT;
+
+    NodeManagerT nodeManager(tree);
+    PruneOpT pruneOp(tree, tol);
+    nodeManager.foreachBottomUp(pruneOp, threaded, grainSize);
+}
+
+
+/// Tolerance-based pruning of non-leaf nodes: a TolerancePruneOp is applied
+/// bottom-up to the nodes collected by a tree::NodeManager spanning
+/// TreeT::DEPTH-3 levels.
+template<typename TreeT>
+inline void
+pruneTiles(TreeT& tree, typename TreeT::ValueType tol, bool threaded, size_t grainSize)
+{
+    typedef tree::NodeManager<TreeT, TreeT::DEPTH-3> NodeManagerT;
+    typedef TolerancePruneOp<TreeT>                  PruneOpT;
+
+    NodeManagerT nodeManager(tree);
+    PruneOpT pruneOp(tree, tol);
+    nodeManager.foreachBottomUp(pruneOp, threaded, grainSize);
+}
+
+
+/// Pruning of inactive nodes to background tiles: an InactivePruneOp built
+/// from the tree alone is applied bottom-up through a tree::NodeManager.
+template<typename TreeT>
+inline void
+pruneInactive(TreeT& tree, bool threaded, size_t grainSize)
+{
+    typedef tree::NodeManager<TreeT, TreeT::DEPTH-2> NodeManagerT;
+    typedef InactivePruneOp<TreeT>                   PruneOpT;
+
+    NodeManagerT nodeManager(tree);
+    PruneOpT pruneOp(tree);
+    nodeManager.foreachBottomUp(pruneOp, threaded, grainSize);
+}
+
+
+/// Pruning of inactive nodes to tiles of value @a v: an InactivePruneOp
+/// carrying that value is applied bottom-up through a tree::NodeManager.
+template<typename TreeT>
+inline void
+pruneInactiveWithValue(TreeT& tree, const typename TreeT::ValueType& v,
+    bool threaded, size_t grainSize)
+{
+    typedef tree::NodeManager<TreeT, TreeT::DEPTH-2> NodeManagerT;
+    typedef InactivePruneOp<TreeT>                   PruneOpT;
+
+    NodeManagerT nodeManager(tree);
+    PruneOpT pruneOp(tree, v);
+    nodeManager.foreachBottomUp(pruneOp, threaded, grainSize);
+}
+
+
+/// Level-set pruning with explicit @a outside and @a inside tile values: a
+/// LevelSetPruneOp (which validates both values) is applied bottom-up
+/// through a tree::NodeManager.
+template<typename TreeT>
+inline void
+pruneLevelSet(TreeT& tree,
+              const typename TreeT::ValueType& outside,
+              const typename TreeT::ValueType& inside,
+              bool threaded,
+              size_t grainSize)
+{
+    typedef tree::NodeManager<TreeT, TreeT::DEPTH-2> NodeManagerT;
+    typedef LevelSetPruneOp<TreeT>                   PruneOpT;
+
+    NodeManagerT nodeManager(tree);
+    PruneOpT pruneOp(tree, outside, inside);
+    nodeManager.foreachBottomUp(pruneOp, threaded, grainSize);
+}
+
+
+/// Level-set pruning with tile values derived from the tree's background: a
+/// LevelSetPruneOp built from the tree alone is applied bottom-up through a
+/// tree::NodeManager.
+template<typename TreeT>
+inline void
+pruneLevelSet(TreeT& tree, bool threaded, size_t grainSize)
+{
+    typedef tree::NodeManager<TreeT, TreeT::DEPTH-2> NodeManagerT;
+    typedef LevelSetPruneOp<TreeT>                   PruneOpT;
+
+    NodeManagerT nodeManager(tree);
+    PruneOpT pruneOp(tree);
+    nodeManager.foreachBottomUp(pruneOp, threaded, grainSize);
+}
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_PRUNE_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/RayIntersector.h b/nuparu/include/openvdb_new/tools/RayIntersector.h
new file mode 100644
index 00000000..0f177517
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/RayIntersector.h
@@ -0,0 +1,701 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+///
+/// @file RayIntersector.h
+///
+/// @author Ken Museth
+///
+/// @brief Accelerated intersection of a ray with a narrow-band level
+/// set or a generic (e.g. density) volume. This will of course be
+/// useful for respectively surface and volume rendering.
+///
+/// @details This file defines two main classes,
+/// LevelSetRayIntersector and VolumeRayIntersector, as well as the
+/// three support classes LevelSetHDDA, VolumeHDDA and LinearSearchImpl.
+/// The LevelSetRayIntersector is templated on the LinearSearchImpl class
+/// and calls instances of the LevelSetHDDA class. The reason to split
+/// level set ray intersection into three classes is twofold. First
+/// LevelSetRayIntersector defines the public API for client code and
+/// LinearSearchImpl defines the actual algorithm used for the
+/// ray level-set intersection. In other words, this design allows
+/// the public API to stay fixed while the intersection algorithm
+/// can change without resorting to (slow) virtual methods. Second,
+/// LevelSetHDDA, which implements a hierarchical Digital Differential
+/// Analyzer, relies on partial template specialization, so it has to
+/// be a standalone class (as opposed to a member class of
+/// LevelSetRayIntersector). The VolumeRayIntersector is conceptually
+/// much simpler than the LevelSetRayIntersector, and hence it only
+/// depends on VolumeHDDA, which implements the hierarchical
+/// Digital Differential Analyzer.
+
+
+#ifndef OPENVDB_TOOLS_RAYINTERSECTOR_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_RAYINTERSECTOR_HAS_BEEN_INCLUDED
+
+#include <openvdb/math/DDA.h>
+#include <openvdb/math/Math.h>
+#include <openvdb/math/Ray.h>
+#include <openvdb/math/Stencils.h>
+#include <openvdb/Grid.h>
+#include <openvdb/Types.h>
+#include "Morphology.h"
+#include <boost/utility.hpp>
+#include <boost/type_traits/is_floating_point.hpp>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+// Helper class that implements the actual search of the zero-crossing
+// of the level set along the direction of a ray. This particular
+// implementation uses iterative linear search.
+//
+// It is only forward-declared at this point so that it can be named as the
+// default SearchImplT template argument of LevelSetRayIntersector.
+template<typename GridT, int Iterations = 0, typename RealT = double>
+class LinearSearchImpl;
+
+
+///////////////////////////////////// LevelSetRayIntersector /////////////////////////////////////
+
+
+/// @brief This class provides the public API for intersecting a ray
+/// with a narrow-band level set.
+///
+/// @details It wraps a SearchImplT with a simple public API and
+/// performs the actual hierarchical tree node and voxel traversal.
+/// Every intersects* method first hands the ray to the SearchImplT (which
+/// reports whether the ray hits the bounding box), then runs
+/// math::LevelSetHDDA::test, and only after a successful test reads the
+/// intersection position, normal and/or time back from the SearchImplT.
+///
+/// @warning Use the (default) copy-constructor to make sure each
+/// computational thread has their own instance of this class. This is
+/// important since the SearchImplT contains a ValueAccessor that is
+/// not thread-safe. However copying is very efficient.
+///
+/// @see tools/RayTracer.h for examples of intended usage.
+///
+/// @todo Add TrilinearSearchImpl, as an alternative to LinearSearchImpl,
+/// that performs analytical 3D trilinear intersection tests, i.e., solves
+/// cubic equations. This is slower but also more accurate than the 1D
+/// linear interpolation in LinearSearchImpl.
+template<typename GridT,
+         typename SearchImplT = LinearSearchImpl<GridT>,
+         int NodeLevel = GridT::TreeType::RootNodeType::ChildNodeType::LEVEL,
+         typename RayT = math::Ray<Real> >
+class LevelSetRayIntersector
+{
+public:
+    typedef GridT                         GridType;
+    typedef RayT                          RayType;
+    typedef typename RayT::RealType       RealType;
+    typedef typename RayT::Vec3T          Vec3Type;
+    typedef typename GridT::ValueType     ValueT;
+    typedef typename GridT::TreeType      TreeT;
+
+    BOOST_STATIC_ASSERT( NodeLevel >= -1 && NodeLevel < int(TreeT::DEPTH)-1);
+    BOOST_STATIC_ASSERT(boost::is_floating_point<ValueT>::value);
+
+    /// @brief Constructor
+    /// @param grid level set grid to intersect rays against.
+    /// @param isoValue optional iso-value for the ray-intersection.
+    /// @throw RuntimeError if the grid does not have uniform voxels or
+    /// its grid class is not GRID_LEVEL_SET.
+    LevelSetRayIntersector(const GridT& grid, const ValueT& isoValue = zeroVal<ValueT>())
+        : mTester(grid, isoValue)
+    {
+        if (!grid.hasUniformVoxels() ) {
+            OPENVDB_THROW(RuntimeError,
+                          "LevelSetRayIntersector only supports uniform voxels!");
+        }
+        if (grid.getGridClass() != GRID_LEVEL_SET) {
+            OPENVDB_THROW(RuntimeError,
+                          "LevelSetRayIntersector only supports level sets!"
+                          "\nUse Grid::setGridClass(openvdb::GRID_LEVEL_SET)");
+        }
+    }
+
+    /// @brief Return the iso-value used for ray-intersections
+    const ValueT& getIsoValue() const { return mTester.getIsoValue(); }
+
+    /// @brief Return @c true if the index-space ray intersects the level set.
+    /// @param iRay ray represented in index space.
+    bool intersectsIS(const RayType& iRay) const
+    {
+        if (!mTester.setIndexRay(iRay)) return false;//missed bbox
+        return math::LevelSetHDDA<TreeT, NodeLevel>::test(mTester);
+    }
+
+    /// @brief Return @c true if the index-space ray intersects the level set
+    /// @param iRay  ray represented in index space.
+    /// @param iTime if an intersection was found it is assigned the time of the
+    ///              intersection along the index ray, otherwise it is unchanged.
+    bool intersectsIS(const RayType& iRay, RealType &iTime) const
+    {
+        if (!mTester.setIndexRay(iRay)) return false;//missed bbox
+        if (!math::LevelSetHDDA<TreeT, NodeLevel>::test(mTester)) return false;//missed level set
+        // Read the time only after a successful test, as the other overloads
+        // do, so that it belongs to the intersection found for this ray.
+        iTime = mTester.getIndexTime();
+        return true;
+    }
+
+    /// @brief Return @c true if the index-space ray intersects the level set.
+    /// @param iRay ray represented in index space.
+    /// @param xyz  if an intersection was found it is assigned the
+    ///             intersection point in index space, otherwise it is unchanged.
+    bool intersectsIS(const RayType& iRay, Vec3Type& xyz) const
+    {
+        if (!mTester.setIndexRay(iRay)) return false;//missed bbox
+        if (!math::LevelSetHDDA<TreeT, NodeLevel>::test(mTester)) return false;//missed level set
+        mTester.getIndexPos(xyz);
+        return true;
+    }
+
+    /// @brief Return @c true if the index-space ray intersects the level set.
+    /// @param iRay  ray represented in index space.
+    /// @param xyz   if an intersection was found it is assigned the
+    ///              intersection point in index space, otherwise it is unchanged.
+    /// @param iTime if an intersection was found it is assigned the time of the
+    ///              intersection along the index ray.
+    bool intersectsIS(const RayType& iRay, Vec3Type& xyz, RealType &iTime) const
+    {
+        if (!mTester.setIndexRay(iRay)) return false;//missed bbox
+        if (!math::LevelSetHDDA<TreeT, NodeLevel>::test(mTester)) return false;//missed level set
+        mTester.getIndexPos(xyz);
+        iTime = mTester.getIndexTime();
+        return true;
+    }
+
+    /// @brief Return @c true if the world-space ray intersects the level set.
+    /// @param wRay   ray represented in world space.
+    bool intersectsWS(const RayType& wRay) const
+    {
+        if (!mTester.setWorldRay(wRay)) return false;//missed bbox
+        return math::LevelSetHDDA<TreeT, NodeLevel>::test(mTester);
+    }
+
+    /// @brief Return @c true if the world-space ray intersects the level set.
+    /// @param wRay   ray represented in world space.
+    /// @param wTime  if an intersection was found it is assigned the time of the
+    ///               intersection along the world ray, otherwise it is unchanged.
+    bool intersectsWS(const RayType& wRay, RealType &wTime) const
+    {
+        if (!mTester.setWorldRay(wRay)) return false;//missed bbox
+        if (!math::LevelSetHDDA<TreeT, NodeLevel>::test(mTester)) return false;//missed level set
+        // Read the time only after a successful test, as the other overloads
+        // do, so that it belongs to the intersection found for this ray.
+        wTime = mTester.getWorldTime();
+        return true;
+    }
+
+    /// @brief Return @c true if the world-space ray intersects the level set.
+    /// @param wRay   ray represented in world space.
+    /// @param world  if an intersection was found it is assigned the
+    ///               intersection point in world space, otherwise it is unchanged
+    bool intersectsWS(const RayType& wRay, Vec3Type& world) const
+    {
+        if (!mTester.setWorldRay(wRay)) return false;//missed bbox
+        if (!math::LevelSetHDDA<TreeT, NodeLevel>::test(mTester)) return false;//missed level set
+        mTester.getWorldPos(world);
+        return true;
+    }
+
+    /// @brief Return @c true if the world-space ray intersects the level set.
+    /// @param wRay   ray represented in world space.
+    /// @param world  if an intersection was found it is assigned the
+    ///               intersection point in world space, otherwise it is unchanged.
+    /// @param wTime  if an intersection was found it is assigned the time of the
+    ///               intersection along the world ray.
+    bool intersectsWS(const RayType& wRay, Vec3Type& world, RealType &wTime) const
+    {
+        if (!mTester.setWorldRay(wRay)) return false;//missed bbox
+        if (!math::LevelSetHDDA<TreeT, NodeLevel>::test(mTester)) return false;//missed level set
+        mTester.getWorldPos(world);
+        wTime = mTester.getWorldTime();
+        return true;
+    }
+
+    /// @brief Return @c true if the world-space ray intersects the level set.
+    /// @param wRay   ray represented in world space.
+    /// @param world  if an intersection was found it is assigned the
+    ///               intersection point in world space, otherwise it is unchanged.
+    /// @param normal if an intersection was found it is assigned the normal
+    ///               of the level set surface in world space, otherwise it is unchanged.
+    bool intersectsWS(const RayType& wRay, Vec3Type& world, Vec3Type& normal) const
+    {
+        if (!mTester.setWorldRay(wRay)) return false;//missed bbox
+        if (!math::LevelSetHDDA<TreeT, NodeLevel>::test(mTester)) return false;//missed level set
+        mTester.getWorldPosAndNml(world, normal);
+        return true;
+    }
+
+    /// @brief Return @c true if the world-space ray intersects the level set.
+    /// @param wRay   ray represented in world space.
+    /// @param world  if an intersection was found it is assigned the
+    ///               intersection point in world space, otherwise it is unchanged.
+    /// @param normal if an intersection was found it is assigned the normal
+    ///               of the level set surface in world space, otherwise it is unchanged.
+    /// @param wTime  if an intersection was found it is assigned the time of the
+    ///               intersection along the world ray.
+    bool intersectsWS(const RayType& wRay, Vec3Type& world, Vec3Type& normal, RealType &wTime) const
+    {
+        if (!mTester.setWorldRay(wRay)) return false;//missed bbox
+        if (!math::LevelSetHDDA<TreeT, NodeLevel>::test(mTester)) return false;//missed level set
+        mTester.getWorldPosAndNml(world, normal);
+        wTime = mTester.getWorldTime();
+        return true;
+    }
+
+private:
+
+    // Declared mutable because the const intersects* methods above update the
+    // search state (they set the ray on it and run the intersection test).
+    mutable SearchImplT mTester;
+
+};// LevelSetRayIntersector
+
+
+////////////////////////////////////// VolumeRayIntersector //////////////////////////////////////
+
+
+/// @brief This class provides the public API for intersecting a ray
+/// with a generic (e.g. density) volume.
+/// @details Internally it performs the actual hierarchical tree node traversal.
+/// @warning Use the (default) copy-constructor to make sure each
+/// computational thread has their own instance of this class. This is
+/// important since it contains a ValueAccessor that is
+/// not thread-safe and a CoordBBox of the active voxels that should
+/// not be re-computed for each thread. However copying is very efficient.
+/// @par Example:
+/// @code
+/// // Create an instance for the master thread
+/// VolumeRayIntersector inter(grid);
+/// // For each additional thread use the copy constructor. This
+/// // amortizes the overhead of computing the bbox of the active voxels!
+/// VolumeRayIntersector inter2(inter);
+/// // Before each ray-traversal set the index ray.
+/// inter.setIndexRay(ray);
+/// // or world ray
+/// inter.setWorldRay(ray);
+/// // Now you can begin the ray-marching using consecutive calls to VolumeRayIntersector::march
+/// double t0=0, t1=0;// note the entry and exit times are with respect to the INDEX ray
+/// while ( inter.march(t0, t1) ) {
+///   // perform line-integration between t0 and t1
+/// }
+/// @endcode
+template<typename GridT,
+         int NodeLevel = GridT::TreeType::RootNodeType::ChildNodeType::LEVEL,
+         typename RayT = math::Ray<Real> >
+class VolumeRayIntersector
+{
+public:
+    typedef GridT                         GridType;
+    typedef RayT                          RayType;
+    typedef typename RayT::RealType       RealType;
+    typedef typename GridT::TreeType::RootNodeType RootType;
+    typedef tree::Tree<typename RootType::template ValueConverter<bool>::Type> TreeT;
+
+    BOOST_STATIC_ASSERT( NodeLevel >= 0 && NodeLevel < int(TreeT::DEPTH)-1);
+
+    /// @brief Grid constructor
+    /// @param grid Generic grid to intersect rays against.
+    /// @param dilationCount The number of voxel dilations performed
+    /// on (a boolean copy of) the input grid. This allows the
+    /// intersector to account for the size of interpolation kernels
+    /// in client code.
+    /// @throw RuntimeError if the voxels of the grid are not uniform
+    /// or the grid is empty.
+    VolumeRayIntersector(const GridT& grid, int dilationCount = 0)
+        : mIsMaster(true)
+        , mTree(new TreeT(grid.tree(), false, TopologyCopy()))
+        , mGrid(&grid)
+        , mAccessor(*mTree)
+    {
+        if (!grid.hasUniformVoxels() ) {
+            OPENVDB_THROW(RuntimeError,
+                          "VolumeRayIntersector only supports uniform voxels!");
+        }
+        if ( grid.empty() ) {
+            OPENVDB_THROW(RuntimeError, "LinearSearchImpl does not supports empty grids");
+        }
+
+        // Dilate active voxels to better account for the size of interpolation kernels
+        tools::dilateVoxels(*mTree, dilationCount);
+
+        mTree->root().evalActiveBoundingBox(mBBox, /*visit individual voxels*/false);
+
+        mBBox.max().offset(1);//padding so the bbox of a node becomes (origin,origin + node_dim)
+    }
+
+    /// @brief Grid and BBox constructor
+    /// @param grid Generic grid to intersect rays against.
+    /// @param bbox The axis-aligned bounding-box in the index space of the grid.
+    /// @warning It is assumed that bbox = (min, min + dim) where min denotes
+    /// the smallest grid coordinates and dim is the integer length of the bbox.
+    /// @throw RuntimeError if the voxels of the grid are not uniform
+    /// or the grid is empty.
+    VolumeRayIntersector(const GridT& grid, const math::CoordBBox& bbox)
+        : mIsMaster(true)
+        , mTree(new TreeT(grid.tree(), false, TopologyCopy()))
+        , mGrid(&grid)
+        , mAccessor(*mTree)
+        , mBBox(bbox)
+    {
+        if (!grid.hasUniformVoxels() ) {
+            delete mTree;// the destructor is not run when a constructor throws
+            OPENVDB_THROW(RuntimeError, "VolumeRayIntersector only supports uniform voxels!");
+        } else if ( grid.empty() ) {
+            delete mTree;// the destructor is not run when a constructor throws
+            OPENVDB_THROW(RuntimeError, "VolumeRayIntersector does not support empty grids");
+        }
+    }
+
+    /// @brief Shallow copy constructor
+    /// @warning This copy constructor creates shallow copies of data
+    /// members of the instance passed as the argument. For
+    /// performance reasons we are not using shared pointers (their
+    /// mutex-lock impairs multi-threading).
+    VolumeRayIntersector(const VolumeRayIntersector& other)
+        : mIsMaster(false)//a copy is never the master, so its destructor leaves mTree alone
+        , mTree(other.mTree)//shallow copy
+        , mGrid(other.mGrid)//shallow copy
+        , mAccessor(*mTree)//initialize new (vs deep copy)
+        , mRay(other.mRay)//deep copy
+        , mTmax(other.mTmax)//deep copy
+        , mBBox(other.mBBox)//deep copy
+    {//mHDDA is not listed above, i.e. it is default-constructed rather than copied
+    }
+
+    /// @brief Destructor
+    ~VolumeRayIntersector() { if (mIsMaster) delete mTree; }
+
+    /// @brief Return @c false if the index ray misses the bbox of the grid.
+    /// @param iRay Ray represented in index space.
+    /// @warning Call this method (or setWorldRay) before the ray
+    /// traversal starts and use the return value to decide if further
+    /// marching is required.
+    inline bool setIndexRay(const RayT& iRay)
+    {
+        mRay = iRay;
+        if (!mRay.clip(mBBox)) return false;//the ray misses the bbox; mTmax is left untouched
+        mTmax = mRay.t1();//remember the end time of the clipped ray
+        return true;
+    }
+
+    /// @brief Return @c false if the world ray misses the bbox of the grid.
+    /// @param wRay Ray represented in world space.
+    /// @warning Call this method (or setIndexRay) before the ray
+    /// traversal starts and use the return value to decide if further
+    /// marching is required.
+    /// @details Since hit times are computed with respect to the ray
+    /// represented in index space of the current grid, it is
+    /// recommended that either the client code uses getIndexPos to
+    /// compute index position from hit times or alternatively keeps
+    /// an instance of the index ray and instead uses setIndexRay to
+    /// initialize the ray.
+    inline bool setWorldRay(const RayT& wRay)
+    {
+        return this->setIndexRay(wRay.worldToIndex(*mGrid));
+    }
+
+    inline typename RayT::TimeSpan march()//returns the time span produced by mHDDA.march()
+    {
+        const typename RayT::TimeSpan t = mHDDA.march(mRay, mAccessor);
+        if (t.t1>0) mRay.setTimes(t.t1 + math::Delta<RealType>::value(), mTmax);//resume after t1
+        return t;
+    }
+
+    /// @brief Return @c true if the ray intersects active values,
+    /// i.e. either active voxels or tiles. Only when a hit is
+    /// detected are t0 and t1 updated with the corresponding entry
+    /// and exit times along the INDEX ray!
+    /// @note Note that t0 and t1 are only resolved at the node level
+    /// (e.g. a LeafNode with active voxels) as opposed to the individual
+    /// active voxels.
+    /// @param t0 If the return value > 0 this is the time of the
+    /// first hit of an active tile or leaf.
+    /// @param t1 If the return value > t0 this is the time of the
+    /// first hit (> t0) of an inactive tile or exit point of the
+    /// BBOX for the leaf nodes.
+    /// @warning t0 and t1 are computed with respect to the ray represented in
+    /// index space of the current grid, not world space!
+    inline bool march(RealType& t0, RealType& t1)
+    {
+        const typename RayT::TimeSpan t = this->march();
+        t.get(t0, t1);
+        return t.valid();
+    }
+
+    /// @brief Generates a list of hits along the ray.
+    ///
+    /// @param list List of hits represented as time spans.
+    ///
+    /// @note ListType is a list of RayType::TimeSpan and is required to
+    /// have the two methods: clear() and push_back(). Thus, it could
+    /// be std::vector<typename RayType::TimeSpan> or
+    /// std::deque<typename RayType::TimeSpan>.
+    template <typename ListType>
+    inline void hits(ListType& list)
+    {
+        mHDDA.hits(mRay, mAccessor, list);
+    }
+
+    /// @brief Return the floating-point index position along the
+    /// current index ray at the specified time.
+    inline Vec3R getIndexPos(RealType time) const { return mRay(time); }
+
+    /// @brief Return the floating-point world position along the
+    /// current index ray at the specified time.
+    inline Vec3R getWorldPos(RealType time) const { return mGrid->indexToWorld(mRay(time)); }
+
+    inline RealType getWorldTime(RealType time) const
+    {
+        return time*mGrid->transform().baseMap()->applyJacobian(mRay.dir()).length();
+    }
+
+    /// @brief Return a const reference to the input grid.
+    const GridT& grid() const { return *mGrid; }
+
+    /// @brief Return a const reference to the (potentially dilated)
+    /// bool tree used to accelerate the ray marching.
+    const TreeT& tree() const { return *mTree; }
+
+    /// @brief Return a const reference to the BBOX of the grid
+    const math::CoordBBox& bbox() const { return mBBox; }
+
+    /// @brief Print bbox, statistics, memory usage and other information.
+    /// @param os            a stream to which to write textual information
+    /// @param verboseLevel  1: print bbox only; 2: include boolean tree
+    ///                      statistics; 3: include memory usage
+    void print(std::ostream& os = std::cout, int verboseLevel = 1)
+    {
+        // Levels below one are silent.
+        if (verboseLevel < 1) return;
+
+        os << "BBox: " << mBBox << std::endl;
+
+        // Level one stops at the bbox; level two forwards verbosity 1 to the
+        // bool tree and any higher level forwards verbosity 2.
+        if (verboseLevel > 1) mTree->print(os, verboseLevel == 2 ? 1 : 2);
+    }
+
+private:
+
+    typedef typename tree::ValueAccessor<const TreeT,/*IsSafe=*/false> AccessorT;
+
+    const bool      mIsMaster;
+    TreeT*          mTree;
+    const GridT*    mGrid;
+    AccessorT       mAccessor;
+    RayT            mRay;
+    RealType        mTmax;
+    math::CoordBBox mBBox;
+    math::VolumeHDDA<TreeT, RayType, NodeLevel> mHDDA;
+
+};// VolumeRayIntersector
+
+
+//////////////////////////////////////// LinearSearchImpl ////////////////////////////////////////
+
+
+/// @brief Implements linear iterative search for an iso-value of
+/// the level set along the direction of the ray.
+///
+/// @note Since this class is used internally in
+/// LevelSetRayIntersector (defined above) and LevelSetHDDA (defined below)
+/// client code should never interact directly with its API. This also
+/// explains why we are not concerned with the fact that several of
+/// its methods are unsafe to call unless roots were already detected.
+///
+/// @details It is approximate due to the limited number of iterations
+/// which can be defined with a template parameter. However the default value
+/// has proven surprisingly accurate and fast. In fact more iterations
+/// are not guaranteed to give significantly better results.
+///
+/// @warning Since the root-searching algorithm is approximate
+/// (first-order) it is possible to miss intersections if the
+/// iso-value is too close to the inside or outside of the narrow
+/// band (typically a distance less than a voxel unit).
+///
+/// @warning Since this class internally stores a ValueAccessor it is NOT thread-safe,
+/// so make sure to give each thread its own instance.  This of course also means that
+/// the cost of allocating an instance should (if possible) be amortized over
+/// as many ray intersections as possible.
+template<typename GridT, int Iterations, typename RealT>
+class LinearSearchImpl
+{
+public:
+    typedef math::Ray<RealT>              RayT;
+    typedef typename GridT::ValueType     ValueT;
+    typedef typename GridT::ConstAccessor AccessorT;
+    typedef math::BoxStencil<GridT>       StencilT;
+
+    /// @brief Constructor from a grid.
+    /// @throw RuntimeError if the grid is empty.
+    /// @throw ValueError if the isoValue is not inside the narrow-band.
+    LinearSearchImpl(const GridT& grid, const ValueT& isoValue = zeroVal<ValueT>())
+        : mStencil(grid),
+          mIsoValue(isoValue),
+          mMinValue(isoValue - ValueT(2 * grid.voxelSize()[0])),
+          mMaxValue(isoValue + ValueT(2 * grid.voxelSize()[0]))
+      {
+          if ( grid.empty() ) {
+              OPENVDB_THROW(RuntimeError, "LinearSearchImpl does not supports empty grids");
+          }
+          if (mIsoValue<= -grid.background() ||
+              mIsoValue>=  grid.background() ){
+              OPENVDB_THROW(ValueError, "The iso-value must be inside the narrow-band!");
+          }
+          grid.tree().root().evalActiveBoundingBox(mBBox, /*visit individual voxels*/false);
+      }
+
+    /// @brief Return the iso-value used for ray-intersections
+    const ValueT& getIsoValue() const { return mIsoValue; }
+
+    /// @brief Return @c false if the ray misses the bbox of the grid.
+    /// @param iRay Ray represented in index space.
+    /// @warning Call this method before the ray traversal starts.
+    inline bool setIndexRay(const RayT& iRay)
+    {
+        mRay = iRay;
+        return mRay.clip(mBBox);//did it hit the bbox
+    }
+
+    /// @brief Return @c false if the ray misses the bbox of the grid.
+    /// @param wRay Ray represented in world space.
+    /// @warning Call this method before the ray traversal starts.
+    inline bool setWorldRay(const RayT& wRay)
+    {
+        mRay = wRay.worldToIndex(mStencil.grid());
+        return mRay.clip(mBBox);//did it hit the bbox
+    }
+
+    /// @brief Get the intersection point in index space.
+    /// @param xyz The position in index space of the intersection.
+    inline void getIndexPos(Vec3d& xyz) const { xyz = mRay(mTime); }
+
+    /// @brief Get the intersection point in world space.
+    /// @param xyz The position in world space of the intersection.
+    inline void getWorldPos(Vec3d& xyz) const { xyz = mStencil.grid().indexToWorld(mRay(mTime)); }
+
+    /// @brief Get the intersection point and normal in world space
+    /// @param xyz The position in world space of the intersection.
+    /// @param nml The surface normal in world space of the intersection.
+    inline void getWorldPosAndNml(Vec3d& xyz, Vec3d& nml)
+    {
+        this->getIndexPos(xyz);
+        mStencil.moveTo(xyz);
+        nml = mStencil.gradient(xyz);
+        nml.normalize();
+        xyz = mStencil.grid().indexToWorld(xyz);
+    }
+
+    /// @brief Return the time of intersection along the index ray.
+    inline RealT getIndexTime() const { return mTime; }
+
+    /// @brief Return the time of intersection along the world ray.
+    inline RealT getWorldTime() const
+    {
+        return mTime*mStencil.grid().transform().baseMap()->applyJacobian(mRay.dir()).length();
+    }
+
+private:
+
+    /// @brief Initiate the local voxel intersection test.
+    /// @warning Make sure to call this method before the local voxel intersection test.
+    inline void init(RealT t0)
+    {
+        mT[0] = t0;
+        mV[0] = static_cast<ValueT>(this->interpValue(t0));
+    }
+
+    inline void setRange(RealT t0, RealT t1) { mRay.setTimes(t0, t1); }
+
+    /// @brief Return a const reference to the ray.
+    inline const RayT& ray() const { return mRay; }
+
+    /// @brief Return true if a node of the specified type exists at ijk.
+    template <typename NodeT>
+    inline bool hasNode(const Coord& ijk)
+    {
+        return mStencil.accessor().template probeConstNode<NodeT>(ijk) != NULL;
+    }
+
+    /// @brief Return @c true if an intersection is detected.
+    /// @param ijk Grid coordinate of the node origin or voxel being tested.
+    /// @param time Time along the index ray being tested.
+    /// @warning Only if an intersection is detected is it safe to
+    /// call getIndexPos, getWorldPos and getWorldPosAndNml!
+    inline bool operator()(const Coord& ijk, RealT time)
+    {
+        ValueT V;
+        if (mStencil.accessor().probeValue(ijk, V) &&//within narrow band
+            V>mMinValue && V<mMaxValue) {// and close to iso-value?
+            mT[1] = time;
+            mV[1] = static_cast<ValueT>(this->interpValue(time));
+            if (math::ZeroCrossing(mV[0], mV[1])) {
+                mTime = this->interpTime();
+                OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+                for (int n=0; Iterations>0 && n<Iterations; ++n) {//resolved at compile-time
+                    V = static_cast<ValueT>(this->interpValue(mTime));
+                    const int m = math::ZeroCrossing(mV[0], V) ? 1 : 0;
+                    mV[m] = V;
+                    mT[m] = mTime;
+                    mTime = this->interpTime();
+                }
+                OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+                return true;
+            }
+            mT[0] = mT[1];
+            mV[0] = mV[1];
+        }
+        return false;
+    }
+
+    inline RealT interpTime()//zero of the line through (mT[0],mV[0]) and (mT[1],mV[1])
+    {
+        assert(math::isApproxLarger(mT[1], mT[0], 1e-6));
+        return mT[0]+(mT[1]-mT[0])*mV[0]/(mV[0]-mV[1]);
+    }
+
+    inline RealT interpValue(RealT time)//stencil interpolation at mRay(time), minus mIsoValue
+    {
+        const Vec3R pos = mRay(time);
+        mStencil.moveTo(pos);//move the stencil to pos before interpolating there
+        return mStencil.interpolation(pos) - mIsoValue;
+    }
+
+    template<typename, int> friend struct math::LevelSetHDDA;
+
+    RayT            mRay;
+    StencilT        mStencil;
+    RealT           mTime;//time of intersection
+    ValueT          mV[2];
+    RealT           mT[2];
+    const ValueT    mIsoValue, mMinValue, mMaxValue;
+    math::CoordBBox mBBox;
+};// LinearSearchImpl
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_RAYINTERSECTOR_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/RayTracer.h b/nuparu/include/openvdb_new/tools/RayTracer.h
new file mode 100644
index 00000000..ac035e57
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/RayTracer.h
@@ -0,0 +1,1102 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+///
+/// @file RayTracer.h
+///
+/// @author Ken Museth
+///
+/// @brief Defines two simple but multithreaded renders, a level-set
+/// ray tracer and a volume render. To support these renders we also define
+/// perspective and orthographic cameras (both designed to mimic a Houdini camera),
+/// a Film class and some rather naive shaders.
+///
+/// @note These classes are included mainly as reference implementations for
+/// ray-tracing of OpenVDB volumes. In other words they are not intended for
+/// production-quality rendering, but could be used for fast pre-visualization
+/// or as a starting point for a more serious render.
+
+#ifndef OPENVDB_TOOLS_RAYTRACER_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_RAYTRACER_HAS_BEEN_INCLUDED
+
+#include <openvdb/Types.h>
+#include <openvdb/math/BBox.h>
+#include <openvdb/math/Ray.h>
+#include <openvdb/math/Math.h>
+#include <openvdb/tools/RayIntersector.h>
+#include <openvdb/tools/Interpolation.h>
+#include <boost/scoped_ptr.hpp>
+#include <boost/scoped_array.hpp>
+#include <fstream>
+#include <vector>
+#include <deque>
+
+#ifdef OPENVDB_TOOLS_RAYTRACER_USE_EXR
+#include <OpenEXR/ImfPixelType.h>
+#include <OpenEXR/ImfChannelList.h>
+#include <OpenEXR/ImfOutputFile.h>
+#include <OpenEXR/ImfHeader.h>
+#include <OpenEXR/ImfFrameBuffer.h>
+#endif
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+// Forward declarations
+class BaseCamera;
+class BaseShader;
+
+/// @brief Ray-trace a volume.
+template<typename GridT>
+inline void rayTrace(const GridT&,
+                     const BaseShader&,
+                     BaseCamera&,
+                     size_t pixelSamples = 1,
+                     unsigned int seed = 0,
+                     bool threaded = true);
+
+/// @brief Ray-trace a volume using a given ray intersector.
+template<typename GridT, typename IntersectorT>
+inline void rayTrace(const GridT&,
+                     const IntersectorT&,
+                     const BaseShader&,
+                     BaseCamera&,
+                     size_t pixelSamples = 1,
+                     unsigned int seed = 0,
+                     bool threaded = true);
+
+
+///////////////////////////////LEVEL SET RAY TRACER ///////////////////////////////////////
+
+/// @brief A (very) simple multithreaded ray tracer specifically for narrow-band level sets.
+/// @details Included primarily as a reference implementation.
+template<typename GridT, typename IntersectorT = tools::LevelSetRayIntersector<GridT> >
+class LevelSetRayTracer
+{
+public:
+    typedef GridT                           GridType;
+    typedef typename IntersectorT::Vec3Type Vec3Type;
+    typedef typename IntersectorT::RayType  RayType;
+
+    /// @brief Constructor based on an instance of the grid to be rendered.
+    LevelSetRayTracer(const GridT& grid,
+                      const BaseShader& shader,
+                      BaseCamera& camera,
+                      size_t pixelSamples = 1,
+                      unsigned int seed = 0);
+
+    /// @brief Constructor based on an instance of the intersector
+    /// performing the ray-intersections.
+    LevelSetRayTracer(const IntersectorT& inter,
+                      const BaseShader& shader,
+                      BaseCamera& camera,
+                      size_t pixelSamples = 1,
+                      unsigned int seed = 0);
+
+    /// @brief Copy constructor
+    LevelSetRayTracer(const LevelSetRayTracer& other);
+
+    /// @brief Destructor
+    ~LevelSetRayTracer();
+
+    /// @brief Set the level set grid to be ray-traced
+    void setGrid(const GridT& grid);
+
+    /// @brief Set the intersector that performs the actual
+    /// intersection of the rays against the narrow-band level set.
+    void setIntersector(const IntersectorT& inter);
+
+    /// @brief Set the shader derived from the abstract BaseShader class.
+    ///
+    /// @note The shader is not assumed to be thread-safe so each
+    /// thread will get its own deep copy. For instance it could
+    /// contain a ValueAccessor into another grid with auxiliary
+    /// shading information. Thus, make sure it is relatively
+    /// light-weight and efficient to copy (which is the case for ValueAccessors).
+    void setShader(const BaseShader& shader);
+
+    /// @brief Set the camera derived from the abstract BaseCamera class.
+    void setCamera(BaseCamera& camera);
+
+    /// @brief Set the number of pixel samples and the seed for
+    /// jittered sub-rays. A value larger than one implies
+    /// anti-aliasing by jittered super-sampling.
+    /// @throw ValueError if pixelSamples is equal to zero.
+    void setPixelSamples(size_t pixelSamples, unsigned int seed = 0);
+
+    /// @brief Perform the actual (potentially multithreaded) ray-tracing.
+    void render(bool threaded = true) const;
+
+    /// @brief Public method required by tbb::parallel_for.
+    /// @warning Never call it directly.
+    void operator()(const tbb::blocked_range<size_t>& range) const;
+
+private:
+    const bool                          mIsMaster;
+    double*                             mRand;
+    IntersectorT                        mInter;
+    boost::scoped_ptr<const BaseShader> mShader;
+    BaseCamera*                         mCamera;
+    size_t                              mSubPixels;
+};// LevelSetRayTracer
+
+
+///////////////////////////////VOLUME RENDER ///////////////////////////////////////
+
+/// @brief A (very) simple multithreaded volume render specifically for scalar density.
+/// @details Included primarily as a reference implementation.
+/// @note It will only compile if the IntersectorT is templated on a Grid with a
+/// floating-point voxel type.
+template <typename IntersectorT, typename SamplerT = tools::BoxSampler>
+class VolumeRender
+{
+public:
+
+    typedef typename IntersectorT::GridType  GridType;
+    typedef typename IntersectorT::RayType   RayType;
+    typedef typename GridType::ValueType     ValueType;
+    typedef typename GridType::ConstAccessor AccessorType;
+    typedef tools::GridSampler<AccessorType, SamplerT> SamplerType;
+    BOOST_STATIC_ASSERT(boost::is_floating_point<ValueType>::value);
+
+    /// @brief Constructor taking an intersector and a base camera.
+    VolumeRender(const IntersectorT& inter, BaseCamera& camera);
+
+    /// @brief Copy constructor which creates a thread-safe clone
+    VolumeRender(const VolumeRender& other);
+
+    /// @brief Perform the actual (potentially multithreaded) volume rendering.
+    void render(bool threaded=true) const;
+
+    /// @brief Set the camera derived from the abstract BaseCamera class.
+    void setCamera(BaseCamera& camera) { mCamera = &camera; }
+
+    /// @brief Set the intersector that performs the actual
+    /// intersection of the rays against the volume.
+    void setIntersector(const IntersectorT& inter);
+
+    /// @brief Set the vector components of a directional light source
+    /// @throw ArithmeticError if input is a null vector.
+    void setLightDir(Real x, Real y, Real z) { mLightDir = Vec3R(x,y,z).unit(); }
+
+    /// @brief Set the color of the directional light source.
+    void setLightColor(Real r, Real g, Real b) { mLightColor = Vec3R(r,g,b); }
+
+    /// @brief Set the integration step-size in voxel units for the primary ray.
+    void setPrimaryStep(Real primaryStep) { mPrimaryStep = primaryStep; }
+
+    /// @brief Set the integration step-size in voxel units for the shadow ray.
+    void setShadowStep(Real shadowStep) { mShadowStep  = shadowStep; }
+
+    /// @brief Set Scattering coefficients.
+    void setScattering(Real x, Real y, Real z) { mScattering = Vec3R(x,y,z); }
+
+    /// @brief Set absorption coefficients.
+    void setAbsorption(Real x, Real y, Real z) { mAbsorption = Vec3R(x,y,z); }
+
+    /// @brief Set parameter that imitates multi-scattering. A value
+    /// of zero implies no multi-scattering.
+    void setLightGain(Real gain) { mLightGain = gain; }
+
+    /// @brief Set the cut-off value for density and transmittance.
+    void setCutOff(Real cutOff) { mCutOff = cutOff; }
+
+    /// @brief Print parameters, statistics, memory usage and other information.
+    /// @param os            a stream to which to write textual information
+    /// @param verboseLevel  1: print parameters only; 2: include grid
+    ///                      statistics; 3: include memory usage
+    void print(std::ostream& os = std::cout, int verboseLevel = 1);
+
+    /// @brief Public method required by tbb::parallel_for.
+    /// @warning Never call it directly.
+    void operator()(const tbb::blocked_range<size_t>& range) const;
+
+private:
+
+    AccessorType mAccessor;
+    BaseCamera*  mCamera;
+    boost::scoped_ptr<IntersectorT> mPrimary, mShadow;
+    Real  mPrimaryStep, mShadowStep, mCutOff, mLightGain;
+    Vec3R mLightDir, mLightColor, mAbsorption, mScattering;
+};//VolumeRender
+
+//////////////////////////////////////// FILM ////////////////////////////////////////
+
+/// @brief A simple class that allows for concurrent writes to pixels in an image,
+/// background initialization of the image, and PPM or EXR file output.
+class Film
+{
+public:
+    /// @brief Floating-point RGBA components in the range [0, 1].
+    /// @details This is our preferred representation for color processing.
+    struct RGBA
+    {
+        typedef float ValueT;
+
+        RGBA() : r(0), g(0), b(0), a(1) {}
+        explicit RGBA(ValueT intensity) : r(intensity), g(intensity), b(intensity), a(1) {}
+        RGBA(ValueT _r, ValueT _g, ValueT _b, ValueT _a = static_cast<ValueT>(1.0)):
+            r(_r), g(_g), b(_b), a(_a)
+        {}
+        // The binary operators combine r, g and b only: the result gets the default alpha of 1.
+        RGBA  operator* (ValueT scale)  const { return RGBA(r*scale, g*scale, b*scale);}
+        RGBA  operator+ (const RGBA& rhs) const { return RGBA(r+rhs.r, g+rhs.g, b+rhs.b);}
+        RGBA  operator* (const RGBA& rhs) const { return RGBA(r*rhs.r, g*rhs.g, b*rhs.b);}
+        RGBA& operator+=(const RGBA& rhs) { r+=rhs.r; g+=rhs.g; b+=rhs.b; a+=rhs.a; return *this;}
+        // Blend rhs into this color: this is scaled by its alpha, rhs by rhs.a*(1-a).
+        void over(const RGBA& rhs)
+        {
+            const float weight = rhs.a*(1.0f-a);
+            r = a*r+weight*rhs.r;
+            g = a*g+weight*rhs.g;
+            b = a*b+weight*rhs.b;
+            a += weight;
+        }
+
+        ValueT r, g, b, a;
+    };
+
+
+    Film(size_t width, size_t height)
+        : mWidth(width), mHeight(height), mSize(width*height), mPixels(new RGBA[mSize])
+    {
+    }
+    Film(size_t width, size_t height, const RGBA& bg)
+        : mWidth(width), mHeight(height), mSize(width*height), mPixels(new RGBA[mSize])
+    {
+        this->fill(bg);
+    }
+
+    const RGBA& pixel(size_t w, size_t h) const
+    {
+        assert(w < mWidth);
+        assert(h < mHeight);
+        return mPixels[w + h*mWidth];
+    }
+
+    RGBA& pixel(size_t w, size_t h)
+    {
+        assert(w < mWidth);
+        assert(h < mHeight);
+        return mPixels[w + h*mWidth];
+    }
+
+    void fill(const RGBA& rgb=RGBA(0)) { for (size_t i=0; i<mSize; ++i) mPixels[i] = rgb; }
+    void checkerboard(const RGBA& c1=RGBA(0.3f), const RGBA& c2=RGBA(0.6f), size_t size=32)
+    {
+        // Row-major sweep: c1 where the size-masked column and row bits differ, else c2.
+        for (size_t row = 0, idx = 0; row < mHeight; ++row) {
+            for (size_t col = 0; col < mWidth; ++col, ++idx) {
+                mPixels[idx] = ((col & size) ^ (row & size)) ? c1 : c2;
+            }
+        }
+    }
+
+    /// @brief Save the r, g, b channels as a binary PPM (".ppm" is added if the name has no '.').
+    /// @details If the file cannot be opened an error is printed to std::cerr and nothing is saved.
+    void savePPM(const std::string& fileName)
+    {
+        std::string name(fileName);
+        if (name.find_last_of(".") == std::string::npos) name.append(".ppm");
+
+        // Quantize to 8 bits per channel. Components are clamped to [0, 1] first since
+        // converting an out-of-range float to unsigned char is undefined behavior.
+        boost::scoped_array<unsigned char> buffer(new unsigned char[3*mSize]);
+        unsigned char* q = buffer.get();
+        for (const RGBA *p = mPixels.get(), *end = p + mSize; p != end; ++p) {
+            *q++ = static_cast<unsigned char>(255.0f*math::Clamp01(p->r));
+            *q++ = static_cast<unsigned char>(255.0f*math::Clamp01(p->g));
+            *q++ = static_cast<unsigned char>(255.0f*math::Clamp01(p->b));
+        }
+        std::ofstream os(name.c_str(), std::ios_base::binary);
+        if (!os.is_open()) {
+            std::cerr << "Error opening PPM file \"" << name << "\"" << std::endl;
+            return;
+        }
+        os << "P6\n" << mWidth << " " << mHeight << "\n255\n";
+        os.write(reinterpret_cast<const char*>(buffer.get()), 3*mSize);
+    }
+
+#ifdef OPENVDB_TOOLS_RAYTRACER_USE_EXR
+    void saveEXR(const std::string& fileName, size_t compression = 2, size_t threads = 8)
+    {
+        std::string name(fileName);
+        if (name.find_last_of(".") == std::string::npos) name.append(".exr");
+
+        if (threads>0) Imf::setGlobalThreadCount(threads);
+        Imf::Header header(mWidth, mHeight);
+        if (compression==0) header.compression() = Imf::NO_COMPRESSION;
+        if (compression==1) header.compression() = Imf::RLE_COMPRESSION;
+        if (compression>=2) header.compression() = Imf::ZIP_COMPRESSION;
+        header.channels().insert("R", Imf::Channel(Imf::FLOAT));
+        header.channels().insert("G", Imf::Channel(Imf::FLOAT));
+        header.channels().insert("B", Imf::Channel(Imf::FLOAT));
+        header.channels().insert("A", Imf::Channel(Imf::FLOAT));
+
+        Imf::FrameBuffer framebuffer;
+        framebuffer.insert("R", Imf::Slice( Imf::FLOAT, (char *) &(mPixels[0].r),
+                                            sizeof (RGBA), sizeof (RGBA) * mWidth));
+        framebuffer.insert("G", Imf::Slice( Imf::FLOAT, (char *) &(mPixels[0].g),
+                                            sizeof (RGBA), sizeof (RGBA) * mWidth));
+        framebuffer.insert("B", Imf::Slice( Imf::FLOAT, (char *) &(mPixels[0].b),
+                                            sizeof (RGBA), sizeof (RGBA) * mWidth));
+        framebuffer.insert("A", Imf::Slice( Imf::FLOAT, (char *) &(mPixels[0].a),
+                                            sizeof (RGBA), sizeof (RGBA) * mWidth));
+
+        Imf::OutputFile file(name.c_str(), header);
+        file.setFrameBuffer(framebuffer);
+        file.writePixels(mHeight);
+    }
+#endif
+
+    size_t width()       const { return mWidth; }
+    size_t height()      const { return mHeight; }
+    size_t numPixels()   const { return mSize; }
+    const RGBA* pixels() const { return mPixels.get(); }
+
+private:
+    size_t mWidth, mHeight, mSize;
+    boost::scoped_array<RGBA> mPixels;
+};// Film
+
+
+//////////////////////////////////////// CAMERAS ////////////////////////////////////////
+
+/// Abstract base class for the perspective and orthographic cameras
+class BaseCamera
+{
+public:
+    BaseCamera(Film& film, const Vec3R& rotation, const Vec3R& translation,
+               double frameWidth, double nearPlane, double farPlane)
+        : mFilm(&film)
+        , mScaleWidth(frameWidth)
+        , mScaleHeight(frameWidth * double(film.height()) / double(film.width()))
+    {
+        assert(nearPlane > 0 && farPlane > nearPlane);
+        mScreenToWorld.accumPostRotation(math::X_AXIS, rotation[0] * M_PI / 180.0);
+        mScreenToWorld.accumPostRotation(math::Y_AXIS, rotation[1] * M_PI / 180.0);
+        mScreenToWorld.accumPostRotation(math::Z_AXIS, rotation[2] * M_PI / 180.0);
+        mScreenToWorld.accumPostTranslation(translation);
+        this->initRay(nearPlane, farPlane);
+    }
+
+    virtual ~BaseCamera() {}
+
+    Film::RGBA& pixel(size_t i, size_t j) { return mFilm->pixel(i, j); }
+
+    size_t width()  const { return mFilm->width(); }
+    size_t height() const { return mFilm->height(); }
+
+    /// Rotate the camera so its negative z-axis points at xyz and its
+    /// y axis is in the plane of the xyz and up vectors. In other
+    /// words the camera will look at xyz and use up to orient its
+    /// vertical (y) axis. The current eye point and ray times are re-used.
+    void lookAt(const Vec3R& xyz, const Vec3R& up = Vec3R(0.0, 1.0, 0.0))
+    {
+        const Vec3R orig = mScreenToWorld.applyMap(Vec3R(0.0)); // current eye point in world space
+        const Vec3R dir  = orig - xyz; // from the target back to the eye (the camera looks down -z)
+        try {
+            Mat4d xform = math::aim<Mat4d>(dir, up);
+            xform.postTranslate(orig); // put the eye point back after re-orienting
+            mScreenToWorld = math::AffineMap(xform);
+            this->initRay(mRay.t0(), mRay.t1()); // refresh eye/direction, keep the clipping times
+        } catch (...) {} // best effort: any exception is swallowed (NOTE(review): presumably aim() throws on degenerate dir/up -- confirm)
+    }
+
+    Vec3R rasterToScreen(double i, double j, double z) const
+    {
+        return Vec3R( (2 * i / double(mFilm->width()) - 1)  * mScaleWidth,
+                      (1 - 2 * j / double(mFilm->height())) * mScaleHeight, z );
+    }
+
+    /// @brief Return a Ray in world space given the pixel indices and
+    /// optional offsets in the range [0, 1]. An offset of 0.5 corresponds
+    /// to the center of the pixel.
+    virtual math::Ray<double> getRay(
+        size_t i, size_t j, double iOffset = 0.5, double jOffset = 0.5) const = 0;
+
+protected:
+    void initRay(double t0, double t1)
+    {
+        mRay.setTimes(t0, t1);
+        mRay.setEye(mScreenToWorld.applyMap(Vec3R(0.0)));
+        mRay.setDir(mScreenToWorld.applyJacobian(Vec3R(0.0, 0.0, -1.0)));
+    }
+
+    Film* mFilm;
+    double mScaleWidth, mScaleHeight;
+    math::Ray<double> mRay;
+    math::AffineMap mScreenToWorld;
+};// BaseCamera
+
+
+class PerspectiveCamera: public BaseCamera
+{
+  public:
+    /// @brief Constructor
+    /// @param film         film (i.e. image) defining the pixel resolution
+    /// @param rotation     rotation in degrees of the camera in world space
+    ///                     (applied in x, y, z order)
+    /// @param translation  translation of the camera in world-space units,
+    ///                     applied after rotation
+    /// @param focalLength  focal length of the camera in mm
+    ///                     (the default of 50mm corresponds to Houdini's default camera)
+    /// @param aperture     width in mm of the frame, i.e., the visible field
+    ///                     (the default 41.2136 mm corresponds to Houdini's default camera)
+    /// @param nearPlane    depth of the near clipping plane in world-space units
+    /// @param farPlane     depth of the far clipping plane in world-space units
+    ///
+    /// @details If no rotation or translation is provided, the camera is placed
+    /// at (0,0,0) in world space and points in the direction of the negative z axis.
+    PerspectiveCamera(Film& film,
+                      const Vec3R& rotation    = Vec3R(0.0),
+                      const Vec3R& translation = Vec3R(0.0),
+                      double focalLength = 50.0,
+                      double aperture    = 41.2136,
+                      double nearPlane   = 1e-3,
+                      double farPlane    = std::numeric_limits<double>::max())
+        : BaseCamera(film, rotation, translation, 0.5*aperture/focalLength, nearPlane, farPlane)
+    {
+    }
+
+    virtual ~PerspectiveCamera() {}
+
+    /// @brief Return a Ray in world space given the pixel indices and
+    /// optional offsets in the range [0,1]. An offset of 0.5 corresponds
+    /// to the center of the pixel.
+    virtual math::Ray<double> getRay(
+        size_t i, size_t j, double iOffset = 0.5, double jOffset = 0.5) const
+    {
+        math::Ray<double> ray(mRay);
+        Vec3R dir = BaseCamera::rasterToScreen(Real(i) + iOffset, Real(j) + jOffset, -1.0);
+        dir = BaseCamera::mScreenToWorld.applyJacobian(dir);
+        dir.normalize();
+        ray.scaleTimes(1.0/dir.dot(ray.dir()));
+        ray.setDir(dir);
+        return ray;
+    }
+
+    /// @brief Return the horizontal field of view in degrees given a
+    /// focal length in mm and the specified aperture in mm.
+    static double focalLengthToFieldOfView(double length, double aperture)
+    {
+        return 360.0 / M_PI * atan(aperture/(2.0*length));
+    }
+    /// @brief Return the focal length in mm given a horizontal field of
+    /// view in degrees and the specified aperture in mm.
+    static double fieldOfViewToFocalLength(double fov, double aperture)
+    {
+        return aperture/(2.0*(tan(fov * M_PI / 360.0)));
+    }
+};// PerspectiveCamera
+
+
+class OrthographicCamera: public BaseCamera
+{
+public:
+    /// @brief Constructor
+    /// @param film         film (i.e. image) defining the pixel resolution
+    /// @param rotation     rotation in degrees of the camera in world space
+    ///                     (applied in x, y, z order)
+    /// @param translation  translation of the camera in world-space units,
+    ///                     applied after rotation
+    /// @param frameWidth   width of the frame in world-space units
+    /// @param nearPlane    depth of the near clipping plane in world-space units
+    /// @param farPlane     depth of the far clipping plane in world-space units
+    ///
+    /// @details If no rotation or translation is provided, the camera is placed
+    /// at (0,0,0) in world space and points in the direction of the negative z axis.
+    OrthographicCamera(Film& film,
+                       const Vec3R& rotation    = Vec3R(0.0),
+                       const Vec3R& translation = Vec3R(0.0),
+                       double frameWidth = 1.0,
+                       double nearPlane  = 1e-3,
+                       double farPlane   = std::numeric_limits<double>::max())
+        : BaseCamera(film, rotation, translation, 0.5*frameWidth, nearPlane, farPlane)
+    {
+    }
+    virtual ~OrthographicCamera() {}
+
+    virtual math::Ray<double> getRay(
+        size_t i, size_t j, double iOffset = 0.5, double jOffset = 0.5) const
+    {
+        math::Ray<double> ray(mRay);
+        Vec3R eye = BaseCamera::rasterToScreen(Real(i) + iOffset, Real(j) + jOffset, 0.0);
+        ray.setEye(BaseCamera::mScreenToWorld.applyMap(eye));
+        return ray;
+    }
+};// OrthographicCamera
+
+
+//////////////////////////////////////// SHADERS ////////////////////////////////////////
+
+
+/// Abstract base class for the shaders
+class BaseShader
+{
+public:
+    typedef math::Ray<Real> RayT;
+    BaseShader() {}
+    virtual ~BaseShader() {}
+    /// @brief Defines the interface of the virtual function that returns a RGB color.
+    /// @param xyz World position of the intersection point.
+    /// @param nml Normal in world space at the intersection point.
+    /// @param dir Direction of the ray in world space.
+    virtual Film::RGBA operator()(const Vec3R& xyz, const Vec3R& nml, const Vec3R& dir) const = 0;
+    virtual BaseShader* copy() const = 0;
+};
+
+
+/// @brief Shader that produces a simple matte.
+///
+/// @details The color can either be constant (if GridT =
+/// Film::RGBA which is the default) or defined in a separate Vec3
+/// color grid. Use SamplerType to define the order of interpolation
+/// (default is zero order, i.e. closest-point).
+template <typename GridT = Film::RGBA,
+          typename SamplerType = tools::PointSampler>
+class MatteShader: public BaseShader
+{
+public:
+    MatteShader(const GridT& grid) : mAcc(grid.getAccessor()), mXform(&grid.transform()) {}
+    virtual ~MatteShader() {}
+    virtual Film::RGBA operator()(const Vec3R& xyz, const Vec3R&, const Vec3R&) const
+    {
+        typename GridT::ValueType v = zeroVal<typename GridT::ValueType>();
+        SamplerType::sample(mAcc, mXform->worldToIndex(xyz), v);
+        return Film::RGBA(
+            static_cast<Film::RGBA::ValueT>(v[0]),
+            static_cast<Film::RGBA::ValueT>(v[1]),
+            static_cast<Film::RGBA::ValueT>(v[2]));
+    }
+    virtual BaseShader* copy() const { return new MatteShader<GridT, SamplerType>(*this); }
+
+private:
+    typename GridT::ConstAccessor mAcc;
+    const math::Transform* mXform;
+};
+// Template specialization using a constant color of the material.
+template <typename SamplerType>
+class MatteShader<Film::RGBA, SamplerType>: public BaseShader
+{
+public:
+    MatteShader(const Film::RGBA& c = Film::RGBA(1.0f)): mRGBA(c) {}
+    virtual ~MatteShader() {}
+    virtual Film::RGBA operator()(const Vec3R&, const Vec3R&, const Vec3R&) const
+    {
+        return mRGBA;
+    }
+    virtual BaseShader* copy() const { return new MatteShader<Film::RGBA, SamplerType>(*this); }
+
+private:
+    const Film::RGBA mRGBA;
+};
+
+
+/// @brief Color shader that treats the surface normal (x, y, z) as an
+/// RGB color.
+///
+/// @details The color can either be constant (if GridT =
+/// Film::RGBA which is the default) or defined in a separate Vec3
+/// color grid. Use SamplerType to define the order of interpolation
+/// (default is zero order, i.e. closest-point).
+template <typename GridT = Film::RGBA,
+          typename SamplerType = tools::PointSampler>
+class NormalShader: public BaseShader
+{
+public:
+    NormalShader(const GridT& grid) : mAcc(grid.getAccessor()), mXform(&grid.transform()) {}
+    virtual ~NormalShader() {}
+    virtual Film::RGBA operator()(const Vec3R& xyz, const Vec3R& normal, const Vec3R&) const
+    {
+        typename GridT::ValueType v = zeroVal<typename GridT::ValueType>();
+        SamplerType::sample(mAcc, mXform->worldToIndex(xyz), v);
+        return Film::RGBA(v[0]*(normal[0]+1.0f), v[1]*(normal[1]+1.0f), v[2]*(normal[2]+1.0f));
+    }
+    virtual BaseShader* copy() const { return new NormalShader<GridT, SamplerType>(*this); }
+
+private:
+    typename GridT::ConstAccessor mAcc;
+    const math::Transform* mXform;
+};
+// Template specialization using a constant color of the material.
+template <typename SamplerType>
+class NormalShader<Film::RGBA, SamplerType>: public BaseShader
+{
+public:
+    NormalShader(const Film::RGBA& c = Film::RGBA(1.0f)) : mRGBA(c*0.5f) {}
+    virtual ~NormalShader() {}
+    virtual Film::RGBA operator()(const Vec3R&, const Vec3R& normal, const Vec3R&) const
+    {
+        return mRGBA*Film::RGBA(normal[0]+1.0f, normal[1]+1.0f, normal[2]+1.0f);
+    }
+    virtual BaseShader* copy() const { return new NormalShader<Film::RGBA, SamplerType>(*this); }
+
+private:
+    const Film::RGBA mRGBA;
+};
+
+
+/// @brief Color shader that treats position (x, y, z) as an RGB color in a
+/// cube defined from an axis-aligned bounding box in world space.
+///
+/// @details The color can either be constant (if GridT =
+/// Film::RGBA which is the default) or defined in a separate Vec3
+/// color grid. Use SamplerType to define the order of interpolation
+/// (default is zero order, i.e. closest-point).
+template <typename GridT = Film::RGBA,
+          typename SamplerType = tools::PointSampler>
+class PositionShader: public BaseShader
+{
+public:
+    PositionShader(const math::BBox<Vec3R>& bbox, const GridT& grid)
+        : mMin(bbox.min())
+        , mInvDim(1.0/bbox.extents())
+        , mAcc(grid.getAccessor())
+        , mXform(&grid.transform())
+    {
+    }
+    virtual ~PositionShader() {}
+    virtual Film::RGBA operator()(const Vec3R& xyz, const Vec3R&, const Vec3R&) const
+    {
+        typename GridT::ValueType v = zeroVal<typename GridT::ValueType>();
+        SamplerType::sample(mAcc, mXform->worldToIndex(xyz), v);
+        const Vec3R rgb = (xyz - mMin)*mInvDim;
+        return Film::RGBA(v[0],v[1],v[2]) * Film::RGBA(rgb[0], rgb[1], rgb[2]);
+    }
+    virtual BaseShader* copy() const { return new PositionShader<GridT, SamplerType>(*this); }
+
+private:
+    const Vec3R mMin, mInvDim;
+    typename GridT::ConstAccessor mAcc;
+    const math::Transform* mXform;
+};
+// Template specialization using a constant color of the material.
+template <typename SamplerType>
+class PositionShader<Film::RGBA, SamplerType>: public BaseShader
+{
+public:
+    PositionShader(const math::BBox<Vec3R>& bbox, const Film::RGBA& c = Film::RGBA(1.0f))
+        : mMin(bbox.min()), mInvDim(1.0/bbox.extents()), mRGBA(c) {}
+    virtual ~PositionShader() {}
+    virtual Film::RGBA operator()(const Vec3R& xyz, const Vec3R&, const Vec3R&) const
+    {
+        const Vec3R rgb = (xyz - mMin)*mInvDim;
+        return mRGBA*Film::RGBA(rgb[0], rgb[1], rgb[2]);
+    }
+    virtual BaseShader* copy() const { return new PositionShader<Film::RGBA, SamplerType>(*this); }
+
+private:
+    const Vec3R mMin, mInvDim;
+    const Film::RGBA mRGBA;
+};
+
+/// @brief Simple diffuse Lambertian surface shader.
+///
+/// @details The diffuse color can either be constant (if GridT =
+/// Film::RGBA which is the default) or defined in a separate Vec3
+/// color grid. Lambertian implies that the (radiant) intensity is
+/// directly proportional to the cosine of the angle between the
+/// surface normal and the direction of the light source. Use
+/// SamplerType to define the order of interpolation (default is
+/// zero order, i.e. closest-point).
+template <typename GridT = Film::RGBA,
+          typename SamplerType = tools::PointSampler>
+class DiffuseShader: public BaseShader
+{
+public:
+    DiffuseShader(const GridT& grid): mAcc(grid.getAccessor()), mXform(&grid.transform()) {}
+    virtual ~DiffuseShader() {}
+    virtual Film::RGBA operator()(const Vec3R& xyz, const Vec3R& normal, const Vec3R& rayDir) const
+    {
+        typename GridT::ValueType v = zeroVal<typename GridT::ValueType>();
+        SamplerType::sample(mAcc, mXform->worldToIndex(xyz), v);
+        // We take the abs of the dot product corresponding to having
+        // light sources at +/- rayDir, i.e., two-sided shading.
+        return Film::RGBA(v[0],v[1],v[2]) * math::Abs(normal.dot(rayDir));
+    }
+    virtual BaseShader* copy() const { return new DiffuseShader<GridT, SamplerType>(*this); }
+
+private:
+    typename GridT::ConstAccessor mAcc;
+    const math::Transform* mXform;
+};
+// Template specialization using a constant color of the material.
+template <typename SamplerType>
+class DiffuseShader<Film::RGBA, SamplerType>: public BaseShader
+{
+public:
+    DiffuseShader(const Film::RGBA& d = Film::RGBA(1.0f)): mRGBA(d) {}
+    virtual ~DiffuseShader() {}
+    virtual Film::RGBA operator()(const Vec3R&, const Vec3R& normal, const Vec3R& rayDir) const
+    {
+        // We assume a single directional light source at the camera,
+        // so the cosine of the angle between the surface normal and the
+        // direction of the light source becomes the dot product of the
+        // surface normal and inverse direction of the ray.  We also ignore
+        // negative dot products, corresponding to strict one-sided shading.
+        //return mRGBA * math::Max(0.0, normal.dot(-rayDir));
+
+        // We take the abs of the dot product corresponding to having
+        // light sources at +/- rayDir, i.e., two-sided shading.
+        return mRGBA * math::Abs(normal.dot(rayDir));
+    }
+    virtual BaseShader* copy() const { return new DiffuseShader<Film::RGBA, SamplerType>(*this); }
+
+private:
+    const Film::RGBA mRGBA;
+};
+
+//////////////////////////////////////// RAYTRACER ////////////////////////////////////////
+
+template<typename GridT>
+inline void rayTrace(const GridT& grid,
+                     const BaseShader& shader,
+                     BaseCamera& camera,
+                     size_t pixelSamples,
+                     unsigned int seed,
+                     bool threaded)
+{
+    LevelSetRayTracer<GridT, tools::LevelSetRayIntersector<GridT> >
+        tracer(grid, shader, camera, pixelSamples, seed);
+    tracer.render(threaded);
+}
+
+
+template<typename GridT, typename IntersectorT>
+inline void rayTrace(const GridT&,
+                     const IntersectorT& inter,
+                     const BaseShader& shader,
+                     BaseCamera& camera,
+                     size_t pixelSamples,
+                     unsigned int seed,
+                     bool threaded)
+{
+    LevelSetRayTracer<GridT, IntersectorT> tracer(inter, shader, camera, pixelSamples, seed);
+    tracer.render(threaded);
+}
+
+
+//////////////////////////////////////// LevelSetRayTracer ////////////////////////////////////////
+
+
+template<typename GridT, typename IntersectorT>
+inline LevelSetRayTracer<GridT, IntersectorT>::
+LevelSetRayTracer(const GridT& grid,
+                  const BaseShader& shader,
+                  BaseCamera& camera,
+                  size_t pixelSamples,
+                  unsigned int seed)
+    : mIsMaster(true),
+      mRand(NULL),
+      mInter(grid),
+      mShader(shader.copy()),
+      mCamera(&camera)
+{
+    this->setPixelSamples(pixelSamples, seed);
+}
+
+template<typename GridT, typename IntersectorT>
+inline LevelSetRayTracer<GridT, IntersectorT>::
+LevelSetRayTracer(const IntersectorT& inter,
+                  const BaseShader& shader,
+                  BaseCamera& camera,
+                  size_t pixelSamples,
+                  unsigned int seed)
+    : mIsMaster(true),
+      mRand(NULL),
+      mInter(inter),
+      mShader(shader.copy()),
+      mCamera(&camera)
+{
+    this->setPixelSamples(pixelSamples, seed);
+}
+
+template<typename GridT, typename IntersectorT>
+inline LevelSetRayTracer<GridT, IntersectorT>::
+LevelSetRayTracer(const LevelSetRayTracer& other) :
+    mIsMaster(false),
+    mRand(other.mRand),
+    mInter(other.mInter),
+    mShader(other.mShader->copy()),
+    mCamera(other.mCamera),
+    mSubPixels(other.mSubPixels)
+{
+}
+
+template<typename GridT, typename IntersectorT>
+inline LevelSetRayTracer<GridT, IntersectorT>::
+~LevelSetRayTracer()
+{
+    if (mIsMaster) delete [] mRand;
+}
+
+template<typename GridT, typename IntersectorT>
+inline void LevelSetRayTracer<GridT, IntersectorT>::
+setGrid(const GridT& grid)
+{
+    assert(mIsMaster);
+    mInter = IntersectorT(grid);
+}
+
+template<typename GridT, typename IntersectorT>
+inline void LevelSetRayTracer<GridT, IntersectorT>::
+setIntersector(const IntersectorT& inter)
+{
+    assert(mIsMaster);
+    mInter = inter;
+}
+
+template<typename GridT, typename IntersectorT>
+inline void LevelSetRayTracer<GridT, IntersectorT>::
+setShader(const BaseShader& shader)
+{
+    assert(mIsMaster);
+    mShader.reset(shader.copy());
+}
+
+template<typename GridT, typename IntersectorT>
+inline void LevelSetRayTracer<GridT, IntersectorT>::
+setCamera(BaseCamera& camera)
+{
+    assert(mIsMaster);
+    mCamera = &camera;
+}
+
+template<typename GridT, typename IntersectorT>
+inline void LevelSetRayTracer<GridT, IntersectorT>::
+setPixelSamples(size_t pixelSamples, unsigned int seed)
+{
+    assert(mIsMaster);
+    if (pixelSamples == 0) {
+        OPENVDB_THROW(ValueError, "pixelSamples must be larger than zero!");
+    }
+    // Build the new table first: if `new` throws, mRand/mSubPixels keep their old, valid state.
+    double* offsets = (pixelSamples > 1) ? new double[16] : NULL;
+    if (offsets) {
+        math::Rand01<double> rand(seed);//offsets for anti-aliasing by jittered super-sampling
+        for (size_t i=0; i<16; ++i) offsets[i] = rand();
+    }
+    delete [] mRand; // release the previous table (delete [] on NULL is a no-op)
+    mRand = offsets; // NULL when only the primary sample per pixel is requested
+    mSubPixels = pixelSamples - 1; // number of jittered samples taken besides the primary ray
+}
+
+template<typename GridT, typename IntersectorT>
+inline void LevelSetRayTracer<GridT, IntersectorT>::render(bool threaded) const
+{
+    const tbb::blocked_range<size_t> rows(0, mCamera->height()); // the work is split by image row
+    if (threaded) tbb::parallel_for(rows, *this); // multi-threaded: TBB hands sub-ranges to operator()
+    else (*this)(rows);                            // serial: run operator() once on the full range
+}
+
+template<typename GridT, typename IntersectorT>
+inline void LevelSetRayTracer<GridT, IntersectorT>::
+operator()(const tbb::blocked_range<size_t>& range) const
+{
+    const BaseShader& shader = *mShader;
+    Vec3Type xyz, nml;
+    const float frac = 1.0f / (1.0f + mSubPixels);//weight per sample: 1 primary + mSubPixels jittered
+    for (size_t j=range.begin(), n=0, je = range.end(); j<je; ++j) {
+        for (size_t i=0, ie = mCamera->width(); i<ie; ++i) {
+            Film::RGBA& bg = mCamera->pixel(i,j);
+            RayType ray = mCamera->getRay(i, j);//primary ray
+            Film::RGBA c = mInter.intersectsWS(ray, xyz, nml) ? shader(xyz, nml, ray.dir()) : bg;
+            for (size_t k=0; k<mSubPixels; ++k, n +=2 ) {
+                ray = mCamera->getRay(i, j, mRand[n & 15], mRand[(n+1) & 15]);//cycle the 16 offsets
+                c += mInter.intersectsWS(ray, xyz, nml) ? shader(xyz, nml, ray.dir()) : bg;
+            }//loop over sub-pixels
+            bg = c*frac;//overwrite the pixel with the average of all samples
+        }//loop over image width (pixels i of row j)
+    }//loop over image height (rows j in range)
+}
+
+//////////////////////////////////////// VolumeRender ////////////////////////////////////////
+
+template<typename IntersectorT, typename SampleT>
+inline VolumeRender<IntersectorT, SampleT>::
+VolumeRender(const IntersectorT& inter, BaseCamera& camera)
+    : mAccessor(inter.grid().getConstAccessor())
+    , mCamera(&camera)
+    , mPrimary(new IntersectorT(inter))
+    , mShadow(new IntersectorT(inter))
+    , mPrimaryStep(1.0)
+    , mShadowStep(3.0)
+    , mCutOff(0.005)
+    , mLightGain(0.2)
+    , mLightDir(Vec3R(0.3, 0.3, 0).unit())
+    , mLightColor(0.7, 0.7, 0.7)
+    , mAbsorption(0.1)
+    , mScattering(1.5)
+{
+}
+
+template<typename IntersectorT, typename SampleT>
+inline VolumeRender<IntersectorT, SampleT>::
+VolumeRender(const VolumeRender& other)
+    : mAccessor(other.mAccessor)
+    , mCamera(other.mCamera)
+    , mPrimary(new IntersectorT(*(other.mPrimary)))
+    , mShadow(new IntersectorT(*(other.mShadow)))
+    , mPrimaryStep(other.mPrimaryStep)
+    , mShadowStep(other.mShadowStep)
+    , mCutOff(other.mCutOff)
+    , mLightGain(other.mLightGain)
+    , mLightDir(other.mLightDir)
+    , mLightColor(other.mLightColor)
+    , mAbsorption(other.mAbsorption)
+    , mScattering(other.mScattering)
+{
+}
+
+template<typename IntersectorT, typename SampleT>
+inline void VolumeRender<IntersectorT, SampleT>::
+print(std::ostream& os, int verboseLevel)
+{
+    if (verboseLevel>0) {
+        os << "\nPrimary step: " <<  mPrimaryStep
+           << "\nShadow step: " << mShadowStep
+           << "\nCutoff: " << mCutOff
+           << "\nLightGain: " << mLightGain
+           << "\nLightDir: " << mLightDir
+           << "\nLightColor: " << mLightColor
+           << "\nAbsorption: " << mAbsorption
+           << "\nScattering: " << mScattering << std::endl;
+    }
+    mPrimary->print(os, verboseLevel);
+}
+
+template<typename IntersectorT, typename SampleT>
+inline void VolumeRender<IntersectorT, SampleT>::
+setIntersector(const IntersectorT& inter)
+{
+    mPrimary.reset(new IntersectorT(inter));
+    mShadow.reset( new IntersectorT(inter));
+}
+
+template<typename IntersectorT, typename SampleT>
+inline void VolumeRender<IntersectorT, SampleT>::render(bool threaded) const
+{
+    const tbb::blocked_range<size_t> rows(0, mCamera->height()); // the work is split by image row
+    if (threaded) tbb::parallel_for(rows, *this); // multi-threaded: TBB hands sub-ranges to operator()
+    else (*this)(rows);                            // serial: run operator() once on the full range
+}
+
+template<typename IntersectorT, typename SampleT>
+inline void VolumeRender<IntersectorT, SampleT>::
+operator()(const tbb::blocked_range<size_t>& range) const
+{
+    SamplerType sampler(mAccessor, mShadow->grid().transform());//light-weight wrapper
+
+    // Any variable prefixed with p (or s) means it's associated with a primary (or shadow) ray
+    const Vec3R extinction = -mScattering-mAbsorption, One(1.0);
+    const Vec3R albedo = mLightColor*mScattering/(mScattering+mAbsorption);//single scattering
+    const Real sGain = mLightGain;//in-scattering along shadow ray
+    const Real pStep = mPrimaryStep;//Integration step along primary ray in voxel units
+    const Real sStep = mShadowStep;//Integration step along shadow ray in voxel units
+    const Real cutoff = mCutOff;//Cutoff for density and transmittance
+
+    // For the sake of completeness we show how to use two different methods
+    // (hits/march) in VolumeRayIntersector that produce segments along the ray
+    // that intersect active values. Comment out the #define below to use
+    // VolumeRayIntersector::march instead of VolumeRayIntersector::hits.
+#define USE_HITS
+#ifdef USE_HITS
+    std::vector<typename RayType::TimeSpan> pTS, sTS;
+    //std::deque<typename RayType::TimeSpan> pTS, sTS;
+#endif
+
+    RayType sRay(Vec3R(0), mLightDir);//Shadow ray
+    for (size_t j=range.begin(), je = range.end(); j<je; ++j) {
+        for (size_t i=0, ie = mCamera->width(); i<ie; ++i) {
+            Film::RGBA& bg = mCamera->pixel(i, j);
+            bg.a = bg.r = bg.g = bg.b = 0;
+            RayType pRay = mCamera->getRay(i, j);// Primary ray
+            if( !mPrimary->setWorldRay(pRay)) continue;
+            Vec3R pTrans(1.0), pLumi(0.0);
+#ifndef USE_HITS
+            Real pT0, pT1;
+            while (mPrimary->march(pT0, pT1)) {
+                for (Real pT = pStep*ceil(pT0/pStep); pT <= pT1; pT += pStep) {
+#else
+            mPrimary->hits(pTS);
+            for (size_t k=0; k<pTS.size(); ++k) {
+                Real pT = pStep*ceil(pTS[k].t0/pStep), pT1=pTS[k].t1;
+                for (; pT <= pT1; pT += pStep) {
+#endif
+                    Vec3R pPos = mPrimary->getWorldPos(pT);
+                    const Real density = sampler.wsSample(pPos);
+                    if (density < cutoff) continue;
+                    const Vec3R dT = math::Exp(extinction * density * pStep);
+                    Vec3R sTrans(1.0);
+                    sRay.setEye(pPos);
+                    if( !mShadow->setWorldRay(sRay)) continue;
+#ifndef USE_HITS
+                    Real sT0, sT1;
+                    while (mShadow->march(sT0, sT1)) {
+                        for (Real sT = sStep*ceil(sT0/sStep); sT <= sT1; sT+= sStep) {
+#else
+                    mShadow->hits(sTS);
+                    for (size_t l=0; l<sTS.size(); ++l) {
+                        Real sT = sStep*ceil(sTS[l].t0/sStep), sT1=sTS[l].t1;
+                        for (; sT <= sT1; sT+= sStep) {
+#endif
+                            const Real d = sampler.wsSample(mShadow->getWorldPos(sT));
+                            if (d < cutoff) continue;
+                            sTrans *= math::Exp(extinction * d * sStep/(1.0+sT*sGain));
+                            if (sTrans.lengthSqr()<cutoff) goto Luminance;//Terminate sRay
+                        }//Integration over shadow segment
+                    }// Shadow ray march
+                Luminance:
+                    pLumi += albedo * sTrans * pTrans * (One-dT);
+                    pTrans *= dT;
+                    if (pTrans.lengthSqr()<cutoff) goto Pixel;  // Terminate Ray
+                }//Integration over primary segment
+            }// Primary ray march
+        Pixel:
+            bg.r = static_cast<Film::RGBA::ValueT>(pLumi[0]);
+            bg.g = static_cast<Film::RGBA::ValueT>(pLumi[1]);
+            bg.b = static_cast<Film::RGBA::ValueT>(pLumi[2]);
+            bg.a = static_cast<Film::RGBA::ValueT>(1.0f - pTrans.sum()/3.0f);
+     }//Horizontal pixel scan
+   }//Vertical pixel scan
+}
+#undef USE_HITS // implementation detail of operator() above: do not leak the macro to includers
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_RAYTRACER_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/SignedFloodFill.h b/nuparu/include/openvdb_new/tools/SignedFloodFill.h
new file mode 100644
index 00000000..d75ba1d9
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/SignedFloodFill.h
@@ -0,0 +1,311 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file SignedFloodFill.h
+///
+/// @brief Propagates the sign of distance values from the active
+/// voxels in the narrow band to the inactive values outside the
+/// narrow band.
+///
+/// @author Ken Museth
+
+#ifndef OPENVDB_TOOLS_SIGNEDFLOODFILL_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_SIGNEDFLOODFILL_HAS_BEEN_INCLUDED
+
+#include <boost/utility/enable_if.hpp>
+#include <openvdb/math/Math.h> // for math::negative
+#include <openvdb/Types.h> // for Index typedef
+#include <boost/static_assert.hpp>
+#include <boost/type_traits/is_floating_point.hpp>
+#include <boost/type_traits/is_signed.hpp>
+
+#include <openvdb/tree/NodeManager.h>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+/// @brief Set the values of all inactive voxels and tiles of a narrow-band
+/// level set from the signs of the active voxels, setting outside values to
+/// +background and inside values to -background.
+///
+/// @warning This method should only be used on closed, symmetric narrow-band level sets.
+///
+/// @note If a LeafManager is used the cached leaf nodes are reused,
+/// resulting in slightly better overall performance.
+///
+/// @param tree          Tree or LeafManager that will be flood filled.
+/// @param threaded      enable or disable threading  (threading is enabled by default)
+/// @param grainSize     used to control the threading granularity (default is 1)
+/// @param minLevel      Specify the lowest tree level to process (leafnode level = 0)
+///
+/// @throw TypeError if the ValueType of @a tree is neither floating-point nor signed.
+template<typename TreeOrLeafManagerT>
+inline void
+signedFloodFill(TreeOrLeafManagerT& tree, bool threaded = true,
+    size_t grainSize = 1, Index minLevel = 0);
+
+
+/// @brief Set the values of all inactive voxels and tiles of a narrow-band
+/// level set from the signs of the active voxels, setting exterior values to
+/// @a outsideWidth and interior values to @a insideWidth.  Set the background value
+/// of this tree to @a outsideWidth.
+///
+/// @warning This method should only be used on closed, narrow-band level sets.
+///
+/// @note If a LeafManager is used the cached leaf nodes are reused
+/// resulting in slightly better overall performance.
+///
+/// @param tree          Tree or LeafManager that will be flood filled
+/// @param outsideWidth  the width of the outside of the narrow band
+/// @param insideWidth   the width of the inside of the narrow band
+/// @param threaded      enable or disable threading  (threading is enabled by default)
+/// @param grainSize     used to control the threading granularity (default is 1)
+/// @param minLevel      Specify the lowest tree level to process (leafnode level = 0)
+///
+/// @throw TypeError if the ValueType of @a tree is neither floating-point nor signed.
+template<typename TreeOrLeafManagerT>
+inline void
+signedFloodFillWithValues(
+    TreeOrLeafManagerT& tree,
+    const typename TreeOrLeafManagerT::ValueType& outsideWidth,
+    const typename TreeOrLeafManagerT::ValueType& insideWidth,
+    bool threaded = true,
+    size_t grainSize = 1,
+    Index minLevel = 0);
+
+
+////////////////////////// Implementation of SignedFloodFill ////////////////////////////
+
+
+template<typename TreeOrLeafManagerT>
+class SignedFloodFillOp // node functor; doSignedFloodFill applies it via NodeManager::foreachBottomUp
+{
+public:
+    typedef typename TreeOrLeafManagerT::ValueType    ValueT;
+    typedef typename TreeOrLeafManagerT::RootNodeType RootT;
+    typedef typename TreeOrLeafManagerT::LeafNodeType LeafT;
+    BOOST_STATIC_ASSERT(boost::is_floating_point<ValueT>::value || boost::is_signed<ValueT>::value);
+
+    SignedFloodFillOp(const TreeOrLeafManagerT& tree, Index minLevel = 0)
+        : mOutside(ValueT(math::Abs(tree.background()))) // outside value is |background|
+        , mInside(ValueT(math::negative(mOutside))) // valid because mOutside is declared before mInside
+        , mMinLevel(minLevel) // nodes whose LEVEL is below minLevel are skipped
+    {
+    }
+
+    SignedFloodFillOp(ValueT outsideValue, ValueT insideValue, Index minLevel = 0)
+        : mOutside(ValueT(math::Abs(outsideValue))) // stored as +|outsideValue|
+        , mInside(ValueT(math::negative(math::Abs(insideValue)))) // stored as -|insideValue|
+        , mMinLevel(minLevel)
+    {
+    }
+
+    // Flood fill the inactive voxels of a leaf node from the signs of its active voxels
+    void operator()(LeafT& leaf) const
+    {
+        if (LeafT::LEVEL < mMinLevel) return;
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+        if (!leaf.allocate()) return;//this assures that the buffer is allocated and in-memory
+#endif
+        const typename LeafT::NodeMaskType& valueMask = leaf.getValueMask();
+        // WARNING: the const_cast below yields a raw, writable pointer to the leaf's values
+        typename LeafT::ValueType* buffer =
+            const_cast<typename LeafT::ValueType*>(&(leaf.getFirstValue()));
+
+        const Index first = valueMask.findFirstOn();
+        if (first < LeafT::SIZE) { // an active voxel was found
+            bool xInside = buffer[first]<0, yInside = xInside, zInside = xInside;
+            for (Index x = 0; x != (1 << LeafT::LOG2DIM); ++x) {
+                const Index x00 = x << (2 * LeafT::LOG2DIM);
+                if (valueMask.isOn(x00)) xInside = buffer[x00] < 0; // element(x, 0, 0)
+                yInside = xInside;
+                for (Index y = 0; y != (1 << LeafT::LOG2DIM); ++y) {
+                    const Index xy0 = x00 + (y << LeafT::LOG2DIM);
+                    if (valueMask.isOn(xy0)) yInside = buffer[xy0] < 0; // element(x, y, 0)
+                    zInside = yInside;
+                    for (Index z = 0; z != (1 << LeafT::LOG2DIM); ++z) {
+                        const Index xyz = xy0 + z; // element(x, y, z)
+                        if (valueMask.isOn(xyz)) {
+                            zInside = buffer[xyz] < 0;
+                        } else {
+                            buffer[xyz] = zInside ? mInside : mOutside; // inactive: use the current sign
+                        }
+                    }
+                }
+            }
+        } else {// if no active voxels exist simply use the sign of the first value
+            leaf.fill(buffer[0] < 0 ? mInside : mOutside);
+        }
+    }
+
+    // Flood fill the tiles of an internal node from the signs of its child nodes' values
+    template<typename NodeT>
+    void operator()(NodeT& node) const
+    {
+        if (NodeT::LEVEL < mMinLevel) return;
+        // We assume the child nodes have already been flood filled!
+        const typename NodeT::NodeMaskType& childMask = node.getChildMask();
+        // WARNING: the const_cast below yields a raw, writable pointer to the node's table
+        typename NodeT::UnionType* table = const_cast<typename NodeT::UnionType*>(node.getTable());
+
+        const Index first = childMask.findFirstOn();
+        if (first < NodeT::NUM_VALUES) { // a child node was found
+            bool xInside = table[first].getChild()->getFirstValue()<0;
+            bool yInside = xInside, zInside = xInside;
+            for (Index x = 0; x != (1 << NodeT::LOG2DIM); ++x) {
+                const int x00 = x << (2 * NodeT::LOG2DIM); // offset for block(x, 0, 0)
+                if (childMask.isOn(x00)) xInside = table[x00].getChild()->getLastValue()<0;
+                yInside = xInside;
+                for (Index y = 0; y != (1 << NodeT::LOG2DIM); ++y) {
+                    const Index xy0 = x00 + (y << NodeT::LOG2DIM); // offset for block(x, y, 0)
+                    if (childMask.isOn(xy0)) yInside = table[xy0].getChild()->getLastValue()<0;
+                    zInside = yInside;
+                    for (Index z = 0; z != (1 << NodeT::LOG2DIM); ++z) {
+                        const Index xyz = xy0 + z; // offset for block(x, y, z)
+                        if (childMask.isOn(xyz)) {
+                            zInside = table[xyz].getChild()->getLastValue()<0;
+                        } else {
+                            table[xyz].setValue(zInside ? mInside : mOutside); // tile: use the current sign
+                        }
+                    }
+                }
+            }
+        } else {//no child nodes exist simply use the sign of the first tile value.
+            const ValueT v =  table[0].getValue()<0 ? mInside : mOutside;
+            for (Index i = 0; i < NodeT::NUM_VALUES; ++i) table[i].setValue(v);
+        }
+    }
+
+    // Insert inside tiles between inside child nodes of the root and reset its background
+    void operator()(RootT& root) const
+    {
+        if (RootT::LEVEL < mMinLevel) return;
+        typedef typename RootT::ChildNodeType ChildT;
+        // Insert the child nodes into a map sorted according to their origin
+        std::map<Coord, ChildT*> nodeKeys;
+        typename RootT::ChildOnIter it = root.beginChildOn();
+        for (; it; ++it) nodeKeys.insert(std::pair<Coord, ChildT*>(it.getCoord(), &(*it)));
+        static const Index DIM = RootT::ChildNodeType::DIM;
+
+        // We employ a simple z-scanline algorithm that inserts inactive tiles with
+        // the inside value if they are sandwiched between inside child nodes only!
+        typename std::map<Coord, ChildT*>::const_iterator b = nodeKeys.begin(), e = nodeKeys.end();
+        if ( b == e ) return; // no child nodes: returns before the background is updated
+        for (typename std::map<Coord, ChildT*>::const_iterator a = b++; b != e; ++a, ++b) {
+            Coord d = b->first - a->first; // delta of neighboring coordinates
+            if (d[0]!=0 || d[1]!=0 || d[2]==Int32(DIM)) continue;// not same z-scanline or neighbors
+            const ValueT fill[] = { a->second->getLastValue(), b->second->getFirstValue() };
+            if (!(fill[0] < 0) || !(fill[1] < 0)) continue; // scanline isn't inside
+            Coord c = a->first + Coord(0u, 0u, DIM); // first gap position after child a
+            for (; c[2] != b->first[2]; c[2] += DIM) root.addTile(c, mInside, false);
+        }
+        root.setBackground(mOutside, /*updateChildNodes=*/false);
+    }
+
+private:
+    const ValueT mOutside, mInside; // declaration order matters: mInside may be initialized from mOutside
+    const Index mMinLevel;
+};// SignedFloodFillOp
+
+
+/// Flood fill @a tree bottom-up (overload enabled for floating-point or signed ValueTypes).
+template<typename TreeOrLeafManagerT>
+inline typename boost::enable_if_c<
+    boost::is_floating_point<typename TreeOrLeafManagerT::ValueType>::value ||
+    boost::is_signed<typename TreeOrLeafManagerT::ValueType>::value, void>::type
+doSignedFloodFill(TreeOrLeafManagerT& tree,
+                  typename TreeOrLeafManagerT::ValueType outsideValue,
+                  typename TreeOrLeafManagerT::ValueType insideValue,
+                  bool threaded, size_t grainSize, Index minLevel)
+{
+    typedef SignedFloodFillOp<TreeOrLeafManagerT> FloodFillOpT;
+    typedef tree::NodeManager<TreeOrLeafManagerT> NodeManagerT;
+    FloodFillOpT floodFill(outsideValue, insideValue, minLevel);
+    NodeManagerT nodeManager(tree);
+    nodeManager.foreachBottomUp(floodFill, threaded, grainSize);
+}
+
+// Fallback overload for ValueTypes that are neither floating-point nor signed: always throws
+template <typename TreeOrLeafManagerT>
+inline
+typename boost::disable_if_c<
+    boost::is_floating_point<typename TreeOrLeafManagerT::ValueType>::value ||
+    boost::is_signed<typename TreeOrLeafManagerT::ValueType>::value, void>::type
+doSignedFloodFill(TreeOrLeafManagerT&, // tree (unused)
+                  const typename TreeOrLeafManagerT::ValueType&, // outsideValue (unused)
+                  const typename TreeOrLeafManagerT::ValueType&, // insideValue (unused)
+                  bool, // threaded (unused)
+                  size_t, // grainSize (unused)
+                  Index) // minLevel (unused)
+{
+    OPENVDB_THROW(TypeError,
+        "signedFloodFill is supported only for signed value grids");
+}
+
+
+// Public entry point: forwards to the doSignedFloodFill overload selected for the tree's ValueType
+template <typename TreeOrLeafManagerT>
+inline void
+signedFloodFillWithValues(
+    TreeOrLeafManagerT& tree,
+    const typename TreeOrLeafManagerT::ValueType& outsideValue,
+    const typename TreeOrLeafManagerT::ValueType& insideValue,
+    bool threaded,
+    size_t grainSize,
+    Index minLevel)
+{
+    doSignedFloodFill(tree, outsideValue, insideValue, threaded, grainSize, minLevel);
+}
+
+
+// Symmetric narrow band: outside value = root background, inside value = its negation
+template <typename TreeOrLeafManagerT>
+inline void
+signedFloodFill(TreeOrLeafManagerT& tree, bool threaded, size_t grainSize, Index minLevel)
+{
+    typedef typename TreeOrLeafManagerT::ValueType ValueT;
+    const ValueT outsideValue = tree.root().background();
+    const ValueT insideValue = math::negative(outsideValue);
+    doSignedFloodFill(tree, outsideValue, insideValue, threaded, grainSize, minLevel);
+}
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_SIGNEDFLOODFILL_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/Statistics.h b/nuparu/include/openvdb_new/tools/Statistics.h
new file mode 100644
index 00000000..f979c4af
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/Statistics.h
@@ -0,0 +1,438 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file Statistics.h
+///
+/// @brief Functions to efficiently compute histograms, extrema
+/// (min/max) and statistics (mean, variance, etc.) of grid values
+
+#ifndef OPENVDB_TOOLS_STATISTICS_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_STATISTICS_HAS_BEEN_INCLUDED
+
+#include <openvdb/Types.h>
+#include <openvdb/Exceptions.h>
+#include <openvdb/math/Stats.h>
+#include "ValueTransformer.h"
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+/// @brief Iterate over a scalar grid and compute a histogram of the values
+/// of the voxels that are visited, or iterate over a vector-valued grid
+/// and compute a histogram of the magnitudes of the vectors.
+/// @param iter      an iterator over the values of a grid or its tree
+///                  (@c Grid::ValueOnCIter, @c Tree::ValueOffIter, etc.)
+/// @param minVal    the smallest value that can be added to the histogram
+/// @param maxVal    the largest value that can be added to the histogram
+/// @param numBins   the number of histogram bins
+/// @param threaded  if true, iterate over the grid in parallel
+template<typename IterT>
+inline math::Histogram
+histogram(const IterT& iter, double minVal, double maxVal,
+          size_t numBins = 10, bool threaded = true);
+
+/// @brief Iterate over a scalar grid and compute extrema (min/max) of the
+/// values of the voxels that are visited, or iterate over a vector-valued grid
+/// and compute extrema of the magnitudes of the vectors.
+/// @param iter      an iterator over the values of a grid or its tree
+///                  (@c Grid::ValueOnCIter, @c Tree::ValueOffIter, etc.)
+/// @param threaded  if true, iterate over the grid in parallel
+template<typename IterT>
+inline math::Extrema
+extrema(const IterT& iter, bool threaded = true);
+
+/// @brief Iterate over a scalar grid and compute statistics (mean, variance, etc.)
+/// of the values of the voxels that are visited, or iterate over a vector-valued grid
+/// and compute statistics of the magnitudes of the vectors.
+/// @param iter      an iterator over the values of a grid or its tree
+///                  (@c Grid::ValueOnCIter, @c Tree::ValueOffIter, etc.)
+/// @param threaded  if true, iterate over the grid in parallel
+template<typename IterT>
+inline math::Stats
+statistics(const IterT& iter, bool threaded = true);
+
+/// @brief Iterate over a grid and compute extrema (min/max) of
+/// the values produced by applying the given functor at each voxel that is visited.
+/// @param iter      an iterator over the values of a grid or its tree
+///                  (@c Grid::ValueOnCIter, @c Tree::ValueOffIter, etc.)
+/// @param op        a functor of the form <tt>void op(const IterT&, math::Extrema&)</tt>,
+///                  where @c IterT is the type of @a iter, that inserts zero or more
+///                  floating-point values into the provided @c math::Extrema object
+/// @param threaded  if true, iterate over the grid in parallel
+/// @note When @a threaded is true, each thread gets its own copy of the functor.
+///
+/// @par Example:
+/// Compute extrema of just the active and positive-valued voxels of a scalar,
+/// floating-point grid.
+/// @code
+/// struct Local {
+///     static inline
+///     void addIfPositive(const FloatGrid::ValueOnCIter& iter, math::Extrema& ex)
+///     {
+///         const float f = *iter;
+///         if (f > 0.0) {
+///             if (iter.isVoxelValue()) ex.add(f);
+///             else ex.add(f, iter.getVoxelCount());
+///         }
+///     }
+/// };
+/// FloatGrid grid = ...;
+/// math::Extrema stats =
+///     tools::extrema(grid.cbeginValueOn(), Local::addIfPositive, /*threaded=*/true);
+/// @endcode
+template<typename IterT, typename ValueOp>
+inline math::Extrema
+extrema(const IterT& iter, const ValueOp& op, bool threaded);
+
+/// @brief Iterate over a grid and compute statistics (mean, variance, etc.) of
+/// the values produced by applying the given functor at each voxel that is visited.
+/// @param iter      an iterator over the values of a grid or its tree
+///                  (@c Grid::ValueOnCIter, @c Tree::ValueOffIter, etc.)
+/// @param op        a functor of the form <tt>void op(const IterT&, math::Stats&)</tt>,
+///                  where @c IterT is the type of @a iter, that inserts zero or more
+///                  floating-point values into the provided @c math::Stats object
+/// @param threaded  if true, iterate over the grid in parallel
+/// @note When @a threaded is true, each thread gets its own copy of the functor.
+///
+/// @par Example:
+/// Compute statistics of just the active and positive-valued voxels of a scalar,
+/// floating-point grid.
+/// @code
+/// struct Local {
+///     static inline
+///     void addIfPositive(const FloatGrid::ValueOnCIter& iter, math::Stats& stats)
+///     {
+///         const float f = *iter;
+///         if (f > 0.0) {
+///             if (iter.isVoxelValue()) stats.add(f);
+///             else stats.add(f, iter.getVoxelCount());
+///         }
+///     }
+/// };
+/// FloatGrid grid = ...;
+/// math::Stats stats =
+///     tools::statistics(grid.cbeginValueOn(), Local::addIfPositive, /*threaded=*/true);
+/// @endcode
+template<typename IterT, typename ValueOp>
+inline math::Stats
+statistics(const IterT& iter, const ValueOp& op, bool threaded);
+
+
+/// @brief Iterate over a grid and compute statistics (mean, variance, etc.)
+/// of the values produced by applying a given operator (see math/Operators.h)
+/// at each voxel that is visited.
+/// @param iter      an iterator over the values of a grid or its tree
+///                  (@c Grid::ValueOnCIter, @c Tree::ValueOffIter, etc.)
+/// @param op        an operator object with a method of the form
+///                  <tt>double result(Accessor&, const Coord&)</tt>
+/// @param threaded  if true, iterate over the grid in parallel
+/// @note World-space operators, whose @c result() methods are of the form
+/// <tt>double result(const Map&, Accessor&, const Coord&)</tt>, must be wrapped
+/// in a math::MapAdapter.
+/// @note Vector-valued operators like math::Gradient must be wrapped in an adapter
+/// such as math::OpMagnitude.
+///
+/// @par Example:
+/// Compute statistics of the magnitude of the gradient at the active voxels of
+/// a scalar, floating-point grid.  (Note the use of the math::MapAdapter and
+/// math::OpMagnitude adapters.)
+/// @code
+/// FloatGrid grid = ...;
+///
+/// // Assume that we know that the grid has a uniform scale map.
+/// typedef math::UniformScaleMap MapType;
+/// // Specify a world-space gradient operator that uses first-order differencing.
+/// typedef math::Gradient<MapType, math::FD_1ST> GradientOp;
+/// // Wrap the operator with an adapter that computes the magnitude of the gradient.
+/// typedef math::OpMagnitude<GradientOp, MapType> MagnitudeOp;
+/// // Wrap the operator with an adapter that associates a map with it.
+/// typedef math::MapAdapter<MapType, MagnitudeOp, double> CompoundOp;
+///
+/// if (MapType::Ptr map = grid.constTransform().constMap<MapType>()) {
+///     math::Stats stats = tools::opStatistics(grid.cbeginValueOn(), CompoundOp(*map));
+/// }
+/// @endcode
+///
+/// @par Example:
+/// Compute statistics of the divergence at the active voxels of a vector-valued grid.
+/// @code
+/// Vec3SGrid grid = ...;
+///
+/// // Assume that we know that the grid has a uniform scale map.
+/// typedef math::UniformScaleMap MapType;
+/// // Specify a world-space divergence operator that uses first-order differencing.
+/// typedef math::Divergence<MapType, math::FD_1ST> DivergenceOp;
+/// // Wrap the operator with an adapter that associates a map with it.
+/// typedef math::MapAdapter<MapType, DivergenceOp, double> CompoundOp;
+///
+/// if (MapType::Ptr map = grid.constTransform().constMap<MapType>()) {
+///     math::Stats stats = tools::opStatistics(grid.cbeginValueOn(), CompoundOp(*map));
+/// }
+/// @endcode
+///
+/// @par Example:
+/// As above, but computing the divergence in index space.
+/// @code
+/// Vec3SGrid grid = ...;
+///
+/// // Specify an index-space divergence operator that uses first-order differencing.
+/// typedef math::ISDivergence<math::FD_1ST> DivergenceOp;
+///
+/// math::Stats stats = tools::opStatistics(grid.cbeginValueOn(), DivergenceOp());
+/// @endcode
+template<typename OperatorT, typename IterT>
+inline math::Stats
+opStatistics(const IterT& iter, const OperatorT& op = OperatorT(), bool threaded = true);
+
+/// @brief Same as opStatistics except it returns a math::Extrema vs a math::Stats
+template<typename OperatorT, typename IterT>
+inline math::Extrema
+opExtrema(const IterT& iter, const OperatorT& op = OperatorT(), bool threaded = true);
+
+////////////////////////////////////////
+
+
+namespace stats_internal {
+
+/// @todo This traits class is needed because tree::TreeValueIteratorBase uses
+/// the name ValueT for the type of the value to which the iterator points,
+/// whereas node-level iterators use the name ValueType.
+template<typename IterT, typename AuxT = void>
+struct IterTraits { // primary template: iterators that expose ValueType
+    typedef typename IterT::ValueType ValueType;
+};
+
+template<typename TreeT, typename ValueIterT>
+struct IterTraits<tree::TreeValueIteratorBase<TreeT, ValueIterT> > { // tree-level: map ValueT to ValueType
+    typedef typename tree::TreeValueIteratorBase<TreeT, ValueIterT>::ValueT ValueType;
+};
+
+
+// Helper class to compute a scalar value from either a scalar or a vector value
+// (the latter by computing the vector's magnitude); selected via the IsVector flag
+template<typename T, bool IsVector> struct GetValImpl;
+
+template<typename T>
+struct GetValImpl<T, /*IsVector=*/false> {
+    static inline double get(const T& val) { return double(val); } // scalar: plain conversion
+};
+
+template<typename T>
+struct GetValImpl<T, /*IsVector=*/true> {
+    static inline double get(const T& val) { return val.length(); } // vector: its length()
+};
+
+
+// Functor that reduces the value referenced by a tree or node iterator to a
+// double (scalar conversion, or vector magnitude) and adds it to a StatsT
+// accumulator; non-voxel values are added together with iter.getVoxelCount().
+template<typename IterT, typename StatsT>
+struct GetVal
+{
+    typedef typename IterTraits<IterT>::ValueType ValueT;
+    typedef GetValImpl<ValueT, VecTraits<ValueT>::IsVec> ImplT;
+    inline void operator()(const IterT& iter, StatsT& stats) const {
+        const double val = ImplT::get(*iter);
+        if (!iter.isVoxelValue()) stats.add(val, iter.getVoxelCount());
+        else stats.add(val);
+    }
+};
+
+// Helper class to accumulate the values produced by a ValueOp functor
+// into a StatsT accumulator (e.g., math::Stats or math::Extrema)
+template<typename IterT, typename ValueOp, typename StatsT>
+struct StatsOp
+{
+    StatsOp(const ValueOp& op): getValue(op) {} // stats is default-constructed
+
+    // Accumulate voxel and tile values into this functor's Stats object.
+    inline void operator()(const IterT& iter) { getValue(iter, stats); }
+
+    // Accumulate another functor's Stats object into this functor's.
+    inline void join(StatsOp& other) { stats.add(other.stats); }
+
+    StatsT stats; // result; read by the callers after accumulation
+    ValueOp getValue; // copy of the value functor
+};
+
+
+// Helper class to accumulate the values produced by a ValueOp functor
+// into a math::Histogram object
+template<typename IterT, typename ValueOp>
+struct HistOp
+{
+    HistOp(const ValueOp& op, double vmin, double vmax, size_t bins):
+        hist(vmin, vmax, bins), getValue(op) // initialized in declaration order: hist, getValue
+    {}
+
+    // Accumulate voxel and tile values into this functor's Histogram object.
+    inline void operator()(const IterT& iter) { getValue(iter, hist); }
+
+    // Accumulate another functor's Histogram object into this functor's.
+    inline void join(HistOp& other) { hist.add(other.hist); }
+
+    math::Histogram hist; // result; read by histogram() after accumulation
+    ValueOp getValue; // copy of the value functor
+};
+
+
+// Helper class to apply an operator such as math::Gradient or math::Laplacian
+// to voxels and accumulate the results into a StatsT object (math::Stats or
+// math::Extrema); tiles are expanded and the operator is applied at every voxel
+template<typename IterT, typename OpT, typename StatsT>
+struct MathOp
+{
+    typedef typename IterT::TreeT                     TreeT;
+    typedef typename TreeT::ValueType                 ValueT;
+    typedef typename tree::ValueAccessor<const TreeT> ConstAccessor;
+
+    // Each thread gets its own accessor and its own copy of the operator.
+    ConstAccessor mAcc;
+    OpT mOp;
+    StatsT mStats;
+
+    template<typename TreeT>
+    static inline TreeT* THROW_IF_NULL(TreeT* ptr) {
+        if (ptr == NULL) OPENVDB_THROW(ValueError, "iterator references a null tree");
+        return ptr;
+    }
+
+    MathOp(const IterT& iter, const OpT& op):
+        mAcc(*THROW_IF_NULL(iter.getTree())), mOp(op) // throws ValueError on a null tree
+    {}
+
+    // Accumulate voxel and tile values into this functor's Stats object.
+    void operator()(const IterT& it)
+    {
+        if (it.isVoxelValue()) {
+            // Add the result of applying the operator at a single voxel.
+            mStats.add(mOp.result(mAcc, it.getCoord()));
+        } else {
+            // Iterate over the voxels enclosed by a tile and add the results
+            // of applying the operator at each voxel.
+            /// @todo This could be specialized to be done more efficiently for some operators.
+            /// For example, all voxels in the interior of a tile (i.e., not on the borders)
+            /// have gradient zero, so there's no need to apply the operator to every voxel.
+            CoordBBox bbox = it.getBoundingBox();
+            Coord xyz;
+            int &x = xyz.x(), &y = xyz.y(), &z = xyz.z(); // loop variables alias xyz's components
+            for (x = bbox.min().x(); x <= bbox.max().x(); ++x) {
+                for (y = bbox.min().y(); y <= bbox.max().y(); ++y) {
+                    for (z = bbox.min().z(); z <= bbox.max().z(); ++z) {
+                        mStats.add(mOp.result(mAcc, xyz)); // current voxel, not it.getCoord()
+                    }
+                }
+            }
+        }
+    }
+
+    // Accumulate another functor's Stats object into this functor's.
+    inline void join(MathOp& other) { mStats.add(other.mStats); }
+}; // struct MathOp
+
+} // namespace stats_internal
+
+
+// Bin the scalar values (or vector magnitudes) visited by iter into numBins bins.
+template<typename IterT>
+inline math::Histogram
+histogram(const IterT& iter, double vmin, double vmax, size_t numBins, bool threaded)
+{
+    typedef stats_internal::GetVal<IterT, math::Histogram> GetValT;
+    stats_internal::HistOp<IterT, GetValT> histOp(GetValT(), vmin, vmax, numBins);
+    tools::accumulate(iter, histOp, threaded);
+    return histOp.hist;
+}
+
+// Default overload: delegates to the functor overload with stats_internal::GetVal.
+template<typename IterT> inline math::Extrema
+extrema(const IterT& iter, bool threaded)
+{
+    typedef stats_internal::GetVal<IterT, math::Extrema> GetValT;
+    return extrema(iter, GetValT(), threaded);
+}
+
+// Default overload: delegates to the functor overload with stats_internal::GetVal.
+template<typename IterT> inline math::Stats
+statistics(const IterT& iter, bool threaded)
+{
+    typedef stats_internal::GetVal<IterT, math::Stats> GetValT;
+    return statistics(iter, GetValT(), threaded);
+}
+
+// Accumulate extrema of the values that the caller-supplied functor valOp inserts.
+template<typename IterT, typename ValueOp> inline math::Extrema
+extrema(const IterT& iter, const ValueOp& valOp, bool threaded)
+{
+    stats_internal::StatsOp<IterT, const ValueOp, math::Extrema> accumulator(valOp);
+    tools::accumulate(iter, accumulator, threaded);
+    return accumulator.stats;
+}
+
+// Accumulate statistics of the values that the caller-supplied functor valOp inserts.
+template<typename IterT, typename ValueOp> inline math::Stats
+statistics(const IterT& iter, const ValueOp& valOp, bool threaded)
+{
+    stats_internal::StatsOp<IterT, const ValueOp, math::Stats> accumulator(valOp);
+    tools::accumulate(iter, accumulator, threaded);
+    return accumulator.stats;
+}
+
+
+// Accumulate extrema of the results of applying op at the voxels visited by iter.
+template<typename OperatorT, typename IterT> inline math::Extrema
+opExtrema(const IterT& iter, const OperatorT& op, bool threaded)
+{
+    stats_internal::MathOp<IterT, OperatorT, math::Extrema> mathOp(iter, op);
+    tools::accumulate(iter, mathOp, threaded);
+    return mathOp.mStats;
+}
+
+// Accumulate statistics of the results of applying op at the voxels visited by iter.
+template<typename OperatorT, typename IterT> inline math::Stats
+opStatistics(const IterT& iter, const OperatorT& op, bool threaded)
+{
+    stats_internal::MathOp<IterT, OperatorT, math::Stats> mathOp(iter, op);
+    tools::accumulate(iter, mathOp, threaded);
+    return mathOp.mStats;
+}
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_STATISTICS_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/TopologyToLevelSet.h b/nuparu/include/openvdb_new/tools/TopologyToLevelSet.h
new file mode 100644
index 00000000..1ae83f7a
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/TopologyToLevelSet.h
@@ -0,0 +1,284 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file    TopologyToLevelSet.h
+///
+/// @brief   This tool converts active grid topology into a signed
+///          distance field encoded as a narrow band level set.
+///
+/// @details The boundary between active and inactive voxels is treated
+///          as the zero crossing for the level set.
+///
+/// @par Example:
+/// Combine with @c tools::createPointMaskGrid for fast point cloud to level set conversion.
+///
+/// @author  D.J. Hill
+
+#ifndef OPENVDB_TOOLS_TOPOLOGY_TO_LEVELSET_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_TOPOLOGY_TO_LEVELSET_HAS_BEEN_INCLUDED
+
+#include "LevelSetFilter.h"
+#include "Morphology.h" // for {dilate|erode}Voxels
+#include "Prune.h"// for pruneInactive
+#include "SignedFloodFill.h" // for signedFloodFill
+
+#include <openvdb/Types.h>
+#include <openvdb/Grid.h>
+#include <openvdb/math/FiniteDifference.h> // for math::BiasedGradientScheme
+#include <openvdb/util/NullInterrupter.h>
+
+#include <tbb/blocked_range.h>
+#include <tbb/parallel_for.h>
+
+#include <vector>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+
+/// @brief  Compute the narrow-band signed distance to the boundary
+///         between active and inactive voxels in the input grid.
+///
+/// @return A shared pointer to a new signed distance field of type @c float
+///
+/// @param grid           Input grid of arbitrary type whose active voxels are used
+///                       in constructing the level set.
+/// @param halfBandWidth  Half the width of the narrow band, in voxel units
+/// @param closingWidth   Number of iterations used to first expand and then shrink
+///                       the filled voxel region.
+///                       This causes holes and valleys to be filled.
+/// @param dilation       Number of iterations used to expand the filled voxel region.
+/// @param smoothingSteps Number of smoothing iterations
+template<typename GridType>
+inline typename GridType::template ValueConverter<float>::Type::Ptr
+topologyToLevelSet(const GridType& grid, int halfBandWidth = 3, int closingWidth = 1,
+    int dilation = 0, int smoothingSteps = 0);
+
+
+/// @brief  Compute the narrow-band signed distance to the boundary
+///         between active and inactive voxels in the input grid.
+///
+/// @return A shared pointer to a new signed distance field of type @c float
+///
+/// @param grid           Input grid of arbitrary type whose active voxels are used
+///                       in constructing the level set.
+/// @param halfBandWidth  Half the width of the narrow band, in voxel units
+/// @param closingWidth   Number of iterations used to first expand and then shrink
+///                       the filled voxel region.
+///                       This causes holes and valleys to be filled.
+/// @param dilation       Number of iterations used to expand the filled voxel region.
+/// @param smoothingSteps Number of smoothing iterations
+/// @param interrupt      Optional object adhering to the util::NullInterrupter interface.
+template<typename GridType, typename InterrupterType>
+inline typename GridType::template ValueConverter<float>::Type::Ptr
+topologyToLevelSet(const GridType& grid, int halfBandWidth = 3, int closingWidth = 1,
+    int dilation = 0, int smoothingSteps = 0, InterrupterType* interrupt = NULL);
+
+
+////////////////////////////////////////
+////////////////////////////////////////
+
+// Implementation details
+
+
+namespace ttls_internal {
+
+
+// Body for tbb::parallel_for over a list of leaf nodes.  For every active
+// voxel of each listed leaf, the stored value is replaced by the minimum of
+// itself and (offset + the value at the same position in the leaf with the
+// same origin in a second tree).  Leaves that have no such counterpart in
+// the second tree are left untouched.
+template<typename TreeType>
+struct OffsetAndMinComp
+{
+    typedef typename TreeType::LeafNodeType     LeafNodeType;
+    typedef typename TreeType::ValueType        ValueType;
+
+    // Stores a pointer to the first element of @a lhsNodes (NULL if the list
+    // is empty), the address of @a rhsTree and the offset; none of them is copied deeply.
+    OffsetAndMinComp(std::vector<LeafNodeType*>& lhsNodes, const TreeType& rhsTree, ValueType offset)
+        : mLhsNodes(lhsNodes.empty() ? NULL : &lhsNodes[0])
+        , mRhsTree(&rhsTree)
+        , mOffset(offset)
+    {
+    }
+
+    // Process the leaf nodes whose indices fall in @a range.
+    void operator()(const tbb::blocked_range<size_t>& range) const
+    {
+        typedef typename LeafNodeType::ValueOnIter ActiveIter;
+
+        // Each invocation uses its own accessor into the right-hand tree.
+        tree::ValueAccessor<const TreeType> rhsAccessor(*mRhsTree);
+        const ValueType shift = mOffset;
+
+        for (size_t idx = range.begin(), last = range.end(); idx < last; ++idx) {
+
+            LeafNodeType& target = *mLhsNodes[idx];
+            const LeafNodeType* const source = rhsAccessor.probeConstLeaf(target.origin());
+
+            if (source) {
+                for (ActiveIter voxel = target.beginValueOn(); voxel; ++voxel) {
+                    // The iterator only exposes a const reference; cast it away
+                    // so the voxel can be overwritten in place.
+                    ValueType& current = const_cast<ValueType&>(voxel.getValue());
+                    current = std::min(current, shift + source->getValue(voxel.pos()));
+                }
+            }
+        }
+    }
+
+private:
+    LeafNodeType** const    mLhsNodes;  // first element of the caller's leaf list (not owned)
+    const TreeType* const   mRhsTree;   // tree supplying the comparison values (not owned)
+    const ValueType         mOffset;    // added to each right-hand value before the min
+}; // struct OffsetAndMinComp
+
+
+// Run a LevelSetFilter over @a grid: select the math::FIRST_BIAS spatial
+// scheme, set the normalization count to @a halfWidthInVoxels, then call
+// normalize() followed by prune().  @a interrupt is handed to the filter.
+template<typename GridType, typename InterrupterType>
+inline void
+normalizeLevelSet(GridType& grid, const int halfWidthInVoxels, InterrupterType* interrupt = NULL)
+{
+    typedef LevelSetFilter<GridType, GridType, InterrupterType> FilterType;
+
+    FilterType normalizer(grid, interrupt);
+    normalizer.setSpatialScheme(math::FIRST_BIAS);
+    normalizer.setNormCount(halfWidthInVoxels);
+    normalizer.normalize();
+    normalizer.prune();
+}
+
+
+// Smooth @a grid in three stages:
+//  1. apply up to @a iterations passes of LevelSetFilter::mean(1) to a copy
+//     of the grid (stopping early if the interrupter reports an interruption),
+//  2. for each active voxel in the leaf nodes of @a grid, take the minimum of
+//     its value and the smoothed value minus half of voxelSize()[0],
+//  3. renormalize @a grid with normalizeLevelSet().
+template<typename GridType, typename InterrupterType>
+inline void
+smoothLevelSet(GridType& grid, int iterations, int halfBandWidthInVoxels, InterrupterType* interrupt = NULL)
+{
+    typedef typename GridType::ValueType                            ValueType;
+    typedef typename GridType::TreeType                             TreeType;
+    typedef typename TreeType::LeafNodeType                         LeafNodeType;
+    typedef LevelSetFilter<GridType, GridType, InterrupterType>     FilterType;
+
+    // The mean filter operates on a copy so that the original values remain
+    // available for the min comparison below.
+    GridType smoothedGrid(grid);
+
+    FilterType meanFilter(smoothedGrid, interrupt);
+    meanFilter.setSpatialScheme(math::FIRST_BIAS);
+
+    int pass = 0;
+    while (pass < iterations && !(interrupt && interrupt->wasInterrupted())) {
+        meanFilter.mean(1);
+        ++pass;
+    }
+
+    std::vector<LeafNodeType*> leafNodes;
+    grid.tree().getNodes(leafNodes);
+
+    const ValueType halfVoxel = ValueType(double(0.5) * grid.transform().voxelSize()[0]);
+
+    const tbb::blocked_range<size_t> leafRange(0, leafNodes.size());
+    tbb::parallel_for(leafRange,
+        OffsetAndMinComp<TreeType>(leafNodes, smoothedGrid.tree(), -halfVoxel));
+
+    // Clean up any damage that was done by the min operation
+    normalizeLevelSet(grid, halfBandWidthInVoxels, interrupt);
+}
+
+
+} // namespace ttls_internal
+
+
+////////////////////////////////////////
+
+
+// Build a narrow-band float level set whose zero crossing is the boundary
+// between active and inactive voxels of @a grid.
+//
+// grid            input grid; only its active topology and its transform are read
+// halfBandWidth   half width of the narrow band in voxels (clamped to >= 1)
+// closingWidth    number of dilate-then-erode iterations (clamped to >= 0)
+// dilation        extra dilation iterations (clamped to >= 0)
+// smoothingSteps  number of smoothing passes; no smoothing when <= 0
+// interrupt       forwarded to the normalization and smoothing helpers
+//
+// Returns a shared pointer to a new grid of class GRID_LEVEL_SET that carries
+// a copy of the input grid's transform.
+template<typename GridType, typename InterrupterType>
+inline typename GridType::template ValueConverter<float>::Type::Ptr
+topologyToLevelSet(const GridType& grid, int halfBandWidth, int closingWidth,
+    int dilation, int smoothingSteps, InterrupterType* interrupt)
+{
+    typedef typename GridType::template ValueConverter<float>::Type             FloatGridType;
+    typedef typename FloatGridType::TreeType                                    FloatTreeType;
+    typedef typename FloatTreeType::template ValueConverter<ValueMask>::Type    MaskTreeType;
+
+    // Sanitize the iteration counts.  Without the clamp on dilation a negative
+    // value would make the dilation count below smaller than the erosion
+    // count, so the region would be shrunk instead of expanded.
+    halfBandWidth = std::max(halfBandWidth, 1);
+    closingWidth = std::max(closingWidth, 0);
+    dilation = std::max(dilation, 0);
+
+    MaskTreeType regionMask(grid.tree(), false, TopologyCopy());
+
+    // closing operation and padding
+    openvdb::tools::dilateVoxels(regionMask, closingWidth + dilation);
+    openvdb::tools::erodeVoxels(regionMask, closingWidth);
+
+
+    // Construct inside band mask
+
+    MaskTreeType coreMask(regionMask);
+    openvdb::tools::erodeVoxels(coreMask, halfBandWidth);
+
+    regionMask.topologyDifference(coreMask);
+    tools::pruneInactive(regionMask,  /*threading=*/true);
+
+    // Generate a volume with an implicit zero crossing at the boundary
+    // between active and inactive values in the input grid.
+
+    const float width = float(grid.transform().voxelSize()[0] * double(halfBandWidth));
+
+    typename FloatTreeType::Ptr resultTree(
+        new FloatTreeType(regionMask, /*inactive=*/width, /*active=*/-width, openvdb::TopologyCopy()));
+
+    // Construct outside band mask
+    openvdb::tools::dilateVoxels(regionMask, halfBandWidth);
+    regionMask.topologyDifference(coreMask);
+    tools::pruneInactive(regionMask,  /*threading=*/true);
+
+    // Activate outside band
+    resultTree->topologyUnion(regionMask);
+
+    // Update interior sign
+    tools::signedFloodFill(*resultTree);
+
+    // Embed the tree in a grid to define a transform and voxel size.
+    typename FloatGridType::Ptr resultGrid = FloatGridType::create(resultTree);
+    resultGrid->setTransform(grid.transform().copy());
+    resultGrid->setGridClass(GRID_LEVEL_SET);
+
+    // Use a PDE based scheme to propagate distance values from the
+    // implicit zero crossing.
+    ttls_internal::normalizeLevelSet(*resultGrid, 3*halfBandWidth, interrupt);
+
+    // Optional smoothing operation
+    if (smoothingSteps > 0) {
+        ttls_internal::smoothLevelSet(*resultGrid, smoothingSteps, halfBandWidth, interrupt);
+    }
+
+    return resultGrid;
+}
+
+
+// Overload without an interrupter argument: forwards every parameter to the
+// interrupter-aware overload together with the address of a local
+// util::NullInterrupter.
+template<typename GridType>
+inline typename GridType::template ValueConverter<float>::Type::Ptr
+topologyToLevelSet(const GridType& grid, int halfBandWidth, int closingWidth, int dilation, int smoothingSteps)
+{
+    util::NullInterrupter noOpInterrupter;
+    util::NullInterrupter* const interrupterPtr = &noOpInterrupter;
+    return topologyToLevelSet(
+        grid, halfBandWidth, closingWidth, dilation, smoothingSteps, interrupterPtr);
+}
+
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_TOPOLOGY_TO_LEVELSET_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/ValueTransformer.h b/nuparu/include/openvdb_new/tools/ValueTransformer.h
new file mode 100644
index 00000000..7f51cd8b
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/ValueTransformer.h
@@ -0,0 +1,707 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file ValueTransformer.h
+///
+/// @author Peter Cucka
+///
+/// tools::foreach() and tools::transformValues() transform the values in a grid
+/// by iterating over the grid with a user-supplied iterator and applying a
+/// user-supplied functor at each step of the iteration.  With tools::foreach(),
+/// the transformation is done in-place on the input grid, whereas with
+/// tools::transformValues(), transformed values are written to an output grid
+/// (which can, for example, have a different value type than the input grid).
+/// Both functions can optionally transform multiple values of the grid in parallel.
+///
+/// tools::accumulate() can be used to accumulate the results of applying a functor
+/// at each step of a grid iteration.  (The functor is responsible for storing and
+/// updating intermediate results.)  When the iteration is done serially the behavior is
+/// the same as with tools::foreach(), but when multiple values are processed in parallel,
+/// an additional step is performed: when any two threads finish processing,
+/// @c op.join(otherOp) is called on one thread's functor to allow it to coalesce
+/// its intermediate result with the other thread's.
+///
+/// Finally, tools::setValueOnMin(), tools::setValueOnMax(), tools::setValueOnSum()
+/// and tools::setValueOnMult() are wrappers around Tree::modifyValue() (or
+/// ValueAccessor::modifyValue()) for some common in-place operations.
+/// These are typically significantly faster than calling getValue() followed by setValue().
+
+#ifndef OPENVDB_TOOLS_VALUETRANSFORMER_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_VALUETRANSFORMER_HAS_BEEN_INCLUDED
+
+#include <algorithm> // for std::min(), std::max()
+#include <tbb/parallel_for.h>
+#include <tbb/parallel_reduce.h>
+#include <openvdb/Types.h>
+#include <openvdb/Grid.h>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+/// Iterate over a grid and at each step call @c op(iter).
+/// @param iter      an iterator over a grid or its tree (@c Grid::ValueOnCIter,
+///                  @c Tree::NodeIter, etc.)
+/// @param op        a functor of the form <tt>void op(const IterT&)</tt>, where @c IterT is
+///                  the type of @a iter
+/// @param threaded  if true, transform multiple values of the grid in parallel
+/// @param shareOp   if true and @a threaded is true, all threads use the same functor;
+///                  otherwise, each thread gets its own copy of the @e original functor
+///
+/// @par Example:
+/// Multiply all values (both set and unset) of a scalar, floating-point grid by two.
+/// @code
+/// struct Local {
+///     static inline void op(const FloatGrid::ValueAllIter& iter) {
+///         iter.setValue(*iter * 2);
+///     }
+/// };
+/// FloatGrid grid = ...;
+/// tools::foreach(grid.beginValueAll(), Local::op);
+/// @endcode
+///
+/// @par Example:
+/// Rotate all active vectors of a vector grid by 45 degrees about the y axis.
+/// @code
+/// namespace {
+///     struct MatMul {
+///         math::Mat3s M;
+///         MatMul(const math::Mat3s& mat): M(mat) {}
+///         inline void operator()(const VectorGrid::ValueOnIter& iter) const {
+///             iter.setValue(M.transform(*iter));
+///         }
+///     };
+/// }
+/// {
+///     VectorGrid grid = ...;
+///     tools::foreach(grid.beginValueOn(),
+///         MatMul(math::rotation<math::Mat3s>(math::Y, M_PI_4)));
+/// }
+/// @endcode
+///
+/// @note For more complex operations that require finer control over threading,
+/// consider using @c tbb::parallel_for() or @c tbb::parallel_reduce() in conjunction
+/// with a tree::IteratorRange that wraps a grid or tree iterator.
+template<typename IterT, typename XformOp>
+inline void foreach(const IterT& iter, XformOp& op,
+    bool threaded = true, bool shareOp = true);
+
+template<typename IterT, typename XformOp>
+inline void foreach(const IterT& iter, const XformOp& op,
+    bool threaded = true, bool shareOp = true);
+
+
+/// Iterate over a grid and at each step call <tt>op(iter, accessor)</tt> to
+/// populate (via the accessor) the given output grid, whose @c ValueType
+/// need not be the same as the input grid's.
+/// @param inIter    a non-<tt>const</tt> or (preferably) @c const iterator over an
+///                  input grid or its tree (@c Grid::ValueOnCIter, @c Tree::NodeIter, etc.)
+/// @param outGrid   an empty grid to be populated
+/// @param op        a functor of the form
+///                  <tt>void op(const InIterT&, OutGridT::ValueAccessor&)</tt>,
+///                  where @c InIterT is the type of @a inIter
+/// @param threaded  if true, transform multiple values of the input grid in parallel
+/// @param shareOp   if true and @a threaded is true, all threads use the same functor;
+///                  otherwise, each thread gets its own copy of the @e original functor
+/// @param merge     how to merge intermediate results from multiple threads (see Types.h)
+///
+/// @par Example:
+/// Populate a scalar floating-point grid with the lengths of the vectors from all
+/// active voxels of a vector-valued input grid.
+/// @code
+/// struct Local {
+///     static void op(
+///         const Vec3fGrid::ValueOnCIter& iter,
+///         FloatGrid::ValueAccessor& accessor)
+///     {
+///         if (iter.isVoxelValue()) { // set a single voxel
+///             accessor.setValue(iter.getCoord(), iter->length());
+///         } else { // fill an entire tile
+///             CoordBBox bbox;
+///             iter.getBoundingBox(bbox);
+///             accessor.getTree()->fill(bbox, iter->length());
+///         }
+///     }
+/// };
+/// Vec3fGrid inGrid = ...;
+/// FloatGrid outGrid;
+/// tools::transformValues(inGrid.cbeginValueOn(), outGrid, Local::op);
+/// @endcode
+///
+/// @note For more complex operations that require finer control over threading,
+/// consider using @c tbb::parallel_for() or @c tbb::parallel_reduce() in conjunction
+/// with a tree::IteratorRange that wraps a grid or tree iterator.
+template<typename InIterT, typename OutGridT, typename XformOp>
+inline void transformValues(const InIterT& inIter, OutGridT& outGrid,
+    XformOp& op, bool threaded = true, bool shareOp = true,
+    MergePolicy merge = MERGE_ACTIVE_STATES);
+
+#ifndef _MSC_VER
+template<typename InIterT, typename OutGridT, typename XformOp>
+inline void transformValues(const InIterT& inIter, OutGridT& outGrid,
+    const XformOp& op, bool threaded = true, bool shareOp = true,
+    MergePolicy merge = MERGE_ACTIVE_STATES);
+#endif
+
+
+/// Iterate over a grid and at each step call @c op(iter).  If threading is enabled,
+/// call @c op.join(otherOp) to accumulate intermediate results from pairs of threads.
+/// @param iter      an iterator over a grid or its tree (@c Grid::ValueOnCIter,
+///                  @c Tree::NodeIter, etc.)
+/// @param op        a functor with a join method of the form <tt>void join(XformOp&)</tt>
+///                  and a call method of the form <tt>void op(const IterT&)</tt>,
+///                  where @c IterT is the type of @a iter
+/// @param threaded  if true, transform multiple values of the grid in parallel
+/// @note If @a threaded is true, each thread gets its own copy of the @e original functor.
+/// The order in which threads are joined is unspecified.
+/// @note If @a threaded is false, the join method is never called.
+///
+/// @par Example:
+/// Compute the average of the active values of a scalar, floating-point grid
+/// using the math::Stats class.
+/// @code
+/// namespace {
+///     struct Average {
+///         math::Stats stats;
+///
+///         // Accumulate voxel and tile values into this functor's Stats object.
+///         inline void operator()(const FloatGrid::ValueOnCIter& iter) {
+///             if (iter.isVoxelValue()) stats.add(*iter);
+///             else stats.add(*iter, iter.getVoxelCount());
+///         }
+///
+///         // Accumulate another functor's Stats object into this functor's.
+///         inline void join(Average& other) { stats.add(other.stats); }
+///
+///         // Return the cumulative result.
+///         inline double average() const { return stats.mean(); }
+///     };
+/// }
+/// {
+///     FloatGrid grid = ...;
+///     Average op;
+///     tools::accumulate(grid.cbeginValueOn(), op);
+///     double average = op.average();
+/// }
+/// @endcode
+///
+/// @note For more complex operations that require finer control over threading,
+/// consider using @c tbb::parallel_for() or @c tbb::parallel_reduce() in conjunction
+/// with a tree::IteratorRange that wraps a grid or tree iterator.
+template<typename IterT, typename XformOp>
+inline void accumulate(const IterT& iter, XformOp& op, bool threaded = true);
+
+
+/// @brief Set the value of the voxel at the given coordinates in @a tree to
+/// the minimum of its current value and @a value, and mark the voxel as active.
+/// @details This is typically significantly faster than calling getValue()
+/// followed by setValueOn().
+/// @note @a TreeT can be either a Tree or a ValueAccessor.
+template<typename TreeT>
+inline void setValueOnMin(TreeT& tree, const Coord& xyz, const typename TreeT::ValueType& value);
+
+/// @brief Set the value of the voxel at the given coordinates in @a tree to
+/// the maximum of its current value and @a value, and mark the voxel as active.
+/// @details This is typically significantly faster than calling getValue()
+/// followed by setValueOn().
+/// @note @a TreeT can be either a Tree or a ValueAccessor.
+template<typename TreeT>
+inline void setValueOnMax(TreeT& tree, const Coord& xyz, const typename TreeT::ValueType& value);
+
+/// @brief Set the value of the voxel at the given coordinates in @a tree to
+/// the sum of its current value and @a value, and mark the voxel as active.
+/// @details This is typically significantly faster than calling getValue()
+/// followed by setValueOn().
+/// @note @a TreeT can be either a Tree or a ValueAccessor.
+template<typename TreeT>
+inline void setValueOnSum(TreeT& tree, const Coord& xyz, const typename TreeT::ValueType& value);
+
+/// @brief Set the value of the voxel at the given coordinates in @a tree to
+/// the product of its current value and @a value, and mark the voxel as active.
+/// @details This is typically significantly faster than calling getValue()
+/// followed by setValueOn().
+/// @note @a TreeT can be either a Tree or a ValueAccessor.
+template<typename TreeT>
+inline void setValueOnMult(TreeT& tree, const Coord& xyz, const typename TreeT::ValueType& value);
+
+
+////////////////////////////////////////
+
+
+namespace valxform {
+
+// Small functors that each capture a constant operand @c val and combine it
+// in place with the value handed to operator().
+
+// v <- min(v, val)
+template<typename ValueType>
+struct MinOp {
+    const ValueType val;
+
+    MinOp(const ValueType& operand): val(operand) {}
+
+    inline void operator()(ValueType& target) const
+    {
+        target = std::min<ValueType>(target, val);
+    }
+};
+
+// v <- max(v, val)
+template<typename ValueType>
+struct MaxOp {
+    const ValueType val;
+
+    MaxOp(const ValueType& operand): val(operand) {}
+
+    inline void operator()(ValueType& target) const
+    {
+        target = std::max<ValueType>(target, val);
+    }
+};
+
+// v <- v + val (via operator+=)
+template<typename ValueType>
+struct SumOp {
+    const ValueType val;
+
+    SumOp(const ValueType& operand): val(operand) {}
+
+    inline void operator()(ValueType& target) const
+    {
+        target += val;
+    }
+};
+
+// v <- v * val (via operator*=)
+template<typename ValueType>
+struct MultOp {
+    const ValueType val;
+
+    MultOp(const ValueType& operand): val(operand) {}
+
+    inline void operator()(ValueType& target) const
+    {
+        target *= val;
+    }
+};
+
+} // namespace valxform
+
+
+// Apply a valxform::MinOp built from @a value to the voxel at @a xyz through
+// a single modifyValue() call on @a tree.
+template<typename TreeT>
+inline void
+setValueOnMin(TreeT& tree, const Coord& xyz, const typename TreeT::ValueType& value)
+{
+    typedef typename TreeT::ValueType ValueT;
+    const valxform::MinOp<ValueT> keepSmaller(value);
+    tree.modifyValue(xyz, keepSmaller);
+}
+
+
+// Apply a valxform::MaxOp built from @a value to the voxel at @a xyz through
+// a single modifyValue() call on @a tree.
+template<typename TreeT>
+inline void
+setValueOnMax(TreeT& tree, const Coord& xyz, const typename TreeT::ValueType& value)
+{
+    typedef typename TreeT::ValueType ValueT;
+    const valxform::MaxOp<ValueT> keepLarger(value);
+    tree.modifyValue(xyz, keepLarger);
+}
+
+
+// Apply a valxform::SumOp built from @a value to the voxel at @a xyz through
+// a single modifyValue() call on @a tree.
+template<typename TreeT>
+inline void
+setValueOnSum(TreeT& tree, const Coord& xyz, const typename TreeT::ValueType& value)
+{
+    typedef typename TreeT::ValueType ValueT;
+    const valxform::SumOp<ValueT> addOperand(value);
+    tree.modifyValue(xyz, addOperand);
+}
+
+
+// Apply a valxform::MultOp built from @a value to the voxel at @a xyz through
+// a single modifyValue() call on @a tree.
+template<typename TreeT>
+inline void
+setValueOnMult(TreeT& tree, const Coord& xyz, const typename TreeT::ValueType& value)
+{
+    typedef typename TreeT::ValueType ValueT;
+    const valxform::MultOp<ValueT> scaleByOperand(value);
+    tree.modifyValue(xyz, scaleByOperand);
+}
+
+
+////////////////////////////////////////
+
+
+namespace valxform {
+
+// Applies one functor, held by reference, to every element of an iterator
+// range.  Because only a reference is stored, copies of this object refer to
+// the same functor.
+template<typename IterT, typename OpT>
+class SharedOpApplier
+{
+public:
+    typedef typename tree::IteratorRange<IterT> IterRange;
+
+    SharedOpApplier(const IterT& iter, OpT& op): mIter(iter), mOp(op) {}
+
+    // Visit the whole range built from the stored iterator, either through
+    // tbb::parallel_for or directly on the calling thread.
+    void process(bool threaded = true)
+    {
+        IterRange fullRange(mIter);
+        if (!threaded) {
+            (*this)(fullRange);
+            return;
+        }
+        tbb::parallel_for(fullRange, *this);
+    }
+
+    // Call the functor on each element of @a subrange, advancing the range
+    // until it is exhausted.
+    void operator()(IterRange& subrange) const
+    {
+        while (subrange) {
+            mOp(subrange.iterator());
+            ++subrange;
+        }
+    }
+
+private:
+    IterT mIter;
+    OpT& mOp; // the caller's functor (not owned)
+};
+
+
+// Applies a private copy of a functor to every element of an iterator range.
+// Each copy of this object re-copies the functor from the original one
+// supplied at construction.
+template<typename IterT, typename OpT>
+class CopyableOpApplier
+{
+public:
+    typedef typename tree::IteratorRange<IterT> IterRange;
+
+    CopyableOpApplier(const IterT& iter, const OpT& op)
+        : mIter(iter)
+        , mOp(op)
+        , mOrigOp(&op)
+    {
+    }
+
+    // When splitting this task, give the subtask a copy of the original functor,
+    // not of this task's functor, which might have been modified arbitrarily.
+    CopyableOpApplier(const CopyableOpApplier& other)
+        : mIter(other.mIter)
+        , mOp(*other.mOrigOp)
+        , mOrigOp(other.mOrigOp)
+    {
+    }
+
+    // Visit the whole range built from the stored iterator, either through
+    // tbb::parallel_for or directly on the calling thread.
+    void process(bool threaded = true)
+    {
+        IterRange fullRange(mIter);
+        if (!threaded) {
+            (*this)(fullRange);
+            return;
+        }
+        tbb::parallel_for(fullRange, *this);
+    }
+
+    // Call this object's functor copy on each element of @a subrange,
+    // advancing the range until it is exhausted.
+    void operator()(IterRange& subrange) const
+    {
+        while (subrange) {
+            mOp(subrange.iterator());
+            ++subrange;
+        }
+    }
+
+private:
+    IterT mIter;
+    OpT mOp; // copy of original functor
+    OpT const * const mOrigOp; // pointer to original functor
+};
+
+} // namespace valxform
+
+
+// Dispatch for a mutable functor: when @a shared is true the functor is
+// applied through a SharedOpApplier (one functor, held by reference);
+// otherwise through a CopyableOpApplier (which works on copies of @a op).
+template<typename IterT, typename XformOp>
+inline void
+foreach(const IterT& iter, XformOp& op, bool threaded, bool shared)
+{
+    typedef typename valxform::SharedOpApplier<IterT, XformOp>      SharedApplier;
+    typedef typename valxform::CopyableOpApplier<IterT, XformOp>    CopyingApplier;
+
+    if (!shared) {
+        CopyingApplier applier(iter, op);
+        applier.process(threaded);
+        return;
+    }
+
+    SharedApplier applier(iter, op);
+    applier.process(threaded);
+}
+
+// Dispatch for a const functor.  The share flag is ignored here:
+// const ops are shared across threads, not copied.
+template<typename IterT, typename XformOp>
+inline void
+foreach(const IterT& iter, const XformOp& op, bool threaded, bool /*shared*/)
+{
+    typedef typename valxform::SharedOpApplier<IterT, const XformOp> SharedApplier;
+
+    SharedApplier applier(iter, op);
+    applier.process(threaded);
+}
+
+
+////////////////////////////////////////
+
+
+namespace valxform {
+
+/// @brief Reduction body that applies a single functor, held by reference, to
+/// each element of an input iterator range and lets the functor populate an
+/// output tree through a ValueAccessor.
+/// @details The object created by the caller (the "root") writes into the
+/// caller-supplied output tree.  Objects created through the splitting
+/// constructor allocate their own output tree, which join() merges into
+/// another object's tree and which the destructor frees.
+template<typename InIterT, typename OutTreeT, typename OpT>
+class SharedOpTransformer
+{
+public:
+    typedef typename InIterT::TreeT InTreeT;
+    typedef typename tree::IteratorRange<InIterT> IterRange;
+    typedef typename OutTreeT::ValueType OutValueT;
+
+    /// Root constructor: stores the address of @a outTree (not owned) and a
+    /// reference to @a op.
+    SharedOpTransformer(const InIterT& inIter, OutTreeT& outTree, OpT& op, MergePolicy merge):
+        mIsRoot(true),
+        mInputIter(inIter),
+        mInputTree(inIter.getTree()),
+        mOutputTree(&outTree),
+        mOp(op),
+        mMergePolicy(merge)
+    {
+        // Input and output are the same tree object: emit an informational
+        // message pointing the caller at foreach().  Processing is not aborted.
+        if (static_cast<const void*>(mInputTree) == static_cast<void*>(mOutputTree)) {
+            OPENVDB_LOG_INFO("use tools::foreach(), not transformValues(),"
+                " to transform a grid in place");
+        }
+    }
+
+    /// Splitting constructor
+    /// @details The new object shares the input iterator, input tree pointer,
+    /// functor reference and merge policy with @a other, but allocates a new
+    /// output tree constructed from zeroVal<OutValueT>().
+    SharedOpTransformer(SharedOpTransformer& other, tbb::split):
+        mIsRoot(false),
+        mInputIter(other.mInputIter),
+        mInputTree(other.mInputTree),
+        mOutputTree(new OutTreeT(zeroVal<OutValueT>())),
+        mOp(other.mOp),
+        mMergePolicy(other.mMergePolicy)
+        {}
+
+    ~SharedOpTransformer()
+    {
+        // Delete the output tree only if it was allocated locally
+        // (the top-level output tree was supplied by the caller).
+        if (!mIsRoot) {
+            delete mOutputTree;
+            mOutputTree = NULL;
+        }
+    }
+
+    /// Run the transformation over the whole input range, via
+    /// tbb::parallel_reduce when @a threaded is true and serially otherwise.
+    /// Does nothing if either tree pointer is null.
+    void process(bool threaded = true)
+    {
+        if (!mInputTree || !mOutputTree) return;
+
+        IterRange range(mInputIter);
+
+        // Independently transform elements in the iterator range,
+        // either in parallel or serially.
+        if (threaded) {
+            tbb::parallel_reduce(range, *this);
+        } else {
+            (*this)(range);
+        }
+    }
+
+    /// Transform each element in the given range.
+    void operator()(IterRange& range) const
+    {
+        if (!mOutputTree) return;
+        // A new accessor into this object's output tree is created per call.
+        typename tree::ValueAccessor<OutTreeT> outAccessor(*mOutputTree);
+        for ( ; range; ++range) {
+            mOp(range.iterator(), outAccessor);
+        }
+    }
+
+    /// Merge @a other's output tree into this object's output tree using the
+    /// stored merge policy.
+    void join(const SharedOpTransformer& other)
+    {
+        if (mOutputTree && other.mOutputTree) {
+            mOutputTree->merge(*other.mOutputTree, mMergePolicy);
+        }
+    }
+
+private:
+    bool mIsRoot;               // true only for the object built by the root constructor
+    InIterT mInputIter;
+    const InTreeT* mInputTree;  // obtained from the input iterator; not owned
+    OutTreeT* mOutputTree;      // owned (and deleted) only when mIsRoot is false
+    OpT& mOp;                   // the caller's functor, shared by all split-off objects
+    MergePolicy mMergePolicy;
+}; // class SharedOpTransformer
+
+
+/// @brief Reduction body that applies its own copy of a functor to each
+/// element of an input iterator range and lets the functor populate an
+/// output tree through a ValueAccessor.
+/// @details The object created by the caller (the "root") writes into the
+/// caller-supplied output tree.  Objects created through the splitting
+/// constructor copy the @e original functor and allocate their own output
+/// tree, which join() merges into another object's tree and which the
+/// destructor frees.
+template<typename InIterT, typename OutTreeT, typename OpT>
+class CopyableOpTransformer
+{
+public:
+    typedef typename InIterT::TreeT InTreeT;
+    typedef typename tree::IteratorRange<InIterT> IterRange;
+    typedef typename OutTreeT::ValueType OutValueT;
+
+    /// Root constructor: stores the address of @a outTree (not owned), a copy
+    /// of @a op, and a pointer to the original @a op for later splits.
+    CopyableOpTransformer(const InIterT& inIter, OutTreeT& outTree,
+        const OpT& op, MergePolicy merge):
+        mIsRoot(true),
+        mInputIter(inIter),
+        mInputTree(inIter.getTree()),
+        mOutputTree(&outTree),
+        mOp(op),
+        mOrigOp(&op),
+        mMergePolicy(merge)
+    {
+        // Input and output are the same tree object: emit an informational
+        // message pointing the caller at foreach().  Processing is not aborted.
+        if (static_cast<const void*>(mInputTree) == static_cast<void*>(mOutputTree)) {
+            OPENVDB_LOG_INFO("use tools::foreach(), not transformValues(),"
+                " to transform a grid in place");
+        }
+    }
+
+    // When splitting this task, give the subtask a copy of the original functor,
+    // not of this task's functor, which might have been modified arbitrarily.
+    // The subtask also gets a new output tree constructed from zeroVal<OutValueT>().
+    CopyableOpTransformer(CopyableOpTransformer& other, tbb::split):
+        mIsRoot(false),
+        mInputIter(other.mInputIter),
+        mInputTree(other.mInputTree),
+        mOutputTree(new OutTreeT(zeroVal<OutValueT>())),
+        mOp(*other.mOrigOp),
+        mOrigOp(other.mOrigOp),
+        mMergePolicy(other.mMergePolicy)
+        {}
+
+    ~CopyableOpTransformer()
+    {
+        // Delete the output tree only if it was allocated locally
+        // (the top-level output tree was supplied by the caller).
+        if (!mIsRoot) {
+            delete mOutputTree;
+            mOutputTree = NULL;
+        }
+    }
+
+    /// Run the transformation over the whole input range, via
+    /// tbb::parallel_reduce when @a threaded is true and serially otherwise.
+    /// Does nothing if either tree pointer is null.
+    void process(bool threaded = true)
+    {
+        if (!mInputTree || !mOutputTree) return;
+
+        IterRange range(mInputIter);
+
+        // Independently transform elements in the iterator range,
+        // either in parallel or serially.
+        if (threaded) {
+            tbb::parallel_reduce(range, *this);
+        } else {
+            (*this)(range);
+        }
+    }
+
+    /// Transform each element in the given range.
+    /// @note Unlike SharedOpTransformer's, this call operator is non-const,
+    /// so the functor copy is invoked as a non-const object.
+    void operator()(IterRange& range)
+    {
+        if (!mOutputTree) return;
+        // A new accessor into this object's output tree is created per call.
+        typename tree::ValueAccessor<OutTreeT> outAccessor(*mOutputTree);
+        for ( ; range; ++range) {
+            mOp(range.iterator(), outAccessor);
+        }
+    }
+
+    /// Merge @a other's output tree into this object's output tree using the
+    /// stored merge policy.
+    void join(const CopyableOpTransformer& other)
+    {
+        if (mOutputTree && other.mOutputTree) {
+            mOutputTree->merge(*other.mOutputTree, mMergePolicy);
+        }
+    }
+
+private:
+    bool mIsRoot;               // true only for the object built by the root constructor
+    InIterT mInputIter;
+    const InTreeT* mInputTree;  // obtained from the input iterator; not owned
+    OutTreeT* mOutputTree;      // owned (and deleted) only when mIsRoot is false
+    OpT mOp; // copy of original functor
+    OpT const * const mOrigOp; // pointer to original functor
+    MergePolicy mMergePolicy;
+}; // class CopyableOpTransformer
+
+} // namespace valxform
+
+
+////////////////////////////////////////
+
+
+template<typename InIterT, typename OutGridT, typename XformOp>
+inline void
+transformValues(const InIterT& inIter, OutGridT& outGrid, XformOp& op,
+    bool threaded, bool shared, MergePolicy merge)
+{
+    typedef TreeAdapter<OutGridT> Adapter;
+    typedef typename Adapter::TreeType OutTreeT;
+    if (shared) { // all tasks call the caller's op through a reference
+        typedef typename valxform::SharedOpTransformer<InIterT, OutTreeT, XformOp> Processor;
+        Processor proc(inIter, Adapter::tree(outGrid), op, merge);
+        proc.process(threaded);
+    } else { // each task works on its own copy of op
+        typedef typename valxform::CopyableOpTransformer<InIterT, OutTreeT, XformOp> Processor;
+        Processor proc(inIter, Adapter::tree(outGrid), op, merge);
+        proc.process(threaded);
+    }
+}
+
+#ifndef _MSC_VER
+template<typename InIterT, typename OutGridT, typename XformOp>
+inline void
+transformValues(const InIterT& inIter, OutGridT& outGrid, const XformOp& op,
+    bool threaded, bool /*share*/, MergePolicy merge)
+{
+    typedef TreeAdapter<OutGridT> Adapter;
+    typedef typename Adapter::TreeType OutTreeT;
+    // Const ops are always shared across threads, never copied, so the share flag is ignored.
+    typedef typename valxform::SharedOpTransformer<InIterT, OutTreeT, const XformOp> Processor;
+    Processor proc(inIter, Adapter::tree(outGrid), op, merge);
+    proc.process(threaded);
+}
+#endif
+
+
+////////////////////////////////////////
+
+
+namespace valxform {
+
+template<typename IterT, typename OpT>
+class OpAccumulator // applies a functor over an iterator range; tasks are combined via OpT::join()
+{
+public:
+    typedef typename tree::IteratorRange<IterT> IterRange;
+
+    // The root task makes a const copy of the original functor (mOrigOp)
+    // and keeps a pointer to the original functor (mOp), which it then modifies.
+    // Each subtask keeps a const pointer to the root task's mOrigOp
+    // and makes and then modifies a non-const copy (mOp) of it.
+    OpAccumulator(const IterT& iter, OpT& op):
+        mIsRoot(true),
+        mIter(iter),
+        mOp(&op),
+        mOrigOp(new OpT(op))
+    {}
+
+    // When splitting this task, give the subtask a copy of the original functor,
+    // not of this task's functor, which might have been modified arbitrarily.
+    OpAccumulator(OpAccumulator& other, tbb::split):
+        mIsRoot(false),
+        mIter(other.mIter),
+        mOp(new OpT(*other.mOrigOp)),
+        mOrigOp(other.mOrigOp)
+    {}
+
+    ~OpAccumulator() { if (mIsRoot) delete mOrigOp; else delete mOp; } // free only what this task allocated
+
+    void process(bool threaded = true)
+    {
+        IterRange range(mIter);
+        if (threaded) {
+            tbb::parallel_reduce(range, *this); // uses the tbb::split constructor and join()
+        } else {
+            (*this)(range);
+        }
+    }
+
+    void operator()(IterRange& r) { for ( ; r; ++r) (*mOp)(r.iterator()); }
+
+    void join(OpAccumulator& other) { mOp->join(*other.mOp); } // fold other's functor into ours
+
+private:
+    const bool mIsRoot;
+    const IterT mIter;
+    OpT* mOp; // root: the caller's functor; subtask: a private copy (modified in both cases)
+    OpT const * const mOrigOp; // unmodified copy of original functor, allocated by the root task
+}; // class OpAccumulator
+
+} // namespace valxform
+
+
+////////////////////////////////////////
+
+
+template<typename IterT, typename XformOp>
+inline void
+accumulate(const IterT& iter, XformOp& op, bool threaded)
+{
+    typename valxform::OpAccumulator<IterT, XformOp> proc(iter, op); // root task modifies op itself
+    proc.process(threaded); // serial when threaded is false, tbb::parallel_reduce otherwise
+}
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_VALUETRANSFORMER_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/VectorTransformer.h b/nuparu/include/openvdb_new/tools/VectorTransformer.h
new file mode 100644
index 00000000..1a3e56a5
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/VectorTransformer.h
@@ -0,0 +1,158 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file VectorTransformer.h
+
+#ifndef OPENVDB_TOOLS_VECTORTRANSFORMER_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_VECTORTRANSFORMER_HAS_BEEN_INCLUDED
+
+#include <openvdb/Types.h>
+#include <openvdb/math/Mat4.h>
+#include <openvdb/math/Vec3.h>
+#include "ValueTransformer.h" // for tools::foreach()
+#include <boost/utility/enable_if.hpp>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+/// @brief Apply an affine transform to the voxel values of a vector-valued grid
+/// in accordance with the grid's vector type (covariant, contravariant, etc.).
+/// @throw TypeError if the grid is not vector-valued
+template<typename GridType>
+inline void
+transformVectors(GridType&, const Mat4d&);
+
+
+////////////////////////////////////////
+
+
+// Functors for use with tools::foreach() to transform vector voxel values
+
+struct HomogeneousMatMul // functor: replaces each value v with mat.transformH(v)
+{
+    const Mat4d mat;
+    HomogeneousMatMul(const Mat4d& _mat): mat(_mat) {}
+    template<typename TreeIterT> void operator()(const TreeIterT& it) const
+    {
+        Vec3d v(*it); // read the current value as a Vec3d
+        it.setValue(mat.transformH(v)); // write the transformed value back through the iterator
+    }
+};
+
+struct MatMul // functor: replaces each value v with mat.transform3x3(v)
+{
+    const Mat4d mat;
+    MatMul(const Mat4d& _mat): mat(_mat) {}
+    template<typename TreeIterT>
+    void operator()(const TreeIterT& it) const
+    {
+        Vec3d v(*it); // read the current value as a Vec3d
+        it.setValue(mat.transform3x3(v)); // write the transformed value back through the iterator
+    }
+};
+
+struct MatMulNormalize // functor: replaces each value v with the normalized mat.transform3x3(v)
+{
+    const Mat4d mat;
+    MatMulNormalize(const Mat4d& _mat): mat(_mat) {}
+    template<typename TreeIterT>
+    void operator()(const TreeIterT& it) const
+    {
+        Vec3d v(*it); // read the current value as a Vec3d
+        v = mat.transform3x3(v);
+        v.normalize(); // normalize after transforming, before writing back
+        it.setValue(v);
+    }
+};
+
+
+/// @internal This overload is enabled only for grids whose value type is not a vector; it always throws.
+template<typename GridType> inline
+typename boost::disable_if_c<VecTraits<typename GridType::ValueType>::IsVec, void>::type
+doTransformVectors(GridType&, const Mat4d&)
+{
+    OPENVDB_THROW(TypeError, "tools::transformVectors() requires a vector-valued grid");
+}
+
+/// @internal This overload is enabled only for vector-valued grids; it modifies the grid in place.
+template<typename GridType> inline
+typename boost::enable_if_c<VecTraits<typename GridType::ValueType>::IsVec, void>::type
+doTransformVectors(GridType& grid, const Mat4d& mat)
+{
+    if (!grid.isInWorldSpace()) return; // grids not in world space are left untouched
+
+    const VecType vecType = grid.getVectorType();
+    switch (vecType) {
+        case VEC_COVARIANT:
+        case VEC_COVARIANT_NORMALIZE:
+        {
+            Mat4d invmat = mat.inverse(); // covariant vectors use the inverse transpose of mat
+            invmat = invmat.transpose();
+
+            if (vecType == VEC_COVARIANT_NORMALIZE) {
+                foreach(grid.beginValueAll(), MatMulNormalize(invmat));
+            } else {
+                foreach(grid.beginValueAll(), MatMul(invmat));
+            }
+            break;
+        }
+
+        case VEC_CONTRAVARIANT_RELATIVE:
+            foreach(grid.beginValueAll(), MatMul(mat)); // uses mat.transform3x3()
+            break;
+
+        case VEC_CONTRAVARIANT_ABSOLUTE:
+            foreach(grid.beginValueAll(), HomogeneousMatMul(mat)); // uses mat.transformH()
+            break;
+
+        case VEC_INVARIANT: // invariant vectors are not transformed
+            break;
+    }
+}
+
+
+template<typename GridType>
+inline void
+transformVectors(GridType& grid, const Mat4d& mat)
+{
+    doTransformVectors<GridType>(grid, mat); // overload selected by VecTraits<ValueType>::IsVec
+}
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_VECTORTRANSFORMER_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/VelocityFields.h b/nuparu/include/openvdb_new/tools/VelocityFields.h
new file mode 100644
index 00000000..7cb0539c
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/VelocityFields.h
@@ -0,0 +1,305 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @author Ken Museth
+///
+/// @file VelocityFields.h
+///
+/// @brief Defines two simple wrapper classes for advection velocity
+///        fields as well as VelocitySampler and VelocityIntegrator
+///
+///
+/// @details DiscreteField wraps a velocity grid and EnrightField is mostly
+///          intended for debugging (it's an analytical divergence free and
+///          periodic field). They both share the same API required by the
+///          LevelSetAdvection class defined in LevelSetAdvect.h. Thus, any
+///          class with this API should work with LevelSetAdvection.
+///
+/// @warning Note the Field wrapper classes below always assume the velocity
+///          is represented in the world-frame of reference. For DiscreteField
+///          this implies the input grid must contain velocities in world
+///          coordinates.
+
+#ifndef OPENVDB_TOOLS_VELOCITY_FIELDS_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_VELOCITY_FIELDS_HAS_BEEN_INCLUDED
+
+#include <tbb/parallel_reduce.h>
+#include <openvdb/Platform.h>
+#include "Interpolation.h" // for Sampler, etc.
+#include <openvdb/math/FiniteDifference.h>
+#include <boost/math/constants/constants.hpp>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+/// @brief Thin wrapper class for a velocity grid
+/// @note Consider replacing BoxSampler with StaggeredBoxSampler
+template <typename VelGridT, typename Interpolator = BoxSampler>
+class DiscreteField
+{
+public:
+    typedef typename VelGridT::ValueType     VectorType;
+    typedef typename VectorType::ValueType   ValueType;
+    BOOST_STATIC_ASSERT(boost::is_floating_point<ValueType>::value);
+
+    DiscreteField(const VelGridT &vel)
+        : mAccessor(vel.tree())
+        , mTransform(&vel.transform()) // pointer only: vel must outlive this object
+    {
+    }
+
+    /// @brief Copy constructor (creates a new accessor on the same tree)
+    DiscreteField(const DiscreteField& other)
+        : mAccessor(other.mAccessor.tree())
+        , mTransform(other.mTransform)
+    {
+    }
+
+    /// @return const reference to the transform between world and index space
+    /// @note Use this method to determine if a client grid is
+    /// aligned with the coordinate space of the velocity grid.
+    const math::Transform& transform() const { return *mTransform; }
+
+    /// @return the interpolated velocity at the world space position xyz
+    ///
+    /// @warning Not threadsafe since it uses a ValueAccessor! So use
+    /// one instance per thread (which is fine since it's lightweight).
+    inline VectorType operator() (const Vec3d& xyz, ValueType/*dummy time*/) const
+    {
+        return Interpolator::sample(mAccessor, mTransform->worldToIndex(xyz));
+    }
+
+    /// @return the velocity at the coordinate space position ijk
+    ///
+    /// @warning Not threadsafe since it uses a ValueAccessor! So use
+    /// one instance per thread (which is fine since it's lightweight).
+    inline VectorType operator() (const Coord& ijk, ValueType/*dummy time*/) const
+    {
+        return mAccessor.getValue(ijk);
+    }
+
+private:
+    const typename VelGridT::ConstAccessor mAccessor;//Not thread-safe
+    const math::Transform*                 mTransform;
+
+}; // end of DiscreteField
+
+///////////////////////////////////////////////////////////////////////
+
+/// @brief Analytical, divergence-free and periodic velocity field
+/// @note Primarily intended for debugging!
+/// @warning This analytical velocity only produces meaningful values
+/// in the unit box in world space. In other words make sure any level
+/// set surface is fully enclosed in the axis aligned bounding box
+/// spanning 0->1 in world units.
+template <typename ScalarT = float>
+class EnrightField
+{
+public:
+    typedef ScalarT             ValueType;
+    typedef math::Vec3<ScalarT> VectorType;
+    BOOST_STATIC_ASSERT(boost::is_floating_point<ScalarT>::value);
+
+    EnrightField() {}
+
+    /// @return the identity transform between world and index space (returned by value)
+    /// @note Use this method to determine if a client grid is
+    /// aligned with the coordinate space of this velocity field
+    math::Transform transform() const { return math::Transform(); }
+
+    /// @return the velocity in world units, evaluated at the world
+    /// position xyz and at the specified time
+    inline VectorType operator() (const Vec3d& xyz, ValueType time) const;
+
+    /// @return the velocity at the coordinate space position ijk
+    inline VectorType operator() (const Coord& ijk, ValueType time) const
+    {
+        return (*this)(ijk.asVec3d(), time); // ijk is passed on unchanged as the position xyz
+    }
+}; // end of EnrightField
+
+template <typename ScalarT>
+inline math::Vec3<ScalarT>
+EnrightField<ScalarT>::operator() (const Vec3d& xyz, ValueType time) const
+{
+    const ScalarT pi = boost::math::constants::pi<ScalarT>();
+    const ScalarT phase = pi / ScalarT(3.0); // gives tr below a period of 6 time units
+    const ScalarT Px =  pi * ScalarT(xyz[0]), Py = pi * ScalarT(xyz[1]), Pz = pi * ScalarT(xyz[2]);
+    const ScalarT tr =  cos(ScalarT(time) * phase); // time-dependent factor scaling all components
+    const ScalarT a  =  sin(ScalarT(2.0)*Py);
+    const ScalarT b  = -sin(ScalarT(2.0)*Px);
+    const ScalarT c  =  sin(ScalarT(2.0)*Pz);
+    return math::Vec3<ScalarT>(
+        tr * ( ScalarT(2) * math::Pow2(sin(Px)) * a * c ),
+        tr * ( b * math::Pow2(sin(Py)) * c ),
+        tr * ( b * a * math::Pow2(sin(Pz)) ));
+}
+
+
+///////////////////////////////////////////////////////////////////////
+
+/// Class to hold a Vec3 field interpreted as a velocity field.
+/// Primarily exists to provide methods that sample the velocity
+/// at a world-space position (VelocityIntegrator uses it for time integration).
+template<typename GridT = Vec3fGrid,
+         bool Staggered = false,
+         size_t Order = 1>
+class VelocitySampler
+{
+public:
+    typedef typename GridT::ConstAccessor AccessorType;
+    typedef typename GridT::ValueType     ValueType;
+
+    /// @brief Constructor from a grid
+    VelocitySampler(const GridT& grid):
+        mGrid(&grid), // pointer only: grid must outlive this sampler
+        mAcc(grid.getAccessor())
+    {
+    }
+    /// @brief Copy-constructor (creates a new accessor on the same grid)
+    VelocitySampler(const VelocitySampler& other):
+        mGrid(other.mGrid),
+        mAcc(mGrid->getAccessor())
+    {
+    }
+    /// @brief Samples the velocity at world position onto result. Supports both
+    /// staggered (i.e. MAC) and collocated velocity grids.
+    ///
+    /// @return @c true if any one of the sampled values is active.
+    ///
+    /// @warning Not threadsafe since it uses a ValueAccessor! So use
+    /// one instance per thread (which is fine since it's lightweight).
+    template <typename LocationType>
+    inline bool sample(const LocationType& world, ValueType& result) const
+    {
+        const Vec3R xyz = mGrid->worldToIndex(Vec3R(world[0], world[1], world[2]));
+        bool active = Sampler<Order, Staggered>::sample(mAcc, xyz, result);
+        return active;
+    }
+
+    /// @brief Samples the velocity at world position and returns it. Supports both
+    /// staggered (i.e. MAC) and co-located velocity grids.
+    ///
+    /// @warning Not threadsafe since it uses a ValueAccessor! So use
+    /// one instance per thread (which is fine since it's lightweight).
+    template <typename LocationType>
+    inline ValueType sample(const LocationType& world) const
+    {
+        const Vec3R xyz = mGrid->worldToIndex(Vec3R(world[0], world[1], world[2]));
+        return Sampler<Order, Staggered>::sample(mAcc, xyz);
+    }
+
+private:
+    // holding the Grids for the transforms
+    const GridT* mGrid; // Velocity vector field
+    AccessorType mAcc;
+};// end of VelocitySampler class
+
+///////////////////////////////////////////////////////////////////////
+
+/// @brief Performs Runge-Kutta time integration of variable order in
+/// a static velocity field.
+///
+/// @note Note that the order of the velocity sampling is controlled
+/// with the SampleOrder template parameter, which defaults
+/// to one, i.e. a tri-linear interpolation kernel.
+template<typename GridT = Vec3fGrid,
+         bool Staggered = false,
+         size_t SampleOrder = 1>
+class VelocityIntegrator
+{
+public:
+    typedef typename GridT::ValueType  VecType;
+    typedef typename VecType::ValueType ElementType;
+
+    VelocityIntegrator(const GridT& velGrid):
+        mVelSampler(velGrid)
+    {
+    }
+    /// @brief Variable order Runge-Kutta time integration for a single time step
+    ///
+    /// @param dt     Time sub-step for the Runge-Kutta integrator of order OrderRK
+    /// @param world  Location in world space coordinates (both input and output)
+    template<size_t OrderRK, typename LocationType>
+    inline void rungeKutta(const ElementType dt, LocationType& world) const
+    {
+        BOOST_STATIC_ASSERT(OrderRK <= 4);
+        VecType P(static_cast<ElementType>(world[0]),
+                  static_cast<ElementType>(world[1]),
+                  static_cast<ElementType>(world[2]));
+        // Note the if-branching below is optimized away at compile time
+        if (OrderRK == 0) {
+            return;// do nothing
+        } else if (OrderRK == 1) {
+            VecType V0;
+            mVelSampler.sample(P, V0);
+            P =  dt * V0; // in every branch P ends up holding the displacement, not the position
+        } else if (OrderRK == 2) {
+            VecType V0, V1;
+            mVelSampler.sample(P, V0);
+            mVelSampler.sample(P + ElementType(0.5) * dt * V0, V1);
+            P = dt * V1;
+        } else if (OrderRK == 3) {
+            VecType V0, V1, V2;
+            mVelSampler.sample(P, V0);
+            mVelSampler.sample(P + ElementType(0.5) * dt * V0, V1);
+            mVelSampler.sample(P + dt * (ElementType(2.0) * V1 - V0), V2);
+            P = dt * (V0 + ElementType(4.0) * V1 + V2) * ElementType(1.0 / 6.0);
+        } else if (OrderRK == 4) {
+            VecType V0, V1, V2, V3;
+            mVelSampler.sample(P, V0);
+            mVelSampler.sample(P + ElementType(0.5) * dt * V0, V1);
+            mVelSampler.sample(P + ElementType(0.5) * dt * V1, V2);
+            mVelSampler.sample(P + dt * V2, V3);
+            P = dt * (V0 + ElementType(2.0) * (V1 + V2) + V3) * ElementType(1.0 / 6.0);
+        }
+        typedef typename LocationType::ValueType OutType;
+        world += LocationType(static_cast<OutType>(P[0]), // add the displacement to the input position
+                              static_cast<OutType>(P[1]),
+                              static_cast<OutType>(P[2]));
+    }
+private:
+    VelocitySampler<GridT, Staggered, SampleOrder> mVelSampler;
+};// end of VelocityIntegrator class
+
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_VELOCITY_FIELDS_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/VolumeAdvect.h b/nuparu/include/openvdb_new/tools/VolumeAdvect.h
new file mode 100644
index 00000000..b29d3128
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/VolumeAdvect.h
@@ -0,0 +1,571 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @author Ken Museth
+///
+/// @file VolumeAdvect.h
+///
+/// @brief Sparse hyperbolic advection of volumes, e.g. a density or
+///        velocity (vs a level set interface).
+
+#ifndef OPENVDB_TOOLS_VOLUME_ADVECT_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_VOLUME_ADVECT_HAS_BEEN_INCLUDED
+
+#include <tbb/parallel_for.h>
+#include <boost/bind.hpp>
+#include <boost/function.hpp>
+#include <openvdb/Types.h>
+#include <openvdb/math/Math.h>
+#include <openvdb/util/NullInterrupter.h>
+#include "Interpolation.h"// for Sampler
+#include "VelocityFields.h" // for VelocityIntegrator
+#include "Morphology.h"//for dilateActiveValues and dilateVoxels
+#include "Prune.h"// for prune
+#include "Statistics.h" // for extrema
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {      
+
+   
+namespace Scheme {
+    /// @brief Numerical advection schemes.
+    enum SemiLagrangian { SEMI, MID, RK3, RK4, MAC, BFECC };
+    /// @brief Flux-limiters employed to stabilize the second-order
+    /// advection schemes MacCormack and BFECC.
+    enum Limiter { NO_LIMITER, CLAMP, REVERT }; 
+}
+    
+/// @brief Performs advections of an arbitrary type of volume in a
+///        static velocity field. The advections are performed by means
+///        of various derivatives of Semi-Lagrangian integration, i.e.
+///        backwards tracking along the hyperbolic characteristics
+///        followed by interpolation.     
+///
+/// @note  Optionally a limiter can be combined with the higher-order
+///        integration schemes MacCormack and BFECC. There are two
+///        types of limiters (CLAMP and REVERT) that suppress
+///        non-physical oscillations by means of either clamping or
+///        reverting to a first-order scheme when the function is not
+///        bounded by the cell values used for tri-linear interpolation. 
+///    
+/// @verbatim The supported integrations schemes:
+///    
+///    ================================================================
+///    |  Label | Accuracy |  Integration Scheme   |  Interpolations  |
+///    |        |Time/Space|                       |  velocity/volume |
+///    ================================================================    
+///    |  SEMI  |   1/1    | Semi-Lagrangian       |        1/1       | 
+///    |  MID   |   2/1    | Mid-Point             |        2/1       |
+///    |  RK3   |   3/1    | 3rd Order Runge-Kutta |        3/1       |
+///    |  RK4   |   4/1    | 4th Order Runge-Kutta |        4/1       |
+///    |  MAC   |   2/2    | MacCormack            |        2/2       |
+///    |  BFECC |   2/2    | BFECC                 |        3/2       |           
+///    ================================================================
+/// @endverbatim
+    
+template<typename VelocityGridT = Vec3fGrid,
+         bool StaggeredVelocity = false,
+         typename InterrupterType = util::NullInterrupter>
+class VolumeAdvection
+{
+public:
+    
+    /// @brief Constructor
+    ///
+    /// @param velGrid     Velocity grid responsible for the (passive) advection.
+    /// @param interrupter Optional interrupter used to prematurely end computations.
+    ///
+    /// @note The velocity field is assumed to be constant for the duration of the
+    ///       advection.
+    VolumeAdvection(const VelocityGridT& velGrid, InterrupterType* interrupter = NULL)
+        : mVelGrid(velGrid)
+        , mInterrupter(interrupter)
+        , mIntegrator( Scheme::SEMI )
+        , mLimiter( Scheme::CLAMP )
+        , mGrainSize( 128 )
+        , mSubSteps( 1 )
+    {
+        math::Extrema e = extrema(velGrid.cbeginValueAll(), /*threading*/true);
+        e.add(velGrid.background().length());
+        mMaxVelocity = e.max();
+    }
+
+    virtual ~VolumeAdvection()
+    {
+    }
+
+    /// @brief Return the spatial order of accuracy of the advection scheme
+    ///
+    /// @note This is the optimal order in smooth regions. In
+    /// non-smooth regions the flux-limiter will drop the order of
+    /// accuracy to add numerical dissipation. 
+    int spatialOrder() const { return (mIntegrator == Scheme::MAC ||
+                                       mIntegrator == Scheme::BFECC) ? 2 : 1; }
+
+    /// @brief Return the temporal order of accuracy of the advection scheme
+    ///
+    /// @note This is the optimal order in smooth regions. In
+    /// non-smooth regions the flux-limiter will drop the order of
+    /// accuracy to add numerical dissipation.
+    int temporalOrder() const {
+        switch (mIntegrator) {
+        case Scheme::SEMI: return 1;
+        case Scheme::MID:  return 2;
+        case Scheme::RK3:  return 3;
+        case Scheme::RK4:  return 4;
+        case Scheme::BFECC:return 2;
+        case Scheme::MAC:  return 2;
+        }
+        return 0;//should never reach this point
+    }
+
+    /// @brief Set the integrator (see details in the table above)
+    void setIntegrator(Scheme::SemiLagrangian integrator) { mIntegrator = integrator; }
+
+    /// @brief Return the integrator (see details in the table above)
+    Scheme::SemiLagrangian getIntegrator() const { return mIntegrator; }
+
+    /// @brief Set the limiter (see details above)
+    void setLimiter(Scheme::Limiter limiter) { mLimiter = limiter; }
+
+    /// @brief Return the limiter (see details above)
+    Scheme::Limiter getLimiter() const { return mLimiter; }
+
+    /// @brief Return @c true if a limiter will be applied based on
+    /// the current settings.
+    bool isLimiterOn() const { return this->spatialOrder()>1 &&
+                                      mLimiter != Scheme::NO_LIMITER; }
+    
+    /// @brief Return the grain-size used for multi-threading
+    /// @note A grainsize of 0 implies serial execution
+    size_t getGrainSize() const { return mGrainSize; }
+
+    /// @brief Set the grain-size used for multi-threading
+    /// @note A grainsize of 0 disables multi-threading
+    /// @warning A small grainsize can degrade performance,
+    ///          both in terms of time and memory footprint!
+    void setGrainSize(size_t grainsize) { mGrainSize = grainsize; }
+
+    /// @brief Return the number of sub-steps per integration (always larger
+    /// than or equal to 1).
+    int getSubSteps() const { return mSubSteps; }
+
+    /// @brief Set the number of sub-steps that advect() splits its time-step into.
+    /// @note The only reason to increase the sub-step above its
+    ///       default value of one is to reduce the memory footprint
+    ///       due to significant dilation. Values smaller than 1 will
+    ///       be clamped to 1!
+    void setSubSteps(int substeps) { mSubSteps = math::Max(1, substeps); }
+
+    /// @brief Return the maximum magnitude of the velocity in the
+    /// advection velocity field defined during construction.
+    double getMaxVelocity() const { return mMaxVelocity; }
+
+    /// @return Returns the maximum distance in voxel units of @a inGrid
+    /// that a particle can travel in the time-step @a dt when advected
+    /// in the velocity field defined during construction.
+    ///
+    /// @details This method is useful when dilating sparse volume
+    /// grids to pad boundary regions. Excessive dilation can be
+    /// computationally expensive so use this method to prevent
+    /// or warn against run-away computation.
+    ///
+    /// @throw RuntimeError if @a inGrid does not have uniform voxels.
+    template<typename VolumeGridT>
+    int getMaxDistance(const VolumeGridT& inGrid, double dt) const
+    {
+        const bool uniform = inGrid.hasUniformVoxels();
+        if (!uniform) { OPENVDB_THROW(RuntimeError, "Volume grid does not have uniform voxels!"); }
+        // With uniform voxels the first component of the voxel size stands for all axes.
+        const double voxelSize = inGrid.voxelSize()[0];
+        return static_cast<int>( math::RoundUp(mMaxVelocity*math::Abs(dt)/voxelSize) );
+    }
+
+    /// @return Returns a new grid that is the result of passive advection
+    ///         of all the active values in the input grid by @a timeStep.
+    ///
+    /// @param inGrid   The input grid to be advected (unmodified)
+    /// @param timeStep Total time-step of the advection; split evenly among the sub-steps.
+    ///
+    /// @details This method will advect all of the active values in
+    ///          the input @a inGrid. To achieve this a
+    ///          deep-copy is dilated to account for the material
+    ///          transport. This dilation step can be slow for large
+    ///          time steps @a timeStep or a velocity field with large magnitudes.
+    ///
+    /// @warning If the VolumeSamplerT is of higher order than one
+    ///          (i.e. tri-linear interpolation) instabilities are
+    ///          known to occur. To suppress those, monotonicity
+    ///          constraints or flux-limiters need to be applied.
+    ///
+    /// @throw RuntimeError if @a inGrid does not have uniform voxels.
+    template<typename VolumeGridT,
+             typename VolumeSamplerT>//only C++11 allows for a default argument
+    typename VolumeGridT::Ptr advect(const VolumeGridT& inGrid, double timeStep)
+    {
+        typename VolumeGridT::Ptr outGrid = inGrid.deepCopy();
+        const double dt = timeStep/mSubSteps;// time-step of a single sub-step
+        const int n = this->getMaxDistance(inGrid, dt);// max travel distance (in voxels) per sub-step
+        dilateActiveValues( outGrid->tree(), n, NN_FACE, EXPAND_TILES);
+        this->template cook<VolumeGridT, VolumeSamplerT>(*outGrid, inGrid, dt);
+        for (int step = 1; step < mSubSteps; ++step) {// remaining sub-steps start from the previous result
+            typename VolumeGridT::Ptr tmpGrid = outGrid->deepCopy();
+            dilateActiveValues( tmpGrid->tree(), n, NN_FACE, EXPAND_TILES);
+            this->template cook<VolumeGridT, VolumeSamplerT>(*tmpGrid, *outGrid, dt);
+            outGrid.swap( tmpGrid );// tmpGrid now holds the previous result, released at scope exit
+        }
+        
+        return outGrid;
+    }
+
+    /// @return Returns a new grid that is the result of
+    ///         passive advection of the active values in @a inGrid
+    ///         that intersect the active values in @c mask. The time
+    ///         of the output grid is incremented by @a timeStep.
+    ///
+    /// @param inGrid   The input grid to be advected (unmodified).
+    /// @param mask     The mask of active values defining the active voxels
+    ///                 in @c inGrid on which to perform advection. Only
+    ///                 if a value is active in both grids will it be modified.
+    /// @param timeStep Total time-step of the advection; split evenly among the sub-steps.
+    ///
+    ///
+    /// @details This method will advect all of the active values in
+    ///          the input @a inGrid that intersect with the
+    ///          active values in @a mask. To achieve this a
+    ///          deep-copy is dilated to account for the material
+    ///          transport and finally cropped to the intersection
+    ///          with @a mask. The dilation step can be slow for large
+    ///          time steps @a timeStep or fast moving velocity fields.
+    ///
+    /// @warning If the VolumeSamplerT is of higher order than one
+    ///          (i.e. tri-linear interpolation) instabilities are
+    ///          known to occur. To suppress those, monotonicity
+    ///          constraints or flux-limiters need to be applied.
+    ///
+    /// @throw RuntimeError if @a inGrid is not aligned with @a mask
+    ///        or if its voxels are not uniform.
+    template<typename VolumeGridT,
+             typename MaskGridT,
+             typename VolumeSamplerT>//only C++11 allows for a default argument
+    typename VolumeGridT::Ptr advect(const VolumeGridT& inGrid, const MaskGridT& mask, double timeStep)
+    {
+        if (inGrid.transform() != mask.transform()) {
+            OPENVDB_THROW(RuntimeError, "Volume grid and mask grid are misaligned! Consider "
+                          "resampling either of the two grids into the index space of the other.");
+        }
+        typename VolumeGridT::Ptr outGrid = inGrid.deepCopy();
+        const double dt = timeStep/mSubSteps;// time-step of a single sub-step
+        const int n = this->getMaxDistance(inGrid, dt);// max travel distance (in voxels) per sub-step
+        dilateActiveValues( outGrid->tree(), n, NN_FACE, EXPAND_TILES);
+        outGrid->topologyIntersection( mask );// crop the dilated topology to the mask
+        pruneInactive( outGrid->tree(), mGrainSize>0, mGrainSize );
+        this->template cook<VolumeGridT, VolumeSamplerT>(*outGrid, inGrid, dt);
+        outGrid->topologyUnion( inGrid );// re-add the topology of the input grid
+
+        for (int step = 1; step < mSubSteps; ++step) {// remaining sub-steps start from the previous result
+            typename VolumeGridT::Ptr tmpGrid = outGrid->deepCopy();
+            dilateActiveValues( tmpGrid->tree(), n, NN_FACE, EXPAND_TILES);
+            tmpGrid->topologyIntersection( mask );
+            pruneInactive( tmpGrid->tree(), mGrainSize>0, mGrainSize );
+            this->template cook<VolumeGridT, VolumeSamplerT>(*tmpGrid, *outGrid, dt);
+            tmpGrid->topologyUnion( inGrid );
+            outGrid.swap( tmpGrid );// tmpGrid now holds the previous result, released at scope exit
+        }
+        return outGrid;
+    }
+
+private:
+    // disallow copy construction and copy by assignment!
+    VolumeAdvection(const VolumeAdvection&);// not implemented
+    VolumeAdvection& operator=(const VolumeAdvection&);// not implemented
+
+    void start(const char* str) const// forwards str to the interrupter, if one was supplied
+    {
+        if (mInterrupter) mInterrupter->start(str);
+    }
+    void stop() const// calls end() on the interrupter, if one was supplied
+    {
+        if (mInterrupter) mInterrupter->end();
+    }      
+    bool interrupt() const
+    {
+        // Poll the optional interrupter; short-circuits when none was supplied.
+        const bool halted = mInterrupter && util::wasInterrupted(mInterrupter);
+        // On interruption cancel the task group of the calling TBB task.
+        if (halted) tbb::task::self().cancel_group_execution();
+        return halted;
+    }
+    
+    /// @brief Advect @a inGrid by @a dt and write the result into @a outGrid.
+    /// @note  The callers dilate the topology of @a outGrid before this call;
+    ///        pruneInactive() is applied to @a outGrid after the advection.
+    /// @throw ValueError if the current integrator is not a supported scheme.
+    template<typename VolumeGridT, typename VolumeSamplerT>
+    void cook(VolumeGridT& outGrid, const VolumeGridT& inGrid, double dt)
+    {
+        // Map the run-time integrator onto the compile-time Runge-Kutta order
+        // of the private Advect class. BFECC and MAC are instantiated with
+        // the same first-order template argument as SEMI.
+        switch (mIntegrator) {
+        case Scheme::SEMI:
+        case Scheme::BFECC:
+        case Scheme::MAC: {
+            Advect<VolumeGridT, 1, VolumeSamplerT> firstOrder(inGrid, *this);
+            firstOrder.cook(outGrid, dt);
+            break;
+        }
+        case Scheme::MID: {
+            Advect<VolumeGridT, 2, VolumeSamplerT> secondOrder(inGrid, *this);
+            secondOrder.cook(outGrid, dt);
+            break;
+        }
+        case Scheme::RK3: {
+            Advect<VolumeGridT, 3, VolumeSamplerT> thirdOrder(inGrid, *this);
+            thirdOrder.cook(outGrid, dt);
+            break;
+        }
+        case Scheme::RK4: {
+            Advect<VolumeGridT, 4, VolumeSamplerT> fourthOrder(inGrid, *this);
+            fourthOrder.cook(outGrid, dt);
+            break;
+        }
+        default:
+            OPENVDB_THROW(ValueError, "Spatial difference scheme not supported!");
+        }
+        const bool threaded = mGrainSize > 0;
+        pruneInactive(outGrid.tree(), threaded, mGrainSize);
+    }
+
+    // Private class that implements the multi-threaded advection
+    template<typename VolumeGridT, size_t OrderRK, typename SamplerT> struct Advect;
+
+    // Private member data of VolumeAdvection
+    const VelocityGridT&   mVelGrid;// velocity field, held by reference
+    double                 mMaxVelocity;// used by getMaxDistance() to bound the travel distance
+    InterrupterType*       mInterrupter;// optional; tested for NULL before every use
+    Scheme::SemiLagrangian mIntegrator;
+    Scheme::Limiter        mLimiter;
+    size_t                 mGrainSize;// 0 disables multi-threading
+    int                    mSubSteps;// number of sub-steps a time-step is split into
+};//end of VolumeAdvection class
+    
+// Private class that implements the multi-threaded advection (OrderRK is the order of the Runge-Kutta back-tracing)
+template<typename VelocityGridT, bool StaggeredVelocity, typename InterrupterType>
+template<typename VolumeGridT, size_t OrderRK, typename SamplerT>
+struct VolumeAdvection<VelocityGridT, StaggeredVelocity, InterrupterType>::Advect
+{
+    typedef typename VolumeGridT::TreeType      TreeT;
+    typedef typename VolumeGridT::ConstAccessor AccT;
+    typedef typename TreeT::ValueType            ValueT;
+    typedef typename tree::LeafManager<TreeT>    LeafManagerT;
+    typedef typename LeafManagerT::LeafNodeType  LeafNodeT;
+    typedef typename LeafManagerT::LeafRange     LeafRangeT;
+    typedef VelocityIntegrator<VelocityGridT, StaggeredVelocity> VelocityIntegratorT;
+    typedef typename VelocityIntegratorT::ElementType RealT;
+    typedef typename TreeT::LeafNodeType::ValueOnIter VoxelIterT;
+    
+    Advect(const VolumeGridT& inGrid, const VolumeAdvection& parent)
+        : mTask(0)// no task bound yet; cook() binds one before each pass
+        , mInGrid(&inGrid)
+        , mVelocityInt(parent.mVelGrid)
+        , mParent(&parent)
+    {
+    }
+    inline void cook(const LeafRangeT& range)// run the bound task over range, threaded iff grain size > 0
+    {
+        if (mParent->mGrainSize > 0) {
+            tbb::parallel_for(range, *this);
+        } else {
+            (*this)(range);
+        }
+    }
+    void operator()(const LeafRangeT& range) const// body invoked (possibly by tbb::parallel_for) for a sub-range
+    {
+        assert(mTask);
+        mTask(const_cast<Advect*>(this), range);// mTask takes a non-const Advect*, hence the cast
+    }
+    void cook(VolumeGridT& outGrid, double time_step)// advect mInGrid into outGrid using the parent's scheme
+    {
+        mParent->start("Advecting volume");
+        LeafManagerT manager(outGrid.tree(), mParent->spatialOrder()==2 ? 1 : 0);// one aux buffer for BFECC/MAC
+        const LeafRangeT range = manager.leafRange(mParent->mGrainSize);
+        const RealT dt = static_cast<RealT>(-time_step);//method of characteristics backtracks
+        if (mParent->mIntegrator == Scheme::MAC) {
+            mTask = boost::bind(&Advect::rk,  _1, _2, dt, 0, mInGrid);//out[0]=forward
+            this->cook(range);
+            mTask = boost::bind(&Advect::rk,  _1, _2,-dt, 1, &outGrid);//out[1]=backward
+            this->cook(range);
+            mTask = boost::bind(&Advect::mac, _1, _2);//out[0] = out[0] + (in[0] - out[1])/2
+            this->cook(range);
+        } else if (mParent->mIntegrator == Scheme::BFECC) {
+            mTask = boost::bind(&Advect::rk, _1, _2, dt, 0, mInGrid);//out[0]=forward
+            this->cook(range);
+            mTask = boost::bind(&Advect::rk, _1, _2,-dt, 1, &outGrid);//out[1]=backward
+            this->cook(range);
+            mTask = boost::bind(&Advect::bfecc, _1, _2);//out[0] = (3*in[0] - out[1])/2
+            this->cook(range);
+            mTask = boost::bind(&Advect::rk, _1, _2, dt, 1, &outGrid);//out[1]=forward
+            this->cook(range);
+            manager.swapLeafBuffer(1);// out[0] = out[1]
+        } else {// SEMI, MID, RK3 and RK4
+            mTask = boost::bind(&Advect::rk, _1, _2,  dt, 0, mInGrid);//forward
+            this->cook(range);
+        }
+
+        if (mParent->spatialOrder()==2) manager.removeAuxBuffers();// aux buffer is no longer needed
+        
+        mTask = boost::bind(&Advect::limiter, _1, _2, dt);// out[0] = limiter( out[0] )
+        this->cook(range);
+        
+        mParent->stop();
+    }
+    // Last step of the MacCormack scheme: out[0] = out[0] + (in[0] - out[1])/2
+    void mac(const LeafRangeT& range) const
+    {
+        if (mParent->interrupt()) return;
+        assert( mParent->mIntegrator == Scheme::MAC );
+        AccT acc = mInGrid->getAccessor();
+        for (typename LeafRangeT::Iterator leafIter = range.begin(); leafIter; ++leafIter) {
+            ValueT* out0 = leafIter.buffer( 0 ).data();// forward
+            const ValueT* out1 = leafIter.buffer( 1 ).data();// backward
+            const LeafNodeT* leaf = acc.probeConstLeaf( leafIter->origin() );
+            if (leaf !=NULL) {// input has a leaf at the same origin: read its buffer directly
+                const ValueT* in0 = leaf->buffer().data();
+                for (VoxelIterT voxelIter = leafIter->beginValueOn(); voxelIter; ++voxelIter) {
+                    const Index i = voxelIter.pos();
+                    out0[i] += RealT(0.5) * ( in0[i] - out1[i] );
+                }//loop over active voxels
+            } else {// no matching input leaf: look up each value through the accessor
+                for (VoxelIterT voxelIter = leafIter->beginValueOn(); voxelIter; ++voxelIter) {
+                    const Index i = voxelIter.pos();
+                    out0[i] += RealT(0.5) * ( acc.getValue(voxelIter.getCoord()) - out1[i] );
+                }//loop over active voxels
+            }
+        }//loop over leaf nodes
+    }
+    // Intermediate step in the BFECC scheme: out[0] = (3*in[0] - out[1])/2
+    void bfecc(const LeafRangeT& range) const
+    {
+        if (mParent->interrupt()) return;
+        assert( mParent->mIntegrator == Scheme::BFECC );
+        AccT acc = mInGrid->getAccessor();
+        for (typename LeafRangeT::Iterator leafIter = range.begin(); leafIter; ++leafIter) {
+            ValueT* out0 = leafIter.buffer( 0 ).data();// forward
+            const ValueT* out1 = leafIter.buffer( 1 ).data();// backward
+            const LeafNodeT* leaf = acc.probeConstLeaf(leafIter->origin());
+            if (leaf !=NULL) {// input has a leaf at the same origin: read its buffer directly
+                const ValueT* in0 = leaf->buffer().data();
+                for (VoxelIterT voxelIter = leafIter->beginValueOn(); voxelIter; ++voxelIter) {
+                    const Index i = voxelIter.pos();
+                    out0[i] = RealT(0.5)*( RealT(3)*in0[i] - out1[i] );
+                }//loop over active voxels
+            } else {// no matching input leaf: look up each value through the accessor
+                for (VoxelIterT voxelIter = leafIter->beginValueOn(); voxelIter; ++voxelIter) {
+                    const Index i = voxelIter.pos();
+                    out0[i] = RealT(0.5)*( RealT(3)*acc.getValue(voxelIter.getCoord()) - out1[i] );
+                }//loop over active voxels
+            }
+        }//loop over leaf nodes
+    }
+    // Semi-Lagrangian integration with Runge-Kutta of various orders (1->4); samples grid into leaf buffer n
+    void rk(const LeafRangeT& range, RealT dt, size_t n, const VolumeGridT* grid) const
+    {
+        if (mParent->interrupt()) return;
+        const math::Transform& xform = mInGrid->transform();
+        AccT acc = grid->getAccessor();
+        for (typename LeafRangeT::Iterator leafIter = range.begin(); leafIter; ++leafIter) {
+            ValueT* phi = leafIter.buffer( n ).data();
+            for (VoxelIterT voxelIter = leafIter->beginValueOn(); voxelIter; ++voxelIter) {
+                ValueT& value = phi[voxelIter.pos()];
+                Vec3d wPos = xform.indexToWorld(voxelIter.getCoord());
+                mVelocityInt.template rungeKutta<OrderRK, Vec3d>(dt, wPos);// moves wPos in place
+                value = SamplerT::sample(acc, xform.worldToIndex(wPos));
+            }//loop over active voxels
+        }//loop over leaf nodes
+    }
+    void limiter(const LeafRangeT& range, RealT dt) const// optionally limit out[0], then deactivate background voxels
+    {
+        if (mParent->interrupt()) return;
+        const bool doLimiter = mParent->isLimiterOn();
+        const bool doClamp = mParent->mLimiter == Scheme::CLAMP;
+        ValueT data[2][2][2], vMin, vMax;
+        const math::Transform& xform = mInGrid->transform();
+        AccT acc = mInGrid->getAccessor();
+        const ValueT backg = mInGrid->background();
+        for (typename LeafRangeT::Iterator leafIter = range.begin(); leafIter; ++leafIter) {
+            ValueT* phi = leafIter.buffer( 0 ).data();
+            for (VoxelIterT voxelIter = leafIter->beginValueOn(); voxelIter; ++voxelIter) {
+                ValueT& value = phi[voxelIter.pos()];
+
+                if ( doLimiter ) {
+                    assert(OrderRK == 1);// BFECC and MAC are instantiated with OrderRK == 1
+                    Vec3d wPos = xform.indexToWorld(voxelIter.getCoord());
+                    mVelocityInt.template rungeKutta<1, Vec3d>(dt, wPos);// Explicit Euler
+                    Vec3d iPos = xform.worldToIndex(wPos);
+                    Coord ijk  = Coord::floor( iPos );
+                    BoxSampler::getValues(data, acc, ijk);
+                    BoxSampler::extrema(data, vMin, vMax);
+                    if ( doClamp ) {
+                        value = math::Clamp( value, vMin, vMax);
+                    } else if (value < vMin || value > vMax ) {// out of range: fall back to tri-linear interpolation
+                        iPos -= Vec3R(ijk[0], ijk[1], ijk[2]);//unit coordinates
+                        value = BoxSampler::trilinearInterpolation( data, iPos );
+                    }
+                }
+                
+                if (math::isApproxEqual(value, backg, math::Delta<ValueT>::value())) {
+                    value = backg;
+                    leafIter->setValueOff( voxelIter.pos() );
+                }
+            }//loop over active voxels
+        }//loop over leaf nodes
+    }
+    // Public member data of the private Advect class
+    
+    typename boost::function<void (Advect*, const LeafRangeT&)> mTask;// task executed by operator()
+    const VolumeGridT*        mInGrid;
+    const VelocityIntegratorT mVelocityInt;// lightweight!
+    const VolumeAdvection*    mParent;
+};// end of private member class Advect    
+    
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_VOLUME_ADVECT_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/VolumeToMesh.h b/nuparu/include/openvdb_new/tools/VolumeToMesh.h
new file mode 100644
index 00000000..f162941e
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/VolumeToMesh.h
@@ -0,0 +1,4690 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_TOOLS_VOLUME_TO_MESH_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_VOLUME_TO_MESH_HAS_BEEN_INCLUDED
+
+#include <openvdb/Platform.h> // for OPENVDB_HAS_CXX11
+#include <openvdb/tree/ValueAccessor.h>
+#include <openvdb/util/Util.h> // for COORD_OFFSETS
+#include <openvdb/math/Operators.h> // for ISGradient
+#include <openvdb/tools/Morphology.h> // for dilateVoxels()
+#include <openvdb/tree/LeafManager.h>
+#include "Prune.h" // for pruneInactive
+
+#include <boost/scoped_array.hpp>
+#include <boost/scoped_ptr.hpp>
+#include <boost/type_traits/is_scalar.hpp>
+#include <boost/utility/enable_if.hpp>
+#include <tbb/blocked_range.h>
+#include <tbb/parallel_for.h>
+#include <tbb/parallel_reduce.h>
+
+#include <vector>
+#include <memory> // for auto_ptr/unique_ptr
+
+
+//////////
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+
+////////////////////////////////////////
+
+
+// Wrapper functions for the VolumeToMesh converter
+
+
+/// @brief Uniformly mesh any scalar grid that has a continuous isosurface.
+///
+/// @param grid     a scalar grid to mesh
+/// @param points   output list of world space points
+/// @param quads    output quad index list
+/// @param isovalue determines which isosurface to mesh
+///
+/// @throw TypeError if @a grid does not have a scalar value type
+template<typename GridType>
+inline void
+volumeToMesh(
+    const GridType& grid,
+    std::vector<Vec3s>& points,
+    std::vector<Vec4I>& quads,
+    double isovalue = 0.0);
+
+
+/// @brief Adaptively mesh any scalar grid that has a continuous isosurface.
+///
+/// @param grid         a scalar grid to mesh
+/// @param points       output list of world space points
+/// @param triangles    output triangle index list
+/// @param quads        output quad index list
+/// @param isovalue     determines which isosurface to mesh
+/// @param adaptivity   surface adaptivity threshold [0 to 1]
+///
+/// @throw TypeError if @a grid does not have a scalar value type
+template<typename GridType>
+inline void
+volumeToMesh(
+    const GridType& grid,
+    std::vector<Vec3s>& points,
+    std::vector<Vec3I>& triangles,
+    std::vector<Vec4I>& quads,
+    double isovalue = 0.0,
+    double adaptivity = 0.0);
+
+
+////////////////////////////////////////
+
+
+/// @brief Polygon flags, used for reference based meshing.
+enum { POLYFLAG_EXTERIOR = 0x1, POLYFLAG_FRACTURE_SEAM = 0x2,  POLYFLAG_SUBDIVIDED = 0x4};
+
+
+/// @brief Collection of quads and triangles, each with a per-polygon flag byte
+class PolygonPool
+{
+public:
+
+    inline PolygonPool();
+    inline PolygonPool(const size_t numQuads, const size_t numTriangles);
+
+
+    inline void copy(const PolygonPool& rhs);
+
+    inline void resetQuads(size_t size);
+    inline void clearQuads();
+
+    inline void resetTriangles(size_t size);
+    inline void clearTriangles();
+
+
+    // polygon accessor methods (the index n is not range-checked here)
+
+    const size_t& numQuads() const                      { return mNumQuads; }
+
+    openvdb::Vec4I& quad(size_t n)                      { return mQuads[n]; }
+    const openvdb::Vec4I& quad(size_t n) const          { return mQuads[n]; }
+
+
+    const size_t& numTriangles() const                  { return mNumTriangles; }
+
+    openvdb::Vec3I& triangle(size_t n)                  { return mTriangles[n]; }
+    const openvdb::Vec3I& triangle(size_t n) const      { return mTriangles[n]; }
+
+
+    // polygon flags accessor methods (presumably POLYFLAG_* bits -- TODO confirm)
+
+    char& quadFlags(size_t n)                           { return mQuadFlags[n]; }
+    const char& quadFlags(size_t n) const               { return mQuadFlags[n]; }
+
+    char& triangleFlags(size_t n)                       { return mTriangleFlags[n]; }
+    const char& triangleFlags(size_t n) const           { return mTriangleFlags[n]; }
+
+
+    // reduce the polygon containers, n has to
+    // be smaller than the current container size.
+
+    inline bool trimQuads(const size_t n, bool reallocate = false);
+    inline bool trimTrinagles(const size_t n, bool reallocate = false);// NOTE(review): misspelled, but part of the public interface
+
+private:
+    // disallow copy by assignment (see copy() for explicit copying)
+    void operator=(const PolygonPool&) {}
+
+    size_t mNumQuads, mNumTriangles;
+    boost::scoped_array<openvdb::Vec4I> mQuads;
+    boost::scoped_array<openvdb::Vec3I> mTriangles;
+    boost::scoped_array<char> mQuadFlags, mTriangleFlags;
+};
+
+
+/// @{
+/// @brief Point and primitive list types.
+typedef boost::scoped_array<openvdb::Vec3s> PointList;
+typedef boost::scoped_array<PolygonPool> PolygonPoolList;
+/// @}
+
+
+////////////////////////////////////////
+
+
+/// @brief Mesh any scalar grid that has a continuous isosurface.
+class VolumeToMesh
+{
+public:
+
+    /// @param isovalue         Determines which isosurface to mesh.
+    /// @param adaptivity       Adaptivity threshold [0 to 1]
+    VolumeToMesh(double isovalue = 0, double adaptivity = 0);
+
+
+    //////////
+
+    // Mesh data accessors
+
+    const size_t& pointListSize() const;
+    PointList& pointList();
+
+    const size_t& polygonPoolListSize() const;
+    PolygonPoolList& polygonPoolList();
+    const PolygonPoolList& polygonPoolList() const;
+
+    std::vector<unsigned char>& pointFlags();
+    const std::vector<unsigned char>& pointFlags() const;
+
+
+    //////////
+
+
+    /// @brief Main call
+    /// @note Call with scalar typed grid.
+    template<typename GridT>
+    void operator()(const GridT&);
+
+
+    //////////
+
+
+    /// @brief  When surfacing fractured SDF fragments, the original unfractured
+    ///         SDF grid can be used to eliminate seam lines and tag polygons that are
+    ///         coincident with the reference surface with the @c POLYFLAG_EXTERIOR
+    ///         flag and polygons that are in proximity to the seam lines with the
+    ///         @c POLYFLAG_FRACTURE_SEAM flag. (The performance cost for using this
+    ///         reference based scheme compared to the regular meshing scheme is
+    ///         approximately 15% for the first fragment and negligible for
+    ///         subsequent fragments.)
+    ///
+    /// @note   Attributes from the original asset such as uv coordinates, normals etc.
+    ///         are typically transferred to polygons that are marked with the
+    ///         @c POLYFLAG_EXTERIOR flag. Polygons that are not marked with this flag
+    ///         are interior to the reference surface and might need projected UV coordinates
+    ///         or a different material. Polygons marked as @c POLYFLAG_FRACTURE_SEAM can
+    ///         be used to drive secondary elements such as debris and dust in a FX pipeline.
+    ///
+    /// @param  grid            reference surface grid of @c GridT type.
+    /// @param  secAdaptivity   Secondary adaptivity threshold [0 to 1]. Used in regions
+    ///                         that do not exist in the reference grid. (Parts of the
+    ///                         fragment surface that are not coincident with the
+    ///                         reference surface.)
+    void setRefGrid(const GridBase::ConstPtr& grid, double secAdaptivity = 0);
+
+
+    /// @param mask A boolean grid whose active topology defines the region to mesh.
+    /// @param invertMask Toggle to mesh the complement of the mask.
+    /// @note The mask's tree configuration has to match @c GridT's tree configuration.
+    void setSurfaceMask(const GridBase::ConstPtr& mask, bool invertMask = false);
+
+    /// @param grid A scalar grid used as a spatial multiplier for the adaptivity threshold.
+    /// @note The grid's tree configuration has to match @c GridT's tree configuration.
+    void setSpatialAdaptivity(const GridBase::ConstPtr& grid);
+
+
+    /// @param tree A boolean tree whose active topology defines the adaptivity mask.
+    /// @note The tree configuration has to match @c GridT's tree configuration.
+    void setAdaptivityMask(const TreeBase::ConstPtr& tree);
+
+
+    /// @brief Subdivide volume and mesh into disjoint parts
+    /// @param partitions Number of partitions.
+    /// @param activePart Specific partition to mesh, 0 to @c partitions - 1.
+    void partition(unsigned partitions = 1, unsigned activePart = 0);
+
+private:
+
+    PointList mPoints;
+    PolygonPoolList mPolygons;
+
+    size_t mPointListSize, mSeamPointListSize, mPolygonPoolListSize;
+    double mIsovalue, mPrimAdaptivity, mSecAdaptivity;
+
+    GridBase::ConstPtr mRefGrid, mSurfaceMaskGrid, mAdaptivityGrid;
+    TreeBase::ConstPtr mAdaptivityMaskTree;
+
+    TreeBase::Ptr mRefSignTree, mRefIdxTree;
+
+    bool mInvertSurfaceMask;
+    unsigned mPartitions, mActivePart;
+
+    boost::scoped_array<uint32_t> mQuantizedSeamPoints;
+
+    std::vector<unsigned char> mPointFlags;
+};
+
+
+////////////////////////////////////////
+
+
+/// @brief  Given a set of tangent elements, @c points with corresponding @c normals,
+///         this method returns the intersection point of all tangent elements.
+///
+/// @param  points   positions of the tangent elements
+/// @param  normals  normals of the tangent elements, one per entry of @a points
+///
+/// @return The least-squares intersection point of the tangent elements. If
+///         @a points is empty the origin is returned, and if the normals do not
+///         constrain the solution at all the average of @a points is returned.
+/// @note   Used to extract surfaces with sharp edges and corners from volume data,
+///         see the following paper for details: "Feature Sensitive Surface
+///         Extraction from Volume Data, Kobbelt et al. 2001".
+/// @note   If @a normals has fewer entries than @a points, only the leading
+///         point/normal pairs contribute to the linear system.
+inline Vec3d findFeaturePoint(
+    const std::vector<Vec3d>& points,
+    const std::vector<Vec3d>& normals)
+{
+    typedef math::Mat3d Mat3d;
+
+    Vec3d avgPos(0.0);
+
+    if (points.empty()) return avgPos;
+
+    for (size_t n = 0, N = points.size(); n < N; ++n) {
+        avgPos += points[n];
+    }
+
+    avgPos /= double(points.size());
+
+    // Unique components of the 3x3 A^TA matrix, where A is
+    // the matrix of normals.
+    double m00=0,m01=0,m02=0,
+           m11=0,m12=0,
+           m22=0;
+
+    // The rhs vector, A^Tb, where b = n dot p
+    Vec3d rhs(0.0);
+
+    // Only visit complete point/normal pairs so a short normal list is never read out of bounds.
+    const size_t numPairs = std::min(points.size(), normals.size());
+
+    for (size_t n = 0; n < numPairs; ++n) {
+
+        const Vec3d& n_ref = normals[n];
+
+        // A^TA
+        m00 += n_ref[0] * n_ref[0]; // diagonal
+        m11 += n_ref[1] * n_ref[1];
+        m22 += n_ref[2] * n_ref[2];
+
+        m01 += n_ref[0] * n_ref[1]; // Upper-tri
+        m02 += n_ref[0] * n_ref[2];
+        m12 += n_ref[1] * n_ref[2];
+
+        // A^Tb (centered around the origin)
+        rhs += n_ref * n_ref.dot(points[n] - avgPos);
+    }
+
+    Mat3d A(m00,m01,m02,
+            m01,m11,m12,
+            m02,m12,m22);
+
+    // Compute the pseudo inverse from the eigen decomposition of A^TA
+    Mat3d eigenVectors;
+    Vec3d eigenValues;
+
+    diagonalizeSymmetricMatrix(A, eigenVectors, eigenValues, 300);
+
+    Mat3d D = Mat3d::identity();
+
+    // Eigenvalues smaller than 1% of the largest magnitude are clamped to zero
+    double tolerance = std::max(std::abs(eigenValues[0]), std::abs(eigenValues[1]));
+    tolerance = std::max(tolerance, std::abs(eigenValues[2]));
+    tolerance *= 0.01;
+
+    int clamped = 0;
+    for (int i = 0; i < 3; ++i ) {
+        // The zero test covers the degenerate case tolerance == 0, which would otherwise divide by zero.
+        if (std::abs(eigenValues[i]) < tolerance || eigenValues[i] == 0.0) {
+            D[i][i] = 0.0;
+            ++clamped;
+        } else {
+            D[i][i] = 1.0 / eigenValues[i];
+        }
+    }
+
+    // Assemble the pseudo inverse and calc. the intersection point
+    if (clamped < 3) {
+        Mat3d pseudoInv = eigenVectors * D *  eigenVectors.transpose();
+        return avgPos + pseudoInv * rhs;
+    }
+
+    return avgPos;
+}
+
+
+////////////////////////////////////////
+
+
+// Internal utility methods
+namespace internal {
+
+template<typename T>
+struct UniquePtr
+{
+#ifdef OPENVDB_HAS_CXX11
+    typedef std::unique_ptr<T>  type;// used when OPENVDB_HAS_CXX11 is defined
+#else
+    typedef std::auto_ptr<T>    type;// fallback when OPENVDB_HAS_CXX11 is not defined
+#endif
+};
+
+
+/// @brief  Bit-flags used to classify cells (SIGNS masks the low 8 bits).
+enum { SIGNS = 0xFF, EDGES = 0xE00, INSIDE = 0x100,// EDGES == XEDGE | YEDGE | ZEDGE
+       XEDGE = 0x200, YEDGE = 0x400, ZEDGE = 0x800, SEAM = 0x1000};
+
+
+/// @brief Used to quickly determine if a given cell is adaptable (256 entries, presumably indexed by the SIGNS bits -- TODO confirm).
+const bool sAdaptable[256] = {
+    1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,0,1,0,0,0,1,0,1,0,1,0,1,0,1,
+    1,0,1,1,0,0,1,1,0,0,0,1,0,0,1,1,1,1,1,1,0,0,1,1,0,1,0,1,0,0,0,1,
+    1,0,0,0,1,0,1,1,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    1,0,1,1,1,0,1,1,0,0,0,0,1,0,1,1,1,1,1,1,1,0,1,1,0,0,0,0,0,0,0,1,
+    1,0,0,0,0,0,0,0,1,1,0,1,1,1,1,1,1,1,0,1,0,0,0,0,1,1,0,1,1,1,0,1,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,1,1,0,1,0,0,0,1,
+    1,0,0,0,1,0,1,0,1,1,0,0,1,1,1,1,1,1,0,0,1,0,0,0,1,1,0,0,1,1,0,1,
+    1,0,1,0,1,0,1,0,1,0,0,0,1,0,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1};
+
+
+/// @brief  Contains the ambiguous face index for certain cell configurations.
+/// @details  Indexed by the 8-bit corner sign configuration of a cell. An entry
+///           of @c 0 means the configuration has no ambiguous face; entries
+///           1 to 6 identify the face, using the numbering that
+///           @c correctCellSigns maps to a neighbouring cell
+///           (1: k-1, 2: i+1, 3: k+1, 4: i-1, 5: j-1, 6: j+1).
+const unsigned char sAmbiguousFace[256] = {
+    0,0,0,0,0,5,0,0,0,0,5,0,0,0,0,0,0,0,1,0,0,5,1,0,4,0,0,0,4,0,0,0,
+    0,1,0,0,2,0,0,0,0,1,5,0,2,0,0,0,0,0,0,0,2,0,0,0,4,0,0,0,0,0,0,0,
+    0,0,2,2,0,5,0,0,3,3,0,0,0,0,0,0,6,6,0,0,6,0,0,0,0,0,0,0,0,0,0,0,
+    0,1,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,4,0,4,3,0,3,0,0,0,5,0,0,0,0,0,0,0,1,0,3,0,0,0,0,0,0,0,0,0,0,0,
+    6,0,6,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,4,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+
+
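+/// @brief  Lookup table for different cell sign configurations, indexed by the
+///         8-bit corner sign configuration. The first entry of each row specifies
+///         the total number of points that need to be generated inside a cell, and
+///         the remaining 12 entries give, for each of the 12 cell edges, the edge
+///         group that edge belongs to (0 if the edge carries no sample).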
+const unsigned char sEdgeGroupTable[256][13] = {
+    {0,0,0,0,0,0,0,0,0,0,0,0,0},{1,1,0,0,1,0,0,0,0,1,0,0,0},{1,1,1,0,0,0,0,0,0,0,1,0,0},
+    {1,0,1,0,1,0,0,0,0,1,1,0,0},{1,0,1,1,0,0,0,0,0,0,0,1,0},{1,1,1,1,1,0,0,0,0,1,0,1,0},
+    {1,1,0,1,0,0,0,0,0,0,1,1,0},{1,0,0,1,1,0,0,0,0,1,1,1,0},{1,0,0,1,1,0,0,0,0,0,0,0,1},
+    {1,1,0,1,0,0,0,0,0,1,0,0,1},{1,1,1,1,1,0,0,0,0,0,1,0,1},{1,0,1,1,0,0,0,0,0,1,1,0,1},
+    {1,0,1,0,1,0,0,0,0,0,0,1,1},{1,1,1,0,0,0,0,0,0,1,0,1,1},{1,1,0,0,1,0,0,0,0,0,1,1,1},
+    {1,0,0,0,0,0,0,0,0,1,1,1,1},{1,0,0,0,0,1,0,0,1,1,0,0,0},{1,1,0,0,1,1,0,0,1,0,0,0,0},
+    {1,1,1,0,0,1,0,0,1,1,1,0,0},{1,0,1,0,1,1,0,0,1,0,1,0,0},{2,0,1,1,0,2,0,0,2,2,0,1,0},
+    {1,1,1,1,1,1,0,0,1,0,0,1,0},{1,1,0,1,0,1,0,0,1,1,1,1,0},{1,0,0,1,1,1,0,0,1,0,1,1,0},
+    {1,0,0,1,1,1,0,0,1,1,0,0,1},{1,1,0,1,0,1,0,0,1,0,0,0,1},{2,2,1,1,2,1,0,0,1,2,1,0,1},
+    {1,0,1,1,0,1,0,0,1,0,1,0,1},{1,0,1,0,1,1,0,0,1,1,0,1,1},{1,1,1,0,0,1,0,0,1,0,0,1,1},
+    {2,1,0,0,1,2,0,0,2,1,2,2,2},{1,0,0,0,0,1,0,0,1,0,1,1,1},{1,0,0,0,0,1,1,0,0,0,1,0,0},
+    {1,1,0,0,1,1,1,0,0,1,1,0,0},{1,1,1,0,0,1,1,0,0,0,0,0,0},{1,0,1,0,1,1,1,0,0,1,0,0,0},
+    {1,0,1,1,0,1,1,0,0,0,1,1,0},{2,2,2,1,1,1,1,0,0,1,2,1,0},{1,1,0,1,0,1,1,0,0,0,0,1,0},
+    {1,0,0,1,1,1,1,0,0,1,0,1,0},{2,0,0,2,2,1,1,0,0,0,1,0,2},{1,1,0,1,0,1,1,0,0,1,1,0,1},
+    {1,1,1,1,1,1,1,0,0,0,0,0,1},{1,0,1,1,0,1,1,0,0,1,0,0,1},{1,0,1,0,1,1,1,0,0,0,1,1,1},
+    {2,1,1,0,0,2,2,0,0,2,1,2,2},{1,1,0,0,1,1,1,0,0,0,0,1,1},{1,0,0,0,0,1,1,0,0,1,0,1,1},
+    {1,0,0,0,0,0,1,0,1,1,1,0,0},{1,1,0,0,1,0,1,0,1,0,1,0,0},{1,1,1,0,0,0,1,0,1,1,0,0,0},
+    {1,0,1,0,1,0,1,0,1,0,0,0,0},{1,0,1,1,0,0,1,0,1,1,1,1,0},{2,1,1,2,2,0,2,0,2,0,1,2,0},
+    {1,1,0,1,0,0,1,0,1,1,0,1,0},{1,0,0,1,1,0,1,0,1,0,0,1,0},{1,0,0,1,1,0,1,0,1,1,1,0,1},
+    {1,1,0,1,0,0,1,0,1,0,1,0,1},{2,1,2,2,1,0,2,0,2,1,0,0,2},{1,0,1,1,0,0,1,0,1,0,0,0,1},
+    {2,0,2,0,2,0,1,0,1,2,2,1,1},{2,2,2,0,0,0,1,0,1,0,2,1,1},{2,2,0,0,2,0,1,0,1,2,0,1,1},
+    {1,0,0,0,0,0,1,0,1,0,0,1,1},{1,0,0,0,0,0,1,1,0,0,0,1,0},{2,1,0,0,1,0,2,2,0,1,0,2,0},
+    {1,1,1,0,0,0,1,1,0,0,1,1,0},{1,0,1,0,1,0,1,1,0,1,1,1,0},{1,0,1,1,0,0,1,1,0,0,0,0,0},
+    {1,1,1,1,1,0,1,1,0,1,0,0,0},{1,1,0,1,0,0,1,1,0,0,1,0,0},{1,0,0,1,1,0,1,1,0,1,1,0,0},
+    {1,0,0,1,1,0,1,1,0,0,0,1,1},{1,1,0,1,0,0,1,1,0,1,0,1,1},{2,1,2,2,1,0,1,1,0,0,1,2,1},
+    {2,0,1,1,0,0,2,2,0,2,2,1,2},{1,0,1,0,1,0,1,1,0,0,0,0,1},{1,1,1,0,0,0,1,1,0,1,0,0,1},
+    {1,1,0,0,1,0,1,1,0,0,1,0,1},{1,0,0,0,0,0,1,1,0,1,1,0,1},{1,0,0,0,0,1,1,1,1,1,0,1,0},
+    {1,1,0,0,1,1,1,1,1,0,0,1,0},{2,1,1,0,0,2,2,1,1,1,2,1,0},{2,0,2,0,2,1,1,2,2,0,1,2,0},
+    {1,0,1,1,0,1,1,1,1,1,0,0,0},{2,2,2,1,1,2,2,1,1,0,0,0,0},{2,2,0,2,0,1,1,2,2,2,1,0,0},
+    {2,0,0,1,1,2,2,1,1,0,2,0,0},{2,0,0,1,1,1,1,2,2,1,0,1,2},{2,2,0,2,0,2,2,1,1,0,0,2,1},
+    {4,3,2,2,3,4,4,1,1,3,4,2,1},{3,0,2,2,0,1,1,3,3,0,1,2,3},{2,0,2,0,2,2,2,1,1,2,0,0,1},
+    {2,1,1,0,0,1,1,2,2,0,0,0,2},{3,1,0,0,1,2,2,3,3,1,2,0,3},{2,0,0,0,0,1,1,2,2,0,1,0,2},
+    {1,0,0,0,0,1,0,1,0,0,1,1,0},{1,1,0,0,1,1,0,1,0,1,1,1,0},{1,1,1,0,0,1,0,1,0,0,0,1,0},
+    {1,0,1,0,1,1,0,1,0,1,0,1,0},{1,0,1,1,0,1,0,1,0,0,1,0,0},{2,1,1,2,2,2,0,2,0,2,1,0,0},
+    {1,1,0,1,0,1,0,1,0,0,0,0,0},{1,0,0,1,1,1,0,1,0,1,0,0,0},{1,0,0,1,1,1,0,1,0,0,1,1,1},
+    {2,2,0,2,0,1,0,1,0,1,2,2,1},{2,2,1,1,2,2,0,2,0,0,0,1,2},{2,0,2,2,0,1,0,1,0,1,0,2,1},
+    {1,0,1,0,1,1,0,1,0,0,1,0,1},{2,2,2,0,0,1,0,1,0,1,2,0,1},{1,1,0,0,1,1,0,1,0,0,0,0,1},
+    {1,0,0,0,0,1,0,1,0,1,0,0,1},{1,0,0,0,0,0,0,1,1,1,1,1,0},{1,1,0,0,1,0,0,1,1,0,1,1,0},
+    {1,1,1,0,0,0,0,1,1,1,0,1,0},{1,0,1,0,1,0,0,1,1,0,0,1,0},{1,0,1,1,0,0,0,1,1,1,1,0,0},
+    {2,2,2,1,1,0,0,1,1,0,2,0,0},{1,1,0,1,0,0,0,1,1,1,0,0,0},{1,0,0,1,1,0,0,1,1,0,0,0,0},
+    {2,0,0,2,2,0,0,1,1,2,2,2,1},{2,1,0,1,0,0,0,2,2,0,1,1,2},{3,2,1,1,2,0,0,3,3,2,0,1,3},
+    {2,0,1,1,0,0,0,2,2,0,0,1,2},{2,0,1,0,1,0,0,2,2,1,1,0,2},{2,1,1,0,0,0,0,2,2,0,1,0,2},
+    {2,1,0,0,1,0,0,2,2,1,0,0,2},{1,0,0,0,0,0,0,1,1,0,0,0,1},{1,0,0,0,0,0,0,1,1,0,0,0,1},
+    {1,1,0,0,1,0,0,1,1,1,0,0,1},{2,1,1,0,0,0,0,2,2,0,1,0,2},{1,0,1,0,1,0,0,1,1,1,1,0,1},
+    {1,0,1,1,0,0,0,1,1,0,0,1,1},{2,1,1,2,2,0,0,1,1,1,0,1,2},{1,1,0,1,0,0,0,1,1,0,1,1,1},
+    {2,0,0,1,1,0,0,2,2,2,2,2,1},{1,0,0,1,1,0,0,1,1,0,0,0,0},{1,1,0,1,0,0,0,1,1,1,0,0,0},
+    {1,1,1,1,1,0,0,1,1,0,1,0,0},{1,0,1,1,0,0,0,1,1,1,1,0,0},{1,0,1,0,1,0,0,1,1,0,0,1,0},
+    {1,1,1,0,0,0,0,1,1,1,0,1,0},{1,1,0,0,1,0,0,1,1,0,1,1,0},{1,0,0,0,0,0,0,1,1,1,1,1,0},
+    {1,0,0,0,0,1,0,1,0,1,0,0,1},{1,1,0,0,1,1,0,1,0,0,0,0,1},{1,1,1,0,0,1,0,1,0,1,1,0,1},
+    {1,0,1,0,1,1,0,1,0,0,1,0,1},{1,0,1,1,0,1,0,1,0,1,0,1,1},{2,2,2,1,1,2,0,2,0,0,0,2,1},
+    {2,1,0,1,0,2,0,2,0,1,2,2,1},{2,0,0,2,2,1,0,1,0,0,1,1,2},{1,0,0,1,1,1,0,1,0,1,0,0,0},
+    {1,1,0,1,0,1,0,1,0,0,0,0,0},{2,1,2,2,1,2,0,2,0,1,2,0,0},{1,0,1,1,0,1,0,1,0,0,1,0,0},
+    {1,0,1,0,1,1,0,1,0,1,0,1,0},{1,1,1,0,0,1,0,1,0,0,0,1,0},{2,2,0,0,2,1,0,1,0,2,1,1,0},
+    {1,0,0,0,0,1,0,1,0,0,1,1,0},{1,0,0,0,0,1,1,1,1,0,1,0,1},{2,1,0,0,1,2,1,1,2,2,1,0,1},
+    {1,1,1,0,0,1,1,1,1,0,0,0,1},{2,0,2,0,2,1,2,2,1,1,0,0,2},{2,0,1,1,0,1,2,2,1,0,1,2,1},
+    {4,1,1,3,3,2,4,4,2,2,1,4,3},{2,2,0,2,0,2,1,1,2,0,0,1,2},{3,0,0,1,1,2,3,3,2,2,0,3,1},
+    {1,0,0,1,1,1,1,1,1,0,1,0,0},{2,2,0,2,0,1,2,2,1,1,2,0,0},{2,2,1,1,2,2,1,1,2,0,0,0,0},
+    {2,0,1,1,0,2,1,1,2,2,0,0,0},{2,0,2,0,2,2,1,1,2,0,2,1,0},{3,1,1,0,0,3,2,2,3,3,1,2,0},
+    {2,1,0,0,1,1,2,2,1,0,0,2,0},{2,0,0,0,0,2,1,1,2,2,0,1,0},{1,0,0,0,0,0,1,1,0,1,1,0,1},
+    {1,1,0,0,1,0,1,1,0,0,1,0,1},{1,1,1,0,0,0,1,1,0,1,0,0,1},{1,0,1,0,1,0,1,1,0,0,0,0,1},
+    {2,0,2,2,0,0,1,1,0,2,2,1,2},{3,1,1,2,2,0,3,3,0,0,1,3,2},{2,1,0,1,0,0,2,2,0,1,0,2,1},
+    {2,0,0,1,1,0,2,2,0,0,0,2,1},{1,0,0,1,1,0,1,1,0,1,1,0,0},{1,1,0,1,0,0,1,1,0,0,1,0,0},
+    {2,2,1,1,2,0,1,1,0,2,0,0,0},{1,0,1,1,0,0,1,1,0,0,0,0,0},{2,0,1,0,1,0,2,2,0,1,1,2,0},
+    {2,1,1,0,0,0,2,2,0,0,1,2,0},{2,1,0,0,1,0,2,2,0,1,0,2,0},{1,0,0,0,0,0,1,1,0,0,0,1,0},
+    {1,0,0,0,0,0,1,0,1,0,0,1,1},{1,1,0,0,1,0,1,0,1,1,0,1,1},{1,1,1,0,0,0,1,0,1,0,1,1,1},
+    {2,0,2,0,2,0,1,0,1,1,1,2,2},{1,0,1,1,0,0,1,0,1,0,0,0,1},{2,2,2,1,1,0,2,0,2,2,0,0,1},
+    {1,1,0,1,0,0,1,0,1,0,1,0,1},{2,0,0,2,2,0,1,0,1,1,1,0,2},{1,0,0,1,1,0,1,0,1,0,0,1,0},
+    {1,1,0,1,0,0,1,0,1,1,0,1,0},{2,2,1,1,2,0,2,0,2,0,2,1,0},{2,0,2,2,0,0,1,0,1,1,1,2,0},
+    {1,0,1,0,1,0,1,0,1,0,0,0,0},{1,1,1,0,0,0,1,0,1,1,0,0,0},{1,1,0,0,1,0,1,0,1,0,1,0,0},
+    {1,0,0,0,0,0,1,0,1,1,1,0,0},{1,0,0,0,0,1,1,0,0,1,0,1,1},{1,1,0,0,1,1,1,0,0,0,0,1,1},
+    {2,2,2,0,0,1,1,0,0,2,1,2,2},{2,0,1,0,1,2,2,0,0,0,2,1,1},{1,0,1,1,0,1,1,0,0,1,0,0,1},
+    {2,1,1,2,2,1,1,0,0,0,0,0,2},{2,1,0,1,0,2,2,0,0,1,2,0,1},{2,0,0,2,2,1,1,0,0,0,1,0,2},
+    {1,0,0,1,1,1,1,0,0,1,0,1,0},{1,1,0,1,0,1,1,0,0,0,0,1,0},{3,1,2,2,1,3,3,0,0,1,3,2,0},
+    {2,0,1,1,0,2,2,0,0,0,2,1,0},{1,0,1,0,1,1,1,0,0,1,0,0,0},{1,1,1,0,0,1,1,0,0,0,0,0,0},
+    {2,2,0,0,2,1,1,0,0,2,1,0,0},{1,0,0,0,0,1,1,0,0,0,1,0,0},{1,0,0,0,0,1,0,0,1,0,1,1,1},
+    {2,2,0,0,2,1,0,0,1,1,2,2,2},{1,1,1,0,0,1,0,0,1,0,0,1,1},{2,0,1,0,1,2,0,0,2,2,0,1,1},
+    {1,0,1,1,0,1,0,0,1,0,1,0,1},{3,1,1,3,3,2,0,0,2,2,1,0,3},{1,1,0,1,0,1,0,0,1,0,0,0,1},
+    {2,0,0,2,2,1,0,0,1,1,0,0,2},{1,0,0,1,1,1,0,0,1,0,1,1,0},{2,1,0,1,0,2,0,0,2,2,1,1,0},
+    {2,1,2,2,1,1,0,0,1,0,0,2,0},{2,0,1,1,0,2,0,0,2,2,0,1,0},{1,0,1,0,1,1,0,0,1,0,1,0,0},
+    {2,1,1,0,0,2,0,0,2,2,1,0,0},{1,1,0,0,1,1,0,0,1,0,0,0,0},{1,0,0,0,0,1,0,0,1,1,0,0,0},
+    {1,0,0,0,0,0,0,0,0,1,1,1,1},{1,1,0,0,1,0,0,0,0,0,1,1,1},{1,1,1,0,0,0,0,0,0,1,0,1,1},
+    {1,0,1,0,1,0,0,0,0,0,0,1,1},{1,0,1,1,0,0,0,0,0,1,1,0,1},{2,1,1,2,2,0,0,0,0,0,1,0,2},
+    {1,1,0,1,0,0,0,0,0,1,0,0,1},{1,0,0,1,1,0,0,0,0,0,0,0,1},{1,0,0,1,1,0,0,0,0,1,1,1,0},
+    {1,1,0,1,0,0,0,0,0,0,1,1,0},{2,1,2,2,1,0,0,0,0,1,0,2,0},{1,0,1,1,0,0,0,0,0,0,0,1,0},
+    {1,0,1,0,1,0,0,0,0,1,1,0,0},{1,1,1,0,0,0,0,0,0,0,1,0,0},{1,1,0,0,1,0,0,0,0,1,0,0,0},
+    {0,0,0,0,0,0,0,0,0,0,0,0,0}};
+
+
+////////////////////////////////////////
+
+/// @brief  Returns @c true if all four corners of the quad lie within
+///         @a epsilon of a representative plane.
+/// @details  The plane normal is the normalized cross product of the two
+///           diagonals (p2 - p0) and (p1 - p3); the plane passes through the
+///           average of the four corners.
+inline bool
+isPlanarQuad(
+    const Vec3d& p0, const Vec3d& p1,
+    const Vec3d& p2, const Vec3d& p3,
+    double epsilon = 0.001)
+{
+    // Representative plane: normal from the diagonals, offset from the corner average.
+    Vec3d planeNormal = (p2-p0).cross(p1-p3);
+    planeNormal.normalize();
+    const Vec3d cornerSum = (p0 + p1 + p2 + p3);
+    const double planeOffset = cornerSum.dot(planeNormal) * 0.25;
+
+    // Reject the quad as soon as one vertex is farther than epsilon from the plane.
+    const Vec3d* corners[4] = { &p0, &p1, &p2, &p3 };
+    for (int n = 0; n < 4; ++n) {
+        const double absDist = std::abs(corners[n]->dot(planeNormal) - planeOffset);
+        if (absDist > epsilon) return false;
+    }
+
+    return true;
+}
+
+
+////////////////////////////////////////
+
+
+/// @{
+/// @brief  Utility methods for point quantization.
+
+// Bit masks for the 32-bit packed point representation. packPoint() stores
+// three 10-bit components in bits 0-29; the two flag masks below select bits
+// 30 and 31, which lie outside that range.
+enum {
+    MASK_FIRST_10_BITS = 0x000003FF, // isolates one 10-bit quantized component
+    MASK_DIRTY_BIT =     0x80000000, // bit 31 -- NOTE(review): usage not visible here, confirm
+    MASK_INVALID_BIT =   0x40000000  // bit 30 -- NOTE(review): usage not visible here, confirm
+};
+
+/// @brief  Quantizes a point with components in the [0.0, 1.0] range into a
+///         single 32-bit word: x in bits 20-29, y in bits 10-19, z in bits 0-9.
+/// @note   Each component is scaled by 1023 and truncated (not rounded).
+inline uint32_t
+packPoint(const Vec3d& v)
+{
+    // values are expected to be in the [0.0 to 1.0] range.
+    assert(!(v.x() > 1.0) && !(v.y() > 1.0) && !(v.z() > 1.0));
+    assert(!(v.x() < 0.0) && !(v.y() < 0.0) && !(v.z() < 0.0));
+
+    const uint32_t qx = uint32_t(v.x() * 1023.0) & MASK_FIRST_10_BITS;
+    const uint32_t qy = uint32_t(v.y() * 1023.0) & MASK_FIRST_10_BITS;
+    const uint32_t qz = uint32_t(v.z() * 1023.0) & MASK_FIRST_10_BITS;
+
+    return (qx << 20) | (qy << 10) | qz;
+}
+
+/// @brief  Expands a word produced by @c packPoint back into a point:
+///         x from bits 20-29, y from bits 10-19, z from bits 0-9.
+inline Vec3d
+unpackPoint(uint32_t data)
+{
+    // Scale factor that maps a 10-bit integer back to the unit range (about 1 / 1023).
+    const double scale = 0.0009775171;
+
+    Vec3d v;
+    v.x() = double((data >> 20) & MASK_FIRST_10_BITS) * scale;
+    v.y() = double((data >> 10) & MASK_FIRST_10_BITS) * scale;
+    v.z() = double(data & MASK_FIRST_10_BITS) * scale;
+    return v;
+}
+
+/// @}
+
+////////////////////////////////////////
+
+
+/// @brief  General method that computes the cell-sign configuration at the given
+///         @c ijk coordinate.
+/// @details  Bit n of the result is set when the value at corner n is below
+///           @a iso. Corners are visited in bit order 0 to 7.
+template<typename AccessorT>
+inline unsigned char
+evalCellSigns(const AccessorT& accessor, const Coord& ijk, typename AccessorT::ValueType iso)
+{
+    // Corner offsets (di, dj, dk), listed in sign-bit order.
+    static const int sCornerOffset[8][3] = {
+        {0, 0, 0}, // bit 0: i,   j,   k
+        {1, 0, 0}, // bit 1: i+1, j,   k
+        {1, 0, 1}, // bit 2: i+1, j,   k+1
+        {0, 0, 1}, // bit 3: i,   j,   k+1
+        {0, 1, 0}, // bit 4: i,   j+1, k
+        {1, 1, 0}, // bit 5: i+1, j+1, k
+        {1, 1, 1}, // bit 6: i+1, j+1, k+1
+        {0, 1, 1}  // bit 7: i,   j+1, k+1
+    };
+
+    unsigned mask = 0;
+    Coord corner;
+    for (unsigned bit = 0; bit < 8; ++bit) {
+        corner.reset(ijk[0] + sCornerOffset[bit][0],
+                     ijk[1] + sCornerOffset[bit][1],
+                     ijk[2] + sCornerOffset[bit][2]);
+        if (accessor.getValue(corner) < iso) mask |= (1u << bit);
+    }
+    return uint8_t(mask);
+}
+
+
+/// @brief  Leaf node optimized method that computes the cell-sign configuration
+///         at the given local @c offset
+/// @details  All eight corners are read from @a leaf by linear offset, so the
+///           caller must ensure the whole cell lies inside the leaf. The bit
+///           layout matches the accessor based overload.
+template<typename LeafT>
+inline unsigned char
+evalCellSigns(const LeafT& leaf, const Index offset, typename LeafT::ValueType iso)
+{
+    // Linear offset steps: +1 moves along k, +DIM along j, +DIM*DIM along i.
+    const Index jStep = LeafT::DIM;
+    const Index iStep = LeafT::DIM * LeafT::DIM;
+
+    unsigned char mask = 0;
+
+    // Corners on the i plane.
+    if (leaf.getValue(offset) < iso)                     mask |= 1u;   // i, j, k
+    if (leaf.getValue(offset + 1) < iso)                 mask |= 8u;   // i, j, k+1
+    if (leaf.getValue(offset + jStep) < iso)             mask |= 16u;  // i, j+1, k
+    if (leaf.getValue(offset + jStep + 1) < iso)         mask |= 128u; // i, j+1, k+1
+
+    // Corners on the i+1 plane.
+    if (leaf.getValue(offset + iStep) < iso)             mask |= 2u;   // i+1, j, k
+    if (leaf.getValue(offset + iStep + 1) < iso)         mask |= 4u;   // i+1, j, k+1
+    if (leaf.getValue(offset + iStep + jStep) < iso)     mask |= 32u;  // i+1, j+1, k
+    if (leaf.getValue(offset + iStep + jStep + 1) < iso) mask |= 64u;  // i+1, j+1, k+1
+
+    return mask;
+}
+
+
+/// @brief  Used to correct topological ambiguities related to two adjacent cells
+///         that share an ambiguous face.
+/// @details  Looks up the neighbouring cell across @a face; if that neighbour's
+///           ambiguous face is the opposite one, @a signs is bitwise inverted.
+///           A @a face value outside 1 to 6 leaves @a signs untouched.
+template<class AccessorT>
+inline void
+correctCellSigns(unsigned char& signs, unsigned char face,
+    const AccessorT& acc, Coord ijk, typename AccessorT::ValueType iso)
+{
+    int axis = 0, step = 0;
+    unsigned char oppositeFace = 0;
+
+    switch (face) {
+        case 1: axis = 2; step = -1; oppositeFace = 3; break;
+        case 3: axis = 2; step =  1; oppositeFace = 1; break;
+        case 2: axis = 0; step =  1; oppositeFace = 4; break;
+        case 4: axis = 0; step = -1; oppositeFace = 2; break;
+        case 5: axis = 1; step = -1; oppositeFace = 6; break;
+        case 6: axis = 1; step =  1; oppositeFace = 5; break;
+        default: return;
+    }
+
+    // ijk is a by-value copy, so stepping to the neighbour does not affect the caller.
+    ijk[axis] += step;
+    if (sAmbiguousFace[evalCellSigns(acc, ijk, iso)] == oppositeFace) {
+        signs = uint8_t(~signs);
+    }
+}
+
+
+/// @brief  Returns @c true if the cube of edge length @a dim anchored at @a ijk
+///         fails the manifold test.
+/// @details  The test fails (returns @c true) when
+///           - the corner sign configuration is flagged in @c sAdaptable as not
+///             adaptable, or
+///           - the sample at any edge midpoint, face center or the cube center
+///             has a sign that differs from every corner of that edge, face
+///             or cube.
+///           Midpoints are taken at half of @a dim (<tt>dim >> 1</tt>).
+template<class AccessorT>
+inline bool
+isNonManifold(const AccessorT& accessor, const Coord& ijk,
+    typename AccessorT::ValueType isovalue, const int dim)
+{
+    int hDim = dim >> 1;
+    bool m, p[8]; // Corner signs
+
+    // Sample the eight corners in the same order as evalCellSigns.
+    Coord coord = ijk; // i, j, k
+    p[0] = accessor.getValue(coord) < isovalue;
+    coord[0] += dim; // i+dim, j, k
+    p[1] = accessor.getValue(coord) < isovalue;
+    coord[2] += dim; // i+dim, j, k+dim
+    p[2] = accessor.getValue(coord) < isovalue;
+    coord[0] = ijk[0]; // i, j, k+dim
+    p[3] = accessor.getValue(coord) < isovalue;
+    coord[1] += dim; coord[2] = ijk[2]; // i, j+dim, k
+    p[4] = accessor.getValue(coord) < isovalue;
+    coord[0] += dim; // i+dim, j+dim, k
+    p[5] = accessor.getValue(coord) < isovalue;
+    coord[2] += dim; // i+dim, j+dim, k+dim
+    p[6] = accessor.getValue(coord) < isovalue;
+    coord[0] = ijk[0]; // i, j+dim, k+dim
+    p[7] = accessor.getValue(coord) < isovalue;
+
+    // Check if the corner sign configuration is ambiguous
+    unsigned signs = 0;
+    if (p[0]) signs |= 1u;
+    if (p[1]) signs |= 2u;
+    if (p[2]) signs |= 4u;
+    if (p[3]) signs |= 8u;
+    if (p[4]) signs |= 16u;
+    if (p[5]) signs |= 32u;
+    if (p[6]) signs |= 64u;
+    if (p[7]) signs |= 128u;
+    if (!sAdaptable[signs]) return true;
+
+    // Manifold check
+
+    // Evaluate edges
+    // (x, xp, xpp) = near corner, midpoint and far corner along each axis.
+    int i = ijk[0], ip = ijk[0] + hDim, ipp = ijk[0] + dim;
+    int j = ijk[1], jp = ijk[1] + hDim, jpp = ijk[1] + dim;
+    int k = ijk[2], kp = ijk[2] + hDim, kpp = ijk[2] + dim;
+
+    // edge 1
+    coord.reset(ip, j, k);
+    m = accessor.getValue(coord) < isovalue;
+    if (p[0] != m && p[1] != m) return true;
+
+    // edge 2
+    coord.reset(ipp, j, kp);
+    m = accessor.getValue(coord) < isovalue;
+    if (p[1] != m && p[2] != m) return true;
+
+    // edge 3
+    coord.reset(ip, j, kpp);
+    m = accessor.getValue(coord) < isovalue;
+    if (p[2] != m && p[3] != m) return true;
+
+    // edge 4
+    coord.reset(i, j, kp);
+    m = accessor.getValue(coord) < isovalue;
+    if (p[0] != m && p[3] != m) return true;
+
+    // edge 5
+    coord.reset(ip, jpp, k);
+    m = accessor.getValue(coord) < isovalue;
+    if (p[4] != m && p[5] != m) return true;
+
+    // edge 6
+    coord.reset(ipp, jpp, kp);
+    m = accessor.getValue(coord) < isovalue;
+    if (p[5] != m && p[6] != m) return true;
+
+    // edge 7
+    coord.reset(ip, jpp, kpp);
+    m = accessor.getValue(coord) < isovalue;
+    if (p[6] != m && p[7] != m) return true;
+
+    // edge 8
+    coord.reset(i, jpp, kp);
+    m = accessor.getValue(coord) < isovalue;
+    if (p[7] != m && p[4] != m) return true;
+
+    // edge 9
+    coord.reset(i, jp, k);
+    m = accessor.getValue(coord) < isovalue;
+    if (p[0] != m && p[4] != m) return true;
+
+    // edge 10
+    coord.reset(ipp, jp, k);
+    m = accessor.getValue(coord) < isovalue;
+    if (p[1] != m && p[5] != m) return true;
+
+    // edge 11
+    coord.reset(ipp, jp, kpp);
+    m = accessor.getValue(coord) < isovalue;
+    if (p[2] != m && p[6] != m) return true;
+
+
+    // edge 12
+    coord.reset(i, jp, kpp);
+    m = accessor.getValue(coord) < isovalue;
+    if (p[3] != m && p[7] != m) return true;
+
+
+    // Evaluate faces
+
+    // face 1
+    coord.reset(ip, jp, k);
+    m = accessor.getValue(coord) < isovalue;
+    if (p[0] != m && p[1] != m && p[4] != m && p[5] != m) return true;
+
+    // face 2
+    coord.reset(ipp, jp, kp);
+    m = accessor.getValue(coord) < isovalue;
+    if (p[1] != m && p[2] != m && p[5] != m && p[6] != m) return true;
+
+    // face 3
+    coord.reset(ip, jp, kpp);
+    m = accessor.getValue(coord) < isovalue;
+    if (p[2] != m && p[3] != m && p[6] != m && p[7] != m) return true;
+
+    // face 4
+    coord.reset(i, jp, kp);
+    m = accessor.getValue(coord) < isovalue;
+    if (p[0] != m && p[3] != m && p[4] != m && p[7] != m) return true;
+
+    // face 5
+    coord.reset(ip, j, kp);
+    m = accessor.getValue(coord) < isovalue;
+    if (p[0] != m && p[1] != m && p[2] != m && p[3] != m) return true;
+
+    // face 6
+    coord.reset(ip, jpp, kp);
+    m = accessor.getValue(coord) < isovalue;
+    if (p[4] != m && p[5] != m && p[6] != m && p[7] != m) return true;
+
+    // test cube center
+    coord.reset(ip, jp, kp);
+    m = accessor.getValue(coord) < isovalue;
+    if (p[0] != m && p[1] != m && p[2] != m && p[3] != m &&
+        p[4] != m && p[5] != m && p[6] != m && p[7] != m) return true;
+
+    return false;
+}
+
+
+////////////////////////////////////////
+
+
+/// @brief  Writes @a regionId into every voxel of the cubic region
+///         [start, start + dim) of @a leaf using @c setValueOnly.
+template <class LeafType>
+inline void
+mergeVoxels(LeafType& leaf, const Coord& start, int dim, int regionId)
+{
+    const int iEnd = start[0] + dim;
+    const int jEnd = start[1] + dim;
+    const int kEnd = start[2] + dim;
+
+    Coord voxel;
+    for (int i = start[0]; i < iEnd; ++i) {
+        for (int j = start[1]; j < jEnd; ++j) {
+            for (int k = start[2]; k < kEnd; ++k) {
+                voxel.reset(i, j, k);
+                leaf.setValueOnly(voxel, regionId);
+            }
+        }
+    }
+}
+
+
+/// @brief  Returns @c true if the active normals stored in the cubic region
+///         [start, start + dim) of @a leaf agree closely enough to be merged.
+/// @details  Every ordered pair of active normals (each normal is also paired
+///           with itself) must satisfy <tt>1 - dot(a, b) <= adaptivity</tt>.
+///           Returns @c false immediately when @a adaptivity is below 1e-6, and
+///           @c true when the region holds no active values.
+/// @note   ValueType::value_type must be spelled out in the signature, or else
+///         Visual C++ gets confused thinking that it is a constructor.
+template <class LeafType>
+inline bool
+isMergable(LeafType& leaf, const Coord& start, int dim,
+    typename LeafType::ValueType::value_type adaptivity)
+{
+    if (adaptivity < 1e-6) return false;
+
+    typedef typename LeafType::ValueType NormalT;
+    typedef typename std::vector<NormalT>::iterator NormalIter;
+
+    const int iEnd = start[0] + dim;
+    const int jEnd = start[1] + dim;
+    const int kEnd = start[2] + dim;
+
+    // Gather the normals of all active voxels in the region.
+    std::vector<NormalT> activeNormals;
+    Coord voxel;
+    for (int i = start[0]; i < iEnd; ++i) {
+        for (int j = start[1]; j < jEnd; ++j) {
+            for (int k = start[2]; k < kEnd; ++k) {
+                voxel.reset(i, j, k);
+                if (leaf.isValueOn(voxel)) activeNormals.push_back(leaf.getValue(voxel));
+            }
+        }
+    }
+
+    // Pairwise deviation test; a single failing pair rules the merge out.
+    for (NormalIter a = activeNormals.begin(); a != activeNormals.end(); ++a) {
+        for (NormalIter b = activeNormals.begin(); b != activeNormals.end(); ++b) {
+            if ((1.0 - a->dot(*b)) > adaptivity) return false;
+        }
+    }
+    return true;
+}
+
+
+////////////////////////////////////////
+
+
+/// @brief  Reduction body (usable with @c tbb::parallel_reduce, see the splitting
+///         constructor and @c join) that classifies the cells of a distance tree
+///         against an isovalue.
+/// @details  Produces two trees: an @c Int16 tree holding the sign configuration
+///           and flags of every cell that has mixed corner signs, and an @c int
+///           tree with matching topology whose active values are set to 0.
+template<typename TreeT, typename LeafManagerT>
+class SignData
+{
+public:
+    typedef typename TreeT::ValueType ValueT;
+    typedef tree::ValueAccessor<const TreeT> AccessorT;
+
+    typedef typename TreeT::template ValueConverter<int>::Type IntTreeT;
+    typedef tree::ValueAccessor<IntTreeT> IntAccessorT;
+
+    typedef typename TreeT::template ValueConverter<Int16>::Type Int16TreeT;
+    typedef tree::ValueAccessor<Int16TreeT> Int16AccessorT;
+
+    //////////
+
+
+    /// @param distTree  tree whose values are compared against @a iso
+    /// @param leafs     leaf manager supplying the leaf nodes to process
+    /// @param iso       isovalue; a corner counts as "signed" when its value is below it
+    SignData(const TreeT& distTree, const LeafManagerT& leafs, ValueT iso);
+
+    /// Processes all leafs, in parallel when @a threaded is true.
+    void run(bool threaded = true);
+
+    /// Shared pointer to the resulting sign-flag tree.
+    typename Int16TreeT::Ptr signTree() const { return mSignTree; }
+    /// Shared pointer to the resulting index tree.
+    typename IntTreeT::Ptr idxTree() const { return mIdxTree; }
+
+    //////////
+
+    // Reduction interface: splitting constructor, range operator and join.
+    SignData(SignData&, tbb::split);
+    void operator()(const tbb::blocked_range<size_t>&);
+    /// Merges the trees built by @a rhs into this body's trees.
+    void join(const SignData& rhs)
+    {
+        mSignTree->merge(*rhs.mSignTree);
+        mIdxTree->merge(*rhs.mIdxTree);
+    }
+
+private:
+
+    // NOTE: the constructors initialize each accessor from the member declared
+    // directly above it, so the declaration order below must be preserved.
+    const TreeT& mDistTree;
+    AccessorT mDistAcc;
+
+    const LeafManagerT& mLeafs;
+    ValueT mIsovalue;
+
+    typename Int16TreeT::Ptr mSignTree;
+    Int16AccessorT mSignAcc;
+
+    typename IntTreeT::Ptr mIdxTree;
+    IntAccessorT mIdxAcc;
+
+};
+
+
+// Primary constructor: binds the input tree and leaf manager, and allocates an
+// empty sign tree (background 0) and an empty index tree (background
+// util::INVALID_IDX), each with its own accessor.
+template<typename TreeT, typename LeafManagerT>
+SignData<TreeT, LeafManagerT>::SignData(const TreeT& distTree,
+    const LeafManagerT& leafs, ValueT iso)
+    : mDistTree(distTree)
+    , mDistAcc(mDistTree)
+    , mLeafs(leafs)
+    , mIsovalue(iso)
+    , mSignTree(new Int16TreeT(0))
+    , mSignAcc(*mSignTree)
+    , mIdxTree(new IntTreeT(int(util::INVALID_IDX)))
+    , mIdxAcc(*mIdxTree)
+{
+}
+
+
+// Splitting constructor: shares the input tree, leaf manager and isovalue of
+// rhs but starts with fresh, empty output trees and accessors; the outputs are
+// combined later through join().
+template<typename TreeT, typename LeafManagerT>
+SignData<TreeT, LeafManagerT>::SignData(SignData& rhs, tbb::split)
+    : mDistTree(rhs.mDistTree)
+    , mDistAcc(mDistTree)
+    , mLeafs(rhs.mLeafs)
+    , mIsovalue(rhs.mIsovalue)
+    , mSignTree(new Int16TreeT(0))
+    , mSignAcc(*mSignTree)
+    , mIdxTree(new IntTreeT(int(util::INVALID_IDX)))
+    , mIdxAcc(*mIdxTree)
+{
+}
+
+
+// Processes the full leaf range, either serially on this object or through
+// tbb::parallel_reduce with this object as the reduction body.
+template<typename TreeT, typename LeafManagerT>
+void
+SignData<TreeT, LeafManagerT>::run(bool threaded)
+{
+    if (!threaded) {
+        (*this)(mLeafs.getRange());
+        return;
+    }
+
+    tbb::parallel_reduce(mLeafs.getRange(), *this);
+}
+
+// Classifies the active voxels of every leaf in the given range.
+//
+// For each cell with mixed corner signs (neither 0 nor 0xFF) a flag word is
+// stored in a scratch Int16 leaf: INSIDE, XEDGE, YEDGE and ZEDGE derived from
+// the uncorrected sign bits, combined with the sign byte after the
+// ambiguous-face correction. When a leaf produced at least one such cell, the
+// scratch leaf is handed over to the sign tree and the matching index-tree leaf
+// receives the same active topology with all active values set to 0.
+template<typename TreeT, typename LeafManagerT>
+void
+SignData<TreeT, LeafManagerT>::operator()(const tbb::blocked_range<size_t>& range)
+{
+    typedef typename Int16TreeT::LeafNodeType Int16LeafT;
+    typedef typename IntTreeT::LeafNodeType IntLeafT;
+    typename LeafManagerT::TreeType::LeafNodeType::ValueOnCIter iter;
+    unsigned char signs, face;
+    Coord ijk, coord;
+
+    // Scratch leaf, reused across iterations until its ownership is transferred.
+    typename internal::UniquePtr<Int16LeafT>::type signLeafPt(new Int16LeafT(ijk, 0));
+
+    for (size_t n = range.begin(); n != range.end(); ++n) {
+
+        bool collectedData = false;
+
+        coord = mLeafs.leaf(n).origin();
+
+        // Allocate a new scratch leaf if the previous one was released to the
+        // sign tree; otherwise just move the existing one to this leaf's origin.
+        if (!signLeafPt.get()) signLeafPt.reset(new Int16LeafT(coord, 0));
+        else signLeafPt->setOrigin(coord);
+
+        const typename TreeT::LeafNodeType *leafPt = mDistAcc.probeConstLeaf(coord);
+
+        // coord now holds the largest coordinate inside the leaf (origin + DIM - 1).
+        coord.offset(TreeT::LeafNodeType::DIM - 1);
+
+        for (iter = mLeafs.leaf(n).cbeginValueOn(); iter; ++iter) {
+
+            ijk = iter.getCoord();
+
+            // Fast path: the cell's +1 neighbours on all axes are inside the
+            // same leaf, so corners can be read by linear offset.
+            if (leafPt && ijk[0] < coord[0] && ijk[1] < coord[1] && ijk[2] < coord[2]) {
+                signs = evalCellSigns(*leafPt, iter.pos(), mIsovalue);
+            } else {
+                signs = evalCellSigns(mDistAcc, ijk, mIsovalue);
+            }
+
+            if (signs != 0 && signs != 0xFF) {
+                Int16 flags = (signs & 0x1) ? INSIDE : 0;
+
+                // Edge flags compare corner (i, j, k) with its +x, +y and +z neighbours.
+                if (bool(signs & 0x1) != bool(signs & 0x2)) flags |= XEDGE;
+                if (bool(signs & 0x1) != bool(signs & 0x10)) flags |= YEDGE;
+                if (bool(signs & 0x1) != bool(signs & 0x8)) flags |= ZEDGE;
+
+                // May invert signs; the flags above were taken from the uncorrected value.
+                face = internal::sAmbiguousFace[signs];
+                if (face != 0) correctCellSigns(signs, face, mDistAcc, ijk, mIsovalue);
+
+                flags = Int16(flags | Int16(signs));
+
+                signLeafPt->setValue(ijk, flags);
+                collectedData = true;
+            }
+        }
+
+        if (collectedData) {
+
+            // coord still lies inside the current leaf, so this touches the
+            // index leaf that corresponds to it.
+            IntLeafT* idxLeaf = mIdxAcc.touchLeaf(coord);
+            idxLeaf->topologyUnion(*signLeafPt);
+            typename IntLeafT::ValueOnIter it = idxLeaf->beginValueOn();
+            for (; it; ++it) {
+                it.setValue(0);
+            }
+
+            // Ownership of the scratch leaf passes to the sign tree.
+            mSignAcc.addLeaf(signLeafPt.release());
+        }
+    }
+}
+
+
+////////////////////////////////////////
+
+
+/// @brief Counts the total number of points per leaf, accounts for cells with multiple points.
+/// @details  For each active value of a leaf, the point count of its sign
+///           configuration (first column of @c sEdgeGroupTable) is added up and
+///           the sum is stored at the leaf's index in the supplied list.
+class CountPoints
+{
+public:
+    CountPoints(std::vector<size_t>& pointList) : mPointList(pointList) {}
+
+    template <typename LeafNodeType>
+    void operator()(LeafNodeType &leaf, size_t leafIndex) const
+    {
+        size_t total = 0;
+
+        for (typename LeafNodeType::ValueOnCIter it = leaf.cbeginValueOn(); it; ++it) {
+            const unsigned cellSigns = SIGNS & it.getValue();
+            total += size_t(sEdgeGroupTable[cellSigns][0]);
+        }
+
+        mPointList[leafIndex] = total;
+    }
+
+private:
+    std::vector<size_t>& mPointList;
+};
+
+
+/// @brief Computes the point list indices for the index tree.
+/// @details  Starting from the value stored for the leaf in the supplied list,
+///           each active voxel is assigned the running point index, which is
+///           then advanced by the point count of the voxel's sign configuration.
+template<typename Int16TreeT>
+class MapPoints
+{
+public:
+    typedef tree::ValueAccessor<const Int16TreeT> Int16AccessorT;
+
+    MapPoints(std::vector<size_t>& pointList, const Int16TreeT& signTree)
+        : mPointList(pointList)
+        , mSignAcc(signTree)
+    {
+    }
+
+    template <typename LeafNodeType>
+    void operator()(LeafNodeType &leaf, size_t leafIndex) const
+    {
+        typedef typename LeafNodeType::ValueType IndexT;
+
+        // NOTE(review): the sign leaf is dereferenced without a null check, so a
+        // sign-tree leaf is assumed to exist at this origin -- confirm.
+        const typename Int16TreeT::LeafNodeType *signLeaf =
+            mSignAcc.probeConstLeaf(leaf.origin());
+
+        size_t nextPoint = mPointList[leafIndex];
+
+        for (typename LeafNodeType::ValueOnIter it = leaf.beginValueOn(); it; ++it) {
+            it.setValue(static_cast<IndexT>(nextPoint));
+            const unsigned cellSigns = SIGNS & signLeaf->getValue(it.pos());
+            nextPoint += size_t(sEdgeGroupTable[cellSigns][0]);
+        }
+    }
+
+private:
+    std::vector<size_t>& mPointList;
+    Int16AccessorT mSignAcc;
+};
+
+
+/// @brief Counts the total number of points per collapsed region
+/// @details  Works on a private copy of the index leaf that shares the origin of
+///           the visited leaf. Voxels whose index value is 0 contribute the
+///           point count of their sign configuration; the remaining active
+///           voxels contribute one point per distinct index value (region id).
+template<typename IntTreeT>
+class CountRegions
+{
+public:
+    typedef tree::ValueAccessor<IntTreeT> IntAccessorT;
+    typedef typename IntTreeT::LeafNodeType IntLeafT;
+
+    CountRegions(IntTreeT& idxTree, std::vector<size_t>& regions)
+    : mIdxAcc(idxTree)
+    , mRegions(regions)
+    {
+    }
+
+    template <typename LeafNodeType>
+    void operator()(LeafNodeType &leaf, size_t leafIndex) const
+    {
+
+        size_t regions = 0;
+
+        // Copy of the index leaf, so voxels can be switched off while counting.
+        // NOTE(review): the probed leaf pointer is dereferenced unchecked; an
+        // index leaf is assumed to exist at this origin -- confirm.
+        IntLeafT tmpLeaf(*mIdxAcc.probeConstLeaf(leaf.origin()));
+
+        // Pass 1: voxels with index value 0 are counted per cell and switched off.
+        typename IntLeafT::ValueOnIter iter = tmpLeaf.beginValueOn();
+        for (; iter; ++iter) {
+            if(iter.getValue() == 0) {
+                iter.setValueOff();
+                regions += size_t(sEdgeGroupTable[(SIGNS & leaf.getValue(iter.pos()))][0]);
+            }
+        }
+
+        // Pass 2: repeatedly take the id of the first remaining active voxel,
+        // count it as one region and switch off every voxel carrying that id.
+        int onVoxelCount = int(tmpLeaf.onVoxelCount());
+        while (onVoxelCount > 0) {
+            ++regions;
+            iter = tmpLeaf.beginValueOn();
+            int regionId = iter.getValue();
+            for (; iter; ++iter) {
+                if (iter.getValue() == regionId) {
+                    iter.setValueOff();
+                    --onVoxelCount;
+                }
+            }
+        }
+
+        mRegions[leafIndex] = regions;
+    }
+
+private:
+    IntAccessorT mIdxAcc;
+    std::vector<size_t>& mRegions;
+};
+
+
+////////////////////////////////////////
+
+
+/// @brief  Linear interpolation: returns the fraction along the segment from
+///         @a v0 to @a v1 at which the linearly interpolated value equals @a iso.
+/// @note   No guard is applied for @a v0 == @a v1.
+inline double
+evalRoot(double v0, double v1, double iso)
+{
+    const double numer = iso - v0;
+    const double denom = v1 - v0;
+    return numer / denom;
+}
+
+
+/// @brief Extracts the eight corner values for leaf inclusive cells.
+/// @details  Corners are read by linear offset from @a offset and written to
+///           @a values[0..7] in the same corner numbering that
+///           @c evalCellSigns uses for its sign bits. @a values must already
+///           hold at least eight elements.
+template<typename LeafT>
+inline void
+collectCornerValues(const LeafT& leaf, const Index offset, std::vector<double>& values)
+{
+    // Linear offset steps: +1 moves along k, +DIM along j, +DIM*DIM along i.
+    const Index jStep = LeafT::DIM;
+    const Index iStep = LeafT::DIM * LeafT::DIM;
+
+    values[0] = double(leaf.getValue(offset));                     // i,   j,   k
+    values[1] = double(leaf.getValue(offset + iStep));             // i+1, j,   k
+    values[2] = double(leaf.getValue(offset + iStep + 1));         // i+1, j,   k+1
+    values[3] = double(leaf.getValue(offset + 1));                 // i,   j,   k+1
+    values[4] = double(leaf.getValue(offset + jStep));             // i,   j+1, k
+    values[5] = double(leaf.getValue(offset + iStep + jStep));     // i+1, j+1, k
+    values[6] = double(leaf.getValue(offset + iStep + jStep + 1)); // i+1, j+1, k+1
+    values[7] = double(leaf.getValue(offset + jStep + 1));         // i,   j+1, k+1
+}
+
+
+/// @brief Extracts the eight corner values for a cell starting at the given @c ijk coordinate.
+/// @details  Results are written to @a values[0..7], which must already hold at
+///           least eight elements. Corner numbering matches @c evalCellSigns.
+template<typename AccessorT>
+inline void
+collectCornerValues(const AccessorT& acc, const Coord& ijk, std::vector<double>& values)
+{
+    // Corner offsets (di, dj, dk), listed in corner-index order.
+    static const int sCornerOffset[8][3] = {
+        {0, 0, 0}, // 0: i,   j,   k
+        {1, 0, 0}, // 1: i+1, j,   k
+        {1, 0, 1}, // 2: i+1, j,   k+1
+        {0, 0, 1}, // 3: i,   j,   k+1
+        {0, 1, 0}, // 4: i,   j+1, k
+        {1, 1, 0}, // 5: i+1, j+1, k
+        {1, 1, 1}, // 6: i+1, j+1, k+1
+        {0, 1, 1}  // 7: i,   j+1, k+1
+    };
+
+    Coord corner;
+    for (int n = 0; n < 8; ++n) {
+        corner.reset(ijk[0] + sCornerOffset[n][0],
+                     ijk[1] + sCornerOffset[n][1],
+                     ijk[2] + sCornerOffset[n][2]);
+        values[n] = double(acc.getValue(corner));
+    }
+}
+
+
+/// @brief  Computes the average cell point for a given edge group.
+/// @details  For every cell edge that @c sEdgeGroupTable assigns to
+///           @a edgeGroup under the @a signs configuration, the point where the
+///           edge crosses @a iso (linear interpolation of the two corner
+///           @a values) is accumulated in cell-local coordinates. The sum is
+///           divided by the sample count when more than one edge contributed.
+/// @return The averaged point, or (0, 0, 0) if no edge belongs to the group.
+inline Vec3d
+computePoint(const std::vector<double>& values, unsigned char signs,
+    unsigned char edgeGroup, double iso)
+{
+    // One row per cell edge, in sEdgeGroupTable column order (columns 1..12):
+    // { first corner, second corner, interpolated axis, unit offset in x, y, z }.
+    static const int sEdge[12][6] = {
+        {0, 1, 0,  0, 0, 0}, // Edge: 0 - 1
+        {1, 2, 2,  1, 0, 0}, // Edge: 1 - 2
+        {3, 2, 0,  0, 0, 1}, // Edge: 3 - 2
+        {0, 3, 2,  0, 0, 0}, // Edge: 0 - 3
+        {4, 5, 0,  0, 1, 0}, // Edge: 4 - 5
+        {5, 6, 2,  1, 1, 0}, // Edge: 5 - 6
+        {7, 6, 0,  0, 1, 1}, // Edge: 7 - 6
+        {4, 7, 2,  0, 1, 0}, // Edge: 4 - 7
+        {0, 4, 1,  0, 0, 0}, // Edge: 0 - 4
+        {1, 5, 1,  1, 0, 0}, // Edge: 1 - 5
+        {2, 6, 1,  1, 0, 1}, // Edge: 2 - 6
+        {3, 7, 1,  0, 0, 1}  // Edge: 3 - 7
+    };
+
+    Vec3d sum(0.0, 0.0, 0.0);
+    int count = 0;
+
+    for (int e = 0; e < 12; ++e) {
+        if (sEdgeGroupTable[signs][e + 1] != edgeGroup) continue;
+
+        const int* edge = sEdge[e];
+        for (int axis = 0; axis < 3; ++axis) {
+            if (axis == edge[2]) {
+                // Along the edge direction the position is the interpolated root.
+                sum[axis] += evalRoot(values[edge[0]], values[edge[1]], iso);
+            } else if (edge[3 + axis] != 0) {
+                // The edge lies on the far side of the cell along this axis.
+                sum[axis] += 1.0;
+            }
+        }
+        ++count;
+    }
+
+    if (count > 1) {
+        const double weight = 1.0 / double(count);
+        sum[0] *= weight;
+        sum[1] *= weight;
+        sum[2] *= weight;
+    }
+
+    return sum;
+}
+
+
+/// @brief  Computes the average cell point for a given edge group, ignoring edge
+///         samples present in the @c signsMask configuration.
+///
+/// @param avg        output; overwritten with the averaged point, or (0, 0, 0)
+///                   when no edge contributes
+/// @param values     the eight cell corner values, indexed by corner number (0-7)
+/// @param signs      sign configuration; selects the row of @c sEdgeGroupTable
+/// @param signsMask  second sign configuration; an edge is skipped when its
+///                   @c sEdgeGroupTable entry for this configuration is non-zero
+/// @param edgeGroup  id of the edge group (in @c signs) to average
+/// @param iso        isovalue forwarded to @c evalRoot
+///
+/// @return  The number of edge samples that contributed to @c avg.
+inline int
+computeMaskedPoint(Vec3d& avg, const std::vector<double>& values, unsigned char signs,
+    unsigned char signsMask, unsigned char edgeGroup, double iso)
+{
+    // One row per cube edge, in sEdgeGroupTable column order (columns 1-12):
+    // { first corner, second corner, axis that receives the interpolated root,
+    //   fixed x, fixed y, fixed z }. The fixed entry of the root axis is unused.
+    static const unsigned char edgeTable[12][6] = {
+        {0, 1, 0,  0, 0, 0}, // Edge  1: 0 - 1
+        {1, 2, 2,  1, 0, 0}, // Edge  2: 1 - 2
+        {3, 2, 0,  0, 0, 1}, // Edge  3: 3 - 2
+        {0, 3, 2,  0, 0, 0}, // Edge  4: 0 - 3
+        {4, 5, 0,  0, 1, 0}, // Edge  5: 4 - 5
+        {5, 6, 2,  1, 1, 0}, // Edge  6: 5 - 6
+        {7, 6, 0,  0, 1, 1}, // Edge  7: 7 - 6
+        {4, 7, 2,  0, 1, 0}, // Edge  8: 4 - 7
+        {0, 4, 1,  0, 0, 0}, // Edge  9: 0 - 4
+        {1, 5, 1,  1, 0, 0}, // Edge 10: 1 - 5
+        {2, 6, 1,  1, 0, 1}, // Edge 11: 2 - 6
+        {3, 7, 1,  0, 0, 1}  // Edge 12: 3 - 7
+    };
+
+    avg = Vec3d(0.0, 0.0, 0.0);
+    int count = 0;
+
+    for (int e = 0; e < 12; ++e) {
+
+        const bool inGroup = sEdgeGroupTable[signs][e + 1] == edgeGroup;
+        const bool masked = sEdgeGroupTable[signsMask][e + 1] != 0;
+        if (!inGroup || masked) continue;
+
+        const unsigned char* edge = edgeTable[e];
+
+        for (int axis = 0; axis < 3; ++axis) {
+            if (axis == int(edge[2])) {
+                // position of the iso crossing along the edge
+                avg[axis] += evalRoot(values[edge[0]], values[edge[1]], iso);
+            } else if (edge[3 + axis] != 0) {
+                avg[axis] += 1.0;
+            }
+        }
+
+        ++count;
+    }
+
+    // A single sample (or none) needs no normalization.
+    if (count > 1) {
+        const double scale = 1.0 / double(count);
+        avg[0] *= scale;
+        avg[1] *= scale;
+        avg[2] *= scale;
+    }
+
+    return count;
+}
+
+
+/// @brief  Computes the average cell point for a given edge group, by computing
+///         convex weights based on the distance from the sample point @c p.
+///
+/// @param p          reference point; edge samples closer to @c p get a larger weight
+/// @param values     the eight cell corner values, indexed by corner number (0-7)
+/// @param signs      sign configuration; selects the row of @c sEdgeGroupTable
+/// @param edgeGroup  id of the edge group whose edge samples are combined
+/// @param iso        isovalue forwarded to @c evalRoot
+///
+/// @return  The weighted average of the edge samples. Returns (0, 0, 0) when no
+///          edge belongs to @c edgeGroup (matching @c computePoint), the sample
+///          itself when there is exactly one, and the plain average when all
+///          samples coincide with @c p (the weights would otherwise sum to zero).
+inline Vec3d
+computeWeightedPoint(const Vec3d& p, const std::vector<double>& values,
+    unsigned char signs, unsigned char edgeGroup, double iso)
+{
+    // One row per cube edge, in sEdgeGroupTable column order (columns 1-12):
+    // { first corner, second corner, axis that receives the interpolated root,
+    //   fixed x, fixed y, fixed z }. The fixed entry of the root axis is overwritten.
+    static const unsigned char edgeTable[12][6] = {
+        {0, 1, 0,  0, 0, 0}, // Edge  1: 0 - 1
+        {1, 2, 2,  1, 0, 0}, // Edge  2: 1 - 2
+        {3, 2, 0,  0, 0, 1}, // Edge  3: 3 - 2
+        {0, 3, 2,  0, 0, 0}, // Edge  4: 0 - 3
+        {4, 5, 0,  0, 1, 0}, // Edge  5: 4 - 5
+        {5, 6, 2,  1, 1, 0}, // Edge  6: 5 - 6
+        {7, 6, 0,  0, 1, 1}, // Edge  7: 7 - 6
+        {4, 7, 2,  0, 1, 0}, // Edge  8: 4 - 7
+        {0, 4, 1,  0, 0, 0}, // Edge  9: 0 - 4
+        {1, 5, 1,  1, 0, 0}, // Edge 10: 1 - 5
+        {2, 6, 1,  1, 0, 1}, // Edge 11: 2 - 6
+        {3, 7, 1,  0, 0, 1}  // Edge 12: 3 - 7
+    };
+
+    std::vector<Vec3d> samples;
+    samples.reserve(8);
+
+    std::vector<double> weights;
+    weights.reserve(8);
+
+    Vec3d avg(0.0, 0.0, 0.0);
+
+    // Collect one sample per edge of the group, with its squared distance to p.
+    for (int e = 0; e < 12; ++e) {
+
+        if (sEdgeGroupTable[signs][e + 1] != edgeGroup) continue;
+
+        const unsigned char* edge = edgeTable[e];
+
+        avg[0] = double(edge[3]);
+        avg[1] = double(edge[4]);
+        avg[2] = double(edge[5]);
+        avg[int(edge[2])] = evalRoot(values[edge[0]], values[edge[1]], iso);
+
+        samples.push_back(avg);
+        weights.push_back((avg-p).lengthSqr());
+    }
+
+    avg[0] = 0.0;
+    avg[1] = 0.0;
+    avg[2] = 0.0;
+
+    // Calling front() on an empty vector is undefined behaviour; report the
+    // same zero point that computePoint() yields for an empty group.
+    if (samples.empty()) return avg;
+
+    if (samples.size() == 1) return samples.front();
+
+
+    double minWeight = std::numeric_limits<double>::max();
+    double maxWeight = -std::numeric_limits<double>::max();
+
+    for (size_t i = 0, I = weights.size(); i < I; ++i) {
+        minWeight = std::min(minWeight, weights[i]);
+        maxWeight = std::max(maxWeight, weights[i]);
+    }
+
+    // Flip the distances so that the nearest sample gets the largest weight;
+    // the extra 10% of the minimum keeps the farthest sample's weight above zero
+    // unless the minimum distance itself is zero.
+    const double offset = maxWeight + minWeight * 0.1;
+    for (size_t i = 0, I = weights.size(); i < I; ++i) {
+        weights[i] = offset - weights[i];
+    }
+
+
+    double weightSum = 0.0;
+    for (size_t i = 0, I = weights.size(); i < I; ++i) {
+        weightSum += weights[i];
+    }
+
+    if (weightSum > 0.0) {
+        for (size_t i = 0, I = samples.size(); i < I; ++i) {
+            avg += samples[i] * (weights[i] / weightSum);
+        }
+    } else {
+        // All weights are zero, i.e. every sample coincides with p: dividing by
+        // weightSum would produce NaNs, so fall back to uniform weights.
+        const double w = 1.0 / double(samples.size());
+        for (size_t i = 0, I = samples.size(); i < I; ++i) {
+            avg += samples[i] * w;
+        }
+    }
+
+    return avg;
+}
+
+
+/// @brief  Computes the average cell points defined by the sign configuration
+///         @c signs and the given corner values @c values.
+///
+/// Appends one point per edge group of the configuration to @c points; the
+/// group count is read from column 0 of @c sEdgeGroupTable and group ids
+/// start at 1. @c points is not cleared first.
+inline void
+computeCellPoints(std::vector<Vec3d>& points,
+    const std::vector<double>& values, unsigned char signs, double iso)
+{
+    const size_t groupCount = sEdgeGroupTable[signs][0];
+
+    for (size_t group = 1; group <= groupCount; ++group) {
+        const Vec3d cellPoint = computePoint(values, signs, uint8_t(group), iso);
+        points.push_back(cellPoint);
+    }
+}
+
+
+/// @brief  Given a sign configuration @c lhsSigns and an edge group @c groupId,
+///         finds the corresponding edge group in a different sign configuration
+///         @c rhsSigns. Returns -1 if no match is found.
+///
+/// Edges are scanned in table order (columns 1-12); the first edge that belongs
+/// to @c groupId in @c lhsSigns and to any group in @c rhsSigns decides the match.
+inline int
+matchEdgeGroup(unsigned char groupId, unsigned char lhsSigns, unsigned char rhsSigns)
+{
+    for (size_t edge = 1; edge <= 12; ++edge) {
+
+        const int rhsGroup = sEdgeGroupTable[rhsSigns][edge];
+
+        if (rhsGroup != 0 && sEdgeGroupTable[lhsSigns][edge] == groupId) {
+            return rhsGroup;
+        }
+    }
+
+    return -1;
+}
+
+
+/// @brief  Computes the average cell points defined by the sign configuration
+///         @c signs and the given corner values @c values. Combines data from
+///         two different level sets to eliminate seam lines when meshing
+///         fractured segments.
+///
+/// For every edge group of @c lhsSigns exactly one entry is appended to both
+/// @c points and @c weightedPointMask. Groups with a counterpart in @c rhsSigns
+/// are computed from @c rhsValues: weighted towards the stored seam point when
+/// that point is flagged dirty and not invalid (mask entry @c true), plainly
+/// averaged otherwise. Groups without a counterpart are averaged from @c lhsValues.
+///
+/// @param pointIdx    index in @c seamPoints of the entry for edge group 1 of @c rhsSigns
+/// @param seamPoints  quantized seam points, indexed by @c pointIdx + (group id - 1)
+inline void
+computeCellPoints(std::vector<Vec3d>& points, std::vector<bool>& weightedPointMask,
+    const std::vector<double>& lhsValues, const std::vector<double>& rhsValues,
+    unsigned char lhsSigns, unsigned char rhsSigns,
+    double iso, size_t pointIdx, const boost::scoped_array<uint32_t>& seamPoints)
+{
+    const size_t groupCount = sEdgeGroupTable[lhsSigns][0];
+
+    for (size_t group = 1; group <= groupCount; ++group) {
+
+        const int id = matchEdgeGroup(uint8_t(group), lhsSigns, rhsSigns);
+
+        if (id == -1) {
+            // No counterpart in the rhs configuration, use the lhs data as is.
+            points.push_back(computePoint(lhsValues, lhsSigns, uint8_t(group), iso));
+            weightedPointMask.push_back(false);
+            continue;
+        }
+
+        const unsigned char rhsGroup = uint8_t(id);
+        const uint32_t& quantizedPoint = seamPoints[pointIdx + (id - 1)];
+
+        const bool hasSeamPoint =
+            (quantizedPoint & MASK_DIRTY_BIT) && !(quantizedPoint & MASK_INVALID_BIT);
+
+        if (hasSeamPoint) {
+            const Vec3d seamPoint = unpackPoint(quantizedPoint);
+            points.push_back(
+                computeWeightedPoint(seamPoint, rhsValues, rhsSigns, rhsGroup, iso));
+        } else {
+            points.push_back(computePoint(rhsValues, rhsSigns, rhsGroup, iso));
+        }
+
+        weightedPointMask.push_back(hasSeamPoint);
+    }
+}
+
+
+/// @brief  Functor that generates the mesh points for the leaf nodes of a
+///         sign-flag tree and records each voxel's point index in an index tree.
+///
+/// The object is passed by value to @c tbb::parallel_for (see run()), so all
+/// shared state is held through references and pointers.
+template <typename TreeT, typename LeafManagerT>
+class GenPoints
+{
+public:
+    typedef tree::ValueAccessor<const TreeT> AccessorT;
+
+    typedef typename TreeT::template ValueConverter<int>::Type IntTreeT;
+    typedef tree::ValueAccessor<IntTreeT> IntAccessorT;
+    typedef tree::ValueAccessor<const IntTreeT> IntCAccessorT;
+
+    typedef typename TreeT::template ValueConverter<Int16>::Type Int16TreeT;
+    typedef tree::ValueAccessor<const Int16TreeT> Int16CAccessorT;
+
+    typedef boost::scoped_array<uint32_t> QuantizedPointList;
+
+    //////////
+
+
+    /// @param signLeafs  leaf nodes holding the per-voxel sign flags
+    /// @param distTree   tree the cell corner values are sampled from
+    /// @param idxTree    index tree; voxel values are overwritten with point indices
+    /// @param points     output point list, written at the computed point indices
+    /// @param indices    per-leaf index of the first point to write
+    /// @param xform      transform used to map index-space points to world space
+    /// @param iso        isovalue
+    GenPoints(const LeafManagerT& signLeafs, const TreeT& distTree,
+        IntTreeT& idxTree, PointList& points, std::vector<size_t>& indices,
+        const math::Transform& xform, double iso);
+
+    /// Processes all leaf nodes, in parallel unless @c threaded is false.
+    void run(bool threaded = true);
+
+    /// Sets the optional reference data used for seam cells; every argument may be NULL.
+    void setRefData(const Int16TreeT* refSignTree = NULL, const TreeT* refDistTree = NULL,
+        IntTreeT* refIdxTree = NULL, const QuantizedPointList* seamPoints = NULL,
+        std::vector<unsigned char>* seamPointMask = NULL);
+
+    //////////
+
+
+    /// Processes the leaf nodes in the given index range.
+    void operator()(const tbb::blocked_range<size_t>&) const;
+
+private:
+    const LeafManagerT& mSignLeafs;
+
+    AccessorT mDistAcc;
+    IntTreeT& mIdxTree;
+
+    PointList& mPoints;
+    std::vector<size_t>& mIndices;
+    const math::Transform& mTransform;
+    const double mIsovalue;
+
+    // reference data
+    const Int16TreeT *mRefSignTreePt;
+    const TreeT* mRefDistTreePt;
+    const IntTreeT* mRefIdxTreePt;
+    const QuantizedPointList* mSeamPointsPt;
+    std::vector<unsigned char>* mSeamPointMaskPt;
+};
+
+
+/// Binds the input and output data; all reference-data pointers start out NULL
+/// until setRefData() is called.
+template <typename TreeT, typename LeafManagerT>
+GenPoints<TreeT, LeafManagerT>::GenPoints(
+    const LeafManagerT& signLeafs,
+    const TreeT& distTree,
+    IntTreeT& idxTree,
+    PointList& points,
+    std::vector<size_t>& indices,
+    const math::Transform& xform,
+    double iso):
+    mSignLeafs(signLeafs),
+    mDistAcc(distTree),
+    mIdxTree(idxTree),
+    mPoints(points),
+    mIndices(indices),
+    mTransform(xform),
+    mIsovalue(iso),
+    mRefSignTreePt(NULL),
+    mRefDistTreePt(NULL),
+    mRefIdxTreePt(NULL),
+    mSeamPointsPt(NULL),
+    mSeamPointMaskPt(NULL)
+{
+}
+
+
+/// Runs the functor over every sign leaf: through @c tbb::parallel_for when
+/// @c threaded is true, otherwise as one direct call on the full leaf range.
+template <typename TreeT, typename LeafManagerT>
+void
+GenPoints<TreeT, LeafManagerT>::run(bool threaded)
+{
+    if (!threaded) {
+        (*this)(mSignLeafs.getRange());
+        return;
+    }
+
+    tbb::parallel_for(mSignLeafs.getRange(), *this);
+}
+
+
+/// Stores the reference-data pointers used by operator() for seam cells.
+/// The pointers are kept as given (no copies are made) and may be NULL.
+template <typename TreeT, typename LeafManagerT>
+void
+GenPoints<TreeT, LeafManagerT>::setRefData(
+    const Int16TreeT *refSignTree,
+    const TreeT *refDistTree,
+    IntTreeT* refIdxTree,
+    const QuantizedPointList* seamPoints,
+    std::vector<unsigned char>* seamPointMask)
+{
+    // seam point data
+    mSeamPointsPt = seamPoints;
+    mSeamPointMaskPt = seamPointMask;
+
+    // reference trees
+    mRefSignTreePt = refSignTree;
+    mRefDistTreePt = refDistTree;
+    mRefIdxTreePt = refIdxTree;
+}
+
+
+/// @brief  Generates the points for the sign leafs in @c range.
+///
+/// Each leaf is handled in two passes over the active voxels of its index leaf:
+///  1. voxels whose index value is 0 get one point per edge group of their cell;
+///  2. the remaining voxels are grouped by their (non-zero) index value and
+///     each group gets a single averaged point.
+/// In both passes a visited voxel's value is replaced with the index of its
+/// (first) point and the voxel is deactivated. Points are written to @c mPoints
+/// in world space, starting at @c mIndices[n] for leaf @c n.
+template <typename TreeT, typename LeafManagerT>
+void
+GenPoints<TreeT, LeafManagerT>::operator()(const tbb::blocked_range<size_t>& range) const
+{
+    typename IntTreeT::LeafNodeType::ValueOnIter iter;
+    unsigned char signs, refSigns;
+    Index offset;
+    Coord ijk, coord;
+    // Scratch buffers; the point buffers are cleared before every use below.
+    std::vector<Vec3d> points(4);
+    std::vector<bool> weightedPointMask(4);
+    std::vector<double> values(8), refValues(8);
+
+
+    IntAccessorT idxAcc(mIdxTree);
+
+    // reference data accessors
+    // (only created for the reference trees that were supplied via setRefData())
+    boost::scoped_ptr<Int16CAccessorT> refSignAcc;
+    if (mRefSignTreePt) refSignAcc.reset(new Int16CAccessorT(*mRefSignTreePt));
+
+    boost::scoped_ptr<IntCAccessorT> refIdxAcc;
+    if (mRefIdxTreePt) refIdxAcc.reset(new IntCAccessorT(*mRefIdxTreePt));
+
+    boost::scoped_ptr<AccessorT> refDistAcc;
+    if (mRefDistTreePt) refDistAcc.reset(new AccessorT(*mRefDistTreePt));
+
+
+    for (size_t n = range.begin(); n != range.end(); ++n) {
+
+        coord = mSignLeafs.leaf(n).origin();
+
+        // NOTE(review): leafPt and idxLeafPt are dereferenced below without a
+        // NULL check; presumably the caller guarantees that both trees contain
+        // a leaf at every sign leaf origin -- confirm.
+        const typename TreeT::LeafNodeType *leafPt = mDistAcc.probeConstLeaf(coord);
+        typename IntTreeT::LeafNodeType *idxLeafPt = idxAcc.probeLeaf(coord);
+
+
+        // reference data leafs
+        const typename Int16TreeT::LeafNodeType *refSignLeafPt = NULL;
+        if (refSignAcc) refSignLeafPt = refSignAcc->probeConstLeaf(coord);
+
+        const typename IntTreeT::LeafNodeType *refIdxLeafPt = NULL;
+        if (refIdxAcc) refIdxLeafPt = refIdxAcc->probeConstLeaf(coord);
+
+        const typename TreeT::LeafNodeType *refDistLeafPt = NULL;
+        if (refDistAcc) refDistLeafPt = refDistAcc->probeConstLeaf(coord);
+
+
+        // generate cell points
+        size_t ptnIdx = mIndices[n];
+        // coord now holds the last voxel coordinate of the leaf along each axis
+        coord.offset(TreeT::LeafNodeType::DIM - 1);
+
+
+
+        for (iter = idxLeafPt->beginValueOn(); iter; ++iter) {
+
+            // non-zero values are handled by the collapsed-region pass below
+            if(iter.getValue() != 0) continue;
+
+            iter.setValue(static_cast<typename IntTreeT::ValueType>(ptnIdx));
+            iter.setValueOff();
+            offset = iter.pos();
+            ijk = iter.getCoord();
+
+            // true if the voxel is not on the upper boundary of the leaf in any axis,
+            // in which case the corner values are read directly from the leaf
+            const bool inclusiveCell = ijk[0] < coord[0] && ijk[1] < coord[1] && ijk[2] < coord[2];
+
+            const Int16& flags = mSignLeafs.leaf(n).getValue(offset);
+            signs    = uint8_t(SIGNS & flags);
+            refSigns = 0;
+
+            // pick up the reference sign configuration for seam voxels
+            if ((flags & SEAM) && refSignLeafPt && refIdxLeafPt) {
+                if (refSignLeafPt->isValueOn(offset)) {
+                    refSigns = uint8_t(SIGNS & refSignLeafPt->getValue(offset));
+                }
+            }
+
+
+            if (inclusiveCell) collectCornerValues(*leafPt, offset, values);
+            else collectCornerValues(mDistAcc, ijk, values);
+
+
+            points.clear();
+            weightedPointMask.clear();
+
+            if (refSigns == 0) {
+                computeCellPoints(points, values, signs, mIsovalue);
+            } else {
+
+                // NOTE(review): refDistLeafPt, refDistAcc and mSeamPointsPt are
+                // dereferenced here although only refSignLeafPt and refIdxLeafPt
+                // were checked above -- confirm that setRefData() is always
+                // called with a complete set of reference data.
+                if (inclusiveCell) collectCornerValues(*refDistLeafPt, offset, refValues);
+                else collectCornerValues(*refDistAcc, ijk, refValues);
+
+                computeCellPoints(points, weightedPointMask, values, refValues, signs, refSigns,
+                    mIsovalue, refIdxLeafPt->getValue(offset), *mSeamPointsPt);
+            }
+
+
+            for (size_t i = 0, I = points.size(); i < I; ++i) {
+
+                // offset by cell-origin
+                points[i][0] += double(ijk[0]);
+                points[i][1] += double(ijk[1]);
+                points[i][2] += double(ijk[2]);
+
+
+                points[i] = mTransform.indexToWorld(points[i]);
+
+                mPoints[ptnIdx][0] = float(points[i][0]);
+                mPoints[ptnIdx][1] = float(points[i][1]);
+                mPoints[ptnIdx][2] = float(points[i][2]);
+
+                // weightedPointMask is only filled by the seam overload of computeCellPoints
+                if (mSeamPointMaskPt && !weightedPointMask.empty() && weightedPointMask[i]) {
+                    (*mSeamPointMaskPt)[ptnIdx] = 1;
+                }
+
+                ++ptnIdx;
+            }
+        }
+
+        // generate collapsed region points
+        int onVoxelCount = int(idxLeafPt->onVoxelCount());
+        while (onVoxelCount > 0) {
+
+            // the first voxel that is still active selects the region to process
+            iter = idxLeafPt->beginValueOn();
+            int regionId = iter.getValue(), count = 0;
+
+            Vec3d avg(0.0), point;
+
+            for (; iter; ++iter) {
+                if (iter.getValue() != regionId) continue;
+
+                // every voxel of the region refers to the same point
+                iter.setValue(static_cast<typename IntTreeT::ValueType>(ptnIdx));
+                iter.setValueOff();
+                --onVoxelCount;
+
+                ijk = iter.getCoord();
+                offset = iter.pos();
+
+                signs = uint8_t(SIGNS & mSignLeafs.leaf(n).getValue(offset));
+
+                if (ijk[0] < coord[0] && ijk[1] < coord[1] && ijk[2] < coord[2]) {
+                    collectCornerValues(*leafPt, offset, values);
+                } else {
+                    collectCornerValues(mDistAcc, ijk, values);
+                }
+
+                points.clear();
+                computeCellPoints(points, values, signs, mIsovalue);
+
+                // NOTE(review): points[0] assumes the sign configuration has at
+                // least one edge group -- confirm for all merged voxels.
+                avg[0] += double(ijk[0]) + points[0][0];
+                avg[1] += double(ijk[1]) + points[0][1];
+                avg[2] += double(ijk[2]) + points[0][2];
+
+                ++count;
+            }
+
+
+            if (count > 1) {
+                double w = 1.0 / double(count);
+                avg[0] *= w;
+                avg[1] *= w;
+                avg[2] *= w;
+            }
+
+            avg = mTransform.indexToWorld(avg);
+
+            mPoints[ptnIdx][0] = float(avg[0]);
+            mPoints[ptnIdx][1] = float(avg[1]);
+            mPoints[ptnIdx][2] = float(avg[2]);
+
+            ++ptnIdx;
+        }
+    }
+}
+
+
+////////////////////////////////////////
+
+
+/// @brief  Per-leaf functor that fills a list of quantized seam points from
+///         the sign-flag leafs that are flagged as seam voxels.
+///
+/// operator() takes a leaf node and a leaf index, so the object is presumably
+/// meant to be applied to every leaf by a leaf manager -- the call site is not
+/// part of this class.
+template<typename TreeT>
+class SeamWeights
+{
+public:
+    typedef tree::ValueAccessor<const TreeT> AccessorT;
+
+    typedef typename TreeT::template ValueConverter<int>::Type IntTreeT;
+    typedef tree::ValueAccessor<const IntTreeT> IntAccessorT;
+
+    typedef typename TreeT::template ValueConverter<Int16>::Type Int16TreeT;
+    typedef tree::ValueAccessor<const Int16TreeT> Int16AccessorT;
+
+    typedef boost::scoped_array<uint32_t> QuantizedPointList;
+
+    //////////
+
+    /// @param distTree     tree the cell corner values are sampled from
+    /// @param refSignTree  reference sign-flag tree
+    /// @param refIdxTree   reference index tree; its values index into @c points
+    /// @param points       quantized point list that is written to
+    /// @param iso          isovalue
+    SeamWeights(const TreeT& distTree, const Int16TreeT& refSignTree,
+        IntTreeT& refIdxTree, QuantizedPointList& points, double iso);
+
+    /// Processes one sign-flag leaf; the leaf index argument is unused.
+    template <typename LeafNodeType>
+    void operator()(LeafNodeType &signLeaf, size_t leafIndex) const;
+
+private:
+    // Accessors into the three input trees (read-only).
+    AccessorT mDistAcc;
+    Int16AccessorT mRefSignAcc;
+    IntAccessorT mRefIdxAcc;
+
+    // Output list, held by reference.
+    QuantizedPointList& mPoints;
+    const double mIsovalue;
+};
+
+
+/// Creates the accessors for the three input trees and binds the output
+/// point list and the isovalue.
+template<typename TreeT>
+SeamWeights<TreeT>::SeamWeights(
+    const TreeT& distTree,
+    const Int16TreeT& refSignTree,
+    IntTreeT& refIdxTree,
+    QuantizedPointList& points,
+    double iso):
+    mDistAcc(distTree),
+    mRefSignAcc(refSignTree),
+    mRefIdxAcc(refIdxTree),
+    mPoints(points),
+    mIsovalue(iso)
+{
+}
+
+
+/// @brief  Computes the quantized seam points for one sign-flag leaf.
+///
+/// Only active voxels that carry the @c SEAM flag and are also active in the
+/// reference sign leaf are considered. For every edge group of such a voxel
+/// that has a counterpart in the reference configuration, the matching entry
+/// of @c mPoints is filled in once: with the packed masked cell point, or with
+/// @c MASK_INVALID_BIT if no edge sample was available. Entries already marked
+/// with @c MASK_DIRTY_BIT are left untouched.
+template<typename TreeT>
+template <typename LeafNodeType>
+void
+SeamWeights<TreeT>::operator()(LeafNodeType &signLeaf, size_t /*leafIndex*/) const
+{
+    Coord coord = signLeaf.origin();
+    const typename Int16TreeT::LeafNodeType *refSignLeafPt = mRefSignAcc.probeConstLeaf(coord);
+
+    // Nothing to do without reference sign data for this leaf.
+    if (!refSignLeafPt) return;
+
+    const typename TreeT::LeafNodeType *distLeafPt = mDistAcc.probeConstLeaf(coord);
+    const typename IntTreeT::LeafNodeType *refIdxLeafPt = mRefIdxAcc.probeConstLeaf(coord);
+
+    // From here on coord is the last voxel coordinate of the leaf along each axis.
+    coord.offset(TreeT::LeafNodeType::DIM - 1);
+
+    std::vector<double> cornerValues(8);
+    unsigned char lhsSigns, rhsSigns;
+    Vec3d seamPoint;
+    Index offset;
+    Coord ijk;
+
+    for (typename LeafNodeType::ValueOnCIter iter = signLeaf.cbeginValueOn(); iter; ++iter) {
+
+        if (!(iter.getValue() & SEAM)) continue;
+
+        offset = iter.pos();
+        if (!refSignLeafPt->isValueOn(offset)) continue;
+
+        ijk = iter.getCoord();
+
+        lhsSigns = uint8_t(SIGNS & iter.getValue());
+        rhsSigns = uint8_t(SIGNS & refSignLeafPt->getValue(offset));
+
+        // Voxels away from the upper leaf boundary are read from the leaf directly.
+        const bool interiorCell =
+            ijk[0] < coord[0] && ijk[1] < coord[1] && ijk[2] < coord[2];
+
+        if (interiorCell) {
+            collectCornerValues(*distLeafPt, offset, cornerValues);
+        } else {
+            collectCornerValues(mDistAcc, ijk, cornerValues);
+        }
+
+        const size_t groupCount = sEdgeGroupTable[lhsSigns][0];
+
+        for (size_t group = 1; group <= groupCount; ++group) {
+
+            const int id = matchEdgeGroup(uint8_t(group), lhsSigns, rhsSigns);
+            if (id == -1) continue;
+
+            uint32_t& data = mPoints[refIdxLeafPt->getValue(offset) + (id - 1)];
+
+            // Already computed.
+            if (data & MASK_DIRTY_BIT) continue;
+
+            const int sampleCount = computeMaskedPoint(
+                seamPoint, cornerValues, lhsSigns, rhsSigns, uint8_t(group), mIsovalue);
+
+            if (sampleCount > 0) {
+                data = packPoint(seamPoint);
+            } else {
+                data = MASK_INVALID_BIT;
+            }
+
+            data |= MASK_DIRTY_BIT;
+        }
+    }
+}
+
+
+////////////////////////////////////////
+
+
+/// @brief  Functor that merges the voxels of each sign leaf into regions and
+///         records the region ids in an index tree.
+///
+/// The merge criterion is controlled by an adaptivity value that can
+/// optionally be scaled by a spatially varying field, restricted by a mask,
+/// and replaced by a second ("internal") adaptivity for leafs without
+/// reference sign data (see the setters below).
+template <typename TreeT, typename LeafManagerT>
+class MergeVoxelRegions
+{
+public:
+    typedef typename TreeT::ValueType ValueT;
+    typedef tree::ValueAccessor<const TreeT> AccessorT;
+
+    typedef typename TreeT::template ValueConverter<int>::Type IntTreeT;
+    typedef tree::ValueAccessor<IntTreeT> IntAccessorT;
+
+    typedef typename TreeT::template ValueConverter<bool>::Type BoolTreeT;
+
+    typedef typename LeafManagerT::TreeType::template ValueConverter<Int16>::Type Int16TreeT;
+    typedef tree::ValueAccessor<const Int16TreeT> Int16AccessorT;
+
+    typedef typename TreeT::template ValueConverter<float>::Type FloatTreeT;
+    typedef Grid<FloatTreeT> FloatGridT;
+
+
+    //////////
+
+    /// @param signLeafs   leaf nodes holding the per-voxel sign flags
+    /// @param signTree    the sign-flag tree
+    /// @param distTree    tree the distance values and gradients are read from
+    /// @param idxTree     index tree that receives the region ids
+    /// @param iso         isovalue
+    /// @param adaptivity  initial value of both the surface and the internal adaptivity
+    MergeVoxelRegions(const LeafManagerT& signLeafs, const Int16TreeT& signTree,
+        const TreeT& distTree, IntTreeT& idxTree, ValueT iso, ValueT adaptivity);
+
+    /// Processes all leaf nodes, in parallel unless @c threaded is false.
+    void run(bool threaded = true);
+
+    /// Enables per-voxel scaling of the adaptivity by @c adaptivityField;
+    /// @c distGridXForm is used to map voxel coordinates to world space.
+    /// Both arguments are stored by address.
+    void setSpatialAdaptivity(
+        const math::Transform& distGridXForm, const FloatGridT& adaptivityField);
+
+    /// Sets an optional mask whose active voxels are excluded from merging (may be NULL).
+    void setAdaptivityMask(const BoolTreeT* mask);
+
+    /// Sets the reference sign tree and the adaptivity used for leafs that
+    /// have no leaf in that tree.
+    void setRefData(const Int16TreeT* signTree, ValueT adaptivity);
+
+    //////////
+
+
+    /// Processes the leaf nodes in the given index range.
+    void operator()(const tbb::blocked_range<size_t>&) const;
+
+private:
+
+    const LeafManagerT& mSignLeafs;
+
+    // Each accessor is initialized from the tree reference declared just
+    // before it, so the declaration order of these members matters.
+    const Int16TreeT& mSignTree;
+    Int16AccessorT mSignAcc;
+
+    const TreeT& mDistTree;
+    AccessorT mDistAcc;
+
+    IntTreeT& mIdxTree;
+    ValueT mIsovalue, mSurfaceAdaptivity, mInternalAdaptivity;
+
+    // Optional data, NULL unless set through the corresponding setter.
+    const math::Transform* mTransform;
+    const FloatGridT* mAdaptivityGrid;
+    const BoolTreeT* mAdaptivityMask;
+
+    const Int16TreeT* mRefSignTree;
+};
+
+
+/// Binds the input and output trees. The surface and the internal adaptivity
+/// both start out as @c adaptivity, and all optional data pointers are NULL.
+template <typename TreeT, typename LeafManagerT>
+MergeVoxelRegions<TreeT, LeafManagerT>::MergeVoxelRegions(
+    const LeafManagerT& signLeafs,
+    const Int16TreeT& signTree,
+    const TreeT& distTree,
+    IntTreeT& idxTree,
+    ValueT iso,
+    ValueT adaptivity):
+    mSignLeafs(signLeafs),
+    mSignTree(signTree),
+    mSignAcc(mSignTree),
+    mDistTree(distTree),
+    mDistAcc(mDistTree),
+    mIdxTree(idxTree),
+    mIsovalue(iso),
+    mSurfaceAdaptivity(adaptivity),
+    mInternalAdaptivity(adaptivity),
+    mTransform(NULL),
+    mAdaptivityGrid(NULL),
+    mAdaptivityMask(NULL),
+    mRefSignTree(NULL)
+{
+}
+
+
+/// Runs the functor over every sign leaf: through @c tbb::parallel_for when
+/// @c threaded is true, otherwise as one direct call on the full leaf range.
+template <typename TreeT, typename LeafManagerT>
+void
+MergeVoxelRegions<TreeT, LeafManagerT>::run(bool threaded)
+{
+    if (!threaded) {
+        (*this)(mSignLeafs.getRange());
+        return;
+    }
+
+    tbb::parallel_for(mSignLeafs.getRange(), *this);
+}
+
+
+/// Stores the addresses of the adaptivity field and of the transform used to
+/// look it up; both objects must outlive this functor.
+template <typename TreeT, typename LeafManagerT>
+void
+MergeVoxelRegions<TreeT, LeafManagerT>::setSpatialAdaptivity(
+    const math::Transform& distGridXForm, const FloatGridT& adaptivityField)
+{
+    mAdaptivityGrid = &adaptivityField;
+    mTransform = &distGridXForm;
+}
+
+
+/// Stores the pointer to the optional adaptivity mask (may be NULL).
+template <typename TreeT, typename LeafManagerT>
+void
+MergeVoxelRegions<TreeT, LeafManagerT>::setAdaptivityMask(const BoolTreeT* mask)
+{
+    this->mAdaptivityMask = mask;
+}
+
+/// Sets the reference sign tree together with the adaptivity applied to the
+/// leafs that have no corresponding leaf in that tree.
+template <typename TreeT, typename LeafManagerT>
+void
+MergeVoxelRegions<TreeT, LeafManagerT>::setRefData(const Int16TreeT* signTree, ValueT adaptivity)
+{
+    mInternalAdaptivity = adaptivity;
+    mRefSignTree = signTree;
+}
+
+
+/// @brief  Merges the voxels of the sign leafs in @c range into regions.
+///
+/// For every leaf a boolean mask of blocks that must not be merged is built
+/// (seam-adjacent voxels from the adaptivity mask, non-adaptable or
+/// multi-group sign configurations, blocks reported by @c isNonManifold),
+/// gradients are computed for the remaining active voxels, and blocks of
+/// doubling size (2, 4, ... up to the leaf dimension) are then merged via
+/// @c mergeVoxels whenever they are not masked and @c isMergable accepts them.
+/// Region ids start at 1 for every leaf.
+template <typename TreeT, typename LeafManagerT>
+void
+MergeVoxelRegions<TreeT, LeafManagerT>::operator()(const tbb::blocked_range<size_t>& range) const
+{
+    typedef math::Vec3<ValueT> Vec3T;
+
+    typedef typename TreeT::LeafNodeType LeafT;
+    typedef typename IntTreeT::LeafNodeType IntLeafT;
+    typedef typename BoolTreeT::LeafNodeType BoolLeafT;
+    typedef typename LeafT::template ValueConverter<Vec3T>::Type Vec3LeafT;
+
+    const int LeafDim = LeafT::DIM;
+
+    IntAccessorT idxAcc(mIdxTree);
+
+    typename LeafManagerT::TreeType::LeafNodeType::ValueOnCIter iter;
+
+    // Accessors for the optional inputs; each stays empty if the input was not set.
+    typedef typename tree::ValueAccessor<const FloatTreeT> FloatTreeCAccessorT;
+    boost::scoped_ptr<FloatTreeCAccessorT> adaptivityAcc;
+    if (mAdaptivityGrid) {
+        adaptivityAcc.reset(new FloatTreeCAccessorT(mAdaptivityGrid->tree()));
+    }
+
+    typedef typename tree::ValueAccessor<const Int16TreeT> Int16TreeCAccessorT;
+    boost::scoped_ptr<Int16TreeCAccessorT> refAcc;
+    if (mRefSignTree) {
+        refAcc.reset(new Int16TreeCAccessorT(*mRefSignTree));
+    }
+
+    typedef typename tree::ValueAccessor<const BoolTreeT> BoolTreeCAccessorT;
+    boost::scoped_ptr<BoolTreeCAccessorT> maskAcc;
+    if (mAdaptivityMask) {
+        maskAcc.reset(new BoolTreeCAccessorT(*mAdaptivityMask));
+    }
+
+
+    // Scratch leafs, reset for every processed leaf.
+    BoolLeafT mask;
+    Vec3LeafT gradients;
+    Coord ijk, end;
+
+    for (size_t n = range.begin(); n != range.end(); ++n) {
+
+        mask.setValuesOff();
+
+        const Coord& origin = mSignLeafs.leaf(n).origin();
+
+        // Leafs without reference sign data use the internal adaptivity.
+        ValueT adaptivity = (refAcc && !refAcc->probeConstLeaf(origin)) ?
+            mInternalAdaptivity : mSurfaceAdaptivity;
+
+        // NOTE(review): assumes the index tree has a leaf at every sign leaf
+        // origin; probeLeaf() is dereferenced without a NULL check.
+        IntLeafT& idxLeaf = *idxAcc.probeLeaf(origin);
+
+        end[0] = origin[0] + LeafDim;
+        end[1] = origin[1] + LeafDim;
+        end[2] = origin[2] + LeafDim;
+
+        // Mask off seam line adjacent voxels
+        // (the low coordinate bit is cleared, so the flag is stored at the
+        // even-coordinate voxel of the enclosing 2x2x2 block)
+        if (maskAcc) {
+            const BoolLeafT* maskLeaf = maskAcc->probeConstLeaf(origin);
+            if (maskLeaf != NULL) {
+                typename BoolLeafT::ValueOnCIter it;
+                for (it = maskLeaf->cbeginValueOn(); it; ++it) {
+                    mask.setActiveState(it.getCoord() & ~1u, true);
+                }
+            }
+        }
+
+        // Set region adaptivity
+        LeafT adaptivityLeaf(origin, adaptivity);
+        if (mAdaptivityGrid) {
+            for (Index offset = 0; offset < LeafT::NUM_VALUES; ++offset) {
+                ijk = adaptivityLeaf.offsetToGlobalCoord(offset);
+                // voxel -> world -> index space of the adaptivity grid
+                Vec3d xyz = mAdaptivityGrid->transform().worldToIndex(
+                    mTransform->indexToWorld(ijk));
+                ValueT tmpA = ValueT(adaptivityAcc->getValue(util::nearestCoord(xyz)));
+                adaptivityLeaf.setValueOnly(offset, tmpA * adaptivity);
+            }
+        }
+
+        // Mask off ambiguous voxels
+        for (iter = mSignLeafs.leaf(n).cbeginValueOn(); iter; ++iter) {
+            unsigned char signs = static_cast<unsigned char>(SIGNS & int(iter.getValue()));
+            if (!sAdaptable[signs] || sEdgeGroupTable[signs][0] > 1) {
+                mask.setActiveState(iter.getCoord() & ~1u, true);
+            }
+        }
+
+        // Mask off topologically ambiguous 2x2x2 voxel sub-blocks
+        int dim = 2;
+        for (ijk[0] = origin[0]; ijk[0] < end[0]; ijk[0] += dim) {
+            for (ijk[1] = origin[1]; ijk[1] < end[1]; ijk[1] += dim) {
+                for (ijk[2] = origin[2]; ijk[2] < end[2]; ijk[2] += dim) {
+                    // Logical AND: blocks that are already masked skip the
+                    // isNonManifold() test altogether.
+                    if (!mask.isValueOn(ijk) && isNonManifold(mDistAcc, ijk, mIsovalue, dim)) {
+                        mask.setActiveState(ijk, true);
+                    }
+                }
+            }
+        }
+
+        // Compute the gradient for the remaining voxels
+        gradients.setValuesOff();
+        for (iter = mSignLeafs.leaf(n).cbeginValueOn(); iter; ++iter) {
+            ijk = iter.getCoord();
+            if(!mask.isValueOn(ijk & ~1u)) {
+                Vec3T dir(math::ISGradient<math::CD_2ND>::result(mDistAcc, ijk));
+                dir.normalize();
+                gradients.setValueOn(iter.pos(), dir);
+            }
+        }
+
+        // Merge regions
+        int regionId = 1;
+        for ( ; dim <= LeafDim; dim = dim << 1) {
+            // clears the coordinate bits below the next larger block size, so a
+            // rejected block also flags the block that contains it
+            const unsigned coordMask = ~((dim << 1) - 1);
+            for (ijk[0] = origin[0]; ijk[0] < end[0]; ijk[0] += dim) {
+                for (ijk[1] = origin[1]; ijk[1] < end[1]; ijk[1] += dim) {
+                    for (ijk[2] = origin[2]; ijk[2] < end[2]; ijk[2] += dim) {
+
+                        adaptivity = adaptivityLeaf.getValue(ijk);
+
+                        if (mask.isValueOn(ijk) || isNonManifold(mDistAcc, ijk, mIsovalue, dim)
+                            || !isMergable(gradients, ijk, dim, adaptivity)) {
+                            mask.setActiveState(ijk & coordMask, true);
+                        } else {
+                            mergeVoxels(idxLeaf, ijk, dim, regionId++);
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+
+////////////////////////////////////////
+
+
+/// @brief  Primitive builder that writes quads only into a @c PolygonPool.
+///
+/// Usage: init() once, addPrim() for every quad, then done() to trim the pool
+/// to the number of quads that were actually added.
+struct UniformPrimBuilder
+{
+    UniformPrimBuilder(): mIdx(0), mPolygonPool(NULL) {}
+
+    /// Binds @c quadPool, resets its quad storage to @c upperBound entries and
+    /// rewinds the write position.
+    void init(const size_t upperBound, PolygonPool& quadPool)
+    {
+        mIdx = 0;
+        mPolygonPool = &quadPool;
+        quadPool.resetQuads(upperBound);
+    }
+
+    /// Stores @c verts as the next quad, with the vertex order flipped when
+    /// @c reverse is set, together with its @c flags.
+    void addPrim(const Vec4I& verts, bool reverse, char flags = 0)
+    {
+        Vec4I& quad = mPolygonPool->quad(mIdx);
+
+        if (reverse) {
+            quad[0] = verts[3];
+            quad[1] = verts[2];
+            quad[2] = verts[1];
+            quad[3] = verts[0];
+        } else {
+            quad = verts;
+        }
+
+        mPolygonPool->quadFlags(mIdx) = flags;
+        ++mIdx;
+    }
+
+    /// Trims the quad storage to the number of quads added so far.
+    void done()
+    {
+        mPolygonPool->trimQuads(mIdx);
+    }
+
+private:
+    size_t mIdx;               // index of the next quad to write
+    PolygonPool* mPolygonPool; // target pool, NULL until init() is called
+};
+
+
+// Constructs quads and triangles
+struct AdaptivePrimBuilder
+{
+    // A freshly constructed builder is not bound to any pool; init() must be
+    // called before addPrim() or done().
+    AdaptivePrimBuilder() : mQuadIdx(0), mTriangleIdx(0), mPolygonPool(NULL) {}
+
+    // Binds the builder to polygonPool, resets both the quad and the triangle
+    // storage with upperBound as the requested size and rewinds both cursors.
+    void init(const size_t upperBound, PolygonPool& polygonPool)
+    {
+        mQuadIdx = 0;
+        mTriangleIdx = 0;
+
+        mPolygonPool = &polygonPool;
+        mPolygonPool->resetQuads(upperBound);
+        mPolygonPool->resetTriangles(upperBound);
+    }
+
+    // Emits verts as a quad when all four indices are distinct, or as a
+    // triangle when exactly one of the pairs (0,3), (1,2), (0,1), (2,3)
+    // coincides and the remaining indices are distinct. Any other
+    // combination of repeated indices is dropped without emitting anything.
+    void addPrim(const Vec4I& verts, bool reverse, char flags = 0)
+    {
+        // Pairwise equalities, evaluated once.
+        const bool same01 = (verts[0] == verts[1]);
+        const bool same02 = (verts[0] == verts[2]);
+        const bool same03 = (verts[0] == verts[3]);
+        const bool same12 = (verts[1] == verts[2]);
+        const bool same13 = (verts[1] == verts[3]);
+        const bool same23 = (verts[2] == verts[3]);
+
+        if (!(same01 || same02 || same03 || same12 || same13 || same23)) {
+            mPolygonPool->quadFlags(mQuadIdx) = flags;
+            addQuad(verts, reverse);
+            return;
+        }
+
+        // Select the second and third triangle corner; the first is always verts[0].
+        unsigned second = 0, third = 0;
+        if (same03 && !same12 && !same01 && !same02) {
+            second = verts[1]; third = verts[2];
+        } else if (same12 && !same03 && !same01 && !same13) {
+            second = verts[1]; third = verts[3];
+        } else if (same01 && !same23 && !same02 && !same03) {
+            second = verts[2]; third = verts[3];
+        } else if (same23 && !same01 && !same02 && !same12) {
+            second = verts[1]; third = verts[2];
+        } else {
+            return; // neither a quad nor one of the handled triangle cases
+        }
+
+        mPolygonPool->triangleFlags(mTriangleIdx) = flags;
+        addTriangle(verts[0], second, third, reverse);
+    }
+
+
+    // Trims both primitive arrays to the number of entries actually written,
+    // requesting reallocation.
+    void done()
+    {
+        mPolygonPool->trimQuads(mQuadIdx, /*reallocate=*/true);
+        mPolygonPool->trimTrinagles(mTriangleIdx, /*reallocate=*/true);
+    }
+
+private:
+
+    // Stores verts in the next quad slot (vertex order reversed on request)
+    // and advances the quad cursor.
+    void addQuad(const Vec4I& verts, bool reverse)
+    {
+        Vec4I& slot = mPolygonPool->quad(mQuadIdx);
+        if (reverse) {
+            slot[0] = verts[3];
+            slot[1] = verts[2];
+            slot[2] = verts[1];
+            slot[3] = verts[0];
+        } else {
+            slot = verts;
+        }
+        ++mQuadIdx;
+    }
+
+    // Stores (v0, v1, v2) in the next triangle slot and advances the triangle
+    // cursor; reverse swaps the first and last corner.
+    void addTriangle(unsigned v0, unsigned v1, unsigned v2, bool reverse)
+    {
+        Vec3I& slot = mPolygonPool->triangle(mTriangleIdx);
+
+        slot[0] = reverse ? v2 : v0;
+        slot[1] = v1;
+        slot[2] = reverse ? v0 : v2;
+
+        ++mTriangleIdx;
+    }
+
+    size_t mQuadIdx, mTriangleIdx; // next free quad / triangle slot
+    PolygonPool *mPolygonPool;     // pool bound by init(); not owned
+};
+
+
+// Emits up to three primitives for the voxel at ijk, one for each of the
+// XEDGE, YEDGE and ZEDGE bits set in flags.
+//
+// flags          sign/edge flags of the voxel at ijk.
+// refFlags       flags of the same voxel in a reference tree (0 when there is
+//                none); only used to choose the polygon tag.
+// offsets        per-edge index offsets (components 0, 1, 2 are read for the
+//                X, Y and Z edge respectively) added to the voxel's own point index.
+// signAcc/idxAcc anything providing getValue(Coord) for sign flags / point indices.
+// mesher         primitive sink providing addPrim(quad, reverse, flags).
+// pointListSize  upper limit used to reject offset neighbour indices.
+//
+// Nothing is emitted when the voxel's own index is util::INVALID_IDX, and an
+// edge is skipped when any of its three neighbour indices is util::INVALID_IDX.
+template<typename SignAccT, typename IdxAccT, typename PrimBuilder>
+inline void
+constructPolygons(Int16 flags, Int16 refFlags, const Vec4i& offsets, const Coord& ijk,
+    const SignAccT& signAcc, const IdxAccT& idxAcc, PrimBuilder& mesher, Index32 pointListSize)
+{
+    const Index32 v0 = idxAcc.getValue(ijk);
+    if (v0 == util::INVALID_IDX) return;
+
+    // tag[0] is used when the edge is absent from refFlags, tag[1] (which adds
+    // POLYFLAG_EXTERIOR) when it is present; both carry the seam flag if set.
+    char tag[2];
+    tag[0] = (flags & SEAM) ? POLYFLAG_FRACTURE_SEAM : 0;
+    tag[1] = tag[0] | char(POLYFLAG_EXTERIOR);
+
+    // Controls the vertex order passed to the mesher (inverted for the Z edge below).
+    const bool isInside = flags & INSIDE;
+
+    Coord coord;
+    openvdb::Vec4I quad;
+    unsigned char cell;
+    Index32 tmpIdx = 0;
+
+    // For every neighbour below: when sEdgeGroupTable[cell][0] > 1, the table
+    // entry for the shared edge (minus one) is added to the neighbour's index,
+    // and the result is kept only if it stays below pointListSize.
+    // NOTE(review): the offset is added before the index is compared against
+    // util::INVALID_IDX -- confirm an invalid index cannot wrap into range here.
+
+    if (flags & XEDGE) {
+
+        quad[0] = v0 + offsets[0];
+
+        // i, j-1, k
+        coord[0] = ijk[0];
+        coord[1] = ijk[1] - 1;
+        coord[2] = ijk[2];
+
+        quad[1] = idxAcc.getValue(coord);
+        cell = uint8_t(SIGNS & signAcc.getValue(coord));
+        if (sEdgeGroupTable[cell][0] > 1) {
+            tmpIdx = quad[1] + Index32(sEdgeGroupTable[cell][5] - 1);
+            if (tmpIdx < pointListSize) quad[1] = tmpIdx;
+        }
+
+        // i, j-1, k-1
+        coord[2] -= 1;
+
+        quad[2] = idxAcc.getValue(coord);
+        cell = uint8_t(SIGNS & signAcc.getValue(coord));
+        if (sEdgeGroupTable[cell][0] > 1) {
+            tmpIdx = quad[2] + Index32(sEdgeGroupTable[cell][7] - 1);
+            if (tmpIdx < pointListSize) quad[2] = tmpIdx;
+        }
+
+        // i, j, k-1
+        coord[1] = ijk[1];
+
+        quad[3] = idxAcc.getValue(coord);
+        cell = uint8_t(SIGNS & signAcc.getValue(coord));
+        if (sEdgeGroupTable[cell][0] > 1) {
+            tmpIdx = quad[3] + Index32(sEdgeGroupTable[cell][3] - 1);
+            if (tmpIdx < pointListSize) quad[3] = tmpIdx;
+        }
+
+        // Emit only if all three neighbours contributed a valid point index.
+        if (quad[1] != util::INVALID_IDX &&
+            quad[2] != util::INVALID_IDX && quad[3] != util::INVALID_IDX) {
+            mesher.addPrim(quad, isInside, tag[bool(refFlags & XEDGE)]);
+        }
+    }
+
+
+    if (flags & YEDGE) {
+
+        quad[0] = v0 + offsets[1];
+
+        // i, j, k-1
+        coord[0] = ijk[0];
+        coord[1] = ijk[1];
+        coord[2] = ijk[2] - 1;
+
+        quad[1] = idxAcc.getValue(coord);
+        cell = uint8_t(SIGNS & signAcc.getValue(coord));
+        if (sEdgeGroupTable[cell][0] > 1) {
+            tmpIdx = quad[1] + Index32(sEdgeGroupTable[cell][12] - 1);
+            if (tmpIdx < pointListSize) quad[1] = tmpIdx;
+        }
+
+        // i-1, j, k-1
+        coord[0] -= 1;
+
+        quad[2] = idxAcc.getValue(coord);
+        cell = uint8_t(SIGNS & signAcc.getValue(coord));
+        if (sEdgeGroupTable[cell][0] > 1) {
+            tmpIdx = quad[2] + Index32(sEdgeGroupTable[cell][11] - 1);
+            if (tmpIdx < pointListSize) quad[2] = tmpIdx;
+        }
+
+        // i-1, j, k
+        coord[2] = ijk[2];
+
+        quad[3] = idxAcc.getValue(coord);
+        cell = uint8_t(SIGNS & signAcc.getValue(coord));
+        if (sEdgeGroupTable[cell][0] > 1) {
+            tmpIdx = quad[3] + Index32(sEdgeGroupTable[cell][10] - 1);
+            if (tmpIdx < pointListSize) quad[3] = tmpIdx;
+        }
+
+        // Emit only if all three neighbours contributed a valid point index.
+        if (quad[1] != util::INVALID_IDX &&
+            quad[2] != util::INVALID_IDX && quad[3] != util::INVALID_IDX) {
+            mesher.addPrim(quad, isInside, tag[bool(refFlags & YEDGE)]);
+        }
+    }
+
+    if (flags & ZEDGE) {
+
+        quad[0] = v0 + offsets[2];
+
+        // i, j-1, k
+        coord[0] = ijk[0];
+        coord[1] = ijk[1] - 1;
+        coord[2] = ijk[2];
+
+        quad[1] = idxAcc.getValue(coord);
+        cell = uint8_t(SIGNS & signAcc.getValue(coord));
+        if (sEdgeGroupTable[cell][0] > 1) {
+            tmpIdx = quad[1] + Index32(sEdgeGroupTable[cell][8] - 1);
+            if (tmpIdx < pointListSize) quad[1] = tmpIdx;
+        }
+
+        // i-1, j-1, k
+        coord[0] -= 1;
+
+        quad[2] = idxAcc.getValue(coord);
+        cell = uint8_t(SIGNS & signAcc.getValue(coord));
+        if (sEdgeGroupTable[cell][0] > 1) {
+            tmpIdx = quad[2] + Index32(sEdgeGroupTable[cell][6] - 1);
+            if (tmpIdx < pointListSize) quad[2] = tmpIdx;
+        }
+
+        // i-1, j, k
+        coord[1] = ijk[1];
+
+        quad[3] = idxAcc.getValue(coord);
+        cell = uint8_t(SIGNS & signAcc.getValue(coord));
+        if (sEdgeGroupTable[cell][0] > 1) {
+            tmpIdx = quad[3] + Index32(sEdgeGroupTable[cell][2] - 1);
+            if (tmpIdx < pointListSize) quad[3] = tmpIdx;
+        }
+
+        // Emit only if all three neighbours contributed a valid point index.
+        // Unlike the X and Y edges, the vertex order flag is inverted here.
+        if (quad[1] != util::INVALID_IDX &&
+            quad[2] != util::INVALID_IDX && quad[3] != util::INVALID_IDX) {
+            mesher.addPrim(quad, !isInside, tag[bool(refFlags & ZEDGE)]);
+        }
+    }
+}
+
+
+////////////////////////////////////////
+
+
+// TBB body that turns the edge flags stored in the leaves of a sign-flag leaf
+// manager into polygons, filling one PolygonPool per leaf index.
+// PrimBuilder is the primitive sink used by operator(); it must be default
+// constructible and provide init(), addPrim() and done().
+template<typename LeafManagerT, typename PrimBuilder>
+class GenPolygons
+{
+public:
+    typedef typename LeafManagerT::TreeType::template ValueConverter<int>::Type IntTreeT;
+    typedef typename LeafManagerT::TreeType::template ValueConverter<Int16>::Type Int16TreeT;
+
+    typedef tree::ValueAccessor<const IntTreeT> IntAccessorT;
+    typedef tree::ValueAccessor<const Int16TreeT> Int16AccessorT;
+
+    //////////
+
+
+    // All arguments are stored by reference (pointListSize by value); the
+    // referenced objects must outlive this functor.
+    GenPolygons(const LeafManagerT& signLeafs, const Int16TreeT& signTree,
+        const IntTreeT& idxTree, PolygonPoolList& polygons, Index32 pointListSize);
+
+    // Processes every leaf of signLeafs, via tbb::parallel_for when threaded
+    // is true and on the calling thread otherwise.
+    void run(bool threaded = true);
+
+
+    // Optional reference sign tree; when set, its per-voxel flags are passed
+    // to constructPolygons() as refFlags. The pointer is not owned.
+    void setRefSignTree(const Int16TreeT *r) { mRefSignTree = r; }
+
+    //////////
+
+
+    // Body invoked for a sub-range of leaf indices.
+    void operator()(const tbb::blocked_range<size_t>&) const;
+
+private:
+    const LeafManagerT& mSignLeafs;
+    const Int16TreeT& mSignTree;
+    const IntTreeT& mIdxTree;
+    // Indexed by leaf index; the pools themselves are written by operator().
+    const PolygonPoolList& mPolygonPoolList;
+    const Index32 mPointListSize;
+
+    const Int16TreeT *mRefSignTree; // NULL unless setRefSignTree() was called
+ };
+
+
+// Stores references to the inputs and the output pool list; the reference
+// sign tree starts out unset (NULL).
+template<typename LeafManagerT, typename PrimBuilder>
+GenPolygons<LeafManagerT, PrimBuilder>::GenPolygons(
+    const LeafManagerT& signLeafs,
+    const Int16TreeT& signTree,
+    const IntTreeT& idxTree,
+    PolygonPoolList& polygons,
+    Index32 pointListSize)
+    : mSignLeafs(signLeafs), mSignTree(signTree), mIdxTree(idxTree)
+    , mPolygonPoolList(polygons), mPointListSize(pointListSize)
+    , mRefSignTree(NULL)
+{
+}
+
+// Runs the body over the full leaf range: serially on the calling thread when
+// threaded is false, through tbb::parallel_for otherwise.
+template<typename LeafManagerT, typename PrimBuilder>
+void
+GenPolygons<LeafManagerT, PrimBuilder>::run(bool threaded)
+{
+    if (!threaded) {
+        (*this)(mSignLeafs.getRange());
+        return;
+    }
+    tbb::parallel_for(mSignLeafs.getRange(), *this);
+}
+
+// Generates the polygons for every leaf index in range.
+//
+// For each leaf the number of set X/Y/Z edge flags gives an upper bound on the
+// primitive count; the matching PolygonPool is sized to that bound, filled via
+// constructPolygons() and finally trimmed by the builder's done().
+template<typename LeafManagerT, typename PrimBuilder>
+void
+GenPolygons<LeafManagerT, PrimBuilder>::operator()(
+    const tbb::blocked_range<size_t>& range) const
+{
+    typename LeafManagerT::TreeType::LeafNodeType::ValueOnCIter iter;
+    IntAccessorT idxAcc(mIdxTree);
+    Int16AccessorT signAcc(mSignTree);
+
+
+    PrimBuilder mesher;
+    size_t edgeCount;
+    Coord ijk, origin;
+
+
+    // reference data
+    boost::scoped_ptr<Int16AccessorT> refSignAcc;
+    if (mRefSignTree) refSignAcc.reset(new Int16AccessorT(*mRefSignTree));
+
+
+    for (size_t n = range.begin(); n != range.end(); ++n) {
+
+        origin = mSignLeafs.leaf(n).origin();
+
+        // Get an upper bound on the number of primitives.
+        edgeCount = 0;
+        iter = mSignLeafs.leaf(n).cbeginValueOn();
+        for (; iter; ++iter) {
+            if (iter.getValue() & XEDGE) ++edgeCount;
+            if (iter.getValue() & YEDGE) ++edgeCount;
+            if (iter.getValue() & ZEDGE) ++edgeCount;
+        }
+
+        if(edgeCount == 0) continue;
+
+        const typename Int16TreeT::LeafNodeType *signleafPt = signAcc.probeConstLeaf(origin);
+        const typename IntTreeT::LeafNodeType *idxLeafPt = idxAcc.probeConstLeaf(origin);
+
+        // Probe before touching the pool: init() sizes the pool to the upper
+        // bound and only done() trims it, so bailing out between the two would
+        // leave the pool holding edgeCount primitives that were never written.
+        if (!signleafPt || !idxLeafPt) continue;
+
+        mesher.init(edgeCount, mPolygonPoolList[n]);
+
+
+        const typename Int16TreeT::LeafNodeType *refSignLeafPt = NULL;
+        if (refSignAcc) refSignLeafPt = refSignAcc->probeConstLeaf(origin);
+
+        Vec4i offsets;
+
+        iter = mSignLeafs.leaf(n).cbeginValueOn();
+        for (; iter; ++iter) {
+            ijk = iter.getCoord();
+
+            Int16 flags = iter.getValue();
+
+            // Skip voxels with none of the bits in 0xE00 set.
+            // NOTE(review): 0xE00 is presumably XEDGE | YEDGE | ZEDGE -- confirm
+            // against the flag enum.
+            if (!(flags & 0xE00)) continue;
+
+            Int16 refFlags = 0;
+            if (refSignLeafPt) {
+                refFlags = refSignLeafPt->getValue(iter.pos());
+            }
+
+            offsets[0] = 0;
+            offsets[1] = 0;
+            offsets[2] = 0;
+
+            const unsigned char cell = uint8_t(SIGNS & flags);
+
+            // When the table's first entry for this cell exceeds 1, each edge
+            // gets its own offset (table entry minus one) into the point list.
+            if (sEdgeGroupTable[cell][0] > 1) {
+                offsets[0] = (sEdgeGroupTable[cell][1] - 1);
+                offsets[1] = (sEdgeGroupTable[cell][9] - 1);
+                offsets[2] = (sEdgeGroupTable[cell][4] - 1);
+            }
+
+            // constructPolygons() only looks at neighbours at -1 offsets, so
+            // voxels strictly above the leaf origin on every axis can read the
+            // leaf nodes directly; the rest must go through the tree accessors.
+            if (ijk[0] > origin[0] && ijk[1] > origin[1] && ijk[2] > origin[2]) {
+                constructPolygons(flags, refFlags, offsets, ijk,
+                    *signleafPt, *idxLeafPt, mesher, mPointListSize);
+            } else {
+                constructPolygons(flags, refFlags, offsets, ijk,
+                    signAcc, idxAcc, mesher, mPointListSize);
+            }
+        }
+
+        mesher.done();
+    }
+}
+
+
+////////////////////////////////////////
+
+// Masking and mesh partitioning
+
+// Leaf functor that keeps one partition of a leaf array active: every leaf
+// whose index lies outside [mStart, mEnd) has all of its values switched off.
+struct PartOp
+{
+
+    // Splits leafCount leaves into `partitions` equally sized segments (integer
+    // division) and selects segment `activePart`; the last partition also
+    // takes the remainder. partitions must be non-zero.
+    PartOp(size_t leafCount, size_t partitions, size_t activePart)
+    {
+        const size_t segmentSize = leafCount / partitions;
+        const bool isLastPart = activePart >= (partitions - 1);
+
+        mStart = segmentSize * activePart;
+        if (isLastPart) {
+            mEnd = leafCount;
+        } else {
+            mEnd = mStart + segmentSize;
+        }
+    }
+
+    // Deactivates the leaf unless leafIndex belongs to the active partition.
+    template <typename LeafNodeType>
+    void operator()(LeafNodeType &leaf, size_t leafIndex) const
+    {
+        const bool inActivePart = (leafIndex >= mStart) && (leafIndex < mEnd);
+        if (!inActivePart) leaf.setValuesOff();
+    }
+
+private:
+    size_t mStart, mEnd; // half-open leaf index range of the active partition
+};
+
+
+////////////////////////////////////////
+
+
+// TBB reduction body that builds a bool tree containing the active topology
+// of the source leaves whose index falls inside the selected partition
+// [mStart, mEnd).
+template<typename SrcTreeT>
+class PartGen
+{
+public:
+    typedef tree::LeafManager<const SrcTreeT> LeafManagerT;
+    typedef typename SrcTreeT::template ValueConverter<bool>::Type BoolTreeT;
+    typedef tree::ValueAccessor<BoolTreeT> BoolAccessorT;
+
+    //////////
+
+
+    // leafs is stored by reference; partitions/activePart select the leaf
+    // index range that will be copied.
+    PartGen(const LeafManagerT& leafs, size_t partitions, size_t activePart);
+
+    // Runs over all leaves, with tbb::parallel_reduce when threaded is true.
+    void run(bool threaded = true);
+
+    // Resulting mask tree (valid after run()).
+    BoolTreeT&  tree() { return mTree; }
+
+
+    //////////
+
+    // Splitting constructor, body and join required by tbb::parallel_reduce.
+    PartGen(PartGen&, tbb::split);
+    void operator()(const tbb::blocked_range<size_t>&);
+    void join(PartGen& rhs) { mTree.merge(rhs.mTree); }
+
+private:
+    const LeafManagerT& mLeafManager;
+    BoolTreeT mTree;
+    size_t mStart, mEnd; // half-open leaf index range of the active partition
+};
+
+// Divides the leaf array into `partitions` equally sized segments (integer
+// division) and records the index range of segment `activePart`; the last
+// partition also takes the remainder. partitions must be non-zero.
+template<typename SrcTreeT>
+PartGen<SrcTreeT>::PartGen(const LeafManagerT& leafs, size_t partitions, size_t activePart)
+    : mLeafManager(leafs)
+    , mTree(false)
+    , mStart(0)
+    , mEnd(0)
+{
+    const size_t totalLeafs = leafs.leafCount();
+    const size_t segmentSize = totalLeafs / partitions;
+
+    mStart = segmentSize * activePart;
+    if (activePart >= (partitions - 1)) {
+        mEnd = totalLeafs;
+    } else {
+        mEnd = mStart + segmentSize;
+    }
+}
+
+// Splitting constructor for tbb::parallel_reduce: shares the leaf manager and
+// partition range of rhs but starts with its own empty mask tree.
+template<typename SrcTreeT>
+PartGen<SrcTreeT>::PartGen(PartGen& rhs, tbb::split)
+    : mLeafManager(rhs.mLeafManager), mTree(false)
+    , mStart(rhs.mStart), mEnd(rhs.mEnd)
+{
+}
+
+
+// Processes the full leaf range, either serially on the calling thread or
+// through tbb::parallel_reduce.
+template<typename SrcTreeT>
+void
+PartGen<SrcTreeT>::run(bool threaded)
+{
+    if (!threaded) {
+        (*this)(mLeafManager.getRange());
+        return;
+    }
+    tbb::parallel_reduce(mLeafManager.getRange(), *this);
+}
+
+
+// For every leaf index in range that lies inside the active partition
+// [mStart, mEnd), creates (or fetches) the leaf at the same origin in the mask
+// tree and unions the source leaf's active topology into it.
+template<typename SrcTreeT>
+void
+PartGen<SrcTreeT>::operator()(const tbb::blocked_range<size_t>& range)
+{
+    typedef typename BoolTreeT::LeafNodeType BoolLeafT;
+
+    BoolAccessorT acc(mTree);
+
+    for (size_t n = range.begin(); n != range.end(); ++n) {
+        // Leaves outside the active partition contribute nothing.
+        if (n < mStart || n >= mEnd) continue;
+        BoolLeafT* leaf = acc.touchLeaf(mLeafManager.leaf(n).origin());
+        leaf->topologyUnion(mLeafManager.leaf(n));
+    }
+}
+
+
+////////////////////////////////////////
+
+
+// TBB reduction body that builds a bool mask of the voxels whose sign bits in
+// the leaf manager's leaves differ from the sign bits stored at the same
+// coordinate in a second tree (see operator() for the exact condition).
+template<typename TreeT, typename LeafManagerT>
+class GenSeamMask
+{
+public:
+    typedef typename TreeT::template ValueConverter<bool>::Type BoolTreeT;
+
+    //////////
+
+    // Both arguments are stored by reference and must outlive this object.
+    GenSeamMask(const LeafManagerT& leafs, const TreeT& tree);
+
+    // Runs over all leaves, with tbb::parallel_reduce when threaded is true.
+    void run(bool threaded = true);
+
+    // Resulting mask tree (valid after run()).
+    BoolTreeT& mask() { return mMaskTree; }
+
+    //////////
+
+    // Splitting constructor, body and join required by tbb::parallel_reduce.
+    GenSeamMask(GenSeamMask&, tbb::split);
+    void operator()(const tbb::blocked_range<size_t>&);
+    void join(GenSeamMask& rhs) { mMaskTree.merge(rhs.mMaskTree); }
+
+private:
+
+    const LeafManagerT& mLeafManager; // leaves that are iterated
+    const TreeT& mTree;               // tree the leaf values are compared against
+
+    BoolTreeT mMaskTree;
+};
+
+
+// Stores references to the inputs and starts with an empty mask tree whose
+// background value is false.
+template<typename TreeT, typename LeafManagerT>
+GenSeamMask<TreeT, LeafManagerT>::GenSeamMask(
+    const LeafManagerT& leafs,
+    const TreeT& tree)
+    : mLeafManager(leafs), mTree(tree), mMaskTree(false)
+{
+}
+
+
+// Splitting constructor for tbb::parallel_reduce: shares the inputs of rhs but
+// accumulates into its own, initially empty, mask tree.
+template<typename TreeT, typename LeafManagerT>
+GenSeamMask<TreeT, LeafManagerT>::GenSeamMask(
+    GenSeamMask& rhs,
+    tbb::split)
+    : mLeafManager(rhs.mLeafManager), mTree(rhs.mTree), mMaskTree(false)
+{
+}
+
+
+// Processes the full leaf range, either serially on the calling thread or
+// through tbb::parallel_reduce.
+template<typename TreeT, typename LeafManagerT>
+void
+GenSeamMask<TreeT, LeafManagerT>::run(bool threaded)
+{
+    if (!threaded) {
+        (*this)(mLeafManager.getRange());
+        return;
+    }
+    tbb::parallel_reduce(mLeafManager.getRange(), *this);
+}
+
+
+// Visits every active voxel of the leaves in range and switches the voxel on
+// in the mask tree when the comparison tree's sign bits at that coordinate
+// have a positive first entry in sEdgeGroupTable and differ from the sign bits
+// of the leaf value itself.
+template<typename TreeT, typename LeafManagerT>
+void
+GenSeamMask<TreeT, LeafManagerT>::operator()(const tbb::blocked_range<size_t>& range)
+{
+    typedef typename LeafManagerT::TreeType::LeafNodeType::ValueOnCIter ValueOnCIterT;
+
+    tree::ValueAccessor<const TreeT> acc(mTree);
+    tree::ValueAccessor<BoolTreeT> maskAcc(mMaskTree);
+
+    for (size_t n = range.begin(); n != range.end(); ++n) {
+        for (ValueOnCIterT it = mLeafManager.leaf(n).cbeginValueOn(); it; ++it) {
+
+            const Coord ijk = it.getCoord();
+
+            // Sign bits stored in the comparison tree at this voxel.
+            const unsigned char rhsSigns = uint8_t(acc.getValue(ijk) & SIGNS);
+            if (!(sEdgeGroupTable[rhsSigns][0] > 0)) continue;
+
+            // Sign bits stored in the leaf being visited.
+            const unsigned char lhsSigns = uint8_t(it.getValue() & SIGNS);
+            if (lhsSigns != rhsSigns) maskAcc.setValueOn(ijk);
+        }
+    }
+}
+
+
+////////////////////////////////////////
+
+
+// Leaf functor that ORs the SEAM bit into every active value of a leaf whose
+// voxel is also active in the corresponding leaf of a mask tree.
+template<typename TreeT>
+class TagSeamEdges
+{
+public:
+    typedef tree::ValueAccessor<const TreeT> AccessorT;
+
+    // tree is the mask tree; it is accessed through a value accessor.
+    TagSeamEdges(const TreeT& tree) : mAcc(tree) {}
+
+    // Leaves without a matching mask leaf are left untouched.
+    template <typename LeafNodeType>
+    void operator()(LeafNodeType &leaf, size_t/*leafIndex*/) const
+    {
+        typedef typename TreeT::LeafNodeType MaskLeafT;
+
+        const MaskLeafT* const maskLeaf = mAcc.probeConstLeaf(leaf.origin());
+        if (maskLeaf == NULL) return;
+
+        for (typename LeafNodeType::ValueOnIter it = leaf.beginValueOn(); it; ++it) {
+            if (!maskLeaf->isValueOn(it.pos())) continue;
+            it.setValue(it.getValue() | SEAM);
+        }
+    }
+
+private:
+    AccessorT mAcc;
+};
+
+
+
+// Leaf functor that masks every active value down to its low nine bits
+// (value & 0x1FF) unless the voxel is active in the given bool mask tree.
+template<typename BoolTreeT>
+struct MaskEdges
+{
+    typedef tree::ValueAccessor<const BoolTreeT> BoolAccessorT;
+
+    // valueMask marks the voxels whose values are to be kept as they are.
+    MaskEdges(const BoolTreeT& valueMask) : mMaskAcc(valueMask) {}
+
+    // When no mask leaf exists at the leaf's origin, every active value is
+    // masked.
+    template <typename LeafNodeType>
+    void operator()(LeafNodeType &leaf, size_t /*leafIndex*/) const
+    {
+        typedef typename BoolTreeT::LeafNodeType MaskLeafT;
+
+        const MaskLeafT* const maskLeaf = mMaskAcc.probeConstLeaf(leaf.origin());
+
+        for (typename LeafNodeType::ValueOnIter it = leaf.beginValueOn(); it; ++it) {
+            const bool keep = (maskLeaf != NULL) && maskLeaf->isValueOn(it.pos());
+            if (keep) continue;
+            it.setValue(0x1FF & it.getValue());
+        }
+    }
+
+private:
+    BoolAccessorT mMaskAcc;
+};
+
+
+// TBB body that sets usedPointMask[i] = 1 for every point index i referenced
+// by a quad or triangle in the first polyListCount polygon pools.
+class FlagUsedPoints
+{
+public:
+    //////////
+
+    // polygons and usedPointMask are stored by reference. usedPointMask must
+    // already be large enough for every index stored in the pools.
+    FlagUsedPoints(const PolygonPoolList& polygons, size_t polyListCount,
+        std::vector<unsigned char>& usedPointMask)
+        : mPolygons(polygons)
+        , mPolyListCount(polyListCount)
+        , mUsedPointMask(usedPointMask)
+    {
+    }
+
+    // Processes pools [0, polyListCount), serially or via tbb::parallel_for.
+    void run(bool threaded = true)
+    {
+        const tbb::blocked_range<size_t> pools(0, mPolyListCount);
+        if (!threaded) {
+            (*this)(pools);
+            return;
+        }
+        tbb::parallel_for(pools, *this);
+    }
+
+    //////////
+
+    void operator()(const tbb::blocked_range<size_t>& range) const
+    {
+        // Several threads may store to the same mask entry at once; the
+        // original author relies on all of them storing the same value and on
+        // char stores being atomic.
+        for (size_t n = range.begin(); n != range.end(); ++n) {
+            const PolygonPool& pool = mPolygons[n];
+
+            for (size_t q = 0; q < pool.numQuads(); ++q) {
+                const Vec4I& quad = pool.quad(q);
+                for (int v = 0; v < 4; ++v) mUsedPointMask[quad[v]] = 1;
+            }
+
+            for (size_t t = 0; t < pool.numTriangles(); ++t) {
+                const Vec3I& triangle = pool.triangle(t);
+                for (int v = 0; v < 3; ++v) mUsedPointMask[triangle[v]] = 1;
+            }
+        }
+    }
+
+
+private:
+    const PolygonPoolList& mPolygons;
+    size_t mPolyListCount;
+    std::vector<unsigned char>& mUsedPointMask;
+};
+
+// TBB body that rewrites every vertex index stored in the first polyListCount
+// polygon pools through a lookup table: index i becomes indexMap[i].
+class RemapIndices
+{
+public:
+    //////////
+
+    // polygons and indexMap are stored by reference. indexMap must cover every
+    // index currently stored in the pools.
+    RemapIndices(PolygonPoolList& polygons,
+        size_t polyListCount, const std::vector<unsigned>& indexMap)
+        : mPolygons(polygons)
+        , mPolyListCount(polyListCount)
+        , mIndexMap(indexMap)
+    {
+    }
+
+    // Processes pools [0, polyListCount), serially or via tbb::parallel_for.
+    void run(bool threaded = true)
+    {
+        const tbb::blocked_range<size_t> pools(0, mPolyListCount);
+        if (!threaded) {
+            (*this)(pools);
+            return;
+        }
+        tbb::parallel_for(pools, *this);
+    }
+
+    //////////
+
+    void operator()(const tbb::blocked_range<size_t>& range) const
+    {
+        for (size_t n = range.begin(); n != range.end(); ++n) {
+            PolygonPool& pool = mPolygons[n];
+
+            for (size_t q = 0; q < pool.numQuads(); ++q) {
+                Vec4I& quad = pool.quad(q);
+                for (int v = 0; v < 4; ++v) quad[v] = mIndexMap[quad[v]];
+            }
+
+            for (size_t t = 0; t < pool.numTriangles(); ++t) {
+                Vec3I& triangle = pool.triangle(t);
+                for (int v = 0; v < 3; ++v) triangle[v] = mIndexMap[triangle[v]];
+            }
+        }
+    }
+
+
+private:
+    PolygonPoolList& mPolygons;
+    size_t mPolyListCount;
+    const std::vector<unsigned>& mIndexMap;
+};
+
+
+// TBB body that copies every point flagged in usedPointMask from the old
+// point list to position indexMap[n] of the new point list.
+class MovePoints
+{
+public:
+    //////////
+
+    // All arguments are stored by reference. newPointList must already hold
+    // storage for every destination index produced by indexMap.
+    MovePoints(
+        internal::UniquePtr<openvdb::Vec3s>::type& newPointList,
+        const PointList& oldPointList,
+        const std::vector<unsigned>& indexMap,
+        const std::vector<unsigned char>& usedPointMask)
+        : mNewPointList(newPointList)
+        , mOldPointList(oldPointList)
+        , mIndexMap(indexMap)
+        , mUsedPointMask(usedPointMask)
+    {
+    }
+
+    // Processes source indices [0, indexMap.size()), serially or via
+    // tbb::parallel_for.
+    void run(bool threaded = true)
+    {
+        const tbb::blocked_range<size_t> points(0, mIndexMap.size());
+        if (!threaded) {
+            (*this)(points);
+            return;
+        }
+        tbb::parallel_for(points, *this);
+    }
+
+    //////////
+
+    void operator()(const tbb::blocked_range<size_t>& range) const
+    {
+        for (size_t n = range.begin(); n != range.end(); ++n) {
+            // Unreferenced points are not carried over.
+            if (!mUsedPointMask[n]) continue;
+
+            const size_t target = mIndexMap[n];
+            mNewPointList.get()[target] = mOldPointList[n];
+        }
+    }
+
+private:
+    internal::UniquePtr<openvdb::Vec3s>::type& mNewPointList;
+    const PointList& mOldPointList;
+    const std::vector<unsigned>& mIndexMap;
+    const std::vector<unsigned char>& mUsedPointMask;
+};
+
+
+////////////////////////////////////////
+
+
+// TBB reduction body that builds a bool tree holding the active voxels of the
+// source leaves that are selected by a bool mask grid: voxels that are on in
+// the mask, or off in the mask when invertMask is true. The mask grid may use
+// a different transform than the source (see operator()).
+template<typename SrcTreeT>
+class GenTopologyMask
+{
+public:
+    typedef tree::LeafManager<const SrcTreeT> LeafManagerT;
+    typedef typename SrcTreeT::template ValueConverter<bool>::Type BoolTreeT;
+    typedef tree::ValueAccessor<const SrcTreeT> SrcAccessorT;
+    typedef tree::ValueAccessor<BoolTreeT> BoolAccessorT;
+    typedef Grid<BoolTreeT> BoolGridT;
+
+
+    //////////
+
+
+    // mask, srcLeafs and srcXForm are stored by reference and must outlive
+    // this object.
+    GenTopologyMask(const BoolGridT& mask, const LeafManagerT& srcLeafs,
+        const math::Transform& srcXForm, bool invertMask);
+
+    // Runs over all source leaves, with tbb::parallel_reduce when threaded is true.
+    void run(bool threaded = true);
+
+    // Resulting topology mask (valid after run()).
+    BoolTreeT& tree() { return mTree; }
+
+
+    //////////
+
+    // Splitting constructor, body and join required by tbb::parallel_reduce.
+    GenTopologyMask(GenTopologyMask&, tbb::split);
+
+    void operator()(const tbb::blocked_range<size_t>&);
+
+    void join(GenTopologyMask& rhs) { mTree.merge(rhs.mTree); }
+
+private:
+
+    const BoolGridT& mMask;
+    const LeafManagerT& mLeafManager;
+    const math::Transform& mSrcXForm; // transform of the source leaves
+    bool mInvertMask;                 // select voxels that are off in the mask instead
+    BoolTreeT mTree;
+};
+
+
+// Stores references to the inputs and starts with an empty result tree whose
+// background value is false.
+template<typename SrcTreeT>
+GenTopologyMask<SrcTreeT>::GenTopologyMask(
+    const BoolGridT& mask,
+    const LeafManagerT& srcLeafs,
+    const math::Transform& srcXForm,
+    bool invertMask)
+    : mMask(mask), mLeafManager(srcLeafs), mSrcXForm(srcXForm)
+    , mInvertMask(invertMask), mTree(false)
+{
+}
+
+
+// Splitting constructor for tbb::parallel_reduce: shares the inputs and
+// settings of rhs but accumulates into its own, initially empty, result tree.
+template<typename SrcTreeT>
+GenTopologyMask<SrcTreeT>::GenTopologyMask(
+    GenTopologyMask& rhs,
+    tbb::split)
+    : mMask(rhs.mMask), mLeafManager(rhs.mLeafManager), mSrcXForm(rhs.mSrcXForm)
+    , mInvertMask(rhs.mInvertMask), mTree(false)
+{
+}
+
+
+// Processes the full leaf range, either serially on the calling thread or
+// through tbb::parallel_reduce.
+template<typename SrcTreeT>
+void
+GenTopologyMask<SrcTreeT>::run(bool threaded)
+{
+    if (!threaded) {
+        (*this)(mLeafManager.getRange());
+        return;
+    }
+    tbb::parallel_reduce(mLeafManager.getRange(), *this);
+}
+
+
+// Builds, for every source leaf in range, a bool leaf containing the active
+// source voxels selected by the mask grid, and adds it to the result tree
+// when at least one voxel was selected.
+//
+// A voxel is selected when its mask state differs from mInvertMask, i.e. when
+// it is on in the mask (or off in the mask if mInvertMask is set). If mask and
+// source share the same transform the mask is sampled leaf-against-leaf;
+// otherwise each voxel is mapped through world space to its nearest mask
+// coordinate.
+template<typename SrcTreeT>
+void
+GenTopologyMask<SrcTreeT>::operator()(const tbb::blocked_range<size_t>& range)
+{
+    Coord ijk;
+    Vec3d xyz;
+    typedef typename BoolTreeT::LeafNodeType BoolLeafT;
+    const math::Transform& maskXForm = mMask.transform();
+    tree::ValueAccessor<const BoolTreeT> maskAcc(mMask.tree());
+    tree::ValueAccessor<BoolTreeT> acc(mTree);
+
+    // Neither transform changes while the range is processed, so compare them
+    // once instead of once per leaf.
+    const bool sameXForm = (maskXForm == mSrcXForm);
+
+    typename SrcTreeT::LeafNodeType::ValueOnCIter iter;
+    for (size_t n = range.begin(); n != range.end(); ++n) {
+
+        ijk = mLeafManager.leaf(n).origin();
+        // Candidate output leaf; ownership passes to the tree via addLeaf()
+        // below, otherwise it is deleted at the end of the iteration.
+        BoolLeafT* leaf = new BoolLeafT(ijk, false);
+        bool addLeaf = false;
+
+        if (sameXForm) {
+
+            const BoolLeafT* maskLeaf = maskAcc.probeConstLeaf(ijk);
+
+            if (maskLeaf) {
+
+                // Voxel offsets line up, so compare position by position.
+                for (iter = mLeafManager.leaf(n).cbeginValueOn(); iter; ++iter) {
+                    Index pos = iter.pos();
+                    if(maskLeaf->isValueOn(pos) != mInvertMask) {
+                        leaf->setValueOn(pos);
+                        addLeaf = true;
+                    }
+                }
+
+            } else if (maskAcc.isValueOn(ijk) != mInvertMask) {
+                // No mask leaf here: the mask state at the leaf origin decides
+                // for the whole source leaf.
+                leaf->topologyUnion(mLeafManager.leaf(n));
+                addLeaf = true;
+            }
+
+        } else {
+            // Different transforms: resample the mask per voxel.
+            for (iter = mLeafManager.leaf(n).cbeginValueOn(); iter; ++iter) {
+                ijk = iter.getCoord();
+                xyz = maskXForm.worldToIndex(mSrcXForm.indexToWorld(ijk));
+                if(maskAcc.isValueOn(util::nearestCoord(xyz)) != mInvertMask) {
+                    leaf->setValueOn(iter.pos());
+                    addLeaf = true;
+                }
+            }
+        }
+
+        if (addLeaf) acc.addLeaf(leaf);
+        else delete leaf;
+    }
+}
+
+
+////////////////////////////////////////
+
+
+// TBB reduction body that collects, into a bool tree, the topology of source
+// leaves that lie at or next to a leaf of an index tree and that are either
+// missing from a given mask tree or differ from it in topology
+// (see operator() for the exact test).
+template<typename SrcTreeT>
+class GenBoundaryMask
+{
+public:
+    typedef typename SrcTreeT::template ValueConverter<int>::Type IntTreeT;
+    typedef typename SrcTreeT::template ValueConverter<bool>::Type BoolTreeT;
+    typedef tree::LeafManager<const SrcTreeT> LeafManagerT;
+
+    //////////
+
+    // Arguments: source leaves, mask tree, index tree. All are stored by
+    // reference and must outlive this object.
+    GenBoundaryMask(const LeafManagerT& leafs, const BoolTreeT&, const IntTreeT&);
+
+    // Runs over all source leaves, with tbb::parallel_reduce when threaded is true.
+    void run(bool threaded = true);
+
+    // Resulting boundary mask (valid after run()).
+    BoolTreeT&  tree() { return mTree; }
+
+    //////////
+
+    // Splitting constructor, body and join required by tbb::parallel_reduce.
+    GenBoundaryMask(GenBoundaryMask&, tbb::split);
+    void operator()(const tbb::blocked_range<size_t>&);
+    void join(GenBoundaryMask& rhs) { mTree.merge(rhs.mTree); }
+
+private:
+    // This typedef is needed for Windows
+    typedef tree::ValueAccessor<const IntTreeT> IntTreeAccessorT;
+
+    // True if the index tree has a leaf at the given origin or at any of the
+    // 26 leaf-sized offsets around it.
+    bool neighboringLeaf(const Coord&, const IntTreeAccessorT&) const;
+
+    const LeafManagerT& mLeafManager;
+    const BoolTreeT& mMaskTree;
+    const IntTreeT& mIdxTree;
+    BoolTreeT mTree;
+    // Leaf bounding box of the index tree, padded by one leaf dimension.
+    CoordBBox mLeafBBox;
+};
+
+
+// Stores references to the inputs and precomputes the index tree's leaf
+// bounding box, padded by one leaf dimension, which operator() uses as a
+// cheap rejection test.
+template<typename SrcTreeT>
+GenBoundaryMask<SrcTreeT>::GenBoundaryMask(
+    const LeafManagerT& leafs,
+    const BoolTreeT& maskTree,
+    const IntTreeT& auxTree)
+    : mLeafManager(leafs), mMaskTree(maskTree), mIdxTree(auxTree), mTree(false)
+{
+    mIdxTree.evalLeafBoundingBox(mLeafBBox);
+    mLeafBBox.expand(IntTreeT::LeafNodeType::DIM);
+}
+
+
+// Splitting constructor for tbb::parallel_reduce: shares the inputs of rhs and
+// copies its precomputed bounding box, but accumulates into its own,
+// initially empty, result tree.
+template<typename SrcTreeT>
+GenBoundaryMask<SrcTreeT>::GenBoundaryMask(
+    GenBoundaryMask& rhs,
+    tbb::split)
+    : mLeafManager(rhs.mLeafManager), mMaskTree(rhs.mMaskTree), mIdxTree(rhs.mIdxTree)
+    , mTree(false), mLeafBBox(rhs.mLeafBBox)
+{
+}
+
+
+// Processes the full leaf range, either serially on the calling thread or
+// through tbb::parallel_reduce.
+template<typename SrcTreeT>
+void
+GenBoundaryMask<SrcTreeT>::run(bool threaded)
+{
+    if (!threaded) {
+        (*this)(mLeafManager.getRange());
+        return;
+    }
+    tbb::parallel_reduce(mLeafManager.getRange(), *this);
+}
+
+
+// Returns true if the index tree holds a leaf at ijk itself or at any of the
+// 26 positions one leaf dimension away along one, two or three axes.
+template<typename SrcTreeT>
+bool
+GenBoundaryMask<SrcTreeT>::neighboringLeaf(const Coord& ijk, const IntTreeAccessorT& acc) const
+{
+    if (acc.probeConstLeaf(ijk)) return true;
+
+    const int dim = IntTreeT::LeafNodeType::DIM;
+
+    // Unit directions to the neighbouring leaves, in probe order.
+    static const int sDirections[26][3] = {
+        // face adjacent neighbors
+        { 1,  0,  0}, {-1,  0,  0}, { 0,  1,  0},
+        { 0, -1,  0}, { 0,  0,  1}, { 0,  0, -1},
+        // edge adjacent neighbors
+        { 1,  0, -1}, {-1,  0, -1}, { 1,  0,  1}, {-1,  0,  1},
+        { 1,  1,  0}, {-1,  1,  0}, { 1, -1,  0}, {-1, -1,  0},
+        { 0, -1,  1}, { 0, -1, -1}, { 0,  1,  1}, { 0,  1, -1},
+        // corner adjacent neighbors
+        {-1, -1, -1}, {-1, -1,  1}, { 1, -1,  1}, { 1, -1, -1},
+        {-1,  1, -1}, {-1,  1,  1}, { 1,  1,  1}, { 1,  1, -1}
+    };
+
+    for (int i = 0; i < 26; ++i) {
+        const int* dir = sDirections[i];
+        const Coord neighbor(
+            ijk[0] + dir[0] * dim,
+            ijk[1] + dir[1] * dim,
+            ijk[2] + dir[2] * dim);
+        if (acc.probeConstLeaf(neighbor)) return true;
+    }
+
+    return false;
+}
+
+
+// For every source leaf in range that lies inside the padded index-tree
+// bounding box and has an index-tree leaf at or next to its origin, unions the
+// leaf's active topology into the result tree, provided that the mask tree has
+// no leaf at that origin or its leaf has a different topology.
+template<typename SrcTreeT>
+void
+GenBoundaryMask<SrcTreeT>::operator()(const tbb::blocked_range<size_t>& range)
+{
+    Coord ijk;
+    tree::ValueAccessor<const BoolTreeT> maskAcc(mMaskTree);
+    tree::ValueAccessor<const IntTreeT> idxAcc(mIdxTree);
+    tree::ValueAccessor<BoolTreeT> acc(mTree);
+
+    for (size_t n = range.begin(); n != range.end(); ++n) {
+
+        const typename SrcTreeT::LeafNodeType&
+            leaf = mLeafManager.leaf(n);
+
+        ijk = leaf.origin();
+
+        // Bounding-box test first: it avoids the up to 27 leaf probes done by
+        // neighboringLeaf() for leaves that cannot qualify.
+        if (!mLeafBBox.isInside(ijk) || !neighboringLeaf(ijk, idxAcc)) continue;
+
+        const typename BoolTreeT::LeafNodeType*
+            maskLeaf = maskAcc.probeConstLeaf(ijk);
+
+        if (!maskLeaf || !leaf.hasSameTopology(maskLeaf)) {
+            acc.touchLeaf(ijk)->topologyUnion(leaf);
+        }
+    }
+}
+
+
+////////////////////////////////////////
+
+// parallel_reduce-style functor that fills a boolean tree with true over tile-boundary regions.
+template<typename TreeT>
+class GenTileMask
+{
+public:
+    typedef typename TreeT::template ValueConverter<bool>::Type BoolTreeT;
+
+    typedef typename TreeT::ValueType ValueT;
+
+    //////////
+    /// Each tile is read as (min.x, min.y, min.z, extent); tiles and distTree are stored by reference.
+    GenTileMask(const std::vector<Vec4i>& tiles, const TreeT& distTree, ValueT iso);
+    /// Processes every tile, with tbb::parallel_reduce when threaded is true, otherwise serially.
+    void run(bool threaded = true);
+    /// Returns the mask tree accumulated so far.
+    BoolTreeT& tree() { return mTree; }
+
+    //////////
+    // Members used by tbb::parallel_reduce: splitting constructor, range operator and join.
+    GenTileMask(GenTileMask&, tbb::split);
+    void operator()(const tbb::blocked_range<size_t>&);
+    void join(GenTileMask& rhs) { mTree.merge(rhs.mTree); }
+
+private:
+    // Inputs held by reference: the referenced objects must outlive this functor.
+    const std::vector<Vec4i>& mTiles;
+    const TreeT& mDistTree;
+    ValueT mIsovalue;
+    // Output mask, constructed from the value false.
+    BoolTreeT mTree;
+};
+
+// Stores references to tiles and distTree plus the isovalue; the mask tree is constructed from false.
+template<typename TreeT>
+GenTileMask<TreeT>::GenTileMask(
+    const std::vector<Vec4i>& tiles, const TreeT& distTree, ValueT iso)
+    : mTiles(tiles)
+    , mDistTree(distTree)
+    , mIsovalue(iso)
+    , mTree(false)
+{
+}
+
+// Splitting constructor: shares rhs's inputs and isovalue but gets its own newly constructed mask tree.
+template<typename TreeT>
+GenTileMask<TreeT>::GenTileMask(GenTileMask& rhs, tbb::split)
+    : mTiles(rhs.mTiles)
+    , mDistTree(rhs.mDistTree)
+    , mIsovalue(rhs.mIsovalue)
+    , mTree(false)
+{
+}
+
+// Processes all tiles: through tbb::parallel_reduce when threaded, otherwise directly in the calling thread.
+template<typename TreeT>
+void
+GenTileMask<TreeT>::run(bool threaded)
+{
+    if (threaded) tbb::parallel_reduce(tbb::blocked_range<size_t>(0, mTiles.size()), *this);
+    else (*this)(tbb::blocked_range<size_t>(0, mTiles.size()));
+}
+
+
+template<typename TreeT>
+void
+GenTileMask<TreeT>::operator()(const tbb::blocked_range<size_t>& range)
+{
+    tree::ValueAccessor<const TreeT> distAcc(mDistTree);
+    CoordBBox region, bbox;
+    Coord ijk, nijk;
+    bool processRegion = true;
+    ValueT value;
+    // For each tile in range, fill mTree with true over the boundary regions that need processing.
+
+    for (size_t n = range.begin(); n != range.end(); ++n) {
+
+        const Vec4i& tile = mTiles[n];
+        // A tile is stored as (min.x, min.y, min.z, extent); rebuild its bounding box from that.
+        bbox.min()[0] = tile[0];
+        bbox.min()[1] = tile[1];
+        bbox.min()[2] = tile[2];
+
+        bbox.max() = bbox.min();
+        bbox.max().offset(tile[3]);
+        // Inside/outside state and value depth of the tile, sampled at its minimum corner.
+        const bool thisInside = (distAcc.getValue(bbox.min()) < mIsovalue);
+        const int thisDepth = distAcc.getValueDepth(bbox.min());
+
+        // eval x-edges
+        // +x side: nijk is the voxel one step past the tile's maximum corner along x.
+        ijk = bbox.max();
+        nijk = ijk;
+        ++nijk[0];
+        // Process by default; if the neighbor's value depth does not exceed the tile's, only on a state change.
+        processRegion = true;
+        if (thisDepth >= distAcc.getValueDepth(nijk)) {
+            processRegion = thisInside != (distAcc.getValue(nijk) < mIsovalue);
+        }
+        // Flag the one-voxel-thick slab of the tile at x = max.x.
+
+        if (processRegion) {
+            region = bbox;
+            region.min()[0] = region.max()[0] = ijk[0];
+            mTree.fill(region, true);
+        }
+
+        // -x side: ijk is the voxel one step before the tile's minimum corner along x.
+        ijk = bbox.min();
+        --ijk[0];
+        // Same depth test, but here probeValue() must also return false for the neighbor.
+        processRegion = true;
+        if (thisDepth >= distAcc.getValueDepth(ijk)) {
+            processRegion = !distAcc.probeValue(ijk, value) && thisInside != (value < mIsovalue);
+        }
+        // Flag the slab at x = min.x - 1, spanning the tile's y/z extent.
+        if (processRegion) {
+            region = bbox;
+            region.min()[0] = region.max()[0] = ijk[0];
+            mTree.fill(region, true);
+        }
+
+
+        // eval y-edges
+        // Same two tests as for x, applied along y.
+        ijk = bbox.max();
+        nijk = ijk;
+        ++nijk[1];
+
+        processRegion = true;
+        if (thisDepth >= distAcc.getValueDepth(nijk)) {
+            processRegion = thisInside != (distAcc.getValue(nijk) < mIsovalue);
+        }
+
+        if (processRegion) {
+            region = bbox;
+            region.min()[1] = region.max()[1] = ijk[1];
+            mTree.fill(region, true);
+        }
+
+
+        ijk = bbox.min();
+        --ijk[1];
+
+        processRegion = true;
+        if (thisDepth >= distAcc.getValueDepth(ijk)) {
+            processRegion = !distAcc.probeValue(ijk, value) && thisInside != (value < mIsovalue);
+        }
+
+        if (processRegion) {
+            region = bbox;
+            region.min()[1] = region.max()[1] = ijk[1];
+            mTree.fill(region, true);
+        }
+
+
+        // eval z-edges
+        // Same two tests as for x, applied along z.
+        ijk = bbox.max();
+        nijk = ijk;
+        ++nijk[2];
+
+        processRegion = true;
+        if (thisDepth >= distAcc.getValueDepth(nijk)) {
+            processRegion = thisInside != (distAcc.getValue(nijk) < mIsovalue);
+        }
+
+        if (processRegion) {
+            region = bbox;
+            region.min()[2] = region.max()[2] = ijk[2];
+            mTree.fill(region, true);
+        }
+
+        ijk = bbox.min();
+        --ijk[2];
+
+        processRegion = true;
+        if (thisDepth >= distAcc.getValueDepth(ijk)) {
+            processRegion = !distAcc.probeValue(ijk, value) && thisInside != (value < mIsovalue);
+        }
+
+        if (processRegion) {
+            region = bbox;
+            region.min()[2] = region.max()[2] = ijk[2];
+            mTree.fill(region, true);
+        }
+
+        // Diagonal neighbor in y and z: flags a one-voxel line along x at (min.y - 1, min.z - 1).
+        ijk = bbox.min();
+        --ijk[1];
+        --ijk[2];
+
+        processRegion = true;
+        if (thisDepth >= distAcc.getValueDepth(ijk)) {
+            processRegion = !distAcc.probeValue(ijk, value) && thisInside != (value < mIsovalue);
+        }
+
+        if (processRegion) {
+            region = bbox;
+            region.min()[1] = region.max()[1] = ijk[1];
+            region.min()[2] = region.max()[2] = ijk[2];
+            mTree.fill(region, true);
+        }
+
+        // Diagonal neighbor in x and y: flags a one-voxel line along z at (min.x - 1, min.y - 1).
+        ijk = bbox.min();
+        --ijk[0];
+        --ijk[1];
+
+        processRegion = true;
+        if (thisDepth >= distAcc.getValueDepth(ijk)) {
+            processRegion = !distAcc.probeValue(ijk, value) && thisInside != (value < mIsovalue);
+        }
+
+        if (processRegion) {
+            region = bbox;
+            region.min()[1] = region.max()[1] = ijk[1];
+            region.min()[0] = region.max()[0] = ijk[0];
+            mTree.fill(region, true);
+        }
+        // Diagonal neighbor in x and z: flags a one-voxel line along y at (min.x - 1, min.z - 1).
+        ijk = bbox.min();
+        --ijk[0];
+        --ijk[2];
+
+        processRegion = true;
+        if (thisDepth >= distAcc.getValueDepth(ijk)) {
+            processRegion = !distAcc.probeValue(ijk, value) && thisInside != (value < mIsovalue);
+        }
+
+        if (processRegion) {
+            region = bbox;
+            region.min()[2] = region.max()[2] = ijk[2];
+            region.min()[0] = region.max()[0] = ijk[0];
+            mTree.fill(region, true);
+        }
+    }
+}
+
+
+////////////////////////////////////////
+
+// Collects sign/index data around the active tiles of distTree and merges it into signTree and idxTree.
+template<class DistTreeT, class SignTreeT, class IdxTreeT>
+inline void
+tileData(const DistTreeT& distTree, SignTreeT& signTree, IdxTreeT& idxTree, double iso)
+{
+    typename DistTreeT::ValueOnCIter tileIter(distTree);
+    tileIter.setMaxDepth(DistTreeT::ValueOnCIter::LEAF_DEPTH - 1);
+    // The depth limit stops the iterator one level above the leaves, so it only yields tiles.
+    if (!tileIter) return; // volume has no active tiles.
+    // First pass: count the tiles so the tile list can be sized up front.
+    size_t tileCount = 0;
+    for ( ; tileIter; ++tileIter) {
+        ++tileCount;
+    }
+
+    std::vector<Vec4i> tiles(tileCount);
+    // Second pass: record each tile as (min.x, min.y, min.z, extent along x).
+    tileCount = 0;
+    tileIter = distTree.cbeginValueOn();
+    tileIter.setMaxDepth(DistTreeT::ValueOnCIter::LEAF_DEPTH - 1);
+
+    CoordBBox bbox;
+    for (; tileIter; ++tileIter) {
+        Vec4i& tile = tiles[tileCount++];
+        tileIter.getBoundingBox(bbox);
+        tile[0] = bbox.min()[0];
+        tile[1] = bbox.min()[1];
+        tile[2] = bbox.min()[2];
+        tile[3] = bbox.max()[0] - bbox.min()[0];
+    }
+
+    typename DistTreeT::ValueType isovalue = typename DistTreeT::ValueType(iso);
+    // Build the boolean mask of tile-boundary regions (threaded by default).
+    GenTileMask<DistTreeT> tileMask(tiles, distTree, isovalue);
+    tileMask.run();
+
+    typedef typename DistTreeT::template ValueConverter<bool>::Type BoolTreeT;
+    typedef tree::LeafManager<BoolTreeT> BoolLeafManagerT;
+
+    BoolLeafManagerT leafs(tileMask.tree());
+
+    // Run SignData over the masked leaves, then merge its results into the caller's trees.
+    internal::SignData<DistTreeT, BoolLeafManagerT> op(distTree, leafs, isovalue);
+    op.run();
+
+    signTree.merge(*op.signTree());
+    idxTree.merge(*op.idxTree());
+}
+
+
+////////////////////////////////////////
+
+
+// Utility class for the volumeToMesh wrapper: range functor that copies a PointList into a std::vector.
+class PointListCopy
+{
+public:
+    PointListCopy(const PointList& pointsIn, std::vector<Vec3s>& pointsOut)
+        : mPointsIn(pointsIn) , mPointsOut(pointsOut)
+    {
+    }
+    // Copies elements [range.begin(), range.end()) by index; pointsOut must already be large enough.
+    void operator()(const tbb::blocked_range<size_t>& range) const
+    {
+        for (size_t n = range.begin(); n < range.end(); ++n) {
+            mPointsOut[n] = mPointsIn[n];
+        }
+    }
+    // Both containers are held by reference; neither is owned by this functor.
+private:
+    const PointList& mPointsIn;
+    std::vector<Vec3s>& mPointsOut;
+};
+
+
+// Checks if the isovalue is in proximity to the active voxel boundary.
+template <typename LeafManagerT>
+inline bool
+needsActiveVoxePadding(const LeafManagerT& leafs, double iso, double voxelSize)
+{
+    double interiorWidth = 0.0, exteriorWidth = 0.0;
+    {
+        typename LeafManagerT::TreeType::LeafNodeType::ValueOffCIter it;
+        bool foundInterior = false, foundExterior = false;
+        for (size_t n = 0, N = leafs.leafCount(); n < N; ++n) {
+            // Sample "off" voxel values: a negative one gives the interior width, a positive one the exterior.
+            for (it = leafs.leaf(n).cbeginValueOff(); it; ++it) {
+                double value = double(it.getValue());
+                if (value < 0.0) {
+                    interiorWidth = value;
+                    foundInterior = true;
+                } else if (value > 0.0) {
+                    exteriorWidth = value;
+                    foundExterior = true;
+                }
+                // Stop scanning as soon as one value of each sign has been seen.
+                if (foundInterior && foundExterior) break;
+            }
+
+            if (foundInterior && foundExterior) break;
+        }
+
+    }
+    // True when iso is within two voxel sizes of either sampled width (a width stays 0.0 if never found).
+    double minDist = std::min(std::abs(interiorWidth - iso), std::abs(exteriorWidth - iso));
+    return !(minDist > (2.0 * voxelSize));
+}
+
+
+} // end namespace internal
+
+
+////////////////////////////////////////
+
+// Creates an empty pool: zero quads and triangles, with all four arrays null.
+inline
+PolygonPool::PolygonPool()
+    : mNumQuads(0)
+    , mNumTriangles(0)
+    , mQuads(NULL)
+    , mTriangles(NULL)
+    , mQuadFlags(NULL)
+    , mTriangleFlags(NULL)
+{
+}
+
+// Allocates arrays for numQuads quads and numTriangles triangles, sized from the arguments themselves.
+inline
+PolygonPool::PolygonPool(const size_t numQuads, const size_t numTriangles)
+    : mNumQuads(numQuads)
+    , mNumTriangles(numTriangles)
+    , mQuads(new openvdb::Vec4I[numQuads])
+    , mTriangles(new openvdb::Vec3I[numTriangles])
+    , mQuadFlags(new char[numQuads])
+    , mTriangleFlags(new char[numTriangles])
+{
+}
+
+// Replaces this pool's contents with an element-wise copy of rhs's quads, triangles and their flags.
+inline void
+PolygonPool::copy(const PolygonPool& rhs)
+{
+    if (this == &rhs) return; // self-copy: the resets below would destroy the data before it is read
+    resetQuads(rhs.numQuads());
+    resetTriangles(rhs.numTriangles());
+    for (size_t i = 0; i < mNumQuads; ++i) {
+        mQuads[i] = rhs.mQuads[i];
+        mQuadFlags[i] = rhs.mQuadFlags[i];
+    }
+
+    for (size_t i = 0; i < mNumTriangles; ++i) {
+        mTriangles[i] = rhs.mTriangles[i];
+        mTriangleFlags[i] = rhs.mTriangleFlags[i];
+    }
+}
+
+// Replaces the quad and quad-flag arrays with new arrays of the given size; old contents are not copied.
+inline void
+PolygonPool::resetQuads(size_t size)
+{
+    mNumQuads = size;
+    mQuads.reset(new openvdb::Vec4I[mNumQuads]);
+    mQuadFlags.reset(new char[mNumQuads]);
+}
+
+// Sets the quad count to zero and resets both the quad and quad-flag arrays to null.
+inline void
+PolygonPool::clearQuads()
+{
+    mNumQuads = 0;
+    mQuads.reset(NULL);
+    mQuadFlags.reset(NULL);
+}
+
+// Replaces the triangle and triangle-flag arrays with new arrays of the given size; old contents are not copied.
+inline void
+PolygonPool::resetTriangles(size_t size)
+{
+    mNumTriangles = size;
+    mTriangles.reset(new openvdb::Vec3I[mNumTriangles]);
+    mTriangleFlags.reset(new char[mNumTriangles]);
+}
+
+// Sets the triangle count to zero and resets both the triangle and triangle-flag arrays to null.
+inline void
+PolygonPool::clearTriangles()
+{
+    mNumTriangles = 0;
+    mTriangles.reset(NULL);
+    mTriangleFlags.reset(NULL);
+}
+
+// Shrinks the quad count to n; returns false (no change) unless n is smaller than the current count.
+inline bool
+PolygonPool::trimQuads(const size_t n, bool reallocate)
+{
+    if (!(n < mNumQuads)) return false;
+
+    if (reallocate) {
+        if (n == 0) {
+            // Release both parallel arrays, matching clearQuads(); the flags used to stay allocated.
+            mQuads.reset(NULL);
+            mQuadFlags.reset(NULL);
+        } else {
+            boost::scoped_array<openvdb::Vec4I> quads(new openvdb::Vec4I[n]);
+            boost::scoped_array<char> flags(new char[n]);
+
+            for (size_t i = 0; i < n; ++i) {
+                quads[i] = mQuads[i];
+                flags[i] = mQuadFlags[i];
+            }
+
+            mQuads.swap(quads);
+            mQuadFlags.swap(flags);
+        }
+    }
+    // Without reallocate only the logical count shrinks; the existing storage is kept.
+    mNumQuads = n;
+    return true;
+}
+
+// Shrinks the triangle count to n; returns false (no change) unless n is smaller than the current count.
+inline bool
+PolygonPool::trimTrinagles(const size_t n, bool reallocate)
+{
+    if (!(n < mNumTriangles)) return false;
+
+    if (reallocate) {
+        if (n == 0) {
+            // Release both parallel arrays, matching clearTriangles(); the flags used to stay allocated.
+            mTriangles.reset(NULL);
+            mTriangleFlags.reset(NULL);
+        } else {
+            boost::scoped_array<openvdb::Vec3I> triangles(new openvdb::Vec3I[n]);
+            boost::scoped_array<char> flags(new char[n]);
+
+            for (size_t i = 0; i < n; ++i) {
+                triangles[i] = mTriangles[i];
+                flags[i] = mTriangleFlags[i];
+            }
+
+            mTriangles.swap(triangles);
+            mTriangleFlags.swap(flags);
+        }
+    }
+    // Without reallocate only the logical count shrinks; the existing storage is kept.
+    mNumTriangles = n;
+    return true;
+}
+
+
+////////////////////////////////////////
+
+// Stores the isovalue and primary adaptivity; starts with null grids/trees, one partition, active part 0.
+inline VolumeToMesh::VolumeToMesh(double isovalue, double adaptivity)
+    : mPoints(NULL)
+    , mPolygons()
+    , mPointListSize(0)
+    , mSeamPointListSize(0)
+    , mPolygonPoolListSize(0)
+    , mIsovalue(isovalue)
+    , mPrimAdaptivity(adaptivity)
+    , mSecAdaptivity(0.0)
+    , mRefGrid(GridBase::ConstPtr())
+    , mSurfaceMaskGrid(GridBase::ConstPtr())
+    , mAdaptivityGrid(GridBase::ConstPtr())
+    , mAdaptivityMaskTree(TreeBase::ConstPtr())
+    , mRefSignTree(TreeBase::Ptr())
+    , mRefIdxTree(TreeBase::Ptr())
+    , mInvertSurfaceMask(false)
+    , mPartitions(1)
+    , mActivePart(0)
+    , mQuantizedSeamPoints(NULL)
+    , mPointFlags(0)
+{
+}
+
+// Returns the point array; operator() (re)allocates it to hold pointListSize() elements.
+inline PointList&
+VolumeToMesh::pointList()
+{
+    return mPoints;
+}
+
+// Returns a reference to the number of points currently held in the point array.
+inline const size_t&
+VolumeToMesh::pointListSize() const
+{
+    return mPointListSize;
+}
+
+// Returns the polygon pool array; operator() allocates one PolygonPool per sign-tree leaf.
+inline PolygonPoolList&
+VolumeToMesh::polygonPoolList()
+{
+    return mPolygons;
+}
+
+// Read-only overload: returns the polygon pool array without allowing modification.
+inline const PolygonPoolList&
+VolumeToMesh::polygonPoolList() const
+{
+    return mPolygons;
+}
+
+// Returns a reference to the number of PolygonPool entries in the polygon pool array.
+inline const size_t&
+VolumeToMesh::polygonPoolListSize() const
+{
+    return mPolygonPoolListSize;
+}
+
+// Sets the reference grid and secondary adaptivity, and drops cached data from any previous reference grid.
+inline void
+VolumeToMesh::setRefGrid(const GridBase::ConstPtr& grid, double secAdaptivity)
+{
+    mRefGrid = grid;
+    mSecAdaptivity = secAdaptivity;
+
+    // Clear out old auxiliary data
+    mRefSignTree = TreeBase::Ptr();
+    mRefIdxTree = TreeBase::Ptr();
+    mSeamPointListSize = 0;
+    mQuantizedSeamPoints.reset(NULL);
+}
+
+// Stores the surface mask grid and its invert flag; operator() only uses the mask if it is a matching bool grid.
+inline void
+VolumeToMesh::setSurfaceMask(const GridBase::ConstPtr& mask, bool invertMask)
+{
+    mSurfaceMaskGrid = mask;
+    mInvertSurfaceMask = invertMask;
+}
+
+// Stores the spatial adaptivity grid; operator() only uses it if it is a matching float grid.
+inline void
+VolumeToMesh::setSpatialAdaptivity(const GridBase::ConstPtr& grid)
+{
+    mAdaptivityGrid = grid;
+}
+
+// Stores the adaptivity mask tree; operator() only uses it if it is a matching bool tree.
+inline void
+VolumeToMesh::setAdaptivityMask(const TreeBase::ConstPtr& tree)
+{
+   mAdaptivityMaskTree = tree;
+}
+
+// Sets the partition count (clamped to at least 1) and the active part (clamped to at most partitions - 1).
+inline void
+VolumeToMesh::partition(unsigned partitions, unsigned activePart)
+{
+    mPartitions = std::max(partitions, unsigned(1));
+    mActivePart = std::min(activePart, mPartitions-1);
+}
+
+// Returns the per-point flag vector; operator() clears it and resizes it only when seam points exist.
+inline std::vector<unsigned char>&
+VolumeToMesh::pointFlags()
+{
+    return mPointFlags;
+}
+
+// Read-only overload: returns the per-point flag vector without allowing modification.
+inline const std::vector<unsigned char>&
+VolumeToMesh::pointFlags() const
+{
+    return mPointFlags;
+}
+
+
+template<typename GridT>
+inline void
+VolumeToMesh::operator()(const GridT& distGrid)
+{
+    typedef typename GridT::TreeType DistTreeT;
+    typedef tree::LeafManager<const DistTreeT> DistLeafManagerT;
+    typedef typename DistTreeT::ValueType DistValueT;
+
+    typedef typename DistTreeT::template ValueConverter<bool>::Type BoolTreeT;
+    typedef tree::LeafManager<BoolTreeT> BoolLeafManagerT;
+    typedef Grid<BoolTreeT> BoolGridT;
+
+    typedef typename DistTreeT::template ValueConverter<Int16>::Type Int16TreeT;
+    typedef tree::LeafManager<Int16TreeT> Int16LeafManagerT;
+
+    typedef typename DistTreeT::template ValueConverter<int>::Type IntTreeT;
+    typedef typename DistTreeT::template ValueConverter<float>::Type FloatTreeT;
+    typedef Grid<FloatTreeT> FloatGridT;
+
+
+    const openvdb::math::Transform& transform = distGrid.transform();
+    const DistTreeT& distTree = distGrid.tree();
+    const DistValueT isovalue = DistValueT(mIsovalue);
+
+    typename Int16TreeT::Ptr signTreePt;
+    typename IntTreeT::Ptr idxTreePt;
+    typename BoolTreeT::Ptr pointMask;
+
+    BoolTreeT valueMask(false), seamMask(false);
+    const bool adaptive = mPrimAdaptivity > 1e-7 || mSecAdaptivity > 1e-7;
+    bool maskEdges = false;
+
+
+    const BoolGridT * surfaceMask = NULL;
+    if (mSurfaceMaskGrid && mSurfaceMaskGrid->type() == BoolGridT::gridType()) {
+        surfaceMask = static_cast<const BoolGridT*>(mSurfaceMaskGrid.get());
+    }
+
+    const FloatGridT * adaptivityField = NULL;
+    if (mAdaptivityGrid && mAdaptivityGrid->type() == FloatGridT::gridType()) {
+        adaptivityField = static_cast<const FloatGridT*>(mAdaptivityGrid.get());
+    }
+
+    if (mAdaptivityMaskTree && mAdaptivityMaskTree->type() == BoolTreeT::treeType()) {
+        const BoolTreeT *adaptivityMaskPt =
+            static_cast<const BoolTreeT*>(mAdaptivityMaskTree.get());
+        seamMask.topologyUnion(*adaptivityMaskPt);
+    }
+
+
+    // Collect auxiliary data
+    {
+        DistLeafManagerT distLeafs(distTree);
+
+        // Check if the isovalue is in proximity to the active voxel boundary.
+        bool padActiveVoxels = false;
+        int padVoxels = 3;
+
+        if (distGrid.getGridClass() != GRID_LEVEL_SET) {
+            padActiveVoxels = true;
+        } else {
+            padActiveVoxels = internal::needsActiveVoxePadding(distLeafs,
+                mIsovalue, transform.voxelSize()[0]);
+        }
+
+        // Always pad the active region for small volumes (the performance hit is negligible).
+        if (!padActiveVoxels) {
+            Coord dim;
+            distTree.evalActiveVoxelDim(dim);
+            int maxDim = std::max(std::max(dim[0], dim[1]), dim[2]);
+            if (maxDim < 1000) {
+                padActiveVoxels = true;
+                padVoxels = 1;
+            }
+        }
+
+        if (surfaceMask || mPartitions > 1) {
+
+            maskEdges = true;
+
+            if (surfaceMask) {
+
+                { // Mask
+                    internal::GenTopologyMask<DistTreeT> masking(
+                        *surfaceMask, distLeafs, transform, mInvertSurfaceMask);
+                    masking.run();
+                    valueMask.merge(masking.tree());
+                }
+
+                if (mPartitions > 1) { // Partition
+                    tree::LeafManager<BoolTreeT> leafs(valueMask);
+                    leafs.foreach(internal::PartOp(leafs.leafCount() , mPartitions, mActivePart));
+                    tools::pruneInactive(valueMask);
+                }
+
+            } else { // Partition
+
+                internal::PartGen<DistTreeT> partitioner(distLeafs, mPartitions, mActivePart);
+                partitioner.run();
+                valueMask.merge(partitioner.tree());
+            }
+
+            {
+                if (padActiveVoxels) tools::dilateVoxels(valueMask, padVoxels);
+                BoolLeafManagerT leafs(valueMask);
+
+                internal::SignData<DistTreeT, BoolLeafManagerT>
+                    signDataOp(distTree, leafs, isovalue);
+                signDataOp.run();
+
+                signTreePt = signDataOp.signTree();
+                idxTreePt = signDataOp.idxTree();
+            }
+
+            {
+                internal::GenBoundaryMask<DistTreeT> boundary(distLeafs, valueMask, *idxTreePt);
+                boundary.run();
+
+                BoolLeafManagerT bleafs(boundary.tree());
+
+                internal::SignData<DistTreeT, BoolLeafManagerT>
+                    signDataOp(distTree, bleafs, isovalue);
+                signDataOp.run();
+
+                signTreePt->merge(*signDataOp.signTree());
+                idxTreePt->merge(*signDataOp.idxTree());
+            }
+
+        } else {
+
+            // Collect voxel-sign configurations
+            if (padActiveVoxels) {
+
+                BoolTreeT regionMask(false);
+                regionMask.topologyUnion(distTree);
+                tools::dilateVoxels(regionMask, padVoxels);
+
+                BoolLeafManagerT leafs(regionMask);
+
+                internal::SignData<DistTreeT, BoolLeafManagerT>
+                    signDataOp(distTree, leafs, isovalue);
+                signDataOp.run();
+
+                signTreePt = signDataOp.signTree();
+                idxTreePt = signDataOp.idxTree();
+            } else {
+
+                internal::SignData<DistTreeT, DistLeafManagerT>
+                    signDataOp(distTree, distLeafs, isovalue);
+                signDataOp.run();
+
+                signTreePt = signDataOp.signTree();
+                idxTreePt = signDataOp.idxTree();
+            }
+        }
+
+    }
+
+
+    // Collect auxiliary data from active tiles
+    internal::tileData(distTree, *signTreePt, *idxTreePt, static_cast<double>(isovalue));
+
+    // Optionally collect auxiliary data from a reference level set.
+    Int16TreeT *refSignTreePt = NULL;
+    IntTreeT *refIdxTreePt = NULL;
+    const DistTreeT *refDistTreePt = NULL;
+
+    if (mRefGrid && mRefGrid->type() == GridT::gridType()) {
+
+        const GridT* refGrid = static_cast<const GridT*>(mRefGrid.get());
+        refDistTreePt = &refGrid->tree();
+
+        // Collect and cache auxiliary data from the reference grid.
+        if (!mRefSignTree && !mRefIdxTree) {
+
+            DistLeafManagerT refDistLeafs(*refDistTreePt);
+            internal::SignData<DistTreeT, DistLeafManagerT>
+                signDataOp(*refDistTreePt, refDistLeafs, isovalue);
+
+            signDataOp.run();
+
+            mRefSignTree = signDataOp.signTree();
+            mRefIdxTree = signDataOp.idxTree();
+        }
+
+        // Get cached auxiliary data
+        if (mRefSignTree && mRefIdxTree) {
+            refSignTreePt = static_cast<Int16TreeT*>(mRefSignTree.get());
+            refIdxTreePt = static_cast<IntTreeT*>(mRefIdxTree.get());
+        }
+    }
+
+
+    // Process auxiliary data
+    Int16LeafManagerT signLeafs(*signTreePt);
+
+    if (maskEdges) {
+        signLeafs.foreach(internal::MaskEdges<BoolTreeT>(valueMask));
+        valueMask.clear();
+    }
+
+
+    // Generate the seamline mask
+    if (refSignTreePt) {
+        internal::GenSeamMask<Int16TreeT, Int16LeafManagerT> seamOp(signLeafs, *refSignTreePt);
+        seamOp.run();
+
+        tools::dilateVoxels(seamOp.mask(), 3);
+        signLeafs.foreach(internal::TagSeamEdges<BoolTreeT>(seamOp.mask()));
+
+        seamMask.merge(seamOp.mask());
+    }
+
+
+    std::vector<size_t> regions(signLeafs.leafCount(), 0);    
+    if (regions.empty()) {
+        mPointListSize = 0;
+        mPoints.reset();
+        mPolygonPoolListSize = 0;
+        mPolygons.reset();
+        mPointFlags.clear();
+        return;
+    }
+
+    if (adaptive) {
+
+        internal::MergeVoxelRegions<DistTreeT, Int16LeafManagerT> merge(
+            signLeafs, *signTreePt, distTree, *idxTreePt, isovalue, DistValueT(mPrimAdaptivity));
+
+        if (adaptivityField) {
+            merge.setSpatialAdaptivity(transform, *adaptivityField);
+        }
+
+        if (refSignTreePt || mAdaptivityMaskTree) {
+            merge.setAdaptivityMask(&seamMask);
+        }
+
+        if (refSignTreePt) {
+            merge.setRefData(refSignTreePt, DistValueT(mSecAdaptivity));
+        }
+
+        merge.run();
+
+        signLeafs.foreach(internal::CountRegions<IntTreeT>(*idxTreePt, regions));
+
+    } else {
+        signLeafs.foreach(internal::CountPoints(regions));
+    }
+
+
+    {
+        mPointListSize = 0;
+        size_t tmp = 0;
+        for (size_t n = 0, N = regions.size(); n < N; ++n) {
+            tmp = regions[n];
+            regions[n] = mPointListSize;
+            mPointListSize += tmp;
+        }
+    }
+
+
+    // Generate the unique point list
+    mPoints.reset(new openvdb::Vec3s[mPointListSize]);
+    mPointFlags.clear();
+
+    // Generate seam line sample points
+    if (refSignTreePt && refIdxTreePt) {
+
+        if (mSeamPointListSize == 0) {
+
+            std::vector<size_t> pointMap;
+
+            {
+                Int16LeafManagerT refSignLeafs(*refSignTreePt);
+                pointMap.resize(refSignLeafs.leafCount(), 0);
+
+                refSignLeafs.foreach(internal::CountPoints(pointMap));
+
+                size_t tmp = 0;
+                for (size_t n = 0, N = pointMap.size(); n < N; ++n) {
+                    tmp = pointMap[n];
+                    pointMap[n] = mSeamPointListSize;
+                    mSeamPointListSize += tmp;
+                }
+            }
+
+            if (!pointMap.empty() && mSeamPointListSize != 0) {
+
+                mQuantizedSeamPoints.reset(new uint32_t[mSeamPointListSize]);
+                memset(mQuantizedSeamPoints.get(), 0, sizeof(uint32_t) * mSeamPointListSize);
+
+                typedef tree::LeafManager<IntTreeT> IntLeafManagerT;
+
+                IntLeafManagerT refIdxLeafs(*refIdxTreePt);
+                refIdxLeafs.foreach(internal::MapPoints<Int16TreeT>(pointMap, *refSignTreePt));
+            }
+        }
+
+        if (mSeamPointListSize != 0) {
+            signLeafs.foreach(internal::SeamWeights<DistTreeT>(
+                distTree, *refSignTreePt, *refIdxTreePt, mQuantizedSeamPoints, mIsovalue));
+        }
+    }
+
+
+    internal::GenPoints<DistTreeT, Int16LeafManagerT>
+        pointOp(signLeafs, distTree, *idxTreePt, mPoints, regions, transform, mIsovalue);
+
+
+    if (mSeamPointListSize != 0) {
+        mPointFlags.resize(mPointListSize);
+        pointOp.setRefData(refSignTreePt, refDistTreePt, refIdxTreePt,
+            &mQuantizedSeamPoints, &mPointFlags);
+    }
+
+    pointOp.run();
+
+
+    mPolygonPoolListSize = signLeafs.leafCount();
+    mPolygons.reset(new PolygonPool[mPolygonPoolListSize]);
+
+
+    if (adaptive) {
+
+        internal::GenPolygons<Int16LeafManagerT, internal::AdaptivePrimBuilder>
+            mesher(signLeafs, *signTreePt, *idxTreePt, mPolygons, Index32(mPointListSize));
+
+        mesher.setRefSignTree(refSignTreePt);
+        mesher.run();
+
+    } else {
+
+        internal::GenPolygons<Int16LeafManagerT, internal::UniformPrimBuilder>
+            mesher(signLeafs, *signTreePt, *idxTreePt, mPolygons, Index32(mPointListSize));
+
+        mesher.setRefSignTree(refSignTreePt);
+        mesher.run();
+    }
+
+    // Clean up unused points, only necessary if masking and/or
+    // automatic mesh partitioning is enabled.
+    if ((surfaceMask || mPartitions > 1) && mPointListSize > 0) {
+
+        // Flag used points
+        std::vector<unsigned char> usedPointMask(mPointListSize, 0);
+
+        internal::FlagUsedPoints flagPoints(mPolygons, mPolygonPoolListSize, usedPointMask);
+        flagPoints.run();
+
+        // Create index map
+        std::vector<unsigned> indexMap(mPointListSize);
+        size_t usedPointCount = 0;
+        for (size_t p = 0; p < mPointListSize; ++p) {
+            if (usedPointMask[p]) indexMap[p] = static_cast<unsigned>(usedPointCount++);
+        }
+
+        if (usedPointCount < mPointListSize) {
+
+            // move points
+            internal::UniquePtr<openvdb::Vec3s>::type
+                newPointList(new openvdb::Vec3s[usedPointCount]);
+
+            internal::MovePoints movePoints(newPointList, mPoints, indexMap, usedPointMask);
+            movePoints.run();
+
+            mPointListSize = usedPointCount;
+            mPoints.reset(newPointList.release());
+
+            // update primitives
+            internal::RemapIndices remap(mPolygons, mPolygonPoolListSize, indexMap);
+            remap.run();
+        }
+    }
+
+
+    // Subdivide nonplanar quads near the seamline edges
+    // todo: thread and clean up
+    if (refSignTreePt || refIdxTreePt || refDistTreePt) {
+        std::vector<Vec3s> newPoints;
+
+        for (size_t n = 0; n <  mPolygonPoolListSize; ++n) {
+
+            PolygonPool& polygons = mPolygons[n];
+
+            std::vector<size_t> nonPlanarQuads;
+            nonPlanarQuads.reserve(polygons.numQuads());
+
+            for (size_t i = 0; i < polygons.numQuads(); ++i) {
+
+                char& flags = polygons.quadFlags(i);
+
+                if ((flags & POLYFLAG_FRACTURE_SEAM) && !(flags & POLYFLAG_EXTERIOR)) {
+
+                    openvdb::Vec4I& quad = polygons.quad(i);
+
+                    const bool edgePoly = mPointFlags[quad[0]] || mPointFlags[quad[1]]
+                        || mPointFlags[quad[2]] || mPointFlags[quad[3]];
+
+                    if (!edgePoly) continue;
+
+                    const Vec3s& p0 = mPoints[quad[0]];
+                    const Vec3s& p1 = mPoints[quad[1]];
+                    const Vec3s& p2 = mPoints[quad[2]];
+                    const Vec3s& p3 = mPoints[quad[3]];
+
+                    if (!internal::isPlanarQuad(p0, p1, p2, p3, 1e-6f)) {
+                        nonPlanarQuads.push_back(i);
+                    }
+                }
+            }
+
+
+            if (!nonPlanarQuads.empty()) {
+
+                PolygonPool tmpPolygons;
+
+                tmpPolygons.resetQuads(polygons.numQuads() - nonPlanarQuads.size());
+                tmpPolygons.resetTriangles(polygons.numTriangles() + 4 * nonPlanarQuads.size());
+
+                size_t triangleIdx = 0;
+                for (size_t i = 0; i < nonPlanarQuads.size(); ++i) {
+
+                    size_t& quadIdx = nonPlanarQuads[i];
+
+                    openvdb::Vec4I& quad = polygons.quad(quadIdx);
+                    char& quadFlags = polygons.quadFlags(quadIdx);
+                    //quadFlags |= POLYFLAG_SUBDIVIDED;
+
+                    Vec3s centroid = (mPoints[quad[0]] + mPoints[quad[1]] +
+                        mPoints[quad[2]] + mPoints[quad[3]]) * 0.25;
+
+                    size_t pointIdx = newPoints.size() + mPointListSize;
+
+                    newPoints.push_back(centroid);
+
+
+                    {
+                        Vec3I& triangle = tmpPolygons.triangle(triangleIdx);
+
+                        triangle[0] = quad[0];
+                        triangle[1] = static_cast<unsigned>(pointIdx);
+                        triangle[2] = quad[3];
+
+                        tmpPolygons.triangleFlags(triangleIdx) = quadFlags;
+
+                        if (mPointFlags[triangle[0]] || mPointFlags[triangle[2]]) {
+                            tmpPolygons.triangleFlags(triangleIdx) |= POLYFLAG_SUBDIVIDED;
+                        }
+                    }
+
+                    ++triangleIdx;
+
+                    {
+                        Vec3I& triangle = tmpPolygons.triangle(triangleIdx);
+
+                        triangle[0] = quad[0];
+                        triangle[1] = quad[1];
+                        triangle[2] = static_cast<unsigned>(pointIdx);
+
+                        tmpPolygons.triangleFlags(triangleIdx) = quadFlags;
+
+                        if (mPointFlags[triangle[0]] || mPointFlags[triangle[1]]) {
+                            tmpPolygons.triangleFlags(triangleIdx) |= POLYFLAG_SUBDIVIDED;
+                        }
+                    }
+
+                    ++triangleIdx;
+
+                    {
+                        Vec3I& triangle = tmpPolygons.triangle(triangleIdx);
+
+                        triangle[0] = quad[1];
+                        triangle[1] = quad[2];
+                        triangle[2] = static_cast<unsigned>(pointIdx);
+
+                        tmpPolygons.triangleFlags(triangleIdx) = quadFlags;
+
+                        if (mPointFlags[triangle[0]] || mPointFlags[triangle[1]]) {
+                            tmpPolygons.triangleFlags(triangleIdx) |= POLYFLAG_SUBDIVIDED;
+                        }
+                    }
+
+
+                    ++triangleIdx;
+
+                    {
+                        Vec3I& triangle = tmpPolygons.triangle(triangleIdx);
+
+                        triangle[0] = quad[2];
+                        triangle[1] = quad[3];
+                        triangle[2] = static_cast<unsigned>(pointIdx);
+
+                        tmpPolygons.triangleFlags(triangleIdx) = quadFlags;
+
+                        if (mPointFlags[triangle[0]] || mPointFlags[triangle[1]]) {
+                            tmpPolygons.triangleFlags(triangleIdx) |= POLYFLAG_SUBDIVIDED;
+                        }
+                    }
+
+                    ++triangleIdx;
+
+                    quad[0] = util::INVALID_IDX;
+                }
+
+
+                for (size_t i = 0; i < polygons.numTriangles(); ++i) {
+                    tmpPolygons.triangle(triangleIdx) = polygons.triangle(i);
+                    tmpPolygons.triangleFlags(triangleIdx) = polygons.triangleFlags(i);
+                    ++triangleIdx;
+                }
+
+
+                size_t quadIdx = 0;
+                for (size_t i = 0; i < polygons.numQuads(); ++i) {
+                    openvdb::Vec4I& quad = polygons.quad(i);
+
+                    if (quad[0] != util::INVALID_IDX) {
+                        tmpPolygons.quad(quadIdx) = quad;
+                        tmpPolygons.quadFlags(quadIdx) = polygons.quadFlags(i);
+                        ++quadIdx;
+                    }
+                }
+
+
+                polygons.copy(tmpPolygons);
+            }
+
+        }
+
+
+        if (!newPoints.empty()) {
+
+            size_t newPointCount = newPoints.size() + mPointListSize;
+
+            internal::UniquePtr<openvdb::Vec3s>::type
+                newPointList(new openvdb::Vec3s[newPointCount]);
+
+            for (size_t i = 0; i < mPointListSize; ++i) {
+                newPointList.get()[i] = mPoints[i];
+            }
+
+            for (size_t i = mPointListSize; i < newPointCount; ++i) {
+                newPointList.get()[i] = newPoints[i - mPointListSize];
+            }
+
+            mPointListSize = newPointCount;
+            mPoints.reset(newPointList.release());
+            mPointFlags.resize(mPointListSize, 0);
+        }
+    }
+}
+
+
+////////////////////////////////////////
+
+
+/// @internal Scalar-ValueType overload: meshes @a grid with VolumeToMesh and
+/// flattens the mesher's point list and polygon pools into the output vectors.
+template<typename GridType>
+inline typename boost::enable_if<boost::is_scalar<typename GridType::ValueType>, void>::type
+doVolumeToMesh(
+    const GridType& grid,
+    std::vector<Vec3s>& points,
+    std::vector<Vec3I>& triangles,
+    std::vector<Vec4I>& quads,
+    double isovalue,
+    double adaptivity)
+{
+    VolumeToMesh mesher(isovalue, adaptivity);
+    mesher(grid);
+
+    // Size the output point array to the mesher's point count, run the copy
+    // functor over it with tbb::parallel_for, then reset the mesher's point list.
+    points.clear();
+    points.resize(mesher.pointListSize());
+
+    {
+        internal::PointListCopy copyOp(mesher.pointList(), points);
+        tbb::parallel_for(tbb::blocked_range<size_t>(0, points.size()), copyOp);
+        mesher.pointList().reset(NULL);
+    }
+
+    PolygonPoolList& pools = mesher.polygonPoolList();
+    const size_t poolCount = mesher.polygonPoolListSize();
+
+    // First pass: total up the primitives so each output vector is sized once.
+    size_t triangleTotal = 0, quadTotal = 0;
+    for (size_t p = 0; p < poolCount; ++p) {
+        openvdb::tools::PolygonPool& pool = pools[p];
+        triangleTotal += pool.numTriangles();
+        quadTotal += pool.numQuads();
+    }
+
+    triangles.clear();
+    triangles.resize(triangleTotal);
+    quads.clear();
+    quads.resize(quadTotal);
+
+    // Second pass: append every pool's quads, then its triangles, in pool order.
+    size_t nextQuad = 0, nextTriangle = 0;
+    for (size_t p = 0; p < poolCount; ++p) {
+        openvdb::tools::PolygonPool& pool = pools[p];
+        for (size_t q = 0, qEnd = pool.numQuads(); q != qEnd; ++q, ++nextQuad) {
+            quads[nextQuad] = pool.quad(q);
+        }
+        for (size_t t = 0, tEnd = pool.numTriangles(); t != tEnd; ++t, ++nextTriangle) {
+            triangles[nextTriangle] = pool.triangle(t);
+        }
+    }
+}
+
+/// @internal Overload for grids whose ValueType is not scalar: always raises a TypeError.
+template<typename GridType>
+inline typename boost::disable_if<boost::is_scalar<typename GridType::ValueType>, void>::type
+doVolumeToMesh(
+    const GridType&,
+    std::vector<Vec3s>&,
+    std::vector<Vec3I>&,
+    std::vector<Vec4I>&,
+    double,
+    double)
+{
+    OPENVDB_THROW(TypeError, "volume to mesh conversion is supported only for scalar grids");
+}
+
+
+template<typename GridType>
+inline void
+volumeToMesh(
+    const GridType& grid,
+    std::vector<Vec3s>& points,
+    std::vector<Vec3I>& triangles,
+    std::vector<Vec4I>& quads,
+    double isovalue,
+    double adaptivity)
+{   // Public entry point: forwards every argument unchanged to doVolumeToMesh().
+    doVolumeToMesh(grid, points, triangles, quads, isovalue, adaptivity);
+}
+
+
+template<typename GridType>
+inline void
+volumeToMesh(
+    const GridType& grid,
+    std::vector<Vec3s>& points,
+    std::vector<Vec4I>& quads,
+    double isovalue)
+{   // Quads-only variant: adaptivity is 0.0 and the local triangle list is discarded.
+    std::vector<Vec3I> triangles;
+    doVolumeToMesh(grid, points, triangles, quads, isovalue, 0.0);
+}
+
+
+////////////////////////////////////////
+
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_VOLUME_TO_MESH_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tools/VolumeToSpheres.h b/nuparu/include/openvdb_new/tools/VolumeToSpheres.h
new file mode 100644
index 00000000..fe94293d
--- /dev/null
+++ b/nuparu/include/openvdb_new/tools/VolumeToSpheres.h
@@ -0,0 +1,1034 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_TOOLS_VOLUME_TO_SPHERES_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_VOLUME_TO_SPHERES_HAS_BEEN_INCLUDED
+
+#include <openvdb/tree/ValueAccessor.h>
+#include <openvdb/tree/LeafManager.h>
+#include <openvdb/tools/Morphology.h> // for erodeVoxels()
+
+#include <openvdb/tools/PointScatter.h>
+#include <openvdb/tools/LevelSetUtil.h>
+#include <openvdb/tools/VolumeToMesh.h>
+
+#include <boost/scoped_array.hpp>
+#include <boost/scoped_ptr.hpp>
+#include <tbb/blocked_range.h>
+#include <tbb/parallel_for.h>
+#include <tbb/parallel_reduce.h>
+
+#include <vector>
+#include <limits> // std::numeric_limits
+
+//////////
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+
+/// @brief  Threaded method to fill a closed level set or fog volume
+///         with adaptively sized spheres.
+///
+/// @param grid             a scalar grid to fill with spheres.
+///
+/// @param spheres          a @c Vec4 array representing the spheres returned by this
+///                         method. The first three components specify the sphere center
+///                         and the fourth is the radius. The spheres in this array are
+///                         ordered by radius, biggest to smallest.
+///
+/// @param maxSphereCount   no more than this number of spheres are generated.
+///
+/// @param overlapping      toggle to allow spheres to overlap/intersect
+///
+/// @param minRadius        determines the smallest sphere size in voxel units.
+///
+/// @param maxRadius        determines the largest sphere size in voxel units.
+///
+/// @param isovalue         the crossing point of the volume values that is considered
+///                         the surface. The zero default value works for signed distance
+///                         fields, while fog volumes require a larger positive value;
+///                         0.5 is a good initial guess.
+///
+/// @param instanceCount    how many interior points to consider for the sphere placement;
+///                         increasing this count increases the chances of finding optimal
+///                         sphere sizes.
+///
+/// @param interrupter      a pointer adhering to the util::NullInterrupter interface (may be NULL)
+///
+template<typename GridT, typename InterrupterT>
+inline void
+fillWithSpheres(
+    const GridT& grid,
+    std::vector<openvdb::Vec4s>& spheres,
+    int maxSphereCount,
+    bool overlapping = false,
+    float minRadius = 1.0,
+    float maxRadius = std::numeric_limits<float>::max(),
+    float isovalue = 0.0,
+    int instanceCount = 10000,
+    InterrupterT* interrupter = NULL);
+
+
+/// @brief  Convenience overload of @c fillWithSpheres that takes no interrupter
+///         argument and forwards with @c InterrupterT = util::NullInterrupter.
+template<typename GridT>
+inline void
+fillWithSpheres(
+    const GridT& grid,
+    std::vector<openvdb::Vec4s>& spheres,
+    int maxSphereCount,
+    bool overlapping = false,
+    float minRadius = 1.0,
+    float maxRadius = std::numeric_limits<float>::max(),
+    float isovalue = 0.0,
+    int instanceCount = 10000)
+{
+    fillWithSpheres<GridT, util::NullInterrupter>(grid, spheres,
+        maxSphereCount, overlapping, minRadius, maxRadius, isovalue, instanceCount);
+}
+
+
+////////////////////////////////////////
+
+
+/// @brief  Accelerated closest surface point queries for narrow band level sets.
+///         Supports queries that originate at arbitrary world-space locations; it is
+///         not confined to the narrow band region of the input volume geometry.
+template<typename GridT>
+class ClosestSurfacePoint
+{
+public:
+    typedef typename GridT::TreeType TreeT;
+    typedef typename TreeT::template ValueConverter<int>::Type IntTreeT;
+    typedef typename TreeT::template ValueConverter<Int16>::Type Int16TreeT;
+
+    /// @brief  Constructs an instance flagged as not yet initialized.
+    ClosestSurfacePoint();
+
+
+    /// @brief  Extracts the surface points and constructs a spatial acceleration structure.
+    ///
+    /// @param grid             a scalar grid, level set or fog volume.
+    ///
+    /// @param isovalue         the crossing point of the volume values that is considered
+    ///                         the surface. The zero default value works for signed distance
+    ///                         fields, while fog volumes require a larger positive value;
+    ///                         0.5 is a good initial guess.
+    ///
+    /// @param interrupter      a pointer adhering to the util::NullInterrupter interface.
+    ///
+    template<typename InterrupterT>
+    void initialize(const GridT& grid, float isovalue = 0.0, InterrupterT* interrupter = NULL);
+
+
+    /// @brief  @c initialize method variant that automatically infers
+    ///         the util::NullInterrupter.
+    void initialize(const GridT& grid, float isovalue = 0.0);
+
+
+
+    /// @brief Computes distance to closest surface.
+    ///
+    /// @param points       search locations in world space.
+    ///
+    /// @param distances    list of closest surface point distances, populated by this method.
+    /// @return NOTE(review): defined outside this view; presumably false on failure -- confirm.
+    bool search(const std::vector<Vec3R>& points, std::vector<float>& distances);
+
+
+    /// @brief Performs closest point searches.
+    ///
+    /// @param points       search locations in world space to be replaced by their closest
+    ///                     surface point.
+    ///
+    /// @param distances    list of closest surface point distances, populated by this method.
+    /// @return NOTE(review): defined outside this view; presumably false on failure -- confirm.
+    bool searchAndReplace(std::vector<Vec3R>& points, std::vector<float>& distances);
+
+
+    /// @{
+    /// @brief Tree accessors (these dereference the tree pointers assigned by @c initialize)
+    const IntTreeT& indexTree() const { return *mIdxTreePt; }
+    const Int16TreeT& signTree() const { return *mSignTreePt; }
+    /// @}
+
+private:
+    typedef typename IntTreeT::LeafNodeType IntLeafT;
+    typedef std::pair<size_t, size_t> IndexRange;
+
+    bool mIsInitialized;
+    std::vector<Vec4R> mLeafBoundingSpheres, mNodeBoundingSpheres;
+    std::vector<IndexRange> mLeafRanges;
+    std::vector<const IntLeafT*> mLeafNodes;
+    PointList mSurfacePointList;
+    size_t mPointListSize, mMaxNodeLeafs;
+    float mMaxRadiusSqr;
+    typename IntTreeT::Ptr mIdxTreePt;
+    typename Int16TreeT::Ptr mSignTreePt;
+
+    bool search(std::vector<Vec3R>&, std::vector<float>&, bool transformPoints);
+};
+
+
+////////////////////////////////////////
+
+
+
+
+// Internal utility methods
+
+
+namespace internal {
+
+/// Point sink exposing @c add(), which appends each position to a
+/// caller-owned vector; only a reference to that vector is stored.
+struct PointAccessor
+{
+    PointAccessor(std::vector<Vec3R>& points): mPoints(points) {}
+
+    /// Append @a pos to the end of the referenced vector.
+    void add(const Vec3R& pos) { mPoints.push_back(pos); }
+
+private:
+    // Held by reference, so the vector passed to the constructor must
+    // outlive this accessor.
+    std::vector<Vec3R>& mPoints;
+};
+
+
+/// @brief Functor that computes one bounding sphere per leaf node from the
+/// surface points referenced by that leaf's active values.
+/// Results are written into the caller-supplied sphere vector, indexed by leaf.
+template<typename IntLeafT>
+class LeafBS
+{
+public:
+    /// All arguments are stored by reference and must outlive this object.
+    LeafBS(std::vector<Vec4R>& leafBoundingSpheres,
+        const std::vector<const IntLeafT*>& leafNodes,
+        const math::Transform& transform,
+        const PointList& surfacePointList);
+
+    /// Process every leaf, through tbb::parallel_for when @a threaded is true.
+    void run(bool threaded = true);
+
+    /// Write the bounding sphere of each leaf whose index lies in the given range.
+    void operator()(const tbb::blocked_range<size_t>&) const;
+
+private:
+    std::vector<Vec4R>& mLeafBoundingSpheres;
+    const std::vector<const IntLeafT*>& mLeafNodes;
+    const math::Transform& mTransform; // stored, but not read by any member below
+    const PointList& mSurfacePointList;
+};
+
+template<typename IntLeafT>
+LeafBS<IntLeafT>::LeafBS(
+    std::vector<Vec4R>& leafBoundingSpheres,
+    const std::vector<const IntLeafT*>& leafNodes,
+    const math::Transform& transform,
+    const PointList& surfacePointList)
+    : mLeafBoundingSpheres(leafBoundingSpheres), mLeafNodes(leafNodes)
+    , mTransform(transform), mSurfacePointList(surfacePointList)
+{
+}
+
+template<typename IntLeafT>
+void
+LeafBS<IntLeafT>::run(bool threaded)
+{
+    const tbb::blocked_range<size_t> allLeaves(0, mLeafNodes.size());
+    if (!threaded) {
+        (*this)(allLeaves);
+        return;
+    }
+    tbb::parallel_for(allLeaves, *this);
+}
+
+template<typename IntLeafT>
+void
+LeafBS<IntLeafT>::operator()(const tbb::blocked_range<size_t>& range) const
+{
+    typedef typename IntLeafT::ValueOnCIter ValueOnCIter;
+
+    for (size_t leafIdx = range.begin(); leafIdx != range.end(); ++leafIdx) {
+        const IntLeafT& leaf = *mLeafNodes[leafIdx];
+
+        // Centroid of the surface points referenced by the leaf's active values.
+        Vec3s centroid(0.0f, 0.0f, 0.0f);
+        int pointCount = 0;
+        for (ValueOnCIter it = leaf.cbeginValueOn(); it; ++it, ++pointCount) {
+            centroid += mSurfacePointList[it.getValue()];
+        }
+
+        if (pointCount > 1) centroid *= float(1.0 / double(pointCount));
+
+        // Largest squared distance from the centroid to any of those points.
+        float maxDistSqr = 0.0;
+        for (ValueOnCIter it = leaf.cbeginValueOn(); it; ++it) {
+            const float distSqr = (mSurfacePointList[it.getValue()] - centroid).lengthSqr();
+            if (distSqr > maxDistSqr) maxDistSqr = distSqr;
+        }
+
+        // Center in xyz; the fourth component stores twice that squared distance.
+        Vec4R& sphere = mLeafBoundingSpheres[leafIdx];
+        sphere[0] = centroid[0];
+        sphere[1] = centroid[1];
+        sphere[2] = centroid[2];
+        sphere[3] = maxDistSqr * 2.0; // padded radius
+    }
+}
+
+
+/// @brief Functor that merges the leaf bounding spheres of each index range in
+/// @c leafRanges into a single bounding sphere per range.
+class NodeBS
+{
+public:
+    typedef std::pair<size_t, size_t> IndexRange;
+
+    /// All arguments are stored by reference and must outlive this object.
+    NodeBS(std::vector<Vec4R>& nodeBoundingSpheres,
+        const std::vector<IndexRange>& leafRanges,
+        const std::vector<Vec4R>& leafBoundingSpheres);
+
+    /// Process every range, through tbb::parallel_for when @a threaded is true.
+    inline void run(bool threaded = true);
+
+    /// Write the merged sphere of each leaf range whose index lies in the given range.
+    inline void operator()(const tbb::blocked_range<size_t>&) const;
+
+private:
+    std::vector<Vec4R>& mNodeBoundingSpheres;
+    const std::vector<IndexRange>& mLeafRanges;
+    const std::vector<Vec4R>& mLeafBoundingSpheres;
+};
+
+inline
+NodeBS::NodeBS(std::vector<Vec4R>& nodeBoundingSpheres,
+    const std::vector<IndexRange>& leafRanges,
+    const std::vector<Vec4R>& leafBoundingSpheres)
+    : mNodeBoundingSpheres(nodeBoundingSpheres)
+    , mLeafRanges(leafRanges)
+    , mLeafBoundingSpheres(leafBoundingSpheres)
+{
+}
+
+inline void
+NodeBS::run(bool threaded)
+{
+    const tbb::blocked_range<size_t> allRanges(0, mLeafRanges.size());
+    if (!threaded) {
+        (*this)(allRanges);
+        return;
+    }
+    tbb::parallel_for(allRanges, *this);
+}
+
+inline void
+NodeBS::operator()(const tbb::blocked_range<size_t>& range) const
+{
+    for (size_t nodeIdx = range.begin(); nodeIdx != range.end(); ++nodeIdx) {
+        const IndexRange& leaves = mLeafRanges[nodeIdx];
+
+        // Average the centers of the leaf spheres in [first, second).
+        Vec3d center(0.0, 0.0, 0.0);
+        for (size_t leafIdx = leaves.first; leafIdx < leaves.second; ++leafIdx) {
+            const Vec4R& leafSphere = mLeafBoundingSpheres[leafIdx];
+            center[0] += leafSphere[0];
+            center[1] += leafSphere[1];
+            center[2] += leafSphere[2];
+        }
+
+        const int leafCount = int(leaves.second) - int(leaves.first);
+        if (leafCount > 1) center *= float(1.0 / double(leafCount));
+
+        // Largest sum of squared center offset and the leaf sphere's fourth component.
+        double maxDist = 0.0;
+        for (size_t leafIdx = leaves.first; leafIdx < leaves.second; ++leafIdx) {
+            const Vec4R& leafSphere = mLeafBoundingSpheres[leafIdx];
+            const Vec3d leafCenter(leafSphere[0], leafSphere[1], leafSphere[2]);
+            const double dist = (leafCenter - center).lengthSqr() + leafSphere[3];
+            if (dist > maxDist) maxDist = dist;
+        }
+
+        // Center in xyz; the fourth component stores twice that maximum.
+        Vec4R& sphere = mNodeBoundingSpheres[nodeIdx];
+        sphere[0] = center[0];
+        sphere[1] = center[1];
+        sphere[2] = center[2];
+        sphere[3] = maxDist * 2.0; // padded radius
+    }
+}
+
+
+
+////////////////////////////////////////
+
+
+/// @brief Functor that searches a two-level bounding-sphere hierarchy (nodes,
+/// then leaves) for the surface point closest to each instance point.
+template<typename IntLeafT>
+class ClosestPointDist
+{
+public:
+    typedef std::pair<size_t, size_t> IndexRange;
+
+    /// @param instancePoints       query points; each is overwritten with its closest
+    ///                             surface point when @a transformPoints is true
+    /// @param instanceDistances    in: squared search bound per point;
+    ///                             out: square root of the smallest squared distance
+    /// @param surfacePointList     surface points, indexed by the leaf values
+    /// @param leafNodes            leaf nodes whose active values index the points
+    /// @param leafRanges           per node, the [first, second) range of its leaves
+    /// @param leafBoundingSpheres  one sphere per leaf node
+    /// @param nodeBoundingSpheres  one sphere per entry of @a leafRanges
+    /// @param maxNodeLeafs         length of the per-node leaf-distance scratch buffer
+    /// @param transformPoints      see @a instancePoints
+    ClosestPointDist(
+        std::vector<Vec3R>& instancePoints,
+        std::vector<float>& instanceDistances,
+        const PointList& surfacePointList,
+        const std::vector<const IntLeafT*>& leafNodes,
+        const std::vector<IndexRange>& leafRanges,
+        const std::vector<Vec4R>& leafBoundingSpheres,
+        const std::vector<Vec4R>& nodeBoundingSpheres,
+        size_t maxNodeLeafs,
+        bool transformPoints = false);
+
+    /// Process every instance point, via tbb::parallel_for when @a threaded is true.
+    void run(bool threaded = true);
+
+    void operator()(const tbb::blocked_range<size_t>&) const;
+
+private:
+    void evalLeaf(size_t index, const IntLeafT& leaf) const;
+    void evalNode(size_t pointIndex, size_t nodeIndex) const;
+
+    std::vector<Vec3R>& mInstancePoints;
+    std::vector<float>& mInstanceDistances;
+    const PointList& mSurfacePointList;
+    const std::vector<const IntLeafT*>& mLeafNodes;
+    const std::vector<IndexRange>& mLeafRanges;
+    const std::vector<Vec4R>& mLeafBoundingSpheres;
+    const std::vector<Vec4R>& mNodeBoundingSpheres;
+
+    // Scratch state written from the const member functions, hence mutable.
+    // These are value members, so every copy of the functor has its own.
+    mutable std::vector<float> mLeafDistances, mNodeDistances;
+    const bool mTransformPoints;
+    mutable size_t mClosestPointIndex;
+};
+
+template<typename IntLeafT>
+ClosestPointDist<IntLeafT>::ClosestPointDist(
+    std::vector<Vec3R>& instancePoints,
+    std::vector<float>& instanceDistances,
+    const PointList& surfacePointList,
+    const std::vector<const IntLeafT*>& leafNodes,
+    const std::vector<IndexRange>& leafRanges,
+    const std::vector<Vec4R>& leafBoundingSpheres,
+    const std::vector<Vec4R>& nodeBoundingSpheres,
+    size_t maxNodeLeafs,
+    bool transformPoints)
+    : mInstancePoints(instancePoints)
+    , mInstanceDistances(instanceDistances)
+    , mSurfacePointList(surfacePointList)
+    , mLeafNodes(leafNodes)
+    , mLeafRanges(leafRanges)
+    , mLeafBoundingSpheres(leafBoundingSpheres)
+    , mNodeBoundingSpheres(nodeBoundingSpheres)
+    , mLeafDistances(maxNodeLeafs, 0.0)
+    , mNodeDistances(leafRanges.size(), 0.0)
+    , mTransformPoints(transformPoints)
+    , mClosestPointIndex(0)
+{
+}
+
+template<typename IntLeafT>
+void
+ClosestPointDist<IntLeafT>::run(bool threaded)
+{
+    const tbb::blocked_range<size_t> allPoints(0, mInstancePoints.size());
+    if (threaded) tbb::parallel_for(allPoints, *this);
+    else (*this)(allPoints);
+}
+
+/// Test every surface point referenced by @a leaf against instance point @a index,
+/// updating that point's squared distance and the closest-point index on improvement.
+template<typename IntLeafT>
+void
+ClosestPointDist<IntLeafT>::evalLeaf(size_t index, const IntLeafT& leaf) const
+{
+    const Vec3s center = mInstancePoints[index];
+    float& bestDistSqr = mInstanceDistances[index];
+
+    for (typename IntLeafT::ValueOnCIter it = leaf.cbeginValueOn(); it; ++it) {
+        const float distSqr = (mSurfacePointList[it.getValue()] - center).lengthSqr();
+        if (distSqr < bestDistSqr) {
+            bestDistSqr = distSqr;
+            mClosestPointIndex = it.getValue();
+        }
+    }
+}
+
+/// Evaluate the leaves of node @a nodeIndex for instance point @a pointIndex:
+/// the leaf with the smallest sphere distance first, then every other leaf
+/// whose sphere distance is still below the point's current best distance.
+template<typename IntLeafT>
+void
+ClosestPointDist<IntLeafT>::evalNode(size_t pointIndex, size_t nodeIndex) const
+{
+    const IndexRange& leaves = mLeafRanges[nodeIndex];
+    const Vec3R& pos = mInstancePoints[pointIndex];
+
+    float minDist = mInstanceDistances[pointIndex];
+    size_t minDistIdx = 0;
+    bool foundCandidate = false;
+
+    // Cache (squared center distance - sphere[3]) for each leaf of this node.
+    for (size_t i = leaves.first, n = 0; i < leaves.second; ++i, ++n) {
+        const Vec4R& bs = mLeafBoundingSpheres[i];
+        const Vec3R center(bs[0], bs[1], bs[2]);
+
+        const float distToLeaf = float((pos - center).lengthSqr() - bs[3]);
+        mLeafDistances[n] = distToLeaf;
+
+        if (distToLeaf < minDist) {
+            minDist = distToLeaf;
+            minDistIdx = i;
+            foundCandidate = true;
+        }
+    }
+
+    if (!foundCandidate) return;
+
+    evalLeaf(pointIndex, *mLeafNodes[minDistIdx]);
+
+    for (size_t i = leaves.first, n = 0; i < leaves.second; ++i, ++n) {
+        if (i != minDistIdx && mLeafDistances[n] < mInstanceDistances[pointIndex]) {
+            evalLeaf(pointIndex, *mLeafNodes[i]);
+        }
+    }
+}
+
+/// For each instance point in @a range: visit the node with the smallest
+/// sphere distance, then any other node still within the point's best
+/// distance; finally replace the stored squared distance by its square root.
+template<typename IntLeafT>
+void
+ClosestPointDist<IntLeafT>::operator()(const tbb::blocked_range<size_t>& range) const
+{
+    const size_t nodeCount = mNodeDistances.size();
+    for (size_t n = range.begin(); n != range.end(); ++n) {
+        const Vec3R& pos = mInstancePoints[n];
+        float minDist = mInstanceDistances[n];
+        size_t minDistIdx = 0;
+
+        for (size_t i = 0; i < nodeCount; ++i) {
+            const Vec4R& bs = mNodeBoundingSpheres[i];
+            const Vec3R center(bs[0], bs[1], bs[2]);
+            const float distToNode = float((pos - center).lengthSqr() - bs[3]);
+            mNodeDistances[i] = distToNode;
+
+            if (distToNode < minDist) {
+                minDist = distToNode;
+                minDistIdx = i;
+            }
+        }
+
+        evalNode(n, minDistIdx);
+
+        for (size_t i = 0; i < nodeCount; ++i) {
+            if (i != minDistIdx && mNodeDistances[i] < mInstanceDistances[n]) {
+                evalNode(n, i);
+            }
+        }
+
+        mInstanceDistances[n] = std::sqrt(mInstanceDistances[n]);
+        // NOTE(review): mClosestPointIndex is never reset between points, so a point
+        // that no leaf improved reuses the previous index here -- confirm intended.
+        if (mTransformPoints) mInstancePoints[n] = mSurfacePointList[mClosestPointIndex];
+    }
+}
+
+
+/// @brief Reduction functor that, for a given sphere, masks the candidate points
+/// inside it, optionally caps the remaining candidates' distances, and tracks
+/// the unmasked candidate with the largest distance.
+class UpdatePoints
+{
+public:
+    /// All arguments except @a overlapping are stored by reference.
+    UpdatePoints(
+        const Vec4s& sphere,
+        const std::vector<Vec3R>& points,
+        std::vector<float>& distances,
+        std::vector<unsigned char>& mask,
+        bool overlapping);
+
+    /// Largest distance seen so far, and the index of the point it belongs to.
+    float radius() const { return mRadius; }
+    int index() const { return mIndex; }
+
+    /// Process all points, via tbb::parallel_reduce when @a threaded is true.
+    inline void run(bool threaded = true);
+
+    /// @{
+    /// Splitting constructor, range body and join, as used by tbb::parallel_reduce.
+    UpdatePoints(UpdatePoints&, tbb::split);
+    inline void operator()(const tbb::blocked_range<size_t>& range);
+    void join(const UpdatePoints& rhs)
+    {
+        if (mRadius < rhs.mRadius) {
+            mRadius = rhs.mRadius;
+            mIndex = rhs.mIndex;
+        }
+    }
+    /// @}
+
+private:
+    const Vec4s& mSphere;
+    const std::vector<Vec3R>& mPoints;
+    std::vector<float>& mDistances;
+    std::vector<unsigned char>& mMask;
+    bool mOverlapping;
+    float mRadius;
+    int mIndex;
+};
+
+inline
+UpdatePoints::UpdatePoints(
+    const Vec4s& sphere,
+    const std::vector<Vec3R>& points,
+    std::vector<float>& distances,
+    std::vector<unsigned char>& mask,
+    bool overlapping)
+    : mSphere(sphere), mPoints(points), mDistances(distances), mMask(mask)
+    , mOverlapping(overlapping), mRadius(0.0), mIndex(0)
+{
+}
+
+inline
+UpdatePoints::UpdatePoints(UpdatePoints& rhs, tbb::split)
+    : mSphere(rhs.mSphere), mPoints(rhs.mPoints)
+    , mDistances(rhs.mDistances), mMask(rhs.mMask)
+    , mOverlapping(rhs.mOverlapping)
+    , mRadius(rhs.mRadius), mIndex(rhs.mIndex) // start from rhs's current best
+{
+}
+
+inline void
+UpdatePoints::run(bool threaded)
+{
+    const tbb::blocked_range<size_t> allPoints(0, mPoints.size());
+    if (!threaded) {
+        (*this)(allPoints);
+        return;
+    }
+    tbb::parallel_reduce(allPoints, *this);
+}
+
+inline void
+UpdatePoints::operator()(const tbb::blocked_range<size_t>& range)
+{
+    for (size_t n = range.begin(); n != range.end(); ++n) {
+        if (mMask[n]) continue;
+
+        // Offset from the sphere center to the candidate, in single precision.
+        const Vec3s offset(
+            float(mPoints[n].x()) - mSphere[0],
+            float(mPoints[n].y()) - mSphere[1],
+            float(mPoints[n].z()) - mSphere[2]);
+        const float dist = offset.length();
+
+        // Candidates inside the sphere are masked and take no further part.
+        if (dist < mSphere[3]) {
+            mMask[n] = 1;
+            continue;
+        }
+
+        // Without overlap, cap the distance by the gap to this sphere's surface.
+        if (!mOverlapping) {
+            mDistances[n] = std::min(mDistances[n], (dist - mSphere[3]));
+        }
+
+        if (mDistances[n] > mRadius) {
+            mRadius = mDistances[n];
+            mIndex = int(n);
+        }
+    }
+}
+
+
+} // namespace internal
+
+
+////////////////////////////////////////
+
+
+/// @brief Interrupter-aware fillWithSpheres() implementation: scatters candidate
+/// centers in the eroded interior (topping up from the interior narrow band),
+/// queries each candidate's distance to the surface, then repeatedly emits the
+/// candidate with the largest remaining distance as a sphere.
+/// @note A non-positive @a maxSphereCount yields an empty @a spheres list.
+template<typename GridT, typename InterrupterT>
+inline void
+fillWithSpheres(
+    const GridT& grid,
+    std::vector<openvdb::Vec4s>& spheres,
+    int maxSphereCount,
+    bool overlapping,
+    float minRadius,
+    float maxRadius,
+    float isovalue,
+    int instanceCount,
+    InterrupterT* interrupter)
+{
+    spheres.clear();
+    // A negative count would otherwise be converted to a huge size_t by
+    // reserve() and by the loop bound below; zero needs no work at all.
+    if (maxSphereCount <= 0) return;
+    spheres.reserve(size_t(maxSphereCount));
+
+    typedef typename GridT::TreeType TreeT;
+    typedef typename GridT::ValueType ValueT;
+    typedef typename TreeT::template ValueConverter<bool>::Type BoolTreeT;
+    typedef typename TreeT::template ValueConverter<Int16>::Type Int16TreeT;
+    typedef boost::mt11213b RandGen;
+
+    const TreeT& tree = grid.tree();
+    const math::Transform& transform = grid.transform();
+
+    // Grids with fewer than 10000 active voxels scatter only half the candidates.
+    const bool addNBPoints = grid.activeVoxelCount() < 10000;
+    const int instances = std::max(instanceCount, maxSphereCount);
+
+    RandGen mtRand(/*seed=*/0);
+    std::vector<Vec3R> instancePoints;
+
+    { // Scatter candidate sphere centroids (instancePoints)
+        typename Grid<BoolTreeT>::Ptr interiorMaskPtr;
+        if (grid.getGridClass() == GRID_LEVEL_SET) {
+            interiorMaskPtr = sdfInteriorMask(grid, ValueT(isovalue));
+        } else {
+            interiorMaskPtr = typename Grid<BoolTreeT>::Ptr(Grid<BoolTreeT>::create(false));
+            interiorMaskPtr->setTransform(transform.copy());
+            interiorMaskPtr->tree().topologyUnion(tree);
+        }
+
+        if (interrupter && interrupter->wasInterrupted()) return;
+
+        erodeVoxels(interiorMaskPtr->tree(), 1);
+
+        instancePoints.reserve(instances);
+        internal::PointAccessor ptnAcc(instancePoints);
+        UniformPointScatter<internal::PointAccessor, RandGen, InterrupterT> scatter(
+            ptnAcc, Index64(addNBPoints ? (instances / 2) : instances), mtRand, interrupter);
+        scatter(*interiorMaskPtr);
+    }
+
+    if (interrupter && interrupter->wasInterrupted()) return;
+
+    ClosestSurfacePoint<GridT> csp;
+    csp.initialize(grid, isovalue, interrupter);
+
+    // add extra instance points in the interior narrow band.
+    if (instancePoints.size() < size_t(instances)) {
+        const Int16TreeT& signTree = csp.signTree();
+        typename Int16TreeT::LeafNodeType::ValueOnCIter it;
+        typename Int16TreeT::LeafCIter leafIt = signTree.cbeginLeaf();
+
+        for (; leafIt; ++leafIt) {
+            for (it = leafIt->cbeginValueOn(); it; ++it) {
+                const int flags = it.getValue();
+                if (!(0xE00 & flags) && (flags & 0x100)) {
+                    instancePoints.push_back(transform.indexToWorld(it.getCoord()));
+                }
+                if (instancePoints.size() == size_t(instances)) break;
+            }
+            if (instancePoints.size() == size_t(instances)) break;
+        }
+    }
+
+    if (interrupter && interrupter->wasInterrupted()) return;
+
+    std::vector<float> instanceRadius;
+    if (!csp.search(instancePoints, instanceRadius)) return;
+
+    // Seed the selection with the candidate farthest from the surface.
+    std::vector<unsigned char> instanceMask(instancePoints.size(), 0);
+    float largestRadius = 0.0;
+    int largestRadiusIdx = 0;
+
+    for (size_t n = 0, N = instancePoints.size(); n < N; ++n) {
+        if (instanceRadius[n] > largestRadius) {
+            largestRadius = instanceRadius[n];
+            largestRadiusIdx = int(n);
+        }
+    }
+
+    // Radius limits are given in voxel units; convert using the x voxel size.
+    minRadius = float(minRadius * transform.voxelSize()[0]);
+    maxRadius = float(maxRadius * transform.voxelSize()[0]);
+
+    const size_t sphereLimit = std::min(size_t(maxSphereCount), instancePoints.size());
+    for (size_t s = 0; s < sphereLimit; ++s) {
+        if (interrupter && interrupter->wasInterrupted()) return;
+
+        largestRadius = std::min(maxRadius, largestRadius);
+
+        // The first sphere is always emitted, even if it is below minRadius.
+        if (s != 0 && largestRadius < minRadius) break;
+
+        const Vec3R& center = instancePoints[largestRadiusIdx];
+        const Vec4s sphere(float(center.x()), float(center.y()), float(center.z()), largestRadius);
+
+        spheres.push_back(sphere);
+        instanceMask[largestRadiusIdx] = 1;
+
+        // Mask covered candidates and find the next largest remaining distance.
+        internal::UpdatePoints op(
+            sphere, instancePoints, instanceRadius, instanceMask, overlapping);
+        op.run();
+
+        largestRadius = op.radius();
+        largestRadiusIdx = op.index();
+    }
+}
+
+////////////////////////////////////////
+
+
+/// Constructs a not-yet-initialized query object: empty containers, zeroed
+/// counters and default-constructed point list and tree pointers.
+template<typename GridT>
+ClosestSurfacePoint<GridT>::ClosestSurfacePoint()
+    : mIsInitialized(false)
+    , mLeafBoundingSpheres(), mNodeBoundingSpheres()
+    , mLeafRanges(), mLeafNodes()
+    , mSurfacePointList()
+    , mPointListSize(0), mMaxNodeLeafs(0)
+    , mMaxRadiusSqr(0.0)
+    , mIdxTreePt()
+    , mSignTreePt()
+{
+}
+
+/// Interrupter-less overload; forwards with InterrupterT = util::NullInterrupter.
+template<typename GridT> void
+ClosestSurfacePoint<GridT>::initialize(const GridT& grid, float isovalue)
+{   // The member template takes a single argument (InterrupterT), not two.
+    initialize<util::NullInterrupter>(grid, isovalue, NULL);
+}
+
+
+template<typename GridT>
+template<typename InterrupterT> // only interrupter->wasInterrupted() is called; pointer may be NULL
+void
+ClosestSurfacePoint<GridT>::initialize(
+    const GridT& grid, float isovalue, InterrupterT* interrupter)
+{ // Rebuilds the surface point list and the leaf/node bounding-sphere tables read by search().
+    mIsInitialized = false; // remains false if any of the early returns below is taken
+    typedef tree::LeafManager<const TreeT> LeafManagerT;
+    typedef tree::LeafManager<IntTreeT>    IntLeafManagerT;
+    typedef tree::LeafManager<Int16TreeT>  Int16LeafManagerT;
+    typedef typename GridT::ValueType ValueT;
+
+    const TreeT& tree = grid.tree();
+    const math::Transform& transform = grid.transform();
+
+    { // Extract surface point cloud
+
+        {
+            LeafManagerT leafs(tree);
+            internal::SignData<TreeT, LeafManagerT>
+                signDataOp(tree, leafs, ValueT(isovalue));
+
+            signDataOp.run();
+
+            mSignTreePt = signDataOp.signTree();
+            mIdxTreePt = signDataOp.idxTree();
+        }
+
+        if (interrupter && interrupter->wasInterrupted()) return;
+
+        Int16LeafManagerT signLeafs(*mSignTreePt);
+
+        std::vector<size_t> regions(signLeafs.leafCount(), 0);
+        signLeafs.foreach(internal::CountPoints(regions));
+        // Exclusive prefix sum: regions[n] becomes entry n's start offset, mPointListSize the total.
+        mPointListSize = 0;
+        for (size_t tmp = 0, n = 0, N = regions.size(); n < N; ++n) {
+            tmp = regions[n];
+            regions[n] = mPointListSize;
+            mPointListSize += tmp;
+        }
+        // Nothing was counted: bail out and leave the object uninitialized.
+        if (mPointListSize == 0) return;
+
+        mSurfacePointList.reset(new Vec3s[mPointListSize]);
+
+        internal::GenPoints<TreeT, Int16LeafManagerT>
+            pointOp(signLeafs, tree, *mIdxTreePt, mSurfacePointList, regions, transform, isovalue);
+
+        pointOp.run();
+
+        mIdxTreePt->topologyUnion(*mSignTreePt);
+    }
+
+    if (interrupter && interrupter->wasInterrupted()) return;
+
+    // estimate max sphere radius (sqr dist)
+    CoordBBox bbox =  grid.evalActiveVoxelBoundingBox();
+
+    Vec3s dim = transform.indexToWorld(bbox.min()) -
+        transform.indexToWorld(bbox.max());
+
+    dim[0] = std::abs(dim[0]);
+    dim[1] = std::abs(dim[1]);
+    dim[2] = std::abs(dim[2]);
+    // Smallest world-space extent of the bounding box, scaled by 0.51 and then squared.
+    mMaxRadiusSqr = std::min(std::min(dim[0], dim[1]), dim[2]);
+    mMaxRadiusSqr *= 0.51f;
+    mMaxRadiusSqr *= mMaxRadiusSqr;
+
+
+    IntLeafManagerT idxLeafs(*mIdxTreePt);
+
+    // IntInternalNodeT is entry 1 of the root's NodeChainType; the assert requires more than one entry.
+    typedef typename IntTreeT::RootNodeType IntRootNodeT;
+    typedef typename IntRootNodeT::NodeChainType IntNodeChainT;
+    BOOST_STATIC_ASSERT(boost::mpl::size<IntNodeChainT>::value > 1);
+    typedef typename boost::mpl::at<IntNodeChainT, boost::mpl::int_<1> >::type IntInternalNodeT;
+
+    // Pin the node iterator to depth LEAF_DEPTH - 1 and collect every node it yields.
+    typename IntTreeT::NodeCIter nIt = mIdxTreePt->cbeginNode();
+    nIt.setMinDepth(IntTreeT::NodeCIter::LEAF_DEPTH - 1);
+    nIt.setMaxDepth(IntTreeT::NodeCIter::LEAF_DEPTH - 1);
+
+    std::vector<const IntInternalNodeT*> internalNodes;
+
+    const IntInternalNodeT* node = NULL;
+    for (; nIt; ++nIt) {
+        nIt.getNode(node);
+        if (node) internalNodes.push_back(node);
+    }
+    // Swapping with an empty temporary discards any contents left over from a previous call.
+    std::vector<IndexRange>().swap(mLeafRanges);
+    mLeafRanges.resize(internalNodes.size());
+
+    std::vector<const IntLeafT*>().swap(mLeafNodes);
+    mLeafNodes.reserve(idxLeafs.leafCount());
+
+    typename IntInternalNodeT::ChildOnCIter leafIt;
+    mMaxNodeLeafs = 0;
+    for (size_t n = 0, N = internalNodes.size(); n < N; ++n) {
+        // [first, second) is the slice of mLeafNodes holding internal node n's child leaves.
+        mLeafRanges[n].first = mLeafNodes.size();
+
+        size_t leafCount = 0;
+        for (leafIt = internalNodes[n]->cbeginChildOn(); leafIt; ++leafIt) {
+            mLeafNodes.push_back(&(*leafIt));
+            ++leafCount;
+        }
+
+        mMaxNodeLeafs = std::max(leafCount, mMaxNodeLeafs);
+
+        mLeafRanges[n].second = mLeafNodes.size();
+    }
+
+    std::vector<Vec4R>().swap(mLeafBoundingSpheres);
+    mLeafBoundingSpheres.resize(mLeafNodes.size());
+
+    internal::LeafBS<IntLeafT> leafBS(
+        mLeafBoundingSpheres, mLeafNodes, transform, mSurfacePointList);
+    leafBS.run();
+
+    // NodeBS is given the leaf ranges and leaf spheres; presumably it merges them per internal node.
+    std::vector<Vec4R>().swap(mNodeBoundingSpheres);
+    mNodeBoundingSpheres.resize(internalNodes.size());
+
+    internal::NodeBS nodeBS(mNodeBoundingSpheres, mLeafRanges, mLeafBoundingSpheres);
+    nodeBS.run();
+    mIsInitialized = true;
+}
+
+
+template<typename GridT>
+bool
+ClosestSurfacePoint<GridT>::search(std::vector<Vec3R>& points,
+    std::vector<float>& distances, bool transformPoints)
+{
+    // Nothing can be queried until initialize() has run to completion.
+    if (!mIsInitialized) return false;
+    // One result per query point, each seeded with the radius estimate from initialize().
+    distances.assign(points.size(), mMaxRadiusSqr);
+
+    internal::ClosestPointDist<IntLeafT> closestPointOp(
+        points, distances, mSurfacePointList,
+        mLeafNodes, mLeafRanges, mLeafBoundingSpheres, mNodeBoundingSpheres,
+        mMaxNodeLeafs, transformPoints);
+
+    closestPointOp.run();
+    return true;
+}
+
+
+template<typename GridT>
+bool ClosestSurfacePoint<GridT>::search(
+    const std::vector<Vec3R>& points, std::vector<float>& distances)
+{   // Const-input overload: forwards to the three-argument search() with transformPoints off.
+    return search(const_cast<std::vector<Vec3R>& >(points), distances, /*transformPoints=*/false);
+}
+
+
+template<typename GridT>
+bool ClosestSurfacePoint<GridT>::searchAndReplace(
+    std::vector<Vec3R>& points, std::vector<float>& distances)
+{
+    // Same query as the three-argument search(), run with transformPoints switched on.
+    return search(points, distances, /*transformPoints=*/true);
+}
+
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TOOLS_VOLUME_TO_MESH_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tree/InternalNode.h b/nuparu/include/openvdb_new/tree/InternalNode.h
new file mode 100644
index 00000000..44fd6295
--- /dev/null
+++ b/nuparu/include/openvdb_new/tree/InternalNode.h
@@ -0,0 +1,3201 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file InternalNode.h
+///
+/// @brief Internal table nodes for OpenVDB trees
+///
+/// @todo Multi-thread topologyDifference
+
+#ifndef OPENVDB_TREE_INTERNALNODE_HAS_BEEN_INCLUDED
+#define OPENVDB_TREE_INTERNALNODE_HAS_BEEN_INCLUDED
+
+#include <boost/shared_array.hpp>
+#include <boost/static_assert.hpp>
+#include <boost/mpl/if.hpp>
+#include <boost/type_traits/is_const.hpp>
+#include <boost/type_traits/is_pointer.hpp>
+#include <boost/type_traits/remove_pointer.hpp>
+#include <tbb/parallel_for.h>
+#include <openvdb/Platform.h>
+#include <openvdb/util/NodeMasks.h>
+#include <openvdb/io/Compression.h> // for io::readData(), etc.
+#include <openvdb/math/Math.h> // for Abs(), isExactlyEqual()
+#include <openvdb/version.h>
+#include <openvdb/Types.h>
+#include "Iterator.h"
+#include "NodeUnion.h"
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tree {
+
+template<typename, Index, typename> struct SameInternalConfig; // forward declaration
+
+
+template<typename _ChildNodeType, Index Log2Dim>
+class InternalNode
+{
+public:
+    typedef _ChildNodeType                        ChildNodeType;
+    typedef typename ChildNodeType::LeafNodeType  LeafNodeType;
+    typedef typename ChildNodeType::ValueType     ValueType;
+    typedef typename ChildNodeType::BuildType     BuildType;
+    typedef NodeUnion<ValueType, ChildNodeType>   UnionType;
+    typedef util::NodeMask<Log2Dim>               NodeMaskType;
+
+    static const Index
+        LOG2DIM      = Log2Dim,
+        TOTAL        = Log2Dim + ChildNodeType::TOTAL,
+        DIM          = 1 << TOTAL,
+        NUM_VALUES   = 1 << (3 * Log2Dim),
+        LEVEL        = 1 + ChildNodeType::LEVEL; // level 0 = leaf
+    static const Index64
+        NUM_VOXELS   = uint64_t(1) << (3 * TOTAL); // total # of voxels represented by this node
+
+    /// @brief ValueConverter<T>::Type is the type of an InternalNode having the same
+    /// child hierarchy and dimensions as this node but a different value type, T.
+    template<typename OtherValueType>
+    struct ValueConverter {
+        typedef InternalNode<typename ChildNodeType::template ValueConverter<
+            OtherValueType>::Type, Log2Dim> Type;
+    };
+
+    /// @brief SameConfiguration<OtherNodeType>::value is @c true if and only if OtherNodeType
+    /// is the type of an InternalNode with the same dimensions as this node and whose
+    /// ChildNodeType has the same configuration as this node's ChildNodeType.
+    template<typename OtherNodeType>
+    struct SameConfiguration {
+        static const bool value =
+            SameInternalConfig<ChildNodeType, Log2Dim, OtherNodeType>::value;
+    };
+
+
+    InternalNode() {}
+
+    explicit InternalNode(const ValueType& offValue);
+
+    InternalNode(const Coord&, const ValueType& fillValue, bool active = false);
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    InternalNode(PartialCreate, const Coord&, const ValueType& fillValue, bool active = false);
+#endif
+
+    /// @brief Deep copy constructor
+    ///
+    /// @note This method is multi-threaded!
+    InternalNode(const InternalNode&);
+
+    /// @brief Value conversion copy constructor
+    ///
+    /// @note This method is multi-threaded!
+    template<typename OtherChildNodeType>
+    explicit InternalNode(const InternalNode<OtherChildNodeType, Log2Dim>& other);
+
+    /// @brief Topology copy constructor
+    ///
+    /// @note This method is multi-threaded!
+    template<typename OtherChildNodeType>
+    InternalNode(const InternalNode<OtherChildNodeType, Log2Dim>& other,
+                 const ValueType& background, TopologyCopy);
+
+    /// @brief Topology copy constructor
+    ///
+    /// @note This method is multi-threaded!
+    template<typename OtherChildNodeType>
+    InternalNode(const InternalNode<OtherChildNodeType, Log2Dim>& other,
+                 const ValueType& offValue, const ValueType& onValue, TopologyCopy);
+
+    virtual ~InternalNode();
+
+protected:
+    typedef typename NodeMaskType::OnIterator    MaskOnIterator;
+    typedef typename NodeMaskType::OffIterator   MaskOffIterator;
+    typedef typename NodeMaskType::DenseIterator MaskDenseIterator;
+
+    // Type tags to disambiguate template instantiations
+    struct ValueOn {}; struct ValueOff {}; struct ValueAll {};
+    struct ChildOn {}; struct ChildOff {}; struct ChildAll {};
+
+    // The following class templates implement the iterator interfaces specified in Iterator.h
+    // by providing getItem(), setItem() and/or modifyItem() methods.
+
+    template<typename NodeT, typename ChildT, typename MaskIterT, typename TagT>
+    struct ChildIter: public SparseIteratorBase<
+        MaskIterT, ChildIter<NodeT, ChildT, MaskIterT, TagT>, NodeT, ChildT>
+    { // Iterator whose items are child nodes of the parent; TagT is not referenced in the body.
+        ChildIter() {}
+        ChildIter(const MaskIterT& iter, NodeT* parent): SparseIteratorBase<
+            MaskIterT, ChildIter<NodeT, ChildT, MaskIterT, TagT>, NodeT, ChildT>(iter, parent) {}
+        // Return the child at table offset pos; the entry is asserted to have its child-mask bit on.
+        ChildT& getItem(Index pos) const
+        {
+            assert(this->parent().isChildMaskOn(pos));
+            return *(this->parent().getChildNode(pos));
+        }
+
+        // Note: setItem() can't be called on const iterators.
+        void setItem(Index pos, const ChildT& c) const { this->parent().resetChildNode(pos, &c); }
+
+        // Note: modifyItem() isn't implemented, since it's not useful for child node pointers.
+    };// ChildIter
+
+    template<typename NodeT, typename ValueT, typename MaskIterT, typename TagT>
+    struct ValueIter: public SparseIteratorBase<
+        MaskIterT, ValueIter<NodeT, ValueT, MaskIterT, TagT>, NodeT, ValueT>
+    { // Iterator whose items are values held in the parent's mNodes table.
+        ValueIter() {}
+        ValueIter(const MaskIterT& iter, NodeT* parent): SparseIteratorBase<
+            MaskIterT, ValueIter<NodeT, ValueT, MaskIterT, TagT>, NodeT, ValueT>(iter, parent) {}
+        // Read the value stored at table offset pos.
+        const ValueT& getItem(Index pos) const { return this->parent().mNodes[pos].getValue(); }
+
+        // Note: setItem() can't be called on const iterators.
+        void setItem(Index pos, const ValueT& v) const { this->parent().mNodes[pos].setValue(v); }
+
+        // Note: modifyItem() can't be called on const iterators.
+        template<typename ModifyOp>
+        void modifyItem(Index pos, const ModifyOp& op) const
+        {
+            op(this->parent().mNodes[pos].getValue()); // op is invoked on the stored value at pos
+        }
+    };// ValueIter
+
+    template<typename NodeT, typename ChildT, typename ValueT, typename TagT>
+    struct DenseIter: public DenseIteratorBase<
+        MaskDenseIterator, DenseIter<NodeT, ChildT, ValueT, TagT>, NodeT, ChildT, ValueT>
+    { // Iterator that reports, for a table offset, either a child pointer or a stored value.
+        typedef DenseIteratorBase<MaskDenseIterator, DenseIter, NodeT, ChildT, ValueT> BaseT;
+        typedef typename BaseT::NonConstValueType NonConstValueT;
+
+        DenseIter() {}
+        DenseIter(const MaskDenseIterator& iter, NodeT* parent):
+            DenseIteratorBase<MaskDenseIterator, DenseIter, NodeT, ChildT, ValueT>(iter, parent) {}
+        // Child entry: set child, return true. Otherwise: null child, copy the value, return false.
+        bool getItem(Index pos, ChildT*& child, NonConstValueT& value) const
+        {
+            if (this->parent().isChildMaskOn(pos)) {
+                child = this->parent().getChildNode(pos);
+                return true; // value is left untouched on this path
+            }
+            child = NULL;
+            value = this->parent().mNodes[pos].getValue();
+            return false;
+        }
+
+        // Note: setItem() can't be called on const iterators.
+        void setItem(Index pos, ChildT* child) const
+        {
+            this->parent().resetChildNode(pos, child);
+        }
+
+        // Note: unsetItem() can't be called on const iterators.
+        void unsetItem(Index pos, const ValueT& value) const
+        {
+            this->parent().unsetChildNode(pos, value);
+        }
+    };// DenseIter
+
+public:
+    // Iterators (see Iterator.h for usage)
+    typedef ChildIter<InternalNode, ChildNodeType, MaskOnIterator, ChildOn>          ChildOnIter;
+    typedef ChildIter<const InternalNode,const ChildNodeType,MaskOnIterator,ChildOn> ChildOnCIter;
+    typedef ValueIter<InternalNode, const ValueType, MaskOffIterator, ChildOff>      ChildOffIter;
+    typedef ValueIter<const InternalNode,const ValueType,MaskOffIterator,ChildOff>   ChildOffCIter;
+    typedef DenseIter<InternalNode, ChildNodeType, ValueType, ChildAll>              ChildAllIter;
+    typedef DenseIter<const InternalNode,const ChildNodeType, ValueType, ChildAll>   ChildAllCIter;
+
+    typedef ValueIter<InternalNode, const ValueType, MaskOnIterator, ValueOn>        ValueOnIter;
+    typedef ValueIter<const InternalNode,const ValueType,MaskOnIterator,ValueOn>     ValueOnCIter;
+    typedef ValueIter<InternalNode, const ValueType, MaskOffIterator, ValueOff>      ValueOffIter;
+    typedef ValueIter<const InternalNode,const ValueType,MaskOffIterator,ValueOff>   ValueOffCIter;
+    typedef ValueIter<InternalNode, const ValueType, MaskOffIterator, ValueAll>      ValueAllIter;
+    typedef ValueIter<const InternalNode,const ValueType,MaskOffIterator,ValueAll>   ValueAllCIter;
+
+    ChildOnCIter  cbeginChildOn()  const { return ChildOnCIter(mChildMask.beginOn(), this); }
+    ChildOffCIter cbeginChildOff() const { return ChildOffCIter(mChildMask.beginOff(), this); }
+    ChildAllCIter cbeginChildAll() const { return ChildAllCIter(mChildMask.beginDense(), this); }
+    ChildOnCIter   beginChildOn()  const { return cbeginChildOn(); }
+    ChildOffCIter  beginChildOff() const { return cbeginChildOff(); }
+    ChildAllCIter  beginChildAll() const { return cbeginChildAll(); }
+    ChildOnIter    beginChildOn()  { return ChildOnIter(mChildMask.beginOn(), this); }
+    ChildOffIter   beginChildOff() { return ChildOffIter(mChildMask.beginOff(), this); }
+    ChildAllIter   beginChildAll() { return ChildAllIter(mChildMask.beginDense(), this); }
+
+    ValueOnCIter  cbeginValueOn()  const { return ValueOnCIter(mValueMask.beginOn(), this); }
+    /// @warning This iterator will also visit child nodes so use isChildMaskOn to skip them!
+    ValueOffCIter cbeginValueOff() const { return ValueOffCIter(mValueMask.beginOff(), this); }
+    ValueAllCIter cbeginValueAll() const { return ValueAllCIter(mChildMask.beginOff(), this); }
+    ValueOnCIter   beginValueOn()  const { return cbeginValueOn(); }
+    /// @warning This iterator will also visit child nodes so use isChildMaskOn to skip them!
+    ValueOffCIter  beginValueOff() const { return cbeginValueOff(); }
+    ValueAllCIter  beginValueAll() const { return cbeginValueAll(); }
+    ValueOnIter    beginValueOn()  { return ValueOnIter(mValueMask.beginOn(), this); }
+    /// @warning This iterator will also visit child nodes so use isChildMaskOn to skip them!
+    ValueOffIter   beginValueOff() { return ValueOffIter(mValueMask.beginOff(), this); }
+    ValueAllIter   beginValueAll() { return ValueAllIter(mChildMask.beginOff(), this); }
+
+
+    static Index dim() { return DIM; }
+    static Index getLevel() { return LEVEL; }
+    static void getNodeLog2Dims(std::vector<Index>& dims);
+    static Index getChildDim() { return ChildNodeType::DIM; }
+
+    /// Return the linear table offset of the given global or local coordinates.
+    static Index coordToOffset(const Coord& xyz);
+    /// @brief Return the local coordinates for a linear table offset,
+    /// where offset 0 has coordinates (0, 0, 0).
+    static void offsetToLocalCoord(Index n, Coord& xyz);
+    /// Return the global coordinates for a linear table offset.
+    Coord offsetToGlobalCoord(Index n) const;
+
+    /// Return the grid index coordinates of this node's local origin.
+    const Coord& origin() const { return mOrigin; }
+    /// Set the grid index coordinates of this node's local origin.
+    void setOrigin(const Coord& origin) { mOrigin = origin; }
+
+    Index32 leafCount() const;
+    Index32 nonLeafCount() const;
+    Index64 onVoxelCount() const;
+    Index64 offVoxelCount() const;
+    Index64 onLeafVoxelCount() const;
+    Index64 offLeafVoxelCount() const;
+    Index64 onTileCount() const;
+
+    /// Return the total amount of memory in bytes occupied by this node and its children.
+    Index64 memUsage() const;
+
+    /// @brief Expand the specified bounding box so that it includes the active tiles
+    /// of this internal node as well as all the active values in its child nodes.
+    /// If visitVoxels is false LeafNodes will be approximated as dense, i.e. with all
+    /// voxels active. Else the individual active voxels are visited to produce a tight bbox.
+    void evalActiveBoundingBox(CoordBBox& bbox, bool visitVoxels = true) const;
+
+    /// @brief Return the bounding box of this node, i.e., the full index space
+    /// spanned by the node regardless of its content.
+    CoordBBox getNodeBoundingBox() const { return CoordBBox::createCube(mOrigin, DIM); }
+
+    bool isEmpty() const { return mChildMask.isOff(); }
+
+    /// Return @c true if all of this node's table entries have the same active state
+    /// and the same constant value to within the given tolerance,
+    /// and return that value in @a constValue and the active state in @a state.
+    ///
+    /// @note This method also returns @c false if this node contains any child nodes. 
+    bool isConstant(ValueType& constValue, bool& state,
+                    const ValueType& tolerance = zeroVal<ValueType>()) const;
+
+    /// Return @c true if all of this node's table entries have
+    /// the same active @a state and the values are in the range
+    /// (@a maxValue + @a minValue)/2 +/- @a tolerance.
+    ///
+    /// @param minValue  Is updated with the minimum of all values IF method
+    ///                  returns @c true. Else the value is undefined!
+    /// @param maxValue  Is updated with the maximum of all values IF method
+    ///                  returns @c true. Else the value is undefined!
+    /// @param state     Is updated with the state of all values IF method
+    ///                  returns @c true. Else the value is undefined!
+    /// @param tolerance The tolerance used to determine if values are
+    ///                  approximately constant.
+    ///
+    /// @note This method also returns @c false if this node contains any child nodes.
+    bool isConstant(ValueType& minValue, ValueType& maxValue,
+                    bool& state, const ValueType& tolerance = zeroVal<ValueType>()) const;
+    
+    /// Return @c true if this node has no children and only contains inactive values.
+    bool isInactive() const { return this->isChildMaskOff() && this->isValueMaskOff(); }
+
+    /// Return @c true if the voxel at the given coordinates is active.
+    bool isValueOn(const Coord& xyz) const;
+    /// Return @c true if the voxel at the given offset is active.
+    bool isValueOn(Index offset) const { return mValueMask.isOn(offset); }
+
+    /// Return @c true if this node or any of its child nodes have any active tiles.
+    bool hasActiveTiles() const;
+
+    const ValueType& getValue(const Coord& xyz) const;
+    bool probeValue(const Coord& xyz, ValueType& value) const;
+
+    /// @brief Return the level of the tree (0 = leaf) at which the value
+    /// at the given coordinates resides.
+    Index getValueLevel(const Coord& xyz) const;
+
+    /// @brief If the first entry in this node's table is a tile, return the tile's value.
+    /// Otherwise, return the result of calling getFirstValue() on the child.
+    const ValueType& getFirstValue() const;
+    /// @brief If the last entry in this node's table is a tile, return the tile's value.
+    /// Otherwise, return the result of calling getLastValue() on the child.
+    const ValueType& getLastValue() const;
+
+    /// Set the active state of the voxel at the given coordinates but don't change its value.
+    void setActiveState(const Coord& xyz, bool on);
+    /// Set the value of the voxel at the given coordinates but don't change its active state.
+    void setValueOnly(const Coord& xyz, const ValueType& value);
+    /// Mark the voxel at the given coordinates as active but don't change its value.
+    void setValueOn(const Coord& xyz);
+    /// Set the value of the voxel at the given coordinates and mark the voxel as active.
+    void setValueOn(const Coord& xyz, const ValueType& value);
+    /// Mark the voxel at the given coordinates as inactive but don't change its value.
+    void setValueOff(const Coord& xyz);
+    /// Set the value of the voxel at the given coordinates and mark the voxel as inactive.
+    void setValueOff(const Coord& xyz, const ValueType& value);
+
+    /// @brief Apply a functor to the value of the voxel at the given coordinates
+    /// and mark the voxel as active.
+    template<typename ModifyOp>
+    void modifyValue(const Coord& xyz, const ModifyOp& op);
+    /// Apply a functor to the voxel at the given coordinates.
+    template<typename ModifyOp>
+    void modifyValueAndActiveState(const Coord& xyz, const ModifyOp& op);
+
+    /// Return the value of the voxel at the given coordinates and, if necessary, update
+    /// the accessor with pointers to the nodes along the path from the root node to
+    /// the node containing the voxel.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    const ValueType& getValueAndCache(const Coord& xyz, AccessorT&) const;
+
+    /// Return @c true if the voxel at the given coordinates is active and, if necessary,
+    /// update the accessor with pointers to the nodes along the path from the root node
+    /// to the node containing the voxel.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    bool isValueOnAndCache(const Coord& xyz, AccessorT&) const;
+
+    /// Change the value of the voxel at the given coordinates and mark it as active.
+    /// If necessary, update the accessor with pointers to the nodes along the path
+    /// from the root node to the node containing the voxel.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    void setValueAndCache(const Coord& xyz, const ValueType& value, AccessorT&);
+
+    /// Set the value of the voxel at the given coordinate but preserves its active state.
+    /// If necessary, update the accessor with pointers to the nodes along the path
+    /// from the root node to the node containing the voxel.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    void setValueOnlyAndCache(const Coord& xyz, const ValueType& value, AccessorT&);
+
+    /// @brief Apply a functor to the value of the voxel at the given coordinates
+    /// and mark the voxel as active.
+    /// If necessary, update the accessor with pointers to the nodes along the path
+    /// from the root node to the node containing the voxel.
+    /// @note Used internally by ValueAccessor.
+    template<typename ModifyOp, typename AccessorT>
+    void modifyValueAndCache(const Coord& xyz, const ModifyOp& op, AccessorT&);
+
+    /// Apply a functor to the voxel at the given coordinates.
+    /// If necessary, update the accessor with pointers to the nodes along the path
+    /// from the root node to the node containing the voxel.
+    /// @note Used internally by ValueAccessor.
+    template<typename ModifyOp, typename AccessorT>
+    void modifyValueAndActiveStateAndCache(const Coord& xyz, const ModifyOp& op, AccessorT&);
+
+    /// Change the value of the voxel at the given coordinates and mark it as inactive.
+    /// If necessary, update the accessor with pointers to the nodes along the path
+    /// from the root node to the node containing the voxel.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    void setValueOffAndCache(const Coord& xyz, const ValueType& value, AccessorT&);
+
+    /// Set the active state of the voxel at the given coordinates without changing its value.
+    /// If necessary, update the accessor with pointers to the nodes along the path
+    /// from the root node to the node containing the voxel.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    void setActiveStateAndCache(const Coord& xyz, bool on, AccessorT&);
+
+    /// Return, in @a value, the value of the voxel at the given coordinates and,
+    /// if necessary, update the accessor with pointers to the nodes along
+    /// the path from the root node to the node containing the voxel.
+    /// @return @c true if the voxel at the given coordinates is active
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    bool probeValueAndCache(const Coord& xyz, ValueType& value, AccessorT&) const;
+
+    /// @brief Return the level of the tree (0 = leaf) at which the value
+    /// at the given coordinates resides.
+    ///
+    /// If necessary, update the accessor with pointers to the nodes along the path
+    /// from the root node to the node containing the voxel.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    Index getValueLevelAndCache(const Coord& xyz, AccessorT&) const;
+
+    /// Mark all values (both tiles and voxels) as active.
+    void setValuesOn();
+
+    //
+    // I/O
+    //
+    void writeTopology(std::ostream&, bool toHalf = false) const;
+    void readTopology(std::istream&, bool fromHalf = false);
+    void writeBuffers(std::ostream&, bool toHalf = false) const;
+    void readBuffers(std::istream&, bool fromHalf = false);
+    void readBuffers(std::istream&, const CoordBBox&, bool fromHalf = false);
+
+
+    //
+    // Aux methods
+    //
+    /// @brief Set all voxels within an axis-aligned box to a constant value.
+    /// (The min and max coordinates are inclusive.)
+    void fill(const CoordBBox& bbox, const ValueType&, bool active = true);
+
+    /// Change the sign of all the values represented in this node and
+    /// its child nodes.
+    void negate();
+
+    /// @brief Densify active tiles, i.e., replace them with leaf-level active voxels.
+    /// @param threaded if true, this operation is multi-threaded (over the internal nodes).
+    void voxelizeActiveTiles(bool threaded = true);
+
+    /// @brief Copy into a dense grid the values of the voxels that lie within
+    /// a given bounding box.
+    /// @param bbox   inclusive bounding box of the voxels to be copied into the dense grid
+    /// @param dense  dense grid with a stride in @e z of one (see tools::Dense
+    ///               in tools/Dense.h for the required API)
+    /// @note @a bbox is assumed to be identical to or contained in the coordinate domains
+    /// of both the dense grid and this node, i.e., no bounds checking is performed.
+    template<typename DenseT>
+    void copyToDense(const CoordBBox& bbox, DenseT& dense) const;
+
+    /// @brief Efficiently merge another tree into this tree using one of several schemes.
+    /// @warning This operation cannibalizes the other tree.
+    template<MergePolicy Policy>
+    void merge(InternalNode& other, const ValueType& background, const ValueType& otherBackground);
+
+    /// @brief Merge, using one of several schemes, this node (and its descendants)
+    /// with a tile of the same dimensions and the given value and active state.
+    template<MergePolicy Policy> void merge(const ValueType& tileValue, bool tileActive);
+
+    /// @brief Union this branch's set of active values with the other branch's
+    /// active values.  The value type of the other branch can be different.
+    /// @details The resulting state of a value is active if the corresponding value
+    /// was already active OR if it is active in the other tree.  Also, a resulting
+    /// value maps to a voxel if the corresponding value already mapped to a voxel
+    /// OR if it is a voxel in the other tree.  Thus, a resulting value can only
+    /// map to a tile if the corresponding value already mapped to a tile
+    /// AND if it is a tile value in other tree.
+    ///
+    /// Specifically, active tiles and voxels in this branch are not changed, and
+    /// tiles or voxels that were inactive in this branch but active in the other branch
+    /// are marked as active in this branch but left with their original values.
+    template<typename OtherChildNodeType>
+    void topologyUnion(const InternalNode<OtherChildNodeType, Log2Dim>& other);
+
+    /// @brief Intersects this tree's set of active values with the active values
+    /// of the other tree, whose @c ValueType may be different.
+    /// @details The resulting state of a value is active only if the corresponding
+    /// value was already active AND if it is active in the other tree. Also, a
+    /// resulting value maps to a voxel if the corresponding value
+    /// already mapped to an active voxel in either of the two grids
+    /// and it maps to an active tile or voxel in the other grid.
+    ///
+    /// @note This operation can delete branches in this grid if they
+    /// overlap with inactive tiles in the other grid. Likewise active
+    /// voxels can be turned into inactive voxels resulting in leaf
+    /// nodes with no active values. Thus, it is recommended to
+    /// subsequently call prune.
+    template<typename OtherChildNodeType>
+    void topologyIntersection(const InternalNode<OtherChildNodeType, Log2Dim>& other,
+                              const ValueType& background);
+
+    /// @brief Difference this node's set of active values with the active values
+    /// of the other node, whose @c ValueType may be different. So a
+    /// resulting voxel will be active only if the original voxel is
+    /// active in this node and inactive in the other node.
+    ///
+    /// @details The last dummy argument is required to match the signature
+    /// for InternalNode::topologyDifference.
+    ///
+    /// @note This operation modifies only active states, not
+    /// values. Also note that this operation can result in all voxels
+    /// being inactive, so consider subsequently calling prune.
+    template<typename OtherChildNodeType>
+    void topologyDifference(const InternalNode<OtherChildNodeType, Log2Dim>& other,
+                            const ValueType& background);
+
+    template<typename CombineOp>
+    void combine(InternalNode& other, CombineOp&);
+    template<typename CombineOp>
+    void combine(const ValueType& value, bool valueIsActive, CombineOp&);
+
+    template<typename CombineOp, typename OtherNodeType /*= InternalNode*/>
+    void combine2(const InternalNode& other0, const OtherNodeType& other1, CombineOp&);
+    template<typename CombineOp, typename OtherNodeType /*= InternalNode*/>
+    void combine2(const ValueType& value, const OtherNodeType& other, bool valIsActive, CombineOp&);
+    template<typename CombineOp, typename OtherValueType>
+    void combine2(const InternalNode& other, const OtherValueType&, bool valIsActive, CombineOp&);
+
+    /// @brief Calls the templated functor BBoxOp with bounding box
+    /// information for all active tiles and leaf nodes in this node.
+    /// An additional level argument is provided for each callback.
+    ///
+    /// @note The bounding boxes are guaranteed to be non-overlapping.
+    template<typename BBoxOp> void visitActiveBBox(BBoxOp&) const;
+
+    template<typename VisitorOp> void visit(VisitorOp&);
+    template<typename VisitorOp> void visit(VisitorOp&) const;
+
+    template<typename OtherNodeType, typename VisitorOp>
+    void visit2Node(OtherNodeType& other, VisitorOp&);
+    template<typename OtherNodeType, typename VisitorOp>
+    void visit2Node(OtherNodeType& other, VisitorOp&) const;
+    template<typename IterT, typename VisitorOp>
+    void visit2(IterT& otherIter, VisitorOp&, bool otherIsLHS = false);
+    template<typename IterT, typename VisitorOp>
+    void visit2(IterT& otherIter, VisitorOp&, bool otherIsLHS = false) const;
+
+    /// Set all voxels that lie outside the given axis-aligned box to the background.
+    void clip(const CoordBBox&, const ValueType& background);
+
+    /// @brief Reduce the memory footprint of this tree by replacing with tiles
+    /// any nodes whose values are all the same (optionally to within a tolerance)
+    /// and have the same active state.
+    void prune(const ValueType& tolerance = zeroVal<ValueType>());
+
+    /// @brief Add the specified leaf to this node, possibly creating a child branch
+    /// in the process.  If the leaf node already exists, replace it.
+    void addLeaf(LeafNodeType* leaf);
+
+    /// @brief Same as addLeaf() except, if necessary, update the accessor with pointers
+    /// to the nodes along the path from the root node to the node containing the coordinate.
+    template<typename AccessorT>
+    void addLeafAndCache(LeafNodeType* leaf, AccessorT&);
+
+    /// @brief Return a pointer to the node of type @c NodeT that contains voxel (x, y, z)
+    /// and replace it with a tile of the specified value and state.
+    /// If no such node exists, leave the tree unchanged and return @c NULL.
+    ///
+    /// @note The caller takes ownership of the node and is responsible for deleting it.
+    ///
+    /// @warning Since this method potentially removes nodes and branches of the tree,
+    /// it is important to clear the caches of all ValueAccessors associated with this tree.
+    template<typename NodeT>
+    NodeT* stealNode(const Coord& xyz, const ValueType& value, bool state);
+
+    /// @brief Add a tile at the specified tree level that contains voxel (x, y, z),
+    /// possibly creating a parent branch or deleting a child branch in the process.
+    void addTile(Index level, const Coord& xyz, const ValueType& value, bool state);
+
+    /// @brief Delete any existing child branch at the specified offset and add a tile.
+    void addTile(Index offset, const ValueType& value, bool state);
+
+    /// @brief Same as addTile() except, if necessary, update the accessor with pointers
+    /// to the nodes along the path from the root node to the node containing (x, y, z).
+    template<typename AccessorT>
+    void addTileAndCache(Index level, const Coord& xyz, const ValueType&, bool state, AccessorT&);
+
+    //@{
+    /// @brief Return a pointer to the node that contains voxel (x, y, z).
+    /// If no such node exists, return NULL.
+    template<typename NodeType> NodeType* probeNode(const Coord& xyz);
+    template<typename NodeType> const NodeType* probeConstNode(const Coord& xyz) const;
+    //@}
+
+    //@{
+    /// @brief Same as probeNode() except, if necessary, update the accessor with pointers
+    /// to the nodes along the path from the root node to the node containing (x, y, z).
+    template<typename NodeType, typename AccessorT>
+    NodeType* probeNodeAndCache(const Coord& xyz, AccessorT&);
+    template<typename NodeType, typename AccessorT>
+    const NodeType* probeConstNodeAndCache(const Coord& xyz, AccessorT&) const;
+    //@}
+
+    //@{
+    /// @brief Return a pointer to the leaf node that contains voxel (x, y, z).
+    /// If no such node exists, return NULL.
+    LeafNodeType* probeLeaf(const Coord& xyz);
+    const LeafNodeType* probeConstLeaf(const Coord& xyz) const;
+    const LeafNodeType* probeLeaf(const Coord& xyz) const;
+    //@}
+
+    //@{
+    /// @brief Same as probeLeaf() except, if necessary, update the accessor with pointers
+    /// to the nodes along the path from the root node to the node containing (x, y, z).
+    template<typename AccessorT>
+    LeafNodeType* probeLeafAndCache(const Coord& xyz, AccessorT& acc);
+    template<typename AccessorT>
+    const LeafNodeType* probeConstLeafAndCache(const Coord& xyz, AccessorT& acc) const;
+    template<typename AccessorT>
+    const LeafNodeType* probeLeafAndCache(const Coord& xyz, AccessorT& acc) const;
+    //@}
+
+    /// @brief Return the leaf node that contains voxel (x, y, z).
+    /// If no such node exists, create one, but preserve the values and
+    /// active states of all voxels.
+    ///
+    /// @details Use this method to preallocate a static tree topology
+    /// over which to safely perform multithreaded processing.
+    LeafNodeType* touchLeaf(const Coord& xyz);
+
+    /// @brief Same as touchLeaf() except, if necessary, update the accessor with pointers
+    /// to the nodes along the path from the root node to the node containing the coordinate.
+    template<typename AccessorT>
+    LeafNodeType* touchLeafAndCache(const Coord& xyz, AccessorT&);
+
+    //@{
+    /// @brief Adds all nodes of a certain type to a container with the following API:
+    /// @code
+    /// struct ArrayT {
+    ///    typedef value_type;// defines the type of nodes to be added to the array
+    ///    void push_back(value_type nodePtr);// method that adds nodes to the array
+    /// };
+    /// @endcode
+    /// @details An example of a wrapper around a c-style array is:
+    /// @code
+    /// struct MyArray {
+    ///    typedef LeafType* value_type;
+    ///    value_type* ptr;
+    ///    MyArray(value_type* array) : ptr(array) {}
+    ///    void push_back(value_type leaf) { *ptr++ = leaf; }
+    ///};
+    /// @endcode
+    /// @details An example that constructs a list of pointers to all leaf nodes is:
+    /// @code
+    /// std::vector<const LeafNodeType*> array;//most std containers have the required API
+    /// array.reserve(tree.leafCount());//this is a fast preallocation.
+    /// tree.getNodes(array);
+    /// @endcode
+    template<typename ArrayT>
+    void getNodes(ArrayT& array);
+    template<typename ArrayT>
+    void getNodes(ArrayT& array) const;
+    //@}
+    
+    /// @brief Steals all nodes of a certain type from the tree and
+    /// adds them to a container with the following API:
+    /// @code
+    /// struct ArrayT {
+    ///    typedef value_type;// defines the type of nodes to be added to the array
+    ///    void push_back(value_type nodePtr);// method that adds nodes to the array
+    /// };
+    /// @endcode
+    /// @details An example of a wrapper around a c-style array is:
+    /// @code
+    /// struct MyArray {
+    ///    typedef LeafType* value_type;
+    ///    value_type* ptr;
+    ///    MyArray(value_type* array) : ptr(array) {}
+    ///    void push_back(value_type leaf) { *ptr++ = leaf; }
+    ///};
+    /// @endcode
+    /// @details An example that constructs a list of pointers to all leaf nodes is:
+    /// @code
+    /// std::vector<const LeafNodeType*> array;//most std containers have the required API
+    /// array.reserve(tree.leafCount());//this is a fast preallocation.
+    /// tree.stealNodes(array);
+    /// @endcode
+    template<typename ArrayT>
+    void stealNodes(ArrayT& array, const ValueType& value, bool state);
+
+    /// @brief Change inactive tiles or voxels with value oldBackground to newBackground
+    /// or -oldBackground to -newBackground. Active values are unchanged.
+    void resetBackground(const ValueType& oldBackground, const ValueType& newBackground);
+
+    /// @brief Return @c true if the given tree branch has the same node and active value
+    /// topology as this tree branch (but possibly a different @c ValueType).
+    template<typename OtherChildNodeType, Index OtherLog2Dim>
+    bool hasSameTopology(const InternalNode<OtherChildNodeType, OtherLog2Dim>* other) const;
+
+protected:
+    //@{
+    /// Allow iterators to call mask accessor methods (setValueMask(), setChildMask(), etc.).
+    /// @todo Make mask accessors public?
+    friend class IteratorBase<MaskOnIterator, InternalNode>;
+    friend class IteratorBase<MaskOffIterator, InternalNode>;
+    friend class IteratorBase<MaskDenseIterator, InternalNode>;
+    //@}
+
+    /// @brief During topology-only construction, access is needed
+    /// to protected/private members of other template instances.
+    template<typename, Index> friend class InternalNode;
+
+    // Mask accessors
+public:
+    /// Return the result of mValueMask.isOn(n) for table offset @a n.
+    bool isValueMaskOn(Index n) const
+    {
+        return mValueMask.isOn(n);
+    }
+    /// Return the result of the argument-free mValueMask.isOn() query.
+    bool isValueMaskOn() const
+    {
+        return mValueMask.isOn();
+    }
+    /// Return the result of mValueMask.isOff(n) for table offset @a n.
+    bool isValueMaskOff(Index n) const
+    {
+        return mValueMask.isOff(n);
+    }
+    /// Return the result of the argument-free mValueMask.isOff() query.
+    bool isValueMaskOff() const
+    {
+        return mValueMask.isOff();
+    }
+    /// Return the result of mChildMask.isOn(n) for table offset @a n.
+    bool isChildMaskOn(Index n) const
+    {
+        return mChildMask.isOn(n);
+    }
+    /// Return the result of mChildMask.isOff(n) for table offset @a n.
+    bool isChildMaskOff(Index n) const
+    {
+        return mChildMask.isOff(n);
+    }
+    /// Return the result of the argument-free mChildMask.isOff() query.
+    bool isChildMaskOff() const
+    {
+        return mChildMask.isOff();
+    }
+    /// Return a read-only reference to this node's value mask.
+    const NodeMaskType& getValueMask() const
+    {
+        return mValueMask;
+    }
+    /// Return a read-only reference to this node's child mask.
+    const NodeMaskType& getChildMask() const
+    {
+        return mChildMask;
+    }
+    /// @brief Return, by value, a mask derived from the value and child masks.
+    /// @details The result starts as a copy of the value mask, has the child mask
+    /// OR-ed into it and is then passed through toggle() (presumably a bitwise
+    /// inversion, leaving on only the entries that are off in both masks).
+    NodeMaskType getValueOffMask() const
+    {
+        NodeMaskType offMask(mValueMask);
+        offMask |= mChildMask;
+        offMask.toggle();
+        return offMask;
+    }
+    /// Return a read-only pointer to the first entry of this node's table (mNodes).
+    const UnionType* getTable() const
+    {
+        return mNodes;
+    }
+protected:
+    //@{
+    /// Use a mask accessor to ensure consistency between the child and value masks;
+    /// i.e., the value mask should always be off wherever the child mask is on.
+    void setValueMask(Index n, bool on)
+    {
+        // The requested state is honored only where no child is registered;
+        // where the child mask is on, the value bit is forced off.
+        const bool noChild = !mChildMask.isOn(n);
+        mValueMask.set(n, noChild && on);
+    }
+    //@}
+
+    void makeChildNodeEmpty(Index n, const ValueType& value);
+    void setChildNode(  Index i, ChildNodeType* child);//assumes a tile
+    void resetChildNode(Index i, ChildNodeType* child);//checks for an existing child
+    ChildNodeType* unsetChildNode(Index i, const ValueType& value);
+
+    template<typename NodeT, typename VisitorOp, typename ChildAllIterT>
+    static inline void doVisit(NodeT&, VisitorOp&);
+
+    template<typename NodeT, typename OtherNodeT, typename VisitorOp,
+        typename ChildAllIterT, typename OtherChildAllIterT>
+    static inline void doVisit2Node(NodeT&, OtherNodeT&, VisitorOp&);
+
+    template<typename NodeT, typename VisitorOp,
+        typename ChildAllIterT, typename OtherChildAllIterT>
+    static inline void doVisit2(NodeT&, OtherChildAllIterT&, VisitorOp&, bool otherIsLHS);
+
+    ///@{
+    /// @brief Returns a pointer to the child node at the linear offset n.
+    /// @warning This protected method assumes that a child node exists at
+    /// the specified linear offset!
+    ChildNodeType* getChildNode(Index n);
+    const ChildNodeType* getChildNode(Index n) const;
+    ///@}
+
+    ///@{
+    /// @brief Protected member classes for recursive multi-threading
+    struct VoxelizeActiveTiles;
+    template<typename OtherInternalNode> struct DeepCopy;
+    template<typename OtherInternalNode> struct TopologyCopy1;
+    template<typename OtherInternalNode> struct TopologyCopy2;
+    template<typename OtherInternalNode> struct TopologyUnion;
+    template<typename OtherInternalNode> struct TopologyDifference;
+    template<typename OtherInternalNode> struct TopologyIntersection;
+    ///@}
+   
+    UnionType mNodes[NUM_VALUES];
+    NodeMaskType mChildMask, mValueMask;
+    /// Global grid index coordinates (x,y,z) of the local origin of this node
+    Coord mOrigin;
+}; // class InternalNode
+
+
+////////////////////////////////////////
+
+
+//@{
+/// Helper metafunction used to implement InternalNode::SameConfiguration
+/// (which, as an inner class, can't be independently specialized)
+template<typename LhsChildT, Index LhsLog2Dim, typename RhsNodeT>
+struct SameInternalConfig
+{
+    // General case: the configurations are reported as different.
+    static const bool value = false;
+};
+
+template<typename LhsChildT, Index SharedLog2Dim, typename RhsChildT>
+struct SameInternalConfig<LhsChildT, SharedLog2Dim, InternalNode<RhsChildT, SharedLog2Dim> >
+{
+    // The second type is an InternalNode with the same Log2Dim: the answer is
+    // whatever the child types report about each other, recursively.
+    static const bool value =
+        LhsChildT::template SameConfiguration<RhsChildT>::value;
+};
+//@}
+
+
+////////////////////////////////////////
+
+
+/// @brief Construct a node whose table entries all hold @a background as a value.
+template<typename ChildT, Index Log2Dim>
+inline
+InternalNode<ChildT, Log2Dim>::InternalNode(const ValueType& background)
+{
+    Index slot = 0;
+    while (slot < NUM_VALUES) {
+        mNodes[slot].setValue(background);
+        ++slot;
+    }
+}
+
+
+/// @brief Construct a node at @a origin whose table entries all hold @a val.
+/// @param origin  coordinates from which the node origin is derived by clearing
+///                the bits selected by (DIM - 1) in each component
+/// @param val     value stored in every table entry
+/// @param active  if @c true, the whole value mask is switched on
+template<typename ChildT, Index Log2Dim>
+inline
+InternalNode<ChildT, Log2Dim>::InternalNode(const Coord& origin, const ValueType& val, bool active)
+    : mOrigin(origin[0] & ~(DIM - 1), // zero out the low-order bits
+              origin[1] & ~(DIM - 1),
+              origin[2] & ~(DIM - 1))
+{
+    Index slot = 0;
+    while (slot < NUM_VALUES) {
+        mNodes[slot].setValue(val);
+        ++slot;
+    }
+    if (active) {
+        mValueMask.setOn();
+    }
+}
+
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+// For InternalNodes, the PartialCreate constructor is identical to its
+// non-PartialCreate counterpart.
+template<typename ChildT, Index Log2Dim>
+inline
+InternalNode<ChildT, Log2Dim>::InternalNode(PartialCreate,
+    const Coord& origin, const ValueType& val, bool active)
+    : mOrigin(origin[0] & ~(DIM-1),
+              origin[1] & ~(DIM-1),
+              origin[2] & ~(DIM-1))
+{
+    // Store the given value in every table entry.
+    Index slot = 0;
+    while (slot < NUM_VALUES) {
+        mNodes[slot].setValue(val);
+        ++slot;
+    }
+    // An active node starts with its entire value mask switched on.
+    if (active) {
+        mValueMask.setOn();
+    }
+}
+#endif
+
+/// @brief Functor that fills the table of a target node from a source node.
+/// @details Constructing the functor immediately runs it over the offsets
+/// [0, NUM_VALUES) through tbb::parallel_for.  Where the source has a child,
+/// a new child is copy-constructed from it; elsewhere the source value is
+/// converted to this node's ValueType and stored.
+template<typename ChildT, Index Log2Dim>
+template<typename OtherInternalNode>
+struct InternalNode<ChildT, Log2Dim>::DeepCopy
+{
+    typedef tbb::blocked_range<Index> RangeT;
+
+    DeepCopy(const OtherInternalNode* source, InternalNode* target)
+        : s(source)
+        , t(target)
+    {
+        tbb::parallel_for(RangeT(0, NUM_VALUES), *this);
+        // Serial alternative: (*this)(RangeT(0, NUM_VALUES));
+    }
+
+    /// Process the table offsets in @a range.
+    void operator()(const RangeT& range) const
+    {
+        const Index last = range.end();
+        for (Index slot = range.begin(); slot != last; ++slot) {
+            if (s->mChildMask.isOn(slot)) {
+                t->mNodes[slot].setChild(new ChildNodeType(*(s->mNodes[slot].getChild())));
+            } else {
+                t->mNodes[slot].setValue(ValueType(s->mNodes[slot].getValue()));
+            }
+        }
+    }
+
+    const OtherInternalNode* s; ///< node being read
+    InternalNode* t;            ///< node being written
+};// DeepCopy
+
+/// @brief Copy constructor: duplicates the masks and origin, then uses DeepCopy
+/// to populate the table from @a other.
+template<typename ChildT, Index Log2Dim>
+inline
+InternalNode<ChildT, Log2Dim>::InternalNode(const InternalNode& other)
+    : mChildMask(other.mChildMask)
+    , mValueMask(other.mValueMask)
+    , mOrigin(other.mOrigin)
+{
+    // The copy is carried out by the functor's constructor.
+    DeepCopy<InternalNode<ChildT, Log2Dim> > copier(&other, this);
+}
+
+
+// Copy-construct from a node with the same configuration but a different ValueType.
+template<typename ChildT, Index Log2Dim>
+template<typename OtherChildNodeType>
+inline
+InternalNode<ChildT, Log2Dim>::InternalNode(const InternalNode<OtherChildNodeType, Log2Dim>& other):
+    mChildMask(other.mChildMask),
+    mValueMask(other.mValueMask),
+    mOrigin(other.mOrigin)
+{
+    // Masks and origin are taken over as-is; the table is populated by the
+    // DeepCopy functor, whose constructor performs the work.
+    typedef InternalNode<OtherChildNodeType, Log2Dim> SourceNodeT;
+    DeepCopy<SourceNodeT> copier(&other, this);
+}
+
+/// @brief Functor that rebuilds the table of a target node from the child
+/// layout of a source node, using a single background value.
+/// @details Constructing the functor immediately runs it over the offsets
+/// [0, NUM_VALUES) through tbb::parallel_for.  Where the source has a child,
+/// a new child is topology-copy-constructed from it with the background;
+/// every other entry is set to the background.
+template<typename ChildT, Index Log2Dim>
+template<typename OtherInternalNode>
+struct InternalNode<ChildT, Log2Dim>::TopologyCopy1
+{
+    typedef tbb::blocked_range<Index> RangeT;
+
+    TopologyCopy1(const OtherInternalNode* source, InternalNode* target,
+                  const ValueType& background)
+        : s(source)
+        , t(target)
+        , b(background)
+    {
+        tbb::parallel_for(RangeT(0, NUM_VALUES), *this);
+        // Serial alternative: (*this)(RangeT(0, NUM_VALUES));
+    }
+
+    /// Process the table offsets in @a range.
+    void operator()(const RangeT& range) const
+    {
+        const Index last = range.end();
+        for (Index slot = range.begin(); slot != last; ++slot) {
+            if (!s->isChildMaskOn(slot)) {
+                t->mNodes[slot].setValue(b);
+            } else {
+                t->mNodes[slot].setChild(
+                    new ChildNodeType(*(s->mNodes[slot].getChild()), b, TopologyCopy()));
+            }
+        }
+    }
+
+    const OtherInternalNode* s; ///< node being read
+    InternalNode* t;            ///< node being written
+    const ValueType &b;         ///< background value (held by reference)
+};// TopologyCopy1
+
+/// @brief Topology-copy constructor taking a single background value.
+/// @details Masks and origin are copied from @a other; the table is filled by
+/// the TopologyCopy1 functor.
+template<typename ChildT, Index Log2Dim>
+template<typename OtherChildNodeType>
+inline
+InternalNode<ChildT, Log2Dim>::InternalNode(const InternalNode<OtherChildNodeType, Log2Dim>& other,
+                                            const ValueType& background, TopologyCopy)
+    : mChildMask(other.mChildMask)
+    , mValueMask(other.mValueMask)
+    , mOrigin(other.mOrigin)
+{
+    typedef InternalNode<OtherChildNodeType, Log2Dim> SourceNodeT;
+    TopologyCopy1<SourceNodeT> copier(&other, this, background);
+}
+
+/// @brief Functor that rebuilds the table of a target node from the topology
+/// of a source node, using separate values for on and off entries.
+/// @details Constructing the functor immediately runs it over the offsets
+/// [0, NUM_VALUES) through tbb::parallel_for.  Where the source has a child,
+/// a new child is topology-copy-constructed from it with both values; every
+/// other entry receives the on value if the source's value mask is on there,
+/// and the off value otherwise.
+template<typename ChildT, Index Log2Dim>
+template<typename OtherInternalNode>
+struct InternalNode<ChildT, Log2Dim>::TopologyCopy2
+{
+    typedef tbb::blocked_range<Index> RangeT;
+
+    TopologyCopy2(const OtherInternalNode* source, InternalNode* target,
+                  const ValueType& offValue, const ValueType& onValue)
+        : s(source)
+        , t(target)
+        , offV(offValue)
+        , onV(onValue)
+    {
+        tbb::parallel_for(RangeT(0, NUM_VALUES), *this);
+    }
+
+    /// Process the table offsets in @a range.
+    void operator()(const RangeT& range) const
+    {
+        const Index last = range.end();
+        for (Index slot = range.begin(); slot != last; ++slot) {
+            if (!s->isChildMaskOn(slot)) {
+                const ValueType& tile = s->isValueMaskOn(slot) ? onV : offV;
+                t->mNodes[slot].setValue(tile);
+            } else {
+                t->mNodes[slot].setChild(
+                    new ChildNodeType(*(s->mNodes[slot].getChild()), offV, onV, TopologyCopy()));
+            }
+        }
+    }
+
+    const OtherInternalNode* s; ///< node being read
+    InternalNode* t;            ///< node being written
+    const ValueType &offV, &onV; ///< off and on values (held by reference)
+};// TopologyCopy2
+
+/// @brief Topology-copy constructor taking separate off and on values.
+/// @details Masks and origin are copied from @a other; the table is filled by
+/// the TopologyCopy2 functor.
+template<typename ChildT, Index Log2Dim>
+template<typename OtherChildNodeType>
+inline
+InternalNode<ChildT, Log2Dim>::InternalNode(const InternalNode<OtherChildNodeType, Log2Dim>& other,
+                                            const ValueType& offValue,
+                                            const ValueType& onValue, TopologyCopy)
+    : mChildMask(other.mChildMask)
+    , mValueMask(other.mValueMask)
+    , mOrigin(other.mOrigin)
+{
+    typedef InternalNode<OtherChildNodeType, Log2Dim> SourceNodeT;
+    TopologyCopy2<SourceNodeT> copier(&other, this, offValue, onValue);
+}
+
+
+/// @brief Destructor: deletes every child node registered in the child mask.
+template<typename ChildT, Index Log2Dim>
+inline
+InternalNode<ChildT, Log2Dim>::~InternalNode()
+{
+    ChildOnIter childIt = this->beginChildOn();
+    while (childIt) {
+        delete mNodes[childIt.pos()].getChild();
+        ++childIt;
+    }
+}
+
+
+////////////////////////////////////////
+
+
+/// @brief Return the number of leaf nodes at or below this node.
+/// @details If the children are at level 0, the answer is the number of
+/// on bits in the child mask; otherwise the children's counts are summed.
+template<typename ChildT, Index Log2Dim>
+inline Index32
+InternalNode<ChildT, Log2Dim>::leafCount() const
+{
+    Index32 total = 0;
+    if (ChildNodeType::getLevel() == 0) {
+        total = mChildMask.countOn();
+    } else {
+        for (ChildOnCIter childIt = this->cbeginChildOn(); childIt; ++childIt) {
+            total += childIt->leafCount();
+        }
+    }
+    return total;
+}
+
+
+/// @brief Return the number of non-leaf nodes in this branch, this node included.
+template<typename ChildT, Index Log2Dim>
+inline Index32
+InternalNode<ChildT, Log2Dim>::nonLeafCount() const
+{
+    Index32 total = 1; // this node
+    if (ChildNodeType::getLevel() != 0) {
+        // Children above level 0 contribute their own counts.
+        for (ChildOnCIter childIt = this->cbeginChildOn(); childIt; ++childIt) {
+            total += childIt->nonLeafCount();
+        }
+    }
+    return total;
+}
+
+
+/// @brief Return the number of active voxels in this branch.
+/// @details Each on bit of the value mask counts as ChildT::NUM_VOXELS voxels;
+/// the counts reported by the child nodes are added on top.
+template<typename ChildT, Index Log2Dim>
+inline Index64
+InternalNode<ChildT, Log2Dim>::onVoxelCount() const
+{
+    Index64 total = ChildT::NUM_VOXELS * mValueMask.countOn();
+    ChildOnCIter childIt = this->cbeginChildOn();
+    while (childIt) {
+        total += childIt->onVoxelCount();
+        ++childIt;
+    }
+    return total;
+}
+
+
+/// @brief Return the number of inactive voxels in this branch.
+/// @details Table entries that are on in neither the value mask nor the child
+/// mask each count as ChildT::NUM_VOXELS voxels; the counts reported by the
+/// child nodes are added on top.
+template<typename ChildT, Index Log2Dim>
+inline Index64
+InternalNode<ChildT, Log2Dim>::offVoxelCount() const
+{
+    Index64 total = ChildT::NUM_VOXELS * (NUM_VALUES-mValueMask.countOn()-mChildMask.countOn());
+    ChildOnCIter childIt = this->cbeginChildOn();
+    while (childIt) {
+        total += childIt->offVoxelCount();
+        ++childIt;
+    }
+    return total;
+}
+
+
+/// @brief Return the sum of onLeafVoxelCount() over all child nodes.
+/// @details Values stored directly in this node's table contribute nothing.
+template<typename ChildT, Index Log2Dim>
+inline Index64
+InternalNode<ChildT, Log2Dim>::onLeafVoxelCount() const
+{
+    Index64 total = 0;
+    ChildOnCIter childIt = this->beginChildOn();
+    while (childIt) {
+        total += mNodes[childIt.pos()].getChild()->onLeafVoxelCount();
+        ++childIt;
+    }
+    return total;
+}
+
+
+/// @brief Return the sum of offLeafVoxelCount() over all child nodes.
+/// @details Values stored directly in this node's table contribute nothing.
+template<typename ChildT, Index Log2Dim>
+inline Index64
+InternalNode<ChildT, Log2Dim>::offLeafVoxelCount() const
+{
+    Index64 total = 0;
+    ChildOnCIter childIt = this->beginChildOn();
+    while (childIt) {
+        total += mNodes[childIt.pos()].getChild()->offLeafVoxelCount();
+        ++childIt;
+    }
+    return total;
+}
+
+/// @brief Return the number of active tiles in this branch.
+/// @details The on bits of this node's value mask are counted first.  Child
+/// nodes are consulted only when this node's LEVEL is greater than 1.
+template<typename ChildT, Index Log2Dim>
+inline Index64
+InternalNode<ChildT, Log2Dim>::onTileCount() const
+{
+    Index64 total = mValueMask.countOn();
+    if (LEVEL > 1) {
+        for (ChildOnCIter childIt = this->cbeginChildOn(); childIt; ++childIt) {
+            total += childIt->onTileCount();
+        }
+    }
+    return total;
+}
+
+/// @brief Return the memory usage of this branch as computed from the node
+/// table, both masks, the origin and the memUsage() of every child node.
+template<typename ChildT, Index Log2Dim>
+inline Index64
+InternalNode<ChildT, Log2Dim>::memUsage() const
+{
+    // Contribution of this node's own data members.
+    Index64 total = NUM_VALUES * sizeof(UnionType) + mChildMask.memUsage()
+                  + mValueMask.memUsage() + sizeof(mOrigin);
+    // Contribution of the child nodes.
+    ChildOnCIter childIt = this->cbeginChildOn();
+    while (childIt) {
+        total += childIt->memUsage();
+        ++childIt;
+    }
+    return total;
+}
+
+
+/// @brief Expand @a bbox using the active tiles and child nodes of this node.
+/// @param bbox         bounding box that is grown in place
+/// @param visitVoxels  forwarded unchanged to the child nodes
+template<typename ChildT, Index Log2Dim>
+inline void
+InternalNode<ChildT, Log2Dim>::evalActiveBoundingBox(CoordBBox& bbox, bool visitVoxels) const
+{
+    // Early out when bbox.isInside() accepts this node's bounding box
+    // (presumably meaning the box already encloses the whole node).
+    if (bbox.isInside(this->getNodeBoundingBox())) return;
+
+    // Active tiles: expand by the tile coordinate and the child dimension.
+    ValueOnCIter tileIt = this->cbeginValueOn();
+    while (tileIt) {
+        bbox.expand(tileIt.getCoord(), ChildT::DIM);
+        ++tileIt;
+    }
+
+    // Child nodes: let each child expand the box itself.
+    ChildOnCIter childIt = this->cbeginChildOn();
+    while (childIt) {
+        childIt->evalActiveBoundingBox(bbox, visitVoxels);
+        ++childIt;
+    }
+}
+
+
+////////////////////////////////////////
+
+
+/// @brief Prune every child node, then replace with a tile each child that
+/// reports itself constant to within @a tolerance.
+/// @details A replaced child is deleted; its table entry takes the value and
+/// active state that the child's isConstant() call wrote out.
+template<typename ChildT, Index Log2Dim>
+inline void
+InternalNode<ChildT, Log2Dim>::prune(const ValueType& tolerance)
+{
+    // Outputs of isConstant(); declared once and reused across iterations.
+    bool tileActive = false;
+    ValueType tileValue = zeroVal<ValueType>();
+
+    for (ChildOnIter childIt = this->beginChildOn(); childIt; ++childIt) {
+        const Index offset = childIt.pos();
+        ChildT* branch = mNodes[offset].getChild();
+        branch->prune(tolerance);
+        if (!branch->isConstant(tileValue, tileActive, tolerance)) continue;
+
+        // Collapse the constant child into a tile.
+        delete branch;
+        mChildMask.setOff(offset);
+        mValueMask.set(offset, tileActive);
+        mNodes[offset].setValue(tileValue);
+    }
+}
+
+
+////////////////////////////////////////
+
+
+/// @brief Detach and return the node of type @c NodeT containing @a xyz.
+/// @details Returns NULL if @c NodeT cannot occur below this node or if the
+/// table entry for @a xyz holds no child.  When @c NodeT is this node's child
+/// type, the child is unregistered and its entry becomes a tile with the given
+/// value and state; otherwise the request is forwarded to the child.
+template<typename ChildT, Index Log2Dim>
+template<typename NodeT>
+inline NodeT*
+InternalNode<ChildT, Log2Dim>::stealNode(const Coord& xyz, const ValueType& value, bool state)
+{
+    // NodeT must either be ChildT itself or sit on a lower level than ChildT.
+    if (NodeT::LEVEL > ChildT::LEVEL ||
+        (NodeT::LEVEL == ChildT::LEVEL && !(boost::is_same<NodeT, ChildT>::value))) return NULL;
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+    const Index offset = this->coordToOffset(xyz);
+    if (!mChildMask.isOn(offset)) return NULL; // no child stored here
+    ChildT* branch = mNodes[offset].getChild();
+    if (!(boost::is_same<NodeT, ChildT>::value)) {
+        // The requested node lives deeper in the tree.
+        return branch->template stealNode<NodeT>(xyz, value, state);
+    }
+    // Replace the child's entry with a tile and hand the child to the caller.
+    mChildMask.setOff(offset);
+    mValueMask.set(offset, state);
+    mNodes[offset].setValue(value);
+    return reinterpret_cast<NodeT*>(branch);
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+}
+
+
+////////////////////////////////////////
+
+
+/// @brief Return the node of type @c NodeT containing @a xyz, or NULL.
+/// @details NULL is returned if @c NodeT cannot occur below this node or if
+/// the table entry for @a xyz holds no child.
+template<typename ChildT, Index Log2Dim>
+template<typename NodeT>
+inline NodeT*
+InternalNode<ChildT, Log2Dim>::probeNode(const Coord& xyz)
+{
+    // NodeT must either be ChildT itself or sit on a lower level than ChildT.
+    if (NodeT::LEVEL > ChildT::LEVEL ||
+        (NodeT::LEVEL == ChildT::LEVEL && !(boost::is_same<NodeT, ChildT>::value))) return NULL;
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+    const Index offset = this->coordToOffset(xyz);
+    if (!mChildMask.isOn(offset)) return NULL; // no child stored here
+    ChildT* branch = mNodes[offset].getChild();
+    if (boost::is_same<NodeT, ChildT>::value) return reinterpret_cast<NodeT*>(branch);
+    return branch->template probeNode<NodeT>(xyz);
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+}
+
+
+/// @brief Return the node of type @c NodeT containing @a xyz, or NULL,
+/// inserting each child visited on the way into the accessor @a acc.
+template<typename ChildT, Index Log2Dim>
+template<typename NodeT, typename AccessorT>
+inline NodeT*
+InternalNode<ChildT, Log2Dim>::probeNodeAndCache(const Coord& xyz, AccessorT& acc)
+{
+    // NodeT must either be ChildT itself or sit on a lower level than ChildT.
+    if (NodeT::LEVEL > ChildT::LEVEL ||
+        (NodeT::LEVEL == ChildT::LEVEL && !(boost::is_same<NodeT, ChildT>::value))) return NULL;
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+    const Index offset = this->coordToOffset(xyz);
+    if (!mChildMask.isOn(offset)) return NULL; // no child stored here
+    ChildT* branch = mNodes[offset].getChild();
+    acc.insert(xyz, branch);
+    if (boost::is_same<NodeT, ChildT>::value) return reinterpret_cast<NodeT*>(branch);
+    return branch->template probeNodeAndCache<NodeT>(xyz, acc);
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+}
+
+
+/// @brief Return a const pointer to the node of type @c NodeT containing
+/// @a xyz, or NULL.
+/// @details NULL is returned if @c NodeT cannot occur below this node or if
+/// the table entry for @a xyz holds no child.
+template<typename ChildT, Index Log2Dim>
+template<typename NodeT>
+inline const NodeT*
+InternalNode<ChildT, Log2Dim>::probeConstNode(const Coord& xyz) const
+{
+    // NodeT must either be ChildT itself or sit on a lower level than ChildT.
+    if (NodeT::LEVEL > ChildT::LEVEL ||
+        (NodeT::LEVEL == ChildT::LEVEL && !(boost::is_same<NodeT, ChildT>::value))) return NULL;
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+    const Index offset = this->coordToOffset(xyz);
+    if (!mChildMask.isOn(offset)) return NULL; // no child stored here
+    const ChildT* branch = mNodes[offset].getChild();
+    if (boost::is_same<NodeT, ChildT>::value) return reinterpret_cast<const NodeT*>(branch);
+    return branch->template probeConstNode<NodeT>(xyz);
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+}
+
+
+/// @brief Return a const pointer to the node of type @c NodeT containing
+/// @a xyz, or NULL, inserting each child visited on the way into @a acc.
+template<typename ChildT, Index Log2Dim>
+template<typename NodeT, typename AccessorT>
+inline const NodeT*
+InternalNode<ChildT, Log2Dim>::probeConstNodeAndCache(const Coord& xyz, AccessorT& acc) const
+{
+    // NodeT must either be ChildT itself or sit on a lower level than ChildT.
+    if (NodeT::LEVEL > ChildT::LEVEL ||
+        (NodeT::LEVEL == ChildT::LEVEL && !(boost::is_same<NodeT, ChildT>::value))) return NULL;
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+    const Index offset = this->coordToOffset(xyz);
+    if (!mChildMask.isOn(offset)) return NULL; // no child stored here
+    const ChildT* branch = mNodes[offset].getChild();
+    acc.insert(xyz, branch);
+    if (boost::is_same<NodeT, ChildT>::value) return reinterpret_cast<const NodeT*>(branch);
+    return branch->template probeConstNodeAndCache<NodeT>(xyz, acc);
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+}
+
+
+////////////////////////////////////////
+
+
+/// @brief Return the leaf node containing @a xyz by probing for a node of
+/// type LeafNodeType.
+template<typename ChildT, Index Log2Dim>
+inline typename ChildT::LeafNodeType*
+InternalNode<ChildT, Log2Dim>::probeLeaf(const Coord& xyz)
+{
+    LeafNodeType* leaf = this->template probeNode<LeafNodeType>(xyz);
+    return leaf;
+}
+
+
+/// @brief Return the leaf node containing @a xyz by probing for a node of
+/// type LeafNodeType, passing the accessor @a acc along.
+template<typename ChildT, Index Log2Dim>
+template<typename AccessorT>
+inline typename ChildT::LeafNodeType*
+InternalNode<ChildT, Log2Dim>::probeLeafAndCache(const Coord& xyz, AccessorT& acc)
+{
+    LeafNodeType* leaf = this->template probeNodeAndCache<LeafNodeType>(xyz, acc);
+    return leaf;
+}
+
+
+/// @brief Const overload; forwards to probeConstLeafAndCache().
+template<typename ChildT, Index Log2Dim>
+template<typename AccessorT>
+inline const typename ChildT::LeafNodeType*
+InternalNode<ChildT, Log2Dim>::probeLeafAndCache(const Coord& xyz, AccessorT& acc) const
+{
+    const LeafNodeType* leaf = this->probeConstLeafAndCache(xyz, acc);
+    return leaf;
+}
+
+
+/// @brief Return a const pointer to the leaf node containing @a xyz by
+/// probing for a node of type LeafNodeType.
+template<typename ChildT, Index Log2Dim>
+inline const typename ChildT::LeafNodeType*
+InternalNode<ChildT, Log2Dim>::probeConstLeaf(const Coord& xyz) const
+{
+    const LeafNodeType* leaf = this->template probeConstNode<LeafNodeType>(xyz);
+    return leaf;
+}
+
+
+/// @brief Return a const pointer to the leaf node containing @a xyz by
+/// probing for a node of type LeafNodeType, passing the accessor @a acc along.
+template<typename ChildT, Index Log2Dim>
+template<typename AccessorT>
+inline const typename ChildT::LeafNodeType*
+InternalNode<ChildT, Log2Dim>::probeConstLeafAndCache(const Coord& xyz, AccessorT& acc) const
+{
+    const LeafNodeType* leaf =
+        this->template probeConstNodeAndCache<LeafNodeType>(xyz, acc);
+    return leaf;
+}
+
+
+////////////////////////////////////////
+
+
+/// @brief Insert @a leaf into this branch at the position given by its origin.
+/// @details When the children of this node are above level 0, an existing
+/// child is reused or a new one is created from the current tile value and
+/// state, and the leaf is passed down to it.  When the children are at
+/// level 0, the leaf itself becomes the child, replacing (and deleting) any
+/// child already stored at that position.
+template<typename ChildT, Index Log2Dim>
+inline void
+InternalNode<ChildT, Log2Dim>::addLeaf(LeafNodeType* leaf)
+{
+    assert(leaf != NULL);
+    const Coord& leafOrigin = leaf->origin();
+    const Index offset = this->coordToOffset(leafOrigin);
+    const bool hasChild = mChildMask.isOn(offset);
+    ChildT* branch = NULL;
+    if (ChildT::LEVEL>0) {
+        if (hasChild) {
+            branch = mNodes[offset].getChild();
+        } else {
+            // Create the intermediate child from the tile it replaces.
+            branch = new ChildT(leafOrigin, mNodes[offset].getValue(), mValueMask.isOn(offset));
+            this->setChildNode(offset, branch);
+        }
+    } else {
+        // The children of this node are leaves: store the leaf directly.
+        if (hasChild) delete mNodes[offset].getChild();
+        branch = reinterpret_cast<ChildT*>(leaf);
+        if (hasChild) {
+            mNodes[offset].setChild(branch);
+        } else {
+            this->setChildNode(offset, branch);
+        }
+    }
+    branch->addLeaf(leaf);
+}
+
+
+/// @brief Insert @a leaf into this branch at the position given by its origin,
+/// inserting the child nodes above level 0 that are visited into @a acc.
+/// @details When the children of this node are above level 0, an existing
+/// child is reused or a new one is created from the current tile value and
+/// state, and the leaf is passed down to it.  When the children are at
+/// level 0, the leaf itself becomes the child, replacing (and deleting) any
+/// child already stored at that position.
+template<typename ChildT, Index Log2Dim>
+template<typename AccessorT>
+inline void
+InternalNode<ChildT, Log2Dim>::addLeafAndCache(LeafNodeType* leaf, AccessorT& acc)
+{
+    assert(leaf != NULL);
+    const Coord& leafOrigin = leaf->origin();
+    const Index offset = this->coordToOffset(leafOrigin);
+    const bool hasChild = mChildMask.isOn(offset);
+    ChildT* branch = NULL;
+    if (ChildT::LEVEL>0) {
+        if (hasChild) {
+            branch = mNodes[offset].getChild();
+            acc.insert(leafOrigin, branch);//we only cache internal nodes
+        } else {
+            // Create the intermediate child from the tile it replaces.
+            branch = new ChildT(leafOrigin, mNodes[offset].getValue(), mValueMask.isOn(offset));
+            acc.insert(leafOrigin, branch);//we only cache internal nodes
+            this->setChildNode(offset, branch);
+        }
+    } else {
+        // The children of this node are leaves: store the leaf directly.
+        if (hasChild) delete mNodes[offset].getChild();
+        branch = reinterpret_cast<ChildT*>(leaf);
+        if (hasChild) {
+            mNodes[offset].setChild(branch);
+        } else {
+            this->setChildNode(offset, branch);
+        }
+    }
+    branch->addLeafAndCache(leaf, acc);
+}
+
+
+////////////////////////////////////////
+
+
+/// @brief Store a tile with the given value and active state at table offset @a n.
+/// @details The entry is first emptied via makeChildNodeEmpty(), then the
+/// value mask bit is set to @a state.  @a n must be less than NUM_VALUES.
+template<typename ChildT, Index Log2Dim>
+inline void
+InternalNode<ChildT, Log2Dim>::addTile(Index n, const ValueType& value, bool state)
+{
+    assert(n < NUM_VALUES);
+
+    // Order matters: empty the entry before touching its value-mask bit.
+    makeChildNodeEmpty(n, value);
+    mValueMask.set(n, state);
+}
+
+
+/// @brief Add a tile with the given value and state at tree level @a level,
+/// at the position containing @a xyz.
+/// @details Nothing happens if @a level is above this node's LEVEL.  If the
+/// tile belongs to a lower level, the request is passed to the child at that
+/// position, which is first created from the current tile if it does not
+/// exist.  If the tile belongs to this level, the table entry is overwritten
+/// and any child stored there is deleted.
+template<typename ChildT, Index Log2Dim>
+inline void
+InternalNode<ChildT, Log2Dim>::addTile(Index level, const Coord& xyz,
+                                       const ValueType& value, bool state)
+{
+    if (LEVEL < level) return;
+
+    const Index offset = this->coordToOffset(xyz);
+    const bool descend = (LEVEL > level);
+
+    if (mChildMask.isOn(offset)) {// child branch case
+        ChildT* branch = mNodes[offset].getChild();
+        if (descend) {
+            branch->addTile(level, xyz, value, state);
+        } else {
+            // The tile replaces the whole child branch.
+            delete branch;
+            mChildMask.setOff(offset);
+            mValueMask.set(offset, state);
+            mNodes[offset].setValue(value);
+        }
+    } else if (descend) {// tile case, target level is below this node
+        ChildT* branch = new ChildT(xyz, mNodes[offset].getValue(), mValueMask.isOn(offset));
+        this->setChildNode(offset, branch);
+        branch->addTile(level, xyz, value, state);
+    } else {// tile case, target level is this node
+        mValueMask.set(offset, state);
+        mNodes[offset].setValue(value);
+    }
+}
+
+
+// Same as addTile(level, xyz, value, state), except that every child this
+// node descends into is first passed to @a acc via insert().
+//
+// - If @a level is greater than this node's LEVEL, nothing is done.
+// - If @a level equals LEVEL, the entry for @a xyz becomes the tile; a child
+//   stored there is deleted.
+// - Otherwise the request is forwarded to the child at that entry, which is
+//   first created from the existing tile when no child is present.
+template<typename ChildT, Index Log2Dim>
+template<typename AccessorT>
+inline void
+InternalNode<ChildT, Log2Dim>::addTileAndCache(Index level, const Coord& xyz,
+    const ValueType& value, bool state, AccessorT& acc)
+{
+    if (LEVEL < level) return;
+
+    const Index n = this->coordToOffset(xyz);
+    const bool descend = (LEVEL > level);
+
+    if (mChildMask.isOn(n)) {// child branch case
+        ChildT* child = mNodes[n].getChild();
+        if (descend) {
+            acc.insert(xyz, child);
+            child->addTileAndCache(level, xyz, value, state, acc);
+            return;
+        }
+        // The tile belongs at this level and replaces the whole child branch.
+        delete child;
+        mChildMask.setOff(n);
+    } else if (descend) {// tile case
+        // Expand the current tile into a child so the request can go deeper.
+        ChildT* child = new ChildT(xyz, mNodes[n].getValue(), mValueMask.isOn(n));
+        this->setChildNode(n, child);
+        acc.insert(xyz, child);
+        child->addTileAndCache(level, xyz, value, state, acc);
+        return;
+    }
+
+    // Reached only when the tile is to be stored at this level.
+    mValueMask.set(n, state);
+    mNodes[n].setValue(value);
+}
+
+
+////////////////////////////////////////
+
+
+// Return the leaf node that contains voxel @a xyz, creating nodes as needed.
+// If the table entry for @a xyz is a tile, it is replaced by a new child
+// initialized with the tile's value and active state before descending.
+template<typename ChildT, Index Log2Dim>
+inline typename ChildT::LeafNodeType*
+InternalNode<ChildT, Log2Dim>::touchLeaf(const Coord& xyz)
+{
+    const Index n = this->coordToOffset(xyz);
+    if (mChildMask.isOff(n)) {
+        this->setChildNode(n, new ChildT(xyz, mNodes[n].getValue(), mValueMask.isOn(n)));
+    }
+    return mNodes[n].getChild()->touchLeaf(xyz);
+}
+
+
+// Same as touchLeaf(), except that the child this node descends into is
+// passed to @a acc via insert() before the request is forwarded.
+template<typename ChildT, Index Log2Dim>
+template<typename AccessorT>
+inline typename ChildT::LeafNodeType*
+InternalNode<ChildT, Log2Dim>::touchLeafAndCache(const Coord& xyz, AccessorT& acc)
+{
+    const Index n = this->coordToOffset(xyz);
+    ChildT* child = NULL;
+    if (mChildMask.isOff(n)) {
+        // Replace the tile with a child that inherits its value and state.
+        child = new ChildNodeType(xyz, mNodes[n].getValue(), mValueMask.isOn(n));
+        this->setChildNode(n, child);
+    } else {
+        child = mNodes[n].getChild();
+    }
+    acc.insert(xyz, child);
+    return child->touchLeafAndCache(xyz, acc);
+}
+
+
+////////////////////////////////////////
+
+
+// Return true if this node has no children, all of its tiles share one active
+// state, and every tile value is approximately equal (within @a tolerance)
+// to the first tile's value.
+//
+// @a state is written as soon as the node is known to have no children;
+// @a value is written once the active states are known to be uniform.
+// Both may therefore have been modified even when false is returned.
+template<typename ChildT, Index Log2Dim>
+inline bool
+InternalNode<ChildT, Log2Dim>::isConstant(ValueType& value, bool& state,
+                                          const ValueType& tolerance) const
+{
+    // Any child node rules out a constant node.
+    if (!mChildMask.isOff()) return false;
+
+    // The tiles must be either all active or all inactive.
+    state = mValueMask.isOn();
+    if (!state && !mValueMask.isOff()) return false;
+
+    value = mNodes[0].getValue();
+    Index i = 1;
+    while (i < NUM_VALUES) {
+        if (!math::isApproxEqual(mNodes[i].getValue(), value, tolerance)) return false;
+        ++i;
+    }
+    return true;
+}
+
+////////////////////////////////////////
+
+
+// Return true if this node has no children, all of its tiles share one active
+// state, and the spread between the smallest and largest tile value does not
+// exceed 2 * @a tolerance.
+//
+// @a state is written as soon as the node is known to have no children.
+// @a minValue and @a maxValue track the running extrema and hold the final
+// extrema on success; they may hold partial results when false is returned.
+template<typename ChildT, Index Log2Dim>
+inline bool
+InternalNode<ChildT, Log2Dim>::isConstant(ValueType& minValue, ValueType& maxValue,
+                                          bool& state, const ValueType& tolerance) const
+{
+    // Any child node rules out a constant node.
+    if (!mChildMask.isOff()) return false;
+
+    // The tiles must be either all active or all inactive.
+    state = mValueMask.isOn();
+    if (!state && !mValueMask.isOff()) return false;
+
+    const ValueType maxSpread = 2 * tolerance;
+    minValue = maxValue = mNodes[0].getValue();
+    for (Index i = 1; i < NUM_VALUES; ++i) {
+        const ValueType& sample = mNodes[i].getValue();
+        if (sample < minValue) {
+            // New minimum: reject if it lies too far below the current maximum.
+            if ((maxValue - sample) > maxSpread) return false;
+            minValue = sample;
+        } else if (sample > maxValue) {
+            // New maximum: reject if it lies too far above the current minimum.
+            if ((sample - minValue) > maxSpread) return false;
+            maxValue = sample;
+        }
+    }
+    return true;
+}
+
+
+////////////////////////////////////////
+
+
+// Return true if this node or any node below it has an active tile.
+// A node at LEVEL 1 only inspects its own value mask and does not query its
+// children.
+template<typename ChildT, Index Log2Dim>
+inline bool
+InternalNode<ChildT, Log2Dim>::hasActiveTiles() const
+{
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+    // An active bit in this node's own value mask settles the question.
+    if (!mValueMask.isOff()) return true;
+    if (LEVEL == 1) return false;
+    for (ChildOnCIter iter = this->cbeginChildOn(); iter; ++iter) {
+        if (iter->hasActiveTiles()) return true;
+    }
+    return false;
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+}
+
+
+// Return the active state of voxel @a xyz: the child's answer if the table
+// entry holds a child, otherwise the tile's bit in the value mask.
+template<typename ChildT, Index Log2Dim>
+inline bool
+InternalNode<ChildT, Log2Dim>::isValueOn(const Coord& xyz) const
+{
+    const Index n = this->coordToOffset(xyz);
+    if (this->isChildMaskOn(n)) {
+        return mNodes[n].getChild()->isValueOn(xyz);
+    }
+    return this->isValueMaskOn(n);
+}
+
+// Same as isValueOn(), except that a child this node descends into is first
+// passed to @a acc via insert().
+template<typename ChildT, Index Log2Dim>
+template<typename AccessorT>
+inline bool
+InternalNode<ChildT, Log2Dim>::isValueOnAndCache(const Coord& xyz, AccessorT& acc) const
+{
+    const Index n = this->coordToOffset(xyz);
+    if (this->isChildMaskOn(n)) {
+        acc.insert(xyz, mNodes[n].getChild());
+        return mNodes[n].getChild()->isValueOnAndCache(xyz, acc);
+    }
+    // Tile: the value mask holds the answer.
+    return this->isValueMaskOn(n);
+}
+
+
+// Return the value of voxel @a xyz: the tile value if the table entry is a
+// tile, otherwise whatever the child at that entry returns.
+template<typename ChildT, Index Log2Dim>
+inline const typename ChildT::ValueType&
+InternalNode<ChildT, Log2Dim>::getValue(const Coord& xyz) const
+{
+    const Index n = this->coordToOffset(xyz);
+    if (this->isChildMaskOff(n)) {
+        return mNodes[n].getValue();
+    }
+    return mNodes[n].getChild()->getValue(xyz);
+}
+
+// Same as getValue(), except that a child this node descends into is first
+// passed to @a acc via insert().
+template<typename ChildT, Index Log2Dim>
+template<typename AccessorT>
+inline const typename ChildT::ValueType&
+InternalNode<ChildT, Log2Dim>::getValueAndCache(const Coord& xyz, AccessorT& acc) const
+{
+    const Index n = this->coordToOffset(xyz);
+    if (this->isChildMaskOff(n)) {
+        // Tile: answer directly, nothing to hand to the accessor.
+        return mNodes[n].getValue();
+    }
+    acc.insert(xyz, mNodes[n].getChild());
+    return mNodes[n].getChild()->getValueAndCache(xyz, acc);
+}
+
+
+// Return the tree level at which the value of voxel @a xyz is stored:
+// this node's LEVEL if the table entry is a tile, otherwise the level
+// reported by the child at that entry.
+template<typename ChildT, Index Log2Dim>
+inline Index
+InternalNode<ChildT, Log2Dim>::getValueLevel(const Coord& xyz) const
+{
+    const Index n = this->coordToOffset(xyz);
+    if (this->isChildMaskOff(n)) {
+        return LEVEL;
+    }
+    return mNodes[n].getChild()->getValueLevel(xyz);
+}
+
+// Same as getValueLevel(), except that a child this node descends into is
+// first passed to @a acc via insert().
+template<typename ChildT, Index Log2Dim>
+template<typename AccessorT>
+inline Index
+InternalNode<ChildT, Log2Dim>::getValueLevelAndCache(const Coord& xyz, AccessorT& acc) const
+{
+    const Index n = this->coordToOffset(xyz);
+    if (this->isChildMaskOff(n)) {
+        // Tile: the value lives at this node's level.
+        return LEVEL;
+    }
+    acc.insert(xyz, mNodes[n].getChild());
+    return mNodes[n].getChild()->getValueLevelAndCache(xyz, acc);
+}
+
+
+// Write the value of voxel @a xyz into @a value and return its active state.
+// A tile answers from this node's table and value mask; otherwise the query
+// is forwarded to the child at that entry.
+template<typename ChildT, Index Log2Dim>
+inline bool
+InternalNode<ChildT, Log2Dim>::probeValue(const Coord& xyz, ValueType& value) const
+{
+    const Index n = this->coordToOffset(xyz);
+    if (this->isChildMaskOn(n)) {
+        return mNodes[n].getChild()->probeValue(xyz, value);
+    }
+    value = mNodes[n].getValue();
+    return this->isValueMaskOn(n);
+}
+
+// Same as probeValue(), except that a child this node descends into is first
+// passed to @a acc via insert().
+template<typename ChildT, Index Log2Dim>
+template<typename AccessorT>
+inline bool
+InternalNode<ChildT, Log2Dim>::probeValueAndCache(const Coord& xyz,
+    ValueType& value, AccessorT& acc) const
+{
+    const Index n = this->coordToOffset(xyz);
+    if (this->isChildMaskOff(n)) {
+        // Tile: answer from this node's table and value mask.
+        value = mNodes[n].getValue();
+        return this->isValueMaskOn(n);
+    }
+    acc.insert(xyz, mNodes[n].getChild());
+    return mNodes[n].getChild()->probeValueAndCache(xyz, value, acc);
+}
+
+
+// Mark voxel @a xyz as inactive without changing its value.
+// Nothing is done if the voxel lies in an inactive tile.  An active tile is
+// first replaced by a child with the same value and active state so that the
+// single voxel can be switched off.
+template<typename ChildT, Index Log2Dim>
+inline void
+InternalNode<ChildT, Log2Dim>::setValueOff(const Coord& xyz)
+{
+    const Index n = this->coordToOffset(xyz);
+    if (this->isChildMaskOff(n)) {
+        // Inactive tile: the voxel is already off.
+        if (!this->isValueMaskOn(n)) return;
+        // If the voxel belongs to a constant tile that is active,
+        // a child subtree must be constructed.
+        this->setChildNode(n, new ChildNodeType(xyz, mNodes[n].getValue(), /*active=*/true));
+    }
+    mNodes[n].getChild()->setValueOff(xyz);
+}
+
+
+// Mark voxel @a xyz as active without changing its value.
+// Nothing is done if the voxel lies in an active tile.  An inactive tile is
+// first replaced by a child with the same value and active state so that the
+// single voxel can be switched on.
+template<typename ChildT, Index Log2Dim>
+inline void
+InternalNode<ChildT, Log2Dim>::setValueOn(const Coord& xyz)
+{
+    const Index n = this->coordToOffset(xyz);
+    if (this->isChildMaskOff(n)) {
+        // Active tile: the voxel is already on.
+        if (this->isValueMaskOn(n)) return;
+        // If the voxel belongs to a constant tile that is inactive,
+        // a child subtree must be constructed.
+        this->setChildNode(n, new ChildNodeType(xyz, mNodes[n].getValue(), /*active=*/false));
+    }
+    mNodes[n].getChild()->setValueOn(xyz);
+}
+
+
+// Set voxel @a xyz to @a value and mark it inactive.
+// Nothing is done if the voxel lies in an inactive tile whose value is
+// exactly @a value.  Any other tile is first replaced by a child carrying the
+// tile's value and active state; the request is then forwarded to the child.
+template<typename ChildT, Index Log2Dim>
+inline void
+InternalNode<ChildT, Log2Dim>::setValueOff(const Coord& xyz, const ValueType& value)
+{
+    const Index n = InternalNode::coordToOffset(xyz);
+    if (this->isChildMaskOff(n)) {
+        const bool active = this->isValueMaskOn(n);
+        // The tile already represents the requested voxel: nothing to change.
+        if (!active && math::isExactlyEqual(mNodes[n].getValue(), value)) return;
+        // If the voxel belongs to a tile that is either active or that
+        // has a constant value that is different from the one provided,
+        // a child subtree must be constructed.
+        this->setChildNode(n, new ChildNodeType(xyz, mNodes[n].getValue(), active));
+    }
+    mNodes[n].getChild()->setValueOff(xyz, value);
+}
+
+// Same as setValueOff(xyz, value), except that the child this node descends
+// into is first passed to @a acc via insert().
+template<typename ChildT, Index Log2Dim>
+template<typename AccessorT>
+inline void
+InternalNode<ChildT, Log2Dim>::setValueOffAndCache(const Coord& xyz,
+    const ValueType& value, AccessorT& acc)
+{
+    const Index n = InternalNode::coordToOffset(xyz);
+    if (this->isChildMaskOff(n)) {
+        const bool active = this->isValueMaskOn(n);
+        // The tile already represents the requested voxel: nothing to change.
+        if (!active && math::isExactlyEqual(mNodes[n].getValue(), value)) return;
+        // If the voxel belongs to a tile that is either active or that
+        // has a constant value that is different from the one provided,
+        // a child subtree must be constructed.
+        this->setChildNode(n, new ChildNodeType(xyz, mNodes[n].getValue(), active));
+    }
+    ChildT* child = mNodes[n].getChild();
+    acc.insert(xyz, child);
+    child->setValueOffAndCache(xyz, value, acc);
+}
+
+
+// Set voxel @a xyz to @a value and mark it active.
+// Nothing is done if the voxel lies in an active tile whose value is exactly
+// @a value.  Any other tile is first replaced by a child carrying the tile's
+// value and active state; the request is then forwarded to the child.
+template<typename ChildT, Index Log2Dim>
+inline void
+InternalNode<ChildT, Log2Dim>::setValueOn(const Coord& xyz, const ValueType& value)
+{
+    const Index n = this->coordToOffset(xyz);
+    if (this->isChildMaskOff(n)) {
+        const bool active = this->isValueMaskOn(n); // tile's active state
+        // The tile already represents the requested voxel: nothing to change.
+        if (active && math::isExactlyEqual(mNodes[n].getValue(), value)) return;
+        // If the voxel belongs to a tile that is either inactive or that
+        // has a constant value that is different from the one provided,
+        // a child subtree must be constructed.
+        this->setChildNode(n, new ChildNodeType(xyz, mNodes[n].getValue(), active));
+    }
+    mNodes[n].getChild()->setValueOn(xyz, value);
+}
+
+// Accessor-aware variant of setValueOn(xyz, value): applies the same tile
+// test, and passes the child this node descends into to @a acc via insert()
+// before forwarding the request.
+template<typename ChildT, Index Log2Dim>
+template<typename AccessorT>
+inline void
+InternalNode<ChildT, Log2Dim>::setValueAndCache(const Coord& xyz,
+    const ValueType& value, AccessorT& acc)
+{
+    const Index n = this->coordToOffset(xyz);
+    if (this->isChildMaskOff(n)) {
+        const bool active = this->isValueMaskOn(n);
+        // The tile already represents the requested voxel: nothing to change.
+        if (active && math::isExactlyEqual(mNodes[n].getValue(), value)) return;
+        // If the voxel belongs to a tile that is either inactive or that
+        // has a constant value that is different from the one provided,
+        // a child subtree must be constructed.
+        this->setChildNode(n, new ChildNodeType(xyz, mNodes[n].getValue(), active));
+    }
+    acc.insert(xyz, mNodes[n].getChild());
+    mNodes[n].getChild()->setValueAndCache(xyz, value, acc);
+}
+
+
+// Set voxel @a xyz to @a value without changing its active state.
+// Nothing is done if the voxel lies in a tile whose value is exactly
+// @a value.  A tile with a different value is first replaced by a child
+// carrying the tile's value and active state.
+template<typename ChildT, Index Log2Dim>
+inline void
+InternalNode<ChildT, Log2Dim>::setValueOnly(const Coord& xyz, const ValueType& value)
+{
+    const Index n = this->coordToOffset(xyz);
+    if (this->isChildMaskOff(n)) {
+        // The tile already has the requested value: nothing to change.
+        if (math::isExactlyEqual(mNodes[n].getValue(), value)) return;
+        // If the voxel has a tile value that is different from the one provided,
+        // a child subtree must be constructed.
+        const bool active = this->isValueMaskOn(n);
+        this->setChildNode(n, new ChildNodeType(xyz, mNodes[n].getValue(), active));
+    }
+    mNodes[n].getChild()->setValueOnly(xyz, value);
+}
+
+// Same as setValueOnly(), except that the child this node descends into is
+// first passed to @a acc via insert().
+template<typename ChildT, Index Log2Dim>
+template<typename AccessorT>
+inline void
+InternalNode<ChildT, Log2Dim>::setValueOnlyAndCache(const Coord& xyz,
+                                                    const ValueType& value, AccessorT& acc)
+{
+    const Index n = this->coordToOffset(xyz);
+    if (this->isChildMaskOff(n)) {
+        // The tile already has the requested value: nothing to change.
+        if (math::isExactlyEqual(mNodes[n].getValue(), value)) return;
+        // If the voxel has a tile value that is different from the one provided,
+        // a child subtree must be constructed.
+        const bool active = this->isValueMaskOn(n);
+        this->setChildNode(n, new ChildNodeType(xyz, mNodes[n].getValue(), active));
+    }
+    acc.insert(xyz, mNodes[n].getChild());
+    mNodes[n].getChild()->setValueOnlyAndCache(xyz, value, acc);
+}
+
+
+// Set the active state of voxel @a xyz to @a on without changing its value.
+// Nothing is done if the voxel lies in a tile that already has that state.
+// A tile with the opposite state is first replaced by a child carrying the
+// tile's value and (current) state.
+template<typename ChildT, Index Log2Dim>
+inline void
+InternalNode<ChildT, Log2Dim>::setActiveState(const Coord& xyz, bool on)
+{
+    const Index n = this->coordToOffset(xyz);
+    if (this->isChildMaskOff(n)) {
+        // The tile already has the requested state: nothing to change.
+        if (on == this->isValueMaskOn(n)) return;
+        // If the voxel belongs to a tile with the wrong active state,
+        // then a child subtree must be constructed.
+        // 'on' is the voxel's new state, therefore '!on' is the tile's current state
+        this->setChildNode(n, new ChildNodeType(xyz, mNodes[n].getValue(), !on));
+    }
+    mNodes[n].getChild()->setActiveState(xyz, on);
+}
+
+// Same as setActiveState(), except that the child this node descends into is
+// first passed to @a acc via insert().
+template<typename ChildT, Index Log2Dim>
+template<typename AccessorT>
+inline void
+InternalNode<ChildT, Log2Dim>::setActiveStateAndCache(const Coord& xyz, bool on, AccessorT& acc)
+{
+    const Index n = this->coordToOffset(xyz);
+    if (this->isChildMaskOff(n)) {
+        // The tile already has the requested state: nothing to change.
+        if (on == this->isValueMaskOn(n)) return;
+        // If the voxel belongs to a tile with the wrong active state,
+        // then a child subtree must be constructed.
+        // 'on' is the voxel's new state, therefore '!on' is the tile's current state
+        this->setChildNode(n, new ChildNodeType(xyz, mNodes[n].getValue(), !on));
+    }
+    ChildT* child = mNodes[n].getChild();
+    acc.insert(xyz, child);
+    child->setActiveStateAndCache(xyz, on, acc);
+}
+
+
+// Mark every value in this subtree as active: the value mask is set to the
+// complement of the child mask (i.e. on for every tile entry), and the call
+// is then forwarded to each child node.
+template<typename ChildT, Index Log2Dim>
+inline void
+InternalNode<ChildT, Log2Dim>::setValuesOn()
+{
+    mValueMask = !mChildMask;
+    for (ChildOnIter iter = this->beginChildOn(); iter; ++iter) {
+        ChildT* child = mNodes[iter.pos()].getChild();
+        child->setValuesOn();
+    }
+}
+
+
+// Apply functor @a op to the value of voxel @a xyz (forwarded to the child's
+// modifyValue()).
+// If the voxel lies in an active tile and applying @a op to a copy of the
+// tile value leaves it exactly unchanged, nothing is done.  Otherwise a tile
+// is first replaced by a child carrying the tile's value and active state.
+// Note that for an active tile @a op is invoked once on a scratch copy before
+// being applied again by the child.
+template<typename ChildT, Index Log2Dim>
+template<typename ModifyOp>
+inline void
+InternalNode<ChildT, Log2Dim>::modifyValue(const Coord& xyz, const ModifyOp& op)
+{
+    const Index n = InternalNode::coordToOffset(xyz);
+    if (this->isChildMaskOff(n)) {
+        // Need to create a child if the tile is inactive,
+        // in order to activate voxel (x, y, z).
+        const bool active = this->isValueMaskOn(n);
+        if (active) {
+            // Need to create a child if applying the functor
+            // to the tile value produces a different value.
+            const ValueType& tileVal = mNodes[n].getValue();
+            ValueType modifiedVal = tileVal;
+            op(modifiedVal);
+            if (math::isExactlyEqual(tileVal, modifiedVal)) return;
+        }
+        this->setChildNode(n, new ChildNodeType(xyz, mNodes[n].getValue(), active));
+    }
+    mNodes[n].getChild()->modifyValue(xyz, op);
+}
+
+// Same as modifyValue(), except that the child this node descends into is
+// first passed to @a acc via insert().
+template<typename ChildT, Index Log2Dim>
+template<typename ModifyOp, typename AccessorT>
+inline void
+InternalNode<ChildT, Log2Dim>::modifyValueAndCache(const Coord& xyz, const ModifyOp& op,
+    AccessorT& acc)
+{
+    const Index n = InternalNode::coordToOffset(xyz);
+    if (this->isChildMaskOff(n)) {
+        // Need to create a child if the tile is inactive,
+        // in order to activate voxel (x, y, z).
+        const bool active = this->isValueMaskOn(n);
+        if (active) {
+            // Need to create a child if applying the functor
+            // to the tile value produces a different value.
+            const ValueType& tileVal = mNodes[n].getValue();
+            ValueType modifiedVal = tileVal;
+            op(modifiedVal);
+            if (math::isExactlyEqual(tileVal, modifiedVal)) return;
+        }
+        this->setChildNode(n, new ChildNodeType(xyz, mNodes[n].getValue(), active));
+    }
+    ChildNodeType* child = mNodes[n].getChild();
+    acc.insert(xyz, child);
+    child->modifyValueAndCache(xyz, op, acc);
+}
+
+
+// Apply functor @a op to the value and active state of voxel @a xyz
+// (forwarded to the child's modifyValueAndActiveState()).
+// For a tile, @a op is first run on scratch copies; the state copy starts out
+// as the opposite of the tile's state.  If the result matches the tile's
+// value exactly and its state, nothing is done; otherwise the tile is
+// replaced by a child carrying the tile's value and active state.
+template<typename ChildT, Index Log2Dim>
+template<typename ModifyOp>
+inline void
+InternalNode<ChildT, Log2Dim>::modifyValueAndActiveState(const Coord& xyz, const ModifyOp& op)
+{
+    const Index n = InternalNode::coordToOffset(xyz);
+    if (this->isChildMaskOff(n)) {
+        const bool tileState = this->isValueMaskOn(n);
+        const ValueType& tileVal = mNodes[n].getValue();
+        bool modifiedState = !tileState;
+        ValueType modifiedVal = tileVal;
+        op(modifiedVal, modifiedState);
+        // A child is needed only if applying the functor to the tile
+        // produces a different value or active state.
+        if (modifiedState == tileState && math::isExactlyEqual(modifiedVal, tileVal)) return;
+        this->setChildNode(n, new ChildNodeType(xyz, tileVal, tileState));
+    }
+    mNodes[n].getChild()->modifyValueAndActiveState(xyz, op);
+}
+
+// Same as modifyValueAndActiveState(), except that the child this node
+// descends into is first passed to @a acc via insert().
+template<typename ChildT, Index Log2Dim>
+template<typename ModifyOp, typename AccessorT>
+inline void
+InternalNode<ChildT, Log2Dim>::modifyValueAndActiveStateAndCache(
+    const Coord& xyz, const ModifyOp& op, AccessorT& acc)
+{
+    const Index n = InternalNode::coordToOffset(xyz);
+    if (this->isChildMaskOff(n)) {
+        const bool tileState = this->isValueMaskOn(n);
+        const ValueType& tileVal = mNodes[n].getValue();
+        bool modifiedState = !tileState;
+        ValueType modifiedVal = tileVal;
+        op(modifiedVal, modifiedState);
+        // A child is needed only if applying the functor to the tile
+        // produces a different value or active state.
+        if (modifiedState == tileState && math::isExactlyEqual(modifiedVal, tileVal)) return;
+        this->setChildNode(n, new ChildNodeType(xyz, tileVal, tileState));
+    }
+    ChildNodeType* child = mNodes[n].getChild();
+    acc.insert(xyz, child);
+    child->modifyValueAndActiveStateAndCache(xyz, op, acc);
+}
+
+
+////////////////////////////////////////
+
+
+// Restrict this subtree to the region @a clipBBox: everything outside the
+// region becomes an inactive tile with value @a background.
+//
+// - A node that lies entirely outside the region is filled with inactive
+//   background tiles and the function returns.
+// - A node that lies entirely inside the region is left untouched.
+// - Otherwise each table entry is examined: entries outside the region are
+//   replaced by inactive background tiles, entries straddling the boundary
+//   are clipped (children recursively; tiles by resetting them to background
+//   and re-filling the overlapping part with the tile's original value and
+//   state), and entries inside the region are left as they are.
+template<typename ChildT, Index Log2Dim>
+inline void
+InternalNode<ChildT, Log2Dim>::clip(const CoordBBox& clipBBox, const ValueType& background)
+{
+    CoordBBox nodeBBox = this->getNodeBoundingBox();
+    if (!clipBBox.hasOverlap(nodeBBox)) {
+        // This node lies completely outside the clipping region.  Fill it with background tiles.
+        this->fill(nodeBBox, background, /*active=*/false);
+        // Every entry is now an inactive background tile, so the per-entry pass
+        // below would only repeat the same replacement for each entry.
+        return;
+    }
+    if (clipBBox.isInside(nodeBBox)) {
+        // This node lies completely inside the clipping region.  Leave it intact.
+        return;
+    }
+
+    // This node isn't completely contained inside the clipping region.
+    // Clip tiles and children, and replace any that lie outside the region
+    // with background tiles.
+
+    for (Index pos = 0; pos < NUM_VALUES; ++pos) {
+        const Coord xyz = this->offsetToGlobalCoord(pos); // tile or child origin
+        CoordBBox tileBBox(xyz, xyz.offsetBy(ChildT::DIM - 1)); // tile or child bounds
+        if (!clipBBox.hasOverlap(tileBBox)) {
+            // This table entry lies completely outside the clipping region.
+            // Replace it with a background tile.
+            this->makeChildNodeEmpty(pos, background);
+            mValueMask.setOff(pos);
+        } else if (!clipBBox.isInside(tileBBox)) {
+            // This table entry does not lie completely inside the clipping region
+            // and must be clipped.
+            if (this->isChildMaskOn(pos)) {
+                mNodes[pos].getChild()->clip(clipBBox, background);
+            } else {
+                // Replace this tile with a background tile, then fill the clip region
+                // with the tile's original value.  (This might create a child branch.)
+                tileBBox.intersect(clipBBox);
+                // Copy the value: the table entry is overwritten just below.
+                const ValueType val = mNodes[pos].getValue();
+                const bool on = this->isValueMaskOn(pos);
+                mNodes[pos].setValue(background);
+                mValueMask.setOff(pos);
+                this->fill(tileBBox, val, on);
+            }
+        } else {
+            // This table entry lies completely inside the clipping region.  Leave it intact.
+        }
+    }
+}
+
+
+////////////////////////////////////////
+
+
+// Set all voxels inside @a bbox to @a value with active state @a active.
+//
+// The box is traversed one table entry at a time.  An entry that is entirely
+// covered by the remaining box becomes a tile with the fill value; an entry
+// that is only partly covered is represented by a child node (created from
+// the existing tile if necessary), and the overlapping part of the box is
+// forwarded to that child.
+//
+// NOTE(review): coordToOffset() is applied to coordinates taken from @a bbox,
+// so the box is presumably expected to lie within this node's bounds --
+// confirm against callers.
+template<typename ChildT, Index Log2Dim>
+inline void
+InternalNode<ChildT, Log2Dim>::fill(const CoordBBox& bbox, const ValueType& value, bool active)
+{
+    Coord xyz, tileMin, tileMax;
+    // Each loop advances to the first coordinate past the tile just visited;
+    // tileMax is assigned in the innermost loop body before any increment runs.
+    for (int x = bbox.min().x(); x <= bbox.max().x(); x = tileMax.x() + 1) {
+        xyz.setX(x);
+        for (int y = bbox.min().y(); y <= bbox.max().y(); y = tileMax.y() + 1) {
+            xyz.setY(y);
+            for (int z = bbox.min().z(); z <= bbox.max().z(); z = tileMax.z() + 1) {
+                xyz.setZ(z);
+
+                // Get the bounds of the tile that contains voxel (x, y, z).
+                const Index n = this->coordToOffset(xyz);
+                tileMin = this->offsetToGlobalCoord(n);
+                tileMax = tileMin.offsetBy(ChildT::DIM - 1);
+
+                if (xyz != tileMin || Coord::lessThan(bbox.max(), tileMax)) {
+                    // If the box defined by (xyz, bbox.max()) doesn't completely enclose
+                    // the tile to which xyz belongs, create a child node (or retrieve
+                    // the existing one).
+                    ChildT* child = NULL;
+                    if (this->isChildMaskOff(n)) {
+                        // Replace the tile with a newly-created child that is initialized
+                        // with the tile's value and active state.
+                        child = new ChildT(xyz, mNodes[n].getValue(), this->isValueMaskOn(n));
+                        this->setChildNode(n, child);
+                    } else {
+                        child = mNodes[n].getChild();
+                    }
+
+                    // Forward the fill request to the child.
+                    // The sub-box is clamped to the child's extent.
+                    if (child) {
+                        child->fill(CoordBBox(xyz, Coord::minComponent(bbox.max(), tileMax)),
+                            value, active);
+                    }
+
+                } else {
+                    // If the box given by (xyz, bbox.max()) completely encloses
+                    // the tile to which xyz belongs, create the tile (if it
+                    // doesn't already exist) and give it the fill value.
+                    this->makeChildNodeEmpty(n, value);
+                    mValueMask.set(n, active);
+                }
+            }
+        }
+    }
+}
+
+
+////////////////////////////////////////
+
+
+// Copy the values of all voxels inside @a bbox into the dense grid @a dense.
+//
+// The box is traversed one table entry at a time.  For an entry holding a
+// child, the overlapping sub-box is forwarded to the child's copyToDense().
+// For a tile, the tile value (converted to the dense grid's value type) is
+// written to every dense element of the overlapping sub-box.
+//
+// NOTE(review): @a bbox is presumably expected to lie within both this node
+// and dense.bbox(); nothing here checks that -- confirm against callers.
+template<typename ChildT, Index Log2Dim>
+template<typename DenseT>
+inline void
+InternalNode<ChildT, Log2Dim>::copyToDense(const CoordBBox& bbox, DenseT& dense) const
+{
+    typedef typename DenseT::ValueType DenseValueType;
+
+    const size_t xStride = dense.xStride(), yStride = dense.yStride(), zStride = dense.zStride();
+    const Coord& min = dense.bbox().min();
+    // Each loop advances to the first coordinate past the entry just visited;
+    // max is assigned in the innermost loop body before any increment runs.
+    for (Coord xyz = bbox.min(), max; xyz[0] <= bbox.max()[0]; xyz[0] = max[0] + 1) {
+        for (xyz[1] = bbox.min()[1]; xyz[1] <= bbox.max()[1]; xyz[1] = max[1] + 1) {
+            for (xyz[2] = bbox.min()[2]; xyz[2] <= bbox.max()[2]; xyz[2] = max[2] + 1) {
+                const Index n = this->coordToOffset(xyz);
+                // Get max coordinates of the child node that contains voxel xyz.
+                max = this->offsetToGlobalCoord(n).offsetBy(ChildT::DIM-1);
+
+                // Get the bbox of the intersection of bbox and the child node
+                CoordBBox sub(xyz, Coord::minComponent(bbox.max(), max));
+
+                if (this->isChildMaskOn(n)) {//is a child
+                    mNodes[n].getChild()->copyToDense(sub, dense);
+                } else {//a tile value
+                    const ValueType value = mNodes[n].getValue();
+                    // Express the sub-box relative to the dense grid's minimum corner.
+                    sub.translate(-min);
+                    // a0/a1/a2 address the first z element of each x slab / y row.
+                    DenseValueType* a0 = dense.data() + zStride*sub.min()[2];
+                    for (Int32 x=sub.min()[0], ex=sub.max()[0]+1; x<ex; ++x) {
+                        DenseValueType* a1 = a0 + x*xStride;
+                        for (Int32 y=sub.min()[1], ey=sub.max()[1]+1; y<ey; ++y) {
+                            DenseValueType* a2 = a1 + y*yStride;
+                            for (Int32 z=sub.min()[2], ez=sub.max()[2]+1; z<ez; ++z, a2 += zStride) {
+                                *a2 = DenseValueType(value);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+
+////////////////////////////////////////
+
+
+// Write this subtree to @a os: the child mask, the value mask, the node's
+// table of tile values (written through io::writeCompressedValues(), with
+// zero substituted at entries that hold a child), and then each child node in
+// iteration order.  @a toHalf is passed through to the value writer and to
+// the children.
+template<typename ChildT, Index Log2Dim>
+inline void
+InternalNode<ChildT, Log2Dim>::writeTopology(std::ostream& os, bool toHalf) const
+{
+    mChildMask.save(os);
+    mValueMask.save(os);
+
+    {
+        // Gather the tile values into a contiguous buffer; entries that hold
+        // a child contribute a zero placeholder.
+        boost::shared_array<ValueType> tileValues(new ValueType[NUM_VALUES]);
+        const ValueType zero = zeroVal<ValueType>();
+        for (Index i = 0; i < NUM_VALUES; ++i) {
+            if (mChildMask.isOff(i)) {
+                tileValues[i] = mNodes[i].getValue();
+            } else {
+                tileValues[i] = zero;
+            }
+        }
+        // Compress (optionally) and write out the contents of the buffer.
+        io::writeCompressedValues(os, tileValues.get(), NUM_VALUES, mValueMask, mChildMask, toHalf);
+    }
+    // The buffer has been released; now write out the child nodes in order.
+    for (ChildOnCIter child = this->cbeginChildOn(); child; ++child) {
+        child->writeTopology(os, toHalf);
+    }
+}
+
+
+// Read this subtree from @a is: the child mask, the value mask, the tile
+// values and then the child nodes.  Three stream layouts are handled,
+// selected by io::getFormatVersion(is):
+//  - before OPENVDB_FILE_VERSION_INTERNALNODE_COMPRESSION: table entries are
+//    stored one after another, each either a child subtree or a raw value;
+//  - before OPENVDB_FILE_VERSION_NODE_MASK_COMPRESSION: one value per
+//    non-child entry is stored as a block, followed by the children;
+//  - otherwise: NUM_VALUES values are stored as a block, followed by the
+//    children.
+// Unless OPENVDB_2_ABI_COMPATIBLE is defined, new children are constructed
+// with PartialCreate() and the grid background value taken from the stream's
+// metadata (or zero if none is available).
+template<typename ChildT, Index Log2Dim>
+inline void
+InternalNode<ChildT, Log2Dim>::readTopology(std::istream& is, bool fromHalf)
+{
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    // Background value for newly created children; falls back to zero when
+    // the stream carries no background pointer.
+    const ValueType background = (!io::getGridBackgroundValuePtr(is) ? zeroVal<ValueType>()
+        : *static_cast<const ValueType*>(io::getGridBackgroundValuePtr(is)));
+#endif
+
+    mChildMask.load(is);
+    mValueMask.load(is);
+
+    if (io::getFormatVersion(is) < OPENVDB_FILE_VERSION_INTERNALNODE_COMPRESSION) {
+        // Oldest layout: children and raw tile values are interleaved in table order.
+        for (Index i = 0; i < NUM_VALUES; ++i) {
+            if (this->isChildMaskOn(i)) {
+                ChildNodeType* child =
+#ifdef OPENVDB_2_ABI_COMPATIBLE
+                    new ChildNodeType(offsetToGlobalCoord(i), zeroVal<ValueType>());
+#else
+                    new ChildNodeType(PartialCreate(), offsetToGlobalCoord(i), background);
+#endif
+                mNodes[i].setChild(child);
+                // NOTE(review): fromHalf is not forwarded on this path, unlike the
+                // newer-format path below -- presumably intentional for this layout.
+                child->readTopology(is);
+            } else {
+                ValueType value;
+                is.read(reinterpret_cast<char*>(&value), sizeof(ValueType));
+                mNodes[i].setValue(value);
+            }
+        }
+    } else {
+        const bool oldVersion =
+            (io::getFormatVersion(is) < OPENVDB_FILE_VERSION_NODE_MASK_COMPRESSION);
+        // Older files store values only for non-child entries.
+        const Index numValues = (oldVersion ? mChildMask.countOff() : NUM_VALUES);
+        {
+            // Read in (and uncompress, if necessary) all of this node's values
+            // into a contiguous array.
+            boost::shared_array<ValueType> values(new ValueType[numValues]);
+            io::readCompressedValues(is, values.get(), numValues, mValueMask, fromHalf);
+
+            // Copy values from the array into this node's table.
+            if (oldVersion) {
+                // Values are packed: consume them sequentially.
+                Index n = 0;
+                for (ValueAllIter iter = this->beginValueAll(); iter; ++iter) {
+                    mNodes[iter.pos()].setValue(values[n++]);
+                }
+                assert(n == numValues);
+            } else {
+                // Values are stored per table entry: index by table position.
+                for (ValueAllIter iter = this->beginValueAll(); iter; ++iter) {
+                    mNodes[iter.pos()].setValue(values[iter.pos()]);
+                }
+            }
+        }
+        // Read in all child nodes and insert them into the table at their proper locations.
+        for (ChildOnIter iter = this->beginChildOn(); iter; ++iter) {
+#ifdef OPENVDB_2_ABI_COMPATIBLE
+            ChildNodeType* child = new ChildNodeType(iter.getCoord(), zeroVal<ValueType>());
+#else
+            ChildNodeType* child = new ChildNodeType(PartialCreate(), iter.getCoord(), background);
+#endif
+            mNodes[iter.pos()].setChild(child);
+            child->readTopology(is, fromHalf);
+        }
+    }
+}
+
+
+////////////////////////////////////////
+
+
+// Return the value at the first table entry: the tile value if that entry is
+// a tile, otherwise the first value of the child stored there.
+template<typename ChildT, Index Log2Dim>
+inline const typename ChildT::ValueType&
+InternalNode<ChildT, Log2Dim>::getFirstValue() const
+{
+    if (this->isChildMaskOn(0)) {
+        return mNodes[0].getChild()->getFirstValue();
+    }
+    return mNodes[0].getValue();
+}
+
+
+// Return the value at the last table entry: the tile value if that entry is
+// a tile, otherwise the last value of the child stored there.
+template<typename ChildT, Index Log2Dim>
+inline const typename ChildT::ValueType&
+InternalNode<ChildT, Log2Dim>::getLastValue() const
+{
+    const Index last = NUM_VALUES - 1;
+    if (this->isChildMaskOn(last)) {
+        return mNodes[last].getChild()->getLastValue();
+    }
+    return mNodes[last].getValue();
+}
+
+
+////////////////////////////////////////
+
+
+// Negate every value in this subtree: tile values are replaced by
+// math::negative() of themselves, and the call is forwarded to each child.
+template<typename ChildT, Index Log2Dim>
+inline void
+InternalNode<ChildT, Log2Dim>::negate()
+{
+    for (Index i = 0; i < NUM_VALUES; ++i) {
+        if (this->isChildMaskOff(i)) {
+            mNodes[i].setValue(math::negative(mNodes[i].getValue()));
+        } else {
+            mNodes[i].getChild()->negate();
+        }
+    }
+}
+
+////////////////////////////////////////
+
+// Functor that replaces every active tile of an InternalNode with a newly
+// allocated child node and recurses into the node's children.  All of the
+// work is triggered by the constructor.
+template<typename ChildT, Index Log2Dim>
+struct InternalNode<ChildT, Log2Dim>::VoxelizeActiveTiles
+{
+    // Run operator() over all NUM_VALUES table slots of @a node with
+    // tbb::parallel_for, then update the node's masks.
+    VoxelizeActiveTiles(InternalNode &node) : mNode(&node) {
+        //(*this)(tbb::blocked_range<Index>(0, NUM_VALUES));//single thread for debugging
+        tbb::parallel_for(tbb::blocked_range<Index>(0, NUM_VALUES), *this);
+
+        // The range body only stores child pointers and leaves the masks alone;
+        // the masks are fixed up here, after parallel_for has returned: every
+        // slot that was an active tile is flagged as a child, and the value
+        // mask is cleared.
+        node.mChildMask |= node.mValueMask;
+        node.mValueMask.setOff();
+    }
+    // Range body: process table slots [r.begin(), r.end()).
+    void operator()(const tbb::blocked_range<Index> &r) const
+    {    
+        for (Index i = r.begin(), end=r.end(); i!=end; ++i) {
+            if (mNode->mChildMask.isOn(i)) {// Loop over node's child nodes
+                mNode->mNodes[i].getChild()->voxelizeActiveTiles(true);    
+            } else if (mNode->mValueMask.isOn(i)) {// Loop over node's active tiles
+                // Build a child at the slot's global coordinate, initialized with the
+                // tile value (the trailing `true` is presumably the active state --
+                // confirm against the ChildNodeType constructor), voxelize it, and
+                // store the pointer in the table.
+                const Coord &ijk = mNode->offsetToGlobalCoord(i);
+                ChildNodeType *child = new ChildNodeType(ijk, mNode->mNodes[i].getValue(), true);
+                child->voxelizeActiveTiles(true); 
+                mNode->mNodes[i].setChild(child);
+            }
+        }
+    }
+    InternalNode* mNode;// the node being processed
+};// VoxelizeActiveTiles
+
+// Replace each active tile under this node with a child node built from the
+// tile's coordinate and value, recursively.  When @a threaded is true the
+// work is delegated to the VoxelizeActiveTiles functor; otherwise it is
+// done with two plain loops.
+template<typename ChildT, Index Log2Dim>
+inline void
+InternalNode<ChildT, Log2Dim>::voxelizeActiveTiles(bool threaded)
+{
+    if (threaded) {
+        // Constructing the functor performs the whole operation.
+        VoxelizeActiveTiles worker(*this);
+        return;
+    }
+
+    // First turn every active tile of this node into a child node...
+    for (ValueOnIter tileIter = this->beginValueOn(); tileIter; ++tileIter) {
+        ChildNodeType* replacement =
+            new ChildNodeType(tileIter.getCoord(), tileIter.getValue(), true);
+        this->setChildNode(tileIter.pos(), replacement);
+    }
+    // ...then recurse into all children, including the ones just created.
+    for (ChildOnIter childIter = this->beginChildOn(); childIter; ++childIter) {
+        childIter->voxelizeActiveTiles(false);
+    }
+}
+
+
+////////////////////////////////////////
+
+
+// Merge the contents of @a other into this node according to @a Policy.
+//
+// @a other is modified in the process: children are moved out of it (its
+// child-mask bit is cleared and the pointer is adopted by this node) rather
+// than copied.  Every child taken from @a other has resetBackground() called
+// on it with (@a otherBackground, @a background) before it is adopted, and
+// both background values are forwarded to recursive child merges.
+//
+// Policies handled by the switch below:
+// - MERGE_ACTIVE_STATES (also the default case): other's children replace
+//   this node's inactive tiles, and other's active tiles replace this node's
+//   children or inactive tiles.
+// - MERGE_NODES: only children are transferred or merged; tiles of @a other
+//   are not visited.
+// - MERGE_ACTIVE_STATES_AND_NODES: children are transferred or merged, and
+//   active tiles on either side are merged into the resulting children/tiles.
+template<typename ChildT, Index Log2Dim>
+template<MergePolicy Policy>
+inline void
+InternalNode<ChildT, Log2Dim>::merge(InternalNode& other,
+    const ValueType& background, const ValueType& otherBackground)
+{
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+
+    switch (Policy) {
+
+    case MERGE_ACTIVE_STATES:
+    default:
+    {
+        for (ChildOnIter iter = other.beginChildOn(); iter; ++iter) {
+            const Index n = iter.pos();
+            if (mChildMask.isOn(n)) {
+                // Merge this node's child with the other node's child.
+                mNodes[n].getChild()->template merge<MERGE_ACTIVE_STATES>(*iter,
+                    background, otherBackground);
+            } else if (mValueMask.isOff(n)) {
+                // Replace this node's inactive tile with the other node's child
+                // and replace the other node's child with a tile of undefined value
+                // (which is okay since the other tree is assumed to be cannibalized
+                // in the process of merging).
+                ChildNodeType* child = other.mNodes[n].getChild();
+                other.mChildMask.setOff(n);
+                child->resetBackground(otherBackground, background);
+                this->setChildNode(n, child);
+            }
+        }
+
+        // Copy active tile values.
+        for (ValueOnCIter iter = other.cbeginValueOn(); iter; ++iter) {
+            const Index n = iter.pos();
+            if (mValueMask.isOff(n)) {
+                // Replace this node's child or inactive tile with the other node's active tile.
+                this->makeChildNodeEmpty(n, iter.getValue());
+                mValueMask.setOn(n);
+            }
+        }
+        break;
+    }
+
+    case MERGE_NODES:
+    {
+        for (ChildOnIter iter = other.beginChildOn(); iter; ++iter) {
+            const Index n = iter.pos();
+            if (mChildMask.isOn(n)) {
+                // Merge this node's child with the other node's child.
+                mNodes[n].getChild()->template merge<Policy>(*iter, background, otherBackground);
+            } else {
+                // Replace this node's tile (regardless of its active state) with
+                // the other node's child and replace the other node's child with
+                // a tile of undefined value (which is okay since the other tree
+                // is assumed to be cannibalized in the process of merging).
+                ChildNodeType* child = other.mNodes[n].getChild();
+                other.mChildMask.setOff(n);
+                child->resetBackground(otherBackground, background);
+                this->setChildNode(n, child);
+            }
+        }
+        break;
+    }
+
+    case MERGE_ACTIVE_STATES_AND_NODES:
+    {
+        // Transfer children from the other tree to this tree.
+        for (ChildOnIter iter = other.beginChildOn(); iter; ++iter) {
+            const Index n = iter.pos();
+            if (mChildMask.isOn(n)) {
+                // Merge this node's child with the other node's child.
+                mNodes[n].getChild()->template merge<Policy>(*iter, background, otherBackground);
+            } else {
+                // Replace this node's tile with the other node's child, leaving the other
+                // node with an inactive tile of undefined value (which is okay since
+                // the other tree is assumed to be cannibalized in the process of merging).
+                ChildNodeType* child = other.mNodes[n].getChild();
+                other.mChildMask.setOff(n);
+                child->resetBackground(otherBackground, background);
+                if (mValueMask.isOn(n)) {
+                    // Merge the child with this node's active tile.
+                    child->template merge<Policy>(mNodes[n].getValue(), /*on=*/true);
+                    mValueMask.setOff(n);
+                }
+                // The masks and the table entry are updated by hand here
+                // (rather than through setChildNode() as in the other cases).
+                mChildMask.setOn(n);
+                mNodes[n].setChild(child);
+            }
+        }
+
+        // Merge active tiles into this tree.
+        for (ValueOnCIter iter = other.cbeginValueOn(); iter; ++iter) {
+            const Index n = iter.pos();
+            if (mChildMask.isOn(n)) {
+                // Merge the other node's active tile into this node's child.
+                mNodes[n].getChild()->template merge<Policy>(iter.getValue(), /*on=*/true);
+            } else if (mValueMask.isOff(n)) {
+                // Replace this node's inactive tile with the other node's active tile.
+                mNodes[n].setValue(iter.getValue());
+                mValueMask.setOn(n);
+            }
+        }
+        break;
+    }
+
+    }
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+}
+
+
+// Merge a single tile, given by its value and active state, into this node.
+// This does nothing unless @a Policy is MERGE_ACTIVE_STATES_AND_NODES and
+// the tile is active.  In that case the tile is merged into every child of
+// this node, and every inactive tile of this node takes the tile's value
+// and becomes active.
+template<typename ChildT, Index Log2Dim>
+template<MergePolicy Policy>
+inline void
+InternalNode<ChildT, Log2Dim>::merge(const ValueType& tileValue, bool tileActive)
+{
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+
+    // Other policies do not merge tiles, and for MERGE_ACTIVE_STATES_AND_NODES
+    // inactive tiles in the other tree are ignored.
+    if (Policy != MERGE_ACTIVE_STATES_AND_NODES || !tileActive) return;
+
+    // Visit every slot whose value-mask bit is off: children and inactive tiles.
+    for (ValueOffIter offIter = this->beginValueOff(); offIter; ++offIter) {
+        const Index pos = offIter.pos();
+        if (mChildMask.isOff(pos)) {
+            // Inactive tile: overwrite it with the active tile.
+            offIter.setValue(tileValue);
+            mValueMask.setOn(pos);
+            continue;
+        }
+        // Child: push the active tile down into it.
+        mNodes[pos].getChild()->template merge<Policy>(tileValue, /*on=*/true);
+    }
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+}
+
+////////////////////////////////////////
+
+// Functor that forms the union of the topology of a source node @c s with
+// that of a target node @c t, modifying only the target.  All of the work
+// is triggered by the constructor.
+template<typename ChildT, Index Log2Dim>
+template<typename OtherInternalNode>
+struct InternalNode<ChildT, Log2Dim>::TopologyUnion
+{
+    typedef typename NodeMaskType::Word W;
+    // Word-wise value-mask update: a slot is an active tile if it was active
+    // in either node and is not a child in the (already updated) target.
+    struct A { inline void operator()(W &tV, const W& sV, const W& tC) const
+        { tV = (tV | sV) & ~tC; }
+    };
+    // Run operator() over all NUM_VALUES slots with tbb::parallel_for, then
+    // update the target's masks serially.
+    TopologyUnion(const OtherInternalNode* source, InternalNode* target) : s(source), t(target) {
+        //(*this)(tbb::blocked_range<Index>(0, NUM_VALUES));//single thread for debugging
+        tbb::parallel_for(tbb::blocked_range<Index>(0, NUM_VALUES), *this);
+
+        // Bit processing is done in a single thread!
+        t->mChildMask |= s->mChildMask;//serial but very fast bitwise post-process
+        A op;
+        t->mValueMask.foreach(s->mValueMask, t->mChildMask, op);
+        assert((t->mValueMask & t->mChildMask).isOff());//no overlapping active tiles and child nodes
+    }
+    // Range body: process table slots [r.begin(), r.end()).  Only the
+    // target's table entries and children are touched here; the target's
+    // masks are read but not written (they are updated in the constructor).
+    void operator()(const tbb::blocked_range<Index> &r) const {
+        for (Index i = r.begin(), end=r.end(); i!=end; ++i) {
+            if (s->mChildMask.isOn(i)) {// Loop over other node's child nodes
+                const typename OtherInternalNode::ChildNodeType& other = *(s->mNodes[i].getChild());
+                if (t->mChildMask.isOn(i)) {//this has a child node
+                    t->mNodes[i].getChild()->topologyUnion(other);
+                } else {// this is a tile so replace it with a child branch with identical topology
+                    ChildT* child = new ChildT(other, t->mNodes[i].getValue(), TopologyCopy());
+                    if (t->mValueMask.isOn(i)) child->setValuesOn();//activate all values
+                    t->mNodes[i].setChild(child);
+                }
+            } else if (s->mValueMask.isOn(i) && t->mChildMask.isOn(i)) {
+                // Source has an active tile over a target child: activate the whole child.
+                t->mNodes[i].getChild()->setValuesOn();
+            }
+        }
+    }
+    const OtherInternalNode* s;// source node (read only)
+    InternalNode* t;// target node (modified)
+};// TopologyUnion
+
+// Union this node's topology with that of @a other, which may store a
+// different child type but must have the same Log2Dim.  The work is done by
+// the constructor of the TopologyUnion functor.
+template<typename ChildT, Index Log2Dim>
+template<typename OtherChildT>
+inline void
+InternalNode<ChildT, Log2Dim>::topologyUnion(const InternalNode<OtherChildT, Log2Dim>& other)
+{
+    typedef InternalNode<OtherChildT, Log2Dim> OtherNodeT;
+    TopologyUnion<OtherNodeT> unionOp(&other, this);
+}
+
+// Functor that intersects the topology of a target node @c t with that of a
+// source node @c s, modifying only the target.  Children of the target that
+// are removed are replaced by tiles holding the background value @c b.
+// All of the work is triggered by the constructor.
+template<typename ChildT, Index Log2Dim>
+template<typename OtherInternalNode>
+struct InternalNode<ChildT, Log2Dim>::TopologyIntersection
+{
+    typedef typename NodeMaskType::Word W;
+    // Word-wise child-mask update: a target child survives where the source
+    // has a child or an active tile, and a target active tile becomes a child
+    // where the source has a child.
+    struct A { inline void operator()(W &tC, const W& sC, const W& sV, const W& tV) const
+        { tC = (tC & (sC | sV)) | (tV & sC); }
+    };
+    // Run operator() over all NUM_VALUES slots with tbb::parallel_for, then
+    // update the target's masks serially.
+    TopologyIntersection(const OtherInternalNode* source, InternalNode* target,
+                         const ValueType& background) : s(source), t(target), b(background) {
+        //(*this)(tbb::blocked_range<Index>(0, NUM_VALUES));//single thread for debugging
+        tbb::parallel_for(tbb::blocked_range<Index>(0, NUM_VALUES), *this);
+
+        // Bit processing is done in a single thread!
+        A op;
+        t->mChildMask.foreach(s->mChildMask, s->mValueMask, t->mValueMask, op);
+        
+        t->mValueMask &= s->mValueMask;
+        assert((t->mValueMask & t->mChildMask).isOff());//no overlapping active tiles and child nodes
+    }
+    // Range body: process table slots [r.begin(), r.end()).  Only table
+    // entries and children of the target are modified here; its masks are
+    // read but not written (they are updated in the constructor).
+    void operator()(const tbb::blocked_range<Index> &r) const {
+        for (Index i = r.begin(), end=r.end(); i!=end; ++i) {
+            if (t->mChildMask.isOn(i)) {// Loop over this node's child nodes
+                ChildT* child = t->mNodes[i].getChild();
+                if (s->mChildMask.isOn(i)) {//other also has a child node
+                    child->topologyIntersection(*(s->mNodes[i].getChild()), b);
+                } else if (s->mValueMask.isOff(i)) {//other is an inactive tile
+                    delete child;//convert child to an inactive tile
+                    t->mNodes[i].setValue(b);
+                }
+            } else if (t->mValueMask.isOn(i) && s->mChildMask.isOn(i)) {//active tile -> a branch
+                // New child constructed from the source child with TopologyCopy,
+                // filled with this tile's value.
+                t->mNodes[i].setChild(new ChildT(*(s->mNodes[i].getChild()),
+                                                 t->mNodes[i].getValue(), TopologyCopy()));
+            }
+        }
+    }
+    const OtherInternalNode* s;// source node (read only)
+    InternalNode* t;// target node (modified)
+    const ValueType& b;// background value; held by reference, so it must outlive this functor
+};// TopologyIntersection
+
+// Intersect this node's topology with that of @a other, which may store a
+// different child type but must have the same Log2Dim.  @a background is the
+// value given to tiles that replace deleted children.  The work is done by
+// the constructor of the TopologyIntersection functor.
+template<typename ChildT, Index Log2Dim>
+template<typename OtherChildT>
+inline void
+InternalNode<ChildT, Log2Dim>::topologyIntersection(const InternalNode<OtherChildT, Log2Dim>& other,
+                                                    const ValueType& background)
+{
+    typedef InternalNode<OtherChildT, Log2Dim> OtherNodeT;
+    TopologyIntersection<OtherNodeT> intersectOp(&other, this, background);
+}
+
+// Functor that subtracts the topology of a source node @c s from that of a
+// target node @c t, modifying only the target.  Children of the target that
+// are removed are replaced by tiles holding the background value @c b.
+// All of the work is triggered by the constructor.
+template<typename ChildT, Index Log2Dim>
+template<typename OtherInternalNode>
+struct InternalNode<ChildT, Log2Dim>::TopologyDifference
+{
+    typedef typename NodeMaskType::Word W;
+    // Word-wise child-mask update: a target child survives where the source
+    // has a child or no active tile, and a target active tile becomes a child
+    // where the source has a child.
+    struct A {inline void operator()(W &tC, const W& sC, const W& sV, const W& tV) const
+        { tC = (tC & (sC | ~sV)) | (tV & sC); }
+    };
+    // Word-wise value-mask update.
+    // NOTE(review): at the call site in the constructor the argument bound to
+    // `sC` is the target's *updated* child mask and the one bound to `tC` is
+    // the target's old child mask (assuming foreach() forwards the masks in
+    // argument order), so the parameter names do not match the call -- confirm.
+    struct B {inline void operator()(W &tV, const W& sC, const W& sV, const W& tC) const
+        { tV &= ~((tC & sV) | (sC | sV)); }
+    };
+    // Run operator() over all NUM_VALUES slots with tbb::parallel_for, then
+    // update the target's masks serially.
+    TopologyDifference(const OtherInternalNode* source, InternalNode* target,
+                       const ValueType& background) : s(source), t(target), b(background) {
+        //(*this)(tbb::blocked_range<Index>(0, NUM_VALUES));//single thread for debugging
+        tbb::parallel_for(tbb::blocked_range<Index>(0, NUM_VALUES), *this);
+
+        // Bit processing is done in a single thread!
+        const NodeMaskType oldChildMask(t->mChildMask);//important to avoid cross pollution
+        A op1;
+        t->mChildMask.foreach(s->mChildMask, s->mValueMask, t->mValueMask, op1);
+        
+        B op2;
+        t->mValueMask.foreach(t->mChildMask, s->mValueMask, oldChildMask, op2);
+        assert((t->mValueMask & t->mChildMask).isOff());//no overlapping active tiles and child nodes
+    }
+    // Range body: process table slots [r.begin(), r.end()).  Only table
+    // entries and children of the target are modified here; its masks are
+    // read but not written (they are updated in the constructor).
+    void operator()(const tbb::blocked_range<Index> &r) const {
+        for (Index i = r.begin(), end=r.end(); i!=end; ++i) {
+            if (t->mChildMask.isOn(i)) {// Loop over this node's child nodes
+                ChildT* child = t->mNodes[i].getChild();
+                if (s->mChildMask.isOn(i)) {
+                    child->topologyDifference(*(s->mNodes[i].getChild()), b);
+                } else if (s->mValueMask.isOn(i)) {
+                    delete child;//convert child to an inactive tile
+                    t->mNodes[i].setValue(b);
+                }
+            } else if (t->mValueMask.isOn(i)) {//this is an active tile
+                if (s->mChildMask.isOn(i)) {
+                    // Build a child at the source child's origin from this tile's value
+                    // (the trailing `true` is presumably the active state -- confirm),
+                    // then subtract the source child's topology from it.
+                    const typename OtherInternalNode::ChildNodeType& other = *(s->mNodes[i].getChild());
+                    ChildT* child = new ChildT(other.origin(), t->mNodes[i].getValue(), true);
+                    child->topologyDifference(other, b);
+                    t->mNodes[i].setChild(child);//replace the active tile with a child branch
+                }
+            }
+        }
+    }
+    const OtherInternalNode* s;// source node (read only)
+    InternalNode* t;// target node (modified)
+    const ValueType& b;// background value; held by reference, so it must outlive this functor
+};// TopologyDifference
+
+// Subtract the topology of @a other from this node's topology.  @a other may
+// store a different child type but must have the same Log2Dim.  @a background
+// is the value given to tiles that replace deleted children.  The work is
+// done by the constructor of the TopologyDifference functor.
+template<typename ChildT, Index Log2Dim>
+template<typename OtherChildT>
+inline void
+InternalNode<ChildT, Log2Dim>::topologyDifference(const InternalNode<OtherChildT, Log2Dim>& other,
+                                                  const ValueType& background)
+{
+    typedef InternalNode<OtherChildT, Log2Dim> OtherNodeT;
+    TopologyDifference<OtherNodeT> differenceOp(&other, this, background);
+}
+
+////////////////////////////////////////
+
+
+// Combine this node with @a other slot by slot using the functor @a op and
+// store the result in this node.
+//
+// @a op is called with a CombineArgs object whose A value/state come from
+// this node and whose B value/state come from @a other; the result value and
+// active state are read back from that object.  @a other is modified: where
+// only @a other has a child, that child is combined in place and then moved
+// into this node, leaving a zero-valued tile behind in @a other.
+template<typename ChildT, Index Log2Dim>
+template<typename CombineOp>
+inline void
+InternalNode<ChildT, Log2Dim>::combine(InternalNode& other, CombineOp& op)
+{
+    // Value left in other's table entry after one of its children is stolen.
+    const ValueType zero = zeroVal<ValueType>();
+
+    CombineArgs<ValueType> args;
+
+    for (Index i = 0; i < NUM_VALUES; ++i) {
+        if (this->isChildMaskOff(i) && other.isChildMaskOff(i)) {
+            // Both this node and the other node have constant values (tiles).
+            // Combine the two values and store the result as this node's new tile value.
+            op(args.setARef(mNodes[i].getValue())
+                .setAIsActive(isValueMaskOn(i))
+                .setBRef(other.mNodes[i].getValue())
+               .setBIsActive(other.isValueMaskOn(i)));
+            mNodes[i].setValue(args.result());
+            mValueMask.set(i, args.resultIsActive());
+        } else if (this->isChildMaskOn(i) && other.isChildMaskOff(i)) {
+            // Combine this node's child with the other node's constant value.
+            ChildNodeType* child = mNodes[i].getChild();
+            assert(child);
+            if (child) {
+                child->combine(other.mNodes[i].getValue(), other.isValueMaskOn(i), op);
+            }
+        } else if (this->isChildMaskOff(i) && other.isChildMaskOn(i)) {
+            // Combine this node's constant value with the other node's child.
+            ChildNodeType* child = other.mNodes[i].getChild();
+            assert(child);
+            if (child) {
+                // Combine this node's constant value with the other node's child,
+                // but use a new functor in which the A and B values are swapped,
+                // since the constant value is the A value, not the B value.
+                SwappedCombineOp<ValueType, CombineOp> swappedOp(op);
+                child->combine(mNodes[i].getValue(), isValueMaskOn(i), swappedOp);
+
+                // Steal the other node's child.
+                other.mChildMask.setOff(i);
+                other.mNodes[i].setValue(zero);
+                this->setChildNode(i, child);
+            }
+
+        } else /*if (isChildMaskOn(i) && other.isChildMaskOn(i))*/ {
+            // Combine this node's child with the other node's child.
+            ChildNodeType
+                *child = mNodes[i].getChild(),
+                *otherChild = other.mNodes[i].getChild();
+            assert(child);
+            assert(otherChild);
+            if (child && otherChild) {
+                child->combine(*otherChild, op);
+            }
+        }
+    }
+}
+
+
+// Combine every slot of this node with a single constant value and its
+// active state, using the functor @a op.  For tiles, @a op is invoked with
+// the tile as the A argument and the constant as the B argument, and the
+// result replaces the tile value and its active state.  Children are
+// combined recursively.
+template<typename ChildT, Index Log2Dim>
+template<typename CombineOp>
+inline void
+InternalNode<ChildT, Log2Dim>::combine(const ValueType& value, bool valueIsActive, CombineOp& op)
+{
+    CombineArgs<ValueType> args;
+
+    for (Index pos = 0; pos < NUM_VALUES; ++pos) {
+        if (!this->isChildMaskOff(pos)) {
+            // Child slot: recurse with the same constant value.
+            ChildNodeType* child = mNodes[pos].getChild();
+            assert(child);
+            if (child) child->combine(value, valueIsActive, op);
+            continue;
+        }
+
+        // Tile slot: combine this node's constant value with the given constant value.
+        op(args.setARef(mNodes[pos].getValue())
+           .setAIsActive(isValueMaskOn(pos))
+           .setBRef(value)
+           .setBIsActive(valueIsActive));
+        mNodes[pos].setValue(args.result());
+        mValueMask.set(pos, args.resultIsActive());
+    }
+}
+
+
+////////////////////////////////////////
+
+
+// Combine two nodes, @a other0 (A operand) and @a other1 (B operand), slot by
+// slot using the functor @a op, and write the result into this node.  Neither
+// input node is modified (both are taken by const reference).
+//
+// Where both inputs hold tiles, the result is a tile in this node; otherwise
+// this node gets (or keeps) a child at that slot, and the combination is
+// delegated to the child's combine2() overloads.
+template<typename ChildT, Index Log2Dim>
+template<typename CombineOp, typename OtherNodeType>
+inline void
+InternalNode<ChildT, Log2Dim>::combine2(const InternalNode& other0, const OtherNodeType& other1,
+    CombineOp& op)
+{
+    CombineArgs<ValueType, typename OtherNodeType::ValueType> args;
+
+    for (Index i = 0; i < NUM_VALUES; ++i) {
+        if (other0.isChildMaskOff(i) && other1.isChildMaskOff(i)) {
+            // Both inputs are tiles: combine their values and active states.
+            op(args.setARef(other0.mNodes[i].getValue())
+                .setAIsActive(other0.isValueMaskOn(i))
+                .setBRef(other1.mNodes[i].getValue())
+                .setBIsActive(other1.isValueMaskOn(i)));
+            // Replace child i with a constant value.
+            this->makeChildNodeEmpty(i, args.result());
+            mValueMask.set(i, args.resultIsActive());
+        } else {
+            // At least one input has a child at slot i, so the result needs one too.
+            if (this->isChildMaskOff(i)) {
+                // Add a new child with the same coordinates, etc. as the other node's child.
+                const Coord& childOrigin = other0.isChildMaskOn(i)
+                    ? other0.mNodes[i].getChild()->origin()
+                    : other1.mNodes[i].getChild()->origin();
+                this->setChildNode(i, new ChildNodeType(childOrigin, mNodes[i].getValue()));
+            }
+
+            if (other0.isChildMaskOff(i)) {
+                // Combine node1's child with node0's constant value
+                // and write the result into child i.
+                mNodes[i].getChild()->combine2(other0.mNodes[i].getValue(),
+                    *other1.mNodes[i].getChild(), other0.isValueMaskOn(i), op);
+            } else if (other1.isChildMaskOff(i)) {
+                // Combine node0's child with node1's constant value
+                // and write the result into child i.
+                mNodes[i].getChild()->combine2(*other0.mNodes[i].getChild(),
+                    other1.mNodes[i].getValue(), other1.isValueMaskOn(i), op);
+            } else {
+                // Combine node0's child with node1's child
+                // and write the result into child i.
+                mNodes[i].getChild()->combine2(*other0.mNodes[i].getChild(),
+                    *other1.mNodes[i].getChild(), op);
+            }
+        }
+    }
+}
+
+
+// Combine a constant value (A operand, with active state @a valueIsActive)
+// with the node @a other (B operand) slot by slot using @a op, and write the
+// result into this node.  Slots where @a other holds a tile become tiles in
+// this node; slots where @a other holds a child become (or stay) children
+// and are combined recursively.
+template<typename ChildT, Index Log2Dim>
+template<typename CombineOp, typename OtherNodeType>
+inline void
+InternalNode<ChildT, Log2Dim>::combine2(const ValueType& value, const OtherNodeType& other,
+    bool valueIsActive, CombineOp& op)
+{
+    CombineArgs<ValueType, typename OtherNodeType::ValueType> args;
+
+    for (Index pos = 0; pos < NUM_VALUES; ++pos) {
+        if (!other.isChildMaskOff(pos)) {
+            typename OtherNodeType::ChildNodeType* otherChild = other.mNodes[pos].getChild();
+            assert(otherChild);
+            if (this->isChildMaskOff(pos)) {
+                // No child here yet: add one constructed from the other node's child.
+                this->setChildNode(pos, new ChildNodeType(*otherChild));
+            }
+            // Combine the constant value with the other node's child
+            // and write the result into this node's child.
+            mNodes[pos].getChild()->combine2(value, *otherChild, valueIsActive, op);
+            continue;
+        }
+
+        // The other node holds a tile: combine the two constant values.
+        op(args.setARef(value)
+            .setAIsActive(valueIsActive)
+            .setBRef(other.mNodes[pos].getValue())
+            .setBIsActive(other.isValueMaskOn(pos)));
+        // Replace whatever this node holds at this slot with a constant value.
+        this->makeChildNodeEmpty(pos, args.result());
+        mValueMask.set(pos, args.resultIsActive());
+    }
+}
+
+
+// Combine the node @a other (A operand) with a constant value (B operand,
+// with active state @a valueIsActive) slot by slot using @a op, and write
+// the result into this node.  Slots where @a other holds a tile become tiles
+// in this node; slots where @a other holds a child become (or stay) children
+// and are combined recursively.
+template<typename ChildT, Index Log2Dim>
+template<typename CombineOp, typename OtherValueType>
+inline void
+InternalNode<ChildT, Log2Dim>::combine2(const InternalNode& other, const OtherValueType& value,
+    bool valueIsActive, CombineOp& op)
+{
+    CombineArgs<ValueType, OtherValueType> args;
+
+    for (Index pos = 0; pos < NUM_VALUES; ++pos) {
+        if (!other.isChildMaskOff(pos)) {
+            ChildNodeType* otherChild = other.mNodes[pos].getChild();
+            assert(otherChild);
+            if (this->isChildMaskOff(pos)) {
+                // No child here yet: add one at the other child's origin,
+                // initialized with this node's current tile value.
+                ChildNodeType* fresh =
+                    new ChildNodeType(otherChild->origin(), mNodes[pos].getValue());
+                this->setChildNode(pos, fresh);
+            }
+            // Combine the other node's child with the constant value
+            // and write the result into this node's child.
+            mNodes[pos].getChild()->combine2(*otherChild, value, valueIsActive, op);
+            continue;
+        }
+
+        // The other node holds a tile: combine the two constant values.
+        op(args.setARef(other.mNodes[pos].getValue())
+            .setAIsActive(other.isValueMaskOn(pos))
+            .setBRef(value)
+            .setBIsActive(valueIsActive));
+        // Replace whatever this node holds at this slot with a constant value.
+        this->makeChildNodeEmpty(pos, args.result());
+        mValueMask.set(pos, args.resultIsActive());
+    }
+}
+
+
+////////////////////////////////////////
+
+
+// Report bounding boxes of the active regions under this node to @a op.
+// Each active tile is reported as a cube of side ChildNodeType::DIM at the
+// tile's coordinate.  For children, @a op decides through descent<LEVEL>()
+// whether they are visited recursively or reported through their node
+// bounding box.
+template<typename ChildT, Index Log2Dim>
+template<typename BBoxOp>
+inline void
+InternalNode<ChildT, Log2Dim>::visitActiveBBox(BBoxOp& op) const
+{
+    // Active tiles first.
+    for (ValueOnCIter tile = this->cbeginValueOn(); tile; ++tile) {
+#ifdef _MSC_VER
+        op.operator()<LEVEL>(CoordBBox::createCube(tile.getCoord(), ChildNodeType::DIM));
+#else
+        op.template operator()<LEVEL>(CoordBBox::createCube(tile.getCoord(), ChildNodeType::DIM));
+#endif
+    }
+
+    // The descent decision is queried once, after the tiles have been reported.
+    const bool descend = op.template descent<LEVEL>();
+    for (ChildOnCIter child = this->cbeginChildOn(); child; ++child) {
+        if (descend) {
+            child->visitActiveBBox(op);
+            continue;
+        }
+#ifdef _MSC_VER
+        op.operator()<LEVEL>(child->getNodeBoundingBox());
+#else
+        op.template operator()<LEVEL>(child->getNodeBoundingBox());
+#endif
+    }
+}
+
+
+// Visit the table entries of this node with @a op (non-const overload).
+// Forwards to doVisit() with the mutable node type and ChildAllIter.
+template<typename ChildT, Index Log2Dim>
+template<typename VisitorOp>
+inline void
+InternalNode<ChildT, Log2Dim>::visit(VisitorOp& op)
+{
+    doVisit<
+        InternalNode,
+        VisitorOp,
+        ChildAllIter>(*this, op);
+}
+
+
+// Visit the table entries of this node with @a op (const overload).
+// Forwards to doVisit() with the const node type and ChildAllCIter.
+template<typename ChildT, Index Log2Dim>
+template<typename VisitorOp>
+inline void
+InternalNode<ChildT, Log2Dim>::visit(VisitorOp& op) const
+{
+    doVisit<
+        const InternalNode,
+        VisitorOp,
+        ChildAllCIter>(*this, op);
+}
+
+
+// Shared implementation of the const and non-const visit() overloads.
+// @a op is called on an iterator for every table entry of @a self.  When it
+// returns a true value that entry is not descended into; otherwise, if the
+// entry holds a child, the child is visited recursively.
+template<typename ChildT, Index Log2Dim>
+template<typename NodeT, typename VisitorOp, typename ChildAllIterT>
+inline void
+InternalNode<ChildT, Log2Dim>::doVisit(NodeT& self, VisitorOp& op)
+{
+    // Receives the tile value from probeChild(); it is not used otherwise.
+    typename NodeT::ValueType tileValue;
+    for (ChildAllIterT entry = self.beginChildAll(); entry; ++entry) {
+        if (op(entry)) continue;
+        typename ChildAllIterT::ChildNodeType* child = entry.probeChild(tileValue);
+        if (child != NULL) {
+            child->visit(op);
+        }
+    }
+}
+
+
+////////////////////////////////////////
+
+
+// Visit this node and @a other in lock step with @a op (non-const overload).
+// Forwards to doVisit2Node() with mutable iterator types for both nodes.
+template<typename ChildT, Index Log2Dim>
+template<typename OtherNodeType, typename VisitorOp>
+inline void
+InternalNode<ChildT, Log2Dim>::visit2Node(OtherNodeType& other, VisitorOp& op)
+{
+    typedef typename OtherNodeType::ChildAllIter OtherIterT;
+    doVisit2Node<InternalNode, OtherNodeType, VisitorOp, ChildAllIter, OtherIterT>(
+        *this, other, op);
+}
+
+
+// Visit this node and @a other in lock step with @a op (const overload).
+// Forwards to doVisit2Node() with const iterator types for both nodes.
+template<typename ChildT, Index Log2Dim>
+template<typename OtherNodeType, typename VisitorOp>
+inline void
+InternalNode<ChildT, Log2Dim>::visit2Node(OtherNodeType& other, VisitorOp& op) const
+{
+    typedef typename OtherNodeType::ChildAllCIter OtherIterT;
+    doVisit2Node<const InternalNode, OtherNodeType, VisitorOp, ChildAllCIter, OtherIterT>(
+        *this, other, op);
+}
+
+
+// Shared implementation of the const and non-const visit2Node() overloads.
+//
+// Iterates over the table entries of @a self and @a other in lock step and
+// calls @a op on each pair of iterators.  The value returned by @a op is
+// interpreted as a bit mask: bit 0 (1U) suppresses descent into this node's
+// child and bit 1 (2U) suppresses descent into the other node's child.
+// Depending on which of the two entries still has a child to descend into,
+// the traversal continues with visit2Node() (both) or visit2() (one side).
+template<typename ChildT, Index Log2Dim>
+template<
+    typename NodeT,
+    typename OtherNodeT,
+    typename VisitorOp,
+    typename ChildAllIterT,
+    typename OtherChildAllIterT>
+inline void
+InternalNode<ChildT, Log2Dim>::doVisit2Node(NodeT& self, OtherNodeT& other, VisitorOp& op)
+{
+    // Allow the two nodes to have different ValueTypes, but not different dimensions.
+    BOOST_STATIC_ASSERT(OtherNodeT::NUM_VALUES == NodeT::NUM_VALUES);
+    BOOST_STATIC_ASSERT(OtherNodeT::LEVEL == NodeT::LEVEL);
+
+    // Receive the tile values from probeChild(); they are not used otherwise.
+    typename NodeT::ValueType val;
+    typename OtherNodeT::ValueType otherVal;
+
+    ChildAllIterT iter = self.beginChildAll();
+    OtherChildAllIterT otherIter = other.beginChildAll();
+
+    for ( ; iter && otherIter; ++iter, ++otherIter)
+    {
+        const size_t skipBranch = static_cast<size_t>(op(iter, otherIter));
+
+        // A NULL pointer means "do not descend on this side": either the
+        // visitor asked to skip it or the entry is a tile.
+        typename ChildAllIterT::ChildNodeType* child =
+            (skipBranch & 1U) ? NULL : iter.probeChild(val);
+        typename OtherChildAllIterT::ChildNodeType* otherChild =
+            (skipBranch & 2U) ? NULL : otherIter.probeChild(otherVal);
+
+        if (child != NULL && otherChild != NULL) {
+            // Both sides have children: keep traversing them in lock step.
+            child->visit2Node(*otherChild, op);
+        } else if (child != NULL) {
+            // Only this side has a child: visit it against the other side's current entry.
+            child->visit2(otherIter, op);
+        } else if (otherChild != NULL) {
+            // Only the other side has a child: visit it against this side's current entry.
+            otherChild->visit2(iter, op, /*otherIsLHS=*/true);
+        }
+    }
+}
+
+
+////////////////////////////////////////
+
+
+// Visit the table entries of this node against the single entry referenced
+// by @a otherIter (non-const overload).  @a otherIsLHS selects the argument
+// order in which @a op receives the two iterators.  Forwards to doVisit2().
+template<typename ChildT, Index Log2Dim>
+template<typename OtherChildAllIterType, typename VisitorOp>
+inline void
+InternalNode<ChildT, Log2Dim>::visit2(OtherChildAllIterType& otherIter,
+    VisitorOp& op, bool otherIsLHS)
+{
+    doVisit2<
+        InternalNode,
+        VisitorOp,
+        ChildAllIter,
+        OtherChildAllIterType>(*this, otherIter, op, otherIsLHS);
+}
+
+
+// Visit the table entries of this node against the single entry referenced
+// by @a otherIter (const overload).  @a otherIsLHS selects the argument
+// order in which @a op receives the two iterators.  Forwards to doVisit2().
+template<typename ChildT, Index Log2Dim>
+template<typename OtherChildAllIterType, typename VisitorOp>
+inline void
+InternalNode<ChildT, Log2Dim>::visit2(OtherChildAllIterType& otherIter,
+    VisitorOp& op, bool otherIsLHS) const
+{
+    doVisit2<
+        const InternalNode,
+        VisitorOp,
+        ChildAllCIter,
+        OtherChildAllIterType>(*this, otherIter, op, otherIsLHS);
+}
+
+
+// Shared implementation of the const and non-const visit2() overloads.
+// For every table entry of @a self, @a op is called with that entry's
+// iterator and @a otherIter; @a otherIter comes first when @a otherIsLHS is
+// true.  The value returned by @a op is treated as a bit mask, and the bit
+// that belongs to this node's side (2U when the other iterator is the
+// left-hand side, 1U otherwise) suppresses descent into this node's child.
+template<typename ChildT, Index Log2Dim>
+template<typename NodeT, typename VisitorOp, typename ChildAllIterT, typename OtherChildAllIterT>
+inline void
+InternalNode<ChildT, Log2Dim>::doVisit2(NodeT& self, OtherChildAllIterT& otherIter,
+    VisitorOp& op, bool otherIsLHS)
+{
+    // Nothing to pair the entries with.
+    if (!otherIter) return;
+
+    const size_t ownSkipBit = (otherIsLHS ? 2U : 1U);
+
+    // Receives the tile value from probeChild(); it is not used otherwise.
+    typename NodeT::ValueType tileValue;
+    for (ChildAllIterT entry = self.beginChildAll(); entry; ++entry) {
+        size_t skipFlags;
+        if (otherIsLHS) {
+            skipFlags = static_cast<size_t>(op(otherIter, entry));
+        } else {
+            skipFlags = static_cast<size_t>(op(entry, otherIter));
+        }
+        if (skipFlags & ownSkipBit) continue;
+
+        typename ChildAllIterT::ChildNodeType* child = entry.probeChild(tileValue);
+        if (child != NULL) child->visit2(otherIter, op, otherIsLHS);
+    }
+}
+
+
+////////////////////////////////////////
+
+
+// Ask every child of this node, in child-iterator order, to write its
+// buffers to @a os.  @a toHalf is passed through to the children unchanged.
+template<typename ChildT, Index Log2Dim>
+inline void
+InternalNode<ChildT, Log2Dim>::writeBuffers(std::ostream& os, bool toHalf) const
+{
+    ChildOnCIter child = this->cbeginChildOn();
+    while (child) {
+        child->writeBuffers(os, toHalf);
+        ++child;
+    }
+}
+
+
+// Ask every child of this node, in child-iterator order, to read its
+// buffers from @a is.  @a fromHalf is passed through to the children unchanged.
+template<typename ChildT, Index Log2Dim>
+inline void
+InternalNode<ChildT, Log2Dim>::readBuffers(std::istream& is, bool fromHalf)
+{
+    ChildOnIter child = this->beginChildOn();
+    while (child) {
+        child->readBuffers(is, fromHalf);
+        ++child;
+    }
+}
+
+
+// Read the buffers of all children from @a is, restricted to @a clipBBox,
+// and then clip this node itself to @a clipBBox.  The background value used
+// for clipping is the one associated with the stream, if any, and
+// zeroVal<ValueType>() otherwise.
+template<typename ChildT, Index Log2Dim>
+inline void
+InternalNode<ChildT, Log2Dim>::readBuffers(std::istream& is,
+    const CoordBBox& clipBBox, bool fromHalf)
+{
+    // Every child has to be streamed in, even one that lies outside the
+    // clipping region: buffers are serialized in depth-first order and must
+    // be unserialized in the same order.
+    ChildOnIter child = this->beginChildOn();
+    while (child) {
+        child->readBuffers(is, clipBBox, fromHalf);
+        ++child;
+    }
+
+    // Determine this tree's background value.
+    ValueType background = zeroVal<ValueType>();
+    const void* const storedBackground = io::getGridBackgroundValuePtr(is);
+    if (storedBackground) {
+        background = *static_cast<const ValueType*>(storedBackground);
+    }
+
+    this->clip(clipBBox, background);
+}
+
+
+////////////////////////////////////////
+
+
+// Append this node type's Log2Dim to @a dims, then let the child node type
+// append its own entries after it.
+template<typename ChildT, Index Log2Dim>
+void
+InternalNode<ChildT, Log2Dim>::getNodeLog2Dims(std::vector<Index>& dims)
+{
+    const Index ownLog2Dim = Log2Dim;
+    dims.push_back(ownLog2Dim);
+    ChildNodeType::getNodeLog2Dims(dims);
+}
+
+
+// Decode the linear table offset @a n into per-axis table indices and store
+// them in @a xyz.  The offset is laid out as three fields of Log2Dim bits
+// each: x in the highest field, then y, then z in the lowest.
+template<typename ChildT, Index Log2Dim>
+inline void
+InternalNode<ChildT, Log2Dim>::offsetToLocalCoord(Index n, Coord &xyz)
+{
+    assert(n<(1<<3*Log2Dim));
+    // Everything below the x field, i.e. the y and z fields together.
+    const Index yzBits = n & ((1<<2*Log2Dim)-1);
+    xyz.setX(n >> 2*Log2Dim);
+    xyz.setY(yzBits >> Log2Dim);
+    xyz.setZ(yzBits & ((1<<Log2Dim)-1));
+}
+
+
+// Map the coordinate @a xyz to a linear offset into this node's table.
+// Each component is masked with DIM-1 and shifted right by
+// ChildNodeType::TOTAL, then the three results are packed with x in the
+// highest field, y in the middle and z in the lowest (Log2Dim bits apart).
+template<typename ChildT, Index Log2Dim>
+inline Index
+InternalNode<ChildT, Log2Dim>::coordToOffset(const Coord& xyz)
+{
+    const Index xIndex = (xyz[0] & (DIM-1u)) >> ChildNodeType::TOTAL;
+    const Index yIndex = (xyz[1] & (DIM-1u)) >> ChildNodeType::TOTAL;
+    const Index zIndex = (xyz[2] & (DIM-1u)) >> ChildNodeType::TOTAL;
+    return (xIndex << 2*Log2Dim) + (yIndex << Log2Dim) + zIndex;
+}
+
+
+// Convert the linear table offset @a n into a coordinate: the per-axis
+// table indices are scaled by shifting left ChildT::TOTAL bits and then
+// offset by this node's origin.
+template<typename ChildT, Index Log2Dim>
+inline Coord
+InternalNode<ChildT, Log2Dim>::offsetToGlobalCoord(Index n) const
+{
+    Coord ijk;
+    this->offsetToLocalCoord(n, ijk);   // per-axis table indices
+    ijk <<= ChildT::TOTAL;              // scale by the child's extent
+    return ijk + this->origin();
+}
+
+////////////////////////////////////////
+
+// Append to @a array pointers to all nodes of the array's element type that
+// are found under this node.  The element type must be a pointer.  If it
+// points to this node's child type (const or not), the children are appended
+// directly; otherwise the request is passed down to each child.
+template<typename ChildT, Index Log2Dim>
+template<typename ArrayT>
+inline void
+InternalNode<ChildT, Log2Dim>::getNodes(ArrayT& array)
+{
+    typedef typename ArrayT::value_type NodePtrT;
+    BOOST_STATIC_ASSERT(boost::is_pointer<NodePtrT>::value);
+    typedef typename boost::remove_pointer<NodePtrT>::type PointeeT;
+    // The child type carrying the same constness as the array's pointee.
+    typedef typename boost::mpl::if_<boost::is_const<PointeeT>,
+                                     const ChildT, ChildT>::type ArrayChildT;
+    for (ChildOnIter childIter = this->beginChildOn(); childIter; ++childIter) {
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if (!boost::is_same<NodePtrT, ArrayChildT*>::value) {
+            childIter->getNodes(array);//descent
+        } else {
+            array.push_back(reinterpret_cast<NodePtrT>(mNodes[childIter.pos()].getChild()));
+        }
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+}
+
+template<typename ChildT, Index Log2Dim>
+template<typename ArrayT>
+inline void
+InternalNode<ChildT, Log2Dim>::getNodes(ArrayT& array) const
+{
+    typedef typename ArrayT::value_type T; // element type of the output array
+    BOOST_STATIC_ASSERT(boost::is_pointer<T>::value);
+    BOOST_STATIC_ASSERT(boost::is_const<typename boost::remove_pointer<T>::type>::value); // const overload: pointee must be const
+    for (ChildOnCIter iter = this->cbeginChildOn(); iter; ++iter) {
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if (boost::is_same<T, const ChildT*>::value) { // T points to this node's child type: collect here
+            array.push_back(reinterpret_cast<T>(mNodes[iter.pos()].getChild()));
+        } else {
+            iter->getNodes(array); // otherwise descend and let the child collect from its own table
+        }
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+}
+
+////////////////////////////////////////
+
+template<typename ChildT, Index Log2Dim>
+template<typename ArrayT>
+inline void
+InternalNode<ChildT, Log2Dim>::stealNodes(ArrayT& array, const ValueType& value, bool state)
+{
+    typedef typename ArrayT::value_type T; // element type of the output array; must be a pointer
+    BOOST_STATIC_ASSERT(boost::is_pointer<T>::value);
+    typedef typename boost::mpl::if_<boost::is_const<typename boost::remove_pointer<T>::type>,
+                                     const ChildT, ChildT>::type ArrayChildT; // ChildT, const iff T's pointee is const
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN // the is_same tests below are compile-time constants
+    for (ChildOnIter iter = this->beginChildOn(); iter; ++iter) {
+        const Index n = iter.pos();
+        if (boost::is_same<T, ArrayChildT*>::value) {
+            array.push_back(reinterpret_cast<T>(mNodes[n].getChild())); // hand the child pointer to the caller
+            mValueMask.set(n, state); // the vacated slot gets the requested active state...
+            mNodes[n].setValue(value); // ...and the replacement value
+        } else {
+            iter->stealNodes(array, value, state); // otherwise descend into the child
+        }
+    }
+    if (boost::is_same<T, ArrayChildT*>::value) mChildMask.setOff(); // every child slot was replaced by a value above
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+}
+
+////////////////////////////////////////
+
+
+template<typename ChildT, Index Log2Dim>
+inline void
+InternalNode<ChildT, Log2Dim>::resetBackground(const ValueType& oldBackground,
+                                               const ValueType& newBackground)
+{
+    if (math::isExactlyEqual(oldBackground, newBackground)) return; // nothing to replace
+    for (Index n = 0; n < NUM_VALUES; ++n) {
+        if (this->isChildMaskOn(n)) { // child slot: recurse
+            mNodes[n].getChild()->resetBackground(oldBackground, newBackground);
+        } else if (this->isValueMaskOn(n)) {
+            continue; // active tile values are left untouched
+        } else if (math::isApproxEqual(mNodes[n].getValue(), oldBackground)) {
+            mNodes[n].setValue(newBackground); // inactive tile equal to +old becomes +new
+        } else if (math::isApproxEqual(mNodes[n].getValue(), math::negative(oldBackground))) {
+            mNodes[n].setValue(math::negative(newBackground)); // inactive tile equal to -old becomes -new
+        }
+    }
+}
+
+template<typename ChildT, Index Log2Dim>
+template<typename OtherChildNodeType, Index OtherLog2Dim>
+inline bool
+InternalNode<ChildT, Log2Dim>::hasSameTopology(
+    const InternalNode<OtherChildNodeType, OtherLog2Dim>* other) const
+{ // NOTE(review): other is dereferenced without a null check -- callers must pass a valid node
+    if (Log2Dim != OtherLog2Dim || mChildMask != other->mChildMask ||
+        mValueMask != other->mValueMask) return false; // dimensions and both masks must match at this level
+    for (ChildOnCIter iter = this->cbeginChildOn(); iter; ++iter) { // masks match, so other has a child wherever we do
+        if (!iter->hasSameTopology(other->mNodes[iter.pos()].getChild())) return false;
+    }
+    return true;
+}
+
+
+template<typename ChildT, Index Log2Dim>
+inline void
+InternalNode<ChildT, Log2Dim>::resetChildNode(Index i, ChildNodeType* child)
+{
+    assert(child);
+    if (this->isChildMaskOff(i)) { // slot holds a value: flag it as a child slot
+        mChildMask.setOn(i);
+        mValueMask.setOff(i);
+    } else { // slot already holds a child: free the one being replaced
+        delete mNodes[i].getChild();
+    }
+    mNodes[i].setChild(child); // in both cases the slot ends up pointing at the new child
+}
+
+template<typename ChildT, Index Log2Dim>
+inline void
+InternalNode<ChildT, Log2Dim>::setChildNode(Index i, ChildNodeType* child)
+{ // Install child in slot i; unlike resetChildNode(), the slot must not already hold a child.
+    assert(child);
+    assert(mChildMask.isOff(i)); // an existing child would not be deleted here, so require none
+    mChildMask.setOn(i);
+    mValueMask.setOff(i); // clear the slot's active-value bit now that it holds a child
+    mNodes[i].setChild(child);
+}
+
+
+template<typename ChildT, Index Log2Dim>
+inline ChildT*
+InternalNode<ChildT, Log2Dim>::unsetChildNode(Index i, const ValueType& value)
+{
+    ChildNodeType* detached = NULL; // stays NULL when slot i holds no child
+    if (this->isChildMaskOn(i)) {
+        // Fetch the child pointer before the slot is overwritten; it is returned, not deleted.
+        detached = mNodes[i].getChild();
+        mChildMask.setOff(i);
+    }
+    mNodes[i].setValue(value); // the slot stores the given value in either case
+    return detached;
+}
+
+
+template<typename ChildT, Index Log2Dim>
+inline void
+InternalNode<ChildT, Log2Dim>::makeChildNodeEmpty(Index n, const ValueType& value)
+{
+    delete this->unsetChildNode(n, value); // detach and free the child; unsetChildNode() returns NULL if there was none
+}
+
+template<typename ChildT, Index Log2Dim>
+inline ChildT*
+InternalNode<ChildT, Log2Dim>::getChildNode(Index n)
+{
+    assert(this->isChildMaskOn(n)); // slot n must hold a child; this is only checked by the assertion
+    return mNodes[n].getChild();
+}
+
+
+template<typename ChildT, Index Log2Dim>
+inline const ChildT*
+InternalNode<ChildT, Log2Dim>::getChildNode(Index n) const
+{
+    assert(this->isChildMaskOn(n)); // slot n must hold a child; this is only checked by the assertion
+    return mNodes[n].getChild();
+}
+
+} // namespace tree
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TREE_INTERNALNODE_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tree/Iterator.h b/nuparu/include/openvdb_new/tree/Iterator.h
new file mode 100644
index 00000000..41fa104d
--- /dev/null
+++ b/nuparu/include/openvdb_new/tree/Iterator.h
@@ -0,0 +1,290 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file Iterator.h
+///
+/// @author Peter Cucka and Ken Museth
+
+#ifndef OPENVDB_TREE_ITERATOR_HAS_BEEN_INCLUDED
+#define OPENVDB_TREE_ITERATOR_HAS_BEEN_INCLUDED
+
+#include <sstream>
+#include <boost/static_assert.hpp>
+#include <boost/type_traits/is_const.hpp>
+#include <boost/type_traits/remove_const.hpp>
+#include <openvdb/util/NodeMasks.h>
+#include <openvdb/Exceptions.h>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tree {
+
+/// @brief Base class for iterators over internal and leaf nodes
+///
+/// This class is typically not instantiated directly, since it doesn't provide methods
+/// to dereference the iterator.  Those methods (@vdblink::tree::SparseIteratorBase::operator*()
+/// operator*()@endlink, @vdblink::tree::SparseIteratorBase::setValue() setValue()@endlink, etc.)
+/// are implemented in the @vdblink::tree::SparseIteratorBase sparse@endlink and
+/// @vdblink::tree::DenseIteratorBase dense@endlink iterator subclasses.
+template<typename MaskIterT, typename NodeT>
+class IteratorBase
+{
+public:
+    IteratorBase(): mParentNode(NULL) {}
+    IteratorBase(const MaskIterT& iter, NodeT* parent):
+        mParentNode(parent), mMaskIter(iter) {}
+    /// Copy the parent node pointer and the mask iterator from @a other and return @c *this.
+    IteratorBase& operator=(const IteratorBase& other)
+    {
+        mParentNode = other.mParentNode;
+        mMaskIter = other.mMaskIter;
+        return *this; // conventional return value, so that assignments can be chained
+    }
+    bool operator==(const IteratorBase& other) const
+    {
+        return (mParentNode == other.mParentNode) && (mMaskIter == other.mMaskIter);
+    }
+    bool operator!=(const IteratorBase& other) const
+    {
+        return !(*this == other);
+    }
+
+    /// Return a pointer to the node (if any) over which this iterator is iterating.
+    NodeT* getParentNode() const { return mParentNode; }
+    /// @brief Return a reference to the node over which this iterator is iterating.
+    /// @throw ValueError if there is no parent node.
+    NodeT& parent() const
+    {
+        if (!mParentNode) OPENVDB_THROW(ValueError, "iterator references a null node");
+        return *mParentNode;
+    }
+
+    /// Return this iterator's position as an index into the parent node's table.
+    Index offset() const { return mMaskIter.offset(); }
+
+    /// Identical to offset()
+    Index pos() const { return mMaskIter.offset(); }
+
+    /// Return @c true if this iterator is not yet exhausted.
+    bool test() const { return mMaskIter.test(); }
+    /// Return @c true if this iterator is not yet exhausted.
+    operator bool() const { return this->test(); }
+
+    /// Advance to the next item in the parent node's table (forwards to the mask iterator).
+    bool next() { return mMaskIter.next(); }
+    /// Advance to the next item in the parent node's table.
+    void increment() { mMaskIter.increment(); }
+    /// Advance to the next item in the parent node's table.
+    IteratorBase& operator++() { this->increment(); return *this; }
+    /// Advance @a n items in the parent node's table.
+    void increment(Index n) { mMaskIter.increment(n); }
+
+    /// @brief Return @c true if this iterator is pointing to an active value.
+    /// Return @c false if it is pointing to either an inactive value or a child node.
+    bool isValueOn() const { return parent().isValueMaskOn(this->pos()); }
+    /// @brief If this iterator is pointing to a value, set the value's active state.
+    /// Otherwise, do nothing.
+    void setValueOn(bool on = true) const { parent().setValueMask(this->pos(), on); }
+    /// @brief If this iterator is pointing to a value, mark the value as inactive.
+    /// @details If this iterator is pointing to a child node, then the current item
+    /// in the parent node's table is required to be inactive.  In that case,
+    /// this method has no effect.
+    void setValueOff() const { parent().mValueMask.setOff(this->pos()); }
+
+    /// Return the coordinates of the item to which this iterator is pointing.
+    Coord getCoord() const { return parent().offsetToGlobalCoord(this->pos()); }
+    /// Return in @a xyz the coordinates of the item to which this iterator is pointing.
+    void getCoord(Coord& xyz) const { xyz = this->getCoord(); }
+
+private:
+    /// @note This parent node pointer is mutable, because setValueOn() and
+    /// setValueOff(), though const, need to call non-const methods on the parent.
+    /// There is a distinction between a const iterator (e.g., const ValueOnIter),
+    /// which is an iterator that can't be incremented, and an iterator over
+    /// a const node (e.g., ValueOnCIter), which might be const or non-const itself
+    /// but can't call non-const methods like setValue() on the node.
+    mutable NodeT* mParentNode;
+    MaskIterT mMaskIter;
+}; // class IteratorBase
+
+
+////////////////////////////////////////
+
+
+/// @brief Base class for sparse iterators over internal and leaf nodes
+template<
+    typename MaskIterT, // mask iterator type (OnIterator, OffIterator, etc.)
+    typename IterT,     // SparseIteratorBase subclass (the "Curiously Recurring Template Pattern")
+    typename NodeT,     // type of node over which to iterate
+    typename ItemT>     // type of value to which this iterator points
+struct SparseIteratorBase: public IteratorBase<MaskIterT, NodeT>
+{
+    typedef NodeT NodeType;
+    typedef ItemT ValueType;
+    typedef typename boost::remove_const<NodeT>::type NonConstNodeType;
+    typedef typename boost::remove_const<ItemT>::type NonConstValueType;
+    static const bool IsSparseIterator = true, IsDenseIterator = false;
+
+    SparseIteratorBase() {}
+    SparseIteratorBase(const MaskIterT& iter, NodeT* parent):
+        IteratorBase<MaskIterT, NodeT>(iter, parent) {}
+
+    /// @brief Return the item at the given index in the parent node's table.
+    /// @note Declared but not defined here: all subclasses must implement this accessor.
+    ItemT& getItem(Index) const;
+    /// @brief Set the value of the item at the given index in the parent node's table.
+    /// @note Declared but not defined here: all non-const iterator subclasses must implement it.
+    void setItem(Index, const ItemT&) const;
+
+    /// Return a reference to the item to which this iterator is pointing.
+    ItemT& operator*() const { return this->getValue(); }
+    /// Return a pointer to the item to which this iterator is pointing.
+    ItemT* operator->() const { return &(this->operator*()); }
+
+    /// Return a reference to the item to which this iterator is pointing.
+    ItemT& getValue() const
+    {
+        return static_cast<const IterT*>(this)->getItem(this->pos()); // static polymorphism
+    }
+    /// @brief Set the value of the item to which this iterator is pointing.
+    /// (Not valid for const iterators: rejected at compile time if @c NodeT is const.)
+    void setValue(const ItemT& value) const
+    {
+        BOOST_STATIC_ASSERT(!boost::is_const<NodeT>::value);
+        static_cast<const IterT*>(this)->setItem(this->pos(), value); // static polymorphism
+    }
+    /// @brief Apply a functor to the item to which this iterator is pointing.
+    /// (Not valid for const iterators.)  Forwards to the subclass's @c modifyItem().
+    /// @param op  a functor of the form <tt>void op(ValueType&) const</tt> that modifies
+    ///            its argument in place
+    /// @see Tree::modifyValue()
+    template<typename ModifyOp>
+    void modifyValue(const ModifyOp& op) const
+    {
+        BOOST_STATIC_ASSERT(!boost::is_const<NodeT>::value);
+        static_cast<const IterT*>(this)->modifyItem(this->pos(), op); // static polymorphism
+    }
+}; // struct SparseIteratorBase
+
+
+////////////////////////////////////////
+
+
+/// @brief Base class for dense iterators over internal and leaf nodes
+/// @note Dense iterators have no @c %operator*() or @c %operator->(),
+/// because their return type would have to vary depending on whether
+/// the iterator is pointing to a value or a child node.
+template<
+    typename MaskIterT,  // mask iterator type (typically a DenseIterator)
+    typename IterT,      // DenseIteratorBase subclass (the "Curiously Recurring Template Pattern")
+    typename NodeT,      // type of node over which to iterate
+    typename SetItemT,   // type of set value (ChildNodeType, for non-leaf nodes)
+    typename UnsetItemT> // type of unset value (ValueType, usually)
+struct DenseIteratorBase: public IteratorBase<MaskIterT, NodeT>
+{
+    typedef NodeT NodeType;
+    typedef UnsetItemT ValueType;
+    typedef SetItemT ChildNodeType;
+    typedef typename boost::remove_const<NodeT>::type NonConstNodeType;
+    typedef typename boost::remove_const<UnsetItemT>::type NonConstValueType;
+    typedef typename boost::remove_const<SetItemT>::type NonConstChildNodeType;
+    static const bool IsSparseIterator = false, IsDenseIterator = true;
+
+    DenseIteratorBase() {}
+    DenseIteratorBase(const MaskIterT& iter, NodeT* parent):
+        IteratorBase<MaskIterT, NodeT>(iter, parent) {}
+
+    /// @brief Return @c true if the item at the given index in the parent node's table
+    /// is a set value, and return either the set value in @a child or the unset value
+    /// in @a value.
+    /// @note Declared but not defined here: all subclasses must implement this accessor.
+    bool getItem(Index, SetItemT*& child, NonConstValueType& value) const;
+    /// @brief Set the value of the item at the given index in the parent node's table.
+    /// @note Declared but not defined here: all non-const iterator subclasses must implement it.
+    void setItem(Index, SetItemT*) const;
+    /// @brief "Unset" the value of the item at the given index in the parent node's table.
+    /// @note Declared but not defined here: all non-const iterator subclasses must implement it.
+    void unsetItem(Index, const UnsetItemT&) const;
+
+    /// Return @c true if this iterator is pointing to a child node.
+    bool isChildNode() const { return this->parent().isChildMaskOn(this->pos()); }
+
+    /// @brief If this iterator is pointing to a child node, return a pointer to the node.
+    /// Otherwise, return NULL and, in @a value, the value to which this iterator is pointing.
+    SetItemT* probeChild(NonConstValueType& value) const
+    {
+        SetItemT* child = NULL; // the result is NULL unless getItem() stores a child pointer here
+        static_cast<const IterT*>(this)->getItem(this->pos(), child, value); // static polymorphism
+        return child;
+    }
+    /// @brief If this iterator is pointing to a child node, return @c true and return
+    /// a pointer to the child node in @a child.  Otherwise, return @c false and return
+    /// the value to which this iterator is pointing in @a value.
+    bool probeChild(SetItemT*& child, NonConstValueType& value) const
+    {
+        child = probeChild(value);
+        return (child != NULL);
+    }
+
+    /// @brief Return @c true if this iterator is pointing to a value and return
+    /// the value in @a value.  Otherwise, return @c false.
+    bool probeValue(NonConstValueType& value) const
+    {
+        SetItemT* child = NULL; // required by getItem(); the pointer itself is ignored here
+        const bool isChild = static_cast<const IterT*>(this)-> // static polymorphism
+            getItem(this->pos(), child, value);
+        return !isChild;
+    }
+
+    /// @brief Replace with the given child node the item in the parent node's table
+    /// to which this iterator is pointing.
+    void setChild(SetItemT* child) const
+    {
+        static_cast<const IterT*>(this)->setItem(this->pos(), child); // static polymorphism
+    }
+
+    /// @brief Replace with the given value the item in the parent node's table
+    /// to which this iterator is pointing.
+    void setValue(const UnsetItemT& value) const
+    {
+        static_cast<const IterT*>(this)->unsetItem(this->pos(), value); // static polymorphism
+    }
+}; // struct DenseIteratorBase
+
+} // namespace tree
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TREE_ITERATOR_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tree/LeafManager.h b/nuparu/include/openvdb_new/tree/LeafManager.h
new file mode 100644
index 00000000..c06a0243
--- /dev/null
+++ b/nuparu/include/openvdb_new/tree/LeafManager.h
@@ -0,0 +1,846 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file LeafManager.h
+///
+/// @brief A LeafManager manages a linear array of pointers to a given tree's
+/// leaf nodes, as well as optional auxiliary buffers (one or more per leaf)
+/// that can be swapped with the leaf nodes' voxel data buffers.
+/// @details The leaf array is useful for multithreaded computations over
+/// leaf voxels in a tree with static topology but varying voxel values.
+/// The auxiliary buffers are convenient for temporal integration.
+/// Efficient methods are provided for multithreaded swapping and synching
+/// (i.e., copying the contents) of these buffers.
+
+#ifndef OPENVDB_TREE_LEAFMANAGER_HAS_BEEN_INCLUDED
+#define OPENVDB_TREE_LEAFMANAGER_HAS_BEEN_INCLUDED
+
+#include <boost/shared_ptr.hpp>
+#include <boost/bind.hpp>
+#include <boost/function.hpp>
+#include <boost/mpl/if.hpp>
+#include <boost/type_traits/is_const.hpp>
+#include <boost/type_traits/is_pointer.hpp>
+#include <boost/type_traits/is_same.hpp>
+#include <boost/type_traits/remove_pointer.hpp>
+#include <tbb/blocked_range.h>
+#include <tbb/parallel_for.h>
+#include <tbb/parallel_reduce.h>
+#include <openvdb/Types.h>
+#include "TreeIterator.h" // for CopyConstness
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tree {
+
+namespace leafmgr {
+
+//@{
+/// Traits that report a tree type's constness and select the matching leaf iterator type
+template<typename TreeT> struct TreeTraits { // primary template: non-const tree
+    static const bool IsConstTree = false;
+    typedef typename TreeT::LeafIter LeafIterType;
+};
+template<typename TreeT> struct TreeTraits<const TreeT> { // specialization: const tree
+    static const bool IsConstTree = true;
+    typedef typename TreeT::LeafCIter LeafIterType;
+};
+//@}
+
+} // namespace leafmgr
+
+
+/// Helper that holds the LeafManager methods which need to be specialized
+/// for const vs. non-const trees (this is the primary template).
+template<typename ManagerT>
+struct LeafManagerImpl
+{
+    typedef typename ManagerT::RangeType  RangeT;
+    typedef typename ManagerT::LeafType   LeafT;
+    typedef typename ManagerT::BufferType BufT;
+    /// For each index n in @a r, swap leaf n with bufs[n * bufsPerLeaf + auxBufferIdx].
+    static inline void doSwapLeafBuffer(const RangeT& r, size_t auxBufferIdx,
+                                        LeafT** leafs, BufT* bufs, size_t bufsPerLeaf)
+    {
+        for (size_t leafIdx = r.begin(), stop = r.end(); leafIdx != stop; ++leafIdx) {
+            leafs[leafIdx]->swap(bufs[leafIdx * bufsPerLeaf + auxBufferIdx]);
+        }
+    }
+};
+
+
+////////////////////////////////////////
+
+
+/// @brief This class manages a linear array of pointers to a given tree's
+/// leaf nodes, as well as optional auxiliary buffers (one or more per leaf)
+/// that can be swapped with the leaf nodes' voxel data buffers.
+/// @details The leaf array is useful for multithreaded computations over
+/// leaf voxels in a tree with static topology but varying voxel values.
+/// The auxiliary buffers are convenient for temporal integration.
+/// Efficient methods are provided for multithreaded swapping and sync'ing
+/// (i.e., copying the contents) of these buffers.
+///
+/// @note Buffer index 0 denotes a leaf node's internal voxel data buffer.
+/// Any auxiliary buffers are indexed starting from one.
+template<typename TreeT>
+class LeafManager
+{
+public:
+    typedef TreeT                                                      TreeType;
+    typedef typename TreeT::ValueType                                  ValueType;
+    typedef typename TreeT::RootNodeType                               RootNodeType;
+    typedef typename TreeType::LeafNodeType                            NonConstLeafType;
+    typedef typename CopyConstness<TreeType, NonConstLeafType>::Type   LeafType;
+    typedef LeafType                                                   LeafNodeType;
+    typedef typename leafmgr::TreeTraits<TreeT>::LeafIterType          LeafIterType;
+    typedef typename LeafType::Buffer                                  NonConstBufferType;
+    typedef typename CopyConstness<TreeType, NonConstBufferType>::Type BufferType;
+    typedef tbb::blocked_range<size_t>                                 RangeType;//leaf index range
+    static const Index DEPTH = 2;//root + leafs
+
+    static const bool IsConstTree = leafmgr::TreeTraits<TreeT>::IsConstTree;
+
+    class LeafRange
+    {
+    public:
+        class Iterator
+        {
+        public:
+            Iterator(const LeafRange& range, size_t pos): mRange(&range), mPos(pos)
+            {
+                assert(this->isValid());
+            }
+            Iterator& operator=(const Iterator& other)
+            {
+                mRange = other.mRange; mPos = other.mPos; return *this;
+            }
+            /// Advance to the next leaf node.
+            Iterator& operator++() { ++mPos; return *this; }
+            /// Return a reference to the leaf node to which this iterator is pointing.
+            LeafType& operator*() const { return mRange->mLeafManager.leaf(mPos); }
+            /// Return a pointer to the leaf node to which this iterator is pointing.
+            LeafType* operator->() const { return &(this->operator*()); }
+            /// @brief Return the nth buffer for the leaf node to which this iterator is pointing,
+            /// where n = @a bufferIdx and n = 0 corresponds to the leaf node's own buffer.
+            BufferType& buffer(size_t bufferIdx)
+            {
+                return mRange->mLeafManager.getBuffer(mPos, bufferIdx);
+            }
+            /// Return the index into the leaf array of the current leaf node.
+            size_t pos() const { return mPos; }
+            /// Return @c true if the position lies in [begin, end] of the range (end included).
+            bool isValid() const { return mPos>=mRange->mBegin && mPos<=mRange->mEnd; }
+            /// Return @c true if this iterator is not yet exhausted.
+            bool test() const { return mPos < mRange->mEnd; }
+            /// Return @c true if this iterator is not yet exhausted.
+            operator bool() const { return this->test(); }
+            /// Return @c true if this iterator is exhausted.
+            bool empty() const { return !this->test(); }
+            bool operator!=(const Iterator& other) const
+            {
+                return (mPos != other.mPos) || (mRange != other.mRange);
+            }
+            bool operator==(const Iterator& other) const { return !(*this != other); }
+            const LeafRange& leafRange() const { return *mRange; }
+
+        private:
+            const LeafRange* mRange; // a pointer (never NULL), so that operator= can rebind the range
+            size_t mPos;
+        };// end Iterator
+
+        LeafRange(size_t begin, size_t end, const LeafManager& leafManager, size_t grainSize=1)
+            : mEnd(end)
+            , mBegin(begin)
+            , mGrainSize(grainSize)
+            , mLeafManager(leafManager)
+        {
+        }
+        /// Return an iterator positioned at the first leaf index of this range.
+        Iterator begin() const {return Iterator(*this, mBegin);}
+        /// Return an iterator positioned one past the last leaf index of this range.
+        Iterator end() const {return Iterator(*this, mEnd);}
+        /// Return the number of leaf indices covered by this range.
+        size_t size() const { return mEnd - mBegin; }
+        /// Return the grain size, i.e. the size at or below which the range is not split.
+        size_t grainsize() const { return mGrainSize; }
+        /// Return the LeafManager whose leaf array this range indexes.
+        const LeafManager& leafManager() const { return mLeafManager; }
+        /// Return @c true if this range covers no leaf indices.
+        bool empty() const {return !(mBegin < mEnd);}
+        /// Return @c true if this range is larger than the grain size and may be split.
+        bool is_divisible() const {return mGrainSize < this->size();}
+        /// Splitting constructor: @a r keeps the lower half, the new range takes the upper half.
+        LeafRange(LeafRange& r, tbb::split)
+            : mEnd(r.mEnd) // mEnd is declared before mBegin, so r.mEnd is read before doSplit() shrinks r
+            , mBegin(doSplit(r))
+            , mGrainSize(r.mGrainSize)
+            , mLeafManager(r.mLeafManager)
+        {
+        }
+
+    private:
+        size_t mEnd, mBegin, mGrainSize; // NOTE: the splitting constructor depends on this declaration order
+        const LeafManager& mLeafManager;
+        /// Shrink @a r to its lower half and return the midpoint at which the upper half begins.
+        static size_t doSplit(LeafRange& r)
+        {
+            assert(r.is_divisible());
+            size_t middle = r.mBegin + (r.mEnd - r.mBegin) / 2u;
+            r.mEnd = middle;
+            return middle;
+        }
+    };// end of LeafRange
+
+    /// @brief Constructor from a tree reference and an auxiliary buffer count
+    /// (default is no auxiliary buffers); @a serial is forwarded to rebuild().
+    LeafManager(TreeType& tree, size_t auxBuffersPerLeaf=0, bool serial=false)
+        : mTree(&tree)
+        , mLeafCount(0)
+        , mAuxBufferCount(0)
+        , mAuxBuffersPerLeaf(auxBuffersPerLeaf)
+        , mLeafs(NULL)
+        , mAuxBuffers(NULL)
+        , mTask(0)
+        , mIsMaster(true) // this instance owns the arrays and frees them in the destructor
+    {
+        this->rebuild(serial); // populates the leaf array and the auxiliary buffers
+    }
+
+    /// @brief Constructor from a tree reference and an existing array
+    /// of pointers to LeafNodes from said tree. This c-tor is only
+    /// intended for experts: the pointers in [begin, end) are copied as given.
+    LeafManager(TreeType& tree, LeafType** begin, LeafType** end,
+                size_t auxBuffersPerLeaf=0, bool serial=false)
+        : mTree(&tree)
+        , mLeafCount(end-begin)
+        , mAuxBufferCount(0)
+        , mAuxBuffersPerLeaf(auxBuffersPerLeaf)
+        , mLeafs(new LeafType*[mLeafCount]) // NOTE(review): needs mLeafCount declared before mLeafs -- confirm
+        , mAuxBuffers(NULL)
+        , mTask(0)
+        , mIsMaster(true) // this instance owns the arrays and frees them in the destructor
+    {
+        size_t n = mLeafCount; // number of pointers still to copy
+        LeafType **target = mLeafs, **source = begin;
+        while (n--) *target++ = *source++; // copy the caller's leaf pointers into our own array
+        if (auxBuffersPerLeaf) this->initAuxBuffers(serial); // buffers are set up only when requested
+    }
+
+    /// Shallow copy constructor called by tbb::parallel_for() threads
+    /// (the leaf and buffer arrays are shared with @a other, not duplicated)
+    /// @note This should never get called directly
+    LeafManager(const LeafManager& other)
+        : mTree(other.mTree)
+        , mLeafCount(other.mLeafCount)
+        , mAuxBufferCount(other.mAuxBufferCount)
+        , mAuxBuffersPerLeaf(other.mAuxBuffersPerLeaf)
+        , mLeafs(other.mLeafs)
+        , mAuxBuffers(other.mAuxBuffers)
+        , mTask(other.mTask)
+        , mIsMaster(false) // a copy never owns the shared arrays, so its destructor frees nothing
+    {
+    }
+    
+    virtual ~LeafManager()
+    {
+        if (mIsMaster) { // shallow copies (mIsMaster == false) share these arrays and must not free them
+            delete [] mLeafs;
+            delete [] mAuxBuffers;
+        }
+    }
+
+    /// @brief (Re)initialize by resizing (if necessary) and repopulating the leaf array
+    /// and by deleting existing auxiliary buffers and allocating new ones.
+    /// @details Call this method if the tree's topology, and therefore the number
+    /// of leaf nodes, changes.  New auxiliary buffers are initialized with copies
+    /// of corresponding leaf node buffers.
+    void rebuild(bool serial=false)
+    {
+        this->initLeafArray(); // leaf array first...
+        this->initAuxBuffers(serial); // ...then the auxiliary buffers; serial is only forwarded here
+    }
+    //@{
+    /// Repopulate the leaf array and delete and reallocate auxiliary buffers.
+    void rebuild(size_t auxBuffersPerLeaf, bool serial=false)
+    {
+        mAuxBuffersPerLeaf = auxBuffersPerLeaf; // change the per-leaf buffer count first
+        this->rebuild(serial);
+    }
+    void rebuild(TreeType& tree, bool serial=false)
+    {
+        mTree = &tree; // switch this manager to a different tree
+        this->rebuild(serial);
+    }
+    void rebuild(TreeType& tree, size_t auxBuffersPerLeaf, bool serial=false)
+    {
+        mTree = &tree; // switch trees and change the per-leaf buffer count before rebuilding
+        mAuxBuffersPerLeaf = auxBuffersPerLeaf;
+        this->rebuild(serial);
+    }
+    //@}
+    /// @brief Change the number of auxiliary buffers.
+    /// @details If auxBuffersPerLeaf is 0, all existing auxiliary buffers are deleted.
+    /// New auxiliary buffers are initialized with copies of corresponding leaf node buffers.
+    /// This method does not rebuild the leaf array.
+    void rebuildAuxBuffers(size_t auxBuffersPerLeaf, bool serial=false)
+    {
+        mAuxBuffersPerLeaf = auxBuffersPerLeaf;
+        this->initAuxBuffers(serial);
+    }
+    /// @brief Remove the auxiliary buffers, but don't rebuild the leaf array.
+    void removeAuxBuffers() { this->rebuildAuxBuffers(0); }
+
+    /// @brief Remove the auxiliary buffers and rebuild the leaf array.
+    void rebuildLeafArray()
+    {
+        this->removeAuxBuffers();
+        this->initLeafArray();
+    }
+
+    /// Return the total number of allocated auxiliary buffers.
+    size_t auxBufferCount() const { return mAuxBufferCount; }
+    /// Return the number of auxiliary buffers per leaf node.
+    size_t auxBuffersPerLeaf() const { return mAuxBuffersPerLeaf; }
+
+    /// Return the number of leaf nodes.
+    size_t leafCount() const { return mLeafCount; }
+
+    /// Return a const reference to tree associated with this manager.
+    const TreeType& tree() const { return *mTree; }
+
+    /// Return a reference to the tree associated with this manager.
+    TreeType& tree() { return *mTree; }
+
+    /// Return a const reference to root node associated with this manager.
+    const RootNodeType& root() const { return mTree->root(); }
+
+    /// Return a reference to the root node associated with this manager.
+    RootNodeType& root() { return mTree->root(); }
+
+    /// Return @c true if the tree associated with this manager is immutable.
+    bool isConstTree() const { return this->IsConstTree; }
+
+    /// @brief Return a reference to the leaf node at index @a leafIdx in the array.
+    /// @note For performance reasons no range check is performed (other than an assertion)!
+    LeafType& leaf(size_t leafIdx) const { assert(leafIdx<mLeafCount); return *mLeafs[leafIdx]; }
+
+    /// @brief Return the leaf or auxiliary buffer for the leaf node at index @a leafIdx.
+    /// If @a bufferIdx is zero, return the leaf buffer, otherwise return the nth
+    /// auxiliary buffer, where n = @a bufferIdx - 1.
+    ///
+    /// @note For performance reasons no range checks are performed on the inputs
+    /// (other than assertions)! Since auxiliary buffers, unlike leaf buffers,
+    /// might not exist, be especially careful when specifying the @a bufferIdx.
+    /// @note For const trees, this method always returns a reference to a const buffer.
+    /// It is safe to @c const_cast and modify any auxiliary buffer (@a bufferIdx > 0),
+    /// but it is not safe to modify the leaf buffer (@a bufferIdx = 0).
+    BufferType& getBuffer(size_t leafIdx, size_t bufferIdx) const
+    {
+        assert(leafIdx < mLeafCount);
+        assert(bufferIdx == 0 || bufferIdx - 1 < mAuxBuffersPerLeaf);
+        if (bufferIdx == 0) return mLeafs[leafIdx]->buffer();// index 0 is the leaf's own buffer
+        return mAuxBuffers[leafIdx * mAuxBuffersPerLeaf + bufferIdx - 1];// aux indices are 1-based
+    }
+
+    /// @brief Return a @c tbb::blocked_range of leaf array indices.
+    ///
+    /// @note Consider using leafRange() instead, which provides access methods
+    /// to leaf nodes and buffers.
+    RangeType getRange(size_t grainsize = 1) const { return RangeType(0, mLeafCount, grainsize); }
+
+    /// Return a TBB-compatible LeafRange.
+    LeafRange leafRange(size_t grainsize = 1) const
+    {
+        return LeafRange(0, mLeafCount, *this, grainsize);
+    }
+
+    /// @brief Swap each leaf node's buffer with the nth corresponding auxiliary buffer,
+    /// where n = @a bufferIdx.
+    /// @return @c true if the swap was successful
+    /// @param bufferIdx  index of the buffer that will be swapped with
+    ///                   the corresponding leaf node buffer
+    /// @param serial     if false, swap buffers in parallel using multiple threads.
+    /// @note Recall that the indexing of auxiliary buffers is 1-based, since
+    /// buffer index 0 denotes the leaf node buffer.  So buffer index 1 denotes
+    /// the first auxiliary buffer.
+    bool swapLeafBuffer(size_t bufferIdx, bool serial = false)
+    {
+        if (bufferIdx == 0 || bufferIdx > mAuxBuffersPerLeaf || this->isConstTree()) return false;
+        mTask = boost::bind(&LeafManager::doSwapLeafBuffer, _1, _2, bufferIdx - 1);
+        this->cook(serial ? 0 : 512);
+        return true;//success
+    }
+    /// @brief Swap any two buffers for each leaf node.
+    /// @note Recall that the indexing of auxiliary buffers is 1-based, since
+    /// buffer index 0 denotes the leaf node buffer.  So buffer index 1 denotes
+    /// the first auxiliary buffer.
+    bool swapBuffer(size_t bufferIdx1, size_t bufferIdx2, bool serial = false)
+    {
+        const size_t lo = std::min(bufferIdx1, bufferIdx2), hi = std::max(bufferIdx1, bufferIdx2);
+        if (lo == hi || hi > mAuxBuffersPerLeaf) return false;// self-swap or index out of range
+        if (lo != 0) {// both indices name auxiliary buffers, which are stored 0-based
+            mTask = boost::bind(&LeafManager::doSwapAuxBuffer, _1, _2, lo-1, hi-1);
+        } else if (this->isConstTree()) {
+            return false;// the leaf buffer of a const tree must not be modified
+        } else {
+            mTask = boost::bind(&LeafManager::doSwapLeafBuffer, _1, _2, hi-1);
+        }
+        this->cook(serial ? 0 : 512);
+        return true;//success
+    }
+
+    /// @brief Sync up the specified auxiliary buffer with the corresponding leaf node buffer.
+    /// @return @c true if the sync was successful
+    /// @param bufferIdx index of the buffer that will contain a
+    ///                  copy of the corresponding leaf node buffer
+    /// @param serial    if false, sync buffers in parallel using multiple threads.
+    /// @note Recall that the indexing of auxiliary buffers is 1-based, since
+    /// buffer index 0 denotes the leaf node buffer.  So buffer index 1 denotes
+    /// the first auxiliary buffer.
+    bool syncAuxBuffer(size_t bufferIdx, bool serial = false)
+    {
+        if (bufferIdx == 0 || bufferIdx > mAuxBuffersPerLeaf) return false;
+        mTask = boost::bind(&LeafManager::doSyncAuxBuffer, _1, _2, bufferIdx - 1);
+        this->cook(serial ? 0 : 64);
+        return true;//success
+    }
+
+    /// @brief Sync up all auxiliary buffers with their corresponding leaf node buffers.
+    /// @return true if the sync was successful
+    /// @param serial  if false, sync buffers in parallel using multiple threads.
+    bool syncAllBuffers(bool serial = false)
+    {
+        const size_t perLeaf = mAuxBuffersPerLeaf;
+        if (perLeaf == 0) return false;//nothing to do
+        // Select the copy routine written for one, two, or any other number of buffers per leaf.
+        if (perLeaf == 1)      mTask = boost::bind(&LeafManager::doSyncAllBuffers1, _1, _2);
+        else if (perLeaf == 2) mTask = boost::bind(&LeafManager::doSyncAllBuffers2, _1, _2);
+        else                   mTask = boost::bind(&LeafManager::doSyncAllBuffersN, _1, _2);
+        this->cook(serial ? 0 : 64);
+        return true;//success
+    }
+
+    /// @brief   Threaded method that applies a user-supplied functor
+    ///          to each leaf node in the LeafManager.
+    ///
+    /// @details The user-supplied functor needs to define the methods
+    ///          required for tbb::parallel_for.
+    ///
+    /// @param op        user-supplied functor, see examples for interface details.
+    /// @param threaded  optional toggle to disable threading, on by default.
+    /// @param grainSize optional parameter to specify the grainsize
+    ///                  for threading, one by default.
+    ///
+    /// @warning The functor object is deep-copied to create TBB tasks. 
+    ///          This allows the function to use non-thread-safe members
+    ///          like a ValueAccessor.
+    ///
+    /// @par Example:
+    /// @code
+    /// // Functor to offset a tree's voxel values with values from another tree.
+    /// template<typename TreeType>
+    /// struct OffsetOp
+    /// {
+    ///     typedef tree::ValueAccessor<const TreeType> Accessor;
+    ///
+    ///     OffsetOp(const TreeType& tree): mRhsTreeAcc(tree) {}
+    ///
+    ///     template <typename LeafNodeType>
+    ///     void operator()(LeafNodeType &lhsLeaf, size_t) const
+    ///     {
+    ///         const LeafNodeType *rhsLeaf = mRhsTreeAcc.probeConstLeaf(lhsLeaf.origin());
+    ///         if (rhsLeaf) {
+    ///             typename LeafNodeType::ValueOnIter iter = lhsLeaf.beginValueOn();
+    ///             for (; iter; ++iter) {
+    ///                 iter.setValue(iter.getValue() + rhsLeaf->getValue(iter.pos()));
+    ///             }
+    ///         }
+    ///     }
+    ///     Accessor mRhsTreeAcc;
+    /// };
+    ///
+    /// // usage:
+    /// tree::LeafManager<FloatTree> leafNodes(lhsTree);
+    /// leafNodes.foreach(OffsetOp<FloatTree>(rhsTree));
+    ///
+    /// // A functor that performs a min operation between different auxiliary buffers.
+    /// template<typename LeafManagerType>
+    /// struct MinOp
+    /// {
+    ///     typedef typename LeafManagerType::BufferType BufferType;
+    ///
+    ///     MinOp(LeafManagerType& leafNodes): mLeafs(leafNodes) {}
+    ///
+    ///     template <typename LeafNodeType>
+    ///     void operator()(LeafNodeType &leaf, size_t leafIndex) const
+    ///     {
+    ///         // get the first buffer
+    ///         BufferType& buffer = mLeafs.getBuffer(leafIndex, 1);
+    ///
+    ///         // min ...
+    ///     }
+    ///     LeafManagerType& mLeafs;
+    /// };
+    /// @endcode
+    template<typename LeafOp>
+    void foreach(const LeafOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        LeafTransformer<LeafOp> transform(op);
+        transform.run(this->leafRange(grainSize), threaded);
+    }
+
+    /// @brief   Threaded method that applies a user-supplied functor
+    ///          to each leaf node in the LeafManager. Unlike foreach
+    ///          (defined above) this method performs a reduction on
+    ///          all the leaf nodes.
+    ///
+    /// @details The user-supplied functor needs to define the methods
+    ///          required for tbb::parallel_reduce.
+    ///
+    /// @param op        user-supplied functor, see examples for interface details.
+    /// @param threaded  optional toggle to disable threading, on by default.
+    /// @param grainSize optional parameter to specify the grainsize
+    ///                  for threading, one by default.
+    ///
+    /// @warning The functor object is deep-copied to create TBB tasks.
+    ///          This allows the function to use non-thread-safe members
+    ///          like a ValueAccessor.
+    ///
+    /// @par Example:
+    /// @code
+    /// // Functor to count the number of negative (active) leaf values 
+    /// struct CountOp
+    /// {
+    ///     CountOp() : mCounter(0) {}
+    ///     CountOp(const CountOp &other) : mCounter(other.mCounter) {}
+    ///     CountOp(const CountOp &other, tbb::split) : mCounter(0) {}
+    ///     template <typename LeafNodeType>
+    ///     void operator()(LeafNodeType &leaf, size_t)
+    ///     {
+    ///       typename LeafNodeType::ValueOnIter iter = leaf.beginValueOn();
+    ///       for (; iter; ++iter) if (*iter < 0.0f) ++mCounter;
+    ///     }
+    ///     void join(const CountOp &other) {mCounter += other.mCounter;}
+    ///     size_t mCounter; 
+    /// };
+    ///
+    /// // usage:
+    /// tree::LeafManager<FloatTree> leafNodes(tree);
+    /// CountOp count;
+    /// leafNodes.reduce(count);
+    /// std::cerr << "Number of negative active voxels = " << count.mCounter << std::endl;
+    ///
+    /// @endcode
+    template<typename LeafOp>
+    void reduce(LeafOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        LeafReducer<LeafOp> transform(op);
+        transform.run(this->leafRange(grainSize), threaded);
+    }
+
+
+    /// @brief Insert pointers to nodes of the specified type into the array.
+    /// @details The type of node pointer is defined by the type
+    /// ArrayT::value_type. If the node type is a LeafNode the nodes
+    /// are inserted from this LeafManager, else of the corresponding tree.
+    template<typename ArrayT>
+    void getNodes(ArrayT& array)
+    {
+        typedef typename ArrayT::value_type T;
+        BOOST_STATIC_ASSERT(boost::is_pointer<T>::value);
+        typedef typename boost::mpl::if_<boost::is_const<typename boost::remove_pointer<T>::type>,
+            const LeafType, LeafType>::type LeafT;
+
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if (boost::is_same<T, LeafT*>::value) {
+            array.resize(mLeafCount);
+            for (size_t i=0; i<mLeafCount; ++i) array[i] = reinterpret_cast<T>(mLeafs[i]);
+        } else {
+            mTree->getNodes(array);
+        }
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+
+    /// @brief Insert node pointers of the specified type into the array.
+    /// @details The type of node pointer is defined by the type
+    /// ArrayT::value_type. If the node type is a LeafNode the nodes
+    /// are inserted from this LeafManager, else of the corresponding tree.
+    template<typename ArrayT>
+    void getNodes(ArrayT& array) const
+    {
+        typedef typename ArrayT::value_type T;
+        BOOST_STATIC_ASSERT(boost::is_pointer<T>::value);
+        BOOST_STATIC_ASSERT(boost::is_const<typename boost::remove_pointer<T>::type>::value);
+
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if (boost::is_same<T, const LeafType*>::value) {
+            array.resize(mLeafCount);
+            for (size_t i=0; i<mLeafCount; ++i) array[i] = reinterpret_cast<T>(mLeafs[i]);
+        } else {
+            mTree->getNodes(array);
+        }
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+
+    /// @brief Generate a linear array of pre-fix sums of offsets into the
+    /// active voxels in the leafs. So @a offsets[n]+m is the offset to the
+    /// mth active voxel in the nth leaf node (useful for
+    /// user-managed value buffers, e.g. in tools/LevelSetAdvect.h).
+    /// @return The total number of active values in the leaf nodes
+    /// @param offsets Array of pre-fix sums of offsets to active voxels
+    /// @param size      On input the size of @a offsets, and on output
+    ///                  the new size of @a offsets.
+    /// @param grainSize Optional parameter to specify the grainsize
+    ///                  for threading, one by default (zero runs serially).
+    /// @details If @a offsets is NULL or @a size is smaller than the
+    /// number of leaf nodes then @a offsets is re-allocated with one
+    /// entry per leaf node and @a size is set to the number of leaf nodes.
+    size_t getPreFixSum(size_t*& offsets, size_t& size, size_t grainSize=1) const
+    {
+        if (offsets == NULL || size < mLeafCount) {// need one slot per leaf node
+            delete [] offsets;
+            offsets = new size_t[mLeafCount];
+            size = mLeafCount;
+        }
+        size_t total = 0;
+        if (grainSize == 0) {// serial: running sum of the per-leaf active voxel counts
+            for (size_t n = 0; n != mLeafCount; ++n) {
+                offsets[n] = total;
+                total += mLeafs[n]->onVoxelCount();
+            }
+        } else {// threaded: the helper's constructor fills offsets and accumulates total
+            PreFixSum tmp(this->leafRange( grainSize ), offsets, total);
+        }
+        return total;
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////////
+    // All methods below are for internal use only and should never be called directly
+
+    /// Used internally by tbb::parallel_for() - never call it directly!
+    void operator()(const RangeType& r) const
+    {
+        if (!mTask) OPENVDB_THROW(ValueError, "task is undefined");
+        mTask(const_cast<LeafManager*>(this), r);// run the bound member function on this sub-range
+    }
+
+  private:
+
+    // This is a simple wrapper for a c-style array so it mimics the api
+    // of a std container, e.g. std::vector or std::deque, and can be
+    // passed to Tree::getNodes().
+    struct MyArray {
+        typedef LeafType* value_type;//required by Tree::getNodes
+        value_type* ptr;
+        MyArray(value_type* array) : ptr(array) {}
+        void push_back(value_type leaf) { *ptr++ = leaf; }//required by Tree::getNodes
+    };
+
+    void initLeafArray()
+    {
+        const size_t leafCount = mTree->leafCount();
+        if (leafCount != mLeafCount) {// reallocate only when the number of leaf nodes changed
+            delete [] mLeafs;
+            mLeafs = (leafCount == 0) ? NULL : new LeafType*[leafCount];
+            mLeafCount = leafCount;
+        }
+        MyArray a(mLeafs);// push_back adaptor that writes sequentially into mLeafs
+        mTree->getNodes(a);// repopulate the array from the tree, even if its size is unchanged
+    }
+
+    void initAuxBuffers(bool serial)
+    {
+        const size_t auxBufferCount = mLeafCount * mAuxBuffersPerLeaf;
+        if (auxBufferCount != mAuxBufferCount) {
+            delete [] mAuxBuffers;
+            mAuxBuffers = (auxBufferCount == 0) ? NULL : new NonConstBufferType[auxBufferCount];
+            mAuxBufferCount = auxBufferCount;
+        }
+        this->syncAllBuffers(serial);
+    }
+
+    void cook(size_t grainsize)
+    {
+        if (grainsize == 0) {// zero requests serial execution over the full range
+            (*this)(this->getRange());
+        } else {
+            tbb::parallel_for(this->getRange(grainsize), *this);
+        }
+    }
+
+    void doSwapLeafBuffer(const RangeType& r, size_t auxBufferIdx)
+    {
+        LeafManagerImpl<LeafManager>::doSwapLeafBuffer(
+            r, auxBufferIdx, mLeafs, mAuxBuffers, mAuxBuffersPerLeaf);
+    }
+
+    void doSwapAuxBuffer(const RangeType& r, size_t auxBufferIdx1, size_t auxBufferIdx2)
+    {
+        for (size_t N = mAuxBuffersPerLeaf, n = N*r.begin(), m = N*r.end(); n != m; n+=N) {
+            mAuxBuffers[n + auxBufferIdx1].swap(mAuxBuffers[n + auxBufferIdx2]);
+        }
+    }
+
+    void doSyncAuxBuffer(const RangeType& r, size_t auxBufferIdx)
+    {
+        for (size_t n = r.begin(), m = r.end(), N = mAuxBuffersPerLeaf; n != m; ++n) {
+            mAuxBuffers[n*N + auxBufferIdx] = mLeafs[n]->buffer();
+        }
+    }
+
+    void doSyncAllBuffers1(const RangeType& r)
+    {
+        for (size_t n = r.begin(), m = r.end(); n != m; ++n) {
+            mAuxBuffers[n] = mLeafs[n]->buffer();
+        }
+    }
+
+    void doSyncAllBuffers2(const RangeType& r)
+    {
+        for (size_t n = r.begin(), m = r.end(); n != m; ++n) {
+            const BufferType& leafBuffer = mLeafs[n]->buffer();
+            mAuxBuffers[2*n  ] = leafBuffer;
+            mAuxBuffers[2*n+1] = leafBuffer;
+        }
+    }
+
+    void doSyncAllBuffersN(const RangeType& r)
+    {
+        for (size_t n = r.begin(), m = r.end(), N = mAuxBuffersPerLeaf; n != m; ++n) {
+            const BufferType& leafBuffer = mLeafs[n]->buffer();
+            for (size_t i=n*N, j=i+N; i!=j; ++i) mAuxBuffers[i] = leafBuffer;
+        }
+    }
+
+    /// @brief Private member class that applies a user-defined
+    /// functor to perform parallel_for on all the leaf nodes.
+    template<typename LeafOp>
+    struct LeafTransformer
+    {
+        LeafTransformer(const LeafOp &leafOp) : mLeafOp(leafOp) {}
+        void run(const LeafRange &range, bool threaded) const
+        {
+            if (threaded) tbb::parallel_for(range, *this);
+            else (*this)(range);
+        }
+        void operator()(const LeafRange &range) const
+        {
+            // Invoke the functor on every leaf in the range, passing the iterator's position.
+            typename LeafRange::Iterator leafIter = range.begin();
+            for (; leafIter; ++leafIter) mLeafOp(*leafIter, leafIter.pos());
+        }
+        const LeafOp mLeafOp;// held by value, so every copy of this body owns its own functor
+    };// LeafTransformer
+    
+    /// @brief Private member class that applies a user-defined
+    /// functor to perform parallel_reduce on all the leaf nodes.
+    template<typename LeafOp>
+    struct LeafReducer
+    {
+        LeafReducer(LeafOp &leafOp) : mLeafOp(&leafOp), mOwnsOp(false)
+        {
+        }
+        LeafReducer(const LeafReducer &other, tbb::split)
+            : mLeafOp(new LeafOp(*(other.mLeafOp), tbb::split())), mOwnsOp(true)
+        {
+        }
+        ~LeafReducer() { if (mOwnsOp) delete mLeafOp; }
+        void run(const LeafRange& range, bool threaded)
+        {
+            threaded ? tbb::parallel_reduce(range, *this) : (*this)(range);
+        }
+        void operator()(const LeafRange& range)
+        {
+            LeafOp &op = *mLeafOp;//local registry
+            for (typename LeafRange::Iterator it = range.begin(); it; ++it) op(*it, it.pos());
+        }
+        void join(const LeafReducer& other) { mLeafOp->join(*(other.mLeafOp)); }
+        LeafOp *mLeafOp;
+        const bool mOwnsOp;
+    };// LeafReducer
+
+    // Helper class to compute a pre-fix sum of offsets to active voxels
+    struct PreFixSum
+    {
+        PreFixSum(const LeafRange& r, size_t* offsets, size_t& prefix)
+            : mOffsets(offsets)
+        {
+            tbb::parallel_for( r, *this);// pass 1 (threaded): store each leaf's active voxel count
+            for (size_t *slot = offsets, *end = offsets + r.size(); slot != end; ++slot) {
+                const size_t count = *slot;
+                *slot = prefix;// pass 2 (serial): replace the count by the running sum before it
+                prefix += count;
+            }
+        }
+        inline void operator()(const LeafRange& r) const {
+            // Writes one entry per visited leaf, indexed by the iterator's pos().
+            typename LeafRange::Iterator leafIter = r.begin();
+            for (; leafIter; ++leafIter) mOffsets[leafIter.pos()] = leafIter->onVoxelCount();
+        }
+        size_t* mOffsets;
+    };// PreFixSum
+    
+    typedef typename boost::function<void (LeafManager*, const RangeType&)> FuncType;
+
+    TreeType*            mTree;
+    size_t               mLeafCount, mAuxBufferCount, mAuxBuffersPerLeaf;
+    LeafType**           mLeafs;//array of LeafNode pointers
+    NonConstBufferType*  mAuxBuffers;//array of auxiliary buffers
+    FuncType             mTask;
+    const bool           mIsMaster;
+};//end of LeafManager class
+
+
+// Partial specializations of LeafManager methods for const trees
+template<typename TreeT>
+struct LeafManagerImpl<LeafManager<const TreeT> >
+{
+    typedef LeafManager<const TreeT> ManagerT;
+    typedef typename ManagerT::RangeType      RangeT;
+    typedef typename ManagerT::LeafType       LeafT;
+    typedef typename ManagerT::BufferType     BufT;
+
+    static inline void doSwapLeafBuffer(const RangeT&, size_t /*auxBufferIdx*/,
+                                        LeafT**, BufT*, size_t /*bufsPerLeaf*/)
+    {
+        // Buffers can't be swapped into const trees.
+    }
+};
+
+} // namespace tree
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TREE_LEAFMANAGER_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tree/LeafNode.h b/nuparu/include/openvdb_new/tree/LeafNode.h
new file mode 100644
index 00000000..33d0f80b
--- /dev/null
+++ b/nuparu/include/openvdb_new/tree/LeafNode.h
@@ -0,0 +1,2219 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_TREE_LEAFNODE_HAS_BEEN_INCLUDED
+#define OPENVDB_TREE_LEAFNODE_HAS_BEEN_INCLUDED
+
+#include <iostream>
+#include <algorithm> // for std::swap
+#include <cstring> // for std::memcpy()
+#include <boost/shared_ptr.hpp>
+#include <boost/static_assert.hpp>
+#include <boost/bind.hpp>
+#include <tbb/blocked_range.h>
+#include <tbb/spin_mutex.h>
+#include <tbb/parallel_for.h>
+#include <openvdb/Types.h>
+#include <openvdb/util/NodeMasks.h>
+#include <openvdb/io/Compression.h> // for io::readData(), etc.
+#include "Iterator.h"
+
+
+class TestLeaf;
+template<typename> class TestLeafIO;
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tree {
+
+template<Index, typename> struct SameLeafConfig; // forward declaration
+
+
+/// @brief Templated block class to hold specific data types and a fixed
+/// number of values determined by Log2Dim. The actual coordinate
+/// dimension of the block is 2^Log2Dim, i.e. Log2Dim=3 corresponds to
+/// a LeafNode that spans a 8^3 block.
+template<typename T, Index Log2Dim>
+class LeafNode
+{
+public:
+    typedef T                            BuildType; 
+    typedef T                            ValueType;
+    typedef LeafNode<ValueType, Log2Dim> LeafNodeType;
+    typedef boost::shared_ptr<LeafNode>  Ptr;
+    typedef util::NodeMask<Log2Dim>      NodeMaskType;
+
+    static const Index
+        LOG2DIM     = Log2Dim,      // needed by parent nodes
+        TOTAL       = Log2Dim,      // needed by parent nodes
+        DIM         = 1 << TOTAL,   // dimension along one coordinate direction
+        NUM_VALUES  = 1 << 3 * Log2Dim,
+        NUM_VOXELS  = NUM_VALUES,   // total number of voxels represented by this node
+        SIZE        = NUM_VALUES,
+        LEVEL       = 0;            // level 0 = leaf
+
+    /// @brief ValueConverter<T>::Type is the type of a LeafNode having the same
+    /// dimensions as this node but a different value type, T.
+    template<typename OtherValueType>
+    struct ValueConverter {
+        typedef LeafNode<OtherValueType, Log2Dim> Type;
+    };
+
+    /// @brief SameConfiguration<OtherNodeType>::value is @c true if and only if
+    /// OtherNodeType is the type of a LeafNode with the same dimensions as this node.
+    template<typename OtherNodeType>
+    struct SameConfiguration {
+        static const bool value = SameLeafConfig<LOG2DIM, OtherNodeType>::value;
+    };
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    struct FileInfo
+    {
+        FileInfo(): bufpos(0) , maskpos(0) {}
+        std::streamoff bufpos;
+        std::streamoff maskpos;
+        io::MappedFile::Ptr mapping;
+        boost::shared_ptr<io::StreamMetadata> meta;
+    };
+#endif
+
+    /// @brief Array of fixed size @f$2^{3 \times {\rm Log2Dim}}@f$ that stores
+    /// the voxel values of a LeafNode
+    class Buffer
+    {
+    public:
+#ifdef OPENVDB_2_ABI_COMPATIBLE
+        /// Default constructor
+        Buffer(): mData(new ValueType[SIZE]) {}
+        /// Construct a buffer populated with the specified value.
+        explicit Buffer(const ValueType& val): mData(new ValueType[SIZE]) { this->fill(val); }
+        /// Copy constructor
+        Buffer(const Buffer& other): mData(new ValueType[SIZE]) { *this = other; }
+        /// Destructor
+        ~Buffer() { delete[] mData; }
+
+        /// Return @c true if this buffer's values have not yet been read from disk.
+        bool isOutOfCore() const { return false; }
+        /// Return @c true if memory for this buffer has not yet been allocated.
+        bool empty() const { return (mData == NULL); }
+#else
+        typedef ValueType WordType;
+        static const Index WORD_COUNT = SIZE;
+        /// Default constructor
+        Buffer(): mData(new ValueType[SIZE]), mOutOfCore(0) {}
+        /// Construct a buffer populated with the specified value.
+        explicit Buffer(const ValueType& val): mData(new ValueType[SIZE]), mOutOfCore(0)
+        {
+            this->fill(val);
+        }
+        /// Copy constructor
+        Buffer(const Buffer& other): mData(NULL), mOutOfCore(other.mOutOfCore)
+        {
+            if (other.isOutOfCore()) {
+                mFileInfo = new FileInfo(*other.mFileInfo);// this buffer gets its own FileInfo copy
+            } else if (other.mData != NULL) {// a PartialCreate'd source has no values to copy
+                this->allocate();
+                ValueType* target = mData;
+                const ValueType* source = other.mData;
+                Index n = SIZE;
+                while (n--) *target++ = *source++;// element-wise copy of all SIZE values
+            }
+        }
+        /// Construct a buffer but don't allocate memory for the full array of values.
+        Buffer(PartialCreate, const ValueType&): mData(NULL), mOutOfCore(0) {}
+        /// Destructor
+        ~Buffer()
+        {
+            if (this->isOutOfCore()) {
+                this->detachFromFile();
+            } else {
+                this->deallocate();
+            }
+        }
+
+        /// Return @c true if this buffer's values have not yet been read from disk.
+        bool isOutOfCore() const { return bool(mOutOfCore); }
+        /// Return @c true if memory for this buffer has not yet been allocated.
+        bool empty() const { return !mData || this->isOutOfCore(); }
+#endif
+        /// Allocate memory for this buffer if it has not already been allocated.
+        bool allocate() { if (mData == NULL) mData = new ValueType[SIZE]; return !this->empty(); }
+
+        /// Populate this buffer with a constant value.
+        void fill(const ValueType& val)
+        {
+            this->detachFromFile();
+            if (mData != NULL) {
+                ValueType* target = mData;
+                Index n = SIZE;
+                while (n--) *target++ = val;
+            }
+        }
+
+        /// Return a const reference to the i'th element of this buffer.
+        const ValueType& getValue(Index i) const { return this->at(i); }
+        /// Return a const reference to the i'th element of this buffer.
+        const ValueType& operator[](Index i) const { return this->at(i); }
+        /// Set the i'th value of this buffer to the specified value.
+        void setValue(Index i, const ValueType& val)
+        {
+            assert(i < SIZE);
+#ifdef OPENVDB_2_ABI_COMPATIBLE
+            mData[i] = val;
+#else
+            this->loadValues();
+            if (mData) mData[i] = val;
+#endif
+        }
+
+        /// Copy the other buffer's values into this buffer.
+        Buffer& operator=(const Buffer& other)
+        {
+            if (&other != this) {// self-assignment is a no-op
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+                if (this->isOutOfCore()) {
+                    this->detachFromFile();
+                } else {
+                    if (other.isOutOfCore()) this->deallocate();// values are replaced by a file reference
+                }
+                if (other.isOutOfCore()) {
+                    mOutOfCore = other.mOutOfCore;
+                    mFileInfo = new FileInfo(*other.mFileInfo);
+                } else if (other.mData != NULL) {// unallocated source: nothing to copy, avoid null read
+#endif
+                    this->allocate();
+                    ValueType* target = mData;
+                    const ValueType* source = other.mData;
+                    Index n = SIZE;
+                    while (n--) *target++ = *source++;// element-wise copy of all SIZE values
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+                }
+#endif
+            }
+            return *this;
+        }
+
+        /// @brief Element-wise exact comparison of two buffers.
+        /// @details Both buffers are passed through loadValues() first.  Two buffers
+        /// without a voxel array compare equal; a buffer with a voxel array never
+        /// equals one without.
+        /// @return @c true if every element of @a other is exactly equal to the
+        /// corresponding element of this buffer
+        bool operator==(const Buffer& other) const
+        {
+            this->loadValues();
+            other.loadValues();
+            const ValueType* lhs = mData;
+            const ValueType* rhs = other.mData;
+            if (lhs == NULL || rhs == NULL) {
+                // Equal only when neither side has a voxel array.
+                return lhs == NULL && rhs == NULL;
+            }
+            for (Index i = 0; i < SIZE; ++i) {
+                if (!math::isExactlyEqual(lhs[i], rhs[i])) return false;
+            }
+            return true;
+        }
+        /// @brief Negation of operator==().
+        /// @return @c true if the contents of @a other are not exactly equal
+        /// to the contents of this buffer
+        bool operator!=(const Buffer& other) const
+        {
+            const bool identical = (other == *this);
+            return !identical;
+        }
+
+        /// @brief Exchange this buffer's values with the other buffer's values.
+        /// @details Only pointers and flags are exchanged; no voxel values are copied.
+        /// @param other  the buffer to exchange contents with
+        void swap(Buffer& other)
+        {
+            // mData shares a union with mFileInfo, so this swaps whichever pointer is stored.
+            std::swap(mData, other.mData);
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+            // The out-of-core flag travels with the pointer it describes.
+            std::swap(mOutOfCore, other.mOutOfCore);
+#endif
+        }
+
+        /// @brief Return the memory footprint of this buffer in bytes.
+        /// @details Counts the Buffer object itself plus either the voxel array
+        /// (when one is allocated) or, for an out-of-core buffer, one FileInfo.
+        Index memUsage() const
+        {
+            size_t bytes = sizeof(*this);
+#ifdef OPENVDB_2_ABI_COMPATIBLE
+            if (mData) bytes += SIZE * sizeof(ValueType);
+#else
+            if (this->isOutOfCore()) {
+                bytes += sizeof(FileInfo);
+            } else if (mData) {
+                bytes += SIZE * sizeof(ValueType);
+            }
+#endif
+            return static_cast<Index>(bytes);
+        }
+        /// Return the number of values contained in this buffer.
+        static Index size() { return SIZE; }
+
+        /// @brief Return a const pointer to the array of voxel values.
+        /// @details This method guarantees that the buffer is allocated and loaded.
+        /// (When OPENVDB_2_ABI_COMPATIBLE is defined the load/allocate step is compiled
+        /// out and @c mData is returned as is.)
+        /// @warning This method should only be used by experts seeking low-level optimizations.
+        const ValueType* data() const
+        {
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+            this->loadValues();
+            if (mData == NULL) {
+                // Allocation mutates members, so constness is cast away for the
+                // lock and the assignment below.
+                Buffer* self = const_cast<Buffer*>(this);
+                // This lock will be contended at most once.
+                tbb::spin_mutex::scoped_lock lock(self->mMutex);
+                // Re-check under the lock: another thread may have allocated meanwhile.
+                if (mData == NULL) self->mData = new ValueType[SIZE];
+            }
+#endif
+            return mData;
+        }
+
+        /// @brief Return a pointer to the array of voxel values.
+        /// @details This method guarantees that the buffer is allocated and loaded.
+        /// (When OPENVDB_2_ABI_COMPATIBLE is defined the load/allocate step is compiled
+        /// out and @c mData is returned as is.)
+        /// @warning This method should only be used by experts seeking low-level optimizations.
+        ValueType* data()
+        {
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+            this->loadValues();
+            if (mData == NULL) {
+                // This lock will be contended at most once.
+                tbb::spin_mutex::scoped_lock lock(mMutex);
+                // Re-check under the lock: another thread may have allocated meanwhile.
+                if (mData == NULL) mData = new ValueType[SIZE];
+            }
+#endif
+            return mData;
+        }
+
+    private:
+        /// @brief Return a const reference to the value at index @a i.
+        /// @details Unless OPENVDB_2_ABI_COMPATIBLE is defined, loadValues() is called
+        /// first and, if the buffer still has no voxel array, a reference to the
+        /// shared static @c sZero is returned instead.  In the ABI-2 build
+        /// @c mData[i] is returned unconditionally.
+        /// @param i  linear offset of the element (asserted to be < SIZE)
+        const ValueType& at(Index i) const
+        {
+            assert(i < SIZE);
+#ifdef OPENVDB_2_ABI_COMPATIBLE
+            return mData[i];
+#else
+            this->loadValues();
+            // We can't use the ternary operator here, otherwise Visual C++ returns
+            // a reference to a temporary.
+            if (mData) return mData[i]; else return sZero;
+#endif
+        }
+
+        /// @brief Return a non-const reference to the value at index @a i.
+        /// @details This method is private since it makes assumptions about the
+        /// buffer's memory layout.  Buffers associated with custom leaf node types
+        /// (e.g., a bool buffer implemented as a bitmask) might not be able to
+        /// return non-const references to their values.
+        /// @note Implemented by casting away const on the result of at().
+        /// NOTE(review): when at() falls back to the static @c sZero (no voxel array),
+        /// the returned reference aliases that shared constant; writing through it
+        /// would modify @c sZero -- confirm callers only write to allocated buffers.
+        ValueType& operator[](Index i) { return const_cast<ValueType&>(this->at(i)); }
+
+        /// @brief Free the voxel array of an in-core buffer.
+        /// @return @c true if an array was freed, @c false if there was no array
+        /// or the buffer is out of core
+        bool deallocate()
+        {
+            // Nothing to free, or the union currently does not hold a voxel array.
+            if (mData == NULL || this->isOutOfCore()) return false;
+
+            delete[] mData;
+            mData = NULL;
+            return true;
+        }
+
+#ifdef OPENVDB_2_ABI_COMPATIBLE
+        // ABI-2 build: the out-of-core hooks compile to no-ops so that the
+        // shared code above can call them unconditionally.
+        void setOutOfCore(bool) {}
+        void loadValues() const {}
+        void doLoad() const {}
+        bool detachFromFile() { return false; }
+#else
+        /// Record whether this buffer is out of core (stored in @c mOutOfCore).
+        inline void setOutOfCore(bool b) { mOutOfCore = b; }
+        // To facilitate inlining in the common case in which the buffer is in-core,
+        // the loading logic is split into a separate function, doLoad().
+        /// Call doLoad() if, and only if, this buffer is currently out of core.
+        inline void loadValues() const { if (this->isOutOfCore()) this->doLoad(); }
+        /// Slow path of loadValues(); defined outside this class body.
+        inline void doLoad() const;
+        /// @brief Drop this buffer's FileInfo and mark the buffer as in core.
+        /// @return @c true if the buffer was out of core, @c false if nothing was done
+        inline bool detachFromFile()
+        {
+            if (!this->isOutOfCore()) return false;
+
+            delete mFileInfo;
+            mFileInfo = NULL; // also nulls mData, which shares the union
+            this->setOutOfCore(false);
+            return true;
+        }
+#endif
+
+        friend class ::TestLeaf;
+        // Allow the parent LeafNode to access this buffer's data pointer.
+        friend class LeafNode;
+
+#ifdef OPENVDB_2_ABI_COMPATIBLE
+        // ABI-2 layout: just the voxel array pointer.
+        ValueType* mData;
+#else
+        // The voxel array pointer and the FileInfo pointer share storage;
+        // presumably mOutOfCore tells which one is live (isOutOfCore() is not
+        // visible in this part of the file -- TODO confirm).
+        union {
+            ValueType* mData;
+            FileInfo*  mFileInfo;
+        };
+        Index32 mOutOfCore; // currently interpreted as bool; extra bits reserved for future use
+        // Taken by data() while allocating the voxel array on demand.
+        tbb::spin_mutex mMutex; // 1 byte
+        //int8_t mReserved[3]; // padding for alignment
+
+        // Value returned by at() when the buffer has no voxel array.
+        static const ValueType sZero;
+#endif
+    }; // class Buffer
+
+
+    /// Default constructor
+    LeafNode();
+
+    /// @brief Constructor
+    /// @param coords  the grid index coordinates of a voxel
+    /// @param value   a value with which to fill the buffer
+    /// @param active  the active state to which to initialize all voxels
+    explicit LeafNode(const Coord& coords,
+                      const ValueType& value = zeroVal<ValueType>(),
+                      bool active = false);
+
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    /// @brief "Partial creation" constructor used during file input
+    /// @param coords  the grid index coordinates of a voxel
+    /// @param value   a value with which to fill the buffer
+    /// @param active  the active state to which to initialize all voxels
+    /// @details This constructor does not allocate memory for voxel values.
+    LeafNode(PartialCreate,
+             const Coord& coords,
+             const ValueType& value = zeroVal<ValueType>(),
+             bool active = false);
+#endif
+
+    /// Deep copy constructor
+    LeafNode(const LeafNode&);
+
+    /// Value conversion copy constructor
+    template<typename OtherValueType>
+    explicit LeafNode(const LeafNode<OtherValueType, Log2Dim>& other);
+
+    /// Topology copy constructor
+    template<typename OtherValueType>
+    LeafNode(const LeafNode<OtherValueType, Log2Dim>& other,
+             const ValueType& offValue, const ValueType& onValue, TopologyCopy);
+
+    /// Topology copy constructor
+    template<typename OtherValueType>
+    LeafNode(const LeafNode<OtherValueType, Log2Dim>& other,
+             const ValueType& background, TopologyCopy);
+
+    /// Destructor.
+    ~LeafNode();
+
+    //
+    // Statistics
+    //
+    /// Return log2 of the dimension of this LeafNode, e.g. 3 if dimensions are 8^3
+    static Index log2dim() { return Log2Dim; }
+    /// Return the number of voxels in each coordinate dimension.
+    static Index dim() { return DIM; }
+    /// Return the total number of voxels represented by this LeafNode
+    static Index size() { return SIZE; }
+    /// Return the total number of voxels represented by this LeafNode
+    static Index numValues() { return SIZE; }
+    /// Return the level of this node, which by definition is zero for LeafNodes
+    static Index getLevel() { return LEVEL; }
+    /// Append the Log2Dim of this LeafNode to the specified vector
+    static void getNodeLog2Dims(std::vector<Index>& dims) { dims.push_back(Log2Dim); }
+    /// Return the dimension of child nodes of this LeafNode, which is one for voxels.
+    static Index getChildDim() { return 1; }
+    /// Return the leaf count for this node, which is one.
+    static Index32 leafCount() { return 1; }
+    /// Return the non-leaf count for this node, which is zero.
+    static Index32 nonLeafCount() { return 0; }
+
+    /// Return the number of voxels marked On.
+    Index64 onVoxelCount() const { return mValueMask.countOn(); }
+    /// Return the number of voxels marked Off.
+    Index64 offVoxelCount() const { return mValueMask.countOff(); }
+    /// Return the number of voxels marked On (same as onVoxelCount() for a leaf).
+    Index64 onLeafVoxelCount() const { return onVoxelCount(); }
+    /// Return the number of voxels marked Off (same as offVoxelCount() for a leaf).
+    Index64 offLeafVoxelCount() const { return offVoxelCount(); }
+    /// Return the number of active tiles, which is always zero for this node type.
+    static Index64 onTileCount()  { return 0; }
+    /// Return the number of inactive tiles, which is always zero for this node type.
+    static Index64 offTileCount() { return 0; }
+    /// Return @c true if this node has no active voxels.
+    bool isEmpty() const { return mValueMask.isOff(); }
+    /// Return @c true if this node contains only active voxels.
+    bool isDense() const { return mValueMask.isOn(); }
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    /// Return @c true if memory for this node's buffer has been allocated.
+    bool isAllocated() const { return !mBuffer.isOutOfCore() && !mBuffer.empty(); }
+    /// Allocate memory for this node's buffer if it has not already been allocated.
+    bool allocate() { return mBuffer.allocate(); }
+#endif
+
+    /// Return the memory in bytes occupied by this node.
+    Index64 memUsage() const;
+
+    /// Expand the given bounding box so that it includes this leaf node's active voxels.
+    /// If visitVoxels is false this LeafNode will be approximated as dense, i.e. with all
+    /// voxels active. Else the individual active voxels are visited to produce a tight bbox.
+    void evalActiveBoundingBox(CoordBBox& bbox, bool visitVoxels = true) const;
+
+    /// @brief Return the bounding box of this node, i.e., the full index space
+    /// spanned by this leaf node.
+    CoordBBox getNodeBoundingBox() const { return CoordBBox::createCube(mOrigin, DIM); }
+
+    /// Set the grid index coordinates of this node's local origin.
+    void setOrigin(const Coord& origin) { mOrigin = origin; }
+    //@{
+    /// Return the grid index coordinates of this node's local origin.
+    const Coord& origin() const { return mOrigin; }
+    void getOrigin(Coord& origin) const { origin = mOrigin; }
+    void getOrigin(Int32& x, Int32& y, Int32& z) const { mOrigin.asXYZ(x, y, z); }
+    //@}
+
+    /// Return the linear table offset of the given global or local coordinates.
+    static Index coordToOffset(const Coord& xyz);
+    /// @brief Return the local coordinates for a linear table offset,
+    /// where offset 0 has coordinates (0, 0, 0).
+    static Coord offsetToLocalCoord(Index n);
+    /// Return the global coordinates for a linear table offset.
+    Coord offsetToGlobalCoord(Index n) const;
+
+    /// Return a string representation of this node.
+    std::string str() const;
+
+    /// @brief Return @c true if the given node (which may have a different @c ValueType
+    /// than this node) has the same active value topology as this node.
+    template<typename OtherType, Index OtherLog2Dim>
+    bool hasSameTopology(const LeafNode<OtherType, OtherLog2Dim>* other) const;
+
+    /// Check for buffer, state and origin equivalence.
+    bool operator==(const LeafNode& other) const;
+    /// Return @c true if operator==() reports that the two nodes differ.
+    bool operator!=(const LeafNode& other) const
+    {
+        const bool identical = (other == *this);
+        return !identical;
+    }
+
+protected:
+    typedef typename NodeMaskType::OnIterator    MaskOnIterator;
+    typedef typename NodeMaskType::OffIterator   MaskOffIterator;
+    typedef typename NodeMaskType::DenseIterator MaskDenseIterator;
+
+    // Type tags to disambiguate template instantiations
+    struct ValueOn {}; struct ValueOff {}; struct ValueAll {};
+    struct ChildOn {}; struct ChildOff {}; struct ChildAll {};
+
+    /// @brief Iterator over the values of a leaf node, driven by a mask iterator.
+    /// @details Reads are forwarded to the parent node's getValue(); writes are
+    /// forwarded to the parent's setValueOnly() and modifyValue().
+    /// @tparam MaskIterT  mask iterator type that selects the positions visited
+    /// @tparam NodeT      parent node type (const-qualified for const iterators)
+    /// @tparam ValueT     value type exposed by getItem() and getValue()
+    /// @tparam TagT       tag type used only to distinguish instantiations
+    template<typename MaskIterT, typename NodeT, typename ValueT, typename TagT>
+    struct ValueIter:
+        // Derives from SparseIteratorBase, but can also be used as a dense iterator,
+        // if MaskIterT is a dense mask iterator type.
+        public SparseIteratorBase<
+            MaskIterT, ValueIter<MaskIterT, NodeT, ValueT, TagT>, NodeT, ValueT>
+    {
+        typedef SparseIteratorBase<MaskIterT, ValueIter, NodeT, ValueT> BaseT;
+
+        ValueIter() {}
+        ValueIter(const MaskIterT& iter, NodeT* parent): BaseT(iter, parent) {}
+
+        /// Return the parent node's value at linear offset @a pos.
+        ValueT& getItem(Index pos) const { return this->parent().getValue(pos); }
+        /// Return the parent node's value at this iterator's current position.
+        ValueT& getValue() const { return this->parent().getValue(this->pos()); }
+
+        /// Set the value at offset @a pos via the parent's setValueOnly().
+        // Note: setItem() can't be called on const iterators.
+        void setItem(Index pos, const ValueT& value) const
+        {
+            this->parent().setValueOnly(pos, value);
+        }
+        /// Set the value at the current position via the parent's setValueOnly().
+        // Note: setValue() can't be called on const iterators.
+        void setValue(const ValueT& value) const
+        {
+            this->parent().setValueOnly(this->pos(), value);
+        }
+
+        /// Apply @a op at offset @a n via the parent's modifyValue().
+        // Note: modifyItem() can't be called on const iterators.
+        template<typename ModifyOp>
+        void modifyItem(Index n, const ModifyOp& op) const { this->parent().modifyValue(n, op); }
+        /// Apply @a op at the current position via the parent's modifyValue().
+        // Note: modifyValue() can't be called on const iterators.
+        template<typename ModifyOp>
+        void modifyValue(const ModifyOp& op) const { this->parent().modifyValue(this->pos(), op); }
+    };
+
+    /// @brief Child iterator type for leaf nodes.
+    /// @details Leaf nodes have no children, so their child iterators have no
+    /// get/set accessors; the struct only provides constructors that forward
+    /// to SparseIteratorBase.
+    /// @tparam MaskIterT  mask iterator type
+    /// @tparam NodeT      parent node type (const-qualified for const iterators)
+    /// @tparam TagT       tag type used only to distinguish instantiations
+    template<typename MaskIterT, typename NodeT, typename TagT>
+    struct ChildIter:
+        public SparseIteratorBase<MaskIterT, ChildIter<MaskIterT, NodeT, TagT>, NodeT, ValueType>
+    {
+        ChildIter() {}
+        ChildIter(const MaskIterT& iter, NodeT* parent): SparseIteratorBase<
+            MaskIterT, ChildIter<MaskIterT, NodeT, TagT>, NodeT, ValueType>(iter, parent) {}
+    };
+
+    /// @brief Dense iterator over a leaf node that always reports a value, never a child.
+    /// @tparam NodeT   parent node type (const-qualified for const iterators)
+    /// @tparam ValueT  value type (const-qualified for const iterators)
+    /// @tparam TagT    tag type used only to distinguish instantiations
+    template<typename NodeT, typename ValueT, typename TagT>
+    struct DenseIter: public DenseIteratorBase<
+        MaskDenseIterator, DenseIter<NodeT, ValueT, TagT>, NodeT, /*ChildT=*/void, ValueT>
+    {
+        typedef DenseIteratorBase<MaskDenseIterator, DenseIter, NodeT, void, ValueT> BaseT;
+        typedef typename BaseT::NonConstValueType NonConstValueT;
+
+        DenseIter() {}
+        DenseIter(const MaskDenseIterator& iter, NodeT* parent): BaseT(iter, parent) {}
+
+        /// @brief Fetch the item at offset @a pos.
+        /// @param pos         linear offset into the parent node
+        /// @param[out] child  always set to NULL
+        /// @param[out] value  receives a copy of the parent's value at @a pos
+        /// @return always @c false, meaning the item is a value rather than a child
+        bool getItem(Index pos, void*& child, NonConstValueT& value) const
+        {
+            value = this->parent().getValue(pos);
+            child = NULL;
+            return false; // no child
+        }
+
+        // Note: setItem() can't be called on const iterators.
+        //void setItem(Index pos, void* child) const {}
+
+        /// Store @a value at offset @a pos via the parent's setValueOnly().
+        // Note: unsetItem() can't be called on const iterators.
+        void unsetItem(Index pos, const ValueT& value) const
+        {
+            this->parent().setValueOnly(pos, value);
+        }
+    };
+
+public:
+    // Public iterator types.  A "C" before "Iter" marks the variant bound to a
+    // const LeafNode.  Note that every ValueIter instantiation, including the
+    // non-const ones, uses "const ValueType" as its value type, so getValue()
+    // and getItem() yield const references and writes go through
+    // setValue()/setItem()/modifyValue().
+    typedef ValueIter<MaskOnIterator, LeafNode, const ValueType, ValueOn>        ValueOnIter;
+    typedef ValueIter<MaskOnIterator, const LeafNode, const ValueType, ValueOn>  ValueOnCIter;
+    typedef ValueIter<MaskOffIterator, LeafNode, const ValueType, ValueOff>      ValueOffIter;
+    typedef ValueIter<MaskOffIterator,const LeafNode,const ValueType,ValueOff>   ValueOffCIter;
+    typedef ValueIter<MaskDenseIterator, LeafNode, const ValueType, ValueAll>    ValueAllIter;
+    typedef ValueIter<MaskDenseIterator,const LeafNode,const ValueType,ValueAll> ValueAllCIter;
+    typedef ChildIter<MaskOnIterator, LeafNode, ChildOn>                         ChildOnIter;
+    typedef ChildIter<MaskOnIterator, const LeafNode, ChildOn>                   ChildOnCIter;
+    typedef ChildIter<MaskOffIterator, LeafNode, ChildOff>                       ChildOffIter;
+    typedef ChildIter<MaskOffIterator, const LeafNode, ChildOff>                 ChildOffCIter;
+    typedef DenseIter<LeafNode, ValueType, ChildAll>                             ChildAllIter;
+    typedef DenseIter<const LeafNode, const ValueType, ChildAll>                 ChildAllCIter;
+
+    // Begin iterators over this node's values.  The On/Off/All variants are
+    // built from the value mask's beginOn()/beginOff()/beginDense() iterators;
+    // the "c"-prefixed and const-qualified overloads return const iterators.
+    ValueOnCIter  cbeginValueOn() const { return ValueOnCIter(mValueMask.beginOn(), this); }
+    ValueOnCIter   beginValueOn() const { return ValueOnCIter(mValueMask.beginOn(), this); }
+    ValueOnIter    beginValueOn() { return ValueOnIter(mValueMask.beginOn(), this); }
+    ValueOffCIter cbeginValueOff() const { return ValueOffCIter(mValueMask.beginOff(), this); }
+    ValueOffCIter  beginValueOff() const { return ValueOffCIter(mValueMask.beginOff(), this); }
+    ValueOffIter   beginValueOff() { return ValueOffIter(mValueMask.beginOff(), this); }
+    ValueAllCIter cbeginValueAll() const { return ValueAllCIter(mValueMask.beginDense(), this); }
+    ValueAllCIter  beginValueAll() const { return ValueAllCIter(mValueMask.beginDense(), this); }
+    ValueAllIter   beginValueAll() { return ValueAllIter(mValueMask.beginDense(), this); }
+
+    // Matching end iterators, built from the value mask's endOn()/endOff()/endDense().
+    ValueOnCIter  cendValueOn() const { return ValueOnCIter(mValueMask.endOn(), this); }
+    ValueOnCIter   endValueOn() const { return ValueOnCIter(mValueMask.endOn(), this); }
+    ValueOnIter    endValueOn() { return ValueOnIter(mValueMask.endOn(), this); }
+    ValueOffCIter cendValueOff() const { return ValueOffCIter(mValueMask.endOff(), this); }
+    ValueOffCIter  endValueOff() const { return ValueOffCIter(mValueMask.endOff(), this); }
+    ValueOffIter   endValueOff() { return ValueOffIter(mValueMask.endOff(), this); }
+    ValueAllCIter cendValueAll() const { return ValueAllCIter(mValueMask.endDense(), this); }
+    ValueAllCIter  endValueAll() const { return ValueAllCIter(mValueMask.endDense(), this); }
+    ValueAllIter   endValueAll() { return ValueAllIter(mValueMask.endDense(), this); }
+
+    // Note that [c]beginChildOn() and [c]beginChildOff() actually return end iterators,
+    // because leaf nodes have no children.
+    // [c]beginChildAll(), by contrast, starts at the mask's beginDense() position.
+    ChildOnCIter  cbeginChildOn() const { return ChildOnCIter(mValueMask.endOn(), this); }
+    ChildOnCIter   beginChildOn() const { return ChildOnCIter(mValueMask.endOn(), this); }
+    ChildOnIter    beginChildOn() { return ChildOnIter(mValueMask.endOn(), this); }
+    ChildOffCIter cbeginChildOff() const { return ChildOffCIter(mValueMask.endOff(), this); }
+    ChildOffCIter  beginChildOff() const { return ChildOffCIter(mValueMask.endOff(), this); }
+    ChildOffIter   beginChildOff() { return ChildOffIter(mValueMask.endOff(), this); }
+    ChildAllCIter cbeginChildAll() const { return ChildAllCIter(mValueMask.beginDense(), this); }
+    ChildAllCIter  beginChildAll() const { return ChildAllCIter(mValueMask.beginDense(), this); }
+    ChildAllIter   beginChildAll() { return ChildAllIter(mValueMask.beginDense(), this); }
+
+    // End iterators for the child iterator types, built from the value mask's
+    // endOn()/endOff()/endDense().
+    ChildOnCIter  cendChildOn() const { return ChildOnCIter(mValueMask.endOn(), this); }
+    ChildOnCIter   endChildOn() const { return ChildOnCIter(mValueMask.endOn(), this); }
+    ChildOnIter    endChildOn() { return ChildOnIter(mValueMask.endOn(), this); }
+    ChildOffCIter cendChildOff() const { return ChildOffCIter(mValueMask.endOff(), this); }
+    ChildOffCIter  endChildOff() const { return ChildOffCIter(mValueMask.endOff(), this); }
+    ChildOffIter   endChildOff() { return ChildOffIter(mValueMask.endOff(), this); }
+    ChildAllCIter cendChildAll() const { return ChildAllCIter(mValueMask.endDense(), this); }
+    ChildAllCIter  endChildAll() const { return ChildAllCIter(mValueMask.endDense(), this); }
+    ChildAllIter   endChildAll() { return ChildAllIter(mValueMask.endDense(), this); }
+
+    //
+    // Buffer management
+    //
+    /// @brief Exchange this node's data buffer with the given data buffer
+    /// without changing the active states of the values.
+    void swap(Buffer& other) { mBuffer.swap(other); }
+    /// Return a const reference to this node's value buffer.
+    const Buffer& buffer() const { return mBuffer; }
+    /// Return a non-const reference to this node's value buffer.
+    Buffer& buffer() { return mBuffer; }
+
+    //
+    // I/O methods
+    //
+    /// @brief Read in just the topology.
+    /// @param is        the stream from which to read
+    /// @param fromHalf  if true, floating-point input values are assumed to be 16-bit
+    void readTopology(std::istream& is, bool fromHalf = false);
+    /// @brief Write out just the topology.
+    /// @param os      the stream to which to write
+    /// @param toHalf  if true, output floating-point values as 16-bit half floats
+    void writeTopology(std::ostream& os, bool toHalf = false) const;
+
+    /// @brief Read buffers from a stream.
+    /// @param is        the stream from which to read
+    /// @param fromHalf  if true, floating-point input values are assumed to be 16-bit
+    void readBuffers(std::istream& is, bool fromHalf = false);
+    /// @brief Read buffers that intersect the given bounding box.
+    /// @param is        the stream from which to read
+    /// @param bbox      an index-space bounding box
+    /// @param fromHalf  if true, floating-point input values are assumed to be 16-bit
+    void readBuffers(std::istream& is, const CoordBBox& bbox, bool fromHalf = false);
+    /// @brief Write buffers to a stream.
+    /// @param os      the stream to which to write
+    /// @param toHalf  if true, output floating-point values as 16-bit half floats
+    void writeBuffers(std::ostream& os, bool toHalf = false) const;
+
+    /// NOTE(review): undocumented; presumably the number of bytes this node
+    /// occupies when streamed (with @a toHalf selecting 16-bit half-float output,
+    /// as in writeBuffers()) -- confirm against the definition.
+    size_t streamingSize(bool toHalf = false) const;
+
+    //
+    // Accessor methods
+    //
+    /// Return the value of the voxel at the given coordinates.
+    const ValueType& getValue(const Coord& xyz) const;
+    /// Return the value of the voxel at the given linear offset.
+    const ValueType& getValue(Index offset) const;
+
+    /// @brief Return @c true if the voxel at the given coordinates is active.
+    /// @param xyz       the coordinates of the voxel to be probed
+    /// @param[out] val  the value of the voxel at the given coordinates
+    bool probeValue(const Coord& xyz, ValueType& val) const;
+    /// @brief Return @c true if the voxel at the given offset is active.
+    /// @param offset    the linear offset of the voxel to be probed
+    /// @param[out] val  the value of the voxel at the given coordinates
+    bool probeValue(Index offset, ValueType& val) const;
+
+    /// Return the level (i.e., 0) at which leaf node values reside.
+    static Index getValueLevel(const Coord&) { return LEVEL; }
+
+    /// Set the active state of the voxel at the given coordinates but don't change its value.
+    void setActiveState(const Coord& xyz, bool on);
+    /// @brief Set the active state of the voxel at the given offset
+    /// but don't change its value.
+    /// @param offset  linear offset of the voxel (asserted to be < SIZE)
+    /// @param on      the new active state
+    void setActiveState(Index offset, bool on)
+    {
+        assert(offset<SIZE);
+        mValueMask.set(offset, on);
+    }
+
+    /// Set the value of the voxel at the given coordinates but don't change its active state.
+    void setValueOnly(const Coord& xyz, const ValueType& val);
+    /// Set the value of the voxel at the given offset but don't change its active state.
+    void setValueOnly(Index offset, const ValueType& val);
+
+    /// Mark the voxel at the given coordinates as inactive but don't change its value.
+    void setValueOff(const Coord& xyz) { mValueMask.setOff(LeafNode::coordToOffset(xyz)); }
+    /// Mark the voxel at the given offset as inactive but don't change its value.
+    void setValueOff(Index offset) { assert(offset < SIZE); mValueMask.setOff(offset); }
+
+    /// Set the value of the voxel at the given coordinates and mark the voxel as inactive.
+    void setValueOff(const Coord& xyz, const ValueType& val);
+    /// Set the value of the voxel at the given offset and mark the voxel as inactive.
+    void setValueOff(Index offset, const ValueType& val);
+
+    /// Mark the voxel at the given coordinates as active but don't change its value.
+    void setValueOn(const Coord& xyz) { mValueMask.setOn(LeafNode::coordToOffset(xyz)); }
+    /// Mark the voxel at the given offset as active but don't change its value.
+    void setValueOn(Index offset) { assert(offset < SIZE); mValueMask.setOn(offset); }
+    /// @brief Set the value of the voxel at the given coordinates and mark the
+    /// voxel as active.
+    /// @param xyz  coordinates of the voxel, converted with coordToOffset()
+    /// @param val  the value to store
+    void setValueOn(const Coord& xyz, const ValueType& val)
+    {
+        const Index offset = LeafNode::coordToOffset(xyz);
+        this->setValueOn(offset, val);
+    }
+    /// Set the value of the voxel at the given coordinates and mark the voxel as active.
+    void setValue(const Coord& xyz, const ValueType& val) { this->setValueOn(xyz, val); }
+    /// @brief Set the value of the voxel at the given offset and mark the voxel
+    /// as active.
+    /// @param offset  linear offset of the voxel
+    /// @param val     the value to store
+    void setValueOn(Index offset, const ValueType& val)
+    {
+        // Write the value first, then flip the mask bit.
+        mBuffer.setValue(offset, val);
+        mValueMask.setOn(offset);
+    }
+
+    /// @brief Apply a functor to the value of the voxel at the given offset
+    /// and mark the voxel as active.
+    /// @param offset  linear offset of the voxel
+    /// @param op      functor invoked as @c op(value) on a copy of the voxel's
+    ///                value; the modified copy is then written back
+    template<typename ModifyOp>
+    void modifyValue(Index offset, const ModifyOp& op)
+    {
+        // Work on a local copy so the buffer is updated through setValue().
+        ValueType current = mBuffer[offset];
+        op(current);
+
+        mBuffer.setValue(offset, current);
+        mValueMask.setOn(offset);
+    }
+    /// @brief Apply a functor to the value of the voxel at the given coordinates
+    /// and mark the voxel as active.
+    /// @param xyz  coordinates of the voxel, converted with coordToOffset()
+    /// @param op   functor forwarded to the offset-based modifyValue()
+    template<typename ModifyOp>
+    void modifyValue(const Coord& xyz, const ModifyOp& op)
+    {
+        const Index offset = this->coordToOffset(xyz);
+        this->modifyValue(offset, op);
+    }
+
+    /// @brief Apply a functor to the voxel at the given coordinates.
+    /// @param xyz  coordinates of the voxel, converted with coordToOffset()
+    /// @param op   functor invoked as @c op(value, state) on copies of the voxel's
+    ///             value and active state; both are written back afterwards
+    template<typename ModifyOp>
+    void modifyValueAndActiveState(const Coord& xyz, const ModifyOp& op)
+    {
+        const Index n = this->coordToOffset(xyz);
+
+        // Snapshot the voxel, let the functor edit the snapshot...
+        bool active = mValueMask.isOn(n);
+        ValueType current = mBuffer[n];
+        op(current, active);
+
+        // ...then commit both the value and the active state.
+        mBuffer.setValue(n, current);
+        mValueMask.set(n, active);
+    }
+
+    /// Mark all voxels as active but don't change their values.
+    void setValuesOn() { mValueMask.setOn(); }
+    /// Mark all voxels as inactive but don't change their values.
+    void setValuesOff() { mValueMask.setOff(); }
+
+    /// Return @c true if the voxel at the given coordinates is active.
+    bool isValueOn(const Coord& xyz) const {return this->isValueOn(LeafNode::coordToOffset(xyz));}
+    /// Return @c true if the voxel at the given offset is active.
+    bool isValueOn(Index offset) const { return mValueMask.isOn(offset); }
+
+    /// Return @c false since leaf nodes never contain tiles.
+    static bool hasActiveTiles() { return false; }
+
+    /// Set all voxels that lie outside the given axis-aligned box to the background.
+    void clip(const CoordBBox&, const ValueType& background);
+
+    /// Set all voxels within an axis-aligned box to the specified value and active state.
+    void fill(const CoordBBox& bbox, const ValueType&, bool active = true);
+
+    /// Set all voxels to the specified value but don't change their active states.
+    void fill(const ValueType& value);
+    /// Set all voxels to the specified value and active state.
+    void fill(const ValueType& value, bool active);
+
+    /// @brief Copy into a dense grid the values of the voxels that lie within
+    /// a given bounding box.
+    ///
+    /// @param bbox   inclusive bounding box of the voxels to be copied into the dense grid
+    /// @param dense  dense grid with a stride in @e z of one (see tools::Dense
+    ///               in tools/Dense.h for the required API)
+    ///
+    /// @note @a bbox is assumed to be identical to or contained in the coordinate domains
+    /// of both the dense grid and this node, i.e., no bounds checking is performed.
+    /// @note Consider using tools::CopyToDense in tools/Dense.h
+    /// instead of calling this method directly.
+    template<typename DenseT>
+    void copyToDense(const CoordBBox& bbox, DenseT& dense) const;
+
+    /// @brief Copy from a dense grid into this node the values of the voxels
+    /// that lie within a given bounding box.
+    /// @details Only values that are different (by more than the given tolerance)
+    /// from the background value will be active.  Other values are inactive
+    /// and truncated to the background value.
+    ///
+    /// @param bbox        inclusive bounding box of the voxels to be copied into this node
+    /// @param dense       dense grid with a stride in @e z of one (see tools::Dense
+    ///                    in tools/Dense.h for the required API)
+    /// @param background  background value of the tree that this node belongs to
+    /// @param tolerance   tolerance within which a value equals the background value
+    ///
+    /// @note @a bbox is assumed to be identical to or contained in the coordinate domains
+    /// of both the dense grid and this node, i.e., no bounds checking is performed.
+    /// @note Consider using tools::CopyFromDense in tools/Dense.h
+    /// instead of calling this method directly.
+    template<typename DenseT>
+    void copyFromDense(const CoordBBox& bbox, const DenseT& dense,
+                       const ValueType& background, const ValueType& tolerance);
+
+    /// @brief Return the value of the voxel at the given coordinates.
+    /// @details The accessor argument is unused; the call forwards to getValue().
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    const ValueType& getValueAndCache(const Coord& xyz, AccessorT&) const
+    {
+        const ValueType& value = this->getValue(xyz);
+        return value;
+    }
+
+    /// @brief Return @c true if the voxel at the given coordinates is active.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    bool isValueOnAndCache(const Coord& xyz, AccessorT&) const { return this->isValueOn(xyz); }
+
+    /// @brief Change the value of the voxel at the given coordinates and mark it as active.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    void setValueAndCache(const Coord& xyz, const ValueType& val, AccessorT&)
+    {
+        this->setValueOn(xyz, val);
+    }
+
+    /// @brief Change the value of the voxel at the given coordinates
+    /// but preserve its state.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    void setValueOnlyAndCache(const Coord& xyz, const ValueType& val, AccessorT&)
+    {
+        this->setValueOnly(xyz, val);
+    }
+
+    /// @brief Apply a functor to the value of the voxel at the given coordinates
+    /// and mark the voxel as active.
+    /// @note Used internally by ValueAccessor.
+    template<typename ModifyOp, typename AccessorT>
+    void modifyValueAndCache(const Coord& xyz, const ModifyOp& op, AccessorT&)
+    {
+        this->modifyValue(xyz, op);
+    }
+
+    /// Apply a functor to the voxel at the given coordinates.
+    /// @note Used internally by ValueAccessor.
+    template<typename ModifyOp, typename AccessorT>
+    void modifyValueAndActiveStateAndCache(const Coord& xyz, const ModifyOp& op, AccessorT&)
+    {
+        this->modifyValueAndActiveState(xyz, op);
+    }
+
+    /// @brief Change the value of the voxel at the given coordinates and mark it as inactive.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    void setValueOffAndCache(const Coord& xyz, const ValueType& value, AccessorT&)
+    {
+        this->setValueOff(xyz, value);
+    }
+
+    /// @brief Set the active state of the voxel at the given coordinates
+    /// without changing its value.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    void setActiveStateAndCache(const Coord& xyz, bool on, AccessorT&)
+    {
+        this->setActiveState(xyz, on);
+    }
+
+    /// @brief Return @c true if the voxel at the given coordinates is active
+    /// and return the voxel value in @a val.
+    /// @details The accessor argument is unused; the call forwards to probeValue().
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    bool probeValueAndCache(const Coord& xyz, ValueType& val, AccessorT&) const
+    {
+        const bool active = this->probeValue(xyz, val);
+        return active;
+    }
+
+    /// @brief Return the value of the voxel at the given coordinates and report
+    /// its active state and level (i.e., 0) in @a state and @a level.
+    /// @param xyz         coordinates of the voxel, converted with coordToOffset()
+    /// @param[out] state  set to the voxel's active state
+    /// @param[out] level  set to LEVEL
+    /// @note Used internally by ValueAccessor; the accessor argument is unused.
+    template<typename AccessorT>
+    const ValueType& getValue(const Coord& xyz, bool& state, int& level, AccessorT&) const
+    {
+        level = LEVEL;
+        const Index n = this->coordToOffset(xyz);
+        state = mValueMask.isOn(n);
+        return mBuffer[n];
+    }
+
+    /// @brief Return the LEVEL (=0) at which leaf node values reside.
+    /// @note Used internally by ValueAccessor (both arguments are unused dummies).
+    template<typename AccessorT>
+    static Index getValueLevelAndCache(const Coord&, AccessorT&) { return LEVEL; }
+
+    /// @brief Return a const reference to the first value in the buffer.
+    /// @note Though it is potentially risky, you can obtain a non-const pointer
+    /// from this reference by means of const_cast<ValueType*>(&value).
+    const ValueType& getFirstValue() const { return mBuffer[0]; }
+    /// Return a const reference to the last value in the buffer.
+    const ValueType& getLastValue() const { return mBuffer[SIZE - 1]; }
+
+    /// @brief Replace inactive occurrences of @a oldBackground with @a newBackground,
+    /// and inactive occurrences of @a -oldBackground with @a -newBackground.
+    void resetBackground(const ValueType& oldBackground, const ValueType& newBackground);
+
+    void negate();
+
+    /// @brief No-op
+    /// @details This function exists only to enable template instantiation.
+    void voxelizeActiveTiles(bool = true) {} // empty body; the unnamed flag is ignored
+
+    template<MergePolicy Policy> void merge(const LeafNode&);
+    template<MergePolicy Policy> void merge(const ValueType& tileValue, bool tileActive);
+    template<MergePolicy Policy>
+    void merge(const LeafNode& other, const ValueType& /*bg*/, const ValueType& /*otherBG*/);
+
+    /// @brief Union this node's set of active values with the active values
+    /// of the other node, whose @c ValueType may be different. So a
+    /// resulting voxel will be active if either of the original voxels
+    /// were active.
+    ///
+    /// @note This operation modifies only active states, not values.
+    template<typename OtherType>
+    void topologyUnion(const LeafNode<OtherType, Log2Dim>& other);
+
+    /// @brief Intersect this node's set of active values with the active values
+    /// of the other node, whose @c ValueType may be different. So a
+    /// resulting voxel will be active only if both of the original voxels
+    /// were active.
+    ///
+    /// @details The last dummy argument is required to match the signature
+    /// for InternalNode::topologyIntersection.
+    ///
+    /// @note This operation modifies only active states, not
+    /// values. Also note that this operation can result in all voxels
+    /// being inactive, so consider subsequently calling prune.
+    template<typename OtherType>
+    void topologyIntersection(const LeafNode<OtherType, Log2Dim>& other, const ValueType&);
+
+    /// @brief Difference this node's set of active values with the active values
+    /// of the other node, whose @c ValueType may be different. So a
+    /// resulting voxel will be active only if the original voxel is
+    /// active in this LeafNode and inactive in the other LeafNode.
+    ///
+    /// @details The last dummy argument is required to match the signature
+    /// for InternalNode::topologyDifference.
+    ///
+    /// @note This operation modifies only active states, not values.
+    /// Also, because it can deactivate all of this node's voxels,
+    /// consider subsequently calling prune.
+    template<typename OtherType>
+    void topologyDifference(const LeafNode<OtherType, Log2Dim>& other, const ValueType&);
+
+    template<typename CombineOp>
+    void combine(const LeafNode& other, CombineOp& op);
+    template<typename CombineOp>
+    void combine(const ValueType& value, bool valueIsActive, CombineOp& op);
+
+    template<typename CombineOp, typename OtherType /*= ValueType*/>
+    void combine2(const LeafNode& other, const OtherType&, bool valueIsActive, CombineOp&);
+    template<typename CombineOp, typename OtherNodeT /*= LeafNode*/>
+    void combine2(const ValueType&, const OtherNodeT& other, bool valueIsActive, CombineOp&);
+    template<typename CombineOp, typename OtherNodeT /*= LeafNode*/>
+    void combine2(const LeafNode& b0, const OtherNodeT& b1, CombineOp&);
+
+    /// @brief Calls the templated functor BBoxOp with bounding box
+    /// information. An additional level argument is provided to the
+    /// callback.
+    ///
+    /// @note The bounding boxes are guaranteed to be non-overlapping.
+    template<typename BBoxOp> void visitActiveBBox(BBoxOp&) const;
+
+    template<typename VisitorOp> void visit(VisitorOp&);
+    template<typename VisitorOp> void visit(VisitorOp&) const;
+
+    template<typename OtherLeafNodeType, typename VisitorOp>
+    void visit2Node(OtherLeafNodeType& other, VisitorOp&);
+    template<typename OtherLeafNodeType, typename VisitorOp>
+    void visit2Node(OtherLeafNodeType& other, VisitorOp&) const;
+    template<typename IterT, typename VisitorOp>
+    void visit2(IterT& otherIter, VisitorOp&, bool otherIsLHS = false);
+    template<typename IterT, typename VisitorOp>
+    void visit2(IterT& otherIter, VisitorOp&, bool otherIsLHS = false) const;
+
+    //@{
+    /// Stub (empty body or NULL result) that exists only to enable template instantiation.
+    void prune(const ValueType& /*tolerance*/ = zeroVal<ValueType>()) {}
+    void addLeaf(LeafNode*) {}
+    template<typename AccessorT>
+    void addLeafAndCache(LeafNode*, AccessorT&) {}
+    template<typename NodeT>
+    NodeT* stealNode(const Coord&, const ValueType&, bool) { return NULL; } // always NULL
+    template<typename NodeT>
+    NodeT* probeNode(const Coord&) { return NULL; } // always NULL
+    template<typename NodeT>
+    const NodeT* probeConstNode(const Coord&) const { return NULL; } // always NULL
+    template<typename ArrayT> void getNodes(ArrayT&) const {} // the array is left untouched
+    template<typename ArrayT> void stealNodes(ArrayT&, const ValueType&, bool) {} // likewise
+    //@}
+
+    void addTile(Index level, const Coord&, const ValueType&, bool);
+    void addTile(Index offset, const ValueType&, bool);
+    template<typename AccessorT>
+    void addTileAndCache(Index, const Coord&, const ValueType&, bool, AccessorT&);
+
+    //@{
+    /// @brief Return a pointer to this node (probeNodeAndCache: NULL unless NodeT is LeafNode).
+    LeafNode* touchLeaf(const Coord&) { return this; }
+    template<typename AccessorT>
+    LeafNode* touchLeafAndCache(const Coord&, AccessorT&) { return this; }
+    template<typename NodeT, typename AccessorT>
+    NodeT* probeNodeAndCache(const Coord&, AccessorT&)
+    {
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if (!(boost::is_same<NodeT,LeafNode>::value)) return NULL; // some other node type
+        return reinterpret_cast<NodeT*>(this); // reached only when NodeT is LeafNode
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+    LeafNode* probeLeaf(const Coord&) { return this; }
+    template<typename AccessorT>
+    LeafNode* probeLeafAndCache(const Coord&, AccessorT&) { return this; }
+    //@}
+    //@{
+    /// @brief Return a @c const pointer to this node (NULL if NodeT is not LeafNode).
+    const LeafNode* probeConstLeaf(const Coord&) const { return this; }
+    template<typename AccessorT>
+    const LeafNode* probeConstLeafAndCache(const Coord&, AccessorT&) const { return this; }
+    template<typename AccessorT>
+    const LeafNode* probeLeafAndCache(const Coord&, AccessorT&) const { return this; }
+    const LeafNode* probeLeaf(const Coord&) const { return this; }
+    template<typename NodeT, typename AccessorT>
+    const NodeT* probeConstNodeAndCache(const Coord&, AccessorT&) const
+    {
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if (!(boost::is_same<NodeT,LeafNode>::value)) return NULL; // some other node type
+        return reinterpret_cast<const NodeT*>(this); // reached only when NodeT is LeafNode
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+    //@}
+
+    /// Return @c true if all of this node's values have the same active state
+    /// and are in the range this->getFirstValue() +/- @a tolerance.
+    ///
+    ///
+    /// @param constValue  Is updated with the first value of this leaf node.
+    /// @param state       Is updated with the state of all values IF method
+    ///                    returns @c true. Else the value is undefined!
+    /// @param tolerance   The tolerance used to determine if values are
+    ///                    approximately equal to the first value.
+    bool isConstant(ValueType& constValue, bool& state,
+                    const ValueType& tolerance = zeroVal<ValueType>()) const;
+
+    /// Return @c true if all of this node's values have the same active state
+    /// and are in the range (@a maxValue + @a minValue)/2 +/- @a tolerance.
+    ///
+    /// @param minValue  Is updated with the minimum of all values IF method
+    ///                  returns @c true. Else the value is undefined!
+    /// @param maxValue  Is updated with the maximum of all values IF method
+    ///                  returns @c true. Else the value is undefined!
+    /// @param state     Is updated with the state of all values IF method
+    ///                  returns @c true. Else the value is undefined!
+    /// @param tolerance The tolerance used to determine if values are
+    ///                  approximately constant.
+    bool isConstant(ValueType& minValue, ValueType& maxValue,
+                    bool& state, const ValueType& tolerance = zeroVal<ValueType>()) const;
+    
+    /// Return @c true if all of this node's values are inactive.
+    bool isInactive() const { return mValueMask.isOff(); }
+
+protected:
+    friend class ::TestLeaf;
+    template<typename> friend class ::TestLeafIO;
+
+    // During topology-only construction, access is needed
+    // to protected/private members of other template instances.
+    template<typename, Index> friend class LeafNode;
+
+    friend struct ValueIter<MaskOnIterator, LeafNode, ValueType, ValueOn>;
+    friend struct ValueIter<MaskOffIterator, LeafNode, ValueType, ValueOff>;
+    friend struct ValueIter<MaskDenseIterator, LeafNode, ValueType, ValueAll>;
+    friend struct ValueIter<MaskOnIterator, const LeafNode, ValueType, ValueOn>;
+    friend struct ValueIter<MaskOffIterator, const LeafNode, ValueType, ValueOff>;
+    friend struct ValueIter<MaskDenseIterator, const LeafNode, ValueType, ValueAll>;
+
+    // Allow iterators to call mask accessor methods (see below).
+    /// @todo Make mask accessors public?
+    friend class IteratorBase<MaskOnIterator, LeafNode>;
+    friend class IteratorBase<MaskOffIterator, LeafNode>;
+    friend class IteratorBase<MaskDenseIterator, LeafNode>;
+
+    // Mask accessors: thin wrappers around mValueMask; the child-mask queries return constants
+public:
+    bool isValueMaskOn(Index n) const { return mValueMask.isOn(n); }
+    bool isValueMaskOn() const { return mValueMask.isOn(); }
+    bool isValueMaskOff(Index n) const { return mValueMask.isOff(n); }
+    bool isValueMaskOff() const { return mValueMask.isOff(); } // same test as isInactive()
+    const NodeMaskType& getValueMask() const { return mValueMask; }
+    NodeMaskType& getValueMask() { return mValueMask; }
+    const NodeMaskType& valueMask() const { return mValueMask; }
+    void setValueMask(const NodeMaskType& mask) { mValueMask = mask; }
+    bool isChildMaskOn(Index) const { return false; } // leaf nodes have no children
+    bool isChildMaskOff(Index) const { return true; }
+    bool isChildMaskOff() const { return true; }
+protected:
+    void setValueMask(Index n, bool on) { mValueMask.set(n, on); }
+    void setValueMaskOn(Index n)  { mValueMask.setOn(n); }
+    void setValueMaskOff(Index n) { mValueMask.setOff(n); }
+
+    /// Overwrite @a xyz with the origin of the leaf node that contains the voxel at @a xyz.
+    static void evalNodeOrigin(Coord& xyz) { xyz &= ~(DIM - 1); }
+
+    template<typename NodeT, typename VisitorOp, typename ChildAllIterT>
+    static inline void doVisit(NodeT&, VisitorOp&);
+
+    template<typename NodeT, typename OtherNodeT, typename VisitorOp,
+             typename ChildAllIterT, typename OtherChildAllIterT>
+    static inline void doVisit2Node(NodeT& self, OtherNodeT& other, VisitorOp&);
+
+    template<typename NodeT, typename VisitorOp,
+             typename ChildAllIterT, typename OtherChildAllIterT>
+    static inline void doVisit2(NodeT& self, OtherChildAllIterT&, VisitorOp&, bool otherIsLHS);
+
+private:
+    /// Buffer containing the actual data values
+    Buffer mBuffer;
+    /// Bitmask that determines which voxels are active
+    NodeMaskType mValueMask;
+    /// Global grid index coordinates (x,y,z) of the local origin of this node
+    Coord mOrigin;
+}; // end of LeafNode class
+
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+template<typename T, Index Log2Dim>
+const T LeafNode<T, Log2Dim>::Buffer::sZero = zeroVal<T>(); // Buffer's static zero value
+#endif // OPENVDB_2_ABI_COMPATIBLE
+
+
+////////////////////////////////////////
+
+
+//@{
+/// Helper metafunction for LeafNode::SameConfiguration (an inner class, which can't be
+/// independently specialized): @c value is true only for a LeafNode<T2, Dim1> argument.
+template<Index Dim1, typename NodeT2>
+struct SameLeafConfig { static const bool value = false; }; // primary template: no match
+
+template<Index Dim1, typename T2>
+struct SameLeafConfig<Dim1, LeafNode<T2, Dim1> > { static const bool value = true; };
+//@}
+
+
+////////////////////////////////////////
+
+
+template<typename T, Index Log2Dim>
+inline
+LeafNode<T, Log2Dim>::LeafNode():
+    mValueMask(),// default is off, i.e. no voxel is active; mBuffer is default-constructed
+    mOrigin(0, 0, 0)
+{
+}
+
+
+template<typename T, Index Log2Dim>
+inline
+LeafNode<T, Log2Dim>::LeafNode(const Coord& xyz, const ValueType& val, bool active):
+    mBuffer(val), // buffer constructed from the single value val
+    mValueMask(active), // mask constructed from the single flag active
+    mOrigin(xyz & (~(DIM - 1))) // xyz with its (DIM - 1) bits cleared, cf. evalNodeOrigin()
+{
+}
+
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+template<typename T, Index Log2Dim>
+inline
+LeafNode<T, Log2Dim>::LeafNode(PartialCreate, const Coord& xyz, const ValueType& val, bool active):
+    mBuffer(PartialCreate(), val), // NOTE(review): presumably skips buffer allocation -- confirm
+    mValueMask(active),
+    mOrigin(xyz & (~(DIM - 1))) // same origin computation as the (xyz, val, active) constructor
+{
+}
+#endif // OPENVDB_2_ABI_COMPATIBLE
+
+
+template<typename T, Index Log2Dim>
+inline
+LeafNode<T, Log2Dim>::LeafNode(const LeafNode &other):
+    mBuffer(other.mBuffer), // member-wise copy: values, then active states, then origin
+    mValueMask(other.valueMask()),
+    mOrigin(other.mOrigin)
+{
+}
+
+
+// Converting copy constructor: the source leaf has the same Log2Dim but another ValueType.
+template<typename T, Index Log2Dim>
+template<typename OtherValueType>
+inline
+LeafNode<T, Log2Dim>::LeafNode(const LeafNode<OtherValueType, Log2Dim>& other):
+    mValueMask(other.valueMask()),
+    mOrigin(other.mOrigin)
+{
+    /// @todo Consider using a value conversion functor passed as an argument instead.
+    // Each voxel value goes through an explicit ValueType(...) conversion; the
+    // active states and the origin were already copied by the initializers above.
+    Index n = 0;
+    while (n < SIZE) {
+        mBuffer[n] = ValueType(other.mBuffer[n]);
+        ++n;
+    }
+}
+
+
+template<typename T, Index Log2Dim>
+template<typename OtherValueType>
+inline
+LeafNode<T, Log2Dim>::LeafNode(const LeafNode<OtherValueType, Log2Dim>& other,
+                               const ValueType& background, TopologyCopy):
+    mBuffer(background), // other's values are not copied; only its mask and origin are
+    mValueMask(other.valueMask()),
+    mOrigin(other.mOrigin)
+{
+}
+
+
+template<typename T, Index Log2Dim>
+template<typename OtherValueType>
+inline LeafNode<T, Log2Dim>::LeafNode(const LeafNode<OtherValueType, Log2Dim>& other,
+    const ValueType& offValue, const ValueType& onValue, TopologyCopy):
+    mValueMask(other.valueMask()),
+    mOrigin(other.mOrigin)
+{
+    for (Index n = 0; n != SIZE; ++n) { // active voxels get onValue, all others offValue
+        if (mValueMask.isOn(n)) mBuffer[n] = onValue;
+        else mBuffer[n] = offValue;
+    }
+}
+
+
+template<typename T, Index Log2Dim>
+inline
+LeafNode<T, Log2Dim>::~LeafNode()
+{ // intentionally empty: no explicit cleanup is performed here
+}
+
+
+template<typename T, Index Log2Dim>
+inline std::string LeafNode<T, Log2Dim>::str() const
+{
+    std::ostringstream text; // receives "LeafNode @<origin>: <buffer>"
+    text << "LeafNode @" << mOrigin;
+    text << ": " << mBuffer;
+    return text.str();
+}
+
+
+////////////////////////////////////////
+
+
+template<typename T, Index Log2Dim>
+inline Index LeafNode<T, Log2Dim>::coordToOffset(const Coord& xyz)
+{
+    // Keep only the (DIM - 1) bits of each component, then pack them in x:y:z order.
+    const Index x = xyz[0] & (DIM-1u), y = xyz[1] & (DIM-1u);
+    const Index z = xyz[2] & (DIM-1u);
+    assert(x < DIM && y < DIM && z < DIM);
+    return (x << 2*Log2Dim) + (y << Log2Dim) + z;
+}
+
+template<typename T, Index Log2Dim>
+inline Coord LeafNode<T, Log2Dim>::offsetToLocalCoord(Index n)
+{
+    assert(n<(1<< 3*Log2Dim));
+    // Unpack the three Log2Dim-bit fields of n: x is the highest, z the lowest.
+    const Index yz = n & ((1<<2*Log2Dim)-1);
+    Coord local;
+    local.setX(n >> 2*Log2Dim);
+    local.setY(yz >> Log2Dim);
+    local.setZ(yz & ((1<<Log2Dim)-1));
+    return local;
+}
+
+
+template<typename T, Index Log2Dim>
+inline Coord LeafNode<T, Log2Dim>::offsetToGlobalCoord(Index n) const
+{
+    Coord local = LeafNode::offsetToLocalCoord(n); // position relative to this leaf
+    return local + this->origin(); // shifted by the leaf's origin
+}
+
+
+////////////////////////////////////////
+
+
+template<typename ValueT, Index Log2Dim>
+inline const ValueT& LeafNode<ValueT, Log2Dim>::getValue(const Coord& xyz) const
+{
+    const Index voxel = LeafNode::coordToOffset(xyz); // linear index of xyz within this leaf
+    return this->getValue(voxel);
+}
+
+template<typename ValueT, Index Log2Dim>
+inline const ValueT&
+LeafNode<ValueT, Log2Dim>::getValue(Index offset) const
+{
+    assert(offset < SIZE); // the bounds check exists only where assert() is enabled
+    return mBuffer[offset]; // the active state is not consulted
+}
+
+
+template<typename T, Index Log2Dim>
+inline bool LeafNode<T, Log2Dim>::probeValue(const Coord& xyz, ValueType& val) const
+{
+    const Index voxel = LeafNode::coordToOffset(xyz); // linear index of xyz within this leaf
+    return this->probeValue(voxel, val);
+}
+
+template<typename T, Index Log2Dim>
+inline bool
+LeafNode<T, Log2Dim>::probeValue(Index offset, ValueType& val) const
+{
+    assert(offset < SIZE);
+    val = mBuffer[offset]; // val is written whether or not the voxel is active
+    return mValueMask.isOn(offset); // the return value reports the active state
+}
+
+
+template<typename T, Index Log2Dim>
+inline void LeafNode<T, Log2Dim>::setValueOff(const Coord& xyz, const ValueType& val)
+{
+    const Index voxel = LeafNode::coordToOffset(xyz); // linear index of xyz within this leaf
+    this->setValueOff(voxel, val);
+}
+
+template<typename T, Index Log2Dim>
+inline void
+LeafNode<T, Log2Dim>::setValueOff(Index offset, const ValueType& val)
+{
+    assert(offset < SIZE);
+    mBuffer.setValue(offset, val); // store the new value ...
+    mValueMask.setOff(offset); // ... and mark the voxel inactive
+}
+
+
+template<typename T, Index Log2Dim>
+inline void LeafNode<T, Log2Dim>::setActiveState(const Coord& xyz, bool on)
+{
+    const Index voxel = LeafNode::coordToOffset(xyz); // linear index of xyz within this leaf
+    mValueMask.set(voxel, on);
+}
+
+
+template<typename T, Index Log2Dim>
+inline void LeafNode<T, Log2Dim>::setValueOnly(const Coord& xyz, const ValueType& val)
+{
+    const Index voxel = LeafNode::coordToOffset(xyz); // linear index of xyz within this leaf
+    this->setValueOnly(voxel, val);
+}
+
+template<typename T, Index Log2Dim>
+inline void LeafNode<T, Log2Dim>::setValueOnly(Index offset, const ValueType& val)
+{
+    assert(offset<SIZE);
+    mBuffer.setValue(offset, val); // the value mask is not touched
+}
+
+
+////////////////////////////////////////
+
+
+/// Set every voxel of this leaf that lies outside @a clipBBox to an inactive @a background.
+template<typename T, Index Log2Dim>
+inline void LeafNode<T, Log2Dim>::clip(const CoordBBox& clipBBox, const T& background)
+{
+    CoordBBox nodeBBox = this->getNodeBoundingBox();
+    if (!clipBBox.hasOverlap(nodeBBox)) {
+        // This node lies completely outside the clipping region.  Fill it with the background.
+        this->fill(background, /*active=*/false);
+        return; // every voxel is now inactive background, so the pass below has nothing to do
+    }
+    // A node that lies completely inside the clipping region is left intact.
+    if (clipBBox.isInside(nodeBBox)) return;
+
+    // This node isn't completely contained inside the clipping region.
+    // Set any voxels that lie outside the region to the background value.
+
+    // Construct a boolean mask that is on inside the clipping region and off outside it.
+    NodeMaskType mask;
+    nodeBBox.intersect(clipBBox);
+    Coord xyz;
+    int &x = xyz.x(), &y = xyz.y(), &z = xyz.z();
+    for (x = nodeBBox.min().x(); x <= nodeBBox.max().x(); ++x) {
+        for (y = nodeBBox.min().y(); y <= nodeBBox.max().y(); ++y) {
+            for (z = nodeBBox.min().z(); z <= nodeBBox.max().z(); ++z) {
+                mask.setOn(static_cast<Index32>(this->coordToOffset(xyz)));
+            }
+        }
+    }
+
+    // Set voxels that lie in the inactive region of the mask (i.e., outside
+    // the clipping region) to the background value.
+    for (MaskOffIterator maskIter = mask.beginOff(); maskIter; ++maskIter) {
+        this->setValueOff(maskIter.pos(), background);
+    }
+}
+
+
+////////////////////////////////////////
+
+
+template<typename T, Index Log2Dim>
+inline void
+LeafNode<T, Log2Dim>::fill(const CoordBBox& bbox, const ValueType& value, bool active)
+{
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    if (!this->allocate()) return;
+#endif
+    const Coord &lo = bbox.min(), &hi = bbox.max(); // inclusive corners of the fill region
+    for (Int32 i = lo.x(); i <= hi.x(); ++i) {
+        const Index xPart = (i & (DIM-1u)) << 2*Log2Dim;
+        for (Int32 j = lo.y(); j <= hi.y(); ++j) {
+            const Index xyPart = xPart + ((j & (DIM-1u)) << Log2Dim);
+            for (Int32 k = lo.z(); k <= hi.z(); ++k) {
+                const Index n = xyPart + (k & (DIM-1u)); // linear offset of voxel (i, j, k)
+                mBuffer[n] = value;
+                mValueMask.set(n, active);
+            }
+        }
+    }
+}
+
+template<typename T, Index Log2Dim>
+inline void
+LeafNode<T, Log2Dim>::fill(const ValueType& value)
+{
+    mBuffer.fill(value); // values only; the value mask is not touched
+}
+
+template<typename T, Index Log2Dim>
+inline void
+LeafNode<T, Log2Dim>::fill(const ValueType& value, bool active)
+{
+    mBuffer.fill(value); // hand the one value to the whole buffer ...
+    mValueMask.set(active); // ... and the one flag to the whole mask
+}
+
+
+////////////////////////////////////////
+
+
+template<typename T, Index Log2Dim>
+template<typename DenseT>
+inline void
+LeafNode<T, Log2Dim>::copyToDense(const CoordBBox& bbox, DenseT& dense) const
+{
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    if (!this->isAllocated()) return; // nothing is copied from an unallocated buffer
+#endif
+
+    typedef typename DenseT::ValueType DenseValueType;
+
+    const size_t xStride = dense.xStride(), yStride = dense.yStride(), zStride = dense.zStride();
+    const Coord& min = dense.bbox().min(); // min corner of the dense grid's own bbox
+    DenseValueType* t0 = dense.data() + zStride * (bbox.min()[2] - min[2]); // target array
+    const T* s0 = &mBuffer[bbox.min()[2] & (DIM-1u)]; // source array
+    for (Int32 x = bbox.min()[0], ex = bbox.max()[0] + 1; x < ex; ++x) {
+        DenseValueType* t1 = t0 + xStride * (x - min[0]);
+        const T* s1 = s0 + ((x & (DIM-1u)) << 2*Log2Dim);
+        for (Int32 y = bbox.min()[1], ey = bbox.max()[1] + 1; y < ey; ++y) {
+            DenseValueType* t2 = t1 + yStride * (y - min[1]);
+            const T* s2 = s1 + ((y & (DIM-1u)) << Log2Dim);
+            for (Int32 z = bbox.min()[2], ez = bbox.max()[2] + 1; z < ez; ++z, t2 += zStride) {
+                *t2 = DenseValueType(*s2++); // convert one value; the source advances by one
+            }
+        }
+    }
+}
+
+
+template<typename T, Index Log2Dim>
+template<typename DenseT>
+inline void
+LeafNode<T, Log2Dim>::copyFromDense(const CoordBBox& bbox, const DenseT& dense,
+                                    const ValueType& background, const ValueType& tolerance)
+{
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    if (!this->allocate()) return;
+#endif
+
+    typedef typename DenseT::ValueType DenseValueType;
+
+    const size_t xStride = dense.xStride(), yStride = dense.yStride(), zStride = dense.zStride();
+    const Coord& min = dense.bbox().min(); // min corner of the dense grid's own bbox
+
+    const DenseValueType* s0 = dense.data() + zStride * (bbox.min()[2] - min[2]); // source
+    const Int32 n0 = bbox.min()[2] & (DIM-1u); // z part of the first target offset
+    for (Int32 x = bbox.min()[0], ex = bbox.max()[0]+1; x < ex; ++x) {
+        const DenseValueType* s1 = s0 + xStride * (x - min[0]);
+        const Int32 n1 = n0 + ((x & (DIM-1u)) << 2*LOG2DIM);
+        for (Int32 y = bbox.min()[1], ey = bbox.max()[1]+1; y < ey; ++y) {
+            const DenseValueType* s2 = s1 + yStride * (y - min[1]);
+            Int32 n2 = n1 + ((y & (DIM-1u)) << LOG2DIM);
+            for (Int32 z = bbox.min()[2], ez = bbox.max()[2]+1; z < ez; ++z, ++n2, s2 += zStride) {
+                if (math::isApproxEqual(background, ValueType(*s2), tolerance)) {
+                    mValueMask.setOff(n2); // background-like values become inactive ...
+                    mBuffer[n2] = background; // ... and are stored as the exact background
+                } else {
+                    mValueMask.setOn(n2);
+                    mBuffer[n2] = ValueType(*s2);
+                }
+            }
+        }
+    }
+}
+
+
+////////////////////////////////////////
+
+
+template<typename T, Index Log2Dim>
+inline void
+LeafNode<T, Log2Dim>::readTopology(std::istream& is, bool /*fromHalf*/)
+{
+    mValueMask.load(is); // only the value mask is read; the fromHalf flag is unused
+}
+
+
+template<typename T, Index Log2Dim>
+inline void
+LeafNode<T, Log2Dim>::writeTopology(std::ostream& os, bool /*toHalf*/) const
+{
+    mValueMask.save(os); // only the value mask is written; the toHalf flag is unused
+}
+
+
+////////////////////////////////////////
+
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+template<typename T, Index Log2Dim>
+inline void
+LeafNode<T, Log2Dim>::Buffer::doLoad() const
+{
+    if (!this->isOutOfCore()) return; // fast exit: there is nothing to load
+
+    Buffer* self = const_cast<Buffer*>(this); // this const method has to modify the buffer
+
+    // This lock will be contended at most once, after which this buffer
+    // will no longer be out-of-core.
+    tbb::spin_mutex::scoped_lock lock(self->mMutex);
+    if (!this->isOutOfCore()) return; // re-check now that the lock is held
+
+    boost::scoped_ptr<FileInfo> info(self->mFileInfo); // deleted when info leaves scope
+    assert(info.get() != NULL);
+    assert(info->mapping.get() != NULL);
+    assert(info->meta.get() != NULL);
+
+    /// @todo For now, we have to clear the mData pointer in order for allocate() to take effect.
+    self->mData = NULL;
+    self->allocate();
+
+    boost::shared_ptr<std::streambuf> buf = info->mapping->createBuffer();
+    std::istream is(buf.get());
+
+    io::setStreamMetadataPtr(is, info->meta, /*transfer=*/true);
+
+    NodeMaskType mask; // the mask is re-read from the file rather than taken from the node
+    is.seekg(info->maskpos);
+    mask.load(is);
+
+    is.seekg(info->bufpos);
+    io::readCompressedValues(is, self->mData, SIZE, mask, io::getHalfFloat(is));
+
+    self->setOutOfCore(false); // later calls take the fast exit at the top
+}
+#endif // OPENVDB_2_ABI_COMPATIBLE
+
+
+////////////////////////////////////////
+
+
+template<typename T, Index Log2Dim>
+inline void
+LeafNode<T,Log2Dim>::readBuffers(std::istream& is, bool fromHalf)
+{
+    this->readBuffers(is, CoordBBox::inf(), fromHalf); // inf(): presumably an unbounded clip box
+}
+
+
+template<typename T, Index Log2Dim>
+inline void
+LeafNode<T,Log2Dim>::readBuffers(std::istream& is, const CoordBBox& clipBBox, bool fromHalf)
+{
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    std::streamoff maskpos = is.tellg(); // where the value mask starts, kept for delayed loads
+#endif
+
+    // Read in the value mask.
+    mValueMask.load(is);
+
+    int8_t numBuffers = 1;
+    if (io::getFormatVersion(is) < OPENVDB_FILE_VERSION_NODE_MASK_COMPRESSION) {
+        // Read in the origin.
+        is.read(reinterpret_cast<char*>(&mOrigin), sizeof(Coord::ValueType) * 3);
+
+        // Read in the number of buffers, which should now always be one.
+        is.read(reinterpret_cast<char*>(&numBuffers), sizeof(int8_t));
+    }
+
+    CoordBBox nodeBBox = this->getNodeBoundingBox();
+    if (!clipBBox.hasOverlap(nodeBBox)) {
+        // This node lies completely outside the clipping region.
+        // Read and discard its voxel values.
+        Buffer temp;
+        io::readCompressedValues(is, temp.mData, SIZE, mValueMask, fromHalf);
+        mValueMask.setOff(); // the clipped-away node keeps no active voxels
+        mBuffer.setOutOfCore(false);
+    } else {
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+        // If this node lies completely inside the clipping region and it is being read
+        // from a memory-mapped file, delay loading of its buffer until the buffer
+        // is actually accessed.  (If this node requires clipping, its buffer
+        // must be accessed and therefore must be loaded.)
+        io::MappedFile::Ptr mappedFile = io::getMappedFilePtr(is);
+        const bool delayLoad = ((mappedFile.get() != NULL) && clipBBox.isInside(nodeBBox));
+
+        if (delayLoad) {
+            mBuffer.setOutOfCore(true);
+            mBuffer.mFileInfo = new FileInfo; // released later by Buffer::doLoad()
+            mBuffer.mFileInfo->bufpos = is.tellg();
+            mBuffer.mFileInfo->mapping = mappedFile;
+            // Save the offset to the value mask, because the in-memory copy
+            // might change before the value buffer gets read.
+            mBuffer.mFileInfo->maskpos = maskpos;
+
+            mBuffer.mFileInfo->meta = io::getStreamMetadataPtr(is);
+
+            // Read and discard voxel values.
+            Buffer temp;
+            io::readCompressedValues(is, temp.mData, SIZE, mValueMask, fromHalf);
+        } else {
+#endif
+            mBuffer.allocate();
+            io::readCompressedValues(is, mBuffer.mData, SIZE, mValueMask, fromHalf);
+            mBuffer.setOutOfCore(false);
+
+            // Get this tree's background value.
+            T background = zeroVal<T>(); // fallback when the stream carries no background
+            if (const void* bgPtr = io::getGridBackgroundValuePtr(is)) {
+                background = *static_cast<const T*>(bgPtr);
+            }
+            this->clip(clipBBox, background);
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+        }
+#endif
+    }
+
+    if (numBuffers > 1) {
+        // Read in and discard auxiliary buffers that were created with earlier
+        // versions of the library.  (Auxiliary buffers are not mask compressed.)
+        const bool zipped = io::getDataCompression(is) & io::COMPRESS_ZIP;
+        Buffer temp;
+        for (int i = 1; i < numBuffers; ++i) {
+            if (fromHalf) {
+                io::HalfReader<io::RealToHalf<T>::isReal, T>::read(is, temp.mData, SIZE, zipped);
+            } else {
+                io::readData<T>(is, temp.mData, SIZE, zipped);
+            }
+        }
+    }
+}
+
+
+template<typename T, Index Log2Dim>
+inline void
+LeafNode<T, Log2Dim>::writeBuffers(std::ostream& os, bool toHalf) const
+{
+    // Write out the value mask.
+    mValueMask.save(os);
+
+    mBuffer.loadValues(); // NOTE(review): presumably loads an out-of-core buffer first -- confirm
+
+    io::writeCompressedValues(os, mBuffer.mData, SIZE,
+        mValueMask, /*childMask=*/NodeMaskType(), toHalf);
+}
+
+
+////////////////////////////////////////
+
+
+template<typename T, Index Log2Dim>
+inline bool LeafNode<T, Log2Dim>::operator==(const LeafNode& other) const
+{
+    // Same short-circuit order as a chained &&: origin, then value mask, then buffer.
+    if (!(mOrigin == other.mOrigin)) return false;
+    if (!(mValueMask == other.valueMask())) return false;
+    return mBuffer == other.mBuffer;
+}
+
+
+template<typename T, Index Log2Dim>
+inline Index64 LeafNode<T, Log2Dim>::memUsage() const
+{
+    // sizeof(*this) captures alignment-related padding but already counts
+    // sizeof(mBuffer), so that part is swapped for the buffer's own figure.
+    const Index64 nodeOnly = sizeof(*this) - sizeof(mBuffer);
+    return nodeOnly + mBuffer.memUsage();
+}
+
+
+template<typename T, Index Log2Dim>
+inline void
+LeafNode<T, Log2Dim>::evalActiveBoundingBox(CoordBBox& bbox, bool visitVoxels) const
+{
+    CoordBBox leafBox = this->getNodeBoundingBox();
+    if (bbox.isInside(leafBox)) return; // bbox already encloses this whole leaf
+    ValueOnCIter it = this->cbeginValueOn();
+    if (!it) return; // no active values, so nothing to add
+    if (visitVoxels) { // shrink the box to the active voxels themselves
+        leafBox.reset();
+        for (; it; ++it) leafBox.expand(this->offsetToLocalCoord(it.pos()));
+        leafBox.translate(this->origin());
+    }
+    bbox.expand(leafBox);
+}
+
+
+template<typename T, Index Log2Dim>
+template<typename OtherType, Index OtherLog2Dim>
+inline bool
+LeafNode<T, Log2Dim>::hasSameTopology(const LeafNode<OtherType, OtherLog2Dim>* other) const
+{
+    assert(other); // a null pointer is a caller error
+    return (Log2Dim == OtherLog2Dim && mValueMask == other->getValueMask()); // values are ignored
+}
+
+
+template<typename T, Index Log2Dim>
+inline bool
+LeafNode<T, Log2Dim>::isConstant(ValueType& value, bool& state,
+                                 const ValueType& tolerance) const
+{
+    // The active states are uniform only if the mask is entirely on or entirely off.
+    state = mValueMask.isOn();
+    if (!state && !mValueMask.isOff()) return false;
+    value = mBuffer[0];
+    // Advance past every value that matches the first one to within the tolerance.
+    Index n = 1;
+    while (n < SIZE && math::isApproxEqual(mBuffer[n], value, tolerance)) ++n;
+    return n == SIZE;
+}
+
+template<typename T, Index Log2Dim>
+inline bool
+LeafNode<T, Log2Dim>::isConstant(ValueType& minValue, ValueType& maxValue,
+                                 bool& state, const ValueType& tolerance) const
+{
+    state = mValueMask.isOn();
+    if (!(state || mValueMask.isOff())) return false;// Are values neither active nor inactive?
+    // Track the running min/max and fail as soon as their spread exceeds 2 * tolerance.
+    const T range = 2 * tolerance;
+    minValue = maxValue = mBuffer[0];
+    for (Index i = 1; i < SIZE; ++i) {// early termination
+        const T& v = mBuffer[i];
+        if (v < minValue) {
+            if ((maxValue - v) > range) return false;
+            minValue = v; // new minimum, still within range of the maximum
+        } else if (v > maxValue) {
+            if ((v - minValue) > range) return false;
+            maxValue = v; // new maximum, still within range of the minimum
+        }
+    }
+    return true;
+}
+
+////////////////////////////////////////
+
+
+template<typename T, Index Log2Dim>
+inline void
+LeafNode<T, Log2Dim>::addTile(Index /*level*/, const Coord& xyz, const ValueType& val, bool active)
+{
+    this->addTile(this->coordToOffset(xyz), val, active); // level is ignored; one voxel is set
+}
+
+template<typename T, Index Log2Dim>
+inline void
+LeafNode<T, Log2Dim>::addTile(Index offset, const ValueType& val, bool active)
+{
+    assert(offset < SIZE); // offset must be a valid linear index into this leaf
+    setValueOnly(offset, val); // write the value of the single voxel at offset...
+    setActiveState(offset, active); // ...then write its active state
+}
+
+template<typename T, Index Log2Dim>
+template<typename AccessorT>
+inline void
+LeafNode<T, Log2Dim>::addTileAndCache(Index level, const Coord& xyz,
+    const ValueType& val, bool active, AccessorT&)
+{
+    this->addTile(level, xyz, val, active); // the unnamed accessor argument is not used
+}
+
+
+////////////////////////////////////////
+
+/// @brief Replace inactive values approximately equal to +/- @a oldBackground by +/- @a newBackground.
+template<typename T, Index Log2Dim>
+inline void
+LeafNode<T, Log2Dim>::resetBackground(const ValueType& oldBackground,
+                                      const ValueType& newBackground)
+{
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    if (!this->allocate()) return;
+#endif
+
+    // Visit only the voxels whose bit in the value mask is off.
+    typedef typename NodeMaskType::OffIterator OffIter;
+    for (OffIter it = this->mValueMask.beginOff(); it; ++it) {
+        ValueType& voxel = mBuffer[it.pos()];
+        if (math::isApproxEqual(voxel, oldBackground)) {
+            voxel = newBackground;
+        } else if (math::isApproxEqual(voxel, math::negative(oldBackground))) {
+            voxel = math::negative(newBackground);
+        }
+    }
+}
+
+/// @brief Copy each active voxel of @a other (value and active state) into this node
+/// wherever this node's voxel is inactive; nothing is copied for the MERGE_NODES policy.
+template<typename T, Index Log2Dim>
+template<MergePolicy Policy>
+inline void
+LeafNode<T, Log2Dim>::merge(const LeafNode& other)
+{
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    if (!this->allocate()) return;
+#endif
+
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+    if (Policy == MERGE_NODES) return; // this policy leaves voxel data untouched
+    typedef typename NodeMaskType::OnIterator OnIter;
+    for (OnIter src = other.valueMask().beginOn(); src; ++src) {
+        const Index pos = src.pos();
+        if (!mValueMask.isOff(pos)) continue; // keep voxels that are already active here
+        mBuffer[pos] = other.mBuffer[pos];
+        mValueMask.setOn(pos);
+    }
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+}
+
+template<typename T, Index Log2Dim>
+template<MergePolicy Policy>
+inline void
+LeafNode<T, Log2Dim>::merge(const LeafNode& other,
+    const ValueType& /*bg*/, const ValueType& /*otherBG*/)
+{
+    this->template merge<Policy>(other); // both background arguments are ignored
+}
+
+/// @brief If Policy is MERGE_ACTIVE_STATES_AND_NODES and the tile is active, set every
+/// inactive voxel to @a tileValue and mark it active.
+template<typename T, Index Log2Dim>
+template<MergePolicy Policy>
+inline void
+LeafNode<T, Log2Dim>::merge(const ValueType& tileValue, bool tileActive)
+{
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    if (!this->allocate()) return;
+#endif
+
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+    if (Policy != MERGE_ACTIVE_STATES_AND_NODES || !tileActive) return;
+    for (typename NodeMaskType::OffIterator it = mValueMask.beginOff(); it; ++it) {
+        const Index pos = it.pos();
+        mBuffer[pos] = tileValue;
+        mValueMask.setOn(pos);
+    }
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+}
+
+/// @brief OR the value mask of @a other into this node's value mask; values are not touched.
+template<typename T, Index Log2Dim>
+template<typename OtherType>
+inline void
+LeafNode<T, Log2Dim>::topologyUnion(const LeafNode<OtherType, Log2Dim>& other)
+{
+    mValueMask |= other.valueMask();
+}
+
+template<typename T, Index Log2Dim>
+template<typename OtherType>
+inline void
+LeafNode<T, Log2Dim>::topologyIntersection(const LeafNode<OtherType, Log2Dim>& other,
+                                           const ValueType&)
+{
+    mValueMask &= other.valueMask(); // the unnamed ValueType argument is ignored
+}
+
+template<typename T, Index Log2Dim>
+template<typename OtherType>
+inline void
+LeafNode<T, Log2Dim>::topologyDifference(const LeafNode<OtherType, Log2Dim>& other,
+                                         const ValueType&)
+{
+    mValueMask &= !other.valueMask(); // presumably operator! complements the mask; verify
+}
+
+/// @brief Replace every value in the buffer with its negation.
+/// @note All SIZE entries are negated regardless of the value mask, which is not modified.
+template<typename T, Index Log2Dim>
+inline void
+LeafNode<T, Log2Dim>::negate()
+{
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    if (!this->allocate()) return;
+#endif
+    for (Index n = 0; n != SIZE; ++n) mBuffer[n] = -mBuffer[n];
+}
+
+
+////////////////////////////////////////
+
+/// @brief Combine this node (A) with @a other (B) voxel by voxel via @a op; each result
+/// value overwrites this node's value and the result state overwrites its mask bit.
+template<typename T, Index Log2Dim>
+template<typename CombineOp>
+inline void
+LeafNode<T, Log2Dim>::combine(const LeafNode& other, CombineOp& op)
+{
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    if (!this->allocate()) return;
+#endif
+    CombineArgs<T> args;
+    for (Index n = 0; n < SIZE; ++n) {
+        args.setARef(mBuffer[n]).setAIsActive(mValueMask.isOn(n));
+        args.setBRef(other.mBuffer[n]).setBIsActive(other.valueMask().isOn(n));
+        args.setResultRef(mBuffer[n]);
+        op(args);
+        mValueMask.set(n, args.resultIsActive());
+    }
+}
+
+/// @brief Combine each voxel of this node (A) with the constant @a value (B) via @a op.
+/// The result value and state replace this voxel's value and mask bit.
+template<typename T, Index Log2Dim>
+template<typename CombineOp>
+inline void
+LeafNode<T, Log2Dim>::combine(const ValueType& value, bool valueIsActive, CombineOp& op)
+{
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    if (!this->allocate()) return;
+#endif
+    CombineArgs<T> args;
+    args.setBRef(value).setBIsActive(valueIsActive); // B is the same for every voxel
+    for (Index n = 0; n < SIZE; ++n) {
+        args.setARef(mBuffer[n]).setAIsActive(mValueMask.isOn(n)).setResultRef(mBuffer[n]);
+        op(args);
+        mValueMask.set(n, args.resultIsActive());
+    }
+}
+
+
+////////////////////////////////////////
+
+/// @brief Store in this node the result of @a op applied to each voxel of @a other (A) and @a value (B).
+template<typename T, Index Log2Dim>
+template<typename CombineOp, typename OtherType>
+inline void
+LeafNode<T, Log2Dim>::combine2(const LeafNode& other, const OtherType& value,
+    bool valueIsActive, CombineOp& op)
+{
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    if (!this->allocate()) return;
+#endif
+    CombineArgs<T, OtherType> args;
+    args.setBRef(value).setBIsActive(valueIsActive); // B is the same for every voxel
+    for (Index n = 0; n < SIZE; ++n) {
+        args.setARef(other.mBuffer[n]).setAIsActive(other.valueMask().isOn(n));
+        args.setResultRef(mBuffer[n]);
+        op(args);
+        mValueMask.set(n, args.resultIsActive());
+    }
+}
+
+/// @brief Store in this node the result of @a op applied to @a value (A) and each voxel of @a other (B).
+template<typename T, Index Log2Dim>
+template<typename CombineOp, typename OtherNodeT>
+inline void
+LeafNode<T, Log2Dim>::combine2(const ValueType& value, const OtherNodeT& other,
+    bool valueIsActive, CombineOp& op)
+{
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    if (!this->allocate()) return;
+#endif
+    CombineArgs<T, typename OtherNodeT::ValueType> args;
+    args.setARef(value).setAIsActive(valueIsActive); // A is the same for every voxel
+    for (Index n = 0; n < SIZE; ++n) {
+        args.setBRef(other.mBuffer[n]).setBIsActive(other.valueMask().isOn(n));
+        args.setResultRef(mBuffer[n]);
+        op(args);
+        mValueMask.set(n, args.resultIsActive());
+    }
+}
+
+/// @brief Store in this node the result of @a op applied voxel by voxel to @a b0 (A) and @a b1 (B).
+template<typename T, Index Log2Dim>
+template<typename CombineOp, typename OtherNodeT>
+inline void
+LeafNode<T, Log2Dim>::combine2(const LeafNode& b0, const OtherNodeT& b1, CombineOp& op)
+{
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    if (!this->allocate()) return;
+#endif
+    CombineArgs<T, typename OtherNodeT::ValueType> args;
+    for (Index n = 0; n < SIZE; ++n) {
+        // Provisional state (union of the inputs); overwritten below by the result state.
+        mValueMask.set(n, b0.valueMask().isOn(n) || b1.valueMask().isOn(n));
+        args.setARef(b0.mBuffer[n]).setAIsActive(b0.valueMask().isOn(n));
+        args.setBRef(b1.mBuffer[n]).setBIsActive(b1.valueMask().isOn(n));
+        args.setResultRef(mBuffer[n]);
+        op(args);
+        mValueMask.set(n, args.resultIsActive());
+    }
+}
+
+
+////////////////////////////////////////
+
+/// @brief Feed @a op a unit cube per active voxel, or the node bbox if op declines to descend.
+template<typename T, Index Log2Dim>
+template<typename BBoxOp>
+inline void
+LeafNode<T, Log2Dim>::visitActiveBBox(BBoxOp& op) const
+{
+    if (!op.template descent<LEVEL>()) {
+#ifdef _MSC_VER
+        op.operator()<LEVEL>(this->getNodeBoundingBox());
+#else
+        op.template operator()<LEVEL>(this->getNodeBoundingBox());
+#endif
+        return;
+    }
+    for (ValueOnCIter it = this->cbeginValueOn(); it; ++it) {
+#ifdef _MSC_VER
+        op.operator()<LEVEL>(CoordBBox::createCube(it.getCoord(), 1));
+#else
+        op.template operator()<LEVEL>(CoordBBox::createCube(it.getCoord(), 1));
+#endif
+    }
+}
+
+/// @brief Run @a op over this node through doVisit(), iterating with ChildAllIter.
+template<typename T, Index Log2Dim>
+template<typename VisitorOp>
+inline void
+LeafNode<T, Log2Dim>::visit(VisitorOp& op)
+{
+    doVisit<LeafNode, VisitorOp, ChildAllIter>(*this, op);
+}
+
+/// @brief Const overload: run @a op through doVisit(), iterating with ChildAllCIter.
+template<typename T, Index Log2Dim>
+template<typename VisitorOp>
+inline void
+LeafNode<T, Log2Dim>::visit(VisitorOp& op) const
+{
+    doVisit<const LeafNode, VisitorOp, ChildAllCIter>(*this, op);
+}
+
+/// @brief Shared implementation of the const and non-const visit() methods:
+/// call @a op on every position of a ChildAllIterT iteration over @a self.
+template<typename T, Index Log2Dim>
+template<typename NodeT, typename VisitorOp, typename ChildAllIterT>
+inline void
+LeafNode<T, Log2Dim>::doVisit(NodeT& self, VisitorOp& op)
+{
+    ChildAllIterT pos = self.beginChildAll();
+    for ( ; pos; ++pos) op(pos);
+}
+
+
+////////////////////////////////////////
+
+/// @brief Visit this node and @a other together through doVisit2Node() with ChildAllIter iterators.
+template<typename T, Index Log2Dim>
+template<typename OtherLeafNodeType, typename VisitorOp>
+inline void
+LeafNode<T, Log2Dim>::visit2Node(OtherLeafNodeType& other, VisitorOp& op)
+{
+    doVisit2Node<LeafNode, OtherLeafNodeType, VisitorOp, ChildAllIter,
+        typename OtherLeafNodeType::ChildAllIter>(*this, other, op);
+}
+
+/// @brief Const overload: visit both nodes through doVisit2Node() with ChildAllCIter iterators.
+template<typename T, Index Log2Dim>
+template<typename OtherLeafNodeType, typename VisitorOp>
+inline void
+LeafNode<T, Log2Dim>::visit2Node(OtherLeafNodeType& other, VisitorOp& op) const
+{
+    doVisit2Node<const LeafNode, OtherLeafNodeType, VisitorOp, ChildAllCIter,
+        typename OtherLeafNodeType::ChildAllCIter>(*this, other, op);
+}
+
+/// @brief Step through @a self and @a other in lockstep, calling @a op on each iterator pair.
+template<typename T, Index Log2Dim>
+template<
+    typename NodeT,
+    typename OtherNodeT,
+    typename VisitorOp,
+    typename ChildAllIterT,
+    typename OtherChildAllIterT>
+inline void
+LeafNode<T, Log2Dim>::doVisit2Node(NodeT& self, OtherNodeT& other, VisitorOp& op)
+{
+    // The value types of the two nodes may differ, but their size and level must match.
+    BOOST_STATIC_ASSERT(OtherNodeT::SIZE == NodeT::SIZE);
+    BOOST_STATIC_ASSERT(OtherNodeT::LEVEL == NodeT::LEVEL);
+
+    ChildAllIterT lhs = self.beginChildAll();
+    OtherChildAllIterT rhs = other.beginChildAll();
+    while (lhs && rhs) {
+        op(lhs, rhs);
+        ++lhs; ++rhs;
+    }
+}
+
+
+////////////////////////////////////////
+
+
+/// @brief Pair every ChildAllIter position of this node with @a otherIter via doVisit2().
+template<typename T, Index Log2Dim>
+template<typename IterT, typename VisitorOp>
+inline void
+LeafNode<T, Log2Dim>::visit2(IterT& otherIter, VisitorOp& op, bool otherIsLHS)
+{
+    doVisit2<LeafNode, VisitorOp, ChildAllIter, IterT>(*this, otherIter, op, otherIsLHS);
+}
+
+
+/// @brief Const overload: pair every ChildAllCIter position with @a otherIter via doVisit2().
+template<typename T, Index Log2Dim>
+template<typename IterT, typename VisitorOp>
+inline void
+LeafNode<T, Log2Dim>::visit2(IterT& otherIter, VisitorOp& op, bool otherIsLHS) const
+{
+    doVisit2<const LeafNode, VisitorOp, ChildAllCIter, IterT>(*this, otherIter, op, otherIsLHS);
+}
+
+/// @brief Call @a op once per ChildAllIterT position of @a self, paired with @a otherIter.
+/// Nothing is visited if @a otherIter tests false; @a otherIsLHS selects the argument order.
+template<typename T, Index Log2Dim>
+template<
+    typename NodeT,
+    typename VisitorOp,
+    typename ChildAllIterT,
+    typename OtherChildAllIterT>
+inline void
+LeafNode<T, Log2Dim>::doVisit2(NodeT& self, OtherChildAllIterT& otherIter,
+    VisitorOp& op, bool otherIsLHS)
+{
+    if (!otherIter) return;
+
+    ChildAllIterT iter = self.beginChildAll();
+    if (otherIsLHS) {
+        // The other iterator is passed as the visitor's first argument.
+        for ( ; iter; ++iter) op(otherIter, iter);
+    } else {
+        // This node's iterator is passed as the visitor's first argument.
+        for ( ; iter; ++iter) op(iter, otherIter);
+    }
+}
+
+
+////////////////////////////////////////
+
+/// @brief Print each element followed by ", ". NOTE(review): T looks non-deducible here; confirm.
+template<typename T, Index Log2Dim>
+inline std::ostream&
+operator<<(std::ostream& os, const typename LeafNode<T, Log2Dim>::Buffer& buf)
+{
+    for (Index32 i = 0, N = buf.size(); i < N; ++i) os << buf.mData[i] << ", ";
+    return os;
+}
+
+} // namespace tree
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+
+////////////////////////////////////////
+
+
+// Specialization for LeafNodes of type bool
+#include "LeafNodeBool.h"
+
+// Specialization for LeafNodes with mask information only
+#include "LeafNodeMask.h"
+
+#endif // OPENVDB_TREE_LEAFNODE_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tree/LeafNodeBool.h b/nuparu/include/openvdb_new/tree/LeafNodeBool.h
new file mode 100644
index 00000000..8591b6c1
--- /dev/null
+++ b/nuparu/include/openvdb_new/tree/LeafNodeBool.h
@@ -0,0 +1,1773 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_TREE_LEAF_NODE_BOOL_HAS_BEEN_INCLUDED
+#define OPENVDB_TREE_LEAF_NODE_BOOL_HAS_BEEN_INCLUDED
+
+#include <iostream>
+#include <boost/shared_ptr.hpp>
+#include <boost/shared_array.hpp>
+#include <boost/static_assert.hpp>
+#include <openvdb/Types.h>
+#include <openvdb/io/Compression.h> // for io::readData(), etc.
+#include <openvdb/math/Math.h> // for math::isZero()
+#include <openvdb/util/NodeMasks.h>
+#include "LeafNode.h"
+#include "Iterator.h"
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tree {
+
+/// @brief LeafNode specialization for values of type bool that stores both
+/// the active states and the values of (2^Log2Dim)^3 voxels as bit masks
+template<Index Log2Dim>
+class LeafNode<bool, Log2Dim>
+{
+public:
+    typedef LeafNode<bool, Log2Dim>         LeafNodeType;
+    typedef boost::shared_ptr<LeafNodeType> Ptr;
+    typedef bool                            BuildType; 
+    typedef bool                            ValueType;
+    typedef util::NodeMask<Log2Dim>         NodeMaskType;
+
+    // These static declarations must be on separate lines to avoid VC9 compiler errors.
+    static const Index LOG2DIM    = Log2Dim;    // needed by parent nodes
+    static const Index TOTAL      = Log2Dim;    // needed by parent nodes
+    static const Index DIM        = 1 << TOTAL; // dimension along one coordinate direction
+    static const Index NUM_VALUES = 1 << 3 * Log2Dim;
+    static const Index NUM_VOXELS = NUM_VALUES; // total number of voxels represented by this node
+    static const Index SIZE       = NUM_VALUES;
+    static const Index LEVEL      = 0;          // level 0 = leaf
+
+    /// @brief ValueConverter<T>::Type is the type of a LeafNode having the same
+    /// dimensions as this node but a different value type, T.
+    template<typename ValueType>
+    struct ValueConverter { typedef LeafNode<ValueType, Log2Dim> Type; };
+
+    /// @brief SameConfiguration<OtherNodeType>::value is @c true if and only if
+    /// OtherNodeType is the type of a LeafNode with the same dimensions as this node.
+    template<typename OtherNodeType>
+    struct SameConfiguration {
+        static const bool value = SameLeafConfig<LOG2DIM, OtherNodeType>::value;
+    };
+
+    /// @brief Voxel-value storage for this leaf: one bit per voxel, kept in a NodeMaskType.
+    class Buffer
+    {
+    public:
+        typedef typename NodeMaskType::Word WordType;
+        static const Index WORD_COUNT = NodeMaskType::WORD_COUNT;
+        Buffer() {}
+        Buffer(bool on) : mData(on) {}
+        Buffer(const NodeMaskType& other): mData(other) {}
+        Buffer(const Buffer& other): mData(other.mData) {}
+        ~Buffer() {}
+        void fill(bool val) { mData.set(val); }
+        Buffer& operator=(const Buffer& b) { if (&b != this) { mData = b.mData; } return *this; }
+
+        const bool& getValue(Index i) const
+        {
+            assert(i < SIZE);
+            // We can't use the ternary operator here, otherwise Visual C++ returns
+            // a reference to a temporary.
+            if (mData.isOn(i)) return LeafNode::sOn; else return LeafNode::sOff;
+        }
+        const bool& operator[](Index i) const { return this->getValue(i); }
+
+        bool operator==(const Buffer& other) const { return mData == other.mData; }
+        bool operator!=(const Buffer& other) const { return mData != other.mData; }
+
+        void setValue(Index i, bool val) { assert(i < SIZE); mData.set(i, val); }
+
+        void swap(Buffer& other) { if (&other != this) std::swap(mData, other.mData); }
+
+        Index memUsage() const { return mData.memUsage(); }
+        static Index size() { return SIZE; }
+
+        /// Return a pointer to the C-style array of words encoding the bits.
+        /// @warning This method should only be used by experts that
+        /// seek low-level optimizations.
+        WordType* data()
+        {
+            return &(mData.template getWord<WordType>(0));
+        }
+        /// Return a const pointer to the C-style array of words
+        /// encoding the bits.
+        /// @warning This method should only be used by experts that
+        /// seek low-level optimizations.
+        const WordType* data() const
+        {
+            return const_cast<Buffer*>(this)->data();
+        }
+
+    private:
+        friend class ::TestLeaf;
+        // Allow the parent LeafNode to access this Buffer's bit mask.
+        friend class LeafNode;
+
+        NodeMaskType mData;
+    }; // class Buffer
+
+
+    /// Default constructor
+    LeafNode();
+
+    /// Constructor
+    /// @param xyz     the coordinates of a voxel that lies within the node
+    /// @param value   the initial value for all of this node's voxels
+    /// @param active  the active state to which to initialize all voxels
+    explicit LeafNode(const Coord& xyz, bool value = false, bool active = false);
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    /// "Partial creation" constructor used during file input
+    LeafNode(PartialCreate, const Coord& xyz, bool value = false, bool active = false);
+#endif
+
+    /// Deep copy constructor
+    LeafNode(const LeafNode&);
+
+    /// Value conversion copy constructor
+    template<typename OtherValueType>
+    explicit LeafNode(const LeafNode<OtherValueType, Log2Dim>& other);
+
+    /// Topology copy constructor
+    template<typename ValueType>
+    LeafNode(const LeafNode<ValueType, Log2Dim>& other, TopologyCopy);
+
+    //@{
+    /// @brief Topology copy constructor
+    /// @note This variant exists mainly to enable template instantiation.
+    template<typename ValueType>
+    LeafNode(const LeafNode<ValueType, Log2Dim>& other, bool offValue, bool onValue, TopologyCopy);
+    template<typename ValueType>
+    LeafNode(const LeafNode<ValueType, Log2Dim>& other, bool background, TopologyCopy);
+    //@}
+
+    /// Destructor
+    ~LeafNode();
+
+    //
+    // Statistics
+    //
+    /// Return log2 of the size of the buffer storage.
+    static Index log2dim() { return Log2Dim; }
+    /// Return the number of voxels in each dimension.
+    static Index dim() { return DIM; }
+    static Index size() { return SIZE; }
+    static Index numValues() { return SIZE; }
+    static Index getLevel() { return LEVEL; }
+    static void getNodeLog2Dims(std::vector<Index>& dims) { dims.push_back(Log2Dim); }
+    static Index getChildDim() { return 1; }
+
+    static Index32 leafCount() { return 1; }
+    static Index32 nonLeafCount() { return 0; }
+
+    /// Return the number of active voxels.
+    Index64 onVoxelCount() const { return mValueMask.countOn(); }
+    /// Return the number of inactive voxels.
+    Index64 offVoxelCount() const { return mValueMask.countOff(); }
+    Index64 onLeafVoxelCount() const { return onVoxelCount(); }
+    Index64 offLeafVoxelCount() const { return offVoxelCount(); }
+    static Index64 onTileCount()  { return 0; }
+    static Index64 offTileCount() { return 0; }
+
+    /// Return @c true if this node has no active voxels.
+    bool isEmpty() const { return mValueMask.isOff(); }
+    /// Return @c true if this node only contains active voxels.
+    bool isDense() const { return mValueMask.isOn(); }
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    /// @brief Return @c true if memory for this node's buffer has been allocated.
+    /// @details Currently, boolean leaf nodes don't support partial creation,
+    /// so this always returns @c true.
+    bool isAllocated() const { return true; }
+    /// @brief Allocate memory for this node's buffer if it has not already been allocated.
+    /// @details Currently, boolean leaf nodes don't support partial creation,
+    /// so this has no effect.
+    bool allocate() { return true; }
+#endif
+
+    /// Return the memory in bytes occupied by this node.
+    Index64 memUsage() const;
+
+    /// Expand the given bounding box so that it includes this leaf node's active voxels.
+    /// If visitVoxels is false this LeafNode will be approximated as dense, i.e. with all
+    /// voxels active. Else the individual active voxels are visited to produce a tight bbox.
+    void evalActiveBoundingBox(CoordBBox& bbox, bool visitVoxels = true) const;
+
+    /// @brief Return the bounding box of this node, i.e., the full index space
+    /// spanned by this leaf node.
+    CoordBBox getNodeBoundingBox() const { return CoordBBox::createCube(mOrigin, DIM); }
+
+    /// Set the grid index coordinates of this node's local origin.
+    void setOrigin(const Coord& origin) { mOrigin = origin; }
+    //@{
+    /// Return the grid index coordinates of this node's local origin.
+    const Coord& origin() const { return mOrigin; }
+    void getOrigin(Coord& origin) const { origin = mOrigin; }
+    void getOrigin(Int32& x, Int32& y, Int32& z) const { mOrigin.asXYZ(x, y, z); }
+    //@}
+
+    /// Return the linear table offset of the given global or local coordinates.
+    static Index coordToOffset(const Coord& xyz);
+    /// @brief Return the local coordinates for a linear table offset,
+    /// where offset 0 has coordinates (0, 0, 0).
+    static Coord offsetToLocalCoord(Index n);
+    /// Return the global coordinates for a linear table offset.
+    Coord offsetToGlobalCoord(Index n) const;
+
+    /// Return a string representation of this node.
+    std::string str() const;
+
+    /// @brief Return @c true if the given node (which may have a different @c ValueType
+    /// than this node) has the same active value topology as this node.
+    template<typename OtherType, Index OtherLog2Dim>
+    bool hasSameTopology(const LeafNode<OtherType, OtherLog2Dim>* other) const;
+
+    /// Check for buffer equivalence by value.
+    bool operator==(const LeafNode&) const;
+    bool operator!=(const LeafNode&) const;
+
+    //
+    // Buffer management
+    //
+    /// @brief Exchange this node's data buffer with the given data buffer
+    /// without changing the active states of the values.
+    void swap(Buffer& other) { mBuffer.swap(other); }
+    const Buffer& buffer() const { return mBuffer; }
+    Buffer& buffer() { return mBuffer; }
+
+    //
+    // I/O methods
+    //
+    /// Read in just the topology.
+    void readTopology(std::istream&, bool fromHalf = false);
+    /// Write out just the topology.
+    void writeTopology(std::ostream&, bool toHalf = false) const;
+
+    /// Read in the topology and the origin.
+    void readBuffers(std::istream&, bool fromHalf = false);
+    void readBuffers(std::istream& is, const CoordBBox&, bool fromHalf = false);
+    /// Write out the topology and the origin.
+    void writeBuffers(std::ostream&, bool toHalf = false) const;
+
+    //
+    // Accessor methods
+    //
+    /// Return the value of the voxel at the given coordinates.
+    const bool& getValue(const Coord& xyz) const;
+    /// Return the value of the voxel at the given offset.
+    const bool& getValue(Index offset) const;
+
+    /// @brief Return @c true if the voxel at the given coordinates is active.
+    /// @param xyz       the coordinates of the voxel to be probed
+    /// @param[out] val  the value of the voxel at the given coordinates
+    bool probeValue(const Coord& xyz, bool& val) const;
+
+    /// Return the level (0) at which leaf node values reside.
+    static Index getValueLevel(const Coord&) { return LEVEL; }
+
+    /// Set the active state of the voxel at the given coordinates but don't change its value.
+    void setActiveState(const Coord& xyz, bool on);
+    /// Set the active state of the voxel at the given offset but don't change its value.
+    void setActiveState(Index offset, bool on) { assert(offset<SIZE); mValueMask.set(offset, on); }
+
+    /// Set the value of the voxel at the given coordinates but don't change its active state.
+    void setValueOnly(const Coord& xyz, bool val);
+    /// Set the value of the voxel at the given offset but don't change its active state.
+    void setValueOnly(Index offset, bool val) { assert(offset<SIZE); mBuffer.setValue(offset,val); }
+
+    /// Mark the voxel at the given coordinates as inactive but don't change its value.
+    void setValueOff(const Coord& xyz) { mValueMask.setOff(this->coordToOffset(xyz)); }
+    /// Mark the voxel at the given offset as inactive but don't change its value.
+    void setValueOff(Index offset) { assert(offset < SIZE); mValueMask.setOff(offset); }
+
+    /// Set the value of the voxel at the given coordinates and mark the voxel as inactive.
+    void setValueOff(const Coord& xyz, bool val);
+    /// Set the value of the voxel at the given offset and mark the voxel as inactive.
+    void setValueOff(Index offset, bool val);
+
+    /// Mark the voxel at the given coordinates as active but don't change its value.
+    void setValueOn(const Coord& xyz) { mValueMask.setOn(this->coordToOffset(xyz)); }
+    /// Mark the voxel at the given offset as active but don't change its value.
+    void setValueOn(Index offset) { assert(offset < SIZE); mValueMask.setOn(offset); }
+
+    /// Set the value of the voxel at the given coordinates and mark the voxel as active.
+    void setValueOn(const Coord& xyz, bool val);
+    /// Set the value of the voxel at the given coordinates and mark the voxel as active.
+    void setValue(const Coord& xyz, bool val) { this->setValueOn(xyz, val); }
+    /// Set the value of the voxel at the given offset and mark the voxel as active.
+    void setValueOn(Index offset, bool val);
+
+    /// @brief Apply a functor to the value of the voxel at the given offset
+    /// and mark the voxel as active.
+    template<typename ModifyOp>
+    void modifyValue(Index offset, const ModifyOp& op);
+    /// @brief Apply a functor to the value of the voxel at the given coordinates
+    /// and mark the voxel as active.
+    template<typename ModifyOp>
+    void modifyValue(const Coord& xyz, const ModifyOp& op);
+
+    /// Apply a functor to the voxel at the given coordinates.
+    template<typename ModifyOp>
+    void modifyValueAndActiveState(const Coord& xyz, const ModifyOp& op);
+
+    /// Mark all voxels as active but don't change their values.
+    void setValuesOn() { mValueMask.setOn(); }
+    /// Mark all voxels as inactive but don't change their values.
+    void setValuesOff() { mValueMask.setOff(); }
+
+    /// Return @c true if the voxel at the given coordinates is active.
+    bool isValueOn(const Coord& xyz) const { return mValueMask.isOn(this->coordToOffset(xyz)); }
+    /// Return @c true if the voxel at the given offset is active.
+    bool isValueOn(Index offset) const { assert(offset < SIZE); return mValueMask.isOn(offset); }
+
+    /// Return @c false since leaf nodes never contain tiles.
+    static bool hasActiveTiles() { return false; }
+
+    /// Set all voxels that lie outside the given axis-aligned box to the background.
+    void clip(const CoordBBox&, bool background);
+
+    /// Set all voxels within an axis-aligned box to the specified value and active state.
+    void fill(const CoordBBox& bbox, bool value, bool active = true);
+
+    /// Set all voxels to the specified value but don't change their active states.
+    void fill(const bool& value);
+    /// Set all voxels to the specified value and active state.
+    void fill(const bool& value, bool active);
+
+    /// @brief Copy into a dense grid the values of the voxels that lie within
+    /// a given bounding box.
+    ///
+    /// @param bbox   inclusive bounding box of the voxels to be copied into the dense grid
+    /// @param dense  dense grid with a stride in @e z of one (see tools::Dense
+    ///               in tools/Dense.h for the required API)
+    ///
+    /// @note @a bbox is assumed to be identical to or contained in the coordinate domains
+    /// of both the dense grid and this node, i.e., no bounds checking is performed.
+    /// @note Consider using tools::CopyToDense in tools/Dense.h
+    /// instead of calling this method directly.
+    template<typename DenseT>
+    void copyToDense(const CoordBBox& bbox, DenseT& dense) const;
+
+    /// @brief Copy from a dense grid into this node the values of the voxels
+    /// that lie within a given bounding box.
+    /// @details Only values that are different (by more than the given tolerance)
+    /// from the background value will be active.  Other values are inactive
+    /// and truncated to the background value.
+    ///
+    /// @param bbox        inclusive bounding box of the voxels to be copied into this node
+    /// @param dense       dense grid with a stride in @e z of one (see tools::Dense
+    ///                    in tools/Dense.h for the required API)
+    /// @param background  background value of the tree that this node belongs to
+    /// @param tolerance   tolerance within which a value equals the background value
+    ///
+    /// @note @a bbox is assumed to be identical to or contained in the coordinate domains
+    /// of both the dense grid and this node, i.e., no bounds checking is performed.
+    /// @note Consider using tools::CopyFromDense in tools/Dense.h
+    /// instead of calling this method directly.
+    template<typename DenseT>
+    void copyFromDense(const CoordBBox& bbox, const DenseT& dense, bool background, bool tolerance);
+
+    /// @brief Return the value of the voxel at the given coordinates.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    const bool& getValueAndCache(const Coord& xyz, AccessorT&) const {return this->getValue(xyz);}
+
+    /// @brief Return @c true if the voxel at the given coordinates is active.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    bool isValueOnAndCache(const Coord& xyz, AccessorT&) const { return this->isValueOn(xyz); }
+
+    /// @brief Change the value of the voxel at the given coordinates and mark it as active.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    void setValueAndCache(const Coord& xyz, bool val, AccessorT&) { this->setValueOn(xyz, val); }
+
+    /// @brief Change the value of the voxel at the given coordinates
+    /// but preserve its state.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    void setValueOnlyAndCache(const Coord& xyz, bool val, AccessorT&) {this->setValueOnly(xyz,val);}
+
+    /// @brief Change the value of the voxel at the given coordinates and mark it as inactive.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    void setValueOffAndCache(const Coord& xyz, bool value, AccessorT&)
+    {
+        this->setValueOff(xyz, value);
+    }
+
+    /// @brief Apply a functor to the value of the voxel at the given coordinates
+    /// and mark the voxel as active.
+    /// @note Used internally by ValueAccessor.
+    template<typename ModifyOp, typename AccessorT>
+    void modifyValueAndCache(const Coord& xyz, const ModifyOp& op, AccessorT&)
+    {
+        this->modifyValue(xyz, op);
+    }
+
+    /// Apply a functor to the voxel at the given coordinates.
+    /// @note Used internally by ValueAccessor.
+    template<typename ModifyOp, typename AccessorT>
+    void modifyValueAndActiveStateAndCache(const Coord& xyz, const ModifyOp& op, AccessorT&)
+    {
+        this->modifyValueAndActiveState(xyz, op);
+    }
+
+    /// @brief Set the active state of the voxel at the given coordinates
+    /// without changing its value.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    void setActiveStateAndCache(const Coord& xyz, bool on, AccessorT&)
+    {
+        this->setActiveState(xyz, on);
+    }
+
+    /// @brief Return @c true if the voxel at the given coordinates is active
+    /// and return the voxel value in @a val.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    bool probeValueAndCache(const Coord& xyz, bool& val, AccessorT&) const
+    {
+        return this->probeValue(xyz, val);
+    }
+
+    /// @brief Return the LEVEL (=0) at which leaf node values reside.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    static Index getValueLevelAndCache(const Coord&, AccessorT&) { return LEVEL; }
+
+    /// @brief Return a const reference to the first entry in the buffer.
+    /// @note Since it's actually a reference to a static data member
+    /// it should not be converted to a non-const pointer!
+    const bool& getFirstValue() const { if (mValueMask.isOn(0)) return sOn; else return sOff; }
+    /// @brief Return a const reference to the last entry in the buffer.
+    /// @note Since it's actually a reference to a static data member
+    /// it should not be converted to a non-const pointer!
+    const bool& getLastValue() const { if (mValueMask.isOn(SIZE-1)) return sOn; else return sOff; }
+
+    /// Return @c true if all of this node's voxels have the same active state
+    /// and are equal to within the given tolerance, and return the value in
+    /// @a constValue and the active state in @a state.
+    bool isConstant(bool& constValue, bool& state, bool tolerance = 0) const;
+    /// Return @c true if all of this node's values are inactive.
+    bool isInactive() const { return mValueMask.isOff(); }
+
+    void resetBackground(bool oldBackground, bool newBackground);
+
+    void negate() { mBuffer.mData.toggle(); }
+
+    template<MergePolicy Policy>
+    void merge(const LeafNode& other, bool bg = false, bool otherBG = false);
+    template<MergePolicy Policy> void merge(bool tileValue, bool tileActive);
+
+    /// @brief No-op
+    /// @details This function exists only to enable template instantiation.
+    void voxelizeActiveTiles(bool = true) {}
+
+    /// @brief Union this node's set of active values with the active values
+    /// of the other node, whose @c ValueType may be different. So a
+    /// resulting voxel will be active if either of the original voxels
+    /// were active.
+    ///
+    /// @note This operation modifies only active states, not values.
+    template<typename OtherType>
+    void topologyUnion(const LeafNode<OtherType, Log2Dim>& other);
+
+    /// @brief Intersect this node's set of active values with the active values
+    /// of the other node, whose @c ValueType may be different. So a
+    /// resulting voxel will be active only if both of the original voxels
+    /// were active.
+    ///
+    /// @details The last dummy argument is required to match the signature
+    /// for InternalNode::topologyIntersection.
+    ///
+    /// @note This operation modifies only active states, not
+    /// values. Also note that this operation can result in all voxels
+    /// being inactive so consider subsequently calling prune.
+    template<typename OtherType>
+    void topologyIntersection(const LeafNode<OtherType, Log2Dim>& other, const bool&);
+
+    /// @brief Difference this node's set of active values with the active values
+    /// of the other node, whose @c ValueType may be different. So a
+    /// resulting voxel will be active only if the original voxel is
+    /// active in this LeafNode and inactive in the other LeafNode.
+    ///
+    /// @details The last dummy argument is required to match the signature
+    /// for InternalNode::topologyDifference.
+    ///
+    /// @note This operation modifies only active states, not values.
+    /// Also, because it can deactivate all of this node's voxels,
+    /// consider subsequently calling prune.
+    template<typename OtherType>
+    void topologyDifference(const LeafNode<OtherType, Log2Dim>& other, const bool&);
+
+    template<typename CombineOp>
+    void combine(const LeafNode& other, CombineOp& op);
+    template<typename CombineOp>
+    void combine(bool, bool valueIsActive, CombineOp& op);
+
+    template<typename CombineOp, typename OtherType /*= bool*/>
+    void combine2(const LeafNode& other, const OtherType&, bool valueIsActive, CombineOp&);
+    template<typename CombineOp, typename OtherNodeT /*= LeafNode*/>
+    void combine2(bool, const OtherNodeT& other, bool valueIsActive, CombineOp&);
+    template<typename CombineOp, typename OtherNodeT /*= LeafNode*/>
+    void combine2(const LeafNode& b0, const OtherNodeT& b1, CombineOp&);
+
+    /// @brief Calls the templated functor BBoxOp with bounding box information.
+    /// An additional level argument is provided to the callback.
+    ///
+    /// @note The bounding boxes are guaranteed to be non-overlapping.
+    template<typename BBoxOp> void visitActiveBBox(BBoxOp&) const;
+
+    template<typename VisitorOp> void visit(VisitorOp&);
+    template<typename VisitorOp> void visit(VisitorOp&) const;
+
+    template<typename OtherLeafNodeType, typename VisitorOp>
+    void visit2Node(OtherLeafNodeType& other, VisitorOp&);
+    template<typename OtherLeafNodeType, typename VisitorOp>
+    void visit2Node(OtherLeafNodeType& other, VisitorOp&) const;
+    template<typename IterT, typename VisitorOp>
+    void visit2(IterT& otherIter, VisitorOp&, bool otherIsLHS = false);
+    template<typename IterT, typename VisitorOp>
+    void visit2(IterT& otherIter, VisitorOp&, bool otherIsLHS = false) const;
+
+    //@{
+    /// This function exists only to enable template instantiation.
+    void prune(const ValueType& /*tolerance*/ = zeroVal<ValueType>()) {}
+    void addLeaf(LeafNode*) {}
+    template<typename AccessorT>
+    void addLeafAndCache(LeafNode*, AccessorT&) {}
+    template<typename NodeT>
+    NodeT* stealNode(const Coord&, const ValueType&, bool) { return NULL; }
+    template<typename NodeT>
+    NodeT* probeNode(const Coord&) { return NULL; }
+    template<typename NodeT>
+    const NodeT* probeConstNode(const Coord&) const { return NULL; }
+    template<typename ArrayT> void getNodes(ArrayT&) const {}
+    template<typename ArrayT> void stealNodes(ArrayT&, const ValueType&, bool) {}
+    //@}
+
+    void addTile(Index level, const Coord&, bool val, bool active);
+    void addTile(Index offset, bool val, bool active);
+    template<typename AccessorT>
+    void addTileAndCache(Index level, const Coord&, bool val, bool active, AccessorT&);
+
+    //@{
+    /// @brief Return a pointer to this node.
+    LeafNode* touchLeaf(const Coord&) { return this; }
+    template<typename AccessorT>
+    LeafNode* touchLeafAndCache(const Coord&, AccessorT&) { return this; }
+    LeafNode* probeLeaf(const Coord&) { return this; }
+    template<typename AccessorT>
+    LeafNode* probeLeafAndCache(const Coord&, AccessorT&) { return this; }
+    template<typename NodeT, typename AccessorT>
+    NodeT* probeNodeAndCache(const Coord&, AccessorT&)
+    {
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if (!(boost::is_same<NodeT,LeafNode>::value)) return NULL; // NodeT is not this node type
+        return reinterpret_cast<NodeT*>(this); // reached only when NodeT is LeafNode itself
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+    //@}
+    //@{
+    /// @brief Return a @c const pointer to this node.
+    const LeafNode* probeLeaf(const Coord&) const { return this; }
+    template<typename AccessorT>
+    const LeafNode* probeLeafAndCache(const Coord&, AccessorT&) const { return this; }
+    const LeafNode* probeConstLeaf(const Coord&) const { return this; }
+    template<typename AccessorT>
+    const LeafNode* probeConstLeafAndCache(const Coord&, AccessorT&) const { return this; }
+    template<typename NodeT, typename AccessorT>
+    const NodeT* probeConstNodeAndCache(const Coord&, AccessorT&) const
+    {
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if (!(boost::is_same<NodeT,LeafNode>::value)) return NULL; // NodeT is not this node type
+        return reinterpret_cast<const NodeT*>(this); // reached only when NodeT is LeafNode itself
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+    //@}
+
+    //
+    // Iterators
+    //
+protected:
+    typedef typename NodeMaskType::OnIterator    MaskOnIter;
+    typedef typename NodeMaskType::OffIterator   MaskOffIter;
+    typedef typename NodeMaskType::DenseIterator MaskDenseIter;
+
+    template<typename MaskIterT, typename NodeT, typename ValueT>
+    struct ValueIter:
+        // Derives from SparseIteratorBase, but can also be used as a dense iterator,
+        // if MaskIterT is a dense mask iterator type.
+        public SparseIteratorBase<MaskIterT, ValueIter<MaskIterT, NodeT, ValueT>, NodeT, ValueT>
+    {
+        typedef SparseIteratorBase<MaskIterT, ValueIter, NodeT, ValueT> BaseT;
+
+        ValueIter() {}
+        ValueIter(const MaskIterT& iter, NodeT* parent): BaseT(iter, parent) {}
+
+        const bool& getItem(Index pos) const { return this->parent().getValue(pos); }
+        const bool& getValue() const { return this->getItem(this->pos()); }
+
+        // Note: setItem() can't be called on const iterators.
+        void setItem(Index pos, bool value) const { this->parent().setValueOnly(pos, value); }
+        // Note: setValue() can't be called on const iterators.
+        void setValue(bool value) const { this->setItem(this->pos(), value); }
+
+        // Note: modifyItem() can't be called on const iterators.
+        template<typename ModifyOp>
+        void modifyItem(Index n, const ModifyOp& op) const { this->parent().modifyValue(n, op); }
+        // Note: modifyValue() can't be called on const iterators.
+        template<typename ModifyOp>
+        void modifyValue(const ModifyOp& op) const { this->modifyItem(this->pos(), op); }
+    };
+
+    /// Leaf nodes have no children, so their child iterators have no get/set accessors.
+    template<typename MaskIterT, typename NodeT>
+    struct ChildIter:
+        public SparseIteratorBase<MaskIterT, ChildIter<MaskIterT, NodeT>, NodeT, bool>
+    {
+        ChildIter() {}
+        ChildIter(const MaskIterT& iter, NodeT* parent): SparseIteratorBase<
+            MaskIterT, ChildIter<MaskIterT, NodeT>, NodeT, bool>(iter, parent) {}
+    };
+
+    template<typename NodeT, typename ValueT>
+    struct DenseIter: public DenseIteratorBase<
+        MaskDenseIter, DenseIter<NodeT, ValueT>, NodeT, /*ChildT=*/void, ValueT>
+    {
+        typedef DenseIteratorBase<MaskDenseIter, DenseIter, NodeT, void, ValueT> BaseT;
+        typedef typename BaseT::NonConstValueType NonConstValueT;
+
+        DenseIter() {}
+        DenseIter(const MaskDenseIter& iter, NodeT* parent): BaseT(iter, parent) {}
+
+        bool getItem(Index pos, void*& child, NonConstValueT& value) const
+        {
+            value = this->parent().getValue(pos);
+            child = NULL;
+            return false; // no child
+        }
+
+        // Note: setItem() can't be called on const iterators.
+        //void setItem(Index pos, void* child) const {}
+
+        // Note: unsetItem() can't be called on const iterators.
+        void unsetItem(Index pos, const ValueT& val) const {this->parent().setValueOnly(pos, val);}
+    };
+
+public:
+    typedef ValueIter<MaskOnIter, LeafNode, const bool>           ValueOnIter;
+    typedef ValueIter<MaskOnIter, const LeafNode, const bool>     ValueOnCIter;
+    typedef ValueIter<MaskOffIter, LeafNode, const bool>          ValueOffIter;
+    typedef ValueIter<MaskOffIter, const LeafNode, const bool>    ValueOffCIter;
+    typedef ValueIter<MaskDenseIter, LeafNode, const bool>        ValueAllIter;
+    typedef ValueIter<MaskDenseIter, const LeafNode, const bool>  ValueAllCIter;
+    typedef ChildIter<MaskOnIter, LeafNode>                       ChildOnIter;
+    typedef ChildIter<MaskOnIter, const LeafNode>                 ChildOnCIter;
+    typedef ChildIter<MaskOffIter, LeafNode>                      ChildOffIter;
+    typedef ChildIter<MaskOffIter, const LeafNode>                ChildOffCIter;
+    typedef DenseIter<LeafNode, bool>                             ChildAllIter;
+    typedef DenseIter<const LeafNode, const bool>                 ChildAllCIter;
+
+    ValueOnCIter  cbeginValueOn() const { return ValueOnCIter(mValueMask.beginOn(), this); }
+    ValueOnCIter   beginValueOn() const { return ValueOnCIter(mValueMask.beginOn(), this); }
+    ValueOnIter    beginValueOn() { return ValueOnIter(mValueMask.beginOn(), this); }
+    ValueOffCIter cbeginValueOff() const { return ValueOffCIter(mValueMask.beginOff(), this); }
+    ValueOffCIter  beginValueOff() const { return ValueOffCIter(mValueMask.beginOff(), this); }
+    ValueOffIter   beginValueOff() { return ValueOffIter(mValueMask.beginOff(), this); }
+    ValueAllCIter cbeginValueAll() const { return ValueAllCIter(mValueMask.beginDense(), this); }
+    ValueAllCIter  beginValueAll() const { return ValueAllCIter(mValueMask.beginDense(), this); }
+    ValueAllIter   beginValueAll() { return ValueAllIter(mValueMask.beginDense(), this); }
+
+    ValueOnCIter  cendValueOn() const { return ValueOnCIter(mValueMask.endOn(), this); }
+    ValueOnCIter   endValueOn() const { return ValueOnCIter(mValueMask.endOn(), this); }
+    ValueOnIter    endValueOn() { return ValueOnIter(mValueMask.endOn(), this); }
+    ValueOffCIter cendValueOff() const { return ValueOffCIter(mValueMask.endOff(), this); }
+    ValueOffCIter  endValueOff() const { return ValueOffCIter(mValueMask.endOff(), this); }
+    ValueOffIter   endValueOff() { return ValueOffIter(mValueMask.endOff(), this); }
+    ValueAllCIter cendValueAll() const { return ValueAllCIter(mValueMask.endDense(), this); }
+    ValueAllCIter  endValueAll() const { return ValueAllCIter(mValueMask.endDense(), this); }
+    ValueAllIter   endValueAll() { return ValueAllIter(mValueMask.endDense(), this); }
+
+    // Note that [c]beginChildOn() and [c]beginChildOff() actually return end iterators,
+    // because leaf nodes have no children.
+    ChildOnCIter  cbeginChildOn() const { return ChildOnCIter(mValueMask.endOn(), this); }
+    ChildOnCIter   beginChildOn() const { return ChildOnCIter(mValueMask.endOn(), this); }
+    ChildOnIter    beginChildOn() { return ChildOnIter(mValueMask.endOn(), this); }
+    ChildOffCIter cbeginChildOff() const { return ChildOffCIter(mValueMask.endOff(), this); }
+    ChildOffCIter  beginChildOff() const { return ChildOffCIter(mValueMask.endOff(), this); }
+    ChildOffIter   beginChildOff() { return ChildOffIter(mValueMask.endOff(), this); }
+    ChildAllCIter cbeginChildAll() const { return ChildAllCIter(mValueMask.beginDense(), this); }
+    ChildAllCIter  beginChildAll() const { return ChildAllCIter(mValueMask.beginDense(), this); }
+    ChildAllIter   beginChildAll() { return ChildAllIter(mValueMask.beginDense(), this); }
+
+    ChildOnCIter  cendChildOn() const { return ChildOnCIter(mValueMask.endOn(), this); }
+    ChildOnCIter   endChildOn() const { return ChildOnCIter(mValueMask.endOn(), this); }
+    ChildOnIter    endChildOn() { return ChildOnIter(mValueMask.endOn(), this); }
+    ChildOffCIter cendChildOff() const { return ChildOffCIter(mValueMask.endOff(), this); }
+    ChildOffCIter  endChildOff() const { return ChildOffCIter(mValueMask.endOff(), this); }
+    ChildOffIter   endChildOff() { return ChildOffIter(mValueMask.endOff(), this); }
+    ChildAllCIter cendChildAll() const { return ChildAllCIter(mValueMask.endDense(), this); }
+    ChildAllCIter  endChildAll() const { return ChildAllCIter(mValueMask.endDense(), this); }
+    ChildAllIter   endChildAll() { return ChildAllIter(mValueMask.endDense(), this); }
+
+    //
+    // Mask accessors
+    //
+    bool isValueMaskOn(Index n) const { return mValueMask.isOn(n); }
+    bool isValueMaskOn() const { return mValueMask.isOn(); }
+    bool isValueMaskOff(Index n) const { return mValueMask.isOff(n); }
+    bool isValueMaskOff() const { return mValueMask.isOff(); }
+    const NodeMaskType& getValueMask() const { return mValueMask; }
+    const NodeMaskType& valueMask() const { return mValueMask; }
+    NodeMaskType& getValueMask() { return mValueMask; }
+    void setValueMask(const NodeMaskType& mask) { mValueMask = mask; }
+    bool isChildMaskOn(Index) const { return false; } // leaf nodes have no children
+    bool isChildMaskOff(Index) const { return true; }
+    bool isChildMaskOff() const { return true; }
+protected:
+    void setValueMask(Index n, bool on) { mValueMask.set(n, on); }
+    void setValueMaskOn(Index n)  { mValueMask.setOn(n); }
+    void setValueMaskOff(Index n) { mValueMask.setOff(n); }
+
+    /// Compute the origin of the leaf node that contains the voxel with the given coordinates.
+    static void evalNodeOrigin(Coord& xyz) { xyz &= ~(DIM - 1); }
+
+    template<typename NodeT, typename VisitorOp, typename ChildAllIterT>
+    static inline void doVisit(NodeT&, VisitorOp&);
+
+    template<typename NodeT, typename OtherNodeT, typename VisitorOp,
+        typename ChildAllIterT, typename OtherChildAllIterT>
+    static inline void doVisit2Node(NodeT& self, OtherNodeT& other, VisitorOp&);
+
+    template<typename NodeT, typename VisitorOp,
+        typename ChildAllIterT, typename OtherChildAllIterT>
+    static inline void doVisit2(NodeT& self, OtherChildAllIterT&, VisitorOp&, bool otherIsLHS);
+
+
+    /// Bitmask that determines which voxels are active
+    NodeMaskType mValueMask;
+    /// Bitmask representing the values of voxels
+    Buffer mBuffer;
+    /// Global grid index coordinates (x,y,z) of the local origin of this node
+    Coord mOrigin;
+
+    // These static declarations must be on separate lines to avoid VC9 compiler errors.
+    static const bool sOn;
+    static const bool sOff;
+
+private:
+    /// @brief During topology-only construction, access is needed
+    /// to protected/private members of other template instances.
+    template<typename, Index> friend class LeafNode;
+
+    friend struct ValueIter<MaskOnIter, LeafNode, bool>;
+    friend struct ValueIter<MaskOffIter, LeafNode, bool>;
+    friend struct ValueIter<MaskDenseIter, LeafNode, bool>;
+    friend struct ValueIter<MaskOnIter, const LeafNode, bool>;
+    friend struct ValueIter<MaskOffIter, const LeafNode, bool>;
+    friend struct ValueIter<MaskDenseIter, const LeafNode, bool>;
+
+    //@{
+    /// Allow iterators to call mask accessor methods (see above).
+    /// @todo Make mask accessors public?
+    friend class IteratorBase<MaskOnIter, LeafNode>;
+    friend class IteratorBase<MaskOffIter, LeafNode>;
+    friend class IteratorBase<MaskDenseIter, LeafNode>;
+    //@}
+
+}; // class LeafNode<bool>
+
+
+/// @internal For consistency with other nodes and with iterators, methods like
+/// LeafNode::getValue() return a reference to a value.  Since it's not possible
+/// to return a reference to a bit in a node mask, we return a reference to one
+/// of the following static values instead.
+template<Index Log2Dim> const bool LeafNode<bool, Log2Dim>::sOn = true;
+template<Index Log2Dim> const bool LeafNode<bool, Log2Dim>::sOff = false;
+
+
+////////////////////////////////////////
+
+
+template<Index Log2Dim>
+inline
+LeafNode<bool, Log2Dim>::LeafNode()
+  : mOrigin(0, 0, 0)
+{
+}
+
+
+template<Index Log2Dim>
+inline
+LeafNode<bool, Log2Dim>::LeafNode(const Coord& xyz, bool value, bool active)
+    : mValueMask(active)
+    , mBuffer(value)
+    , mOrigin(xyz & (~(DIM - 1)))
+{
+}
+
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+template<Index Log2Dim>
+inline
+LeafNode<bool, Log2Dim>::LeafNode(PartialCreate, const Coord& xyz, bool value, bool active)
+    : mValueMask(active)
+    , mBuffer(value)
+    , mOrigin(xyz & (~(DIM - 1)))
+{
+    /// @todo For now, this is identical to the non-PartialCreate constructor.
+    /// Consider modifying the Buffer class to allow it to be constructed
+    /// without allocating a bitmask.
+}
+#endif
+
+
+template<Index Log2Dim>
+inline
+LeafNode<bool, Log2Dim>::LeafNode(const LeafNode &other)
+    : mValueMask(other.valueMask())
+    , mBuffer(other.mBuffer)
+    , mOrigin(other.mOrigin)
+{
+}
+
+
+// Conversion constructor: copy the value mask, origin and (bool-converted) values
+// of a leaf node with the same configuration but a different ValueType.
+template<Index Log2Dim>
+template<typename ValueT>
+inline
+LeafNode<bool, Log2Dim>::LeafNode(const LeafNode<ValueT, Log2Dim>& other)
+    : mValueMask(other.valueMask())
+    , mOrigin(other.origin())
+{
+    /// @todo Consider using a value conversion functor passed as an argument instead.
+    // For now, every source value is converted with a plain bool() cast,
+    // i.e., a voxel becomes true wherever bool(value) is true in the other node.
+    for (Index n = 0; n < SIZE; ++n) {
+        const ValueT& srcValue = other.mBuffer[n];
+        mBuffer.setValue(n, bool(srcValue));
+    }
+}
+
+
+template<Index Log2Dim>
+template<typename ValueT>
+inline
+LeafNode<bool, Log2Dim>::LeafNode(const LeafNode<ValueT, Log2Dim>& other,
+                                  bool background, TopologyCopy)
+    : mValueMask(other.valueMask())
+    , mBuffer(background)
+    , mOrigin(other.origin())
+{
+}
+
+
+template<Index Log2Dim>
+template<typename ValueT>
+inline
+LeafNode<bool, Log2Dim>::LeafNode(const LeafNode<ValueT, Log2Dim>& other, TopologyCopy)
+    : mValueMask(other.valueMask())
+    , mBuffer(other.valueMask())// value = active state
+    , mOrigin(other.origin())
+{
+}
+
+
+/// Topology copy: active voxels are set to @a onValue and inactive voxels to @a offValue.
+template<Index Log2Dim> template<typename ValueT>
+inline LeafNode<bool, Log2Dim>::LeafNode(const LeafNode<ValueT, Log2Dim>& other,
+                                         bool offValue, bool onValue, TopologyCopy)
+    : mValueMask(other.valueMask()), mBuffer(other.valueMask()), mOrigin(other.origin())
+{
+    // mBuffer was initialized from the value mask, which is already right when
+    // offValue is false and onValue is true; adjust it for the other combinations.
+    if (offValue) { if (onValue) mBuffer.mData.setOn(); else mBuffer.mData.toggle(); }
+    else if (!onValue) mBuffer.mData.setOff(); // both false: was wrongly left equal to the mask
+}
+
+
+template<Index Log2Dim>
+inline
+LeafNode<bool, Log2Dim>::~LeafNode()
+{
+}
+
+
+////////////////////////////////////////
+
+
+template<Index Log2Dim>
+inline Index64
+LeafNode<bool, Log2Dim>::memUsage() const
+{
+    return sizeof(mOrigin) + mValueMask.memUsage() + mBuffer.memUsage();
+}
+
+
+template<Index Log2Dim>
+inline void
+LeafNode<bool, Log2Dim>::evalActiveBoundingBox(CoordBBox& bbox, bool visitVoxels) const
+{
+    CoordBBox nodeBBox = this->getNodeBoundingBox();
+    if (bbox.isInside(nodeBBox)) return; // this node is already enclosed by bbox
+    ValueOnCIter onIter = this->cbeginValueOn();
+    if (!onIter) return; // no active values, so there is nothing to add
+    if (visitVoxels) { // voxel granularity: rebuild the box from the active voxels
+        nodeBBox.reset();
+        for (; onIter; ++onIter) nodeBBox.expand(this->offsetToLocalCoord(onIter.pos()));
+        nodeBBox.translate(this->origin());
+    }
+    bbox.expand(nodeBBox);
+}
+
+
+template<Index Log2Dim>
+template<typename OtherType, Index OtherLog2Dim>
+inline bool
+LeafNode<bool, Log2Dim>::hasSameTopology(const LeafNode<OtherType, OtherLog2Dim>* other) const
+{
+    assert(other);
+    return (Log2Dim == OtherLog2Dim && mValueMask == other->getValueMask());
+}
+
+
+template<Index Log2Dim>
+inline std::string
+LeafNode<bool, Log2Dim>::str() const
+{
+    std::ostringstream ostr;
+    ostr << "LeafNode @" << mOrigin << ": ";
+    for (Index32 n = 0; n < SIZE; ++n) ostr << (mValueMask.isOn(n) ? '#' : '.');
+    return ostr.str();
+}
+
+
+////////////////////////////////////////
+
+
+template<Index Log2Dim>
+inline Index
+LeafNode<bool, Log2Dim>::coordToOffset(const Coord& xyz)
+{
+    // Only the bits selected by (DIM - 1) of each coordinate address a voxel within the node.
+    const Index x = xyz[0] & (DIM-1u), y = xyz[1] & (DIM-1u), z = xyz[2] & (DIM-1u);
+    assert(x < DIM && y < DIM && z < DIM);
+    return (x << 2*Log2Dim) + (y << Log2Dim) + z;
+}
+
+
+template<Index Log2Dim>
+inline Coord LeafNode<bool, Log2Dim>::offsetToLocalCoord(Index n)
+{
+    assert(n < (1 << 3*Log2Dim));
+    // The offset packs x, y and z into consecutive Log2Dim-bit fields, with z lowest.
+    const Index lowBits = (1 << Log2Dim) - 1;
+    Coord local;
+    local.setX(n >> 2*Log2Dim);
+    local.setY((n >> Log2Dim) & lowBits);
+    local.setZ(n & lowBits);
+    return local;
+}
+
+
+template<Index Log2Dim>
+inline Coord
+LeafNode<bool, Log2Dim>::offsetToGlobalCoord(Index n) const
+{
+    return (this->offsetToLocalCoord(n) + this->origin());
+}
+
+
+////////////////////////////////////////
+
+
+template<Index Log2Dim>
+inline void
+LeafNode<bool, Log2Dim>::readTopology(std::istream& is, bool /*fromHalf*/)
+{
+    mValueMask.load(is);
+}
+
+
+template<Index Log2Dim>
+inline void
+LeafNode<bool, Log2Dim>::writeTopology(std::ostream& os, bool /*toHalf*/) const
+{
+    mValueMask.save(os);
+}
+
+
+template<Index Log2Dim>
+inline void
+LeafNode<bool, Log2Dim>::readBuffers(std::istream& is, const CoordBBox& clipBBox, bool fromHalf)
+{
+    // Lazy loading is not currently implemented for boolean LeafNodes,
+    // so the entire buffer is read first and clipped afterwards.
+    this->readBuffers(is, fromHalf);
+
+    // Use the tree's background value if the stream supplies one, otherwise false.
+    bool background = false;
+    const void* const bgPtr = io::getGridBackgroundValuePtr(is);
+    if (bgPtr != NULL) background = *static_cast<const bool*>(bgPtr);
+
+    // Finally, clip this node against the given bounding box.
+    this->clip(clipBBox, background);
+}
+
+
+template<Index Log2Dim>
+inline void
+LeafNode<bool, Log2Dim>::readBuffers(std::istream& is, bool /*fromHalf*/)
+{
+    // Read in the value mask, followed by the node origin
+    // (three consecutive Coord::ValueType components).
+    mValueMask.load(is);
+    is.read(reinterpret_cast<char*>(&mOrigin), sizeof(Coord::ValueType) * 3);
+
+    if (io::getFormatVersion(is) >= OPENVDB_FILE_VERSION_BOOL_LEAF_OPTIMIZATION) {
+        // Newer files store the voxel values as a mask, which can be
+        // loaded directly into the buffer.
+        mBuffer.mData.load(is);
+        return;
+    }
+
+    // Older files stored one or more bool arrays, preceded by the number
+    // of arrays (which should now always be one).
+    int8_t bufferCount = 0;
+    is.read(reinterpret_cast<char*>(&bufferCount), sizeof(int8_t));
+
+    // Read in the primary array.
+    // (Note: prior to the bool leaf optimization, buffers were always compressed.)
+    boost::shared_array<bool> values(new bool[SIZE]);
+    io::readData<bool>(is, values.get(), SIZE, /*isCompressed=*/true);
+
+    // Transfer the array to mBuffer: clear every bit, then set the bits of true values.
+    mBuffer.mData.setOff();
+    for (Index n = 0; n < SIZE; ++n) {
+        if (values[n]) mBuffer.mData.setOn(n);
+    }
+
+    // Read in and discard any auxiliary buffers that were created with
+    // earlier versions of the library.  (The loop body never runs when
+    // the count is one or less, so no separate guard is needed.)
+    for (int aux = 1; aux < bufferCount; ++aux) {
+        io::readData<bool>(is, values.get(), SIZE, /*isCompressed=*/true);
+    }
+}
+
+
+template<Index Log2Dim>
+inline void
+LeafNode<bool, Log2Dim>::writeBuffers(std::ostream& os, bool /*toHalf*/) const
+{
+    // Write out the value mask.
+    mValueMask.save(os);
+    // Write out the origin.
+    os.write(reinterpret_cast<const char*>(&mOrigin), sizeof(Coord::ValueType) * 3);
+    // Write out the voxel values.
+    mBuffer.mData.save(os);
+}
+
+
+////////////////////////////////////////
+
+
+template<Index Log2Dim>
+inline bool
+LeafNode<bool, Log2Dim>::operator==(const LeafNode& other) const
+{
+    return mOrigin == other.mOrigin &&
+           mValueMask == other.valueMask() &&
+           mBuffer == other.mBuffer;
+}
+
+
+template<Index Log2Dim>
+inline bool
+LeafNode<bool, Log2Dim>::operator!=(const LeafNode& other) const
+{
+    return !(this->operator==(other));
+}
+
+
+////////////////////////////////////////
+
+
+template<Index Log2Dim>
+inline bool
+LeafNode<bool, Log2Dim>::isConstant(bool& constValue, bool& state, bool tolerance) const
+{
+    // Active states are uniform only if the value mask is entirely on or entirely off.
+    state = mValueMask.isOn();
+    if (!state && !mValueMask.isOff()) return false;
+    // A tolerance of true (i.e., 1) makes all boolean values compare equal,
+    // so value uniformity only needs to be verified when the tolerance is zero.
+    const bool allOn = mBuffer.mData.isOn();
+    if (!tolerance && !allOn && !mBuffer.mData.isOff()) return false;
+    constValue = allOn;
+    return true;
+}
+
+
+////////////////////////////////////////
+
+
+template<Index Log2Dim>
+inline void
+LeafNode<bool, Log2Dim>::addTile(Index /*level*/, const Coord& xyz, bool val, bool active)
+{
+    this->addTile(this->coordToOffset(xyz), val, active);
+}
+
+template<Index Log2Dim>
+inline void
+LeafNode<bool, Log2Dim>::addTile(Index offset, bool val, bool active)
+{
+    assert(offset < SIZE);
+    this->setValueOnly(offset, val);
+    this->setActiveState(offset, active);
+}
+
+template<Index Log2Dim>
+template<typename AccessorT>
+inline void
+LeafNode<bool, Log2Dim>::addTileAndCache(Index level, const Coord& xyz,
+    bool val, bool active, AccessorT&)
+{
+    this->addTile(level, xyz, val, active);
+}
+
+
+////////////////////////////////////////
+
+
+template<Index Log2Dim>
+inline const bool&
+LeafNode<bool, Log2Dim>::getValue(const Coord& xyz) const
+{
+    // This *CANNOT* use operator ?: because of Visual C++ (keep the if/else form).
+    if (mBuffer.mData.isOn(this->coordToOffset(xyz))) return sOn; else return sOff;
+}
+
+
+template<Index Log2Dim>
+inline const bool&
+LeafNode<bool, Log2Dim>::getValue(Index offset) const
+{
+    assert(offset < SIZE);
+    // This *CANNOT* use operator ? for Windows
+    if (mBuffer.mData.isOn(offset)) return sOn; else return sOff;
+}
+
+
+template<Index Log2Dim>
+inline bool
+LeafNode<bool, Log2Dim>::probeValue(const Coord& xyz, bool& val) const
+{
+    const Index offset = this->coordToOffset(xyz);
+    val = mBuffer.mData.isOn(offset);
+    return mValueMask.isOn(offset);
+}
+
+
+template<Index Log2Dim>
+inline void
+LeafNode<bool, Log2Dim>::setValueOn(const Coord& xyz, bool val)
+{
+    this->setValueOn(this->coordToOffset(xyz), val);
+}
+
+
+template<Index Log2Dim>
+inline void
+LeafNode<bool, Log2Dim>::setValueOn(Index offset, bool val)
+{
+    assert(offset < SIZE);
+    mValueMask.setOn(offset);
+    mBuffer.mData.set(offset, val);
+}
+
+
+template<Index Log2Dim>
+inline void
+LeafNode<bool, Log2Dim>::setValueOnly(const Coord& xyz, bool val)
+{
+    this->setValueOnly(this->coordToOffset(xyz), val);
+}
+
+
+template<Index Log2Dim>
+inline void
+LeafNode<bool, Log2Dim>::setActiveState(const Coord& xyz, bool on)
+{
+    mValueMask.set(this->coordToOffset(xyz), on);
+}
+
+
+template<Index Log2Dim>
+inline void
+LeafNode<bool, Log2Dim>::setValueOff(const Coord& xyz, bool val)
+{
+    this->setValueOff(this->coordToOffset(xyz), val);
+}
+
+
+template<Index Log2Dim>
+inline void
+LeafNode<bool, Log2Dim>::setValueOff(Index offset, bool val)
+{
+    assert(offset < SIZE);
+    mValueMask.setOff(offset);
+    mBuffer.mData.set(offset, val);
+}
+
+
+template<Index Log2Dim>
+template<typename ModifyOp>
+inline void
+LeafNode<bool, Log2Dim>::modifyValue(Index offset, const ModifyOp& op)
+{
+    assert(offset < SIZE); // same bounds check as setValueOn(Index, bool)
+    bool val = mBuffer.mData.isOn(offset); // op edits a local copy of the value bit
+    op(val);
+    mBuffer.mData.set(offset, val); // store the (possibly modified) value back
+    mValueMask.setOn(offset); // the voxel is always left active
+}
+
+template<Index Log2Dim>
+template<typename ModifyOp>
+inline void
+LeafNode<bool, Log2Dim>::modifyValue(const Coord& xyz, const ModifyOp& op)
+{
+    const Index offset = this->coordToOffset(xyz); // linear offset of the voxel at xyz
+    this->modifyValue(offset, op); // applies op to the value and marks the voxel active
+}
+
+/// Apply @a op to copies of the value bit and active state at @a xyz, then store both back.
+template<Index Log2Dim>
+template<typename ModifyOp>
+inline void
+LeafNode<bool, Log2Dim>::modifyValueAndActiveState(const Coord& xyz, const ModifyOp& op)
+{
+    const Index n = this->coordToOffset(xyz);
+    bool value = mBuffer.mData.isOn(n), active = mValueMask.isOn(n);
+    op(value, active);
+    mValueMask.set(n, active);
+    mBuffer.mData.set(n, value);
+}
+
+////////////////////////////////////////
+
+
+/// Invert the value bit of every inactive voxel if the background value actually changes.
+template<Index Log2Dim>
+inline void
+LeafNode<bool, Log2Dim>::resetBackground(bool oldBackground, bool newBackground)
+{
+    if (newBackground == oldBackground) return; // same background: nothing to rewrite
+
+    // On exactly where a voxel is inactive and its value bit is currently off.
+    NodeMaskType flipped = !(mBuffer.mData | mValueMask);
+    // Active voxels keep their value bits; inactive voxels receive the flipped bits.
+    mBuffer.mData = (mBuffer.mData & mValueMask) | flipped;
+}
+
+////////////////////////////////////////
+
+
+template<Index Log2Dim>
+template<MergePolicy Policy>
+inline void
+LeafNode<bool, Log2Dim>::merge(const LeafNode& other, bool /*bg*/, bool /*otherBG*/)
+{
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+    if (Policy == MERGE_NODES) return; // no voxel data is copied under this policy
+    // Copy each active voxel of the other node into a voxel that is inactive in this node.
+    for (typename NodeMaskType::OnIterator it = other.valueMask().beginOn(); it; ++it) {
+        const Index pos = it.pos();
+        if (mValueMask.isOn(pos)) continue; // voxels already active here are kept as is
+        mValueMask.setOn(pos);
+        mBuffer.mData.set(pos, other.mBuffer.mData.isOn(pos));
+    }
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+}
+
+/// Merge an active tile: inactive voxels take @a tileValue and all voxels become active.
+template<Index Log2Dim>
+template<MergePolicy Policy>
+inline void
+LeafNode<bool, Log2Dim>::merge(bool tileValue, bool tileActive)
+{
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+    if (Policy != MERGE_ACTIVE_STATES_AND_NODES || !tileActive) return;
+    // Only inactive voxels are overwritten; active voxels keep their value bits.
+    if (!tileValue) mBuffer.mData &= mValueMask; // clear the value bits of inactive voxels
+    else mBuffer.mData |= !mValueMask;           // set the value bits of inactive voxels
+    // After merging an active tile every voxel of this node is active.
+    mValueMask.setOn();
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+}
+
+////////////////////////////////////////
+
+
+template<Index Log2Dim>
+template<typename OtherType>
+inline void
+LeafNode<bool, Log2Dim>::topologyUnion(const LeafNode<OtherType, Log2Dim>& other)
+{
+    const NodeMaskType& otherActive = other.valueMask(); // active states of the other leaf
+    mValueMask |= otherActive; // union of the active states; mBuffer is not modified
+}
+
+template<Index Log2Dim>
+template<typename OtherType>
+inline void
+LeafNode<bool, Log2Dim>::topologyIntersection(const LeafNode<OtherType, Log2Dim>& other,
+                                              const bool&)
+{
+    const NodeMaskType& otherActive = other.valueMask(); // active states of the other leaf
+    mValueMask &= otherActive; // a voxel stays active only if it is active in both leaves
+}
+
+template<Index Log2Dim>
+template<typename OtherType>
+inline void
+LeafNode<bool, Log2Dim>::topologyDifference(const LeafNode<OtherType, Log2Dim>& other,
+                                            const bool&)
+{
+    const NodeMaskType inactiveInOther = !other.valueMask(); // complement of other's mask
+    mValueMask &= inactiveInOther; // deactivate the voxels that are active in the other leaf
+}
+
+////////////////////////////////////////
+
+
+/// Set every voxel outside @a clipBBox to @a background and mark it inactive.
+template<Index Log2Dim>
+inline void
+LeafNode<bool, Log2Dim>::clip(const CoordBBox& clipBBox, bool background)
+{
+    CoordBBox nodeBBox = this->getNodeBoundingBox();
+    if (!clipBBox.hasOverlap(nodeBBox)) {
+        // This node lies completely outside the clipping region.  Fill it with inactive
+        // background voxels and return: the masking pass below would only redo the same work.
+        this->fill(nodeBBox, background, /*active=*/false);
+        return;
+    } else if (clipBBox.isInside(nodeBBox)) {
+        // This node lies completely inside the clipping region.  Leave it intact.
+        return;
+    }
+
+    // This node is only partially inside the clipping region.
+    // Construct a boolean mask that is on inside the clipping region and off outside it.
+    NodeMaskType mask;
+    nodeBBox.intersect(clipBBox);
+    Coord xyz;
+    int &x = xyz.x(), &y = xyz.y(), &z = xyz.z();
+    for (x = nodeBBox.min().x(); x <= nodeBBox.max().x(); ++x) {
+        for (y = nodeBBox.min().y(); y <= nodeBBox.max().y(); ++y) {
+            for (z = nodeBBox.min().z(); z <= nodeBBox.max().z(); ++z) {
+                mask.setOn(static_cast<Index32>(this->coordToOffset(xyz)));
+            }
+        }
+    }
+
+    // Set voxels that lie in the inactive region of the mask (i.e., outside
+    // the clipping region) to the background value.
+    for (MaskOffIter maskIter = mask.beginOff(); maskIter; ++maskIter) {
+        this->setValueOff(maskIter.pos(), background);
+    }
+}
+
+////////////////////////////////////////
+
+
+template<Index Log2Dim>
+inline void
+LeafNode<bool, Log2Dim>::fill(const CoordBBox& bbox, bool value, bool active)
+{
+    for (Int32 i = bbox.min().x(); i <= bbox.max().x(); ++i) {
+        const Index xPart = (i & (DIM-1u)) << 2*Log2Dim; // x occupies the highest bits
+        for (Int32 j = bbox.min().y(); j <= bbox.max().y(); ++j) {
+            const Index xyPart = xPart + ((j & (DIM-1u)) << Log2Dim);
+            for (Int32 k = bbox.min().z(); k <= bbox.max().z(); ++k) {
+                const Index n = xyPart + (k & (DIM-1u)); // offset of voxel (i, j, k)
+                mBuffer.mData.set(n, value);
+                mValueMask.set(n, active);
+            }
+        }
+    }
+}
+
+template<Index Log2Dim>
+inline void
+LeafNode<bool, Log2Dim>::fill(const bool& value)
+{
+    mBuffer.fill(value); // only the value buffer is written; mValueMask is not touched here
+}
+
+/// Fill the value buffer with @a value and set the whole active-state mask to @a active.
+template<Index Log2Dim>
+inline void
+LeafNode<bool, Log2Dim>::fill(const bool& value, bool active)
+{
+    mValueMask.set(active);
+    mBuffer.fill(value);
+}
+
+////////////////////////////////////////
+
+
+/// Copy the value bits of the voxels inside @a bbox into the matching elements of @a dense.
+template<Index Log2Dim>
+template<typename DenseT>
+inline void
+LeafNode<bool, Log2Dim>::copyToDense(const CoordBBox& bbox, DenseT& dense) const
+{
+    typedef typename DenseT::ValueType DenseValueType;
+    const size_t xStride = dense.xStride(), yStride = dense.yStride(), zStride = dense.zStride();
+    const Coord& denseMin = dense.bbox().min();
+    DenseValueType* dstZ = dense.data() + zStride * (bbox.min()[2] - denseMin[2]); // target
+    const Int32 offZ = bbox.min()[2] & (DIM-1u);
+    for (Int32 x = bbox.min()[0], xEnd = bbox.max()[0] + 1; x < xEnd; ++x) {
+        DenseValueType* dstX = dstZ + xStride * (x - denseMin[0]);
+        const Int32 offX = offZ + ((x & (DIM-1u)) << 2*LOG2DIM);
+        for (Int32 y = bbox.min()[1], yEnd = bbox.max()[1] + 1; y < yEnd; ++y) {
+            DenseValueType* dst = dstX + yStride * (y - denseMin[1]);
+            Int32 off = offX + ((y & (DIM-1u)) << LOG2DIM);
+            for (Int32 z = bbox.min()[2], zEnd = bbox.max()[2] + 1; z < zEnd; ++z, ++off) {
+                *dst = DenseValueType(mBuffer.mData.isOn(off));
+                dst += zStride;
+            }
+        }
+    }
+}
+
+/// Load the voxels inside @a bbox from @a dense, deactivating those that match @a background.
+template<Index Log2Dim>
+template<typename DenseT>
+inline void
+LeafNode<bool, Log2Dim>::copyFromDense(const CoordBBox& bbox, const DenseT& dense,
+                                       bool background, bool tolerance)
+{
+    typedef typename DenseT::ValueType DenseValueType;
+    struct Local {
+        inline static bool toBool(const DenseValueType& v) { return !math::isZero(v); }
+    };
+
+    const size_t xStride = dense.xStride(), yStride = dense.yStride(), zStride = dense.zStride();
+    const Coord& denseMin = dense.bbox().min();
+    const DenseValueType* srcZ = dense.data() + zStride * (bbox.min()[2] - denseMin[2]);
+    const Int32 offZ = bbox.min()[2] & (DIM-1u);
+    for (Int32 x = bbox.min()[0], ex = bbox.max()[0] + 1; x < ex; ++x) {
+        const DenseValueType* srcX = srcZ + xStride * (x - denseMin[0]);
+        const Int32 offX = offZ + ((x & (DIM-1u)) << 2*LOG2DIM);
+        for (Int32 y = bbox.min()[1], ey = bbox.max()[1] + 1; y < ey; ++y) {
+            const DenseValueType* src = srcX + yStride * (y - denseMin[1]);
+            Int32 n = offX + ((y & (DIM-1u)) << LOG2DIM);
+            for (Int32 z = bbox.min()[2], ez = bbox.max()[2]+1; z < ez; ++z, ++n, src += zStride) {
+                // Note: if tolerance is true (i.e., 1), then all boolean values compare equal.
+                if (!tolerance && (background != Local::toBool(*src))) {
+                    mValueMask.setOn(n);
+                    mBuffer.mData.set(n, Local::toBool(*src));
+                } else {
+                    mValueMask.setOff(n);
+                    mBuffer.mData.set(n, background);
+                }
+            }
+        }
+    }
+}
+
+////////////////////////////////////////
+
+
+/// Combine each voxel of this node (A) with the matching voxel of @a other (B) through @a op.
+template<Index Log2Dim>
+template<typename CombineOp>
+inline void
+LeafNode<bool, Log2Dim>::combine(const LeafNode& other, CombineOp& op)
+{
+    CombineArgs<bool> args;
+    for (Index n = 0; n < SIZE; ++n) {
+        bool a = mBuffer.mData.isOn(n), b = other.mBuffer.mData.isOn(n), out = false;
+        op(args.setARef(a)
+            .setAIsActive(mValueMask.isOn(n))
+            .setBRef(b)
+            .setBIsActive(other.valueMask().isOn(n))
+            .setResultRef(out));
+        mBuffer.mData.set(n, out);
+        mValueMask.set(n, args.resultIsActive());
+    }
+}
+
+/// Combine each voxel of this node (A) with the constant @a value (B) through @a op, in place.
+template<Index Log2Dim>
+template<typename CombineOp>
+inline void
+LeafNode<bool, Log2Dim>::combine(bool value, bool valueIsActive, CombineOp& op)
+{
+    CombineArgs<bool> args;
+    args.setBRef(value).setBIsActive(valueIsActive);
+    for (Index n = 0; n < SIZE; ++n) {
+        bool a = mBuffer.mData.isOn(n), out = false;
+        op(args.setARef(a)
+            .setAIsActive(mValueMask.isOn(n))
+            .setResultRef(out));
+        mBuffer.mData.set(n, out);
+        mValueMask.set(n, args.resultIsActive());
+    }
+}
+
+////////////////////////////////////////
+
+
+/// Store in this node the result of @a op on each voxel of @a other (A) and @a value (B).
+template<Index Log2Dim>
+template<typename CombineOp, typename OtherType>
+inline void
+LeafNode<bool, Log2Dim>::combine2(const LeafNode& other, const OtherType& value,
+    bool valueIsActive, CombineOp& op)
+{
+    CombineArgs<bool, OtherType> args;
+    args.setBRef(value).setBIsActive(valueIsActive);
+    for (Index n = 0; n < SIZE; ++n) {
+        bool a = other.mBuffer.mData.isOn(n), out = false;
+        op(args.setARef(a)
+            .setAIsActive(other.valueMask().isOn(n))
+            .setResultRef(out));
+        mBuffer.mData.set(n, out);
+        mValueMask.set(n, args.resultIsActive());
+    }
+}
+
+/// Store in this node the result of @a op on @a value (A) and each voxel of @a other (B).
+template<Index Log2Dim>
+template<typename CombineOp, typename OtherNodeT>
+inline void
+LeafNode<bool, Log2Dim>::combine2(bool value, const OtherNodeT& other,
+    bool valueIsActive, CombineOp& op)
+{
+    CombineArgs<bool, typename OtherNodeT::ValueType> args;
+    args.setARef(value).setAIsActive(valueIsActive);
+    for (Index n = 0; n < SIZE; ++n) {
+        bool b = other.mBuffer.mData.isOn(n), out = false;
+        op(args.setBRef(b)
+            .setBIsActive(other.valueMask().isOn(n))
+            .setResultRef(out));
+        mBuffer.mData.set(n, out);
+        mValueMask.set(n, args.resultIsActive());
+    }
+}
+
+/// Store in this node the result of @a op on matching voxels of @a b0 (A) and @a b1 (B).
+template<Index Log2Dim>
+template<typename CombineOp, typename OtherNodeT>
+inline void
+LeafNode<bool, Log2Dim>::combine2(const LeafNode& b0, const OtherNodeT& b1, CombineOp& op)
+{
+    CombineArgs<bool, typename OtherNodeT::ValueType> args;
+    for (Index n = 0; n < SIZE; ++n) {
+        // Default behavior: output voxel is active if either input voxel is active.
+        mValueMask.set(n, b0.valueMask().isOn(n) || b1.valueMask().isOn(n));
+
+        bool a = b0.mBuffer.mData.isOn(n), b = b1.mBuffer.mData.isOn(n), out = false;
+        op(args.setARef(a)
+            .setAIsActive(b0.valueMask().isOn(n))
+            .setBRef(b)
+            .setBIsActive(b1.valueMask().isOn(n))
+            .setResultRef(out));
+        mBuffer.mData.set(n, out);
+        mValueMask.set(n, args.resultIsActive());
+    }
+}
+
+////////////////////////////////////////
+
+/// Pass to @a op either one unit-sized box per active voxel or this node's whole bounding box.
+template<Index Log2Dim>
+template<typename BBoxOp>
+inline void
+LeafNode<bool, Log2Dim>::visitActiveBBox(BBoxOp& op) const
+{
+    if (!op.template descent<LEVEL>()) {
+#ifdef _MSC_VER
+        op.operator()<LEVEL>(this->getNodeBoundingBox());
+#else
+        op.template operator()<LEVEL>(this->getNodeBoundingBox());
+#endif
+        return;
+    }
+    for (ValueOnCIter it = this->cbeginValueOn(); it; ++it) {
+#ifdef _MSC_VER
+        op.operator()<LEVEL>(CoordBBox::createCube(it.getCoord(), 1));
+#else
+        op.template operator()<LEVEL>(CoordBBox::createCube(it.getCoord(), 1));
+#endif
+    }
+}
+
+/// Run @a op over this node through doVisit(), using the non-const ChildAllIter.
+template<Index Log2Dim>
+template<typename VisitorOp>
+inline void
+LeafNode<bool, Log2Dim>::visit(VisitorOp& op)
+{
+    doVisit<LeafNode, VisitorOp, ChildAllIter>(*this, op);
+}
+
+/// Run @a op over this const node through doVisit(), using the const ChildAllCIter.
+template<Index Log2Dim>
+template<typename VisitorOp>
+inline void
+LeafNode<bool, Log2Dim>::visit(VisitorOp& op) const
+{
+    doVisit<const LeafNode, VisitorOp, ChildAllCIter>(*this, op);
+}
+
+/// Call @a op once for every position of the child-all iterator of @a self.
+template<Index Log2Dim>
+template<typename NodeT, typename VisitorOp, typename ChildAllIterT>
+inline void
+LeafNode<bool, Log2Dim>::doVisit(NodeT& self, VisitorOp& op)
+{
+    // op receives the iterator itself rather than the value it points at.
+    ChildAllIterT it = self.beginChildAll();
+    for ( ; it; ++it) op(it);
+}
+
+////////////////////////////////////////
+
+
+/// Visit this node and @a other in tandem (forwards to doVisit2Node() with ChildAllIter).
+template<Index Log2Dim>
+template<typename OtherLeafNodeType, typename VisitorOp>
+inline void
+LeafNode<bool, Log2Dim>::visit2Node(OtherLeafNodeType& other, VisitorOp& op)
+{
+    doVisit2Node<LeafNode, OtherLeafNodeType, VisitorOp, ChildAllIter,
+        typename OtherLeafNodeType::ChildAllIter>(*this, other, op);
+}
+
+/// Const variant: visit this node and @a other in tandem using the ChildAllCIter types.
+template<Index Log2Dim>
+template<typename OtherLeafNodeType, typename VisitorOp>
+inline void
+LeafNode<bool, Log2Dim>::visit2Node(OtherLeafNodeType& other, VisitorOp& op) const
+{
+    doVisit2Node<const LeafNode, OtherLeafNodeType, VisitorOp, ChildAllCIter,
+        typename OtherLeafNodeType::ChildAllCIter>(*this, other, op);
+}
+
+/// Step child-all iterators over @a self and @a other together, calling @a op on each pair.
+template<Index Log2Dim>
+template<
+    typename NodeT,
+    typename OtherNodeT,
+    typename VisitorOp,
+    typename ChildAllIterT,
+    typename OtherChildAllIterT>
+inline void
+LeafNode<bool, Log2Dim>::doVisit2Node(NodeT& self, OtherNodeT& other, VisitorOp& op)
+{
+    // Allow the two nodes to have different ValueTypes, but not different dimensions.
+    BOOST_STATIC_ASSERT(OtherNodeT::SIZE == NodeT::SIZE);
+    BOOST_STATIC_ASSERT(OtherNodeT::LEVEL == NodeT::LEVEL);
+
+    ChildAllIterT lhs = self.beginChildAll();
+    OtherChildAllIterT rhs = other.beginChildAll();
+    while (lhs && rhs) {
+        op(lhs, rhs);
+        ++lhs, ++rhs;
+    }
+}
+
+////////////////////////////////////////
+
+
+/// Pair every position of this node with @a otherIter (forwards to doVisit2(), ChildAllIter).
+template<Index Log2Dim>
+template<typename IterT, typename VisitorOp>
+inline void
+LeafNode<bool, Log2Dim>::visit2(IterT& otherIter, VisitorOp& op, bool otherIsLHS)
+{
+    doVisit2<LeafNode, VisitorOp, ChildAllIter, IterT>(*this, otherIter, op, otherIsLHS);
+}
+
+/// Const variant: pair every position of this node with @a otherIter using ChildAllCIter.
+template<Index Log2Dim>
+template<typename IterT, typename VisitorOp>
+inline void
+LeafNode<bool, Log2Dim>::visit2(IterT& otherIter, VisitorOp& op, bool otherIsLHS) const
+{
+    doVisit2<const LeafNode, VisitorOp, ChildAllCIter, IterT>(*this, otherIter, op, otherIsLHS);
+}
+
+/// Call @a op on @a otherIter paired with each child-all position of @a self.
+template<Index Log2Dim>
+template<
+    typename NodeT,
+    typename VisitorOp,
+    typename ChildAllIterT,
+    typename OtherChildAllIterT>
+inline void
+LeafNode<bool, Log2Dim>::doVisit2(NodeT& self, OtherChildAllIterT& otherIter,
+    VisitorOp& op, bool otherIsLHS)
+{
+    if (!otherIter) return;
+
+    // otherIsLHS only selects the order of op's two arguments; the traversal is the same.
+    for (ChildAllIterT iter = self.beginChildAll(); iter; ++iter) {
+        if (otherIsLHS) {
+            op(otherIter, iter);
+        } else {
+            op(iter, otherIter);
+        }
+    }
+}
+
+} // namespace tree
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TREE_LEAF_NODE_BOOL_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tree/LeafNodeMask.h b/nuparu/include/openvdb_new/tree/LeafNodeMask.h
new file mode 100644
index 00000000..46aa78d3
--- /dev/null
+++ b/nuparu/include/openvdb_new/tree/LeafNodeMask.h
@@ -0,0 +1,1687 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_TREE_LEAF_NODE_MASK_HAS_BEEN_INCLUDED
+#define OPENVDB_TREE_LEAF_NODE_MASK_HAS_BEEN_INCLUDED
+
+#include <iostream>
+#include <boost/shared_ptr.hpp>
+#include <boost/shared_array.hpp>
+#include <boost/static_assert.hpp>
+#include <openvdb/Types.h>
+#include <openvdb/io/Compression.h> // for io::readData(), etc.
+#include <openvdb/math/Math.h> // for math::isZero()
+#include <openvdb/util/NodeMasks.h>
+#include "LeafNode.h"
+#include "Iterator.h"
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tree {        
+
+/// @brief LeafNode specialization for values of type ValueMask that encodes both
+/// the active states and the boolean values of (2^Log2Dim)^3 voxels
+/// in a single bit mask, i.e. voxel values and states are indistinguishable!
+template<Index Log2Dim>
+class LeafNode<ValueMask, Log2Dim>
+{
+public:
+    typedef LeafNode<ValueMask, Log2Dim>    LeafNodeType;
+    typedef boost::shared_ptr<LeafNodeType> Ptr;
+    typedef ValueMask                       BuildType;// this is a rare case where 
+    typedef bool                            ValueType;// value type != build type
+    typedef util::NodeMask<Log2Dim>         NodeMaskType;
+
+    // These static declarations must be on separate lines to avoid VC9 compiler errors.
+    static const Index LOG2DIM    = Log2Dim;    // needed by parent nodes
+    static const Index TOTAL      = Log2Dim;    // needed by parent nodes
+    static const Index DIM        = 1 << TOTAL; // dimension along one coordinate direction
+    static const Index NUM_VALUES = 1 << 3 * Log2Dim;
+    static const Index NUM_VOXELS = NUM_VALUES; // total number of voxels represented by this node
+    static const Index SIZE       = NUM_VALUES;
+    static const Index LEVEL      = 0;          // level 0 = leaf
+
+    /// @brief ValueConverter<T>::Type is the type of a LeafNode having the same
+    /// dimensions as this node but a different value type, T.
+    template<typename OtherValueType>
+    struct ValueConverter {
+        typedef LeafNode<OtherValueType, Log2Dim> Type;
+    };
+
+    /// @brief SameConfiguration<OtherNodeType>::value is @c true if and only if
+    /// OtherNodeType is the type of a LeafNode with the same dimensions as this node.
+    template<typename OtherNodeType>
+    struct SameConfiguration {
+        static const bool value = SameLeafConfig<LOG2DIM, OtherNodeType>::value;
+    };
+
+    class Buffer // storage for this leaf: a single NodeMaskType bit mask (mData)
+    {
+    public:
+        typedef typename NodeMaskType::Word WordType;
+        static const Index WORD_COUNT = NodeMaskType::WORD_COUNT;
+        Buffer() {}
+        explicit Buffer(bool on) : mData(on) {}
+        Buffer(const NodeMaskType& other): mData(other) {} // copies the given mask
+        Buffer(const Buffer& other): mData(other.mData) {}
+        ~Buffer() {}
+        void fill(bool val) { mData.set(val); } // applies val to the whole mask
+        Buffer& operator=(const Buffer& b)
+        {
+            if (&b != this) mData = b.mData; // self-assignment is skipped
+            return *this;
+        }
+
+        const bool& getValue(Index i) const
+        {
+            assert(i < SIZE);
+            // We can't use the ternary operator here, otherwise Visual C++ returns
+            // a reference to a temporary.
+            if (mData.isOn(i)) return LeafNode::sOn;
+            return LeafNode::sOff;
+        }
+        const bool& operator[](Index i) const { return this->getValue(i); }
+
+        bool operator==(const Buffer& other) const { return mData == other.mData; }
+        bool operator!=(const Buffer& other) const { return mData != other.mData; }
+
+        void setValue(Index i, bool val) { assert(i < SIZE); mData.set(i, val); }
+
+        void swap(Buffer& other) { if (&other != this) std::swap(mData, other.mData); }
+
+        Index memUsage() const { return mData.memUsage(); }
+        static Index size() { return SIZE; }
+
+        /// Return a pointer to the c-style array of words encoding the bits.
+        /// @warning This method should only be used by experts that
+        /// seek low-level optimizations.
+        WordType* data()
+        {
+            return &(mData.template getWord<WordType>(0));
+        }
+        /// Return a const pointer to the c-style array of words
+        /// encoding the bits.
+        /// @warning This method should only be used by experts that
+        /// seek low-level optimizations.
+        const WordType* data() const
+        {
+            return const_cast<Buffer*>(this)->data(); // reuses the non-const overload
+        }
+
+    private:
+        friend class ::TestLeaf;
+        // Allow the parent LeafNode to access this Buffer's bit mask.
+        friend class LeafNode;
+
+        NodeMaskType mData;
+    }; // class Buffer
+
+
+    /// Default constructor
+    LeafNode();
+    
+    /// Constructor
+    /// @param xyz     the coordinates of a voxel that lies within the node
+    /// @param value   the initial value = state for all of this node's voxels
+    /// @param dummy   dummy value
+    explicit LeafNode(const Coord& xyz, bool value = false, bool dummy = false);
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    /// "Partial creation" constructor used during file input
+    LeafNode(PartialCreate, const Coord& xyz, bool value = false, bool dummy = false);
+#endif
+
+    /// Deep copy constructor
+    LeafNode(const LeafNode&);
+
+    /// Value conversion copy constructor
+    template<typename OtherValueType>
+    explicit LeafNode(const LeafNode<OtherValueType, Log2Dim>& other);
+
+    /// Topology copy constructor
+    template<typename ValueType>
+    LeafNode(const LeafNode<ValueType, Log2Dim>& other, TopologyCopy);
+
+    //@{
+    /// @brief Topology copy constructor
+    /// @note This variant exists mainly to enable template instantiation.
+    template<typename ValueType>
+    LeafNode(const LeafNode<ValueType, Log2Dim>& other, bool offValue, bool onValue, TopologyCopy);
+    template<typename ValueType>
+    LeafNode(const LeafNode<ValueType, Log2Dim>& other, bool background, TopologyCopy);
+    //@}
+
+    /// Destructor
+    ~LeafNode();
+
+    //
+    // Statistics
+    //
+    /// Return log2 of the size of the buffer storage.
+    static Index log2dim() { return Log2Dim; }
+    /// Return the number of voxels in each dimension.
+    static Index dim() { return DIM; }
+    /// Return the total number of voxels represented by this LeafNode
+    static Index size() { return SIZE; }
+    /// Return the total number of voxels represented by this LeafNode
+    static Index numValues() { return SIZE; }
+    /// Return the level of this node, which by definition is zero for LeafNodes
+    static Index getLevel() { return LEVEL; }
+    /// Append the Log2Dim of this LeafNode to the specified vector
+    static void getNodeLog2Dims(std::vector<Index>& dims) { dims.push_back(Log2Dim); }
+    /// Return the dimension of child nodes of this LeafNode, which is one for voxels.
+    static Index getChildDim() { return 1; }
+    /// Return the leaf count for this node, which is one.
+    static Index32 leafCount() { return 1; }
+    /// Return the non-leaf count for this node, which is zero.
+    static Index32 nonLeafCount() { return 0; }
+
+    /// Return the number of active voxels.
+    Index64 onVoxelCount() const { return mBuffer.mData.countOn(); }
+    /// Return the number of inactive voxels.
+    Index64 offVoxelCount() const { return mBuffer.mData.countOff(); }
+    Index64 onLeafVoxelCount() const { return this->onVoxelCount(); }
+    Index64 offLeafVoxelCount() const { return this->offVoxelCount(); }
+    static Index64 onTileCount()  { return 0; }
+    static Index64 offTileCount() { return 0; }
+
+    /// Return @c true if this node has no active voxels.
+    bool isEmpty() const { return mBuffer.mData.isOff(); }
+    /// Return @c true if this node only contains active voxels.
+    bool isDense() const { return mBuffer.mData.isOn(); }
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    /// @brief Return @c true if memory for this node's buffer has been allocated.
+    /// @details Currently, boolean leaf nodes don't support partial creation,
+    /// so this always returns @c true.
+    bool isAllocated() const { return true; }
+    /// @brief Allocate memory for this node's buffer if it has not already been allocated.
+    /// @details Currently, boolean leaf nodes don't support partial creation,
+    /// so this has no effect.
+    bool allocate() { return true; }
+#endif
+
+    /// Return the memory in bytes occupied by this node.
+    Index64 memUsage() const;
+
+    /// Expand the given bounding box so that it includes this leaf node's active voxels.
+    /// If visitVoxels is false this LeafNode will be approximated as dense, i.e. with all
+    /// voxels active. Else the individual active voxels are visited to produce a tight bbox.
+    void evalActiveBoundingBox(CoordBBox& bbox, bool visitVoxels = true) const;
+
+    /// @brief Return the bounding box of this node, i.e., the full index space
+    /// spanned by this leaf node.
+    CoordBBox getNodeBoundingBox() const { return CoordBBox::createCube(mOrigin, DIM); }
+
+    /// Set the grid index coordinates of this node's local origin.
+    void setOrigin(const Coord& origin) { mOrigin = origin; }
+    //@{
+    /// Return the grid index coordinates of this node's local origin.
+    const Coord& origin() const { return mOrigin; }
+    void getOrigin(Coord& origin) const { origin = mOrigin; }
+    void getOrigin(Int32& x, Int32& y, Int32& z) const { mOrigin.asXYZ(x, y, z); }
+    //@}
+
+    /// Return the linear table offset of the given global or local coordinates.
+    static Index coordToOffset(const Coord& xyz);
+    /// @brief Return the local coordinates for a linear table offset,
+    /// where offset 0 has coordinates (0, 0, 0).
+    static Coord offsetToLocalCoord(Index n);
+    /// Return the global coordinates for a linear table offset.
+    Coord offsetToGlobalCoord(Index n) const;
+
+    /// Return a string representation of this node.
+    std::string str() const;
+
+    /// @brief Return @c true if the given node (which may have a different @c ValueType
+    /// than this node) has the same active value topology as this node.
+    template<typename OtherType, Index OtherLog2Dim>
+    bool hasSameTopology(const LeafNode<OtherType, OtherLog2Dim>* other) const;
+
+    /// Check for buffer equivalence by value.
+    bool operator==(const LeafNode&) const;
+    bool operator!=(const LeafNode&) const;
+
+    //
+    // Buffer management
+    //
+    /// @brief Exchange this node's data buffer with the given data buffer.
+    /// @note The buffer's bit mask also encodes the active states, so they are exchanged too.
+    void swap(Buffer& other) { mBuffer.swap(other); }
+    const Buffer& buffer() const { return mBuffer; }
+    Buffer& buffer() { return mBuffer; }
+
+    //
+    // I/O methods
+    //
+    /// Read in just the topology.
+    void readTopology(std::istream&, bool fromHalf = false);
+    /// Write out just the topology.
+    void writeTopology(std::ostream&, bool toHalf = false) const;
+
+    /// Read in the topology and the origin.
+    void readBuffers(std::istream&, bool fromHalf = false);
+    void readBuffers(std::istream& is, const CoordBBox&, bool fromHalf = false);
+    /// Write out the topology and the origin.
+    void writeBuffers(std::ostream&, bool toHalf = false) const;
+
+    //
+    // Accessor methods
+    //
+    /// Return the value of the voxel at the given coordinates.
+    const bool& getValue(const Coord& xyz) const;
+    /// Return the value of the voxel at the given offset.
+    const bool& getValue(Index offset) const;
+
+    /// @brief Return @c true if the voxel at the given coordinates is active.
+    /// @param xyz       the coordinates of the voxel to be probed
+    /// @param[out] val  the value of the voxel at the given coordinates
+    bool probeValue(const Coord& xyz, bool& val) const;
+
+    /// Return the level (0) at which leaf node values reside.
+    static Index getValueLevel(const Coord&) { return LEVEL; }
+
+    /// Set the active state of the voxel at the given coordinates (its value is the same bit).
+    void setActiveState(const Coord& xyz, bool on);
+    /// Set the active state of the voxel at the given offset (its value is the same bit).
+    void setActiveState(Index offset, bool on) { assert(offset<SIZE); mBuffer.mData.set(offset, on); }
+
+    /// Set the value of the voxel at the given coordinates (its active state is the same bit).
+    void setValueOnly(const Coord& xyz, bool val);
+    /// Set the value of the voxel at the given offset (its active state is the same bit).
+    void setValueOnly(Index offset, bool val) { assert(offset<SIZE); mBuffer.setValue(offset,val); }
+
+    /// Mark the voxel at the given coordinates as inactive (its value is the same bit).
+    void setValueOff(const Coord& xyz) { mBuffer.mData.setOff(this->coordToOffset(xyz)); }
+    /// Mark the voxel at the given offset as inactive (its value is the same bit).
+    void setValueOff(Index offset) { assert(offset < SIZE); mBuffer.mData.setOff(offset); }
+
+    /// Set the value of the voxel at the given coordinates and mark the voxel as inactive.
+    void setValueOff(const Coord& xyz, bool val);
+    /// Set the value of the voxel at the given offset and mark the voxel as inactive.
+    void setValueOff(Index offset, bool val);
+
+    /// Mark the voxel at the given coordinates as active (its value is the same bit).
+    void setValueOn(const Coord& xyz) { mBuffer.mData.setOn(this->coordToOffset(xyz)); }
+    /// Mark the voxel at the given offset as active (its value is the same bit).
+    void setValueOn(Index offset) { assert(offset < SIZE); mBuffer.mData.setOn(offset); }
+
+    /// Set the value of the voxel at the given coordinates and mark the voxel as active.
+    void setValueOn(const Coord& xyz, bool val);
+    /// Set the value of the voxel at the given coordinates and mark the voxel as active.
+    void setValue(const Coord& xyz, bool val) { this->setValueOn(xyz, val); }
+    /// Set the value of the voxel at the given offset and mark the voxel as active.
+    void setValueOn(Index offset, bool val);
+
+    /// @brief Apply a functor to the value of the voxel at the given offset
+    /// and mark the voxel as active.
+    template<typename ModifyOp>
+    void modifyValue(Index offset, const ModifyOp& op);
+    /// @brief Apply a functor to the value of the voxel at the given coordinates
+    /// and mark the voxel as active.
+    template<typename ModifyOp>
+    void modifyValue(const Coord& xyz, const ModifyOp& op);
+
+    /// Apply a functor to the voxel at the given coordinates.
+    template<typename ModifyOp>
+    void modifyValueAndActiveState(const Coord& xyz, const ModifyOp& op);
+
+    /// Turn on every bit of the mask, which holds both the values and the active states.
+    void setValuesOn() { mBuffer.mData.setOn(); }
+    /// Turn off every bit of the mask, which holds both the values and the active states.
+    void setValuesOff() { mBuffer.mData.setOff(); }
+
+    /// Return @c true if the voxel at the given coordinates is active.
+    bool isValueOn(const Coord& xyz) const { return mBuffer.mData.isOn(this->coordToOffset(xyz)); }
+    /// Return @c true if the voxel at the given offset is active.
+    bool isValueOn(Index offset) const { assert(offset < SIZE); return mBuffer.mData.isOn(offset); }
+
+    /// Return @c false since leaf nodes never contain tiles.
+    static bool hasActiveTiles() { return false; }
+
+    /// Set all voxels that lie outside the given axis-aligned box to the background.
+    void clip(const CoordBBox&, bool background);
+
+    /// Set all voxels within an axis-aligned box to the specified value and active state.
+    void fill(const CoordBBox& bbox, bool value, bool dummy = false);
+    
+    /// Set the state of all voxels to the specified active state.
+    void fill(const bool& value, bool dummy = false);
+
+    /// @brief Copy into a dense grid the values of the voxels that lie within
+    /// a given bounding box.
+    ///
+    /// @param bbox   inclusive bounding box of the voxels to be copied into the dense grid
+    /// @param dense  dense grid with a stride in @e z of one (see tools::Dense
+    ///               in tools/Dense.h for the required API)
+    ///
+    /// @note @a bbox is assumed to be identical to or contained in the coordinate domains
+    /// of both the dense grid and this node, i.e., no bounds checking is performed.
+    /// @note Consider using tools::CopyToDense in tools/Dense.h
+    /// instead of calling this method directly.
+    template<typename DenseT>
+    void copyToDense(const CoordBBox& bbox, DenseT& dense) const;
+
+    /// @brief Copy from a dense grid into this node the values of the voxels
+    /// that lie within a given bounding box.
+    /// @details Only values that are different (by more than the given tolerance)
+    /// from the background value will be active.  Other values are inactive
+    /// and truncated to the background value.
+    ///
+    /// @param bbox        inclusive bounding box of the voxels to be copied into this node
+    /// @param dense       dense grid with a stride in @e z of one (see tools::Dense
+    ///                    in tools/Dense.h for the required API)
+    /// @param background  background value of the tree that this node belongs to
+    /// @param tolerance   tolerance within which a value equals the background value
+    ///
+    /// @note @a bbox is assumed to be identical to or contained in the coordinate domains
+    /// of both the dense grid and this node, i.e., no bounds checking is performed.
+    /// @note Consider using tools::CopyFromDense in tools/Dense.h
+    /// instead of calling this method directly.
+    template<typename DenseT>
+    void copyFromDense(const CoordBBox& bbox, const DenseT& dense, bool background, bool tolerance);
+
+    /// @brief Return the value of the voxel at the given coordinates.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    const bool& getValueAndCache(const Coord& xyz, AccessorT&) const {return this->getValue(xyz);}
+
+    /// @brief Return @c true if the voxel at the given coordinates is active.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    bool isValueOnAndCache(const Coord& xyz, AccessorT&) const { return this->isValueOn(xyz); }
+
+    /// @brief Change the value of the voxel at the given coordinates and mark it as active.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    void setValueAndCache(const Coord& xyz, bool val, AccessorT&) { this->setValueOn(xyz, val); }
+
+    /// @brief Change the value of the voxel at the given coordinates
+    /// but preserve its state.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    void setValueOnlyAndCache(const Coord& xyz, bool val, AccessorT&) {this->setValueOnly(xyz,val);}
+
+    /// @brief Change the value of the voxel at the given coordinates and mark it as inactive.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    void setValueOffAndCache(const Coord& xyz, bool value, AccessorT&)
+    {
+        this->setValueOff(xyz, value);
+    }
+
+    /// @brief Apply a functor to the value of the voxel at the given coordinates
+    /// and mark the voxel as active.
+    /// @note Used internally by ValueAccessor.
+    template<typename ModifyOp, typename AccessorT>
+    void modifyValueAndCache(const Coord& xyz, const ModifyOp& op, AccessorT&)
+    {
+        this->modifyValue(xyz, op);
+    }
+
+    /// Apply a functor to the voxel at the given coordinates.
+    /// @note Used internally by ValueAccessor.
+    template<typename ModifyOp, typename AccessorT>
+    void modifyValueAndActiveStateAndCache(const Coord& xyz, const ModifyOp& op, AccessorT&)
+    {
+        this->modifyValueAndActiveState(xyz, op);
+    }
+
+    /// @brief Set the active state of the voxel at the given coordinates
+    /// without changing its value.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    void setActiveStateAndCache(const Coord& xyz, bool on, AccessorT&)
+    {
+        this->setActiveState(xyz, on);
+    }
+
+    /// @brief Return @c true if the voxel at the given coordinates is active
+    /// and return the voxel value in @a val.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    bool probeValueAndCache(const Coord& xyz, bool& val, AccessorT&) const
+    {
+        return this->probeValue(xyz, val);
+    }
+
+    /// @brief Return the LEVEL (=0) at which leaf node values reside.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    static Index getValueLevelAndCache(const Coord&, AccessorT&) { return LEVEL; }
+
+    /// @brief Return a const reference to the first entry in the buffer.
+    /// @note Since it's actually a reference to a static data member
+    /// it should not be converted to a non-const pointer!
+    const bool& getFirstValue() const { if (mBuffer.mData.isOn(0)) return sOn; else return sOff; }
+    /// @brief Return a const reference to the last entry in the buffer.
+    /// @note Since it's actually a reference to a static data member
+    /// it should not be converted to a non-const pointer!
+    const bool& getLastValue() const { if (mBuffer.mData.isOn(SIZE-1)) return sOn; else return sOff; }
+
+    /// Return @c true if all of this node's voxels have the same active state
+    /// and are equal to within the given tolerance, and return the value in
+    /// @a constValue and the active state in @a state.
+    bool isConstant(bool& constValue, bool& state, bool tolerance = 0) const;
+    /// Return @c true if all of this node's values are inactive.
+    bool isInactive() const { return mBuffer.mData.isOff(); }
+
+    /// @brief no-op since for this template specialization voxel
+    /// values and states are indistinguishable.
+    void resetBackground(bool, bool) {}
+
+    /// @brief Invert the bits of the voxels, i.e. states and values
+    void negate() { mBuffer.mData.toggle(); }
+
+    template<MergePolicy Policy>
+    void merge(const LeafNode& other, bool bg = false, bool otherBG = false);
+    template<MergePolicy Policy> void merge(bool tileValue, bool tileActive=false);
+
+    /// @brief No-op
+    /// @details This function exists only to enable template instantiation.
+    void voxelizeActiveTiles(bool = true) {}
+
+    /// @brief Union this node's set of active values with the active values
+    /// of the other node, whose @c ValueType may be different. So a
+    /// resulting voxel will be active if either of the original voxels
+    /// were active.
+    ///
+    /// @note This operation modifies only active states, not values.
+    template<typename OtherType>
+    void topologyUnion(const LeafNode<OtherType, Log2Dim>& other);
+
+    /// @brief Intersect this node's set of active values with the active values
+    /// of the other node, whose @c ValueType may be different. So a
+    /// resulting voxel will be active only if both of the original voxels
+    /// were active.
+    ///
+    /// @details The last dummy argument is required to match the signature
+    /// for InternalNode::topologyIntersection.
+    ///
+    /// @note This operation modifies only active states, not
+    /// values. Also note that this operation can result in all voxels
+    /// being inactive so consider subsequently calling prune.
+    template<typename OtherType>
+    void topologyIntersection(const LeafNode<OtherType, Log2Dim>& other, const bool&);
+
+    /// @brief Difference this node's set of active values with the active values
+    /// of the other node, whose @c ValueType may be different. So a
+    /// resulting voxel will be active only if the original voxel is
+    /// active in this LeafNode and inactive in the other LeafNode.
+    ///
+    /// @details The last dummy argument is required to match the signature
+    /// for InternalNode::topologyDifference.
+    ///
+    /// @note This operation modifies only active states, not values.
+    /// Also, because it can deactivate all of this node's voxels,
+    /// consider subsequently calling prune.
+    template<typename OtherType>
+    void topologyDifference(const LeafNode<OtherType, Log2Dim>& other, const bool&);
+
+    template<typename CombineOp>
+    void combine(const LeafNode& other, CombineOp& op);
+    template<typename CombineOp>
+    void combine(bool, bool valueIsActive, CombineOp& op);
+
+    template<typename CombineOp, typename OtherType /*= bool*/>
+    void combine2(const LeafNode& other, const OtherType&, bool valueIsActive, CombineOp&);
+    template<typename CombineOp, typename OtherNodeT /*= LeafNode*/>
+    void combine2(bool, const OtherNodeT& other, bool valueIsActive, CombineOp&);
+    template<typename CombineOp, typename OtherNodeT /*= LeafNode*/>
+    void combine2(const LeafNode& b0, const OtherNodeT& b1, CombineOp&);
+
+    /// @brief Calls the templated functor BBoxOp with bounding box information.
+    /// An additional level argument is provided to the callback.
+    ///
+    /// @note The bounding boxes are guaranteed to be non-overlapping.
+    template<typename BBoxOp> void visitActiveBBox(BBoxOp&) const;
+
+    template<typename VisitorOp> void visit(VisitorOp&);
+    template<typename VisitorOp> void visit(VisitorOp&) const;
+
+    template<typename OtherLeafNodeType, typename VisitorOp>
+    void visit2Node(OtherLeafNodeType& other, VisitorOp&);
+    template<typename OtherLeafNodeType, typename VisitorOp>
+    void visit2Node(OtherLeafNodeType& other, VisitorOp&) const;
+    template<typename IterT, typename VisitorOp>
+    void visit2(IterT& otherIter, VisitorOp&, bool otherIsLHS = false);
+    template<typename IterT, typename VisitorOp>
+    void visit2(IterT& otherIter, VisitorOp&, bool otherIsLHS = false) const;
+
+    //@{
+    /// This function exists only to enable template instantiation.
+    void prune(const ValueType& /*tolerance*/ = zeroVal<ValueType>()) {}
+    void addLeaf(LeafNode*) {}
+    template<typename AccessorT>
+    void addLeafAndCache(LeafNode*, AccessorT&) {}
+    template<typename NodeT>
+    NodeT* stealNode(const Coord&, const ValueType&, bool) { return NULL; }
+    template<typename NodeT>
+    NodeT* probeNode(const Coord&) { return NULL; }
+    template<typename NodeT>
+    const NodeT* probeConstNode(const Coord&) const { return NULL; }
+    template<typename ArrayT> void getNodes(ArrayT&) const {}
+    template<typename ArrayT> void stealNodes(ArrayT&, const ValueType&, bool) {}
+    //@}
+
+    void addTile(Index level, const Coord&, bool val, bool active);
+    void addTile(Index offset, bool val, bool active);
+    template<typename AccessorT>
+    void addTileAndCache(Index level, const Coord&, bool val, bool active, AccessorT&);
+
+    //@{
+    /// @brief Return a pointer to this node.
+    LeafNode* touchLeaf(const Coord&) { return this; }
+    template<typename AccessorT>
+    LeafNode* touchLeafAndCache(const Coord&, AccessorT&) { return this; }
+    LeafNode* probeLeaf(const Coord&) { return this; }
+    template<typename AccessorT>
+    LeafNode* probeLeafAndCache(const Coord&, AccessorT&) { return this; }
+    template<typename NodeT, typename AccessorT>
+    NodeT* probeNodeAndCache(const Coord&, AccessorT&)
+    {
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if (!(boost::is_same<NodeT,LeafNode>::value)) return NULL; // only LeafNode can match
+        return reinterpret_cast<NodeT*>(this); // reached only when NodeT is LeafNode
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+    //@}
+    //@{
+    /// @brief Return a @c const pointer to this node.
+    const LeafNode* probeLeaf(const Coord&) const { return this; }
+    template<typename AccessorT>
+    const LeafNode* probeLeafAndCache(const Coord&, AccessorT&) const { return this; }
+    const LeafNode* probeConstLeaf(const Coord&) const { return this; }
+    template<typename AccessorT>
+    const LeafNode* probeConstLeafAndCache(const Coord&, AccessorT&) const { return this; }
+    template<typename NodeT, typename AccessorT>
+    const NodeT* probeConstNodeAndCache(const Coord&, AccessorT&) const
+    {
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if (!(boost::is_same<NodeT,LeafNode>::value)) return NULL; // only LeafNode can match
+        return reinterpret_cast<const NodeT*>(this); // reached only when NodeT is LeafNode
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+    //@}
+
+    //
+    // Iterators
+    //
+protected:
+    typedef typename NodeMaskType::OnIterator    MaskOnIter;
+    typedef typename NodeMaskType::OffIterator   MaskOffIter;
+    typedef typename NodeMaskType::DenseIterator MaskDenseIter;
+
+    template<typename MaskIterT, typename NodeT, typename ValueT>
+    struct ValueIter:
+        // Built on SparseIteratorBase; it also works as a dense iterator when
+        // MaskIterT is a dense mask iterator type.
+        public SparseIteratorBase<MaskIterT, ValueIter<MaskIterT, NodeT, ValueT>, NodeT, ValueT>
+    {
+        typedef SparseIteratorBase<MaskIterT, ValueIter, NodeT, ValueT> BaseT;
+
+        ValueIter() {}
+        ValueIter(const MaskIterT& iter, NodeT* parent): BaseT(iter, parent) {}
+
+        const bool& getItem(Index pos) const { return this->parent().getValue(pos); }
+        const bool& getValue() const { return this->getItem(this->pos()); }
+
+        // Forwards to the parent's setValueOnly(). Note: can't be called on const iterators.
+        void setItem(Index pos, bool value) const { this->parent().setValueOnly(pos, value); }
+        // Same as setItem() at the current position. Note: can't be called on const iterators.
+        void setValue(bool value) const { this->setItem(this->pos(), value); }
+
+        // Forwards to the parent's modifyValue(). Note: can't be called on const iterators.
+        template<typename ModifyOp>
+        void modifyItem(Index n, const ModifyOp& op) const { this->parent().modifyValue(n, op); }
+        // Same as modifyItem() at the current position. Note: can't be called on const iterators.
+        template<typename ModifyOp>
+        void modifyValue(const ModifyOp& op) const { this->modifyItem(this->pos(), op); }
+    };
+
+    /// Leaf nodes have no children, so this child iterator type adds only constructors.
+    template<typename MaskIterT, typename NodeT>
+    struct ChildIter:
+        public SparseIteratorBase<MaskIterT, ChildIter<MaskIterT, NodeT>, NodeT, bool>
+    {
+        ChildIter() {}
+        ChildIter(const MaskIterT& iter, NodeT* parent): SparseIteratorBase<
+            MaskIterT, ChildIter<MaskIterT, NodeT>, NodeT, bool>(iter, parent) {} // pass-through
+    };
+
+    template<typename NodeT, typename ValueT>
+    struct DenseIter: public DenseIteratorBase<
+        MaskDenseIter, DenseIter<NodeT, ValueT>, NodeT, /*ChildT=*/void, ValueT>
+    {
+        typedef DenseIteratorBase<MaskDenseIter, DenseIter, NodeT, void, ValueT> BaseT;
+        typedef typename BaseT::NonConstValueType NonConstValueT;
+
+        DenseIter() {}
+        DenseIter(const MaskDenseIter& iter, NodeT* parent): BaseT(iter, parent) {}
+
+        bool getItem(Index pos, void*& child, NonConstValueT& value) const
+        {
+            value = this->parent().getValue(pos); // always report the voxel value
+            child = NULL; // a leaf node never has a child to hand back
+            return false; // no child
+        }
+
+        // Note: setItem() can't be called on const iterators.
+        //void setItem(Index pos, void* child) const {}
+
+        // Forwards to the parent's setValueOnly(). Note: can't be called on const iterators.
+        void unsetItem(Index pos, const ValueT& val) const {this->parent().setValueOnly(pos, val);}
+    };
+
+public:
+    typedef ValueIter<MaskOnIter, LeafNode, const bool>           ValueOnIter;
+    typedef ValueIter<MaskOnIter, const LeafNode, const bool>     ValueOnCIter;
+    typedef ValueIter<MaskOffIter, LeafNode, const bool>          ValueOffIter;
+    typedef ValueIter<MaskOffIter, const LeafNode, const bool>    ValueOffCIter;
+    typedef ValueIter<MaskDenseIter, LeafNode, const bool>        ValueAllIter;
+    typedef ValueIter<MaskDenseIter, const LeafNode, const bool>  ValueAllCIter;
+    typedef ChildIter<MaskOnIter, LeafNode>                       ChildOnIter;
+    typedef ChildIter<MaskOnIter, const LeafNode>                 ChildOnCIter;
+    typedef ChildIter<MaskOffIter, LeafNode>                      ChildOffIter;
+    typedef ChildIter<MaskOffIter, const LeafNode>                ChildOffCIter;
+    typedef DenseIter<LeafNode, bool>                             ChildAllIter;
+    typedef DenseIter<const LeafNode, const bool>                 ChildAllCIter;
+
+    ValueOnCIter  cbeginValueOn() const { return ValueOnCIter(mBuffer.mData.beginOn(), this); }
+    ValueOnCIter   beginValueOn() const { return ValueOnCIter(mBuffer.mData.beginOn(), this); }
+    ValueOnIter    beginValueOn() { return ValueOnIter(mBuffer.mData.beginOn(), this); }
+    ValueOffCIter cbeginValueOff() const { return ValueOffCIter(mBuffer.mData.beginOff(), this); }
+    ValueOffCIter  beginValueOff() const { return ValueOffCIter(mBuffer.mData.beginOff(), this); }
+    ValueOffIter   beginValueOff() { return ValueOffIter(mBuffer.mData.beginOff(), this); }
+    ValueAllCIter cbeginValueAll() const { return ValueAllCIter(mBuffer.mData.beginDense(), this); }
+    ValueAllCIter  beginValueAll() const { return ValueAllCIter(mBuffer.mData.beginDense(), this); }
+    ValueAllIter   beginValueAll() { return ValueAllIter(mBuffer.mData.beginDense(), this); }
+
+    ValueOnCIter  cendValueOn() const { return ValueOnCIter(mBuffer.mData.endOn(), this); }
+    ValueOnCIter   endValueOn() const { return ValueOnCIter(mBuffer.mData.endOn(), this); }
+    ValueOnIter    endValueOn() { return ValueOnIter(mBuffer.mData.endOn(), this); }
+    ValueOffCIter cendValueOff() const { return ValueOffCIter(mBuffer.mData.endOff(), this); }
+    ValueOffCIter  endValueOff() const { return ValueOffCIter(mBuffer.mData.endOff(), this); }
+    ValueOffIter   endValueOff() { return ValueOffIter(mBuffer.mData.endOff(), this); }
+    ValueAllCIter cendValueAll() const { return ValueAllCIter(mBuffer.mData.endDense(), this); }
+    ValueAllCIter  endValueAll() const { return ValueAllCIter(mBuffer.mData.endDense(), this); }
+    ValueAllIter   endValueAll() { return ValueAllIter(mBuffer.mData.endDense(), this); }
+
+    // Note that [c]beginChildOn() and [c]beginChildOff() actually return end iterators,
+    // because leaf nodes have no children.
+    ChildOnCIter  cbeginChildOn() const { return ChildOnCIter(mBuffer.mData.endOn(), this); }
+    ChildOnCIter   beginChildOn() const { return ChildOnCIter(mBuffer.mData.endOn(), this); }
+    ChildOnIter    beginChildOn() { return ChildOnIter(mBuffer.mData.endOn(), this); }
+    ChildOffCIter cbeginChildOff() const { return ChildOffCIter(mBuffer.mData.endOff(), this); }
+    ChildOffCIter  beginChildOff() const { return ChildOffCIter(mBuffer.mData.endOff(), this); }
+    ChildOffIter   beginChildOff() { return ChildOffIter(mBuffer.mData.endOff(), this); }
+    ChildAllCIter cbeginChildAll() const { return ChildAllCIter(mBuffer.mData.beginDense(), this); }
+    ChildAllCIter  beginChildAll() const { return ChildAllCIter(mBuffer.mData.beginDense(), this); }
+    ChildAllIter   beginChildAll() { return ChildAllIter(mBuffer.mData.beginDense(), this); }
+
+    ChildOnCIter  cendChildOn() const { return ChildOnCIter(mBuffer.mData.endOn(), this); }
+    ChildOnCIter   endChildOn() const { return ChildOnCIter(mBuffer.mData.endOn(), this); }
+    ChildOnIter    endChildOn() { return ChildOnIter(mBuffer.mData.endOn(), this); }
+    ChildOffCIter cendChildOff() const { return ChildOffCIter(mBuffer.mData.endOff(), this); }
+    ChildOffCIter  endChildOff() const { return ChildOffCIter(mBuffer.mData.endOff(), this); }
+    ChildOffIter   endChildOff() { return ChildOffIter(mBuffer.mData.endOff(), this); }
+    ChildAllCIter cendChildAll() const { return ChildAllCIter(mBuffer.mData.endDense(), this); }
+    ChildAllCIter  endChildAll() const { return ChildAllCIter(mBuffer.mData.endDense(), this); }
+    ChildAllIter   endChildAll() { return ChildAllIter(mBuffer.mData.endDense(), this); }
+
+    //
+    // Mask accessors
+    //
+    bool isValueMaskOn(Index n) const { return mBuffer.mData.isOn(n); }
+    bool isValueMaskOn() const { return mBuffer.mData.isOn(); }
+    bool isValueMaskOff(Index n) const { return mBuffer.mData.isOff(n); }
+    bool isValueMaskOff() const { return mBuffer.mData.isOff(); }
+    const NodeMaskType& getValueMask() const { return mBuffer.mData; }
+    const NodeMaskType& valueMask() const { return mBuffer.mData; }
+    NodeMaskType& getValueMask() { return mBuffer.mData; }
+    void setValueMask(const NodeMaskType& mask) { mBuffer.mData = mask; }
+    bool isChildMaskOn(Index) const { return false; } // leaf nodes have no children
+    bool isChildMaskOff(Index) const { return true; }
+    bool isChildMaskOff() const { return true; }
+protected:
+    void setValueMask(Index n, bool on) { mBuffer.mData.set(n, on); }
+    void setValueMaskOn(Index n)  { mBuffer.mData.setOn(n); }
+    void setValueMaskOff(Index n) { mBuffer.mData.setOff(n); }
+
+    /// Compute the origin of the leaf node that contains the voxel with the given coordinates.
+    static void evalNodeOrigin(Coord& xyz) { xyz &= ~(DIM - 1); }
+
+    template<typename NodeT, typename VisitorOp, typename ChildAllIterT>
+    static inline void doVisit(NodeT&, VisitorOp&);
+
+    template<typename NodeT, typename OtherNodeT, typename VisitorOp,
+        typename ChildAllIterT, typename OtherChildAllIterT>
+    static inline void doVisit2Node(NodeT& self, OtherNodeT& other, VisitorOp&);
+
+    template<typename NodeT, typename VisitorOp,
+        typename ChildAllIterT, typename OtherChildAllIterT>
+    static inline void doVisit2(NodeT& self, OtherChildAllIterT&, VisitorOp&, bool otherIsLHS);
+    
+    /// Bitmask representing the values AND state of voxels
+    Buffer mBuffer;
+
+    /// Global grid index coordinates (x,y,z) of the local origin of this node
+    Coord mOrigin;
+
+    // These static declarations must be on separate lines to avoid VC9 compiler errors.
+    static const bool sOn;
+    static const bool sOff;
+
+private:
+    /// @brief During topology-only construction, access is needed
+    /// to protected/private members of other template instances.
+    template<typename, Index> friend class LeafNode;
+
+    friend struct ValueIter<MaskOnIter, LeafNode, const bool>; // i.e. ValueOnIter
+    friend struct ValueIter<MaskOffIter, LeafNode, const bool>; // i.e. ValueOffIter
+    friend struct ValueIter<MaskDenseIter, LeafNode, const bool>; // i.e. ValueAllIter
+    friend struct ValueIter<MaskOnIter, const LeafNode, const bool>; // i.e. ValueOnCIter
+    friend struct ValueIter<MaskOffIter, const LeafNode, const bool>; // i.e. ValueOffCIter
+    friend struct ValueIter<MaskDenseIter, const LeafNode, const bool>; // i.e. ValueAllCIter
+
+    //@{
+    /// Allow iterators to call mask accessor methods (see above).
+    /// @todo Make mask accessors public?
+    friend class IteratorBase<MaskOnIter, LeafNode>;
+    friend class IteratorBase<MaskOffIter, LeafNode>;
+    friend class IteratorBase<MaskDenseIter, LeafNode>;
+    //@}
+
+}; // class LeafNode<ValueMask>
+
+
+/// @internal For consistency with other nodes and with iterators, methods like
+/// LeafNode::getValue() return a reference to a value.  Since it's not possible
+/// to return a reference to a bit in a node mask, we return a reference to one
+/// of the following static values instead.
+template<Index Log2Dim> const bool LeafNode<ValueMask, Log2Dim>::sOn = true;
+template<Index Log2Dim> const bool LeafNode<ValueMask, Log2Dim>::sOff = false;
+
+
+////////////////////////////////////////
+
+
+template<Index Log2Dim>
+inline
+LeafNode<ValueMask, Log2Dim>::LeafNode()
+  : mOrigin(0, 0, 0) // mBuffer is not listed, so it is default-constructed
+{
+}
+
+template<Index Log2Dim>
+inline
+LeafNode<ValueMask, Log2Dim>::LeafNode(const Coord& xyz, bool value, bool)
+    : mBuffer(value) // the unnamed trailing bool parameter is ignored
+    , mOrigin(xyz & (~(DIM - 1))) // same masking as evalNodeOrigin()
+{
+}
+
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+template<Index Log2Dim>
+inline
+LeafNode<ValueMask, Log2Dim>::LeafNode(PartialCreate, const Coord& xyz, bool value, bool)
+    : mBuffer(value) // initialized exactly as in the (xyz, value, bool) constructor
+    , mOrigin(xyz & (~(DIM - 1))) // same masking as evalNodeOrigin()
+{
+}
+#endif
+
+
+template<Index Log2Dim>
+inline
+LeafNode<ValueMask, Log2Dim>::LeafNode(const LeafNode &other)
+    : mBuffer(other.mBuffer) // copy of the other node's buffer
+    , mOrigin(other.mOrigin) // copy of the other node's origin
+{
+}
+
+
+// Copy-construct from a leaf node with the same Log2Dim but a different ValueType.
+template<Index Log2Dim>
+template<typename ValueT>
+inline
+LeafNode<ValueMask, Log2Dim>::LeafNode(const LeafNode<ValueT, Log2Dim>& other)
+    : mBuffer(other.valueMask()) // only other's value mask is read, not its voxel values
+    , mOrigin(other.origin())
+{
+}
+
+
+template<Index Log2Dim>
+template<typename ValueT>
+inline
+LeafNode<ValueMask, Log2Dim>::LeafNode(const LeafNode<ValueT, Log2Dim>& other,
+                                         bool, TopologyCopy)
+    : mBuffer(other.valueMask())// value = active state; the unnamed bool argument is ignored
+    , mOrigin(other.origin())
+{
+}
+
+
+template<Index Log2Dim>
+template<typename ValueT>
+inline
+LeafNode<ValueMask, Log2Dim>::LeafNode(const LeafNode<ValueT, Log2Dim>& other, TopologyCopy)
+    : mBuffer(other.valueMask())// value = active state, taken from other's value mask
+    , mOrigin(other.origin())
+{
+}
+
+
+template<Index Log2Dim>
+template<typename ValueT>
+inline
+LeafNode<ValueMask, Log2Dim>::LeafNode(const LeafNode<ValueT, Log2Dim>& other,
+                                         bool offValue, bool onValue, TopologyCopy)
+    : mBuffer(other.valueMask())
+    , mOrigin(other.origin())
+{
+    // mBuffer starts out as a copy of other's value mask; when offValue is
+    // false it is left exactly as copied, whatever onValue is.
+    if (offValue && onValue) {
+        mBuffer.mData.setOn(); // both values true: turn every bit on
+    } else if (offValue) {
+        mBuffer.mData.toggle(); // offValue true, onValue false: invert the copied mask
+    }
+}
+
+
+template<Index Log2Dim>
+inline
+LeafNode<ValueMask, Log2Dim>::~LeafNode()
+{ // empty body: mBuffer and mOrigin are members held by value
+}
+
+
+////////////////////////////////////////
+
+
+template<Index Log2Dim>
+inline Index64
+LeafNode<ValueMask, Log2Dim>::memUsage() const
+{
+    return mBuffer.memUsage() + sizeof(mOrigin); // buffer's own figure plus the origin member
+}
+
+
+template<Index Log2Dim>
+inline void
+LeafNode<ValueMask, Log2Dim>::evalActiveBoundingBox(CoordBBox& bbox, bool visitVoxels) const
+{
+    CoordBBox nodeBBox = this->getNodeBoundingBox();
+    if (bbox.isInside(nodeBBox)) return; // bbox already encloses this whole node
+    ValueOnCIter onIter = this->cbeginValueOn();
+    if (!onIter) return; // no active voxels, so nothing to add to bbox
+    if (visitVoxels) { // tighten nodeBBox to the active voxels only
+        nodeBBox.reset();
+        while (onIter) { nodeBBox.expand(this->offsetToLocalCoord(onIter.pos())); ++onIter; }
+        nodeBBox.translate(this->origin()); // shift the local box by this node's origin
+    }
+    bbox.expand(nodeBBox);
+}
+
+
+template<Index Log2Dim>
+template<typename OtherType, Index OtherLog2Dim>
+inline bool
+LeafNode<ValueMask, Log2Dim>::hasSameTopology(const LeafNode<OtherType, OtherLog2Dim>* other) const
+{
+    assert(other); // other is dereferenced below, so it must not be null
+    return (Log2Dim == OtherLog2Dim && mBuffer.mData == other->getValueMask());
+}
+
+
+template<Index Log2Dim>
+inline std::string
+LeafNode<ValueMask, Log2Dim>::str() const
+{
+    std::ostringstream out;
+    out << "LeafNode @" << mOrigin << ": "; // header, followed by one character per voxel
+    for (Index32 i = 0; i != SIZE; ++i) out << (mBuffer.mData.isOn(i) ? '#' : '.');
+    return out.str();
+}
+
+
+////////////////////////////////////////
+
+
+template<Index Log2Dim>
+inline Index
+LeafNode<ValueMask, Log2Dim>::coordToOffset(const Coord& xyz)
+{
+    assert ((xyz[0] & (DIM-1u)) < DIM && (xyz[1] & (DIM-1u)) < DIM && (xyz[2] & (DIM-1u)) < DIM);
+    return ((xyz[0] & (DIM-1u)) << 2*Log2Dim) // x: masked with DIM-1, shifted up by 2*Log2Dim bits
+         + ((xyz[1] & (DIM-1u)) << Log2Dim)   // y: masked with DIM-1, shifted up by Log2Dim bits
+         +  (xyz[2] & (DIM-1u));              // z: masked with DIM-1, left in the low bits
+}
+
+
+template<Index Log2Dim>
+inline Coord
+LeafNode<ValueMask, Log2Dim>::offsetToLocalCoord(Index n)
+{
+    assert(n < (1 << 3*Log2Dim));
+    Coord xyz;
+    xyz.setX(n >> 2*Log2Dim); // x is everything above the low 2*Log2Dim bits
+    n &= ((1 << 2*Log2Dim) - 1); // keep only the low 2*Log2Dim bits (y and z)
+    xyz.setY(n >> Log2Dim); // y is the upper half of what remains
+    xyz.setZ(n & ((1 << Log2Dim) - 1)); // z is the low Log2Dim bits
+    return xyz;
+}
+
+
+template<Index Log2Dim>
+inline Coord
+LeafNode<ValueMask, Log2Dim>::offsetToGlobalCoord(Index n) const
+{
+    return (this->offsetToLocalCoord(n) + this->origin()); // local coordinate plus node origin
+}
+
+
+////////////////////////////////////////
+
+
+template<Index Log2Dim>
+inline void
+LeafNode<ValueMask, Log2Dim>::readTopology(std::istream& is, bool /*fromHalf*/)
+{
+    mBuffer.mData.load(is); // only the mask is read; the fromHalf argument is unused
+}
+
+
+template<Index Log2Dim>
+inline void
+LeafNode<ValueMask, Log2Dim>::writeTopology(std::ostream& os, bool /*toHalf*/) const
+{
+    mBuffer.mData.save(os); // only the mask is written; the toHalf argument is unused
+}
+
+
+template<Index Log2Dim>
+inline void
+LeafNode<ValueMask, Log2Dim>::readBuffers(std::istream& is, const CoordBBox& clipBBox, bool fromHalf)
+{
+    // No lazy loading here: read the whole node first, then clip it to clipBBox.
+    this->readBuffers(is, fromHalf);
+
+    // Use the background value that io::getGridBackgroundValuePtr() reports
+    // for this stream, if any; otherwise fall back to false.
+    const void* const bgPtr = io::getGridBackgroundValuePtr(is);
+    bool background = false;
+    if (bgPtr != NULL) background = *static_cast<const bool*>(bgPtr);
+
+    // clip() receives the box together with that background value.
+    this->clip(clipBBox, background);
+}
+
+
+template<Index Log2Dim>
+inline void
+LeafNode<ValueMask, Log2Dim>::readBuffers(std::istream& is, bool /*fromHalf*/)
+{
+    // Read in the value mask, which is also the buffer; fromHalf is unused.
+    mBuffer.mData.load(is);
+    // Read in the origin as three raw Coord::ValueType values, mirroring writeBuffers().
+    is.read(reinterpret_cast<char*>(&mOrigin), sizeof(Coord::ValueType) * 3);
+}
+
+
+template<Index Log2Dim>
+inline void
+LeafNode<ValueMask, Log2Dim>::writeBuffers(std::ostream& os, bool /*toHalf*/) const
+{
+    // Write out the value mask, which is also the buffer; toHalf is unused.
+    mBuffer.mData.save(os);
+    // Write out the origin as three raw Coord::ValueType values, mirroring readBuffers().
+    os.write(reinterpret_cast<const char*>(&mOrigin), sizeof(Coord::ValueType) * 3);
+}
+
+
+////////////////////////////////////////
+
+
+template<Index Log2Dim>
+inline bool
+LeafNode<ValueMask, Log2Dim>::operator==(const LeafNode& other) const
+{
+    return mOrigin == other.mOrigin && mBuffer == other.mBuffer; // origin first, then buffer
+}
+
+
+template<Index Log2Dim>
+inline bool
+LeafNode<ValueMask, Log2Dim>::operator!=(const LeafNode& other) const
+{
+    return !(*this == other); // defined as the negation of operator==
+}
+
+
+////////////////////////////////////////
+
+
+template<Index Log2Dim>
+inline bool
+LeafNode<ValueMask, Log2Dim>::isConstant(bool& constValue, bool& state, bool) const
+{
+    // The node is constant only when every mask bit is on or every bit is off.
+    state = mBuffer.mData.isOn();
+    const bool uniform = state || mBuffer.mData.isOff();
+    // Value and state share the mask, so the constant value equals the state.
+    if (uniform) constValue = state;
+    return uniform;
+}
+
+
+////////////////////////////////////////
+
+/// Leaf-level overload: the level argument is ignored and the voxel at @a xyz is updated.
+template<Index Log2Dim>
+inline void
+LeafNode<ValueMask, Log2Dim>::addTile(Index /*level*/, const Coord& xyz, bool val, bool active)
+{
+    this->addTile(this->coordToOffset(xyz), val, active);
+}
+
+template<Index Log2Dim>
+inline void
+LeafNode<ValueMask, Log2Dim>::addTile(Index offset, bool val, bool active)
+{
+    assert(offset < SIZE); // offset must be smaller than SIZE
+    this->setValueOnly(offset, val);
+    this->setActiveState(offset, active); // the active state is applied last
+}
+
+template<Index Log2Dim>
+template<typename AccessorT>
+inline void
+LeafNode<ValueMask, Log2Dim>::addTileAndCache(Index level, const Coord& xyz,
+    bool val, bool active, AccessorT&)
+{
+    this->addTile(level, xyz, val, active); // the accessor argument is not used here
+}
+
+
+////////////////////////////////////////
+
+/// Return a reference to sOn if the voxel at @a xyz is on, otherwise a reference to sOff.
+template<Index Log2Dim>
+inline const bool&
+LeafNode<ValueMask, Log2Dim>::getValue(const Coord& xyz) const
+{
+    // This *CANNOT* use operator ?: -- the original note attributes the restriction to Visual C++.
+    if (mBuffer.mData.isOn(this->coordToOffset(xyz))) return sOn; else return sOff;
+}
+
+/// Return a reference to sOn if the bit at @a offset is on, otherwise a reference to sOff.
+template<Index Log2Dim>
+inline const bool&
+LeafNode<ValueMask, Log2Dim>::getValue(Index offset) const
+{
+    assert(offset < SIZE);
+    // This *CANNOT* use operator ?: -- the original note attributes the restriction to Windows.
+    if (mBuffer.mData.isOn(offset)) return sOn; else return sOff;
+}
+
+/// Set @a val to the mask bit of the voxel at @a xyz and return that same bit.
+template<Index Log2Dim>
+inline bool
+LeafNode<ValueMask, Log2Dim>::probeValue(const Coord& xyz, bool& val) const
+{
+    const Index offset = this->coordToOffset(xyz);
+    val = mBuffer.mData.isOn(offset);
+    return val;
+}
+
+/// Coordinate overload: converts @a xyz to an offset and forwards to setValueOn(Index, bool).
+template<Index Log2Dim>
+inline void
+LeafNode<ValueMask, Log2Dim>::setValueOn(const Coord& xyz, bool val)
+{
+    this->setValueOn(this->coordToOffset(xyz), val);
+}
+
+/// Set the mask bit at @a offset to @a val; note the bit follows val and is not forced on.
+template<Index Log2Dim>
+inline void
+LeafNode<ValueMask, Log2Dim>::setValueOn(Index offset, bool val)
+{
+    assert(offset < SIZE);
+    mBuffer.mData.set(offset, val);
+}
+
+/// Coordinate overload: converts @a xyz to an offset and forwards to the offset-based overload.
+template<Index Log2Dim>
+inline void
+LeafNode<ValueMask, Log2Dim>::setValueOnly(const Coord& xyz, bool val)
+{
+    this->setValueOnly(this->coordToOffset(xyz), val);
+}
+
+/// Set the mask bit of the voxel at @a xyz to @a on.
+template<Index Log2Dim>
+inline void
+LeafNode<ValueMask, Log2Dim>::setActiveState(const Coord& xyz, bool on)
+{
+    mBuffer.mData.set(this->coordToOffset(xyz), on);
+}
+
+/// Coordinate overload: converts @a xyz to an offset and forwards to setValueOff(Index, bool).
+template<Index Log2Dim>
+inline void
+LeafNode<ValueMask, Log2Dim>::setValueOff(const Coord& xyz, bool val)
+{
+    this->setValueOff(this->coordToOffset(xyz), val);
+}
+
+/// Set the mask bit at @a offset to @a val; note the bit follows val and is not forced off.
+template<Index Log2Dim>
+inline void
+LeafNode<ValueMask, Log2Dim>::setValueOff(Index offset, bool val)
+{
+    assert(offset < SIZE);
+    mBuffer.mData.set(offset, val);
+}
+
+/// Apply @a op to a copy of the bit at @a offset, then store the modified copy in the mask.
+template<Index Log2Dim>
+template<typename ModifyOp>
+inline void
+LeafNode<ValueMask, Log2Dim>::modifyValue(Index offset, const ModifyOp& op)
+{
+    bool val = mBuffer.mData.isOn(offset);
+    op(val);
+    mBuffer.mData.set(offset, val);
+}
+
+/// Coordinate overload: converts @a xyz to an offset and forwards to modifyValue(Index, op).
+template<Index Log2Dim>
+template<typename ModifyOp>
+inline void
+LeafNode<ValueMask, Log2Dim>::modifyValue(const Coord& xyz, const ModifyOp& op)
+{
+    this->modifyValue(this->coordToOffset(xyz), op);
+}
+
+/// Call @a op on copies (value, state) of the bit at @a xyz; only the value is written back.
+template<Index Log2Dim>
+template<typename ModifyOp>
+inline void
+LeafNode<ValueMask, Log2Dim>::modifyValueAndActiveState(const Coord& xyz, const ModifyOp& op)
+{
+    const Index offset = this->coordToOffset(xyz);
+    bool val = mBuffer.mData.isOn(offset), state = val;
+    op(val, state);
+    mBuffer.mData.set(offset, val);
+}
+
+
+////////////////////////////////////////
+
+/// Unless Policy is MERGE_NODES, OR @a other's mask into this one (bool arguments unused).
+template<Index Log2Dim>
+template<MergePolicy Policy>
+inline void
+LeafNode<ValueMask, Log2Dim>::merge(const LeafNode& other, bool /*bg*/, bool /*otherBG*/)
+{
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+    if (Policy == MERGE_NODES) return;
+    mBuffer.mData |= other.mBuffer.mData;
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+}
+
+template<Index Log2Dim>
+template<MergePolicy Policy>
+inline void
+LeafNode<ValueMask, Log2Dim>::merge(bool tileValue, bool)
+{
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+    if (Policy != MERGE_ACTIVE_STATES_AND_NODES) return; // every other policy is a no-op
+    if (tileValue) mBuffer.mData.setOn(); // a true tile calls setOn() on the whole mask
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+}
+
+
+////////////////////////////////////////
+
+/// Merge @a other's value mask into this node's mask with operator|=.
+template<Index Log2Dim>
+template<typename OtherType>
+inline void
+LeafNode<ValueMask, Log2Dim>::topologyUnion(const LeafNode<OtherType, Log2Dim>& other)
+{
+    mBuffer.mData |= other.valueMask();
+}
+
+/// Restrict this node's mask to @a other's value mask with operator&=; the bool is unused.
+template<Index Log2Dim>
+template<typename OtherType>
+inline void
+LeafNode<ValueMask, Log2Dim>::topologyIntersection(const LeafNode<OtherType, Log2Dim>& other,
+                                                   const bool&)
+{
+    mBuffer.mData &= other.valueMask();
+}
+
+/// AND this mask with !other.valueMask(), i.e. presumably clear voxels that are on in @a other.
+template<Index Log2Dim>
+template<typename OtherType>
+inline void
+LeafNode<ValueMask, Log2Dim>::topologyDifference(const LeafNode<OtherType, Log2Dim>& other,
+                                            const bool&)
+{
+    mBuffer.mData &= !other.valueMask();
+}
+
+
+////////////////////////////////////////
+
+/// Clip this node to @a clipBBox: voxels outside the box are set to @a background.
+template<Index Log2Dim>
+inline void
+LeafNode<ValueMask, Log2Dim>::clip(const CoordBBox& clipBBox, bool background)
+{
+    CoordBBox nodeBBox = this->getNodeBoundingBox();
+    if (!clipBBox.hasOverlap(nodeBBox)) {
+        // Completely outside the clipping region: fill with background and stop here,
+        this->fill(nodeBBox, background, /*active=*/false);
+        return; // since the masking pass below would only repeat the same fill.
+    }
+    if (clipBBox.isInside(nodeBBox)) {
+        // This node lies completely inside the clipping region.  Leave it intact.
+        return;
+    }
+
+    // This node is only partially inside the clipping region.
+    // Construct a boolean mask that is on inside the clipping region and off outside it.
+    NodeMaskType mask;
+    nodeBBox.intersect(clipBBox);
+    Coord xyz;
+    int &x = xyz.x(), &y = xyz.y(), &z = xyz.z();
+    for (x = nodeBBox.min().x(); x <= nodeBBox.max().x(); ++x) {
+        for (y = nodeBBox.min().y(); y <= nodeBBox.max().y(); ++y) {
+            for (z = nodeBBox.min().z(); z <= nodeBBox.max().z(); ++z) {
+                mask.setOn(static_cast<Index32>(this->coordToOffset(xyz)));
+            }
+        }
+    }
+
+    // Set voxels that lie in the inactive region of the mask (i.e., outside
+    // the clipping region) to the background value.
+    for (MaskOffIter maskIter = mask.beginOff(); maskIter; ++maskIter) {
+        this->setValueOff(maskIter.pos(), background);
+    }
+}
+
+
+////////////////////////////////////////
+
+/// Set the mask bit of every voxel inside @a bbox to @a value; the third argument is unused.
+template<Index Log2Dim>
+inline void
+LeafNode<ValueMask, Log2Dim>::fill(const CoordBBox& bbox, bool value, bool)
+{
+    const Coord &lo = bbox.min(), &hi = bbox.max();
+    for (Int32 i = lo.x(); i <= hi.x(); ++i) {
+        const Index xBits = (i & (DIM-1u)) << 2*Log2Dim; // x contribution to the offset
+        for (Int32 j = lo.y(); j <= hi.y(); ++j) {
+            const Index xyBits = xBits + ((j & (DIM-1u)) << Log2Dim); // plus y contribution
+            for (Int32 k = lo.z(); k <= hi.z(); ++k) {
+                mBuffer.mData.set(xyBits + (k & (DIM-1u)), value);
+            }
+        }
+    }
+}
+
+template<Index Log2Dim>
+inline void
+LeafNode<ValueMask, Log2Dim>::fill(const bool& value, bool)
+{
+    mBuffer.fill(value); // whole-buffer fill; the second argument is unused
+}
+
+
+////////////////////////////////////////
+
+/// Copy this node's mask bits within @a bbox into @a dense, converted to DenseT::ValueType.
+template<Index Log2Dim>
+template<typename DenseT>
+inline void
+LeafNode<ValueMask, Log2Dim>::copyToDense(const CoordBBox& bbox, DenseT& dense) const
+{
+    typedef typename DenseT::ValueType DenseValueType;
+
+    const size_t xStride = dense.xStride(), yStride = dense.yStride(), zStride = dense.zStride();
+    const Coord& min = dense.bbox().min();
+    DenseValueType* t0 = dense.data() + zStride * (bbox.min()[2] - min[2]); // target array
+    const Int32 n0 = bbox.min()[2] & (DIM-1u);
+    for (Int32 x = bbox.min()[0], ex = bbox.max()[0] + 1; x < ex; ++x) {
+        DenseValueType* t1 = t0 + xStride * (x - min[0]);
+        const Int32 n1 = n0 + ((x & (DIM-1u)) << 2*LOG2DIM);
+        for (Int32 y = bbox.min()[1], ey = bbox.max()[1] + 1; y < ey; ++y) {
+            DenseValueType* t2 = t1 + yStride * (y - min[1]);
+            Int32 n2 = n1 + ((y & (DIM-1u)) << LOG2DIM);
+            for (Int32 z = bbox.min()[2], ez = bbox.max()[2] + 1; z < ez; ++z, t2 += zStride) {
+                *t2 = DenseValueType(mBuffer.mData.isOn(n2++));
+            }
+        }
+    }
+}
+
+/// Copy the voxels of @a dense inside @a bbox into this node's mask (nonzero becomes on).
+template<Index Log2Dim>
+template<typename DenseT>
+inline void
+LeafNode<ValueMask, Log2Dim>::copyFromDense(const CoordBBox& bbox, const DenseT& dense,
+                                       bool background, bool tolerance)
+{
+    typedef typename DenseT::ValueType DenseValueType;
+    struct Local {
+        inline static bool toBool(const DenseValueType& v) { return !math::isZero(v); }
+    };
+
+    const size_t xStride = dense.xStride(), yStride = dense.yStride(), zStride = dense.zStride();
+    const Coord& denseMin = dense.bbox().min();
+    const DenseValueType* src = dense.data() + zStride * (bbox.min()[2] - denseMin[2]);
+    const Int32 zBits = bbox.min()[2] & (DIM-1u);
+    for (Int32 x = bbox.min()[0], ex = bbox.max()[0] + 1; x < ex; ++x) {
+        const DenseValueType* srcX = src + xStride * (x - denseMin[0]);
+        const Int32 xzBits = zBits + ((x & (DIM-1u)) << 2*LOG2DIM);
+        for (Int32 y = bbox.min()[1], ey = bbox.max()[1] + 1; y < ey; ++y) {
+            const DenseValueType* p = srcX + yStride * (y - denseMin[1]);
+            Int32 n = xzBits + ((y & (DIM-1u)) << LOG2DIM);
+            for (Int32 z = bbox.min()[2], ez = bbox.max()[2]+1; z < ez; ++z, ++n, p += zStride) {
+                // If tolerance is true (i.e., 1), all boolean values compare equal and the
+                // voxel takes the background value.  Otherwise it takes the dense value,
+                // which also covers the case where that value equals the background.
+                bool bit = background;
+                if (!tolerance) bit = Local::toBool(*p);
+                mBuffer.mData.set(n, bit);
+            }
+        }
+    }
+}
+
+
+////////////////////////////////////////
+
+/// Per voxel, call @a op with A = this node's bit and B = @a other's bit; store the result.
+template<Index Log2Dim>
+template<typename CombineOp>
+inline void
+LeafNode<ValueMask, Log2Dim>::combine(const LeafNode& other, CombineOp& op)
+{
+    CombineArgs<bool> args;
+    for (Index i = 0; i < SIZE; ++i) {
+        bool result = false, aVal = mBuffer.mData.isOn(i), bVal = other.mBuffer.mData.isOn(i);
+        op(args.setARef(aVal)
+            .setAIsActive(aVal)
+            .setBRef(bVal)
+            .setBIsActive(bVal)
+            .setResultRef(result));
+        mBuffer.mData.set(i, result);
+    }
+}
+
+/// Per voxel, call @a op with A = this node's bit and B = @a value; store the result bit.
+template<Index Log2Dim>
+template<typename CombineOp>
+inline void
+LeafNode<ValueMask, Log2Dim>::combine(bool value, bool valueIsActive, CombineOp& op)
+{
+    CombineArgs<bool> args;
+    args.setBRef(value).setBIsActive(valueIsActive);
+    for (Index i = 0; i < SIZE; ++i) {
+        bool result = false, aVal = mBuffer.mData.isOn(i);
+        op(args.setARef(aVal)
+            .setAIsActive(aVal)
+            .setResultRef(result));
+        mBuffer.mData.set(i, result);
+    }
+}
+
+
+////////////////////////////////////////
+
+/// Per voxel, call @a op with A = @a other's bit and B = @a value; store the result here.
+template<Index Log2Dim>
+template<typename CombineOp, typename OtherType>
+inline void
+LeafNode<ValueMask, Log2Dim>::combine2(const LeafNode& other, const OtherType& value,
+    bool valueIsActive, CombineOp& op)
+{
+    CombineArgs<bool, OtherType> args;
+    args.setBRef(value).setBIsActive(valueIsActive);
+    for (Index i = 0; i < SIZE; ++i) {
+        bool result = false, aVal = other.mBuffer.mData.isOn(i);
+        op(args.setARef(aVal)
+            .setAIsActive(aVal)
+            .setResultRef(result));
+        mBuffer.mData.set(i, result);
+    }
+}
+
+/// Per voxel, call @a op with A = @a value and B = @a other's bit; store the result here.
+template<Index Log2Dim>
+template<typename CombineOp, typename OtherNodeT>
+inline void
+LeafNode<ValueMask, Log2Dim>::combine2(bool value, const OtherNodeT& other,
+    bool valueIsActive, CombineOp& op)
+{
+    CombineArgs<bool, typename OtherNodeT::ValueType> args;
+    args.setARef(value).setAIsActive(valueIsActive);
+    for (Index i = 0; i < SIZE; ++i) {
+        bool result = false, bVal = other.mBuffer.mData.isOn(i);
+        op(args.setBRef(bVal)
+            .setBIsActive(bVal)
+            .setResultRef(result));
+        mBuffer.mData.set(i, result);
+    }
+}
+
+/// Per voxel, call @a op with A = @a b0's bit and B = @a b1's bit; store the result here.
+template<Index Log2Dim>
+template<typename CombineOp, typename OtherNodeT>
+inline void
+LeafNode<ValueMask, Log2Dim>::combine2(const LeafNode& b0, const OtherNodeT& b1, CombineOp& op)
+{
+    CombineArgs<bool, typename OtherNodeT::ValueType> args;
+    for (Index i = 0; i < SIZE; ++i) {
+        bool result = false, b0Val = b0.mBuffer.mData.isOn(i), b1Val = b1.mBuffer.mData.isOn(i);
+        op(args.setARef(b0Val)
+            .setAIsActive(b0Val)
+            .setBRef(b1Val)
+            .setBIsActive(b1Val)
+            .setResultRef(result));
+        mBuffer.mData.set(i, result);
+    }
+}
+
+
+////////////////////////////////////////
+/// If op.descent<LEVEL>() is true, pass each ValueOn voxel as a 1-voxel cube; else the node bbox.
+template<Index Log2Dim>
+template<typename BBoxOp>
+inline void
+LeafNode<ValueMask, Log2Dim>::visitActiveBBox(BBoxOp& op) const
+{
+    if (op.template descent<LEVEL>()) {
+        for (ValueOnCIter i=this->cbeginValueOn(); i; ++i) {
+#ifdef _MSC_VER
+            op.operator()<LEVEL>(CoordBBox::createCube(i.getCoord(), 1));
+#else
+            op.template operator()<LEVEL>(CoordBBox::createCube(i.getCoord(), 1));
+#endif
+        }
+    } else {
+#ifdef _MSC_VER
+        op.operator()<LEVEL>(this->getNodeBoundingBox());
+#else
+        op.template operator()<LEVEL>(this->getNodeBoundingBox());
+#endif
+    }
+}
+
+/// Non-const overload: forwards to doVisit() using this node's ChildAllIter.
+template<Index Log2Dim>
+template<typename VisitorOp>
+inline void
+LeafNode<ValueMask, Log2Dim>::visit(VisitorOp& op)
+{
+    doVisit<LeafNode, VisitorOp, ChildAllIter>(*this, op);
+}
+
+/// Const overload: forwards to doVisit() using this node's ChildAllCIter.
+template<Index Log2Dim>
+template<typename VisitorOp>
+inline void
+LeafNode<ValueMask, Log2Dim>::visit(VisitorOp& op) const
+{
+    doVisit<const LeafNode, VisitorOp, ChildAllCIter>(*this, op);
+}
+
+/// Call @a op once for every iterator position produced by self.beginChildAll().
+template<Index Log2Dim>
+template<typename NodeT, typename VisitorOp, typename ChildAllIterT>
+inline void
+LeafNode<ValueMask, Log2Dim>::doVisit(NodeT& self, VisitorOp& op)
+{
+    for (ChildAllIterT iter = self.beginChildAll(); iter; ++iter) {
+        op(iter);
+    }
+}
+
+
+////////////////////////////////////////
+
+/// Non-const overload: forwards to doVisit2Node() using both nodes' ChildAllIter types.
+template<Index Log2Dim>
+template<typename OtherLeafNodeType, typename VisitorOp>
+inline void
+LeafNode<ValueMask, Log2Dim>::visit2Node(OtherLeafNodeType& other, VisitorOp& op)
+{
+    doVisit2Node<LeafNode, OtherLeafNodeType, VisitorOp, ChildAllIter,
+        typename OtherLeafNodeType::ChildAllIter>(*this, other, op);
+}
+
+/// Const overload: forwards to doVisit2Node() using both nodes' ChildAllCIter types.
+template<Index Log2Dim>
+template<typename OtherLeafNodeType, typename VisitorOp>
+inline void
+LeafNode<ValueMask, Log2Dim>::visit2Node(OtherLeafNodeType& other, VisitorOp& op) const
+{
+    doVisit2Node<const LeafNode, OtherLeafNodeType, VisitorOp, ChildAllCIter,
+        typename OtherLeafNodeType::ChildAllCIter>(*this, other, op);
+}
+
+/// Advance both nodes' ChildAll iterators in lockstep, calling @a op on each pair of positions.
+template<Index Log2Dim>
+template<
+    typename NodeT,
+    typename OtherNodeT,
+    typename VisitorOp,
+    typename ChildAllIterT,
+    typename OtherChildAllIterT>
+inline void
+LeafNode<ValueMask, Log2Dim>::doVisit2Node(NodeT& self, OtherNodeT& other, VisitorOp& op)
+{
+    // Allow the two nodes to have different ValueTypes, but not different dimensions.
+    BOOST_STATIC_ASSERT(OtherNodeT::SIZE == NodeT::SIZE);
+    BOOST_STATIC_ASSERT(OtherNodeT::LEVEL == NodeT::LEVEL);
+
+    ChildAllIterT iter = self.beginChildAll();
+    OtherChildAllIterT otherIter = other.beginChildAll();
+
+    for ( ; iter && otherIter; ++iter, ++otherIter) {
+        op(iter, otherIter);
+    }
+}
+
+
+////////////////////////////////////////
+
+/// Non-const overload: forwards to doVisit2() using this node's ChildAllIter.
+template<Index Log2Dim>
+template<typename IterT, typename VisitorOp>
+inline void
+LeafNode<ValueMask, Log2Dim>::visit2(IterT& otherIter, VisitorOp& op, bool otherIsLHS)
+{
+    doVisit2<LeafNode, VisitorOp, ChildAllIter, IterT>(*this, otherIter, op, otherIsLHS);
+}
+
+/// Const overload: forwards to doVisit2() using this node's ChildAllCIter.
+template<Index Log2Dim>
+template<typename IterT, typename VisitorOp>
+inline void
+LeafNode<ValueMask, Log2Dim>::visit2(IterT& otherIter, VisitorOp& op, bool otherIsLHS) const
+{
+    doVisit2<const LeafNode, VisitorOp, ChildAllCIter, IterT>(*this, otherIter, op, otherIsLHS);
+}
+
+/// Pair every ChildAll position of @a self with @a otherIter; otherIsLHS picks argument order.
+template<Index Log2Dim>
+template<
+    typename NodeT,
+    typename VisitorOp,
+    typename ChildAllIterT,
+    typename OtherChildAllIterT>
+inline void
+LeafNode<ValueMask, Log2Dim>::doVisit2(NodeT& self, OtherChildAllIterT& otherIter,
+    VisitorOp& op, bool otherIsLHS)
+{
+    // Nothing to pair against when the other iterator tests false.
+    if (!otherIter) return;
+
+    // This function itself never advances otherIter; only this node's iterator moves.
+    for (ChildAllIterT it = self.beginChildAll(); it; ++it) {
+        if (otherIsLHS) {
+            op(otherIter, it);
+        } else {
+            op(it, otherIter);
+        }
+    }
+}
+
+} // namespace tree
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TREE_LEAF_NODE_MASK_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tree/NodeManager.h b/nuparu/include/openvdb_new/tree/NodeManager.h
new file mode 100644
index 00000000..08cb0683
--- /dev/null
+++ b/nuparu/include/openvdb_new/tree/NodeManager.h
@@ -0,0 +1,1128 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file NodeManager.h
+///
+/// @author Ken Museth
+///
+/// @brief NodeManager produces linear arrays of all tree nodes
+/// allowing for efficient threading and bottom-up processing.
+///
+/// @note A NodeManager can be constructed from a Tree or LeafManager.
+/// The latter is slightly more efficient since the cached leaf nodes will be reused.
+
+#ifndef OPENVDB_TREE_NODEMANAGER_HAS_BEEN_INCLUDED
+#define OPENVDB_TREE_NODEMANAGER_HAS_BEEN_INCLUDED
+
+#include <tbb/parallel_for.h>
+#include <tbb/parallel_reduce.h>
+#include <openvdb/Types.h>
+#include <deque>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tree {
+
+// Produce linear arrays of all tree nodes, to facilitate efficient threading
+// and bottom-up processing.
+template<typename TreeOrLeafManagerT, Index LEVELS = TreeOrLeafManagerT::RootNodeType::LEVEL>
+class NodeManager;
+
+
+////////////////////////////////////////
+
+
+/// @brief This class caches tree nodes of a specific type in a linear array.
+///
+/// @note It is for internal use and should rarely be used directly.
+template<typename NodeT>
+class NodeList
+{
+public:
+    typedef NodeT* value_type;
+    typedef std::deque<value_type> ListT;
+
+    NodeList() {}
+    /// Append @a node to the end of the list.
+    void push_back(NodeT* node) { mList.push_back(node); }
+    /// Return a reference to the node at index @a n (asserts that @a n is in range).
+    NodeT& operator()(size_t n) const { assert(n<mList.size()); return *(mList[n]); }
+    /// Return a reference to the stored pointer at index @a n (asserts that @a n is in range).
+    NodeT*& operator[](size_t n) { assert(n<mList.size()); return mList[n]; }
+    /// Return the number of node pointers in the list.
+    Index64 nodeCount() const { return mList.size(); }
+    /// Remove every node pointer from the list (the nodes themselves are not deleted).
+    void clear() { mList.clear(); }
+    /// Resize the list to @a n entries.
+    void resize(size_t n) { mList.resize(n); }
+
+    class NodeRange
+    {
+    public:
+        /// Construct the range [@a begin, @a end) of indices into @a nodeList.
+        NodeRange(size_t begin, size_t end, const NodeList& nodeList, size_t grainSize=1):
+            mEnd(end), mBegin(begin), mGrainSize(grainSize), mNodeList(nodeList) {}
+        /// Splitting constructor: @a r keeps the lower half, this range takes the upper half.
+        NodeRange(NodeRange& r, tbb::split):
+            mEnd(r.mEnd), mBegin(doSplit(r)), mGrainSize(r.mGrainSize),
+            mNodeList(r.mNodeList) {}
+
+        size_t size() const { return mEnd - mBegin; }
+
+        size_t grainsize() const { return mGrainSize; }
+
+        const NodeList& nodeList() const { return mNodeList; }
+
+        bool empty() const {return !(mBegin < mEnd);}
+        /// A range can be split while it spans more indices than the grain size.
+        bool is_divisible() const {return mGrainSize < this->size();}
+        /// Iterator over the nodes of a NodeRange; it refers to the range and does not copy it.
+        class Iterator
+        {
+        public:
+            Iterator(const NodeRange& range, size_t pos): mRange(&range), mPos(pos)
+            {
+                assert(this->isValid());
+            }
+            Iterator& operator=(const Iterator& other)
+            {
+                mRange = other.mRange; mPos = other.mPos; return *this;
+            }
+            /// Advance to the next node.
+            Iterator& operator++() { ++mPos; return *this; }
+            /// Return a reference to the node to which this iterator is pointing.
+            NodeT& operator*() const { return mRange->mNodeList(mPos); }
+            /// Return a pointer to the node to which this iterator is pointing.
+            NodeT* operator->() const { return &(this->operator*()); }
+            /// Return the index into the list of the current node.
+            size_t pos() const { return mPos; }
+            bool isValid() const { return mPos>=mRange->mBegin && mPos<=mRange->mEnd; }
+            /// Return @c true if this iterator is not yet exhausted.
+            bool test() const { return mPos < mRange->mEnd; }
+            /// Return @c true if this iterator is not yet exhausted.
+            operator bool() const { return this->test(); }
+            /// Return @c true if this iterator is exhausted.
+            bool empty() const { return !this->test(); }
+            bool operator!=(const Iterator& other) const
+            {
+                return (mPos != other.mPos) || (mRange != other.mRange);
+            }
+            bool operator==(const Iterator& other) const { return !(*this != other); }
+            const NodeRange& nodeRange() const { return *mRange; }
+
+        private:
+            const NodeRange* mRange; // a pointer, not a reference, so operator= can rebind it
+            size_t mPos;
+        };// NodeList::NodeRange::Iterator
+
+        Iterator begin() const {return Iterator(*this, mBegin);}
+
+        Iterator end() const {return Iterator(*this, mEnd);}
+
+    private:
+        size_t mEnd, mBegin, mGrainSize;
+        const NodeList& mNodeList;
+        /// Shrink @a r to its lower half and return the midpoint, where the upper half begins.
+        static size_t doSplit(NodeRange& r)
+        {
+            assert(r.is_divisible());
+            size_t middle = r.mBegin + (r.mEnd - r.mBegin) / 2u;
+            r.mEnd = middle;
+            return middle;
+        }
+    };// NodeList::NodeRange
+
+    /// Return a TBB-compatible NodeRange.
+    NodeRange nodeRange(size_t grainsize = 1) const
+    {
+        return NodeRange(0, this->nodeCount(), *this, grainsize);
+    }
+    /// Apply a copy of @a op to every node, using tbb::parallel_for when @a threaded is true.
+    template<typename NodeOp>
+    void foreach(const NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        NodeTransformer<NodeOp> transform(op);
+        transform.run(this->nodeRange(grainSize), threaded);
+    }
+    /// Reduce over all nodes with @a op, using tbb::parallel_reduce when @a threaded is true.
+    template<typename NodeOp>
+    void reduce(NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        NodeReducer<NodeOp> transform(op);
+        transform.run(this->nodeRange(grainSize), threaded);
+    }
+
+private:
+
+    // Private struct of NodeList that performs parallel_for; it holds its own copy of the op
+    template<typename NodeOp>
+    struct NodeTransformer
+    {
+        NodeTransformer(const NodeOp& nodeOp) : mNodeOp(nodeOp)
+        {
+        }
+        void run(const NodeRange& range, bool threaded = true)
+        {
+            threaded ? tbb::parallel_for(range, *this) : (*this)(range);
+        }
+        void operator()(const NodeRange& range) const
+        {
+            for (typename NodeRange::Iterator it = range.begin(); it; ++it) mNodeOp(*it);
+        }
+        const NodeOp mNodeOp;
+    };// NodeList::NodeTransformer
+
+    // Private struct of NodeList that performs parallel_reduce; it points to the caller's op
+    template<typename NodeOp>
+    struct NodeReducer
+    {
+        NodeReducer(NodeOp& nodeOp) : mNodeOp(&nodeOp), mOwnsOp(false)
+        {
+        }
+        NodeReducer(const NodeReducer& other, tbb::split) :
+            mNodeOp(new NodeOp(*(other.mNodeOp), tbb::split())), mOwnsOp(true)
+        {
+        }
+        ~NodeReducer() { if (mOwnsOp) delete mNodeOp; } // only split copies own their op
+        void run(const NodeRange& range, bool threaded = true)
+        {
+            threaded ? tbb::parallel_reduce(range, *this) : (*this)(range);
+        }
+        void operator()(const NodeRange& range)
+        {
+            NodeOp &op = *mNodeOp;
+            for (typename NodeRange::Iterator it = range.begin(); it; ++it) op(*it);
+        }
+        void join(const NodeReducer& other)
+        {
+            mNodeOp->join(*(other.mNodeOp));
+        }
+        NodeOp *mNodeOp;
+        const bool mOwnsOp;
+    };// NodeList::NodeReducer
+
+
+protected:
+    ListT mList;
+};// NodeList
+
+
+/////////////////////////////////////////////
+
+
+/// @brief This class is a link in a chain that each caches tree nodes
+/// of a specific type in a linear array.
+///
+/// @note It is for internal use and should rarely be used directly.
+template<typename NodeT, Index LEVEL>
+class NodeManagerLink
+{
+public:
+    NodeManagerLink() {}
+
+    virtual ~NodeManagerLink() {}
+
+    /// @brief Clear the nodes cached by this link and by every link below it
+    void clear() { mList.clear(); mNext.clear(); }
+
+    /// @brief Cache the nodes reported by @a parent's getNodes() and feed them to the next link
+    /// @note Only nodes added by this call are forwarded, so repeated calls never duplicate.
+    template<typename ParentT, typename TreeOrLeafManagerT>
+    void init(ParentT& parent, TreeOrLeafManagerT& tree)
+    {
+        const size_t first = static_cast<size_t>(mList.nodeCount());
+        parent.getNodes(mList);
+        for (size_t i=first, n=mList.nodeCount(); i<n; ++i) mNext.init(mList(i), tree);
+    }
+
+    /// @brief Clear this link and all links below it, then recache them from @a parent
+    template<typename ParentT>
+    void rebuild(ParentT& parent) { this->clear(); this->append(parent); }
+
+    /// @brief Non-clearing counterpart of rebuild(), called once per parent by the link above
+    template<typename ParentT>
+    void append(ParentT& parent)
+    {
+        const size_t first = static_cast<size_t>(mList.nodeCount());
+        parent.getNodes(mList);
+        for (size_t i=first, n=mList.nodeCount(); i<n; ++i) mNext.append(mList(i));
+    }
+
+    /// @brief Return the number of nodes cached by this link and all links below it
+    Index64 nodeCount() const { return mList.nodeCount() + mNext.nodeCount(); }
+
+    /// @brief Return the number of cached nodes at tree level @a i
+    Index64 nodeCount(Index i) const
+    { return i==NodeT::LEVEL ? mList.nodeCount() : mNext.nodeCount(i); }
+
+    /// @brief Visit the links below first, then this link's own nodes
+    template<typename NodeOp>
+    void foreachBottomUp(const NodeOp& op, bool threaded, size_t grainSize)
+    {
+        mNext.foreachBottomUp(op, threaded, grainSize);
+        mList.foreach(op, threaded, grainSize);
+    }
+
+    /// @brief Visit this link's own nodes first, then the links below
+    template<typename NodeOp>
+    void foreachTopDown(const NodeOp& op, bool threaded, size_t grainSize)
+    {
+        mList.foreach(op, threaded, grainSize);
+        mNext.foreachTopDown(op, threaded, grainSize);
+    }
+
+    template<typename NodeOp>
+    OPENVDB_DEPRECATED void processBottomUp(const NodeOp& op, bool threaded, size_t grainSize)
+    { this->foreachBottomUp<NodeOp>(op, threaded, grainSize); }
+
+    template<typename NodeOp>
+    OPENVDB_DEPRECATED void processTopDown(const NodeOp& op, bool threaded, size_t grainSize)
+    { this->foreachTopDown<NodeOp>(op, threaded, grainSize); }
+
+    /// @brief Reduce over the links below first, then over this link's own nodes
+    template<typename NodeOp>
+    void reduceBottomUp(NodeOp& op, bool threaded, size_t grainSize)
+    {
+        mNext.reduceBottomUp(op, threaded, grainSize);
+        mList.reduce(op, threaded, grainSize);
+    }
+
+    /// @brief Reduce over this link's own nodes first, then over the links below
+    template<typename NodeOp>
+    void reduceTopDown(NodeOp& op, bool threaded, size_t grainSize)
+    {
+        mList.reduce(op, threaded, grainSize);
+        mNext.reduceTopDown(op, threaded, grainSize);
+    }
+
+protected:
+    NodeList<NodeT> mList;
+    NodeManagerLink<typename NodeT::ChildNodeType, LEVEL-1> mNext;
+};// NodeManagerLink class
+
+////////////////////////////////////////
+
+/// @brief Specialization that terminates the chain of cached tree nodes
+///
+/// @note It is for internal use and should rarely be used directly.
+template<typename NodeT>
+class NodeManagerLink<NodeT, 0>
+{
+public:
+    NodeManagerLink() {}
+    virtual ~NodeManagerLink() {}
+
+    /// @brief Clear all the cached tree nodes
+    void clear() { mList.clear(); }
+
+    /// @brief Clear the cache, then refill it with the nodes reported by @a parent's getNodes()
+    template<typename ParentT>
+    void rebuild(ParentT& parent) { mList.clear(); parent.getNodes(mList); }
+
+    /// @brief Add the nodes reported by @a parent's getNodes() without clearing the cache first
+    template<typename ParentT>
+    void append(ParentT& parent) { parent.getNodes(mList); }
+
+    Index64 nodeCount() const { return mList.nodeCount(); }
+
+    Index64 nodeCount(Index) const { return mList.nodeCount(); }
+
+    template<typename NodeOp>
+    void foreachBottomUp(const NodeOp& op, bool threaded, size_t grainSize)
+    { mList.foreach(op, threaded, grainSize); }
+
+    template<typename NodeOp>
+    void foreachTopDown(const NodeOp& op, bool threaded, size_t grainSize)
+    { mList.foreach(op, threaded, grainSize); }
+
+    template<typename NodeOp>
+    OPENVDB_DEPRECATED void processBottomUp(const NodeOp& op, bool threaded, size_t grainSize)
+    { this->foreachBottomUp<NodeOp>(op, threaded, grainSize); }
+
+    template<typename NodeOp>
+    OPENVDB_DEPRECATED void processTopDown(const NodeOp& op, bool threaded, size_t grainSize)
+    { this->foreachTopDown<NodeOp>(op, threaded, grainSize); }
+
+    template<typename NodeOp>
+    void reduceBottomUp(NodeOp& op, bool threaded, size_t grainSize)
+    { mList.reduce(op, threaded, grainSize); }
+
+    template<typename NodeOp>
+    void reduceTopDown(NodeOp& op, bool threaded, size_t grainSize)
+    { mList.reduce(op, threaded, grainSize); }
+
+    /// @brief Take nodes from @a tree if its DEPTH is 2 and NodeT::LEVEL is 0, else from @a parent
+    template<typename ParentT, typename TreeOrLeafManagerT>
+    void init(ParentT& parent, TreeOrLeafManagerT& tree)
+    {
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if (TreeOrLeafManagerT::DEPTH == 2 && NodeT::LEVEL == 0) {
+            tree.getNodes(mList);
+        } else {
+            parent.getNodes(mList);
+        }
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+protected:
+    NodeList<NodeT> mList;
+};// NodeManagerLink class
+
+
+////////////////////////////////////////
+
+
+/// @brief To facilitate threading over the nodes of a tree, cache
+/// node pointers in linear arrays, one for each level of the tree.
+///
+/// @details This implementation works with trees of any depth, but
+/// optimized specializations are provided for the most typical tree depths.
+template<typename TreeOrLeafManagerT, Index _LEVELS>
+class NodeManager
+{
+public:
+    static const Index LEVELS = _LEVELS;
+    BOOST_STATIC_ASSERT(LEVELS > 0);// LEVELS == 0 is handled by a specialization below
+    typedef typename TreeOrLeafManagerT::RootNodeType RootNodeType;
+    BOOST_STATIC_ASSERT(RootNodeType::LEVEL >= LEVELS);
+    /// @brief Bind to the root of @a tree and initialize the chain of node lists from it.
+    NodeManager(TreeOrLeafManagerT& tree) : mRoot(tree.root()) { mChain.init(mRoot, tree); }
+
+    virtual ~NodeManager() {}
+
+    /// @brief Clear all the cached tree nodes
+    void clear() { mChain.clear(); }
+
+    /// @brief Clear and recache all the tree nodes from the
+    /// tree. This is required if tree nodes have been added or removed.
+    void rebuild() { mChain.rebuild(mRoot); }
+
+    /// @brief Return a reference to the root node.
+    const RootNodeType& root() const { return mRoot; }
+
+    /// @brief Return the total number of cached nodes (excluding the root node)
+    Index64 nodeCount() const { return mChain.nodeCount(); }
+
+    /// @brief Return the number of cached nodes at level @a i, where
+    /// 0 corresponds to the lowest level.
+    Index64 nodeCount(Index i) const { return mChain.nodeCount(i); }
+
+    //@{
+    /// @brief   Threaded method that applies a user-supplied functor
+    ///          to the cached nodes and the root (root last if bottom-up, first if top-down).
+    ///
+    /// @param op        user-supplied functor, see examples for interface details.
+    /// @param threaded  optional toggle to disable threading, on by default.
+    /// @param grainSize optional parameter to specify the grainsize
+    ///                  for threading, one by default.
+    ///
+    /// @warning The functor object is deep-copied to create TBB tasks.
+    ///
+    /// @par Example:
+    /// @code
+    /// // Functor to offset all the inactive values of a tree. Note
+    /// // this implementation also illustrates how different
+    /// // computation can be applied to the different node types.
+    /// template<typename TreeT>
+    /// struct OffsetOp
+    /// {
+    ///     typedef typename TreeT::ValueType    ValueT;
+    ///     typedef typename TreeT::RootNodeType RootT;
+    ///     typedef typename TreeT::LeafNodeType LeafT;
+    ///     OffsetOp(const ValueT& v) : mOffset(v) {}
+    ///
+    ///     // Processes the root node. Required by the NodeManager
+    ///     void operator()(RootT& root) const
+    ///     {
+    ///         for (typename RootT::ValueOffIter i = root.beginValueOff(); i; ++i) *i += mOffset;
+    ///     }
+    ///     // Processes the leaf nodes. Required by the NodeManager
+    ///     void operator()(LeafT& leaf) const
+    ///     {
+    ///         for (typename LeafT::ValueOffIter i = leaf.beginValueOff(); i; ++i) *i += mOffset;
+    ///     }
+    ///     // Processes the internal nodes. Required by the NodeManager
+    ///     template<typename NodeT>
+    ///     void operator()(NodeT& node) const
+    ///     {
+    ///         for (typename NodeT::ValueOffIter i = node.beginValueOff(); i; ++i) *i += mOffset;
+    ///     }
+    /// private:
+    ///     const ValueT mOffset;
+    /// };
+    ///
+    /// // usage:
+    /// OffsetOp<FloatTree> op(3.0f);
+    /// tree::NodeManager<FloatTree> nodes(tree);
+    /// nodes.foreachBottomUp(op);
+    ///
+    /// // or if a LeafManager already exists
+    /// typedef tree::LeafManager<FloatTree> T;
+    /// OffsetOp<T> op(3.0f);
+    /// tree::NodeManager<T> nodes(leafManager);
+    /// nodes.foreachBottomUp(op);
+    ///
+    /// @endcode
+    template<typename NodeOp>
+    void foreachBottomUp(const NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        mChain.foreachBottomUp(op, threaded, grainSize);
+        op(mRoot); // the root is visited last
+    }
+    template<typename NodeOp>
+    void foreachTopDown(const NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        op(mRoot); // the root is visited first
+        mChain.foreachTopDown(op, threaded, grainSize);
+    }
+    template<typename NodeOp>
+    OPENVDB_DEPRECATED void processBottomUp(const NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        this->foreachBottomUp<NodeOp>(op, threaded, grainSize); // deprecated name for foreachBottomUp
+    }
+    template<typename NodeOp>
+    OPENVDB_DEPRECATED void processTopDown(const NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        this->foreachTopDown<NodeOp>(op, threaded, grainSize); // deprecated name for foreachTopDown
+    }
+    //@}
+
+    //@{
+    /// @brief   Threaded reduction over the cached nodes and the root with a user-supplied functor
+    ///
+    /// @param op        user-supplied functor, see examples for interface details.
+    /// @param threaded  optional toggle to disable threading, on by default.
+    /// @param grainSize optional parameter to specify the grainsize
+    ///                  for threading, one by default.
+    ///
+    /// @warning The functor object is deep-copied to create TBB tasks.
+    ///
+    /// @par Example:
+    /// @code
+    ///  // Functor to count nodes in a tree
+    ///  template<typename TreeT>
+    ///  struct NodeCountOp
+    ///  {
+    ///      NodeCountOp() : nodeCount(TreeT::DEPTH, 0), totalCount(0)
+    ///      {
+    ///      }
+    ///      NodeCountOp(const NodeCountOp& other, tbb::split) :
+    ///          nodeCount(TreeT::DEPTH, 0), totalCount(0)
+    ///      {
+    ///      }
+    ///      void join(const NodeCountOp& other)
+    ///      {
+    ///          for (size_t i = 0; i < nodeCount.size(); ++i) {
+    ///              nodeCount[i] += other.nodeCount[i];
+    ///          }
+    ///          totalCount += other.totalCount;
+    ///      }
+    ///      // do nothing for the root node
+    ///      void operator()(const typename TreeT::RootNodeType& node)
+    ///      {
+    ///      }
+    ///      // count the internal and leaf nodes
+    ///      template<typename NodeT>
+    ///      void operator()(const NodeT& node)
+    ///      {
+    ///          ++(nodeCount[NodeT::LEVEL]);
+    ///          ++totalCount;
+    ///      }
+    ///      std::vector<openvdb::Index64> nodeCount;
+    ///      openvdb::Index64 totalCount;
+    /// };
+    ///
+    /// // usage:
+    /// NodeCountOp<FloatTree> op;
+    /// tree::NodeManager<FloatTree> nodes(tree);
+    /// nodes.reduceBottomUp(op);
+    ///
+    /// // or if a LeafManager already exists
+    /// NodeCountOp<FloatTree> op;
+    /// typedef tree::LeafManager<FloatTree> T;
+    /// T leafManager(tree);
+    /// tree::NodeManager<T> nodes(leafManager);
+    /// nodes.reduceBottomUp(op);
+    ///
+    /// @endcode
+    template<typename NodeOp>
+    void reduceBottomUp(NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        mChain.reduceBottomUp(op, threaded, grainSize);
+        op(mRoot); // the root is visited last
+    }
+
+    template<typename NodeOp>
+    void reduceTopDown(NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        op(mRoot); // the root is visited first
+        mChain.reduceTopDown(op, threaded, grainSize);
+    }
+    //@}
+
+protected:
+    RootNodeType& mRoot; // reference obtained from tree.root() in the constructor
+    NodeManagerLink<typename RootNodeType::ChildNodeType, LEVELS-1> mChain; // node lists below the root
+
+private:
+    NodeManager(const NodeManager&) {}//disallow copy-construction
+};// NodeManager class
+
+
+////////////////////////////////////////////
+
+
+/// Template specialization of the NodeManager with no caching of nodes (only the root is processed)
+template<typename TreeOrLeafManagerT>
+class NodeManager<TreeOrLeafManagerT, 0>
+{
+public:
+    typedef typename TreeOrLeafManagerT::RootNodeType RootNodeType;
+    static const Index LEVELS = 0;
+
+    NodeManager(TreeOrLeafManagerT& tree) : mRoot(tree.root()) {}
+
+    virtual ~NodeManager() {}
+
+    /// @brief Clear all the cached tree nodes (a no-op: nothing is cached here)
+    void clear() {}
+
+    /// @brief Clear and recache all the tree nodes from the
+    /// tree (a no-op: nothing is cached here).
+    void rebuild() {}
+
+    /// @brief Return a reference to the root node.
+    const RootNodeType& root() const { return mRoot; }
+
+    /// @brief Return the total number of cached nodes (excluding the root node)
+    Index64 nodeCount() const { return 0; }
+    /// @brief Return the number of cached nodes at a given level (always zero here)
+    Index64 nodeCount(Index) const { return 0; }
+    /// @note In all methods below the threading arguments are ignored: only the root is visited.
+    /// They carry the same defaults as in the other NodeManager specializations.
+    template<typename NodeOp>
+    OPENVDB_DEPRECATED void processBottomUp(const NodeOp& op, bool = true, size_t = 1) { op(mRoot); }
+
+    template<typename NodeOp>
+    OPENVDB_DEPRECATED void processTopDown(const NodeOp& op, bool = true, size_t = 1) { op(mRoot); }
+
+    template<typename NodeOp>
+    void foreachBottomUp(const NodeOp& op, bool = true, size_t = 1) { op(mRoot); }
+
+    template<typename NodeOp>
+    void foreachTopDown(const NodeOp& op, bool = true, size_t = 1) { op(mRoot); }
+
+    template<typename NodeOp>
+    void reduceBottomUp(NodeOp& op, bool = true, size_t = 1) { op(mRoot); }
+
+    template<typename NodeOp>
+    void reduceTopDown(NodeOp& op, bool = true, size_t = 1) { op(mRoot); }
+
+protected:
+    RootNodeType& mRoot; // reference obtained from tree.root() in the constructor
+
+private:
+    NodeManager(const NodeManager&) {} // disallow copy-construction
+}; // NodeManager<0>
+
+
+////////////////////////////////////////////
+
+
+/// Template specialization of the NodeManager that caches one level of nodes below the root
+template<typename TreeOrLeafManagerT>
+class NodeManager<TreeOrLeafManagerT, 1>
+{
+public:
+    typedef typename TreeOrLeafManagerT::RootNodeType RootNodeType;
+    BOOST_STATIC_ASSERT(RootNodeType::LEVEL > 0);
+    static const Index LEVELS = 1;
+
+    NodeManager(TreeOrLeafManagerT& tree) : mRoot(tree.root())
+    {
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if (TreeOrLeafManagerT::DEPTH == 2 && NodeT0::LEVEL == 0) {
+            tree.getNodes(mList0); // depth-2 manager type and level-0 nodes: ask the manager itself
+        } else {
+            mRoot.getNodes(mList0); // otherwise collect the nodes through the root
+        }
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+
+    virtual ~NodeManager() {}
+
+    /// @brief Clear all the cached tree nodes
+    void clear() { mList0.clear(); }
+
+    /// @brief Clear and recache all the tree nodes; required after nodes are added or removed.
+    /// @note Always recollects through the root, even where the constructor used tree.getNodes().
+    void rebuild() { mList0.clear(); mRoot.getNodes(mList0); }
+
+    /// @brief Return a reference to the root node.
+    const RootNodeType& root() const { return mRoot; }
+
+    /// @brief Return the total number of cached nodes (excluding the root node)
+    Index64 nodeCount() const { return mList0.nodeCount(); }
+
+    /// @brief Return the number of cached nodes at level @a i, where
+    /// 0 corresponds to the lowest level. Any other level yields 0.
+    Index64 nodeCount(Index i) const { return i==0 ? mList0.nodeCount() : 0; }
+
+    template<typename NodeOp>
+    void foreachBottomUp(const NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        mList0.foreach(op, threaded, grainSize);
+        op(mRoot); // the root is visited last
+    }
+
+    template<typename NodeOp>
+    void foreachTopDown(const NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        op(mRoot); // the root is visited first
+        mList0.foreach(op, threaded, grainSize);
+    }
+    template<typename NodeOp>
+    OPENVDB_DEPRECATED void processBottomUp(const NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        this->foreachBottomUp<NodeOp>(op, threaded, grainSize); // deprecated name for foreachBottomUp
+    }
+
+    template<typename NodeOp>
+    OPENVDB_DEPRECATED void processTopDown(const NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        this->foreachTopDown<NodeOp>(op, threaded, grainSize); // deprecated name for foreachTopDown
+    }
+
+    template<typename NodeOp>
+    void reduceBottomUp(NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        mList0.reduce(op, threaded, grainSize);
+        op(mRoot); // the root is visited last
+    }
+
+    template<typename NodeOp>
+    void reduceTopDown(NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        op(mRoot); // the root is visited first
+        mList0.reduce(op, threaded, grainSize);
+    }
+
+protected:
+    typedef RootNodeType                   NodeT1;
+    typedef typename NodeT1::ChildNodeType NodeT0;// child node type of the root
+    typedef NodeList<NodeT0>               ListT0;
+
+    NodeT1& mRoot; // reference obtained from tree.root() in the constructor
+    ListT0 mList0; // cached nodes of type NodeT0
+
+private:
+    NodeManager(const NodeManager&) {} // disallow copy-construction
+}; // NodeManager<1>
+
+
+////////////////////////////////////////////
+
+
+/// Template specialization of the NodeManager that caches two levels of nodes below the root
+template<typename TreeOrLeafManagerT>
+class NodeManager<TreeOrLeafManagerT, 2>
+{
+public:
+    typedef typename TreeOrLeafManagerT::RootNodeType RootNodeType;
+    BOOST_STATIC_ASSERT(RootNodeType::LEVEL > 1);
+    static const Index LEVELS = 2;
+
+    NodeManager(TreeOrLeafManagerT& tree) : mRoot(tree.root())
+    {
+        mRoot.getNodes(mList1); // upper level first: the lower level may be gathered from it
+
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if (TreeOrLeafManagerT::DEPTH == 2 && NodeT0::LEVEL == 0) {
+            tree.getNodes(mList0); // depth-2 manager type and level-0 nodes: ask the manager itself
+        } else {
+            for (size_t i=0, n=mList1.nodeCount(); i<n; ++i) mList1(i).getNodes(mList0);
+        }
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+
+    virtual ~NodeManager() {}
+
+    /// @brief Clear all the cached tree nodes
+    void clear() { mList0.clear(); mList1.clear(); }
+
+    /// @brief Clear and recache all the tree nodes; required after nodes are added or removed.
+    /// @note Always recollects through the root, even where the constructor used tree.getNodes().
+    void rebuild()
+    {
+        this->clear();
+        mRoot.getNodes(mList1);
+        for (size_t i=0, n=mList1.nodeCount(); i<n; ++i) mList1(i).getNodes(mList0);
+    }
+
+    /// @brief Return a reference to the root node.
+    const RootNodeType& root() const { return mRoot; }
+
+    /// @brief Return the total number of cached nodes (excluding the root node)
+    Index64 nodeCount() const { return mList0.nodeCount() + mList1.nodeCount(); }
+
+    /// @brief Return the number of cached nodes at level @a i, where
+    /// 0 corresponds to the lowest level. Levels other than 0 and 1 yield 0.
+    Index64 nodeCount(Index i) const
+    {
+        return i==0 ? mList0.nodeCount() : i==1 ? mList1.nodeCount() : 0;
+    }
+
+    template<typename NodeOp>
+    void foreachBottomUp(const NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        mList0.foreach(op, threaded, grainSize);
+        mList1.foreach(op, threaded, grainSize);
+        op(mRoot); // the root is visited last
+    }
+
+    template<typename NodeOp>
+    void foreachTopDown(const NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        op(mRoot); // the root is visited first
+        mList1.foreach(op, threaded, grainSize);
+        mList0.foreach(op, threaded, grainSize);
+    }
+
+    template<typename NodeOp>
+    OPENVDB_DEPRECATED void processBottomUp(const NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        this->foreachBottomUp<NodeOp>(op, threaded, grainSize); // deprecated name for foreachBottomUp
+    }
+
+    template<typename NodeOp>
+    OPENVDB_DEPRECATED void processTopDown(const NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        this->foreachTopDown<NodeOp>(op, threaded, grainSize); // deprecated name for foreachTopDown
+    }
+
+    template<typename NodeOp>
+    void reduceBottomUp(NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        mList0.reduce(op, threaded, grainSize);
+        mList1.reduce(op, threaded, grainSize);
+        op(mRoot); // the root is visited last
+    }
+
+    template<typename NodeOp>
+    void reduceTopDown(NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        op(mRoot); // the root is visited first
+        mList1.reduce(op, threaded, grainSize);
+        mList0.reduce(op, threaded, grainSize);
+    }
+
+protected:
+    typedef RootNodeType                   NodeT2;
+    typedef typename NodeT2::ChildNodeType NodeT1;//upper level
+    typedef typename NodeT1::ChildNodeType NodeT0;//lower level
+
+    typedef NodeList<NodeT1>               ListT1;//upper level
+    typedef NodeList<NodeT0>               ListT0;//lower level
+
+    NodeT2& mRoot; // reference obtained from tree.root() in the constructor
+    ListT1 mList1;
+    ListT0 mList0;
+
+private:
+    NodeManager(const NodeManager&) {} // disallow copy-construction
+}; // NodeManager<2>
+
+
+////////////////////////////////////////////
+
+
+/// Template specialization of the NodeManager that caches three levels of nodes below the root
+template<typename TreeOrLeafManagerT>
+class NodeManager<TreeOrLeafManagerT, 3>
+{
+public:
+    typedef typename TreeOrLeafManagerT::RootNodeType RootNodeType;
+    BOOST_STATIC_ASSERT(RootNodeType::LEVEL > 2);
+    static const Index LEVELS = 3;
+
+    NodeManager(TreeOrLeafManagerT& tree) : mRoot(tree.root())
+    {
+        mRoot.getNodes(mList2); // each list is gathered from the nodes of the list above it
+        for (size_t i=0, n=mList2.nodeCount(); i<n; ++i) mList2(i).getNodes(mList1);
+
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if (TreeOrLeafManagerT::DEPTH == 2 && NodeT0::LEVEL == 0) {
+            tree.getNodes(mList0); // depth-2 manager type and level-0 nodes: ask the manager itself
+        } else {
+            for (size_t i=0, n=mList1.nodeCount(); i<n; ++i) mList1(i).getNodes(mList0);
+        }
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+
+    virtual ~NodeManager() {}
+
+    /// @brief Clear all the cached tree nodes
+    void clear() { mList0.clear(); mList1.clear(); mList2.clear(); }
+
+    /// @brief Clear and recache all the tree nodes; required after nodes are added or removed.
+    /// @note Always recollects through the root, even where the constructor used tree.getNodes().
+    void rebuild()
+    {
+        this->clear();
+        mRoot.getNodes(mList2);
+        for (size_t i=0, n=mList2.nodeCount(); i<n; ++i) mList2(i).getNodes(mList1);
+        for (size_t i=0, n=mList1.nodeCount(); i<n; ++i) mList1(i).getNodes(mList0);
+    }
+
+    /// @brief Return a reference to the root node.
+    const RootNodeType& root() const { return mRoot; }
+
+    /// @brief Return the total number of cached nodes (excluding the root node)
+    Index64 nodeCount() const { return mList0.nodeCount()+mList1.nodeCount()+mList2.nodeCount(); }
+
+    /// @brief Return the number of cached nodes at level @a i, where
+    /// 0 corresponds to the lowest level. Levels other than 0, 1 and 2 yield 0.
+    Index64 nodeCount(Index i) const
+    {
+        return i==0 ? mList0.nodeCount() : i==1 ? mList1.nodeCount()
+             : i==2 ? mList2.nodeCount() : 0;
+    }
+
+    template<typename NodeOp>
+    void foreachBottomUp(const NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        mList0.foreach(op, threaded, grainSize);
+        mList1.foreach(op, threaded, grainSize);
+        mList2.foreach(op, threaded, grainSize);
+        op(mRoot); // the root is visited last
+    }
+
+    template<typename NodeOp>
+    void foreachTopDown(const NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        op(mRoot); // the root is visited first
+        mList2.foreach(op, threaded, grainSize);
+        mList1.foreach(op, threaded, grainSize);
+        mList0.foreach(op, threaded, grainSize);
+    }
+
+    template<typename NodeOp>
+    OPENVDB_DEPRECATED void processBottomUp(const NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        this->foreachBottomUp<NodeOp>(op, threaded, grainSize); // deprecated name for foreachBottomUp
+    }
+
+    template<typename NodeOp>
+    OPENVDB_DEPRECATED void processTopDown(const NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        this->foreachTopDown<NodeOp>(op, threaded, grainSize); // deprecated name for foreachTopDown
+    }
+
+    template<typename NodeOp>
+    void reduceBottomUp(NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        mList0.reduce(op, threaded, grainSize);
+        mList1.reduce(op, threaded, grainSize);
+        mList2.reduce(op, threaded, grainSize);
+        op(mRoot); // the root is visited last
+    }
+
+    template<typename NodeOp>
+    void reduceTopDown(NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        op(mRoot); // the root is visited first
+        mList2.reduce(op, threaded, grainSize);
+        mList1.reduce(op, threaded, grainSize);
+        mList0.reduce(op, threaded, grainSize);
+    }
+
+protected:
+    typedef RootNodeType                   NodeT3;
+    typedef typename NodeT3::ChildNodeType NodeT2;//upper level
+    typedef typename NodeT2::ChildNodeType NodeT1;//mid level
+    typedef typename NodeT1::ChildNodeType NodeT0;//lower level
+
+    typedef NodeList<NodeT2>               ListT2;//upper level of internal nodes
+    typedef NodeList<NodeT1>               ListT1;//mid level of internal nodes
+    typedef NodeList<NodeT0>               ListT0;//lower level of internal nodes or leafs
+
+    NodeT3& mRoot; // reference obtained from tree.root() in the constructor
+    ListT2 mList2;
+    ListT1 mList1;
+    ListT0 mList0;
+
+private:
+    NodeManager(const NodeManager&) {} // disallow copy-construction
+}; // NodeManager<3>
+
+
+////////////////////////////////////////////
+
+
+/// Template specialization of the NodeManager that caches four levels of nodes below the root
+template<typename TreeOrLeafManagerT>
+class NodeManager<TreeOrLeafManagerT, 4>
+{
+public:
+    typedef typename TreeOrLeafManagerT::RootNodeType RootNodeType;
+    BOOST_STATIC_ASSERT(RootNodeType::LEVEL > 3);
+    static const Index LEVELS = 4;
+
+    NodeManager(TreeOrLeafManagerT& tree) : mRoot(tree.root())
+    {
+        mRoot.getNodes(mList3); // each list is gathered from the nodes of the list above it
+        for (size_t i=0, n=mList3.nodeCount(); i<n; ++i) mList3(i).getNodes(mList2);
+        for (size_t i=0, n=mList2.nodeCount(); i<n; ++i) mList2(i).getNodes(mList1);
+
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if (TreeOrLeafManagerT::DEPTH == 2 && NodeT0::LEVEL == 0) {
+            tree.getNodes(mList0); // depth-2 manager type and level-0 nodes: ask the manager itself
+        } else {
+            for (size_t i=0, n=mList1.nodeCount(); i<n; ++i) mList1(i).getNodes(mList0);
+        }
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+
+    virtual ~NodeManager() {}
+
+    /// @brief Clear all the cached tree nodes (all four lists, including mList3)
+    void clear() { mList0.clear(); mList1.clear(); mList2.clear(); mList3.clear(); }
+
+    /// @brief Clear and recache all the tree nodes; required after nodes are added or removed.
+    /// @note Always recollects through the root, even where the constructor used tree.getNodes().
+    void rebuild()
+    {
+        this->clear();
+        mRoot.getNodes(mList3);
+        for (size_t i=0, n=mList3.nodeCount(); i<n; ++i) mList3(i).getNodes(mList2);
+        for (size_t i=0, n=mList2.nodeCount(); i<n; ++i) mList2(i).getNodes(mList1);
+        for (size_t i=0, n=mList1.nodeCount(); i<n; ++i) mList1(i).getNodes(mList0);
+    }
+
+    /// @brief Return a reference to the root node.
+    const RootNodeType& root() const { return mRoot; }
+
+    /// @brief Return the total number of cached nodes (excluding the root node)
+    Index64 nodeCount() const
+    {
+        return mList0.nodeCount() + mList1.nodeCount()
+             + mList2.nodeCount() + mList3.nodeCount();
+    }
+
+    /// @brief Return the number of cached nodes at level @a i, where
+    /// 0 corresponds to the lowest level. Levels above 3 yield 0.
+    Index64 nodeCount(Index i) const
+    {
+        return i==0 ? mList0.nodeCount() : i==1 ? mList1.nodeCount() :
+               i==2 ? mList2.nodeCount() : i==3 ? mList3.nodeCount() : 0;
+    }
+
+    template<typename NodeOp>
+    void foreachBottomUp(const NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        mList0.foreach(op, threaded, grainSize);
+        mList1.foreach(op, threaded, grainSize);
+        mList2.foreach(op, threaded, grainSize);
+        mList3.foreach(op, threaded, grainSize);
+        op(mRoot); // the root is visited last
+    }
+
+    template<typename NodeOp>
+    void foreachTopDown(const NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        op(mRoot); // the root is visited first
+        mList3.foreach(op, threaded, grainSize);
+        mList2.foreach(op, threaded, grainSize);
+        mList1.foreach(op, threaded, grainSize);
+        mList0.foreach(op, threaded, grainSize);
+    }
+
+    template<typename NodeOp>
+    OPENVDB_DEPRECATED void processBottomUp(const NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        this->foreachBottomUp<NodeOp>(op, threaded, grainSize); // deprecated name for foreachBottomUp
+    }
+
+    template<typename NodeOp>
+    OPENVDB_DEPRECATED void processTopDown(const NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        this->foreachTopDown<NodeOp>(op, threaded, grainSize); // deprecated name for foreachTopDown
+    }
+
+    template<typename NodeOp>
+    void reduceBottomUp(NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        mList0.reduce(op, threaded, grainSize);
+        mList1.reduce(op, threaded, grainSize);
+        mList2.reduce(op, threaded, grainSize);
+        mList3.reduce(op, threaded, grainSize);
+        op(mRoot); // the root is visited last
+    }
+
+    template<typename NodeOp>
+    void reduceTopDown(NodeOp& op, bool threaded = true, size_t grainSize=1)
+    {
+        op(mRoot); // the root is visited first
+        mList3.reduce(op, threaded, grainSize);
+        mList2.reduce(op, threaded, grainSize);
+        mList1.reduce(op, threaded, grainSize);
+        mList0.reduce(op, threaded, grainSize);
+    }
+
+protected:
+    typedef RootNodeType                   NodeT4;
+    typedef typename NodeT4::ChildNodeType NodeT3;//upper level
+    typedef typename NodeT3::ChildNodeType NodeT2;//upper mid level
+    typedef typename NodeT2::ChildNodeType NodeT1;//lower mid level
+    typedef typename NodeT1::ChildNodeType NodeT0;//lower level
+
+    typedef NodeList<NodeT3>               ListT3;//upper level of internal nodes
+    typedef NodeList<NodeT2>               ListT2;//upper mid level of internal nodes
+    typedef NodeList<NodeT1>               ListT1;//lower mid level of internal nodes
+    typedef NodeList<NodeT0>               ListT0;//lower level of internal nodes or leafs
+
+    NodeT4& mRoot; // reference obtained from tree.root() in the constructor
+    ListT3  mList3;
+    ListT2  mList2;
+    ListT1  mList1;
+    ListT0  mList0;
+
+private:
+    NodeManager(const NodeManager&) {} // disallow copy-construction
+}; // NodeManager<4>
+
+} // namespace tree
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TREE_NODEMANAGER_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tree/NodeUnion.h b/nuparu/include/openvdb_new/tree/NodeUnion.h
new file mode 100644
index 00000000..c2e4097d
--- /dev/null
+++ b/nuparu/include/openvdb_new/tree/NodeUnion.h
@@ -0,0 +1,137 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file NodeUnion.h
+///
+/// @author Peter Cucka
+///
+/// NodeUnion is a templated helper class that controls access to either
+/// the child node pointer or the value for a particular element of a root
+/// or internal node.  For space efficiency, the child pointer and the value
+/// are unioned, since the two are never in use simultaneously.
+/// Template specializations of NodeUnion allow for values of either POD
+/// (int, float, pointer, etc.) or class (std::string, math::Vec, etc.) types.
+/// (The latter cannot be stored directly in a union.)
+
+#ifndef OPENVDB_TREE_NODEUNION_HAS_BEEN_INCLUDED
+#define OPENVDB_TREE_NODEUNION_HAS_BEEN_INCLUDED
+
+#include <boost/type_traits/is_class.hpp>
+#include <openvdb/version.h>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tree {
+
+// Internal implementation of a union of a child node pointer and a value
+template<bool ValueIsClass, class ValueT, class ChildT> class NodeUnionImpl;
+
+
+// Partial specialization for values of non-class types (int, float, pointer, etc.):
+// the value is stored in place and shares its storage with the child pointer.
+template<typename ValueT, typename ChildT>
+class NodeUnionImpl</*ValueIsClass=*/false, ValueT, ChildT>
+{
+public:
+    NodeUnionImpl() { mUnion.child = NULL; } // a new element holds a null child pointer
+
+    // Plain accessors onto the shared storage.
+    ChildT*       getChild() const { return mUnion.child; }
+    void          setChild(ChildT* child) { mUnion.child = child; }
+    const ValueT& getValue() const { return mUnion.value; }
+    ValueT&       getValue() { return mUnion.value; }
+    void          setValue(const ValueT& val) { mUnion.value = val; }
+private:
+    union { ChildT* child; ValueT value; } mUnion;
+};
+
+
+// Partial specialization for values of class types (std::string, math::Vec, etc.).
+// The value is heap-allocated and owned by this object; a child pointer is never deleted here.
+template<typename ValueT, typename ChildT>
+class NodeUnionImpl</*ValueIsClass=*/true, ValueT, ChildT>
+{
+private:
+    union { ChildT* child; ValueT* value; } mUnion;
+    bool mHasChild; // true while mUnion.child is active; must be set before setChild()/setValue()
+
+public:
+    NodeUnionImpl(): mHasChild(true) { setChild(NULL); }
+    NodeUnionImpl(const NodeUnionImpl& other): mHasChild(true) // setters read mHasChild
+    {
+        if (other.mHasChild) setChild(other.getChild()); // the child pointer is shared
+        else setValue(other.getValue()); // the value is copied
+    }
+    NodeUnionImpl& operator=(const NodeUnionImpl& other)
+    {
+        if (other.mHasChild) setChild(other.getChild());
+        else setValue(other.getValue()); // safe for self-assignment, see setValue()
+        return *this;
+    }
+    ~NodeUnionImpl() { setChild(NULL); } // releases the value, if one is held
+
+    ChildT* getChild() const { return mHasChild ? mUnion.child : NULL; }
+    void setChild(ChildT* child)
+    {
+        if (!mHasChild) delete mUnion.value;
+        mUnion.child = child;
+        mHasChild = true;
+    }
+
+    const ValueT& getValue() const { return *mUnion.value; } // only valid while a value is held
+    ValueT& getValue() { return *mUnion.value; } // only valid while a value is held
+    void setValue(const ValueT& val)
+    {
+        /// @todo To minimize storage, intern and reuse common values (e.g., boost::flyweight).
+        ValueT* copy = new ValueT(val); // copy first: val may alias the value being replaced
+        if (!mHasChild) delete mUnion.value;
+        mUnion.value = copy;
+        mHasChild = false;
+    }
+};
+
+
+// Front end that picks the NodeUnionImpl specialization via boost::is_class<ValueT>.
+template<typename ValueT, typename ChildT>
+struct NodeUnion: public NodeUnionImpl<boost::is_class<ValueT>::value, ValueT, ChildT>
+{
+    NodeUnion() {}
+};
+
+} // namespace tree
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TREE_NODEUNION_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tree/RootNode.h b/nuparu/include/openvdb_new/tree/RootNode.h
new file mode 100644
index 00000000..c8f28594
--- /dev/null
+++ b/nuparu/include/openvdb_new/tree/RootNode.h
@@ -0,0 +1,3411 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+///
+/// @file RootNode.h
+///
+/// @brief The root node of an OpenVDB tree
+
+#ifndef OPENVDB_TREE_ROOTNODE_HAS_BEEN_INCLUDED
+#define OPENVDB_TREE_ROOTNODE_HAS_BEEN_INCLUDED
+
+#include <map>
+#include <set>
+#include <sstream>
+#include <deque>
+#include <boost/type_traits/remove_const.hpp>
+#include <boost/type_traits/remove_pointer.hpp>
+#include <boost/type_traits/is_pointer.hpp>
+#include <boost/type_traits/is_const.hpp>
+#include <boost/mpl/contains.hpp>
+#include <boost/mpl/if.hpp>
+#include <boost/mpl/vector.hpp>//for boost::mpl::vector
+#include <boost/mpl/at.hpp>
+#include <boost/mpl/push_back.hpp>
+#include <boost/mpl/size.hpp>
+#include <tbb/parallel_for.h>
+#include <openvdb/Exceptions.h>
+#include <openvdb/Types.h>
+#include <openvdb/io/Compression.h> // for truncateRealToHalf()
+#include <openvdb/math/Math.h> // for isZero(), isExactlyEqual(), etc.
+#include <openvdb/math/BBox.h>
+#include <openvdb/util/NodeMasks.h> // for backward compatibility only (see readTopology())
+#include <openvdb/version.h>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tree {
+
+// Forward declarations
+template<typename HeadType, int HeadLevel> struct NodeChain;
+template<typename, typename> struct SameRootConfig;
+template<typename, typename, bool> struct RootNodeCopyHelper;
+template<typename, typename, typename, bool> struct RootNodeCombineHelper;
+
+
+template<typename ChildType>
+class RootNode
+{
+public:
+    typedef ChildType                         ChildNodeType;
+    typedef typename ChildType::LeafNodeType  LeafNodeType;
+    typedef typename ChildType::ValueType     ValueType;
+    typedef typename ChildType::BuildType     BuildType;
+
+    static const Index LEVEL = 1 + ChildType::LEVEL; // level 0 = leaf
+
+    /// NodeChainType is a list of this tree's node types, from LeafNodeType to RootNode.
+    typedef typename NodeChain<RootNode, LEVEL>::Type NodeChainType;
+    BOOST_STATIC_ASSERT(boost::mpl::size<NodeChainType>::value == LEVEL + 1);
+
+    /// @brief Metafunction mapping a value type to a root node type.
+    /// @details ValueConverter<T>::Type is a RootNode whose child type is
+    /// obtained by applying this node's ChildType::ValueConverter to T, i.e.,
+    /// the same child hierarchy as this node but with value type T.
+    template<typename OtherValueType>
+    struct ValueConverter
+    {
+        typedef RootNode<
+            typename ChildType::template ValueConverter<OtherValueType>::Type> Type;
+    };
+
+    /// @brief Compile-time test of whether another node type is a RootNode
+    /// with a matching child configuration.
+    /// @details SameConfiguration<OtherNodeType>::value forwards to
+    /// SameRootConfig<ChildNodeType, OtherNodeType>::value, which is @c true
+    /// if and only if OtherNodeType is the type of a RootNode whose
+    /// ChildNodeType has the same configuration as this node's ChildNodeType.
+    template<typename OtherNodeType>
+    struct SameConfiguration
+    {
+        static const bool value =
+            SameRootConfig<ChildNodeType, OtherNodeType>::value;
+    };
+
+
+    /// Construct a new tree with a background value of 0.
+    RootNode();
+
+    /// Construct a new tree with the given background value.
+    explicit RootNode(const ValueType& background);
+
+    /// Copy constructor, implemented by delegating to the copy assignment operator.
+    RootNode(const RootNode& other)
+    {
+        this->operator=(other);
+    }
+
+    /// @brief Construct a new tree from a tree that has a different ValueType
+    /// but the same configuration (levels, node dimensions and branching
+    /// factors), reproducing its topology and active states and casting
+    /// its values to this tree's ValueType.
+    /// @details Implemented by delegating to the converting assignment operator.
+    /// @throw TypeError if the other tree's configuration doesn't match this tree's
+    /// or if this tree's ValueType is not constructible from the other tree's ValueType.
+    template<typename OtherChildType>
+    explicit RootNode(const RootNode<OtherChildType>& other)
+    {
+        this->operator=(other);
+    }
+
+    /// @brief Construct a new tree that reproduces the topology and active states of
+    /// another tree (which may have a different ValueType), but not the other tree's values.
+    /// @details All tiles and voxels that are active in the other tree are set to
+    /// @a foreground in the new tree, and all inactive tiles and voxels are set to @a background.
+    /// @param other       the root node of a tree having (possibly) a different ValueType
+    /// @param background  the value to which inactive tiles and voxels are initialized
+    /// @param foreground  the value to which active tiles and voxels are initialized
+    /// @throw TypeError if the other tree's configuration doesn't match this tree's.
+    template<typename OtherChildType>
+    RootNode(const RootNode<OtherChildType>& other,
+        const ValueType& background, const ValueType& foreground, TopologyCopy);
+
+    /// @brief Construct a new tree that reproduces the topology and active states of
+    /// another tree (which may have a different ValueType), but not the other tree's values.
+    /// All tiles and voxels in the new tree are set to @a background regardless of
+    /// their active states in the other tree.
+    /// @param other       the root node of a tree having (possibly) a different ValueType
+    /// @param background  the value to which inactive tiles and voxels are initialized
+    /// @note This copy constructor is generally faster than the one that takes both
+    /// a foreground and a background value.  Its main application is in multithreaded
+    /// operations where the topology of the output tree exactly matches the input tree.
+    /// @throw TypeError if the other tree's configuration doesn't match this tree's.
+    template<typename OtherChildType>
+    RootNode(const RootNode<OtherChildType>& other, const ValueType& background, TopologyCopy);
+
+    /// @brief Copy a root node of the same type as this node.
+    RootNode& operator=(const RootNode& other);
+    /// @brief Copy a root node of the same tree configuration as this node
+    /// but a different ValueType.
+    /// @throw TypeError if the other tree's configuration doesn't match this tree's.
+    /// @note This node's ValueType must be constructible from the other node's ValueType.
+    /// For example, a root node with values of type float can be assigned to a root node
+    /// with values of type Vec3s, because a Vec3s can be constructed from a float.
+    /// But a Vec3s root node cannot be assigned to a float root node.
+    template<typename OtherChildType>
+    RootNode& operator=(const RootNode<OtherChildType>& other);
+
+    /// Destructor: release this node's contents by calling clear().
+    ~RootNode()
+    {
+        this->clear();
+    }
+
+private:
+    /// A constant value together with its active state.
+    struct Tile
+    {
+        ValueType value;
+        bool      active;
+
+        /// Construct an inactive tile whose value is zeroVal<ValueType>().
+        Tile(): value(zeroVal<ValueType>()), active(false) {}
+        /// Construct a tile with value @a v and active state @a b.
+        Tile(const ValueType& v, bool b): value(v), active(b) {}
+    };
+
+    // A table entry: either a child node (child != NULL) or a tile (child == NULL).
+    struct NodeStruct
+    {
+        ChildType* child;
+        Tile       tile;
+
+        /// Construct an entry with no child and a default-constructed tile.
+        NodeStruct(): child(NULL) {}
+        /// Construct an entry that points to the given child.
+        NodeStruct(ChildType& c): child(&c) {}
+        /// Construct an entry that holds a copy of the given tile.
+        NodeStruct(const Tile& t): child(NULL), tile(t) {}
+        ~NodeStruct() {} ///< @note doesn't delete child
+
+        bool isChild() const { return NULL != child; }
+        bool isTile() const { return NULL == child; }
+        bool isTileOff() const { return this->isTile() && !tile.active; }
+        bool isTileOn() const { return this->isTile() && tile.active; }
+
+        /// Delete the current child, if any, and point to @a c instead.
+        void set(ChildType& c)
+        {
+            delete child;
+            child = &c;
+        }
+        /// Delete the current child, if any, and store tile @a t instead.
+        void set(const Tile& t)
+        {
+            delete child;
+            child = NULL;
+            tile = t;
+        }
+        /// @brief Replace the child with tile @a t without deleting the child.
+        /// @return a reference to the former child
+        /// (the child pointer is dereferenced without a null check)
+        ChildType& steal(const Tile& t)
+        {
+            ChildType* const stolen = child;
+            child = NULL;
+            tile = t;
+            return *stolen;
+        }
+    };
+
+    typedef std::map<Coord, NodeStruct>      MapType;
+    typedef typename MapType::iterator       MapIter;
+    typedef typename MapType::const_iterator MapCIter;
+
+    typedef std::set<Coord>                   CoordSet;
+    typedef typename CoordSet::iterator       CoordSetIter;
+    typedef typename CoordSet::const_iterator CoordSetCIter;
+
+    // Static helpers that forward to the NodeStruct stored in the table entry
+    // to which the given map iterator points.
+
+    // Replace the entry with a tile or a child; NodeStruct::set() deletes any existing child.
+    static void             setTile(const MapIter& i, const Tile& t) { i->second.set(t); }
+    static void             setChild(const MapIter& i, ChildType& c) { i->second.set(c); }
+    // Return the entry's tile (no check that the entry actually is a tile).
+    static Tile&            getTile(const MapIter& i) { return i->second.tile; }
+    static const Tile&      getTile(const MapCIter& i) { return i->second.tile; }
+    // Return the entry's child (the child pointer is dereferenced without a null check).
+    static ChildType&       getChild(const MapIter& i) { return *(i->second.child); }
+    static const ChildType& getChild(const MapCIter& i) { return *(i->second.child); }
+    // Replace the entry's child with tile t and return the child without deleting it.
+    // NOTE(review): the MapCIter overload calls the non-const NodeStruct::steal()
+    // through a const iterator; presumably it is never instantiated -- confirm.
+    static ChildType&       stealChild(const MapIter& i, const Tile& t) {return i->second.steal(t);}
+    static const ChildType& stealChild(const MapCIter& i,const Tile& t) {return i->second.steal(t);}
+
+    // Entry classification; see the NodeStruct methods of the same names.
+    static bool isChild(const MapCIter& i)   { return i->second.isChild(); }
+    static bool isChild(const MapIter& i)    { return i->second.isChild(); }
+    static bool isTile(const MapCIter& i)    { return i->second.isTile(); }
+    static bool isTile(const MapIter& i)     { return i->second.isTile(); }
+    static bool isTileOff(const MapCIter& i) { return i->second.isTileOff(); }
+    static bool isTileOff(const MapIter& i)  { return i->second.isTileOff(); }
+    static bool isTileOn(const MapCIter& i)  { return i->second.isTileOn(); }
+    static bool isTileOn(const MapIter& i)   { return i->second.isTileOn(); }
+
+    // Filter predicates used by the iterator classes to decide which table
+    // entries to visit.  Each one provides a static test() for both the
+    // mutable and the const map iterator type.
+
+    /// Accept every table entry.
+    struct NullPred
+    {
+        static bool test(const MapIter&) { return true; }
+        static bool test(const MapCIter&) { return true; }
+    };
+    /// Accept active tiles only.
+    struct ValueOnPred
+    {
+        static bool test(const MapIter& i) { return isTileOn(i); }
+        static bool test(const MapCIter& i) { return isTileOn(i); }
+    };
+    /// Accept inactive tiles only.
+    struct ValueOffPred
+    {
+        static bool test(const MapIter& i) { return isTileOff(i); }
+        static bool test(const MapCIter& i) { return isTileOff(i); }
+    };
+    /// Accept all tiles, regardless of their active state.
+    struct ValueAllPred
+    {
+        static bool test(const MapIter& i) { return isTile(i); }
+        static bool test(const MapCIter& i) { return isTile(i); }
+    };
+    /// Accept entries that hold a child node.
+    struct ChildOnPred
+    {
+        static bool test(const MapIter& i) { return isChild(i); }
+        static bool test(const MapCIter& i) { return isChild(i); }
+    };
+    /// Accept entries that do not hold a child node, i.e., tiles.
+    struct ChildOffPred
+    {
+        static bool test(const MapIter& i) { return isTile(i); }
+        static bool test(const MapCIter& i) { return isTile(i); }
+    };
+
+    /// @brief Base class for the iterators over a root node's table.
+    /// @details Wraps a map iterator together with a pointer to the root node
+    /// that owns the map.  skip() advances past entries that fail
+    /// @c FilterPredT::test().
+    /// @note The template parameters carry a trailing rather than a leading
+    /// underscore, because identifiers that begin with an underscore followed
+    /// by an uppercase letter are reserved for the implementation.
+    template<typename RootNodeT_, typename MapIterT_, typename FilterPredT>
+    class BaseIter
+    {
+    public:
+        typedef RootNodeT_ RootNodeT;
+        typedef MapIterT_ MapIterT; // either MapIter or MapCIter
+
+        /// Two iterators are equal if they have the same parent node and map position.
+        bool operator==(const BaseIter& other) const
+        {
+            return (mParentNode == other.mParentNode) && (mIter == other.mIter);
+        }
+        bool operator!=(const BaseIter& other) const { return !(*this == other); }
+
+        /// Return a pointer to the node over which this iterator iterates (possibly NULL).
+        RootNodeT* getParentNode() const { return mParentNode; }
+        /// Return a reference to the node over which this iterator iterates.
+        /// @throw ValueError if this iterator has no parent node
+        RootNodeT& parent() const
+        {
+            if (!mParentNode) OPENVDB_THROW(ValueError, "iterator references a null parent node");
+            return *mParentNode;
+        }
+
+        /// @brief Return @c true if this iterator is not at the end of the parent's table.
+        /// @note Asserts that the parent node is non-null.
+        bool test() const { assert(mParentNode); return mIter != mParentNode->mTable.end(); }
+        operator bool() const { return this->test(); }
+
+        /// Advance to the next table entry that passes the filter predicate.
+        void increment() { ++mIter; this->skip(); }
+        /// Advance as with increment() and return @c true if the iterator is still valid.
+        bool next() { this->increment(); return this->test(); }
+        /// @brief Advance up to @a n times, stopping early once the end is reached.
+        /// @note The counter has the same unsigned type as @a n, so the
+        /// comparison is well defined over the full range of Index.
+        void increment(Index n) { for (Index i = 0; i < n && this->next(); ++i) {} }
+
+        /// @brief Return this iterator's position as an offset from
+        /// the beginning of the parent node's map.
+        /// @details Return 0 if this iterator has no parent node.
+        Index pos() const
+        {
+            return !mParentNode ? 0U : Index(std::distance(mParentNode->mTable.begin(), mIter));
+        }
+
+        bool isValueOn() const { return RootNodeT::isTileOn(mIter); }
+        bool isValueOff() const { return RootNodeT::isTileOff(mIter); }
+        /// Set the active flag of the current entry's tile (no check that the entry is a tile).
+        void setValueOn(bool on = true) const { mIter->second.tile.active = on; }
+        /// Clear the active flag of the current entry's tile (no check that the entry is a tile).
+        void setValueOff() const { mIter->second.tile.active = false; }
+
+        /// Return the coordinates of the item to which this iterator is pointing.
+        Coord getCoord() const { return mIter->first; }
+        /// Return in @a xyz the coordinates of the item to which this iterator is pointing.
+        void getCoord(Coord& xyz) const { xyz = this->getCoord(); }
+
+    protected:
+        /// Construct an iterator with no parent node.
+        BaseIter(): mParentNode(NULL) {}
+        /// @brief Construct an iterator over @a parent that starts at @a iter.
+        /// @note This constructor does not call skip().
+        BaseIter(RootNodeT& parent, const MapIterT& iter): mParentNode(&parent), mIter(iter) {}
+
+        /// Advance past entries for which the filter predicate returns @c false.
+        void skip() { while (this->test() && !FilterPredT::test(mIter)) ++mIter; }
+
+        RootNodeT* mParentNode;
+        MapIterT mIter;
+    }; // BaseIter
+
+    /// @brief Iterator whose items are child nodes.
+    /// @details On construction the iterator is advanced to the first table
+    /// entry that passes @c FilterPredT.
+    template<typename RootNodeT, typename MapIterT, typename FilterPredT, typename ChildNodeT>
+    class ChildIter: public BaseIter<RootNodeT, MapIterT, FilterPredT>
+    {
+    public:
+        typedef BaseIter<RootNodeT, MapIterT, FilterPredT> BaseT;
+        typedef RootNodeT NodeType;
+        typedef NodeType ValueType;
+        typedef ChildNodeT ChildNodeType;
+        typedef typename boost::remove_const<NodeType>::type NonConstNodeType;
+        typedef typename boost::remove_const<ValueType>::type NonConstValueType;
+        typedef typename boost::remove_const<ChildNodeType>::type NonConstChildNodeType;
+        using BaseT::mIter;
+
+        ChildIter() {}
+        ChildIter(RootNodeT& parent, const MapIterT& iter): BaseT(parent, iter)
+        {
+            BaseT::skip();
+        }
+
+        /// Advance to the next entry that passes the filter.
+        ChildIter& operator++()
+        {
+            BaseT::increment();
+            return *this;
+        }
+
+        /// Return a reference to the child node of the current entry.
+        ChildNodeT& getValue() const { return getChild(mIter); }
+        ChildNodeT& operator*() const { return getChild(mIter); }
+        ChildNodeT* operator->() const { return &(getChild(mIter)); }
+    }; // ChildIter
+
+    /// @brief Iterator whose items are tile values.
+    /// @details On construction the iterator is advanced to the first table
+    /// entry that passes @c FilterPredT.
+    template<typename RootNodeT, typename MapIterT, typename FilterPredT, typename ValueT>
+    class ValueIter: public BaseIter<RootNodeT, MapIterT, FilterPredT>
+    {
+    public:
+        typedef BaseIter<RootNodeT, MapIterT, FilterPredT> BaseT;
+        typedef RootNodeT NodeType;
+        typedef ValueT ValueType;
+        typedef typename boost::remove_const<NodeType>::type NonConstNodeType;
+        typedef typename boost::remove_const<ValueT>::type NonConstValueType;
+        using BaseT::mIter;
+
+        ValueIter() {}
+        ValueIter(RootNodeT& parent, const MapIterT& iter): BaseT(parent, iter)
+        {
+            BaseT::skip();
+        }
+
+        /// Advance to the next entry that passes the filter.
+        ValueIter& operator++()
+        {
+            BaseT::increment();
+            return *this;
+        }
+
+        /// Return a reference to the value of the current entry's tile.
+        ValueT& getValue() const { return getTile(mIter).value; }
+        ValueT& operator*() const { return getTile(mIter).value; }
+        ValueT* operator->() const { return &(getTile(mIter).value); }
+
+        /// Overwrite the value of the current tile (asserts that the entry is a tile).
+        void setValue(const ValueT& v) const
+        {
+            assert(isTile(mIter));
+            getTile(mIter).value = v;
+        }
+
+        /// Apply @a op to the value of the current tile (asserts that the entry is a tile).
+        template<typename ModifyOp>
+        void modifyValue(const ModifyOp& op) const
+        {
+            assert(isTile(mIter));
+            op(getTile(mIter).value);
+        }
+    }; // ValueIter
+
+    /// @brief Iterator over every entry of the root node's table, whether it
+    /// holds a child node or a tile (the filter predicate is NullPred).
+    template<typename RootNodeT, typename MapIterT, typename ChildNodeT, typename ValueT>
+    class DenseIter: public BaseIter<RootNodeT, MapIterT, NullPred>
+    {
+    public:
+        typedef BaseIter<RootNodeT, MapIterT, NullPred> BaseT;
+        typedef RootNodeT NodeType;
+        typedef ValueT ValueType;
+        typedef ChildNodeT ChildNodeType;
+        typedef typename boost::remove_const<NodeType>::type NonConstNodeType;
+        typedef typename boost::remove_const<ValueT>::type NonConstValueType;
+        typedef typename boost::remove_const<ChildNodeT>::type NonConstChildNodeType;
+        using BaseT::mIter;
+
+        DenseIter() {}
+        // No call to skip() here: NullPred accepts every entry.
+        DenseIter(RootNodeT& parent, const MapIterT& iter): BaseT(parent, iter) {}
+
+        DenseIter& operator++() { BaseT::increment(); return *this; }
+
+        /// Return @c true if the current entry holds a child node.
+        bool isChildNode() const { return isChild(mIter); }
+
+        /// @brief If the current entry holds a child node, return a pointer to it
+        /// and leave @a value untouched.  Otherwise copy the tile's value into
+        /// @a value and return NULL.
+        ChildNodeT* probeChild(NonConstValueType& value) const
+        {
+            if (isChild(mIter)) return &getChild(mIter);
+            value = getTile(mIter).value;
+            return NULL;
+        }
+        /// @brief Same as the single-argument probeChild(), but return the child
+        /// pointer (or NULL) in @a child.
+        /// @return @c true if the current entry holds a child node
+        bool probeChild(ChildNodeT*& child, NonConstValueType& value) const
+        {
+            child = this->probeChild(value);
+            return child != NULL;
+        }
+        /// @brief Return @c true and copy the tile's value into @a value if the
+        /// current entry is a tile; otherwise return @c false.
+        bool probeValue(NonConstValueType& value) const { return !this->probeChild(value); }
+
+        /// Replace the current entry with the given child; RootNode::setChild()
+        /// deletes any child the entry already holds.
+        void setChild(ChildNodeT& c) const { RootNodeT::setChild(mIter, c); }
+        /// Pointer overload of setChild(); asserts that @a c is non-null.
+        void setChild(ChildNodeT* c) const { assert(c != NULL); RootNodeT::setChild(mIter, *c); }
+        /// @brief Set the value of the current entry.
+        /// @details If the entry is a tile, only its value changes (its active
+        /// state is kept).  If the entry holds a child, the child is replaced
+        /// with an active tile of value @a v but is not deleted, and the
+        /// reference returned by stealChild() is discarded.
+        /// NOTE(review): presumably the caller retains ownership of the
+        /// replaced child through some other pointer -- confirm.
+        void setValue(const ValueT& v) const
+        {
+            if (isTile(mIter)) getTile(mIter).value = v;
+            /// @internal For consistency with iterators for other node types
+            /// (see, e.g., InternalNode::DenseIter::unsetItem()), we don't call
+            /// setTile() here, because that would also delete the child.
+            else stealChild(mIter, Tile(v, /*active=*/true));
+        }
+    }; // DenseIter
+
+public:
+    typedef ChildIter<RootNode, MapIter, ChildOnPred, ChildType>                  ChildOnIter;
+    typedef ChildIter<const RootNode, MapCIter, ChildOnPred, const ChildType>     ChildOnCIter;
+    typedef ValueIter<RootNode, MapIter, ChildOffPred, const ValueType>           ChildOffIter;
+    typedef ValueIter<const RootNode, MapCIter, ChildOffPred, ValueType>          ChildOffCIter;
+    typedef DenseIter<RootNode, MapIter, ChildType, ValueType>                    ChildAllIter;
+    typedef DenseIter<const RootNode, MapCIter, const ChildType, const ValueType> ChildAllCIter;
+
+    typedef ValueIter<RootNode, MapIter, ValueOnPred, ValueType>                  ValueOnIter;
+    typedef ValueIter<const RootNode, MapCIter, ValueOnPred, const ValueType>     ValueOnCIter;
+    typedef ValueIter<RootNode, MapIter, ValueOffPred, ValueType>                 ValueOffIter;
+    typedef ValueIter<const RootNode, MapCIter, ValueOffPred, const ValueType>    ValueOffCIter;
+    typedef ValueIter<RootNode, MapIter, ValueAllPred, ValueType>                 ValueAllIter;
+    typedef ValueIter<const RootNode, MapCIter, ValueAllPred, const ValueType>    ValueAllCIter;
+
+
+    // Iterator factories.  Each returns an iterator of the named kind that is
+    // constructed from the beginning of this node's table.  The const
+    // beginXxx() overloads forward to the corresponding cbeginXxx().
+
+    // Child iterators: ChildOn visits child nodes, ChildOff visits tiles,
+    // ChildAll visits every table entry.
+    ChildOnCIter  cbeginChildOn()  const { return ChildOnCIter(*this, mTable.begin()); }
+    ChildOffCIter cbeginChildOff() const { return ChildOffCIter(*this, mTable.begin()); }
+    ChildAllCIter cbeginChildAll() const { return ChildAllCIter(*this, mTable.begin()); }
+    ChildOnCIter   beginChildOn()  const { return cbeginChildOn(); }
+    ChildOffCIter  beginChildOff() const { return cbeginChildOff(); }
+    ChildAllCIter  beginChildAll() const { return cbeginChildAll(); }
+    ChildOnIter    beginChildOn()  { return ChildOnIter(*this, mTable.begin()); }
+    ChildOffIter   beginChildOff() { return ChildOffIter(*this, mTable.begin()); }
+    ChildAllIter   beginChildAll() { return ChildAllIter(*this, mTable.begin()); }
+
+    // Value iterators: ValueOn visits active tiles, ValueOff visits inactive
+    // tiles, ValueAll visits all tiles.
+    ValueOnCIter  cbeginValueOn()  const { return ValueOnCIter(*this, mTable.begin()); }
+    ValueOffCIter cbeginValueOff() const { return ValueOffCIter(*this, mTable.begin()); }
+    ValueAllCIter cbeginValueAll() const { return ValueAllCIter(*this, mTable.begin()); }
+    ValueOnCIter   beginValueOn()  const { return cbeginValueOn(); }
+    ValueOffCIter  beginValueOff() const { return cbeginValueOff(); }
+    ValueAllCIter  beginValueAll() const { return cbeginValueAll(); }
+    ValueOnIter    beginValueOn()  { return ValueOnIter(*this, mTable.begin()); }
+    ValueOffIter   beginValueOff() { return ValueOffIter(*this, mTable.begin()); }
+    ValueAllIter   beginValueAll() { return ValueAllIter(*this, mTable.begin()); }
+
+    /// Return the total amount of memory in bytes occupied by this node and its children.
+    Index64 memUsage() const;
+
+    /// @brief Expand the specified bbox so it includes the active tiles of
+    /// this root node as well as all the active values in its child
+    /// nodes. If visitVoxels is false LeafNodes will be approximated
+    /// as dense, i.e. with all voxels active. Else the individual
+    /// active voxels are visited to produce a tight bbox.
+    void evalActiveBoundingBox(CoordBBox& bbox, bool visitVoxels = true) const;
+
+    /// @brief Return the bounding box of a RootNode.
+    /// @details A root node is unbounded, so this is always CoordBBox::inf().
+    static CoordBBox getNodeBoundingBox()
+    {
+        return CoordBBox::inf();
+    }
+
+    /// @brief Change inactive tiles or voxels with a value equal to +/- the
+    /// old background to the specified value (with the same sign). Active values
+    /// are unchanged.
+    ///
+    /// @param value The new background value
+    /// @param updateChildNodes If true, the background values of the
+    /// child nodes are also updated. Otherwise, only the background value
+    /// stored in the RootNode itself is changed.
+    ///
+    /// @note Instead of setting @a updateChildNodes to true, consider
+    /// using tools::changeBackground or
+    /// tools::changeLevelSetBackground which are multi-threaded!
+    void setBackground(const ValueType& value, bool updateChildNodes);
+
+    /// Return a const reference to the background value stored in this node.
+    const ValueType& background() const
+    {
+        return mBackground;
+    }
+
+    /// Return @c true if the given tile is inactive and has the background value.
+    bool isBackgroundTile(const Tile&) const;
+    //@{
+    /// Return @c true if the given iterator points to an inactive tile with the background value.
+    bool isBackgroundTile(const MapIter&) const;
+    bool isBackgroundTile(const MapCIter&) const;
+    //@}
+
+    /// Return the number of background tiles.
+    size_t numBackgroundTiles() const;
+    /// @brief Remove all background tiles.
+    /// @return the number of tiles removed.
+    size_t eraseBackgroundTiles();
+    inline void clear();
+
+    /// @brief Return @c true if every entry of this node's table is a
+    /// background tile, which includes the case of an empty table.
+    bool empty() const
+    {
+        return this->numBackgroundTiles() == mTable.size();
+    }
+
+    /// @brief Expand this node's table so that (x, y, z) is included in the index range.
+    /// @return @c true if an expansion was performed (i.e., if (x, y, z) was not already
+    /// included in the index range).
+    bool expand(const Coord& xyz);
+
+    /// Return the level of this node in the tree, i.e., LEVEL (level 0 = leaf).
+    static Index getLevel() { return LEVEL; }
+    static void getNodeLog2Dims(std::vector<Index>& dims);
+    /// Return ChildType::DIM.
+    static Index getChildDim() { return ChildType::DIM; }
+
+    /// Return the number of entries in this node's table.
+    Index getTableSize() const { return static_cast<Index>(mTable.size()); }
+
+    // Extent of the current index range along x, y and z, respectively, computed
+    // as the difference between getMaxIndex() and getMinIndex() along that axis.
+    Index getWidth() const { return this->getMaxIndex()[0] - this->getMinIndex()[0]; }
+    Index getHeight() const { return this->getMaxIndex()[1] - this->getMinIndex()[1]; }
+    Index getDepth() const { return this->getMaxIndex()[2] - this->getMinIndex()[2]; }
+
+    /// Return the smallest index of the current tree.
+    Coord getMinIndex() const;
+    /// Return the largest index of the current tree.
+    Coord getMaxIndex() const;
+    /// Return the current index range.  Both min and max are inclusive.
+    void getIndexRange(CoordBBox& bbox) const;
+
+    /// @brief Return @c true if the given tree has the same node and active value
+    /// topology as this tree (but possibly a different @c ValueType).
+    template<typename OtherChildType>
+    bool hasSameTopology(const RootNode<OtherChildType>& other) const;
+
+    /// Return @c false if the other node's dimensions don't match this node's.
+    template<typename OtherChildType>
+    static bool hasSameConfiguration(const RootNode<OtherChildType>& other);
+
+    /// Return @c true if values of the other node's ValueType can be converted
+    /// to values of this node's ValueType.
+    template<typename OtherChildType>
+    static bool hasCompatibleValueType(const RootNode<OtherChildType>& other);
+
+    Index32 leafCount() const;
+    Index32 nonLeafCount() const;
+    Index64 onVoxelCount() const;
+    Index64 offVoxelCount() const;
+    Index64 onLeafVoxelCount() const;
+    Index64 offLeafVoxelCount() const;
+    Index64 onTileCount() const;
+
+    bool isValueOn(const Coord& xyz) const;
+
+    bool hasActiveTiles() const;
+
+    const ValueType& getValue(const Coord& xyz) const;
+    bool probeValue(const Coord& xyz, ValueType& value) const;
+
+    /// @brief Return the tree depth (0 = root) at which the value of voxel (x, y, z) resides.
+    /// @details If (x, y, z) isn't explicitly represented in the tree (i.e.,
+    /// it is implicitly a background voxel), return -1.
+    int getValueDepth(const Coord& xyz) const;
+
+    /// Set the active state of the voxel at the given coordinates but don't change its value.
+    void setActiveState(const Coord& xyz, bool on);
+    /// Set the value of the voxel at the given coordinates but don't change its active state.
+    void setValueOnly(const Coord& xyz, const ValueType& value);
+    /// Set the value of the voxel at the given coordinates and mark the voxel as active.
+    void setValueOn(const Coord& xyz, const ValueType& value);
+    /// Mark the voxel at the given coordinates as inactive but don't change its value.
+    void setValueOff(const Coord& xyz);
+    /// Set the value of the voxel at the given coordinates and mark the voxel as inactive.
+    void setValueOff(const Coord& xyz, const ValueType& value);
+
+    /// @brief Apply a functor to the value of the voxel at the given coordinates
+    /// and mark the voxel as active.
+    template<typename ModifyOp>
+    void modifyValue(const Coord& xyz, const ModifyOp& op);
+    /// Apply a functor to the voxel at the given coordinates.
+    template<typename ModifyOp>
+    void modifyValueAndActiveState(const Coord& xyz, const ModifyOp& op);
+
+    /// @brief Set all voxels within a given box to a constant value, if necessary
+    /// subdividing tiles that intersect the box.
+    /// @param bbox    inclusive coordinates of opposite corners of an axis-aligned box
+    /// @param value   the value to which to set voxels within the box
+    /// @param active  if true, mark voxels within the box as active,
+    ///                otherwise mark them as inactive. Defaults to true.
+    /// @param sparse  if false, active tiles are voxelized, i.e. only active voxels
+    ///                are generated from the fill operation. Defaults to true.  
+    void fill(const CoordBBox& bbox, const ValueType& value, bool active = true, bool sparse = true);
+
+    /// @brief Copy into a dense grid the values of all voxels, both active and inactive,
+    /// that intersect a given bounding box.
+    /// @param bbox   inclusive bounding box of the voxels to be copied into the dense grid
+    /// @param dense  dense grid with a stride in @e z of one (see tools::Dense
+    ///               in tools/Dense.h for the required API)
+    template<typename DenseT>
+    void copyToDense(const CoordBBox& bbox, DenseT& dense) const;
+
+
+    //
+    // I/O
+    //
+    bool writeTopology(std::ostream&, bool toHalf = false) const;
+    bool readTopology(std::istream&, bool fromHalf = false);
+
+    void writeBuffers(std::ostream&, bool toHalf = false) const;
+    void readBuffers(std::istream&, bool fromHalf = false);
+    void readBuffers(std::istream&, const CoordBBox&, bool fromHalf = false);
+
+
+    //
+    // Voxel access
+    //
+    /// Return the value of the voxel at the given coordinates and, if necessary, update
+    /// the accessor with pointers to the nodes along the path from the root node to
+    /// the node containing the voxel.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    const ValueType& getValueAndCache(const Coord& xyz, AccessorT&) const;
+    /// Return @c true if the voxel at the given coordinates is active and, if necessary,
+    /// update the accessor with pointers to the nodes along the path from the root node
+    /// to the node containing the voxel.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    bool isValueOnAndCache(const Coord& xyz, AccessorT&) const;
+
+    /// Change the value of the voxel at the given coordinates and mark it as active.
+    /// If necessary, update the accessor with pointers to the nodes along the path
+    /// from the root node to the node containing the voxel.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    void setValueAndCache(const Coord& xyz, const ValueType& value, AccessorT&);
+
+    /// Set the value of the voxel at the given coordinates without changing its active state.
+    /// If necessary, update the accessor with pointers to the nodes along the path
+    /// from the root node to the node containing the voxel.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    void setValueOnlyAndCache(const Coord& xyz, const ValueType& value, AccessorT&);
+
+    /// Apply a functor to the value of the voxel at the given coordinates
+    /// and mark the voxel as active.
+    /// If necessary, update the accessor with pointers to the nodes along the path
+    /// from the root node to the node containing the voxel.
+    /// @note Used internally by ValueAccessor.
+    template<typename ModifyOp, typename AccessorT>
+    void modifyValueAndCache(const Coord& xyz, const ModifyOp& op, AccessorT&);
+
+    /// Apply a functor to the voxel at the given coordinates.
+    /// If necessary, update the accessor with pointers to the nodes along the path
+    /// from the root node to the node containing the voxel.
+    /// @note Used internally by ValueAccessor.
+    template<typename ModifyOp, typename AccessorT>
+    void modifyValueAndActiveStateAndCache(const Coord& xyz, const ModifyOp& op, AccessorT&);
+
+    /// Change the value of the voxel at the given coordinates and mark it as inactive.
+    /// If necessary, update the accessor with pointers to the nodes along the path
+    /// from the root node to the node containing the voxel.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    void setValueOffAndCache(const Coord& xyz, const ValueType& value, AccessorT&);
+
+    /// Set the active state of the voxel at the given coordinates without changing its value.
+    /// If necessary, update the accessor with pointers to the nodes along the path
+    /// from the root node to the node containing the voxel.
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    void setActiveStateAndCache(const Coord& xyz, bool on, AccessorT&);
+
+    /// @brief Return, in @a value, the value of the voxel at the given coordinates and,
+    /// if necessary, update the accessor with pointers to the nodes along
+    /// the path from the root node to the node containing the voxel.
+    /// @param xyz         the coordinates of the voxel to be queried
+    /// @param[out] value  receives the value of the voxel
+    /// @return @c true if the voxel at the given coordinates is active
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    bool probeValueAndCache(const Coord& xyz, ValueType& value, AccessorT&) const;
+
+    /// @brief Return the tree depth (0 = root) at which the value of voxel (x, y, z) resides.
+    /// @details If (x, y, z) isn't explicitly represented in the tree (i.e., it is implicitly
+    /// a background voxel), return -1. If necessary, update the accessor with pointers
+    /// to the nodes along the path from the root node to the node containing the voxel.
+    /// @param xyz  the coordinates of the voxel to be queried
+    /// @return the depth of the voxel's value, or -1 for an implicit background voxel
+    /// @note Used internally by ValueAccessor.
+    template<typename AccessorT>
+    int getValueDepthAndCache(const Coord& xyz, AccessorT&) const;
+
+    /// @brief Set all voxels that lie outside the given axis-aligned box to the background.
+    /// @note Voxels inside the box are not mentioned by this contract; see the
+    /// definition for how tiles that straddle the box boundary are handled.
+    void clip(const CoordBBox&);
+
+    /// @brief Reduce the memory footprint of this tree by replacing with tiles
+    /// any nodes whose values are all the same (optionally to within a tolerance)
+    /// and have the same active state.
+    ///
+    /// @param tolerance  the tolerance within which values are considered to be the same
+    /// (defaults to zeroVal<ValueType>())
+    ///
+    /// @note Consider instead using tools::prune, which is multi-threaded!
+    void prune(const ValueType& tolerance = zeroVal<ValueType>());
+
+    /// @brief Add the given leaf node to this tree, creating a new branch if necessary.
+    /// If a leaf node with the same origin already exists, replace it.
+    /// @param leaf  the leaf node to be added
+    /// @note NOTE(review): presumably the tree takes ownership of @a leaf -- confirm
+    /// against the definition.
+    void addLeaf(LeafNodeType* leaf);
+
+    /// @brief Same as addLeaf() but, if necessary, update the given accessor with pointers
+    /// to the nodes along the path from the root node to the node containing the coordinate.
+    /// @param leaf  the leaf node to be added
+    template<typename AccessorT>
+    void addLeafAndCache(LeafNodeType* leaf, AccessorT&);
+
+    /// @brief Return a pointer to the node of type @c NodeT that contains voxel (x, y, z)
+    /// and replace it with a tile of the specified value and state.
+    /// If no such node exists, leave the tree unchanged and return @c NULL.
+    ///
+    /// @param xyz    the coordinates of a voxel contained in the node to be stolen
+    /// @param value  the value of the tile that replaces the node
+    /// @param state  the active state of the tile that replaces the node
+    ///
+    /// @note The caller takes ownership of the node and is responsible for deleting it.
+    ///
+    /// @warning Since this method potentially removes nodes and branches of the tree,
+    /// it is important to clear the caches of all ValueAccessors associated with this tree.
+    template<typename NodeT>
+    NodeT* stealNode(const Coord& xyz, const ValueType& value, bool state);
+
+    /// @brief Add a tile containing voxel (x, y, z) at the root level,
+    /// deleting the existing branch if necessary.
+    /// @param xyz    the coordinates of a voxel contained in the tile
+    /// @param value  the value of the tile
+    /// @param state  the active state of the tile
+    void addTile(const Coord& xyz, const ValueType& value, bool state);
+
+    /// @brief Add a tile containing voxel (x, y, z) at the specified tree level,
+    /// creating a new branch if necessary.  Delete any existing lower-level nodes
+    /// that contain (x, y, z).
+    /// @param level  the tree level at which to add the tile
+    /// @param xyz    the coordinates of a voxel contained in the tile
+    /// @param value  the value of the tile
+    /// @param state  the active state of the tile
+    void addTile(Index level, const Coord& xyz, const ValueType& value, bool state);
+
+    /// @brief Same as addTile() (the overload that takes a level) but, if necessary,
+    /// update the given accessor with pointers to the nodes along the path
+    /// from the root node to the node containing the coordinate.
+    template<typename AccessorT>
+    void addTileAndCache(Index level, const Coord& xyz, const ValueType&, bool state, AccessorT&);
+
+    /// @brief Return a pointer to the leaf node that contains voxel (x, y, z).
+    /// If no such node exists, create one that preserves the values and
+    /// active states of all voxels.
+    /// @details Use this method to preallocate a static tree topology
+    /// over which to safely perform multithreaded processing.
+    /// @param xyz  the coordinates of a voxel contained in the leaf node
+    LeafNodeType* touchLeaf(const Coord& xyz);
+
+    /// @brief Same as touchLeaf() but, if necessary, update the given accessor with pointers
+    /// to the nodes along the path from the root node to the node containing the coordinate.
+    /// @param xyz  the coordinates of a voxel contained in the leaf node
+    /// @param acc  the accessor to be updated
+    template<typename AccessorT>
+    LeafNodeType* touchLeafAndCache(const Coord& xyz, AccessorT& acc);
+
+    //@{
+    /// @brief Return a pointer to the node of type @c NodeT that contains voxel (x, y, z).
+    /// If no such node exists, return NULL.
+    /// @param xyz  the coordinates of a voxel contained in the node
+    template <typename NodeT>
+    NodeT* probeNode(const Coord& xyz);
+    template <typename NodeT>
+    const NodeT* probeConstNode(const Coord& xyz) const;
+    //@}
+
+    //@{
+    /// @brief Same as probeNode() but, if necessary, update the given accessor with pointers
+    /// to the nodes along the path from the root node to the node containing the coordinate.
+    /// @param xyz  the coordinates of a voxel contained in the node
+    /// @param acc  the accessor to be updated
+    template<typename NodeT, typename AccessorT>
+    NodeT* probeNodeAndCache(const Coord& xyz, AccessorT& acc);
+    template<typename NodeT, typename AccessorT>
+    const NodeT* probeConstNodeAndCache(const Coord& xyz, AccessorT& acc) const;
+    //@}
+
+    //@{
+    /// @brief Return a pointer to the leaf node that contains voxel (x, y, z).
+    /// If no such node exists, return NULL.
+    /// @param xyz  the coordinates of a voxel contained in the leaf node
+    LeafNodeType* probeLeaf(const Coord& xyz);
+    const LeafNodeType* probeConstLeaf(const Coord& xyz) const;
+    const LeafNodeType* probeLeaf(const Coord& xyz) const;
+    //@}
+
+    //@{
+    /// @brief Same as probeLeaf() but, if necessary, update the given accessor with pointers
+    /// to the nodes along the path from the root node to the node containing the coordinate.
+    /// @param xyz  the coordinates of a voxel contained in the leaf node
+    /// @param acc  the accessor to be updated
+    template<typename AccessorT>
+    LeafNodeType* probeLeafAndCache(const Coord& xyz, AccessorT& acc);
+    template<typename AccessorT>
+    const LeafNodeType* probeConstLeafAndCache(const Coord& xyz, AccessorT& acc) const;
+    template<typename AccessorT>
+    const LeafNodeType* probeLeafAndCache(const Coord& xyz, AccessorT& acc) const;
+    //@}
+
+
+    //
+    // Aux methods
+    //
+
+    //@{
+    /// @brief Add all nodes of a certain type to a container with the following API:
+    /// @code
+    /// struct ArrayT {
+    ///    typedef value_type;// defines the type of nodes to be added to the array
+    ///    void push_back(value_type nodePtr);// method that adds nodes to the array
+    /// };
+    /// @endcode
+    /// @details An example of a wrapper around a C-style array is:
+    /// @code
+    /// struct MyArray {
+    ///    typedef LeafType* value_type;
+    ///    value_type* ptr;
+    ///    MyArray(value_type* array) : ptr(array) {}
+    ///    void push_back(value_type leaf) { *ptr++ = leaf; }
+    /// };
+    /// @endcode
+    /// @details An example that constructs a list of pointers to all leaf nodes is:
+    /// @code
+    /// std::vector<const LeafNodeType*> array;//most std containers have the required API
+    /// array.reserve(tree.leafCount());//this is a fast preallocation.
+    /// tree.getNodes(array);
+    /// @endcode
+    template<typename ArrayT> void getNodes(ArrayT& array);
+    template<typename ArrayT> void getNodes(ArrayT& array) const;
+    //@}
+
+    //@{
+    /// @brief Steal all nodes of a certain type from the tree and
+    /// add them to a container with the following API:
+    /// @code
+    /// struct ArrayT {
+    ///    typedef value_type;// defines the type of nodes to be added to the array
+    ///    void push_back(value_type nodePtr);// method that adds nodes to the array
+    /// };
+    /// @endcode
+    /// @details An example of a wrapper around a C-style array is:
+    /// @code
+    /// struct MyArray {
+    ///    typedef LeafType* value_type;
+    ///    value_type* ptr;
+    ///    MyArray(value_type* array) : ptr(array) {}
+    ///    void push_back(value_type leaf) { *ptr++ = leaf; }
+    /// };
+    /// @endcode
+    /// @details An example that constructs a list of pointers to all leaf nodes is:
+    /// @code
+    /// std::vector<const LeafNodeType*> array;//most std containers have the required API
+    /// array.reserve(tree.leafCount());//this is a fast preallocation.
+    /// tree.stealNodes(array);
+    /// @endcode
+    /// @note The single-argument overload forwards to the three-argument overload,
+    /// passing this node's background value as @a value and @c false as @a state.
+    template<typename ArrayT>
+    void stealNodes(ArrayT& array, const ValueType& value, bool state);
+    template<typename ArrayT>
+    void stealNodes(ArrayT& array) { this->stealNodes(array, mBackground, false); }
+    //@}
+    
+    /// @brief Densify active tiles, i.e., replace them with leaf-level active voxels.
+    ///
+    /// @param threaded if true, this operation is multi-threaded (over the internal nodes).
+    ///
+    /// @warning This method can explode the tree's memory footprint, especially if it
+    /// contains active tiles at the upper levels, e.g., at the root level!
+    void voxelizeActiveTiles(bool threaded = true);
+
+    /// @brief Efficiently merge another tree into this tree using one of several schemes.
+    /// @details This operation is primarily intended to combine trees that are mostly
+    /// non-overlapping (for example, intermediate trees from computations that are
+    /// parallelized across disjoint regions of space).
+    /// @tparam Policy  the merge scheme to use (a @c MergePolicy value)
+    /// @param other    the tree to be merged into this tree
+    /// @note This operation is not guaranteed to produce an optimally sparse tree.
+    /// Follow merge() with prune() for optimal sparseness.
+    /// @warning This operation always empties the other tree.
+    template<MergePolicy Policy> void merge(RootNode& other);
+
+    /// @brief Union this tree's set of active values with the active values
+    /// of the other tree, whose @c ValueType may be different.
+    /// @details The resulting state of a value is active if the corresponding value
+    /// was already active OR if it is active in the other tree.  Also, a resulting
+    /// value maps to a voxel if the corresponding value already mapped to a voxel
+    /// OR if it is a voxel in the other tree.  Thus, a resulting value can only
+    /// map to a tile if the corresponding value already mapped to a tile
+    /// AND if it is a tile value in the other tree.
+    ///
+    /// @note This operation modifies only active states, not values.
+    /// Specifically, active tiles and voxels in this tree are not changed, and
+    /// tiles or voxels that were inactive in this tree but active in the other tree
+    /// are marked as active in this tree but left with their original values.
+    template<typename OtherChildType>
+    void topologyUnion(const RootNode<OtherChildType>& other);
+
+    /// @brief Intersect this tree's set of active values with the active values
+    /// of the other tree, whose @c ValueType may be different.
+    /// @details The resulting state of a value is active only if the corresponding
+    /// value was already active AND if it is active in the other tree. Also, a
+    /// resulting value maps to a voxel if the corresponding value
+    /// already mapped to an active voxel in either of the two grids
+    /// and it maps to an active tile or voxel in the other grid.
+    ///
+    /// @note This operation can delete branches in this grid if they
+    /// overlap with inactive tiles in the other grid. Likewise, active
+    /// voxels can be turned into inactive voxels, resulting in leaf
+    /// nodes with no active values. Thus, it is recommended to
+    /// subsequently call prune().
+    template<typename OtherChildType>
+    void topologyIntersection(const RootNode<OtherChildType>& other);
+
+    /// @brief Difference this tree's set of active values with the active values
+    /// of the other tree, whose @c ValueType may be different.
+    /// @details A resulting voxel will be active only if the original voxel is
+    /// active in this tree and inactive in the other tree.
+    ///
+    /// @note This operation can delete branches in this grid if they
+    /// overlap with active tiles in the other grid. Likewise, active
+    /// voxels can be turned into inactive voxels, resulting in leaf
+    /// nodes with no active values. Thus, it is recommended to
+    /// subsequently call prune().
+    template<typename OtherChildType>
+    void topologyDifference(const RootNode<OtherChildType>& other);
+
+    /// @brief Combine this node with @a other using the given functor.
+    /// @param other  the node to combine with this one (taken by non-const reference,
+    ///               so it may be modified -- TODO confirm against the definition)
+    /// @param prune  NOTE(review): presumably requests pruning after the combination
+    ///               (off by default) -- confirm
+    /// @todo Document the interface required of @c CombineOp.
+    template<typename CombineOp>
+    void combine(RootNode& other, CombineOp&, bool prune = false);
+
+    /// @brief Combine the two given nodes, which are not modified (both are taken
+    /// by const reference), using the given functor.
+    /// @param other0  the first input node, of the same type as this node
+    /// @param other1  the second input node, whose type may differ from this node's
+    /// @param op      the functor used to combine values
+    /// @param prune   NOTE(review): presumably requests pruning after the combination
+    ///                (off by default) -- confirm
+    /// @note NOTE(review): presumably the result is stored in this node -- confirm
+    /// against the definition.
+    template<typename CombineOp, typename OtherRootNode /*= RootNode*/>
+    void combine2(const RootNode& other0, const OtherRootNode& other1,
+                  CombineOp& op, bool prune = false);
+
+    /// @brief Call the templated functor BBoxOp with bounding box
+    /// information for all active tiles and leaf nodes in the tree.
+    /// An additional level argument is provided for each callback.
+    ///
+    /// @note The bounding boxes are guaranteed to be non-overlapping.
+    /// @todo Document the exact callback signature required of @c BBoxOp.
+    template<typename BBoxOp> void visitActiveBBox(BBoxOp&) const;
+
+    //@{
+    /// @brief Apply the given visitor functor to this node.
+    /// @note NOTE(review): presumably the visit also descends into child nodes --
+    /// confirm against doVisit().
+    template<typename VisitorOp> void visit(VisitorOp&);
+    template<typename VisitorOp> void visit(VisitorOp&) const;
+    //@}
+
+    //@{
+    /// @brief Apply the given visitor functor to this node together with @a other.
+    /// @note NOTE(review): presumably the two nodes are traversed in tandem --
+    /// confirm against doVisit2().
+    template<typename OtherRootNodeType, typename VisitorOp>
+    void visit2(OtherRootNodeType& other, VisitorOp&);
+    template<typename OtherRootNodeType, typename VisitorOp>
+    void visit2(OtherRootNodeType& other, VisitorOp&) const;
+    //@}
+
+private:
+    /// During topology-only construction, access is needed
+    /// to protected/private members of other template instances.
+    template<typename> friend class RootNode;
+
+    /// RootNodeCopyHelper reads and writes private members of this class
+    /// (mTable, mBackground and the enforce*() checks) when assigning with
+    /// value conversion.  RootNodeCombineHelper is presumably granted access
+    /// for the analogous reason when combining nodes -- confirm.
+    template<typename, typename, bool> friend struct RootNodeCopyHelper;
+    template<typename, typename, typename, bool> friend struct RootNodeCombineHelper;
+
+    /// @brief Hook for one-time initialization of mTable.
+    /// @details Currently a no-op, but can be used to define empty and delete keys for mTable.
+    /// The constructors and assignment operators call this before populating the table.
+    void initTable() {}
+    //@{
+    /// @internal Used by doVisit2().
+    /// @details The non-const overload swaps the contents of mTable with those of
+    /// @a table and then clears @a table, so that this node ends up with the
+    /// entries that were passed in.  The const overload does nothing.
+    void resetTable(MapType& table) { mTable.swap(table); table.clear(); }
+    void resetTable(const MapType&) const {}
+    //@}
+
+    /// Return the number of entries in mTable that are child nodes.
+    Index getChildCount() const;
+    /// Return the number of entries in mTable that are tiles (active or inactive).
+    Index getTileCount() const;
+    /// Return the number of entries in mTable that are active tiles.
+    Index getActiveTileCount() const;
+    /// Return the number of entries in mTable that are inactive tiles
+    /// (going by the name; see the definition to confirm).
+    Index getInactiveTileCount() const;
+
+    /// @brief Return a MapType key for the given coordinates.
+    /// @details The key is @a xyz with the bits of <tt>ChildType::DIM - 1</tt> masked off.
+    /// NOTE(review): assuming DIM is a power of two and Coord's operator& acts on each
+    /// component, this is the origin of the child-sized block that contains @a xyz -- confirm.
+    static Coord coordToKey(const Coord& xyz) { return xyz & ~(ChildType::DIM - 1); }
+
+    /// @brief Insert this node's mTable keys into the given set.
+    /// @param[in,out] keys  the set to which the keys are added; existing elements are kept
+    void insertKeys(CoordSet&) const;
+
+    /// @brief Test whether the given key is present in this node's mTable.
+    /// @param key  a table key (see coordToKey())
+    /// @return @c true if an entry with that key exists
+    bool hasKey(const Coord& key) const
+    {
+        return this->findKey(key) != mTable.end();
+    }
+    //@{
+    /// @brief Look up the given key in this node's mTable.
+    /// @param key  a table key; it is used as is (no coordToKey() conversion is applied)
+    /// @return an iterator pointing to the matching mTable entry or to mTable.end().
+    MapIter findKey(const Coord& key) { return mTable.find(key); }
+    MapCIter findKey(const Coord& key) const { return mTable.find(key); }
+    //@}
+    //@{
+    /// @brief Look up the mTable entry, if any, whose key corresponds to the given coordinates.
+    /// @details The coordinates are first converted to a key with coordToKey().
+    /// @return an iterator pointing to the matching mTable entry or to mTable.end().
+    MapIter findCoord(const Coord& xyz)
+    {
+        return this->findKey(coordToKey(xyz));
+    }
+    MapCIter findCoord(const Coord& xyz) const
+    {
+        return this->findKey(coordToKey(xyz));
+    }
+    //@}
+    /// @brief Convert the given coordinates to a key and look the key up in this node's mTable.
+    /// @details If the key is not found, insert a background tile (an inactive tile
+    /// with the background value) with that key.
+    /// @param xyz  coordinates that are converted to a key with coordToKey()
+    /// @return an iterator pointing to the matching (possibly newly inserted) mTable entry.
+    MapIter findOrAddCoord(const Coord& xyz);
+
+    /// @brief Verify that the tree rooted at @a other has the same configuration
+    /// (levels, branching factors and node dimensions) as this tree, but allow
+    /// their ValueTypes to differ.
+    /// @details The check compares the Log2Dim lists reported by getNodeLog2Dims()
+    /// for the two root node types; the contents of @a other are not inspected.
+    /// @throw TypeError if the other tree's configuration doesn't match this tree's.
+    template<typename OtherChildType>
+    static void enforceSameConfiguration(const RootNode<OtherChildType>& other);
+
+    /// @brief Verify that @a other has values of a type that can be converted
+    /// to this node's ValueType.
+    /// @details For example, values of type float are compatible with values of type Vec3s,
+    /// because a Vec3s can be constructed from a float.  But the reverse is not true.
+    /// Convertibility is decided by the CanConvertType trait; the contents of
+    /// @a other are not inspected.
+    /// @throw TypeError if the other node's ValueType is not convertible into this node's.
+    template<typename OtherChildType>
+    static void enforceCompatibleValueTypes(const RootNode<OtherChildType>& other);
+
+    /// @internal Implementation helper with the same parameter list as combine2()
+    /// (NOTE(review): presumably called by combine2() -- confirm).
+    template<typename CombineOp, typename OtherRootNode /*= RootNode*/>
+    void doCombine2(const RootNode&, const OtherRootNode&, CombineOp&, bool prune);
+
+    /// @internal Static implementation helper, templated on the root node type
+    /// and on the iterator type (NOTE(review): presumably so that the const and
+    /// non-const visit() overloads can share one implementation -- confirm).
+    template<typename RootNodeT, typename VisitorOp, typename ChildAllIterT>
+    static inline void doVisit(RootNodeT&, VisitorOp&);
+
+    /// @internal Static implementation helper, templated on both root node types
+    /// and both iterator types (NOTE(review): presumably so that the const and
+    /// non-const visit2() overloads can share one implementation -- confirm).
+    template<typename RootNodeT, typename OtherRootNodeT, typename VisitorOp,
+        typename ChildAllIterT, typename OtherChildAllIterT>
+    static inline void doVisit2(RootNodeT&, OtherRootNodeT&, VisitorOp&);
+
+
+    /// Table of this node's entries, keyed by coordinates (see coordToKey()).
+    /// Each entry holds either a pointer to a child node or a tile.
+    MapType mTable;
+    /// Background value; used, e.g., for the inactive tiles that
+    /// findOrAddCoord() and expand() insert.
+    ValueType mBackground;
+}; // end of RootNode class
+
+
+////////////////////////////////////////
+
+
+/// @brief NodeChain<RootNodeType, RootNodeType::LEVEL>::Type is a boost::mpl::vector
+/// that lists the types of the nodes of the tree rooted at RootNodeType in reverse order,
+/// from LeafNode to RootNode.
+/// @details For example, if RootNodeType is
+/// @code
+/// RootNode<InternalNode<InternalNode<LeafNode> > >
+/// @endcode
+/// then NodeChain::Type is
+/// @code
+/// boost::mpl::vector<
+///     LeafNode,
+///     InternalNode<LeafNode>,
+///     InternalNode<InternalNode<LeafNode> >,
+///     RootNode<InternalNode<InternalNode<LeafNode> > > >
+/// @endcode
+///
+/// @note Use the following to get the Nth node type, where N=0 is the LeafNodeType:
+/// @code
+/// boost::mpl::at<NodeChainType, boost::mpl::int_<N> >::type
+/// @endcode
+// Primary template: recursively build the chain for the subtree rooted at
+// HeadT's child node type (one level down), then append HeadT itself.
+template<typename HeadT, int HeadLevel>
+struct NodeChain {
+    // Node types of the subtree below HeadT, in leaf-to-top order
+    typedef typename NodeChain<typename HeadT::ChildNodeType, HeadLevel-1>::Type SubtreeT;
+    // SubtreeT with HeadT appended at the back
+    typedef typename boost::mpl::push_back<SubtreeT, HeadT>::type Type;
+};
+
+/// @brief Specialization to terminate NodeChain
+/// @details At level 1 the chain consists of just HeadT's child node type
+/// followed by HeadT itself, so no further recursion is needed.
+template<typename HeadT>
+struct NodeChain<HeadT, /*HeadLevel=*/1> {
+    typedef typename boost::mpl::vector<typename HeadT::ChildNodeType, HeadT>::type Type;
+};
+
+
+////////////////////////////////////////
+
+
+//@{
+/// Helper metafunction used to implement RootNode::SameConfiguration
+/// (which, as an inner class, can't be independently specialized)
+///
+/// The primary template handles the case in which the second argument
+/// is not a RootNode at all: the configurations cannot match.
+template<typename ChildT1, typename NodeT2>
+struct SameRootConfig {
+    static const bool value = false;
+};
+
+/// When the second argument is a RootNode, defer to the child node type's
+/// own SameConfiguration metafunction to compare the two child types.
+template<typename ChildT1, typename ChildT2>
+struct SameRootConfig<ChildT1, RootNode<ChildT2> > {
+    static const bool value = ChildT1::template SameConfiguration<ChildT2>::value;
+};
+//@}
+
+
+////////////////////////////////////////
+
+
+/// Default constructor: the background value is zeroVal<ValueType>().
+template<typename ChildT>
+inline
+RootNode<ChildT>::RootNode()
+    : mBackground(zeroVal<ValueType>())
+{
+    initTable();
+}
+
+
+/// Construct a root node with the given background value.
+template<typename ChildT>
+inline
+RootNode<ChildT>::RootNode(const ValueType& background)
+    : mBackground(background)
+{
+    initTable();
+}
+
+
+/// Topology copy constructor: replicate the table layout of @a other.
+/// Active tiles of @a other become active tiles with value @a foregd,
+/// inactive tiles become inactive tiles with value @a backgd, and each child
+/// of @a other is topology-copied with the same two values.
+/// Throws TypeError (via enforceSameConfiguration()) on a configuration mismatch.
+template<typename ChildT>
+template<typename OtherChildType>
+inline
+RootNode<ChildT>::RootNode(const RootNode<OtherChildType>& other,
+    const ValueType& backgd, const ValueType& foregd, TopologyCopy)
+    : mBackground(backgd)
+{
+    typedef RootNode<OtherChildType>    SrcRootT;
+    typedef typename SrcRootT::MapCIter SrcIterT;
+
+    enforceSameConfiguration(other);
+
+    const Tile inactiveTile(backgd, /*active=*/false);
+    const Tile activeTile(foregd, /*active=*/true);
+    this->initTable();
+
+    for (SrcIterT src = other.mTable.begin(), last = other.mTable.end(); src != last; ++src) {
+        if (SrcRootT::isTile(src)) {
+            // Tiles keep their active state but take on the new values.
+            mTable[src->first] = NodeStruct(SrcRootT::isTileOn(src) ? activeTile : inactiveTile);
+        } else {
+            // Children are deep-copied, topology only.
+            ChildT* child = new ChildT(SrcRootT::getChild(src), backgd, foregd, TopologyCopy());
+            mTable[src->first] = NodeStruct(*child);
+        }
+    }
+}
+
+
+/// Topology copy constructor: replicate the table layout of @a other.
+/// Every tile, active or inactive, gets the value @a backgd and keeps its
+/// active state; each child of @a other is topology-copied with @a backgd.
+/// Throws TypeError (via enforceSameConfiguration()) on a configuration mismatch.
+template<typename ChildT>
+template<typename OtherChildType>
+inline
+RootNode<ChildT>::RootNode(const RootNode<OtherChildType>& other,
+    const ValueType& backgd, TopologyCopy)
+    : mBackground(backgd)
+{
+    typedef RootNode<OtherChildType>    SrcRootT;
+    typedef typename SrcRootT::MapCIter SrcIterT;
+
+    enforceSameConfiguration(other);
+
+    const Tile inactiveTile(backgd, /*active=*/false);
+    const Tile activeTile(backgd, /*active=*/true);
+    this->initTable();
+
+    for (SrcIterT src = other.mTable.begin(), last = other.mTable.end(); src != last; ++src) {
+        if (SrcRootT::isTile(src)) {
+            // Tiles keep their active state but take on the background value.
+            mTable[src->first] = NodeStruct(SrcRootT::isTileOn(src) ? activeTile : inactiveTile);
+        } else {
+            // Children are deep-copied, topology only.
+            ChildT* child = new ChildT(SrcRootT::getChild(src), backgd, TopologyCopy());
+            mTable[src->first] = NodeStruct(*child);
+        }
+    }
+}
+
+
+////////////////////////////////////////
+
+
+// RootNodeCopyHelper is a friend of RootNode.  It exists so that assignment
+// with value conversion can be specialized separately for compatible and
+// incompatible pairs of RootNode types.  This primary template covers the
+// incompatible case: it always throws a TypeError.
+template<typename RootT, typename OtherRootT, bool Compatible = false>
+struct RootNodeCopyHelper
+{
+    static inline void copyWithValueConversion(RootT& self, const OtherRootT& other)
+    {
+        // Each check throws if it fails: the first for differing configurations,
+        // the second for incompatible ValueTypes.
+        self.enforceSameConfiguration(other);
+        self.enforceCompatibleValueTypes(other);
+
+        // Fallback in case neither check threw (not expected to happen).
+        std::ostringstream msg;
+        msg << "cannot convert a " << typeid(OtherRootT).name();
+        msg << " to a " << typeid(RootT).name();
+        OPENVDB_THROW(TypeError, msg.str());
+    }
+};
+
+// Specialization for root nodes of compatible types: replace the contents
+// of self with a value-converted copy of the contents of other.
+template<typename RootT, typename OtherRootT>
+struct RootNodeCopyHelper<RootT, OtherRootT, /*Compatible=*/true>
+{
+    static inline void copyWithValueConversion(RootT& self, const OtherRootT& other)
+    {
+        typedef typename RootT::ValueType       DstValueT;
+        typedef typename RootT::ChildNodeType   DstChildT;
+        typedef typename RootT::NodeStruct      DstNodeStruct;
+        typedef typename RootT::Tile            DstTile;
+        typedef typename OtherRootT::MapCIter   SrcIterT;
+        typedef typename OtherRootT::Tile       SrcTile;
+
+        /// @todo Consider using a value conversion functor passed as an argument instead.
+        // Values are converted by explicit construction of the destination type.
+        self.mBackground = DstValueT(other.mBackground);
+
+        // Discard the current contents (this deletes the existing children).
+        self.clear();
+        self.initTable();
+
+        const SrcIterT last = other.mTable.end();
+        for (SrcIterT src = other.mTable.begin(); src != last; ++src) {
+            if (!other.isTile(src)) {
+                // Copy the other node's child, converting its values
+                // to this node's ValueType.
+                DstChildT* child = new DstChildT(other.getChild(src));
+                self.mTable[src->first] = DstNodeStruct(*child);
+                continue;
+            }
+            // Copy the other node's tile, converting its value
+            // to this node's ValueType and keeping its active state.
+            const SrcTile& srcTile = other.getTile(src);
+            self.mTable[src->first] =
+                DstNodeStruct(DstTile(DstValueT(srcTile.value), srcTile.active));
+        }
+    }
+};
+
+
+// Assignment from a root node of the same type: deep-copy the other node's
+// background, tiles and children.  Self-assignment is a no-op.
+template<typename ChildT>
+inline RootNode<ChildT>&
+RootNode<ChildT>::operator=(const RootNode& other)
+{
+    if (this == &other) return *this;
+
+    mBackground = other.mBackground;
+
+    // Discard the current contents (this deletes the existing children).
+    this->clear();
+    this->initTable();
+
+    const MapCIter last = other.mTable.end();
+    for (MapCIter src = other.mTable.begin(); src != last; ++src) {
+        if (isTile(src)) {
+            mTable[src->first] = NodeStruct(getTile(src));
+        } else {
+            // Children are owned by the table, so each one is cloned.
+            mTable[src->first] = NodeStruct(*(new ChildT(getChild(src))));
+        }
+    }
+    return *this;
+}
+
+// Assignment from a root node of a different type.  The work is delegated to
+// RootNodeCopyHelper, which is selected at compile time: the pair of types is
+// compatible when the configurations match and the other node's ValueType can
+// be converted to this node's.  For incompatible pairs the helper throws.
+template<typename ChildT>
+template<typename OtherChildType>
+inline RootNode<ChildT>&
+RootNode<ChildT>::operator=(const RootNode<OtherChildType>& other)
+{
+    typedef RootNode<OtherChildType>     SrcRootT;
+    typedef typename SrcRootT::ValueType SrcValueT;
+
+    static const bool sameConfig = SameConfiguration<SrcRootT>::value;
+    static const bool convertible = CanConvertType</*from=*/SrcValueT, /*to=*/ValueType>::value;
+
+    typedef RootNodeCopyHelper<RootNode, SrcRootT, (sameConfig && convertible)> CopierT;
+    CopierT::copyWithValueConversion(*this, other);
+    return *this;
+}
+
+
+////////////////////////////////////////
+
+/// Change this node's background value.  Nothing happens if the new value is
+/// exactly equal to the current one.  If @a updateChildNodes is true, the
+/// children are asked to reset their background and inactive tiles holding
+/// (approximately) the old background, or its negative, are updated to the
+/// new background, or its negative, respectively.
+template<typename ChildT>
+inline void
+RootNode<ChildT>::setBackground(const ValueType& background, bool updateChildNodes)
+{
+    if (math::isExactlyEqual(background, mBackground)) return;
+
+    if (updateChildNodes) {
+        for (MapIter entry = mTable.begin(); entry != mTable.end(); ++entry) {
+            if (ChildT* child = entry->second.child) {
+                // Let the subtree replace the old background with the new one.
+                child->resetBackground(/*old=*/mBackground, /*new=*/background);
+                continue;
+            }
+
+            Tile& tile = getTile(entry);
+            if (tile.active) continue; // active tiles are left untouched
+
+            if (math::isApproxEqual(tile.value, mBackground)) {
+                tile.value = background;
+            } else if (math::isApproxEqual(tile.value, math::negative(mBackground))) {
+                tile.value = math::negative(background);
+            }
+        }
+    }
+
+    mBackground = background;
+}
+
+/// Return true if the given tile is inactive and its value is
+/// approximately equal to the background value.
+template<typename ChildT>
+inline bool
+RootNode<ChildT>::isBackgroundTile(const Tile& tile) const
+{
+    if (tile.active) return false;
+    return math::isApproxEqual(tile.value, mBackground);
+}
+
+/// Return true if the given iterator points to an inactive tile whose value
+/// is approximately equal to the background value.
+template<typename ChildT>
+inline bool
+RootNode<ChildT>::isBackgroundTile(const MapIter& iter) const
+{
+    if (!isTileOff(iter)) return false;
+    return math::isApproxEqual(getTile(iter).value, mBackground);
+}
+
+/// Return true if the given const iterator points to an inactive tile whose
+/// value is approximately equal to the background value.
+template<typename ChildT>
+inline bool
+RootNode<ChildT>::isBackgroundTile(const MapCIter& iter) const
+{
+    if (!isTileOff(iter)) return false;
+    return math::isApproxEqual(getTile(iter).value, mBackground);
+}
+
+
+/// Return the number of entries in the table that are background tiles.
+template<typename ChildT>
+inline size_t
+RootNode<ChildT>::numBackgroundTiles() const
+{
+    size_t total = 0;
+    MapCIter entry = mTable.begin();
+    const MapCIter last = mTable.end();
+    for (; entry != last; ++entry) {
+        if (!this->isBackgroundTile(entry)) continue;
+        ++total;
+    }
+    return total;
+}
+
+
+/// Remove all background tiles from the table and return
+/// the number of tiles that were removed.
+template<typename ChildT>
+inline size_t
+RootNode<ChildT>::eraseBackgroundTiles()
+{
+    // First pass: collect the keys, so that the table is not modified
+    // while it is being traversed.
+    std::set<Coord> doomedKeys;
+    const MapCIter last = mTable.end();
+    for (MapCIter entry = mTable.begin(); entry != last; ++entry) {
+        if (!this->isBackgroundTile(entry)) continue;
+        doomedKeys.insert(entry->first);
+    }
+
+    // Second pass: erase the collected keys.
+    std::set<Coord>::iterator key = doomedKeys.begin();
+    while (key != doomedKeys.end()) {
+        mTable.erase(*key);
+        ++key;
+    }
+    return doomedKeys.size();
+}
+
+
+////////////////////////////////////////
+
+
+/// Add the key of every entry in this node's table to the given set.
+template<typename ChildT>
+inline void
+RootNode<ChildT>::insertKeys(CoordSet& keys) const
+{
+    MapCIter entry = mTable.begin();
+    const MapCIter last = mTable.end();
+    while (entry != last) {
+        keys.insert(entry->first);
+        ++entry;
+    }
+}
+
+
+/// Return an iterator to the table entry for the given coordinates.
+/// If there is no such entry yet, an inactive tile with the background
+/// value is inserted first; an existing entry is left unchanged.
+template<typename ChildT>
+inline typename RootNode<ChildT>::MapIter
+RootNode<ChildT>::findOrAddCoord(const Coord& xyz)
+{
+    typedef typename MapType::value_type EntryT;
+
+    const Tile backgroundTile(mBackground, /*active=*/false);
+    // insert() does not overwrite an entry whose key already exists.
+    return mTable.insert(EntryT(coordToKey(xyz), NodeStruct(backgroundTile))).first;
+}
+
+
+/// Make sure that the table has an entry for the given coordinates, inserting
+/// an inactive tile with the background value if there is none.
+/// Return true if a new entry was inserted, false if one already existed.
+template<typename ChildT>
+inline bool
+RootNode<ChildT>::expand(const Coord& xyz)
+{
+    typedef typename MapType::value_type EntryT;
+
+    const Tile backgroundTile(mBackground, /*active=*/false);
+    // insert() reports in .second whether the key was newly added.
+    return mTable.insert(EntryT(coordToKey(xyz), NodeStruct(backgroundTile))).second;
+}
+
+
+////////////////////////////////////////
+
+
+/// Append to @a dims a placeholder entry for the root node,
+/// followed by the entries contributed by the child node type.
+template<typename ChildT>
+inline void
+RootNode<ChildT>::getNodeLog2Dims(std::vector<Index>& dims)
+{
+    // RootNode has no Log2Dim of its own; 0 serves as a placeholder.
+    const Index rootPlaceholder = 0;
+    dims.push_back(rootPlaceholder);
+
+    ChildT::getNodeLog2Dims(dims);
+}
+
+
+/// Return the key of the first table entry, or Coord(0) if the table is empty.
+template<typename ChildT>
+inline Coord
+RootNode<ChildT>::getMinIndex() const
+{
+    if (mTable.empty()) return Coord(0);
+    return mTable.begin()->first;
+}
+
+/// Return the key of the last table entry offset by ChildT::DIM - 1
+/// along each axis, or Coord(0) if the table is empty.
+template<typename ChildT>
+inline Coord
+RootNode<ChildT>::getMaxIndex() const
+{
+    if (mTable.empty()) return Coord(0);
+    return mTable.rbegin()->first + Coord(ChildT::DIM - 1);
+}
+
+
+/// Set the corners of @a bbox to getMinIndex() and getMaxIndex().
+template<typename ChildT>
+inline void
+RootNode<ChildT>::getIndexRange(CoordBBox& bbox) const
+{
+    const Coord lower = this->getMinIndex();
+    const Coord upper = this->getMaxIndex();
+    bbox.min() = lower;
+    bbox.max() = upper;
+}
+
+
+////////////////////////////////////////
+
+
+/// Return true if this node and @a other have the same configuration and the
+/// same topology: matching keys must hold entries of the same kind (child or
+/// tile), children must have the same topology and tiles the same active
+/// state.  Background tiles on either side are ignored.
+template<typename ChildT>
+template<typename OtherChildType>
+inline bool
+RootNode<ChildT>::hasSameTopology(const RootNode<OtherChildType>& other) const
+{
+    typedef RootNode<OtherChildType> OtherRootT;
+    typedef typename OtherRootT::MapType OtherMapT;
+    typedef typename OtherRootT::MapIter OtherIterT;
+    typedef typename OtherRootT::MapCIter OtherCIterT;
+
+    if (!hasSameConfiguration(other)) return false;
+
+    // Work on a copy of the other table.  Entries are crossed off as they are
+    // matched; this is needed because either table may hold an arbitrary
+    // number of background tiles without that affecting the topology.
+    OtherMapT unmatched = other.mTable;
+
+    for (MapCIter mine = mTable.begin(); mine != mTable.end(); ++mine) {
+        // Background tiles of this node do not need a counterpart.
+        if (this->isBackgroundTile(mine)) continue;
+
+        // Every other entry must have a counterpart with the same key.
+        OtherCIterT theirs = other.findKey(mine->first);
+        if (theirs == other.mTable.end()) return false;
+
+        if (isChild(mine)) {
+            // A child must be matched by a child with the same topology.
+            if (OtherRootT::isTile(theirs)
+                || !getChild(mine).hasSameTopology(&OtherRootT::getChild(theirs)))
+            {
+                return false;
+            }
+        } else {
+            // A tile must be matched by a tile with the same active state.
+            if (OtherRootT::isChild(theirs)
+                || getTile(mine).active != OtherRootT::getTile(theirs).active)
+            {
+                return false;
+            }
+        }
+
+        // Matched: cross the counterpart off.
+        unmatched.erase(theirs->first);
+    }
+
+    // Whatever is left in the copy must consist of background tiles only.
+    OtherIterT rest = unmatched.begin();
+    const OtherIterT restEnd = unmatched.end();
+    while (rest != restEnd) {
+        if (!other.isBackgroundTile(rest)) return false;
+        ++rest;
+    }
+    return true;
+}
+
+
+/// Return true if this root node type and the other root node type report
+/// identical Log2Dim lists.  The argument is used only for type deduction.
+template<typename ChildT>
+template<typename OtherChildType>
+inline bool
+RootNode<ChildT>::hasSameConfiguration(const RootNode<OtherChildType>&)
+{
+    typedef RootNode<OtherChildType> OtherRootT;
+
+    std::vector<Index> myDims;
+    RootNode::getNodeLog2Dims(myDims);
+
+    std::vector<Index> theirDims;
+    OtherRootT::getNodeLog2Dims(theirDims);
+
+    return myDims == theirDims;
+}
+
+
+/// Throw a TypeError that lists both sets of dimensions if this root node
+/// type and the other root node type do not report identical Log2Dim lists.
+/// The argument is used only for type deduction.
+template<typename ChildT>
+template<typename OtherChildType>
+inline void
+RootNode<ChildT>::enforceSameConfiguration(const RootNode<OtherChildType>&)
+{
+    typedef RootNode<OtherChildType> OtherRootT;
+
+    std::vector<Index> myDims;
+    RootNode::getNodeLog2Dims(myDims);
+
+    std::vector<Index> theirDims;
+    OtherRootT::getNodeLog2Dims(theirDims);
+
+    if (myDims == theirDims) return;
+
+    // Each list starts with the root node's entry, so element 0 always exists.
+    std::ostringstream msg;
+    msg << "grids have incompatible configurations (" << myDims[0];
+    for (size_t n = 1; n < myDims.size(); ++n) {
+        msg << " x " << myDims[n];
+    }
+    msg << " vs. " << theirDims[0];
+    for (size_t n = 1; n < theirDims.size(); ++n) {
+        msg << " x " << theirDims[n];
+    }
+    msg << ")";
+    OPENVDB_THROW(TypeError, msg.str());
+}
+
+
+/// Return true if values of the other node's ValueType can be converted to
+/// this node's ValueType, as reported by the CanConvertType trait.
+/// The argument is used only for type deduction.
+template<typename ChildT>
+template<typename OtherChildType>
+inline bool
+RootNode<ChildT>::hasCompatibleValueType(const RootNode<OtherChildType>&)
+{
+    typedef typename OtherChildType::ValueType SrcValueT;
+    typedef CanConvertType</*from=*/SrcValueT, /*to=*/ValueType> ConversionT;
+    return ConversionT::value;
+}
+
+
+/// Throw a TypeError that names both types if values of the other node's
+/// ValueType cannot be converted to this node's ValueType, as reported by
+/// the CanConvertType trait.  The argument is used only for type deduction.
+template<typename ChildT>
+template<typename OtherChildType>
+inline void
+RootNode<ChildT>::enforceCompatibleValueTypes(const RootNode<OtherChildType>&)
+{
+    typedef typename OtherChildType::ValueType SrcValueT;
+    typedef CanConvertType</*from=*/SrcValueT, /*to=*/ValueType> ConversionT;
+
+    if (ConversionT::value) return;
+
+    std::ostringstream msg;
+    msg << "values of type " << typeNameAsString<SrcValueT>();
+    msg << " cannot be converted to type " << typeNameAsString<ValueType>();
+    OPENVDB_THROW(TypeError, msg.str());
+}
+
+
+////////////////////////////////////////
+
+
+// Return sizeof(*this) plus the memUsage() of every child node referenced
+// by the table.  Table entries without a child contribute nothing extra.
+template<typename ChildT>
+inline Index64
+RootNode<ChildT>::memUsage() const
+{
+    Index64 bytes = sizeof(*this);
+    for (MapCIter it = mTable.begin(), last = mTable.end(); it != last; ++it) {
+        const ChildT* node = it->second.child;
+        if (node == NULL) continue; // entry holds no child node
+        bytes += node->memUsage();
+    }
+    return bytes;
+}
+
+
+// Delete the child pointer stored in every table entry, then empty the table.
+template<typename ChildT>
+inline void
+RootNode<ChildT>::clear()
+{
+    MapIter it = mTable.begin();
+    MapIter last = mTable.end();
+    while (it != last) {
+        delete it->second.child;
+        ++it;
+    }
+    mTable.clear();
+}
+
+
+// Grow bbox to cover this node's active content: child nodes are asked to
+// update bbox themselves (visitVoxels is forwarded), and each active tile
+// expands bbox by a cube of size ChildT::DIM at the tile's key.
+template<typename ChildT>
+inline void
+RootNode<ChildT>::evalActiveBoundingBox(CoordBBox& bbox, bool visitVoxels) const
+{
+    for (MapCIter it = mTable.begin(), last = mTable.end(); it != last; ++it) {
+        const ChildT* node = it->second.child;
+        if (node != NULL) {
+            node->evalActiveBoundingBox(bbox, visitVoxels);
+            continue;
+        }
+        if (isTileOn(it)) {
+            bbox.expand(it->first, ChildT::DIM);
+        }
+    }
+}
+
+
+// Return the number of table entries for which isChild() is true.
+template<typename ChildT>
+inline Index
+RootNode<ChildT>::getChildCount() const
+{
+    Index count = 0;
+    MapCIter it = mTable.begin();
+    for (MapCIter last = mTable.end(); it != last; ++it) {
+        if (!isChild(it)) continue;
+        ++count;
+    }
+    return count;
+}
+
+
+// Return the number of table entries for which isTile() is true.
+template<typename ChildT>
+inline Index
+RootNode<ChildT>::getTileCount() const
+{
+    Index count = 0;
+    MapCIter it = mTable.begin();
+    for (MapCIter last = mTable.end(); it != last; ++it) {
+        if (!isTile(it)) continue;
+        ++count;
+    }
+    return count;
+}
+
+
+// Return the number of table entries for which isTileOn() is true.
+template<typename ChildT>
+inline Index
+RootNode<ChildT>::getActiveTileCount() const
+{
+    Index count = 0;
+    MapCIter it = mTable.begin();
+    for (MapCIter last = mTable.end(); it != last; ++it) {
+        if (!isTileOn(it)) continue;
+        ++count;
+    }
+    return count;
+}
+
+
+// Return the number of table entries for which isTileOff() is true.
+template<typename ChildT>
+inline Index
+RootNode<ChildT>::getInactiveTileCount() const
+{
+    Index count = 0;
+    MapCIter it = mTable.begin();
+    for (MapCIter last = mTable.end(); it != last; ++it) {
+        if (!isTileOff(it)) continue;
+        ++count;
+    }
+    return count;
+}
+
+
+// Return the sum of leafCount() over all child nodes in the table.
+template<typename ChildT>
+inline Index32
+RootNode<ChildT>::leafCount() const
+{
+    Index32 total = 0;
+    for (MapCIter it = mTable.begin(), last = mTable.end(); it != last; ++it) {
+        if (!isChild(it)) continue; // tiles contribute nothing
+        total += getChild(it).leafCount();
+    }
+    return total;
+}
+
+
+// Return 1 plus, when ChildT::LEVEL is nonzero, the sum of nonLeafCount()
+// over all child nodes.  (The initial 1 presumably accounts for this root
+// node itself.)
+template<typename ChildT>
+inline Index32
+RootNode<ChildT>::nonLeafCount() const
+{
+    Index32 total = 1;
+    if (ChildT::LEVEL != 0) {
+        MapCIter it = mTable.begin();
+        for (MapCIter last = mTable.end(); it != last; ++it) {
+            if (!isChild(it)) continue;
+            total += getChild(it).nonLeafCount();
+        }
+    }
+    return total;
+}
+
+
+// Return the sum of onVoxelCount() over all child nodes, plus
+// ChildT::NUM_VOXELS for every active root-level tile.
+template<typename ChildT>
+inline Index64
+RootNode<ChildT>::onVoxelCount() const
+{
+    Index64 total = 0;
+    for (MapCIter it = mTable.begin(), last = mTable.end(); it != last; ++it) {
+        if (isChild(it)) {
+            total += getChild(it).onVoxelCount();
+            continue;
+        }
+        if (isTileOn(it)) total += ChildT::NUM_VOXELS;
+    }
+    return total;
+}
+
+
+// Return the sum of offVoxelCount() over all child nodes, plus
+// ChildT::NUM_VOXELS for every inactive root-level tile that is not
+// a background tile.
+template<typename ChildT>
+inline Index64
+RootNode<ChildT>::offVoxelCount() const
+{
+    Index64 total = 0;
+    for (MapCIter it = mTable.begin(), last = mTable.end(); it != last; ++it) {
+        if (isChild(it)) {
+            total += getChild(it).offVoxelCount();
+            continue;
+        }
+        if (isTileOff(it) && !this->isBackgroundTile(it)) total += ChildT::NUM_VOXELS;
+    }
+    return total;
+}
+
+
+// Return the sum of onLeafVoxelCount() over all child nodes in the table.
+template<typename ChildT>
+inline Index64
+RootNode<ChildT>::onLeafVoxelCount() const
+{
+    Index64 total = 0;
+    for (MapCIter it = mTable.begin(), last = mTable.end(); it != last; ++it) {
+        if (!isChild(it)) continue; // tiles are not counted here
+        total += getChild(it).onLeafVoxelCount();
+    }
+    return total;
+}
+
+
+// Return the sum of offLeafVoxelCount() over all child nodes in the table.
+template<typename ChildT>
+inline Index64
+RootNode<ChildT>::offLeafVoxelCount() const
+{
+    Index64 total = 0;
+    for (MapCIter it = mTable.begin(), last = mTable.end(); it != last; ++it) {
+        if (!isChild(it)) continue; // tiles are not counted here
+        total += getChild(it).offLeafVoxelCount();
+    }
+    return total;
+}
+
+// Return the sum of onTileCount() over all child nodes, plus one for
+// every active root-level tile.
+template<typename ChildT>
+inline Index64
+RootNode<ChildT>::onTileCount() const
+{
+    Index64 total = 0;
+    for (MapCIter it = mTable.begin(), last = mTable.end(); it != last; ++it) {
+        if (isChild(it)) {
+            total += getChild(it).onTileCount();
+            continue;
+        }
+        if (isTileOn(it)) total += 1;
+    }
+    return total;
+}
+
+////////////////////////////////////////
+
+
+// Return false if no table entry is found for xyz or the entry is an
+// inactive tile, true if it is an active tile, and otherwise the result
+// of the child node's isValueOn(xyz).
+template<typename ChildT>
+inline bool
+RootNode<ChildT>::isValueOn(const Coord& xyz) const
+{
+    MapCIter it = this->findCoord(xyz);
+    if (it == mTable.end()) return false;
+    if (isTileOff(it)) return false;
+    if (isTileOn(it)) return true;
+    return getChild(it).isValueOn(xyz);
+}
+
+// Return true as soon as one table entry is either a child whose
+// hasActiveTiles() is true or a tile whose active flag is set.
+template<typename ChildT>
+inline bool
+RootNode<ChildT>::hasActiveTiles() const
+{
+    for (MapCIter it = mTable.begin(), last = mTable.end(); it != last; ++it) {
+        if (isChild(it)) {
+            if (getChild(it).hasActiveTiles()) return true;
+        } else {
+            if (getTile(it).active) return true;
+        }
+    }
+    return false;
+}
+
+// Same lookup as isValueOn(), except that when xyz resolves to a child node
+// the child is first registered with the accessor via acc.insert() and the
+// query is forwarded to the child's isValueOnAndCache().
+template<typename ChildT>
+template<typename AccessorT>
+inline bool
+RootNode<ChildT>::isValueOnAndCache(const Coord& xyz, AccessorT& acc) const
+{
+    MapCIter it = this->findCoord(xyz);
+    if (it == mTable.end()) return false;
+    if (isTileOff(it)) return false;
+    if (!isTileOn(it)) {
+        acc.insert(xyz, &getChild(it));
+        return getChild(it).isValueOnAndCache(xyz, acc);
+    }
+    return true;
+}
+
+
+// Return a reference to the background value if no table entry is found
+// for xyz, to the tile's value if the entry is a tile, and otherwise to
+// whatever the child node's getValue(xyz) returns.
+template<typename ChildT>
+inline const typename ChildT::ValueType&
+RootNode<ChildT>::getValue(const Coord& xyz) const
+{
+    MapCIter it = this->findCoord(xyz);
+    if (it == mTable.end()) return mBackground;
+    if (isTile(it)) return getTile(it).value;
+    return getChild(it).getValue(xyz);
+}
+
+// Same lookup as getValue(), except that when xyz resolves to a child node
+// the child is first registered with the accessor via acc.insert() and the
+// query is forwarded to the child's getValueAndCache().
+template<typename ChildT>
+template<typename AccessorT>
+inline const typename ChildT::ValueType&
+RootNode<ChildT>::getValueAndCache(const Coord& xyz, AccessorT& acc) const
+{
+    MapCIter it = this->findCoord(xyz);
+    if (it == mTable.end()) return mBackground;
+    if (!isChild(it)) return getTile(it).value;
+
+    acc.insert(xyz, &getChild(it));
+    return getChild(it).getValueAndCache(xyz, acc);
+}
+
+
+// Return -1 if no table entry is found for xyz, 0 if the entry is a
+// root-level tile, and otherwise LEVEL minus the level reported by the
+// child node's getValueLevel(xyz).
+template<typename ChildT>
+inline int
+RootNode<ChildT>::getValueDepth(const Coord& xyz) const
+{
+    MapCIter it = this->findCoord(xyz);
+    if (it == mTable.end()) return -1;
+    if (isTile(it)) return 0;
+    return int(LEVEL) - int(getChild(it).getValueLevel(xyz));
+}
+
+// Same result as getValueDepth(), except that when xyz resolves to a child
+// node the child is first registered with the accessor via acc.insert() and
+// the level is obtained from the child's getValueLevelAndCache().
+template<typename ChildT>
+template<typename AccessorT>
+inline int
+RootNode<ChildT>::getValueDepthAndCache(const Coord& xyz, AccessorT& acc) const
+{
+    MapCIter it = this->findCoord(xyz);
+    if (it == mTable.end()) {
+        return -1;
+    }
+    if (isTile(it)) {
+        return 0;
+    }
+    acc.insert(xyz, &getChild(it));
+    const int childLevel = int(getChild(it).getValueLevelAndCache(xyz, acc));
+    return int(LEVEL) - childLevel;
+}
+
+
+// Forward setValueOff(xyz) to the child node containing xyz.  Nothing is done
+// when no table entry exists for xyz or the entry is an inactive tile.  An
+// active tile is first replaced by a new child constructed from the tile's
+// value with active=true.
+template<typename ChildT>
+inline void
+RootNode<ChildT>::setValueOff(const Coord& xyz)
+{
+    MapIter it = this->findCoord(xyz);
+    if (it == mTable.end() || isTileOff(it)) return;
+
+    if (isTileOn(it)) {
+        ChildT* node = new ChildT(xyz, getTile(it).value, /*active=*/true);
+        setChild(it, *node);
+    }
+    getChild(it).setValueOff(xyz);
+}
+
+
+// Forward setActiveState(xyz, on) to the child node containing xyz, creating
+// that child when required:
+// - no table entry and on == true: a child filled with the background value
+//   is inserted (with on == false there is nothing to do);
+// - tile whose active flag differs from on: the tile is replaced by a child
+//   constructed from the tile's value and the state !on;
+// - tile whose active flag already equals on: nothing is done.
+template<typename ChildT>
+inline void
+RootNode<ChildT>::setActiveState(const Coord& xyz, bool on)
+{
+    ChildT* node = NULL;
+    MapIter it = this->findCoord(xyz);
+    if (it != mTable.end()) {
+        if (isChild(it)) {
+            node = &getChild(it);
+        } else if (on != getTile(it).active) {
+            node = new ChildT(xyz, getTile(it).value, !on);
+            setChild(it, *node);
+        }
+    } else if (on) {
+        node = new ChildT(xyz, mBackground);
+        mTable[this->coordToKey(xyz)] = NodeStruct(*node);
+    }
+    if (node != NULL) node->setActiveState(xyz, on);
+}
+
+// Accessor-aware variant of setActiveState(): the child is located or created
+// by the same rules, then registered with the accessor via acc.insert()
+// before the request is forwarded to its setActiveStateAndCache().
+template<typename ChildT>
+template<typename AccessorT>
+inline void
+RootNode<ChildT>::setActiveStateAndCache(const Coord& xyz, bool on, AccessorT& acc)
+{
+    ChildT* node = NULL;
+    MapIter it = this->findCoord(xyz);
+    if (it != mTable.end()) {
+        if (isChild(it)) {
+            node = &getChild(it);
+        } else if (on != getTile(it).active) {
+            // The tile's state differs from the requested one: split it.
+            node = new ChildT(xyz, getTile(it).value, !on);
+            setChild(it, *node);
+        }
+    } else if (on) {
+        // Background is inactive, so a child is needed only to activate xyz.
+        node = new ChildT(xyz, mBackground);
+        mTable[this->coordToKey(xyz)] = NodeStruct(*node);
+    }
+    if (node == NULL) return;
+
+    acc.insert(xyz, node);
+    node->setActiveStateAndCache(xyz, on, acc);
+}
+
+
+// Forward setValueOff(xyz, value) to the child node containing xyz, creating
+// that child when required:
+// - no table entry: a background-filled child is inserted only if value is
+//   not exactly equal to the background;
+// - tile that is active or whose value is not exactly equal to value: the
+//   tile is replaced by a child constructed from the tile's value and state;
+// - inactive tile that already holds value: nothing is done.
+template<typename ChildT>
+inline void
+RootNode<ChildT>::setValueOff(const Coord& xyz, const ValueType& value)
+{
+    ChildT* node = NULL;
+    MapIter it = this->findCoord(xyz);
+    if (it != mTable.end()) {
+        if (isChild(it)) {
+            node = &getChild(it);
+        } else if (isTileOn(it) || !math::isExactlyEqual(getTile(it).value, value)) {
+            node = new ChildT(xyz, getTile(it).value, isTileOn(it));
+            setChild(it, *node);
+        }
+    } else if (!math::isExactlyEqual(mBackground, value)) {
+        node = new ChildT(xyz, mBackground);
+        mTable[this->coordToKey(xyz)] = NodeStruct(*node);
+    }
+    if (node != NULL) node->setValueOff(xyz, value);
+}
+
+// Accessor-aware variant of setValueOff(xyz, value): the child is located or
+// created by the same rules, then registered with the accessor via
+// acc.insert() before the request is forwarded to its setValueOffAndCache().
+template<typename ChildT>
+template<typename AccessorT>
+inline void
+RootNode<ChildT>::setValueOffAndCache(const Coord& xyz, const ValueType& value, AccessorT& acc)
+{
+    ChildT* node = NULL;
+    MapIter it = this->findCoord(xyz);
+    if (it != mTable.end()) {
+        if (isChild(it)) {
+            node = &getChild(it);
+        } else if (isTileOn(it) || !math::isExactlyEqual(getTile(it).value, value)) {
+            // The tile must be split to change a single voxel.
+            node = new ChildT(xyz, getTile(it).value, isTileOn(it));
+            setChild(it, *node);
+        }
+    } else if (!math::isExactlyEqual(mBackground, value)) {
+        node = new ChildT(xyz, mBackground);
+        mTable[this->coordToKey(xyz)] = NodeStruct(*node);
+    }
+    if (node == NULL) return;
+
+    acc.insert(xyz, node);
+    node->setValueOffAndCache(xyz, value, acc);
+}
+
+
+// Forward setValueOn(xyz, value) to the child node containing xyz, creating
+// that child when required:
+// - no table entry: a background-filled child is inserted;
+// - tile that is inactive or whose value is not exactly equal to value: the
+//   tile is replaced by a child constructed from the tile's value and state;
+// - active tile that already holds value: nothing is done.
+template<typename ChildT>
+inline void
+RootNode<ChildT>::setValueOn(const Coord& xyz, const ValueType& value)
+{
+    ChildT* node = NULL;
+    MapIter it = this->findCoord(xyz);
+    if (it != mTable.end()) {
+        if (isChild(it)) {
+            node = &getChild(it);
+        } else if (isTileOff(it) || !math::isExactlyEqual(getTile(it).value, value)) {
+            node = new ChildT(xyz, getTile(it).value, isTileOn(it));
+            setChild(it, *node);
+        }
+    } else {
+        node = new ChildT(xyz, mBackground);
+        mTable[this->coordToKey(xyz)] = NodeStruct(*node);
+    }
+    if (node != NULL) node->setValueOn(xyz, value);
+}
+
+// Accessor-aware counterpart of setValueOn(): the child is located or created
+// by the same rules, then registered with the accessor via acc.insert()
+// before the request is forwarded to its setValueAndCache().
+template<typename ChildT>
+template<typename AccessorT>
+inline void
+RootNode<ChildT>::setValueAndCache(const Coord& xyz, const ValueType& value, AccessorT& acc)
+{
+    ChildT* node = NULL;
+    MapIter it = this->findCoord(xyz);
+    if (it != mTable.end()) {
+        if (isChild(it)) {
+            node = &getChild(it);
+        } else if (isTileOff(it) || !math::isExactlyEqual(getTile(it).value, value)) {
+            // The tile must be split to change a single voxel.
+            node = new ChildT(xyz, getTile(it).value, isTileOn(it));
+            setChild(it, *node);
+        }
+    } else {
+        node = new ChildT(xyz, mBackground);
+        mTable[this->coordToKey(xyz)] = NodeStruct(*node);
+    }
+    if (node == NULL) return;
+
+    acc.insert(xyz, node);
+    node->setValueAndCache(xyz, value, acc);
+}
+
+
+// Forward setValueOnly(xyz, value) to the child node containing xyz, creating
+// that child when required:
+// - no table entry: a background-filled child is inserted;
+// - tile whose value is not exactly equal to value: the tile is replaced by
+//   a child constructed from the tile's value and state;
+// - tile that already holds value: nothing is done.
+template<typename ChildT>
+inline void
+RootNode<ChildT>::setValueOnly(const Coord& xyz, const ValueType& value)
+{
+    ChildT* node = NULL;
+    MapIter it = this->findCoord(xyz);
+    if (it != mTable.end()) {
+        if (isChild(it)) {
+            node = &getChild(it);
+        } else if (!math::isExactlyEqual(getTile(it).value, value)) {
+            node = new ChildT(xyz, getTile(it).value, isTileOn(it));
+            setChild(it, *node);
+        }
+    } else {
+        node = new ChildT(xyz, mBackground);
+        mTable[this->coordToKey(xyz)] = NodeStruct(*node);
+    }
+    if (node != NULL) node->setValueOnly(xyz, value);
+}
+
+// Accessor-aware variant of setValueOnly(): the child is located or created
+// by the same rules, then registered with the accessor via acc.insert()
+// before the request is forwarded to its setValueOnlyAndCache().
+template<typename ChildT>
+template<typename AccessorT>
+inline void
+RootNode<ChildT>::setValueOnlyAndCache(const Coord& xyz, const ValueType& value, AccessorT& acc)
+{
+    ChildT* node = NULL;
+    MapIter it = this->findCoord(xyz);
+    if (it != mTable.end()) {
+        if (isChild(it)) {
+            node = &getChild(it);
+        } else if (!math::isExactlyEqual(getTile(it).value, value)) {
+            // The tile must be split to change a single voxel's value.
+            node = new ChildT(xyz, getTile(it).value, isTileOn(it));
+            setChild(it, *node);
+        }
+    } else {
+        node = new ChildT(xyz, mBackground);
+        mTable[this->coordToKey(xyz)] = NodeStruct(*node);
+    }
+    if (node == NULL) return;
+
+    acc.insert(xyz, node);
+    node->setValueOnlyAndCache(xyz, value, acc);
+}
+
+
+// Forward modifyValue(xyz, op) to the child node containing xyz, creating
+// that child when required:
+// - no table entry: a background-filled child is inserted;
+// - inactive tile: the tile is replaced by a child (needed in order to
+//   activate the voxel at xyz);
+// - active tile: op is applied to a copy of the tile value, and the tile is
+//   replaced by a child only if the result is not exactly equal to the
+//   tile value.
+template<typename ChildT>
+template<typename ModifyOp>
+inline void
+RootNode<ChildT>::modifyValue(const Coord& xyz, const ModifyOp& op)
+{
+    ChildT* node = NULL;
+    MapIter it = this->findCoord(xyz);
+    if (it == mTable.end()) {
+        node = new ChildT(xyz, mBackground);
+        mTable[this->coordToKey(xyz)] = NodeStruct(*node);
+    } else if (isChild(it)) {
+        node = &getChild(it);
+    } else {
+        // An inactive tile always has to be split.
+        bool splitTile = true;
+        if (!isTileOff(it)) {
+            // For an active tile, split only if the functor alters the value.
+            const ValueType& before = getTile(it).value;
+            ValueType after = before;
+            op(after);
+            splitTile = !math::isExactlyEqual(before, after);
+        }
+        if (splitTile) {
+            node = new ChildT(xyz, getTile(it).value, isTileOn(it));
+            setChild(it, *node);
+        }
+    }
+    if (node != NULL) node->modifyValue(xyz, op);
+}
+
+// Accessor-aware variant of modifyValue(): the child is located or created
+// by the same rules, then registered with the accessor via acc.insert()
+// before the request is forwarded to its modifyValueAndCache().
+template<typename ChildT>
+template<typename ModifyOp, typename AccessorT>
+inline void
+RootNode<ChildT>::modifyValueAndCache(const Coord& xyz, const ModifyOp& op, AccessorT& acc)
+{
+    ChildT* node = NULL;
+    MapIter it = this->findCoord(xyz);
+    if (it == mTable.end()) {
+        node = new ChildT(xyz, mBackground);
+        mTable[this->coordToKey(xyz)] = NodeStruct(*node);
+    } else if (isChild(it)) {
+        node = &getChild(it);
+    } else {
+        // An inactive tile always has to be split in order to
+        // activate voxel (x, y, z).
+        bool splitTile = true;
+        if (!isTileOff(it)) {
+            // For an active tile, split only if applying the functor
+            // to the tile value yields a different value.
+            const ValueType& before = getTile(it).value;
+            ValueType after = before;
+            op(after);
+            splitTile = !math::isExactlyEqual(before, after);
+        }
+        if (splitTile) {
+            node = new ChildT(xyz, getTile(it).value, isTileOn(it));
+            setChild(it, *node);
+        }
+    }
+    if (node == NULL) return;
+
+    acc.insert(xyz, node);
+    node->modifyValueAndCache(xyz, op, acc);
+}
+
+
+// Forward modifyValueAndActiveState(xyz, op) to the child node containing
+// xyz, creating that child when required:
+// - no table entry: a background-filled child is inserted;
+// - tile: op is applied to copies of the tile's value and active flag, and
+//   the tile is replaced by a child only if either copy was changed.
+template<typename ChildT>
+template<typename ModifyOp>
+inline void
+RootNode<ChildT>::modifyValueAndActiveState(const Coord& xyz, const ModifyOp& op)
+{
+    ChildT* node = NULL;
+    MapIter it = this->findCoord(xyz);
+    if (it == mTable.end()) {
+        node = new ChildT(xyz, mBackground);
+        mTable[this->coordToKey(xyz)] = NodeStruct(*node);
+    } else if (isChild(it)) {
+        node = &getChild(it);
+    } else {
+        const Tile& tile = getTile(it);
+        bool newState = tile.active;
+        ValueType newValue = tile.value;
+        op(newValue, newState);
+        // The tile can stay as it is only if the functor left both the
+        // active state and the value untouched.
+        const bool unchanged =
+            (newState == tile.active) && math::isExactlyEqual(newValue, tile.value);
+        if (!unchanged) {
+            node = new ChildT(xyz, tile.value, tile.active);
+            setChild(it, *node);
+        }
+    }
+    if (node != NULL) node->modifyValueAndActiveState(xyz, op);
+}
+
+// Accessor-aware variant of modifyValueAndActiveState(): the child is located
+// or created by the same rules, then registered with the accessor via
+// acc.insert() before the request is forwarded to its
+// modifyValueAndActiveStateAndCache().
+template<typename ChildT>
+template<typename ModifyOp, typename AccessorT>
+inline void
+RootNode<ChildT>::modifyValueAndActiveStateAndCache(
+    const Coord& xyz, const ModifyOp& op, AccessorT& acc)
+{
+    ChildT* node = NULL;
+    MapIter it = this->findCoord(xyz);
+    if (it == mTable.end()) {
+        node = new ChildT(xyz, mBackground);
+        mTable[this->coordToKey(xyz)] = NodeStruct(*node);
+    } else if (isChild(it)) {
+        node = &getChild(it);
+    } else {
+        const Tile& tile = getTile(it);
+        bool newState = tile.active;
+        ValueType newValue = tile.value;
+        op(newValue, newState);
+        // The tile can stay as it is only if the functor left both the
+        // active state and the value untouched.
+        const bool unchanged =
+            (newState == tile.active) && math::isExactlyEqual(newValue, tile.value);
+        if (!unchanged) {
+            node = new ChildT(xyz, tile.value, tile.active);
+            setChild(it, *node);
+        }
+    }
+    if (node == NULL) return;
+
+    acc.insert(xyz, node);
+    node->modifyValueAndActiveStateAndCache(xyz, op, acc);
+}
+
+
+// Store the value found at xyz in @a value and return its active state:
+// background value and false if no table entry is found, the child's
+// probeValue() result if the entry is a child, and otherwise the tile's
+// value together with isTileOn().
+template<typename ChildT>
+inline bool
+RootNode<ChildT>::probeValue(const Coord& xyz, ValueType& value) const
+{
+    MapCIter it = this->findCoord(xyz);
+    if (it != mTable.end()) {
+        if (isChild(it)) {
+            return getChild(it).probeValue(xyz, value);
+        }
+        value = getTile(it).value;
+        return isTileOn(it);
+    }
+    value = mBackground;
+    return false;
+}
+
+// Accessor-aware variant of probeValue(): when xyz resolves to a child node,
+// the child is registered with the accessor via acc.insert() before the query
+// is forwarded to its probeValueAndCache().
+template<typename ChildT>
+template<typename AccessorT>
+inline bool
+RootNode<ChildT>::probeValueAndCache(const Coord& xyz, ValueType& value, AccessorT& acc) const
+{
+    MapCIter it = this->findCoord(xyz);
+    if (it != mTable.end()) {
+        if (isChild(it)) {
+            acc.insert(xyz, &getChild(it));
+            return getChild(it).probeValueAndCache(xyz, value, acc);
+        }
+        value = getTile(it).value;
+        return isTileOn(it);
+    }
+    value = mBackground;
+    return false;
+}
+
+
+////////////////////////////////////////
+
+
+// Fill the region given by bbox with the given value and active state.
+//
+// The box is traversed one root-level tile at a time.  Wherever the remaining
+// box (xyz, bbox.max()) completely encloses a tile, a root-level tile with the
+// fill value is stored; otherwise the request is forwarded to a child node,
+// which is created first if necessary.  An empty bbox is a no-op.
+// If sparse is false, voxelizeActiveTiles() is called once the fill is done.
+template<typename ChildT>
+inline void
+RootNode<ChildT>::fill(const CoordBBox& bbox, const ValueType& value, bool active, bool sparse)
+{
+    if (bbox.empty()) return;
+
+    // Note: each loop advances to one past the max coordinate of the tile
+    // visited last, so tileMax must be assigned on every innermost iteration.
+    Coord xyz, tileMax;
+    for (int x = bbox.min().x(); x <= bbox.max().x(); x = tileMax.x() + 1) {
+        xyz.setX(x);
+        for (int y = bbox.min().y(); y <= bbox.max().y(); y = tileMax.y() + 1) {
+            xyz.setY(y);
+            for (int z = bbox.min().z(); z <= bbox.max().z(); z = tileMax.z() + 1) {
+                xyz.setZ(z);
+
+                // Get the bounds of the tile that contains voxel (x, y, z).
+                Coord tileMin = coordToKey(xyz);
+                tileMax = tileMin.offsetBy(ChildT::DIM - 1);
+
+                if (xyz != tileMin || Coord::lessThan(bbox.max(), tileMax)) {
+                    // If the box defined by (xyz, bbox.max()) doesn't completely enclose
+                    // the tile to which xyz belongs, create a child node (or retrieve
+                    // the existing one).
+                    ChildT* child = NULL;
+                    MapIter iter = this->findKey(tileMin);
+                    if (iter == mTable.end()) {
+                        // No child or tile exists.  Create a child and initialize it
+                        // with the background value.
+                        child = new ChildT(xyz, mBackground);
+                        mTable[tileMin] = NodeStruct(*child);
+                    } else if (isTile(iter)) {
+                        // Replace the tile with a newly-created child that is initialized
+                        // with the tile's value and active state.
+                        const Tile& tile = getTile(iter);
+                        child = new ChildT(xyz, tile.value, tile.active);
+                        mTable[tileMin] = NodeStruct(*child);
+                    } else if (isChild(iter)) {
+                        child = &getChild(iter);
+                    }
+                    // Forward the fill request to the child, restricted to the part
+                    // of the box that lies within this tile.
+                    if (child) {
+                        child->fill(CoordBBox(xyz, Coord::minComponent(bbox.max(), tileMax)),
+                            value, active);
+                    }
+                } else {
+                    // If the box given by (xyz, bbox.max()) completely encloses
+                    // the tile to which xyz belongs, create the tile (if it
+                    // doesn't already exist) and give it the fill value.
+                    MapIter iter = this->findOrAddCoord(tileMin);
+                    setTile(iter, Tile(value, active));
+                }
+            }
+        }
+    }
+    if (!sparse) this->voxelizeActiveTiles(/*multi-threaded=*/true);
+}
+
+// Copy the values that lie inside bbox into the dense grid @a dense.
+//
+// The box is traversed one root-level table entry at a time.  Regions covered
+// by a child node are delegated to the child's copyToDense(); regions covered
+// by a tile or by background are written directly into the dense array,
+// converting each value to the dense grid's value type.
+template<typename ChildT>
+template<typename DenseT>
+inline void
+RootNode<ChildT>::copyToDense(const CoordBBox& bbox, DenseT& dense) const
+{
+    typedef typename DenseT::ValueType DenseValueType;
+
+    const size_t xStride = dense.xStride(), yStride = dense.yStride(), zStride = dense.zStride();
+    const Coord& min = dense.bbox().min();
+    CoordBBox nodeBBox;
+    // Each loop advances to one past the max coordinate of the node bbox visited last.
+    for (Coord xyz = bbox.min(); xyz[0] <= bbox.max()[0]; xyz[0] = nodeBBox.max()[0] + 1) {
+        for (xyz[1] = bbox.min()[1]; xyz[1] <= bbox.max()[1]; xyz[1] = nodeBBox.max()[1] + 1) {
+            for (xyz[2] = bbox.min()[2]; xyz[2] <= bbox.max()[2]; xyz[2] = nodeBBox.max()[2] + 1) {
+
+                // Get the coordinate bbox of the child node that contains voxel xyz.
+                nodeBBox = CoordBBox::createCube(coordToKey(xyz), ChildT::DIM);
+
+                // Get the coordinate bbox of the intersection of bbox and nodeBBox.
+                CoordBBox sub(xyz, Coord::minComponent(bbox.max(), nodeBBox.max()));
+
+                MapCIter iter = this->findKey(nodeBBox.min());
+                if (iter != mTable.end() && isChild(iter)) {//is a child
+                    getChild(iter).copyToDense(sub, dense);
+                } else {//is background or a tile value
+                    const ValueType value = iter==mTable.end() ? mBackground : getTile(iter).value;
+                    // Make sub relative to the min corner of the dense grid's bbox.
+                    sub.translate(-min);
+                    // a0 already includes the z offset of the first voxel to write.
+                    DenseValueType* a0 = dense.data() + zStride*sub.min()[2];
+                    for (Int32 x=sub.min()[0], ex=sub.max()[0]+1; x<ex; ++x) {
+                        DenseValueType* a1 = a0 + x*xStride;
+                        for (Int32 y=sub.min()[1], ey=sub.max()[1]+1; y<ey; ++y) {
+                            DenseValueType* a2 = a1 + y*yStride;
+                            for (Int32 z=sub.min()[2], ez=sub.max()[2]+1; z<ez; ++z, a2 += zStride) {
+                                *a2 =  DenseValueType(value);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+////////////////////////////////////////
+
+
+// Serialize this node's topology to @a os.
+//
+// Written in order: the background value (passed through
+// io::truncateRealToHalf() first when toHalf is true), the tile count, the
+// child count, then every tile (key, value, active flag) followed by every
+// child (key, then the child's own writeTopology() output).
+// Returns false if the table holds neither tiles nor children, true otherwise.
+template<typename ChildT>
+inline bool
+RootNode<ChildT>::writeTopology(std::ostream& os, bool toHalf) const
+{
+    if (toHalf) {
+        ValueType truncatedVal = io::truncateRealToHalf(mBackground);
+        os.write(reinterpret_cast<const char*>(&truncatedVal), sizeof(ValueType));
+    } else {
+        os.write(reinterpret_cast<const char*>(&mBackground), sizeof(ValueType));
+    }
+    io::setGridBackgroundValuePtr(os, &mBackground);
+
+    const Index tileTotal = this->getTileCount();
+    const Index childTotal = this->getChildCount();
+    os.write(reinterpret_cast<const char*>(&tileTotal), sizeof(Index));
+    os.write(reinterpret_cast<const char*>(&childTotal), sizeof(Index));
+
+    if (tileTotal == 0 && childTotal == 0) return false; // empty
+
+    // First pass: tiles only.
+    for (MapCIter it = mTable.begin(), last = mTable.end(); it != last; ++it) {
+        if (!isChild(it)) {
+            os.write(reinterpret_cast<const char*>(it->first.asPointer()), 3 * sizeof(Int32));
+            os.write(reinterpret_cast<const char*>(&getTile(it).value), sizeof(ValueType));
+            os.write(reinterpret_cast<const char*>(&getTile(it).active), sizeof(bool));
+        }
+    }
+    // Second pass: child nodes only.
+    for (MapCIter it = mTable.begin(), last = mTable.end(); it != last; ++it) {
+        if (!isTile(it)) {
+            os.write(reinterpret_cast<const char*>(it->first.asPointer()), 3 * sizeof(Int32));
+            getChild(it).writeTopology(os, toHalf);
+        }
+    }
+
+    return true; // not empty
+}
+
+
+// Replace this node's contents with the topology read from @a is.
+//
+// Any existing tree is deleted first.  Two layouts are handled:
+// - files whose format version is older than OPENVDB_FILE_VERSION_ROOTNODE_MAP
+//   store a dense table described by an index range plus child and value
+//   masks; that table is converted into map entries, and true is returned;
+// - otherwise the stream holds the background value, a tile count, a child
+//   count, the tiles and then the children, mirroring writeTopology().
+//   In this case false is returned if both counts are zero, true otherwise.
+// fromHalf is forwarded to the children's readTopology() in the current-format
+// branch only.
+template<typename ChildT>
+inline bool
+RootNode<ChildT>::readTopology(std::istream& is, bool fromHalf)
+{
+    // Delete the existing tree.
+    this->clear();
+
+    if (io::getFormatVersion(is) < OPENVDB_FILE_VERSION_ROOTNODE_MAP) {
+        // Read and convert an older-format RootNode.
+
+        // For backward compatibility with older file formats, read both
+        // outside and inside background values.
+        is.read(reinterpret_cast<char*>(&mBackground), sizeof(ValueType));
+        ValueType inside;
+        is.read(reinterpret_cast<char*>(&inside), sizeof(ValueType));
+
+        io::setGridBackgroundValuePtr(is, &mBackground);
+
+        // Read the index range.
+        Coord rangeMin, rangeMax;
+        is.read(reinterpret_cast<char*>(rangeMin.asPointer()), 3 * sizeof(Int32));
+        is.read(reinterpret_cast<char*>(rangeMax.asPointer()), 3 * sizeof(Int32));
+
+        this->initTable();
+        // log2Dim[0..2] hold the per-axis log2 table dimensions; log2Dim[3] holds
+        // the combined y+z shift used to extract the x index from a linear offset.
+        Index tableSize = 0, log2Dim[4] = { 0, 0, 0, 0 };
+        Int32 offset[3];
+        for (int i = 0; i < 3; ++i) {
+            offset[i] = rangeMin[i] >> ChildT::TOTAL;
+            rangeMin[i] = offset[i] << ChildT::TOTAL;
+            log2Dim[i] = 1 + util::FindHighestOn((rangeMax[i] >> ChildT::TOTAL) - offset[i]);
+            tableSize += log2Dim[i];
+            rangeMax[i] = (((1 << log2Dim[i]) + offset[i]) << ChildT::TOTAL) - 1;
+        }
+        log2Dim[3] = log2Dim[1] + log2Dim[2];
+        tableSize = 1U << tableSize;
+
+        // Read masks.
+        util::RootNodeMask childMask(tableSize), valueMask(tableSize);
+        childMask.load(is);
+        valueMask.load(is);
+
+        // Read child nodes/values.
+        for (Index i = 0; i < tableSize; ++i) {
+            // Compute origin = offset2coord(i).
+            Index n = i;
+            Coord origin;
+            origin[0] = (n >> log2Dim[3]) + offset[0];
+            n &= (1U << log2Dim[3]) - 1;
+            origin[1] = (n >> log2Dim[2]) + offset[1];
+            // The z index must be shifted by the z offset (not the y offset).
+            origin[2] = (n & ((1U << log2Dim[2]) - 1)) + offset[2];
+            origin <<= ChildT::TOTAL;
+
+            if (childMask.isOn(i)) {
+                // Read in and insert a child node.
+                // NOTE(review): fromHalf is not forwarded in this legacy branch;
+                // confirm that this is intentional.
+#ifdef OPENVDB_2_ABI_COMPATIBLE
+                ChildT* child = new ChildT(origin, mBackground);
+#else
+                ChildT* child = new ChildT(PartialCreate(), origin, mBackground);
+#endif
+                child->readTopology(is);
+                mTable[origin] = NodeStruct(*child);
+            } else {
+                // Read in a tile value and insert a tile, but only if the value
+                // is either active or non-background.
+                ValueType value;
+                is.read(reinterpret_cast<char*>(&value), sizeof(ValueType));
+                if (valueMask.isOn(i) || (!math::isApproxEqual(value, mBackground))) {
+                    mTable[origin] = NodeStruct(Tile(value, valueMask.isOn(i)));
+                }
+            }
+        }
+        return true;
+    }
+
+    // Read a RootNode that was stored in the current format.
+
+    is.read(reinterpret_cast<char*>(&mBackground), sizeof(ValueType));
+    io::setGridBackgroundValuePtr(is, &mBackground);
+
+    Index numTiles = 0, numChildren = 0;
+    is.read(reinterpret_cast<char*>(&numTiles), sizeof(Index));
+    is.read(reinterpret_cast<char*>(&numChildren), sizeof(Index));
+
+    if (numTiles == 0 && numChildren == 0) return false;
+
+    Int32 vec[3];
+    ValueType value;
+    bool active;
+
+    // Read tiles.
+    for (Index n = 0; n < numTiles; ++n) {
+        is.read(reinterpret_cast<char*>(vec), 3 * sizeof(Int32));
+        is.read(reinterpret_cast<char*>(&value), sizeof(ValueType));
+        is.read(reinterpret_cast<char*>(&active), sizeof(bool));
+        mTable[Coord(vec)] = NodeStruct(Tile(value, active));
+    }
+
+    // Read child nodes.
+    for (Index n = 0; n < numChildren; ++n) {
+        is.read(reinterpret_cast<char*>(vec), 3 * sizeof(Int32));
+        Coord origin(vec);
+#ifdef OPENVDB_2_ABI_COMPATIBLE
+        ChildT* child = new ChildT(origin, mBackground);
+#else
+        ChildT* child = new ChildT(PartialCreate(), origin, mBackground);
+#endif
+        child->readTopology(is, fromHalf);
+        mTable[Coord(vec)] = NodeStruct(*child);
+    }
+
+    return true; // not empty
+}
+
+
+// Call writeBuffers(os, toHalf) on every child node, in table order.
+// Tiles are skipped.
+template<typename ChildT>
+inline void
+RootNode<ChildT>::writeBuffers(std::ostream& os, bool toHalf) const
+{
+    for (MapCIter it = mTable.begin(), last = mTable.end(); it != last; ++it) {
+        if (!isChild(it)) continue;
+        getChild(it).writeBuffers(os, toHalf);
+    }
+}
+
+
+// Call readBuffers(is, fromHalf) on every child node, in table order.
+// Tiles are skipped.
+template<typename ChildT>
+inline void
+RootNode<ChildT>::readBuffers(std::istream& is, bool fromHalf)
+{
+    for (MapIter it = mTable.begin(), last = mTable.end(); it != last; ++it) {
+        if (!isChild(it)) continue;
+        getChild(it).readBuffers(is, fromHalf);
+    }
+}
+
+
+// Call readBuffers(is, clipBBox, fromHalf) on every child node, in table
+// order, then clip this node to clipBBox.
+template<typename ChildT>
+inline void
+RootNode<ChildT>::readBuffers(std::istream& is, const CoordBBox& clipBBox, bool fromHalf)
+{
+    for (MapIter i = mTable.begin(), e = mTable.end(); i != e; ++i) {
+        if (isChild(i)) {
+            // Stream in and clip the branch rooted at this child.
+            // (We can't skip over children that lie outside the clipping region,
+            // because buffers are serialized in depth-first order and need to be
+            // unserialized in the same order.)
+            ChildT& child = getChild(i);
+            child.readBuffers(is, clipBBox, fromHalf);
+        }
+    }
+    // Clip root-level tiles and prune children that were clipped.
+    this->clip(clipBBox);
+}
+
+
+////////////////////////////////////////
+
+
+// Restrict this node's contents to the region given by clipBBox.
+//
+// Table entries entirely outside clipBBox are erased, entries entirely inside
+// are left untouched, and entries that straddle the boundary are clipped:
+// children via their own clip(), tiles by resetting them to background and
+// re-filling the overlapping region with the original tile value and state.
+// prune() is called at the end.
+template<typename ChildT>
+inline void
+RootNode<ChildT>::clip(const CoordBBox& clipBBox)
+{
+    const Tile bgTile(mBackground, /*active=*/false);
+
+    // Iterate over a copy of this node's table so that we can modify the original.
+    // (Copying the table copies child node pointers, not the nodes themselves.)
+    MapType copyOfTable(mTable);
+    for (MapIter i = copyOfTable.begin(), e = copyOfTable.end(); i != e; ++i) {
+        const Coord& xyz = i->first; // tile or child origin
+        CoordBBox tileBBox(xyz, xyz.offsetBy(ChildT::DIM - 1)); // tile or child bounds
+        if (!clipBBox.hasOverlap(tileBBox)) {
+            // This table entry lies completely outside the clipping region.  Delete it.
+            setTile(this->findCoord(xyz), bgTile); // delete any existing child node first
+            mTable.erase(xyz);
+        } else if (!clipBBox.isInside(tileBBox)) {
+            // This table entry does not lie completely inside the clipping region
+            // and must be clipped.
+            if (isChild(i)) {
+                getChild(i).clip(clipBBox, mBackground);
+            } else {
+                // Replace this tile with a background tile, then fill the clip region
+                // with the tile's original value.  (This might create a child branch.)
+                tileBBox.intersect(clipBBox);
+                // origTile refers to the entry in the copied table, so it is not
+                // affected by the setTile() call on the original table below.
+                const Tile& origTile = getTile(i);
+                setTile(this->findCoord(xyz), bgTile);
+                this->fill(tileBBox, origTile.value, origTile.active);
+            }
+        } else {
+            // This table entry lies completely inside the clipping region.  Leave it intact.
+        }
+    }
+    this->prune(); // also erases root-level background tiles
+}
+
+
+////////////////////////////////////////
+
+
+// Prune every child branch of this root node using @a tolerance, collapse
+// any child that turns out to be constant into a root-level tile carrying
+// that constant value and active state, and finally erase background tiles.
+template<typename ChildT>
+inline void
+RootNode<ChildT>::prune(const ValueType& tolerance)
+{
+    // Filled in by isConstant() whenever a child reports a constant value.
+    bool constActive = false;
+    ValueType constValue = zeroVal<ValueType>();
+
+    const MapIter last = mTable.end();
+    for (MapIter it = mTable.begin(); it != last; ++it) {
+        if (!this->isTile(it)) {
+            // Prune bottom-up: the subtree first, then test the child itself.
+            this->getChild(it).prune(tolerance);
+            const bool collapsible =
+                this->getChild(it).isConstant(constValue, constActive, tolerance);
+            if (collapsible) {
+                this->setTile(it, Tile(constValue, constActive));
+            }
+        }
+    }
+    this->eraseBackgroundTiles();
+}
+
+
+////////////////////////////////////////
+
+
+// Locate the node of type @a NodeT that contains voxel @a xyz and hand it over
+// to the caller.  Returns NULL if @a NodeT cannot occur below this root, or if
+// the table has no child entry for @a xyz.  When @a NodeT is the child type
+// itself, the child is taken with stealChild(), which is passed a tile built
+// from @a value and @a state; otherwise the request is forwarded to the child.
+template<typename ChildT>
+template<typename NodeT>
+inline NodeT*
+RootNode<ChildT>::stealNode(const Coord& xyz, const ValueType& value, bool state)
+{
+    // A node type at the child's level must be the child type itself,
+    // and nothing above the child's level can be found below the root.
+    const bool isChildType = boost::is_same<NodeT, ChildT>::value;
+    if (NodeT::LEVEL >  ChildT::LEVEL ||
+        (NodeT::LEVEL == ChildT::LEVEL && !isChildType)) return NULL;
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+    MapIter pos = this->findCoord(xyz);
+    if (pos == mTable.end() || isTile(pos)) return NULL;
+    if (isChildType) {
+        return reinterpret_cast<NodeT*>(&stealChild(pos, Tile(value, state)));
+    }
+    return getChild(pos).template stealNode<NodeT>(xyz, value, state);
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+}
+
+
+////////////////////////////////////////
+
+
+// Insert @a leaf into the tree at the position given by its origin.
+// A null pointer is ignored.  If ChildT::LEVEL is greater than zero, the
+// matching child branch is looked up, or created from the background or from
+// the tile it replaces, and the leaf is passed down to it.  Otherwise @a leaf
+// itself is stored as the table entry, replacing any existing child.
+template<typename ChildT>
+inline void
+RootNode<ChildT>::addLeaf(LeafNodeType* leaf)
+{
+    if (!leaf) return;
+    const Coord& origin = leaf->origin();
+    MapIter pos = this->findCoord(origin);
+    ChildT* branch = NULL;
+    if (pos == mTable.end()) {
+        // No table entry yet: a new branch starts out as inactive background.
+        branch = (ChildT::LEVEL>0) ? new ChildT(origin, mBackground, false)
+                                   : reinterpret_cast<ChildT*>(leaf);
+        mTable[this->coordToKey(origin)] = NodeStruct(*branch);
+    } else if (isChild(pos)) {
+        if (ChildT::LEVEL>0) {
+            branch = &getChild(pos);
+        } else {
+            branch = reinterpret_cast<ChildT*>(leaf);
+            setChild(pos, *branch); // this also deletes the existing child node
+        }
+    } else {
+        // Tile entry: a new branch inherits the tile's value and active state.
+        branch = (ChildT::LEVEL>0)
+            ? new ChildT(origin, getTile(pos).value, isTileOn(pos))
+            : reinterpret_cast<ChildT*>(leaf);
+        setChild(pos, *branch);
+    }
+    branch->addLeaf(leaf);
+}
+
+
+// Same as addLeaf(), but additionally registers the child branch that was
+// found or created with the accessor @a acc before descending into it.
+// A null @a leaf is ignored.
+template<typename ChildT>
+template<typename AccessorT>
+inline void
+RootNode<ChildT>::addLeafAndCache(LeafNodeType* leaf, AccessorT& acc)
+{
+    if (!leaf) return;
+    const Coord& origin = leaf->origin();
+    MapIter pos = this->findCoord(origin);
+    ChildT* branch = NULL;
+    if (pos == mTable.end()) {
+        // No table entry yet: a new branch starts out as inactive background.
+        branch = (ChildT::LEVEL>0) ? new ChildT(origin, mBackground, false)
+                                   : reinterpret_cast<ChildT*>(leaf);
+        mTable[this->coordToKey(origin)] = NodeStruct(*branch);
+    } else if (isChild(pos)) {
+        if (ChildT::LEVEL>0) {
+            branch = &getChild(pos);
+        } else {
+            branch = reinterpret_cast<ChildT*>(leaf);
+            setChild(pos, *branch); // this also deletes the existing child node
+        }
+    } else {
+        // Tile entry: a new branch inherits the tile's value and active state.
+        branch = (ChildT::LEVEL>0)
+            ? new ChildT(origin, getTile(pos).value, isTileOn(pos))
+            : reinterpret_cast<ChildT*>(leaf);
+        setChild(pos, *branch);
+    }
+    acc.insert(origin, branch);
+    branch->addLeafAndCache(leaf, acc);
+}
+
+// Store a root-level tile with the given @a value and active @a state in the
+// table entry that covers @a xyz, creating the entry if it does not exist and
+// replacing whatever child or tile is currently stored there otherwise.
+template<typename ChildT>
+inline void
+RootNode<ChildT>::addTile(const Coord& xyz, const ValueType& value, bool state)
+{
+    MapIter pos = this->findCoord(xyz);
+    if (pos != mTable.end()) {
+        // Existing child or tile.
+        setTile(pos, Tile(value, state)); // this also deletes the existing child node
+        return;
+    }
+    // Background: no entry yet, so insert a new one.
+    mTable[this->coordToKey(xyz)] = NodeStruct(Tile(value, state));
+}
+
+// Add a tile with the given @a value and active @a state at tree level
+// @a level, covering voxel @a xyz.  Nothing happens if @a level is above the
+// root's own LEVEL.  If @a level equals LEVEL the tile is stored in the root
+// table; otherwise the request is forwarded to the child branch containing
+// @a xyz, which is created from the background or from an existing tile
+// if necessary.
+template<typename ChildT>
+inline void
+RootNode<ChildT>::addTile(Index level, const Coord& xyz,
+                          const ValueType& value, bool state)
+{
+    if (LEVEL < level) return;
+
+    MapIter pos = this->findCoord(xyz);
+    const bool vacant = (pos == mTable.end()); // background, i.e. no entry
+
+    if (!(LEVEL > level)) {
+        // The tile belongs at the root level.
+        if (vacant) {
+            mTable[this->coordToKey(xyz)] = NodeStruct(Tile(value, state));
+        } else {
+            setTile(pos, Tile(value, state)); // this also deletes any existing child node
+        }
+        return;
+    }
+
+    // The tile belongs further down: find or create the branch to descend into.
+    ChildT* branch = NULL;
+    if (vacant) {
+        branch = new ChildT(xyz, mBackground, false);
+        mTable[this->coordToKey(xyz)] = NodeStruct(*branch);
+    } else if (isChild(pos)) {
+        branch = &getChild(pos);
+    } else {
+        // Expand the existing tile into a branch with the same value and state.
+        branch = new ChildT(xyz, getTile(pos).value, isTileOn(pos));
+        setChild(pos, *branch);
+    }
+    branch->addTile(level, xyz, value, state);
+}
+
+
+// Same as addTile(level, xyz, value, state), but whenever the request is
+// forwarded to a child branch, that branch is first registered with the
+// accessor @a acc.
+template<typename ChildT>
+template<typename AccessorT>
+inline void
+RootNode<ChildT>::addTileAndCache(Index level, const Coord& xyz, const ValueType& value,
+                                  bool state, AccessorT& acc)
+{
+    if (LEVEL < level) return;
+
+    MapIter pos = this->findCoord(xyz);
+    const bool vacant = (pos == mTable.end()); // background, i.e. no entry
+
+    if (!(LEVEL > level)) {
+        // The tile belongs at the root level.
+        if (vacant) {
+            mTable[this->coordToKey(xyz)] = NodeStruct(Tile(value, state));
+        } else {
+            setTile(pos, Tile(value, state)); // this also deletes any existing child node
+        }
+        return;
+    }
+
+    // The tile belongs further down: find or create the branch, cache it, descend.
+    ChildT* branch = NULL;
+    if (vacant) {
+        branch = new ChildT(xyz, mBackground, false);
+        acc.insert(xyz, branch);
+        mTable[this->coordToKey(xyz)] = NodeStruct(*branch);
+    } else if (isChild(pos)) {
+        branch = &getChild(pos);
+        acc.insert(xyz, branch);
+    } else {
+        // Expand the existing tile into a branch with the same value and state.
+        branch = new ChildT(xyz, getTile(pos).value, isTileOn(pos));
+        acc.insert(xyz, branch);
+        setChild(pos, *branch);
+    }
+    branch->addTileAndCache(level, xyz, value, state, acc);
+}
+
+
+////////////////////////////////////////
+
+
+// Return the leaf node that contains voxel @a xyz, as produced by the child
+// branch's own touchLeaf().  If the root table has no child covering @a xyz,
+// one is created first: from the background when there is no entry at all,
+// or from the value and active state of the tile it replaces.
+template<typename ChildT>
+inline typename ChildT::LeafNodeType*
+RootNode<ChildT>::touchLeaf(const Coord& xyz)
+{
+    MapIter pos = this->findCoord(xyz);
+
+    // Fast path: a child branch already exists.
+    if (pos != mTable.end() && isChild(pos)) {
+        return getChild(pos).touchLeaf(xyz);
+    }
+
+    ChildT* branch = NULL;
+    if (pos == mTable.end()) {
+        branch = new ChildT(xyz, mBackground, false);
+        mTable[this->coordToKey(xyz)] = NodeStruct(*branch);
+    } else {
+        branch = new ChildT(xyz, getTile(pos).value, isTileOn(pos));
+        setChild(pos, *branch);
+    }
+    return branch->touchLeaf(xyz);
+}
+
+
+// Same as touchLeaf(), but the child branch that is found or created is
+// registered with the accessor @a acc before descending into it.
+template<typename ChildT>
+template<typename AccessorT>
+inline typename ChildT::LeafNodeType*
+RootNode<ChildT>::touchLeafAndCache(const Coord& xyz, AccessorT& acc)
+{
+    MapIter pos = this->findCoord(xyz);
+    ChildT* branch = NULL;
+    if (pos != mTable.end() && isChild(pos)) {
+        branch = &getChild(pos);
+    } else if (pos == mTable.end()) {
+        // No entry: create a branch filled with inactive background.
+        branch = new ChildT(xyz, mBackground, false);
+        mTable[this->coordToKey(xyz)] = NodeStruct(*branch);
+    } else {
+        // Tile: expand it into a branch with the same value and active state.
+        branch = new ChildT(xyz, getTile(pos).value, isTileOn(pos));
+        setChild(pos, *branch);
+    }
+    acc.insert(xyz, branch);
+    return branch->touchLeafAndCache(xyz, acc);
+}
+
+
+////////////////////////////////////////
+
+
+// Return a pointer to the node of type @a NodeT that contains voxel @a xyz,
+// or NULL if @a NodeT cannot occur below this root or if the root table holds
+// no child for @a xyz.  The tree is not modified.
+template<typename ChildT>
+template<typename NodeT>
+inline NodeT*
+RootNode<ChildT>::probeNode(const Coord& xyz)
+{
+    // A node type at the child's level must be the child type itself,
+    // and nothing above the child's level can be found below the root.
+    const bool isChildType = boost::is_same<NodeT, ChildT>::value;
+    if (NodeT::LEVEL >  ChildT::LEVEL ||
+        (NodeT::LEVEL == ChildT::LEVEL && !isChildType)) return NULL;
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+    MapIter pos = this->findCoord(xyz);
+    if (pos == mTable.end() || isTile(pos)) return NULL;
+    ChildT* branch = &getChild(pos);
+    if (isChildType) {
+        return reinterpret_cast<NodeT*>(branch);
+    }
+    return branch->template probeNode<NodeT>(xyz);
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+}
+
+
+// Const counterpart of probeNode(): return a pointer to the const node of
+// type @a NodeT that contains voxel @a xyz, or NULL if @a NodeT cannot occur
+// below this root or if the root table holds no child for @a xyz.
+template<typename ChildT>
+template<typename NodeT>
+inline const NodeT*
+RootNode<ChildT>::probeConstNode(const Coord& xyz) const
+{
+    // A node type at the child's level must be the child type itself,
+    // and nothing above the child's level can be found below the root.
+    const bool isChildType = boost::is_same<NodeT, ChildT>::value;
+    if (NodeT::LEVEL >  ChildT::LEVEL ||
+        (NodeT::LEVEL == ChildT::LEVEL && !isChildType)) return NULL;
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+    MapCIter pos = this->findCoord(xyz);
+    if (pos == mTable.end() || isTile(pos)) return NULL;
+    const ChildT* branch = &getChild(pos);
+    if (isChildType) {
+        return reinterpret_cast<const NodeT*>(branch);
+    }
+    return branch->template probeConstNode<NodeT>(xyz);
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+}
+
+
+// Return the leaf node containing voxel @a xyz, or NULL if there is none.
+// This is probeNode() instantiated for the leaf node type.
+template<typename ChildT>
+inline typename ChildT::LeafNodeType*
+RootNode<ChildT>::probeLeaf(const Coord& xyz)
+{
+    LeafNodeType* found = this->template probeNode<LeafNodeType>(xyz);
+    return found;
+}
+
+
+// Return the const leaf node containing voxel @a xyz, or NULL if there is
+// none.  This is probeConstNode() instantiated for the leaf node type.
+template<typename ChildT>
+inline const typename ChildT::LeafNodeType*
+RootNode<ChildT>::probeConstLeaf(const Coord& xyz) const
+{
+    const LeafNodeType* found = this->template probeConstNode<LeafNodeType>(xyz);
+    return found;
+}
+
+
+// Return the leaf node containing voxel @a xyz, or NULL if there is none,
+// passing the accessor @a acc along so that visited nodes can be cached.
+// This is probeNodeAndCache() instantiated for the leaf node type.
+template<typename ChildT>
+template<typename AccessorT>
+inline typename ChildT::LeafNodeType*
+RootNode<ChildT>::probeLeafAndCache(const Coord& xyz, AccessorT& acc)
+{
+    LeafNodeType* found = this->template probeNodeAndCache<LeafNodeType>(xyz, acc);
+    return found;
+}
+
+
+// Return the const leaf node containing voxel @a xyz, or NULL if there is
+// none, passing the accessor @a acc along so that visited nodes can be cached.
+// This is probeConstNodeAndCache() instantiated for the leaf node type.
+template<typename ChildT>
+template<typename AccessorT>
+inline const typename ChildT::LeafNodeType*
+RootNode<ChildT>::probeConstLeafAndCache(const Coord& xyz, AccessorT& acc) const
+{
+    const LeafNodeType* found =
+        this->template probeConstNodeAndCache<LeafNodeType>(xyz, acc);
+    return found;
+}
+
+
+// Const overload of probeLeafAndCache(); it simply forwards to
+// probeConstLeafAndCache().
+template<typename ChildT>
+template<typename AccessorT>
+inline const typename ChildT::LeafNodeType*
+RootNode<ChildT>::probeLeafAndCache(const Coord& xyz, AccessorT& acc) const
+{
+    const LeafNodeType* found = this->probeConstLeafAndCache(xyz, acc);
+    return found;
+}
+
+
+// Same as probeNode(), but the child branch covering @a xyz is registered
+// with the accessor @a acc before it is returned or descended into.
+// Returns NULL if @a NodeT cannot occur below this root or if the root table
+// holds no child for @a xyz.
+template<typename ChildT>
+template<typename NodeT, typename AccessorT>
+inline NodeT*
+RootNode<ChildT>::probeNodeAndCache(const Coord& xyz, AccessorT& acc)
+{
+    // A node type at the child's level must be the child type itself,
+    // and nothing above the child's level can be found below the root.
+    const bool isChildType = boost::is_same<NodeT, ChildT>::value;
+    if (NodeT::LEVEL >  ChildT::LEVEL ||
+        (NodeT::LEVEL == ChildT::LEVEL && !isChildType)) return NULL;
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+    MapIter pos = this->findCoord(xyz);
+    if (pos == mTable.end() || isTile(pos)) return NULL;
+    ChildT* branch = &getChild(pos);
+    acc.insert(xyz, branch);
+    if (isChildType) {
+        return reinterpret_cast<NodeT*>(branch);
+    }
+    return branch->template probeNodeAndCache<NodeT>(xyz, acc);
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+}
+
+
+// Const counterpart of probeNodeAndCache(): the child branch covering @a xyz
+// is registered with the accessor @a acc before it is returned or descended
+// into.  Returns NULL if @a NodeT cannot occur below this root or if the root
+// table holds no child for @a xyz.
+template<typename ChildT>
+template<typename NodeT,typename AccessorT>
+inline const NodeT*
+RootNode<ChildT>::probeConstNodeAndCache(const Coord& xyz, AccessorT& acc) const
+{
+    // A node type at the child's level must be the child type itself,
+    // and nothing above the child's level can be found below the root.
+    const bool isChildType = boost::is_same<NodeT, ChildT>::value;
+    if (NodeT::LEVEL >  ChildT::LEVEL ||
+        (NodeT::LEVEL == ChildT::LEVEL && !isChildType)) return NULL;
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+    MapCIter pos = this->findCoord(xyz);
+    if (pos == mTable.end() || isTile(pos)) return NULL;
+    const ChildT* branch = &getChild(pos);
+    acc.insert(xyz, branch);
+    if (isChildType) {
+        return reinterpret_cast<const NodeT*>(branch);
+    }
+    return branch->template probeConstNodeAndCache<NodeT>(xyz, acc);
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+}
+
+
+////////////////////////////////////////
+
+// Append to @a array pointers to the nodes of this tree whose type matches
+// the array's element type.  The element type must be a pointer to a
+// (possibly const) node type listed in NodeChainType; this is checked at
+// compile time.  If the element type is a pointer to this root's child type,
+// the root's children are appended directly; otherwise each child is asked
+// to collect its own matching nodes.
+template<typename ChildT>
+template<typename ArrayT>
+inline void
+RootNode<ChildT>::getNodes(ArrayT& array)
+{
+    typedef typename ArrayT::value_type ElemPtrT;
+    BOOST_STATIC_ASSERT(boost::is_pointer<ElemPtrT>::value);
+    typedef typename boost::remove_pointer<ElemPtrT>::type ElemT;
+    typedef typename boost::remove_const<ElemT>::type MutableElemT;
+    typedef typename boost::mpl::contains<NodeChainType, MutableElemT>::type InChainT;
+    BOOST_STATIC_ASSERT(InChainT::value);
+    // The child type with the same constness as the array's element type.
+    typedef typename boost::mpl::if_<boost::is_const<ElemT>,
+                                     const ChildT, ChildT>::type MatchingChildT;
+
+    for (MapIter it = mTable.begin(); it != mTable.end(); ++it) {
+        ChildT* branch = it->second.child;
+        if (branch == NULL) continue; // tiles hold no nodes
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if (boost::is_same<ElemPtrT, MatchingChildT*>::value) {
+            array.push_back(reinterpret_cast<ElemPtrT>(it->second.child));
+        } else {
+            branch->getNodes(array); // descend
+        }
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+}
+
+// Const overload of getNodes(): append to @a array pointers to the const
+// nodes of this tree whose type matches the array's element type.  The
+// element type must be a pointer to a const node type listed in
+// NodeChainType; this is checked at compile time.
+template<typename ChildT>
+template<typename ArrayT>
+inline void
+RootNode<ChildT>::getNodes(ArrayT& array) const
+{
+    typedef typename ArrayT::value_type ElemPtrT;
+    BOOST_STATIC_ASSERT(boost::is_pointer<ElemPtrT>::value);
+    typedef typename boost::remove_pointer<ElemPtrT>::type ElemT;
+    BOOST_STATIC_ASSERT(boost::is_const<ElemT>::value);
+    typedef typename boost::remove_const<ElemT>::type MutableElemT;
+    typedef typename boost::mpl::contains<NodeChainType, MutableElemT>::type InChainT;
+    BOOST_STATIC_ASSERT(InChainT::value);
+
+    for (MapCIter it = mTable.begin(); it != mTable.end(); ++it) {
+        const ChildNodeType* branch = it->second.child;
+        if (branch == NULL) continue; // tiles hold no nodes
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if (boost::is_same<ElemPtrT, const ChildT*>::value) {
+            array.push_back(reinterpret_cast<ElemPtrT>(it->second.child));
+        } else {
+            branch->getNodes(array); // descend
+        }
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+}
+
+////////////////////////////////////////
+
+// Append to @a array pointers to the nodes of this tree whose type matches
+// the array's element type, taking them out of the tree with stealChild().
+// The element type must be a pointer to a (possibly const) node type listed
+// in NodeChainType; this is checked at compile time.  If the element type is
+// a pointer to this root's child type, each child is stolen and stealChild()
+// is passed a tile built from @a value and @a state; otherwise each child is
+// asked to give up its own matching nodes.
+template<typename ChildT>
+template<typename ArrayT>
+inline void
+RootNode<ChildT>::stealNodes(ArrayT& array, const ValueType& value, bool state)
+{
+    typedef typename ArrayT::value_type ElemPtrT;
+    BOOST_STATIC_ASSERT(boost::is_pointer<ElemPtrT>::value);
+    typedef typename boost::remove_pointer<ElemPtrT>::type ElemT;
+    typedef typename boost::remove_const<ElemT>::type MutableElemT;
+    typedef typename boost::mpl::contains<NodeChainType, MutableElemT>::type InChainT;
+    BOOST_STATIC_ASSERT(InChainT::value);
+    // The child type with the same constness as the array's element type.
+    typedef typename boost::mpl::if_<boost::is_const<ElemT>,
+                                     const ChildT, ChildT>::type MatchingChildT;
+
+    for (MapIter it = mTable.begin(); it != mTable.end(); ++it) {
+        ChildT* branch = it->second.child;
+        if (branch == NULL) continue; // tiles hold no nodes
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if (boost::is_same<ElemPtrT, MatchingChildT*>::value) {
+            array.push_back(reinterpret_cast<ElemPtrT>(&stealChild(it, Tile(value, state))));
+        } else {
+            branch->stealNodes(array, value, state); // descend
+        }
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+}
+
+
+////////////////////////////////////////
+
+
+// Expand every root-level active tile into a child branch with the tile's
+// value and all values active, then call voxelizeActiveTiles() on every child
+// branch.  Inactive tiles are skipped.  @a threaded is only passed on to the
+// children; the root table itself is always traversed serially.
+template<typename ChildT>
+inline void
+RootNode<ChildT>::voxelizeActiveTiles(bool threaded)
+{
+    // There is little point in multithreading over the root table, since
+    // each tile spans a huge index space (by default 4096^3) and hence we
+    // expect few of them, if any at all.  In fact, you're very likely to run
+    // out of memory if this method is called on a tree with root-level
+    // active tiles!
+    const MapIter last = mTable.end();
+    for (MapIter it = mTable.begin(); it != last; ++it) {
+        if (!this->isTileOff(it)) {
+            ChildT* branch = it->second.child;
+            if (!branch) {
+                // Active tile: replace it with a fully active branch of the same value.
+                branch = new ChildT(it->first, this->getTile(it).value, true);
+                it->second.child = branch;
+            }
+            branch->voxelizeActiveTiles(threaded);
+        }
+    }
+}
+
+
+////////////////////////////////////////
+
+
+// Merge the contents of @a other into this root node according to the
+// compile-time @a Policy.  Child nodes are moved out of @a other rather than
+// copied, and @a other is always cleared before returning.
+//
+// - MERGE_ACTIVE_STATES (also used for any unrecognized policy): the other
+//   node's children are inserted where this node has no entry, replace this
+//   node's inactive tiles, or are merged into this node's existing children;
+//   the other node's active tiles are inserted where this node has no entry
+//   or replace anything that is not already an active tile.
+// - MERGE_NODES: only the other node's children are considered; they are
+//   inserted, replace any tile, or are merged into existing children.
+// - MERGE_ACTIVE_STATES_AND_NODES: children are handled as for MERGE_NODES,
+//   except that an active tile replaced by a stolen child is merged back
+//   into that child; the other node's active tiles are inserted, replace
+//   inactive tiles, or are merged into this node's children.
+//
+// Every stolen child has resetBackground() called on it with the other
+// node's background and this node's background before it is stored here.
+template<typename ChildT>
+template<MergePolicy Policy>
+inline void
+RootNode<ChildT>::merge(RootNode& other)
+{
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+
+    switch (Policy) {
+
+    default:
+    case MERGE_ACTIVE_STATES:
+        // i walks the other node's table; j is the matching entry (if any) in this table.
+        for (MapIter i = other.mTable.begin(), e = other.mTable.end(); i != e; ++i) {
+            MapIter j = mTable.find(i->first);
+            if (other.isChild(i)) {
+                if (j == mTable.end()) { // insert other node's child
+                    ChildNodeType& child = stealChild(i, Tile(other.mBackground, /*on=*/false));
+                    child.resetBackground(other.mBackground, mBackground);
+                    mTable[i->first] = NodeStruct(child);
+                } else if (isTile(j)) {
+                    if (isTileOff(j)) { // replace inactive tile with other node's child
+                        ChildNodeType& child = stealChild(i, Tile(other.mBackground, /*on=*/false));
+                        child.resetBackground(other.mBackground, mBackground);
+                        setChild(j, child);
+                    }
+                } else { // merge both child nodes
+                    getChild(j).template merge<MERGE_ACTIVE_STATES>(getChild(i),
+                        other.mBackground, mBackground);
+                }
+            } else if (other.isTileOn(i)) {
+                if (j == mTable.end()) { // insert other node's active tile
+                    mTable[i->first] = i->second;
+                } else if (!isTileOn(j)) {
+                    // Replace anything except an active tile with the other node's active tile.
+                    setTile(j, Tile(other.getTile(i).value, true));
+                }
+            }
+        }
+        break;
+
+    case MERGE_NODES:
+        // The other node's tiles are ignored entirely under this policy.
+        for (MapIter i = other.mTable.begin(), e = other.mTable.end(); i != e; ++i) {
+            MapIter j = mTable.find(i->first);
+            if (other.isChild(i)) {
+                if (j == mTable.end()) { // insert other node's child
+                    ChildNodeType& child = stealChild(i, Tile(other.mBackground, /*on=*/false));
+                    child.resetBackground(other.mBackground, mBackground);
+                    mTable[i->first] = NodeStruct(child);
+                } else if (isTile(j)) { // replace tile with other node's child
+                    ChildNodeType& child = stealChild(i, Tile(other.mBackground, /*on=*/false));
+                    child.resetBackground(other.mBackground, mBackground);
+                    setChild(j, child);
+                } else { // merge both child nodes
+                    getChild(j).template merge<MERGE_NODES>(
+                        getChild(i), other.mBackground, mBackground);
+                }
+            }
+        }
+        break;
+
+    case MERGE_ACTIVE_STATES_AND_NODES:
+        for (MapIter i = other.mTable.begin(), e = other.mTable.end(); i != e; ++i) {
+            MapIter j = mTable.find(i->first);
+            if (other.isChild(i)) {
+                if (j == mTable.end()) {
+                    // Steal and insert the other node's child.
+                    ChildNodeType& child = stealChild(i, Tile(other.mBackground, /*on=*/false));
+                    child.resetBackground(other.mBackground, mBackground);
+                    mTable[i->first] = NodeStruct(child);
+                } else if (isTile(j)) {
+                    // Replace this node's tile with the other node's child.
+                    ChildNodeType& child = stealChild(i, Tile(other.mBackground, /*on=*/false));
+                    child.resetBackground(other.mBackground, mBackground);
+                    // Copy the tile by value before setChild() overwrites the entry.
+                    const Tile tile = getTile(j);
+                    setChild(j, child);
+                    if (tile.active) {
+                        // Merge the other node's child with this node's active tile.
+                        child.template merge<MERGE_ACTIVE_STATES_AND_NODES>(
+                            tile.value, tile.active);
+                    }
+                } else /*if (isChild(j))*/ {
+                    // Merge the other node's child into this node's child.
+                    getChild(j).template merge<MERGE_ACTIVE_STATES_AND_NODES>(getChild(i),
+                        other.mBackground, mBackground);
+                }
+            } else if (other.isTileOn(i)) {
+                if (j == mTable.end()) {
+                    // Insert a copy of the other node's active tile.
+                    mTable[i->first] = i->second;
+                } else if (isTileOff(j)) {
+                    // Replace this node's inactive tile with a copy of the other's active tile.
+                    setTile(j, Tile(other.getTile(i).value, true));
+                } else if (isChild(j)) {
+                    // Merge the other node's active tile into this node's child.
+                    const Tile& tile = getTile(i);
+                    getChild(j).template merge<MERGE_ACTIVE_STATES_AND_NODES>(
+                        tile.value, tile.active);
+                }
+            } // else if (other.isTileOff(i)) {} // ignore the other node's inactive tiles
+        }
+        break;
+    }
+
+    // Empty the other tree so as not to leave it in a partially cannibalized state.
+    other.clear();
+
+    OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+}
+
+
+////////////////////////////////////////
+
+
+// Make the active topology of this root node the union of its own and that
+// of @a other, which may have a different value type but must pass
+// enforceSameConfiguration().  Values already stored in this node are kept;
+// where a new child branch or tile has to be created, it takes its value
+// from this node's background or from the tile it replaces.  The other
+// node's inactive tiles have no effect.
+template<typename ChildT>
+template<typename OtherChildType>
+inline void
+RootNode<ChildT>::topologyUnion(const RootNode<OtherChildType>& other)
+{
+    typedef RootNode<OtherChildType> OtherRootT;
+    typedef typename OtherRootT::MapCIter OtherCIterT;
+
+    enforceSameConfiguration(other);
+
+    const OtherCIterT srcEnd = other.mTable.end();
+    for (OtherCIterT src = other.mTable.begin(); src != srcEnd; ++src) {
+        // dst is this node's entry (if any) with the same key as src.
+        MapIter dst = mTable.find(src->first);
+        const bool missing = (dst == mTable.end());
+
+        if (other.isChild(src)) {
+            if (missing) {
+                // Create a child branch with identical topology.
+                ChildT* copy = new ChildT(other.getChild(src), mBackground, TopologyCopy());
+                mTable[src->first] = NodeStruct(*copy);
+            } else if (this->isChild(dst)) {
+                // Union with the existing child branch.
+                this->getChild(dst).topologyUnion(other.getChild(src));
+            } else {
+                // This is a tile, so replace it with a child branch with identical
+                // topology; an active tile additionally turns all of its values on.
+                ChildT* copy = new ChildT(
+                    other.getChild(src), this->getTile(dst).value, TopologyCopy());
+                if (this->isTileOn(dst)) copy->setValuesOn();
+                this->setChild(dst, *copy);
+            }
+        } else if (other.isTileOn(src)) {
+            // The other entry is an active tile.
+            if (missing) {
+                mTable[src->first] = NodeStruct(Tile(mBackground, true));
+            } else if (this->isChild(dst)) {
+                this->getChild(dst).setValuesOn();
+            } else if (this->isTileOff(dst)) {
+                this->setTile(dst, Tile(this->getTile(dst).value, true));
+            }
+        }
+    }
+}
+
+// Intersect the active topology of this root node with that of @a other,
+// which may have a different value type but must pass
+// enforceSameConfiguration().  Child branches of this node with no active
+// counterpart in @a other (no entry, or an inactive tile) are deleted; active
+// tiles with no active counterpart are turned inactive; where both sides have
+// a child the children are intersected; and an active tile facing a child of
+// @a other is replaced by a child with that child's topology and the tile's value.
+template<typename ChildT>
+template<typename OtherChildType>
+inline void
+RootNode<ChildT>::topologyIntersection(const RootNode<OtherChildType>& other)
+{
+    typedef RootNode<OtherChildType> OtherRootT;
+    typedef typename OtherRootT::MapCIter OtherCIterT;
+
+    enforceSameConfiguration(other);
+
+    // Entries to delete are only recorded during the traversal and erased
+    // afterwards, so that mTable is not modified while it is being iterated.
+    std::set<Coord> tmp;//keys to erase
+    for (MapIter i = mTable.begin(), e = mTable.end(); i != e; ++i) {
+        OtherCIterT j = other.mTable.find(i->first);
+        if (this->isChild(i)) {
+            if (j == other.mTable.end() || other.isTileOff(j)) {
+                tmp.insert(i->first);//delete child branch
+            } else if (other.isChild(j)) { // intersect with child branch
+                this->getChild(i).topologyIntersection(other.getChild(j), mBackground);
+            }
+            // (A child facing an active tile of the other node is left unchanged.)
+        } else if (this->isTileOn(i)) {
+            if (j == other.mTable.end() || other.isTileOff(j)) {
+                this->setTile(i, Tile(this->getTile(i).value, false));//turn inactive
+            } else if (other.isChild(j)) { //replace with a child branch with identical topology
+                ChildT* child =
+                    new ChildT(other.getChild(j), this->getTile(i).value, TopologyCopy());
+                this->setChild(i, *child);
+            }
+        }
+    }
+    for (std::set<Coord>::iterator i = tmp.begin(), e = tmp.end(); i != e; ++i) {
+        MapIter it = this->findCoord(*i);
+        setTile(it, Tile()); // delete any existing child node first
+        mTable.erase(it);
+    }
+}
+
+// Remove from the active topology of this root node everything that is
+// active in @a other, which may have a different value type but must pass
+// enforceSameConfiguration().  Entries of this node that are missing or are
+// inactive tiles are never touched, and the other node's inactive tiles have
+// no effect.
+template<typename ChildT>
+template<typename OtherChildType>
+inline void
+RootNode<ChildT>::topologyDifference(const RootNode<OtherChildType>& other)
+{
+    typedef RootNode<OtherChildType> OtherRootT;
+    typedef typename OtherRootT::MapCIter OtherCIterT;
+
+    enforceSameConfiguration(other);
+
+    const OtherCIterT srcEnd = other.mTable.end();
+    for (OtherCIterT src = other.mTable.begin(); src != srcEnd; ++src) {
+        // dst is this node's entry (if any) with the same key as src.
+        MapIter dst = mTable.find(src->first);
+
+        const bool otherHasChild = other.isChild(src);
+        if (!otherHasChild && !other.isTileOn(src)) continue; // inactive tile: no effect
+        if (dst == mTable.end() || this->isTileOff(dst)) continue; // nothing to subtract from
+
+        if (otherHasChild) {
+            if (this->isChild(dst)) {
+                // Difference with the existing child branch.
+                this->getChild(dst).topologyDifference(other.getChild(src), mBackground);
+            } else if (this->isTileOn(dst)) {
+                // This is an active tile, so create a child node and descend.
+                ChildT* branch = new ChildT(dst->first, this->getTile(dst).value, true);
+                branch->topologyDifference(other.getChild(src), mBackground);
+                this->setChild(dst, *branch);
+            }
+        } else {
+            // The other entry is an active tile covering this whole entry.
+            if (this->isChild(dst)) {
+                setTile(dst, Tile()); // delete any existing child node first
+                mTable.erase(dst);
+            } else if (this->isTileOn(dst)) {
+                this->setTile(dst, Tile(this->getTile(dst).value, false));
+            }
+        }
+    }
+}
+
+////////////////////////////////////////
+
+
+// Combine this root node with @a other entry by entry using the functor
+// @a op, storing the results in this node.  For every key present in either
+// table, this node's entry supplies the A value and the other node's entry
+// the B value.  If @a prune is true, each resulting child of this node is
+// pruned.  The two background values are combined the same way, and @a other
+// is cleared before returning.
+template<typename ChildT>
+template<typename CombineOp>
+inline void
+RootNode<ChildT>::combine(RootNode& other, CombineOp& op, bool prune)
+{
+    CombineArgs<ValueType> args;
+
+    // Collect the keys of both tables so that every entry of either node is visited.
+    CoordSet keys;
+    this->insertKeys(keys);
+    other.insertKeys(keys);
+
+    for (CoordSetCIter i = keys.begin(), e = keys.end(); i != e; ++i) {
+        // findOrAddCoord() is used on both sides, so both iterators refer to real entries.
+        MapIter iter = findOrAddCoord(*i), otherIter = other.findOrAddCoord(*i);
+        if (isTile(iter) && isTile(otherIter)) {
+            // Both this node and the other node have constant values (tiles).
+            // Combine the two values and store the result as this node's new tile value.
+            op(args.setARef(getTile(iter).value)
+                .setAIsActive(isTileOn(iter))
+                .setBRef(getTile(otherIter).value)
+                .setBIsActive(isTileOn(otherIter)));
+            setTile(iter, Tile(args.result(), args.resultIsActive()));
+
+        } else if (isChild(iter) && isTile(otherIter)) {
+            // Combine this node's child with the other node's constant value.
+            ChildT& child = getChild(iter);
+            child.combine(getTile(otherIter).value, isTileOn(otherIter), op);
+
+        } else if (isTile(iter) && isChild(otherIter)) {
+            // Combine this node's constant value with the other node's child,
+            // but use a new functor in which the A and B values are swapped,
+            // since the constant value is the A value, not the B value.
+            SwappedCombineOp<ValueType, CombineOp> swappedOp(op);
+            ChildT& child = getChild(otherIter);
+            child.combine(getTile(iter).value, isTileOn(iter), swappedOp);
+
+            // Steal the other node's child.
+            setChild(iter, stealChild(otherIter, Tile()));
+
+        } else /*if (isChild(iter) && isChild(otherIter))*/ {
+            // Combine this node's child with the other node's child.
+            ChildT &child = getChild(iter), &otherChild = getChild(otherIter);
+            child.combine(otherChild, op);
+        }
+        if (prune && isChild(iter)) getChild(iter).prune();
+    }
+
+    // Combine background values.
+    // (Only the result value is used; the result's active state is ignored.)
+    op(args.setARef(mBackground).setBRef(other.mBackground));
+    mBackground = args.result();
+
+    // Empty the other tree so as not to leave it in a partially cannibalized state.
+    other.clear();
+}
+
+
+////////////////////////////////////////
+
+
+// This helper class is a friend of RootNode.  RootNode::combine2 dispatches
+// through it so that the operation can be specialized for compatible and
+// incompatible pairs of RootNode types.  The primary template below handles
+// the incompatible case (Compatible == false) and always ends by throwing.
+template<typename CombineOp, typename RootT, typename OtherRootT, bool Compatible = false>
+struct RootNodeCombineHelper
+{
+    static inline void combine2(RootT& self, const RootT&, const OtherRootT& other1,
+        CombineOp&, bool)
+    {
+        // The two root nodes have different configurations or incompatible
+        // ValueTypes, so one of these two checks is expected to throw.
+        self.enforceSameConfiguration(other1);
+        self.enforceCompatibleValueTypes(other1);
+
+        // We should never get here, but if neither check threw,
+        // report the mismatch explicitly.
+        std::ostringstream msg;
+        msg << "cannot combine a " << typeid(OtherRootT).name();
+        msg << " into a " << typeid(RootT).name();
+        OPENVDB_THROW(TypeError, msg.str());
+    }
+};
+
+// Specialization for root nodes of compatible types (Compatible == true):
+// the work is delegated to the destination node's doCombine2().
+template<typename CombineOp, typename RootT, typename OtherRootT>
+struct RootNodeCombineHelper<CombineOp, RootT, OtherRootT, /*Compatible=*/true>
+{
+    static inline void combine2(
+        RootT& self,
+        const RootT& other0,
+        const OtherRootT& other1,
+        CombineOp& op,
+        bool prune)
+    {
+        self.doCombine2(other0, other1, op, prune);
+    }
+};
+
+
+// Combine @a other0 and @a other1 with the functor @a op and store the
+// result in this root node.  Whether the two node types are compatible
+// (same configuration, and the other node's ValueType convertible to this
+// node's ValueType) is decided at compile time, and the call is dispatched
+// to the matching RootNodeCombineHelper.
+template<typename ChildT>
+template<typename CombineOp, typename OtherRootNode>
+inline void
+RootNode<ChildT>::combine2(const RootNode& other0, const OtherRootNode& other1,
+    CombineOp& op, bool prune)
+{
+    typedef typename OtherRootNode::ValueType SrcValueT;
+    static const bool sameLayout = SameConfiguration<OtherRootNode>::value;
+    static const bool convertible =
+        CanConvertType</*from=*/SrcValueT, /*to=*/ValueType>::value;
+    typedef RootNodeCombineHelper<
+        CombineOp, RootNode, OtherRootNode, (sameLayout && convertible)> HelperT;
+    HelperT::combine2(*this, other0, other1, op, prune);
+}
+
+
+template<typename ChildT>
+template<typename CombineOp, typename OtherRootNode>
+inline void
+RootNode<ChildT>::doCombine2(const RootNode& other0, const OtherRootNode& other1,
+    CombineOp& op, bool prune)
+{
+    enforceSameConfiguration(other1);
+    // Combine other0 with other1 (both taken by const reference) via op, writing into this node.
+    typedef typename OtherRootNode::ValueType  OtherValueT;
+    typedef typename OtherRootNode::Tile       OtherTileT;
+    typedef typename OtherRootNode::NodeStruct OtherNodeStructT;
+    typedef typename OtherRootNode::MapCIter   OtherMapCIterT;
+
+    CombineArgs<ValueType, OtherValueT> args;
+    // keys collects the coordinates contributed by both inputs (via insertKeys).
+    CoordSet keys;
+    other0.insertKeys(keys);
+    other1.insertKeys(keys);
+    // Inactive background tiles, used below when a key is missing from an input's table.
+    const NodeStruct bg0(Tile(other0.mBackground, /*active=*/false));
+    const OtherNodeStructT bg1(OtherTileT(other1.mBackground, /*active=*/false));
+
+    for (CoordSetCIter i = keys.begin(), e = keys.end(); i != e; ++i) {
+        MapIter thisIter = this->findOrAddCoord(*i);
+        MapCIter iter0 = other0.findKey(*i);
+        OtherMapCIterT iter1 = other1.findKey(*i);
+        const NodeStruct& ns0 = (iter0 != other0.mTable.end()) ? iter0->second : bg0;
+        const OtherNodeStructT& ns1 = (iter1 != other1.mTable.end()) ? iter1->second : bg1;
+        if (ns0.isTile() && ns1.isTile()) {
+            // Both input nodes have constant values (tiles).
+            // Combine the two values and add a new tile to this node with the result.
+            op(args.setARef(ns0.tile.value)
+                .setAIsActive(ns0.isTileOn())
+                .setBRef(ns1.tile.value)
+                .setBIsActive(ns1.isTileOn()));
+            setTile(thisIter, Tile(args.result(), args.resultIsActive()));
+        } else {
+            if (!isChild(thisIter)) {
+                // New child: constructed from the input child's origin and this node's tile value.
+                const Coord& childOrigin =
+                    ns0.isChild() ? ns0.child->origin() : ns1.child->origin();
+                setChild(thisIter, *(new ChildT(childOrigin, getTile(thisIter).value)));
+            }
+            ChildT& child = getChild(thisIter);
+
+            if (ns0.isTile()) {
+                // Combine node1's child with node0's constant value
+                // and write the result into this node's child.
+                child.combine2(ns0.tile.value, *ns1.child, ns0.isTileOn(), op);
+            } else if (ns1.isTile()) {
+                // Combine node0's child with node1's constant value
+                // and write the result into this node's child.
+                child.combine2(*ns0.child, ns1.tile.value, ns1.isTileOn(), op);
+            } else {
+                // Combine node0's child with node1's child
+                // and write the result into this node's child.
+                child.combine2(*ns0.child, *ns1.child, op);
+            }
+        }
+        if (prune && isChild(thisIter)) getChild(thisIter).prune();
+    }
+
+    // Combine background values. Note: args is reused, so its A/B active flags are not reset here.
+    op(args.setARef(other0.mBackground).setBRef(other1.mBackground));
+    mBackground = args.result();
+}
+
+
+////////////////////////////////////////
+
+
+template<typename ChildT>
+template<typename BBoxOp>
+inline void
+RootNode<ChildT>::visitActiveBBox(BBoxOp& op) const
+{
+    const bool mayDescend = op.template descent<LEVEL>();
+    for (MapCIter entry = mTable.begin(), last = mTable.end(); entry != last; ++entry) {
+        if (this->isTileOff(entry)) continue; // entries reported as "tile off" are skipped
+        if (!(this->isChild(entry) && mayDescend)) {
+            // Not descending here: hand op a cube of size ChildT::DIM created at the entry's key.
+#ifdef _MSC_VER
+            op.operator()<LEVEL>(CoordBBox::createCube(entry->first, ChildT::DIM));
+#else
+            op.template operator()<LEVEL>(CoordBBox::createCube(entry->first, ChildT::DIM));
+#endif
+        } else {
+            this->getChild(entry).visitActiveBBox(op);
+        }
+    }
+}
+
+
+// Non-const traversal entry point: forwards to doVisit() using the mutable ChildAllIter.
+template<typename ChildT>
+template<typename VisitorOp>
+inline void RootNode<ChildT>::visit(VisitorOp& op)
+{
+    doVisit<RootNode<ChildT>, VisitorOp, ChildAllIter>(*this, op);
+}
+
+
+// Const traversal entry point: forwards to doVisit() with a const node type and ChildAllCIter.
+template<typename ChildT>
+template<typename VisitorOp>
+inline void RootNode<ChildT>::visit(VisitorOp& op) const
+{
+    doVisit<const RootNode<ChildT>, VisitorOp, ChildAllCIter>(*this, op);
+}
+
+
+// Shared body of both visit() overloads: call op on each ChildAll entry of self, then recurse.
+template<typename ChildT>
+template<typename RootNodeT, typename VisitorOp, typename ChildAllIterT>
+inline void RootNode<ChildT>::doVisit(RootNodeT& self, VisitorOp& op)
+{
+    typedef typename ChildAllIterT::ChildNodeType ChildNodeT;
+    typename RootNodeT::ValueType tileValue; // passed to probeChild(); never read here
+    for (ChildAllIterT it = self.beginChildAll(); it; ++it) {
+        if (op(it)) continue; // a true result means: do not descend below this entry
+        ChildNodeT* child = it.probeChild(tileValue);
+        if (child) child->visit(op);
+    }
+}
+
+
+////////////////////////////////////////
+
+
+// Two-node traversal (non-const): forwards to doVisit2() with both nodes' ChildAllIter types.
+template<typename ChildT>
+template<typename OtherRootNodeType, typename VisitorOp>
+inline void RootNode<ChildT>::visit2(OtherRootNodeType& other, VisitorOp& op)
+{
+    typedef typename OtherRootNodeType::ChildAllIter OtherIt;
+    doVisit2<RootNode, OtherRootNodeType, VisitorOp, ChildAllIter, OtherIt>(*this, other, op);
+}
+
+
+// Two-node traversal (const): forwards to doVisit2() with both nodes' ChildAllCIter types.
+template<typename ChildT>
+template<typename OtherRootNodeType, typename VisitorOp>
+inline void RootNode<ChildT>::visit2(OtherRootNodeType& other, VisitorOp& op) const
+{
+    typedef typename OtherRootNodeType::ChildAllCIter OtherCIt;
+    doVisit2<const RootNode, OtherRootNodeType, VisitorOp, ChildAllCIter, OtherCIt>(*this, other, op);
+}
+
+
+template<typename ChildT>
+template<
+    typename RootNodeT,
+    typename OtherRootNodeT,
+    typename VisitorOp,
+    typename ChildAllIterT,
+    typename OtherChildAllIterT>
+inline void
+RootNode<ChildT>::doVisit2(RootNodeT& self, OtherRootNodeT& other, VisitorOp& op)
+{
+    enforceSameConfiguration(other);
+    // Traverse self and other in lockstep, calling op(iter, otherIter) on corresponding entries.
+    typename RootNodeT::ValueType val;
+    typename OtherRootNodeT::ValueType otherVal;
+
+    // The two nodes are required to have corresponding table entries,
+    // but since that might require background tiles to be added to one or both,
+    // and the nodes might be const, we operate on shallow copies of the nodes instead.
+    RootNodeT copyOfSelf(self.mBackground);
+    copyOfSelf.mTable = self.mTable;
+    OtherRootNodeT copyOfOther(other.mBackground);
+    copyOfOther.mTable = other.mTable;
+
+    // Add background tiles to both nodes as needed.
+    CoordSet keys;
+    self.insertKeys(keys);
+    other.insertKeys(keys);
+    for (CoordSetCIter i = keys.begin(), e = keys.end(); i != e; ++i) {
+        copyOfSelf.findOrAddCoord(*i);
+        copyOfOther.findOrAddCoord(*i);
+    }
+
+    ChildAllIterT iter = copyOfSelf.beginChildAll();
+    OtherChildAllIterT otherIter = copyOfOther.beginChildAll();
+    // Advance both iterators together; stop as soon as either one is exhausted.
+    for ( ; iter && otherIter; ++iter, ++otherIter)
+    {
+        const size_t skipBranch = static_cast<size_t>(op(iter, otherIter));
+        // Bit 0 of skipBranch suppresses descent into self's child, bit 1 into other's child.
+        typename ChildAllIterT::ChildNodeType* child =
+            (skipBranch & 1U) ? NULL : iter.probeChild(val);
+        typename OtherChildAllIterT::ChildNodeType* otherChild =
+            (skipBranch & 2U) ? NULL : otherIter.probeChild(otherVal);
+        // Both children: recurse jointly. One child: pair it with the opposite iterator.
+        if (child != NULL && otherChild != NULL) {
+            child->visit2Node(*otherChild, op);
+        } else if (child != NULL) {
+            child->visit2(otherIter, op);
+        } else if (otherChild != NULL) {
+            otherChild->visit2(iter, op, /*otherIsLHS=*/true);
+        }
+    }
+    // Remove any background tiles that were added above,
+    // as well as any that were created by the visitors.
+    copyOfSelf.eraseBackgroundTiles();
+    copyOfOther.eraseBackgroundTiles();
+
+    // If either input node is non-const, replace its table with
+    // the (possibly modified) copy.
+    self.resetTable(copyOfSelf.mTable);
+    other.resetTable(copyOfOther.mTable);
+}
+
+} // namespace tree
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TREE_ROOTNODE_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tree/Tree.h b/nuparu/include/openvdb_new/tree/Tree.h
new file mode 100644
index 00000000..8becf8f1
--- /dev/null
+++ b/nuparu/include/openvdb_new/tree/Tree.h
@@ -0,0 +1,2337 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file tree/Tree.h
+
+#ifndef OPENVDB_TREE_TREE_HAS_BEEN_INCLUDED
+#define OPENVDB_TREE_TREE_HAS_BEEN_INCLUDED
+
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include <boost/shared_ptr.hpp>
+#include <boost/cstdint.hpp>
+#include <tbb/atomic.h>
+#include <tbb/concurrent_hash_map.h>
+#include <openvdb/Types.h>
+#include <openvdb/metadata/Metadata.h>
+#include <openvdb/math/Math.h>
+#include <openvdb/math/BBox.h>
+#include <openvdb/util/Formats.h>
+#include <openvdb/util/logging.h>
+#include <openvdb/Platform.h>
+#include "RootNode.h"
+#include "InternalNode.h"
+#include "LeafNode.h"
+#include "TreeIterator.h"
+#include "ValueAccessor.h"
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tree {
+
+/// @brief Abstract base class for typed trees (most of its interface is pure virtual)
+class OPENVDB_API TreeBase
+{
+public:
+    typedef boost::shared_ptr<TreeBase> Ptr;
+    typedef boost::shared_ptr<const TreeBase> ConstPtr;
+
+    TreeBase() {}
+    virtual ~TreeBase() {}
+
+    /// Return the name of this tree's type.
+    virtual const Name& type() const = 0;
+
+    /// Return the name of the type of a voxel's value (e.g., "float" or "vec3d").
+    virtual Name valueType() const = 0;
+
+    /// Return a pointer to a deep copy of this tree
+    virtual TreeBase::Ptr copy() const = 0;
+
+    //
+    // Tree methods
+    //
+    /// @brief Return this tree's background value wrapped as metadata.
+    /// @note Query the metadata object for the value's type. Base version: Metadata::Ptr().
+    virtual Metadata::Ptr getBackgroundValue() const { return Metadata::Ptr(); }
+
+    /// @brief Return in @a bbox the axis-aligned bounding box of all
+    /// leaf nodes and active tiles.
+    /// @details This is faster than calling evalActiveVoxelBoundingBox,
+    /// which visits the individual active voxels, and hence
+    /// evalLeafBoundingBox produces a less tight, i.e. approximate, bbox.
+    /// @return @c false if the bounding box is empty (in which case
+    /// the bbox is set to its default value).
+    virtual bool evalLeafBoundingBox(CoordBBox& bbox) const = 0;
+
+    /// @brief Return in @a dim the dimensions of the axis-aligned bounding box
+    /// of all leaf nodes.
+    /// @return @c false if the bounding box is empty.
+    virtual bool evalLeafDim(Coord& dim) const = 0;
+
+    /// @brief Return in @a bbox the axis-aligned bounding box of all
+    /// active voxels and tiles.
+    /// @details This method produces a more accurate, i.e. tighter,
+    /// bounding box than evalLeafBoundingBox which is approximate but
+    /// faster.
+    /// @return @c false if the bounding box is empty (in which case
+    /// the bbox is set to its default value).
+    virtual bool evalActiveVoxelBoundingBox(CoordBBox& bbox) const = 0;
+
+    /// @brief Return in @a dim the dimensions of the axis-aligned bounding box of all
+    /// active voxels.  This is a tighter bounding box than the leaf node bounding box.
+    /// @return @c false if the bounding box is empty.
+    virtual bool evalActiveVoxelDim(Coord& dim) const = 0;
+    /// NOTE(review): undocumented; presumably returns the tree's index-space extent in @a bbox.
+    virtual void getIndexRange(CoordBBox& bbox) const = 0;
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    /// @brief Replace with background tiles any nodes whose voxel buffers
+    /// have not yet been allocated.
+    /// @details Typically, unallocated nodes are leaf nodes whose voxel buffers
+    /// are not yet resident in memory because delayed loading is in effect.
+    /// @sa readNonresidentBuffers, io::File::open
+    virtual void clipUnallocatedNodes() = 0;
+#endif
+
+
+    //
+    // Statistics
+    //
+    /// @brief Return the depth of this tree.
+    ///
+    /// A tree with only a root node and leaf nodes has depth 2, for example.
+    virtual Index treeDepth() const = 0;
+    /// Return the number of leaf nodes.
+    virtual Index32 leafCount() const = 0;
+    /// Return the number of non-leaf nodes.
+    virtual Index32 nonLeafCount() const = 0;
+    /// Return the number of active voxels stored in leaf nodes.
+    virtual Index64 activeLeafVoxelCount() const = 0;
+    /// Return the number of inactive voxels stored in leaf nodes.
+    virtual Index64 inactiveLeafVoxelCount() const = 0;
+    /// Return the total number of active voxels.
+    virtual Index64 activeVoxelCount() const = 0;
+    /// Return the number of inactive voxels within the bounding box of all active voxels.
+    virtual Index64 inactiveVoxelCount() const = 0;
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    /// Return the total number of active tiles.
+    virtual Index64 activeTileCount() const = 0;
+#endif
+
+    /// Return the total amount of memory in bytes occupied by this tree (0 in this base class).
+    virtual Index64 memUsage() const { return 0; }
+
+
+    //
+    // I/O methods
+    //
+    /// @brief Read the tree topology from a stream.
+    ///
+    /// This will read the tree structure and tile values, but not voxel data.
+    virtual void readTopology(std::istream&, bool saveFloatAsHalf = false);
+    /// @brief Write the tree topology to a stream.
+    ///
+    /// This will write the tree structure and tile values, but not voxel data.
+    virtual void writeTopology(std::ostream&, bool saveFloatAsHalf = false) const;
+
+    /// Read all data buffers for this tree.
+    virtual void readBuffers(std::istream&, bool saveFloatAsHalf = false) = 0;
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    /// Read all of this tree's data buffers that intersect the given bounding box.
+    virtual void readBuffers(std::istream&, const CoordBBox&, bool saveFloatAsHalf = false) = 0;
+    /// @brief Read all of this tree's data buffers that are not yet resident in memory
+    /// (because delayed loading is in effect).
+    /// @details If this tree was read from a memory-mapped file, this operation
+    /// disconnects the tree from the file.
+    /// @sa clipUnallocatedNodes, io::File::open, io::MappedFile
+    virtual void readNonresidentBuffers() const = 0;
+#endif
+    /// Write out all the data buffers for this tree.
+    virtual void writeBuffers(std::ostream&, bool saveFloatAsHalf = false) const = 0;
+
+    /// @brief Print statistics, memory usage and other information about this tree.
+    /// @param os            a stream to which to write textual information
+    /// @param verboseLevel  1: print tree configuration only;
+    ///                      2: include node and voxel statistics;
+    ///                      3: include memory usage;
+    ///                      4: include minimum and maximum voxel values
+    /// @warning @a verboseLevel 4 forces loading of any unallocated nodes.
+    virtual void print(std::ostream& os = std::cout, int verboseLevel = 1) const;
+
+private:
+    // Disallow assignment; copy construction is left enabled (declaration below is commented out).
+    //TreeBase(const TreeBase& other);
+    TreeBase& operator=(const TreeBase& other);
+};
+
+
+////////////////////////////////////////
+
+
+template<typename _RootNodeType>
+class Tree: public TreeBase
+{
+public:
+    typedef boost::shared_ptr<Tree> Ptr;
+    typedef boost::shared_ptr<const Tree> ConstPtr;
+
+    typedef _RootNodeType                        RootNodeType;
+    typedef typename RootNodeType::ValueType     ValueType;
+    typedef typename RootNodeType::BuildType     BuildType;
+    typedef typename RootNodeType::LeafNodeType  LeafNodeType;
+
+    static const Index DEPTH = RootNodeType::LEVEL + 1;
+
+    /// @brief ValueConverter<T>::Type is the type of a tree having the same
+    /// hierarchy as this tree but a different value type, T.
+    ///
+    /// For example, FloatTree::ValueConverter<double>::Type is equivalent to DoubleTree.
+    /// @note If the source tree type is a template argument, it might be necessary
+    /// to write "typename SourceTree::template ValueConverter<T>::Type".
+    template<typename OtherValueType>
+    struct ValueConverter {
+        typedef Tree<typename RootNodeType::template ValueConverter<OtherValueType>::Type> Type;
+    };
+
+
+    Tree() {}
+
+    /// Deep copy constructor
+    Tree(const Tree& other): TreeBase(other), mRoot(other.mRoot)
+    {
+    }
+
+    /// @brief Value conversion deep copy constructor
+    ///
+    /// Deep copy a tree of the same configuration as this tree type but a different
+    /// ValueType, casting the other tree's values to this tree's ValueType.
+    /// @throw TypeError if the other tree's configuration doesn't match this tree's
+    /// or if this tree's ValueType is not constructible from the other tree's ValueType.
+    template<typename OtherRootType>
+    explicit Tree(const Tree<OtherRootType>& other): TreeBase(other), mRoot(other.root())
+    {
+    }
+
+    /// @brief Topology copy constructor from a tree of a different type
+    ///
+    /// Copy the structure, i.e., the active states of tiles and voxels, of another
+    /// tree of a possibly different type, but don't copy any tile or voxel values.
+    /// Instead, initialize tiles and voxels with the given active and inactive values.
+    /// @param other          a tree having (possibly) a different ValueType
+    /// @param inactiveValue  background value for this tree, and the value to which
+    ///                       all inactive tiles and voxels are initialized
+    /// @param activeValue    value to which active tiles and voxels are initialized
+    /// @throw TypeError if the other tree's configuration doesn't match this tree's.
+    template<typename OtherTreeType>
+    Tree(const OtherTreeType& other,
+        const ValueType& inactiveValue,
+        const ValueType& activeValue,
+        TopologyCopy):
+        TreeBase(other),
+        mRoot(other.root(), inactiveValue, activeValue, TopologyCopy())
+    {
+    }
+
+    /// @brief Topology copy constructor from a tree of a different type
+    ///
+    /// @note This topology copy constructor is generally faster than
+    /// the one that takes both a foreground and a background value.
+    ///
+    /// Copy the structure, i.e., the active states of tiles and voxels, of another
+    /// tree of a possibly different type, but don't copy any tile or voxel values.
+    /// Instead, initialize tiles and voxels with the given background value.
+    /// @param other        a tree having (possibly) a different ValueType
+    /// @param background   the value to which tiles and voxels are initialized
+    /// @throw TypeError if the other tree's configuration doesn't match this tree's.
+    template<typename OtherTreeType>
+    Tree(const OtherTreeType& other, const ValueType& background, TopologyCopy):
+        TreeBase(other),
+        mRoot(other.root(), background, TopologyCopy())
+    {
+    }
+
+    /// Empty tree constructor
+    Tree(const ValueType& background): mRoot(background) {}
+
+    virtual ~Tree() { this->clear(); releaseAllAccessors(); }
+
+    /// Return a pointer to a deep copy of this tree
+    virtual TreeBase::Ptr copy() const { return TreeBase::Ptr(new Tree(*this)); }
+
+    /// Return the name of the type of a voxel's value (e.g., "float" or "vec3d")
+    virtual Name valueType() const { return typeNameAsString<ValueType>(); }
+
+    /// Return the name of this type of tree.
+    static const Name& treeType();
+    /// Return the name of this type of tree.
+    virtual const Name& type() const { return this->treeType(); }
+
+    bool operator==(const Tree&) const { OPENVDB_THROW(NotImplementedError, ""); }
+    bool operator!=(const Tree&) const { OPENVDB_THROW(NotImplementedError, ""); }
+
+    //@{
+    /// Return this tree's root node.
+    RootNodeType& root() { return mRoot; }
+    const RootNodeType& root() const { return mRoot; }
+    //@}
+
+
+    //
+    // Tree methods
+    //
+    /// @brief Return @c true if the given tree has the same node and active value
+    /// topology as this tree, whether or not it has the same @c ValueType.
+    template<typename OtherRootNodeType>
+    bool hasSameTopology(const Tree<OtherRootNodeType>& other) const;
+
+    virtual bool evalLeafBoundingBox(CoordBBox& bbox) const;
+    virtual bool evalActiveVoxelBoundingBox(CoordBBox& bbox) const;
+    virtual bool evalActiveVoxelDim(Coord& dim) const;
+    virtual bool evalLeafDim(Coord& dim) const;
+
+    /// @brief Traverse the type hierarchy of nodes, and return, in @a dims, a list
+    /// of the Log2Dims of nodes in order from RootNode to LeafNode.
+    /// @note Because RootNodes are resizable, the RootNode Log2Dim is 0 for all trees.
+    static void getNodeLog2Dims(std::vector<Index>& dims);
+
+
+    //
+    // I/O methods
+    //
+    /// @brief Read the tree topology from a stream.
+    ///
+    /// This will read the tree structure and tile values, but not voxel data.
+    virtual void readTopology(std::istream&, bool saveFloatAsHalf = false);
+    /// @brief Write the tree topology to a stream.
+    ///
+    /// This will write the tree structure and tile values, but not voxel data.
+    virtual void writeTopology(std::ostream&, bool saveFloatAsHalf = false) const;
+    /// Read all data buffers for this tree.
+    virtual void readBuffers(std::istream&, bool saveFloatAsHalf = false);
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    /// Read all of this tree's data buffers that intersect the given bounding box.
+    virtual void readBuffers(std::istream&, const CoordBBox&, bool saveFloatAsHalf = false);
+    /// @brief Read all of this tree's data buffers that are not yet resident in memory
+    /// (because delayed loading is in effect).
+    /// @details If this tree was read from a memory-mapped file, this operation
+    /// disconnects the tree from the file.
+    /// @sa clipUnallocatedNodes, io::File::open, io::MappedFile
+    virtual void readNonresidentBuffers() const;
+#endif
+    /// Write out all data buffers for this tree.
+    virtual void writeBuffers(std::ostream&, bool saveFloatAsHalf = false) const;
+
+    virtual void print(std::ostream& os = std::cout, int verboseLevel = 1) const;
+
+
+    //
+    // Statistics
+    //
+    /// @brief Return the depth of this tree.
+    ///
+    /// A tree with only a root node and leaf nodes has depth 2, for example.
+    virtual Index treeDepth() const { return DEPTH; }
+    /// Return the number of leaf nodes.
+    virtual Index32 leafCount() const { return mRoot.leafCount(); }
+    /// Return the number of non-leaf nodes.
+    virtual Index32 nonLeafCount() const { return mRoot.nonLeafCount(); }
+    /// Return the number of active voxels stored in leaf nodes.
+    virtual Index64 activeLeafVoxelCount() const { return mRoot.onLeafVoxelCount(); }
+    /// Return the number of inactive voxels stored in leaf nodes.
+    virtual Index64 inactiveLeafVoxelCount() const { return mRoot.offLeafVoxelCount(); }
+    /// Return the total number of active voxels.
+    virtual Index64 activeVoxelCount() const { return mRoot.onVoxelCount(); }
+    /// Return the number of inactive voxels within the bounding box of all active voxels.
+    virtual Index64 inactiveVoxelCount() const;
+    /// Return the total number of active tiles.
+    Index64 activeTileCount() const { return mRoot.onTileCount(); }
+
+    /// Return the minimum and maximum active values in this tree.
+    void evalMinMax(ValueType &min, ValueType &max) const;
+
+    virtual Index64 memUsage() const { return sizeof(*this) + mRoot.memUsage(); }
+
+
+    //
+    // Voxel access methods (using signed indexing)
+    //
+    /// Return the value of the voxel at the given coordinates.
+    const ValueType& getValue(const Coord& xyz) const;
+    /// @brief Return the value of the voxel at the given coordinates
+    /// and update the given accessor's node cache.
+    template<typename AccessT> const ValueType& getValue(const Coord& xyz, AccessT&) const;
+
+    /// @brief Return the tree depth (0 = root) at which the value of voxel (x, y, z) resides.
+    /// @details If (x, y, z) isn't explicitly represented in the tree (i.e., it is
+    /// implicitly a background voxel), return -1.
+    int getValueDepth(const Coord& xyz) const;
+
+    /// Set the active state of the voxel at the given coordinates but don't change its value.
+    void setActiveState(const Coord& xyz, bool on);
+    /// Set the value of the voxel at the given coordinates but don't change its active state.
+    void setValueOnly(const Coord& xyz, const ValueType& value);
+    /// Mark the voxel at the given coordinates as active but don't change its value.
+    void setValueOn(const Coord& xyz);
+    /// Set the value of the voxel at the given coordinates and mark the voxel as active.
+    void setValueOn(const Coord& xyz, const ValueType& value);
+    /// Set the value of the voxel at the given coordinates and mark the voxel as active.
+    void setValue(const Coord& xyz, const ValueType& value);
+    /// @brief Set the value of the voxel at the given coordinates, mark the voxel as active,
+    /// and update the given accessor's node cache.
+    template<typename AccessT> void setValue(const Coord& xyz, const ValueType& value, AccessT&);
+    /// Mark the voxel at the given coordinates as inactive but don't change its value.
+    void setValueOff(const Coord& xyz);
+    /// Set the value of the voxel at the given coordinates and mark the voxel as inactive.
+    void setValueOff(const Coord& xyz, const ValueType& value);
+
+    /// @brief Apply a functor to the value of the voxel at the given coordinates
+    /// and mark the voxel as active.
+    /// @details Provided that the functor can be inlined, this is typically
+    /// significantly faster than calling getValue() followed by setValueOn().
+    /// @param xyz  the coordinates of a voxel whose value is to be modified
+    /// @param op   a functor of the form <tt>void op(ValueType&) const</tt> that modifies
+    ///             its argument in place
+    /// @par Example:
+    /// @code
+    /// Coord xyz(1, 0, -2);
+    /// // Multiply the value of a voxel by a constant and mark the voxel as active.
+    /// floatTree.modifyValue(xyz, [](float& f) { f *= 0.25; }); // C++11
+    /// // Set the value of a voxel to the maximum of its current value and 0.25,
+    /// // and mark the voxel as active.
+    /// floatTree.modifyValue(xyz, [](float& f) { f = std::max(f, 0.25f); }); // C++11
+    /// @endcode
+    /// @note The functor is not guaranteed to be called only once.
+    /// @see tools::foreach()
+    template<typename ModifyOp>
+    void modifyValue(const Coord& xyz, const ModifyOp& op);
+
+    /// @brief Apply a functor to the voxel at the given coordinates.
+    /// @details Provided that the functor can be inlined, this is typically
+    /// significantly faster than calling getValue() followed by setValue().
+    /// @param xyz  the coordinates of a voxel to be modified
+    /// @param op   a functor of the form <tt>void op(ValueType&, bool&) const</tt> that
+    ///             modifies its arguments, a voxel's value and active state, in place
+    /// @par Example:
+    /// @code
+    /// Coord xyz(1, 0, -2);
+    /// // Multiply the value of a voxel by a constant and mark the voxel as inactive.
+    /// floatTree.modifyValueAndActiveState(xyz,
+    ///     [](float& f, bool& b) { f *= 0.25; b = false; }); // C++11
+    /// // Set the value of a voxel to the maximum of its current value and 0.25,
+    /// // but don't change the voxel's active state.
+    /// floatTree.modifyValueAndActiveState(xyz,
+    ///     [](float& f, bool&) { f = std::max(f, 0.25f); }); // C++11
+    /// @endcode
+    /// @note The functor is not guaranteed to be called only once.
+    /// @see tools::foreach()
+    template<typename ModifyOp>
+    void modifyValueAndActiveState(const Coord& xyz, const ModifyOp& op);
+
+    /// @brief Get the value of the voxel at the given coordinates.
+    /// @return @c true if the value is active.
+    bool probeValue(const Coord& xyz, ValueType& value) const;
+
+    /// Return @c true if the value at the given coordinates is active.
+    bool isValueOn(const Coord& xyz) const { return mRoot.isValueOn(xyz); }
+    /// Return @c true if the value at the given coordinates is inactive.
+    bool isValueOff(const Coord& xyz) const { return !this->isValueOn(xyz); }
+    /// Return @c true if this tree has any active tiles.
+    bool hasActiveTiles() const { return mRoot.hasActiveTiles(); }
+
+    /// Set all voxels that lie outside the given axis-aligned box to the background.
+    void clip(const CoordBBox&);
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    /// @brief Replace with background tiles any nodes whose voxel buffers
+    /// have not yet been allocated.
+    /// @details Typically, unallocated nodes are leaf nodes whose voxel buffers
+    /// are not yet resident in memory because delayed loading is in effect.
+    /// @sa readNonresidentBuffers, io::File::open
+    virtual void clipUnallocatedNodes();
+#endif
+
+    /// @brief Set all voxels within a given axis-aligned box to a constant value.
+    /// If necessary, subdivide tiles that intersect the box.
+    /// @param bbox           inclusive coordinates of opposite corners of an axis-aligned box
+    /// @param value          the value to which to set voxels within the box
+    /// @param active         if true, mark voxels within the box as active,
+    ///                       otherwise mark them as inactive. Defaults to true.
+    /// @param sparse         if false, active tiles are voxelized, i.e. only active voxels
+    ///                       are generated from the fill operation. Defaults to true.  
+    /// @note If @a sparse is true this operation generates a sparse, but not always optimally sparse,
+    /// representation of the filled box.  Follow fill operations with a prune()
+    /// operation for optimal sparseness.
+    void fill(const CoordBBox& bbox, const ValueType& value, bool active = true, bool sparse = true);
+
+    /// @brief Reduce the memory footprint of this tree by replacing with tiles
+    /// any nodes whose values are all the same (optionally to within a tolerance)
+    /// and have the same active state.
+    /// @warning Will soon be deprecated!
+    void prune(const ValueType& tolerance = zeroVal<ValueType>())
+    {
+        this->clearAllAccessors();
+        mRoot.prune(tolerance);
+    }
+
+    /// @brief Add the given leaf node to this tree, creating a new branch if necessary.
+    /// If a leaf node with the same origin already exists, replace it.
+    void addLeaf(LeafNodeType& leaf) { mRoot.addLeaf(&leaf); }
+
+    /// @brief Add a tile containing voxel (x, y, z) at the specified tree level,
+    /// creating a new branch if necessary.  Delete any existing lower-level nodes
+    /// that contain (x, y, z).
+    /// @note @a level must be less than this tree's depth.
+    void addTile(Index level, const Coord& xyz, const ValueType& value, bool active);
+
+    /// @brief Return a pointer to the node of type @c NodeT that contains voxel (x, y, z)
+    /// and replace it with a tile of the specified value and state.
+    /// If no such node exists, leave the tree unchanged and return @c NULL.
+    /// @note The caller takes ownership of the node and is responsible for deleting it.
+    template<typename NodeT>
+    NodeT* stealNode(const Coord& xyz, const ValueType& value, bool active);
+
+    /// @brief Return a pointer to the leaf node that contains voxel (x, y, z).
+    /// If no such node exists, create one that preserves the values and
+    /// active states of all voxels.
+    /// @details Use this method to preallocate a static tree topology over which to
+    /// safely perform multithreaded processing.
+    LeafNodeType* touchLeaf(const Coord& xyz);
+
+    //@{
+    /// @brief Return a pointer to the node of type @c NodeType that contains
+    /// voxel (x, y, z).  If no such node exists, return NULL.
+    template<typename NodeType> NodeType* probeNode(const Coord& xyz);
+    template<typename NodeType> const NodeType* probeConstNode(const Coord& xyz) const;
+    template<typename NodeType> const NodeType* probeNode(const Coord& xyz) const;
+    //@}
+
+    //@{
+    /// @brief Return a pointer to the leaf node that contains voxel (x, y, z), or NULL
+    /// if no such node exists.  The const probeLeaf() simply calls probeConstLeaf().
+    LeafNodeType* probeLeaf(const Coord& xyz);
+    const LeafNodeType* probeConstLeaf(const Coord& xyz) const;
+    const LeafNodeType* probeLeaf(const Coord& xyz) const { return this->probeConstLeaf(xyz); }
+    //@}
+
+    //@{
+    /// @brief Add all nodes of a certain type to a container with the following API:
+    /// @code
+    /// struct ArrayT {
+    ///    typedef value_type;// defines the type of nodes to be added to the array
+    ///    void push_back(value_type nodePtr);// method that adds nodes to the array
+    /// };
+    /// @endcode
+    /// @details An example of a wrapper around a C-style array is:
+    /// @code
+    /// struct MyArray {
+    ///    typedef LeafType* value_type;
+    ///    value_type* ptr;
+    ///    MyArray(value_type* array) : ptr(array) {}
+    ///    void push_back(value_type leaf) { *ptr++ = leaf; }
+    /// };
+    /// @endcode
+    /// @details An example that constructs a list of pointers to all leaf nodes is:
+    /// @code
+    /// std::vector<const LeafNodeType*> array;//most std containers have the required API
+    /// array.reserve(tree.leafCount());//this is a fast preallocation.
+    /// tree.getNodes(array);
+    /// @endcode
+    template<typename ArrayT> void getNodes(ArrayT& array) { mRoot.getNodes(array); }
+    template<typename ArrayT> void getNodes(ArrayT& array) const { mRoot.getNodes(array); }
+    //@}
+
+    /// @brief Steal all nodes of a certain type from this tree and
+    /// add them to a container with the following API:
+    /// @code
+    /// struct ArrayT {
+    ///    typedef value_type;// defines the type of nodes to be added to the array
+    ///    void push_back(value_type nodePtr);// method that adds nodes to the array
+    /// };
+    /// @endcode
+    /// @details An example of a wrapper around a C-style array is:
+    /// @code
+    /// struct MyArray {
+    ///    typedef LeafType* value_type;
+    ///    value_type* ptr;
+    ///    MyArray(value_type* array) : ptr(array) {}
+    ///    void push_back(value_type leaf) { *ptr++ = leaf; }
+    /// };
+    /// @endcode
+    /// @details An example that collects pointers to all of the stolen leaf nodes is:
+    /// @code
+    /// std::vector<const LeafNodeType*> array;//most std containers have the required API
+    /// array.reserve(tree.leafCount());//this is a fast preallocation.
+    /// tree.stealNodes(array);
+    /// @endcode
+    template<typename ArrayT> void stealNodes(ArrayT& array)
+    {
+        this->clearAllAccessors();
+        mRoot.stealNodes(array);
+    }
+    template<typename ArrayT>
+    void stealNodes(ArrayT& array, const ValueType& value, bool state)
+    { this->clearAllAccessors(); mRoot.stealNodes(array, value, state); }
+
+    //
+    // Aux methods
+    //
+    /// @brief Return @c true if the root node is empty, i.e., if this tree contains no
+    /// nodes other than the root node and no tiles other than background tiles.
+    bool empty() const { return mRoot.empty(); }
+
+    /// Remove all tiles from this tree and all nodes other than the root node.
+    void clear();
+
+    /// Clear all registered accessors.
+    void clearAllAccessors();
+
+    //@{
+    /// @brief Register an accessor for this tree.  Registered accessors are
+    /// automatically cleared whenever one of this tree's nodes is deleted.
+    void attachAccessor(ValueAccessorBase<Tree, true>&) const;
+    void attachAccessor(ValueAccessorBase<const Tree, true>&) const;
+    //@}
+
+    //@{
+    /// No-op overloads for accessors whose second template argument is @c false.
+    void attachAccessor(ValueAccessorBase<Tree, false>&) const {}
+    void attachAccessor(ValueAccessorBase<const Tree, false>&) const {}
+    //@}
+
+    //@{
+    /// Deregister an accessor so that it is no longer automatically cleared.
+    void releaseAccessor(ValueAccessorBase<Tree, true>&) const;
+    void releaseAccessor(ValueAccessorBase<const Tree, true>&) const;
+    //@}
+
+    //@{
+    /// No-op overloads for accessors whose second template argument is @c false.
+    void releaseAccessor(ValueAccessorBase<Tree, false>&) const {}
+    void releaseAccessor(ValueAccessorBase<const Tree, false>&) const {}
+    //@}
+
+    /// @brief Return this tree's background value wrapped as metadata.
+    /// @note Query the metadata object for the value's type.
+    virtual Metadata::Ptr getBackgroundValue() const;
+
+    /// @brief Return a const reference to this tree's background value,
+    /// as reported by the root node.
+    ///
+    /// @note To modify the background value, prefer tools::changeBackground, which
+    /// is efficient; tree.root().setBackground also works but is serial and hence slower.
+    const ValueType& background() const { return mRoot.background(); }
+
+    /// @brief Ask the root node for its index range via @a bbox; min and max are inclusive.
+    virtual void getIndexRange(CoordBBox& bbox) const { mRoot.getIndexRange(bbox); }
+
+    /// @brief Densify active tiles, i.e., replace them with leaf-level active voxels.
+    ///
+    /// @param threaded if true, this operation is multi-threaded (over the internal nodes).
+    ///
+    /// @warning This method can explode the tree's memory footprint, especially if it
+    /// contains active tiles at the upper levels, e.g. root level!
+    void voxelizeActiveTiles(bool threaded = true);
+
+    /// @brief Efficiently merge another tree into this tree using one of several schemes.
+    /// @details This operation is primarily intended to combine trees that are mostly
+    /// non-overlapping (for example, intermediate trees from computations that are
+    /// parallelized across disjoint regions of space).
+    /// @note This operation is not guaranteed to produce an optimally sparse tree.
+    /// Follow merge() with prune() for optimal sparseness.
+    /// @warning This operation always empties the other tree.
+    void merge(Tree& other, MergePolicy = MERGE_ACTIVE_STATES);
+
+    /// @brief Union this tree's set of active values with the active values
+    /// of the other tree, whose @c ValueType may be different.
+    /// @details The resulting state of a value is active if the corresponding value
+    /// was already active OR if it is active in the other tree.  Also, a resulting
+    /// value maps to a voxel if the corresponding value already mapped to a voxel
+    /// OR if it is a voxel in the other tree.  Thus, a resulting value can only
+    /// map to a tile if the corresponding value already mapped to a tile
+    /// AND if it is a tile value in the other tree.
+    ///
+    /// @note This operation modifies only active states, not values.
+    /// Specifically, active tiles and voxels in this tree are not changed, and
+    /// tiles or voxels that were inactive in this tree but active in the other tree
+    /// are marked as active in this tree but left with their original values.
+    template<typename OtherRootNodeType>
+    void topologyUnion(const Tree<OtherRootNodeType>& other);
+
+    /// @brief Intersects this tree's set of active values with the active values
+    /// of the other tree, whose @c ValueType may be different.
+    /// @details The resulting state of a value is active only if the corresponding
+    /// value was already active AND if it is active in the other tree. Also, a
+    /// resulting value maps to a voxel if the corresponding value
+    /// already mapped to an active voxel in either of the two grids
+    /// and it maps to an active tile or voxel in the other grid.
+    ///
+    /// @note This operation can delete branches in this grid if they
+    /// overlap with inactive tiles in the other grid. Likewise active
+    /// voxels can be turned into inactive voxels resulting in leaf
+    /// nodes with no active values. Thus, it is recommended to
+    /// subsequently call tools::pruneInactive.
+    template<typename OtherRootNodeType>
+    void topologyIntersection(const Tree<OtherRootNodeType>& other);
+
+    /// @brief Difference this tree's set of active values with the active values
+    /// of the other tree, whose @c ValueType may be different. So a
+    /// resulting voxel will be active only if the original voxel is
+    /// active in this tree and inactive in the other tree.
+    ///
+    /// @note This operation can delete branches in this grid if they
+    /// overlap with active tiles in the other grid. Likewise active
+    /// voxels can be turned into inactive voxels resulting in leaf
+    /// nodes with no active values. Thus, it is recommended to
+    /// subsequently call tools::pruneInactive.
+    template<typename OtherRootNodeType>
+    void topologyDifference(const Tree<OtherRootNodeType>& other);
+
+    /// For a given function @c f, use sparse traversal to compute <tt>f(this, other)</tt>
+    /// over all corresponding pairs of values (tile or voxel) of this tree and the other tree
+    /// and store the result in this tree.
+    /// This method is typically more space-efficient than the two-tree combine2(),
+    /// since it moves rather than copies nodes from the other tree into this tree.
+    /// @note This operation always empties the other tree.
+    /// @param other  a tree of the same type as this tree
+    /// @param op     a functor of the form <tt>void op(const T& a, const T& b, T& result)</tt>,
+    ///               where @c T is this tree's @c ValueType, that computes
+    ///               <tt>result = f(a, b)</tt>
+    /// @param prune  if true, prune the resulting tree one branch at a time (this is usually
+    ///               more space-efficient than pruning the entire tree in one pass)
+    ///
+    /// @par Example:
+    ///     Compute the per-voxel difference between two floating-point trees,
+    ///     @c aTree and @c bTree, and store the result in @c aTree (leaving @c bTree empty).
+    /// @code
+    /// {
+    ///     struct Local {
+    ///         static inline void diff(const float& a, const float& b, float& result) {
+    ///             result = a - b;
+    ///         }
+    ///     };
+    ///     aTree.combine(bTree, Local::diff);
+    /// }
+    /// @endcode
+    ///
+    /// @par Example:
+    ///     Compute <tt>f * a + (1 - f) * b</tt> over all voxels of two floating-point trees,
+    ///     @c aTree and @c bTree, and store the result in @c aTree (leaving @c bTree empty).
+    /// @code
+    /// namespace {
+    ///     struct Blend {
+    ///         Blend(float f): frac(f) {}
+    ///         inline void operator()(const float& a, const float& b, float& result) const {
+    ///             result = frac * a + (1.0 - frac) * b;
+    ///         }
+    ///         float frac;
+    ///     };
+    /// }
+    /// {
+    ///     aTree.combine(bTree, Blend(0.25)); // 0.25 * a + 0.75 * b
+    /// }
+    /// @endcode
+    template<typename CombineOp>
+    void combine(Tree& other, CombineOp& op, bool prune = false);
+#ifndef _MSC_VER
+    template<typename CombineOp>
+    void combine(Tree& other, const CombineOp& op, bool prune = false);
+#endif
+
+    /// Like combine(), but with
+    /// @param other  a tree of the same type as this tree
+    /// @param op     a functor of the form <tt>void op(CombineArgs<ValueType>& args)</tt> that
+    ///               computes <tt>args.setResult(f(args.a(), args.b()))</tt> and, optionally,
+    ///               <tt>args.setResultIsActive(g(args.aIsActive(), args.bIsActive()))</tt>
+    ///               for some functions @c f and @c g
+    /// @param prune  if true, prune the resulting tree one branch at a time (this is usually
+    ///               more space-efficient than pruning the entire tree in one pass)
+    ///
+    /// This variant passes not only the @em a and @em b values but also the active states
+    /// of the @em a and @em b values to the functor, which may then return, by calling
+    /// @c args.setResultIsActive(), a computed active state for the result value.
+    /// By default, the result is active if either the @em a or the @em b value is active.
+    ///
+    /// @see openvdb/Types.h for the definition of the CombineArgs struct.
+    ///
+    /// @par Example:
+    ///     Replace voxel values in floating-point @c aTree with corresponding values
+    ///     from floating-point @c bTree (leaving @c bTree empty) wherever the @c bTree
+    ///     values are larger.  Also, preserve the active states of any transferred values.
+    /// @code
+    /// {
+    ///     struct Local {
+    ///         static inline void max(CombineArgs<float>& args) {
+    ///             if (args.b() > args.a()) {
+    ///                 // Transfer the B value and its active state.
+    ///                 args.setResult(args.b());
+    ///                 args.setResultIsActive(args.bIsActive());
+    ///             } else {
+    ///                 // Preserve the A value and its active state.
+    ///                 args.setResult(args.a());
+    ///                 args.setResultIsActive(args.aIsActive());
+    ///             }
+    ///         }
+    ///     };
+    ///     aTree.combineExtended(bTree, Local::max);
+    /// }
+    /// @endcode
+    template<typename ExtendedCombineOp>
+    void combineExtended(Tree& other, ExtendedCombineOp& op, bool prune = false);
+#ifndef _MSC_VER
+    template<typename ExtendedCombineOp>
+    void combineExtended(Tree& other, const ExtendedCombineOp& op, bool prune = false);
+#endif
+
+    /// For a given function @c f, use sparse traversal to compute <tt>f(a, b)</tt> over all
+    /// corresponding pairs of values (tile or voxel) of trees A and B and store the result
+    /// in this tree.
+    /// @param a,b    two trees with the same configuration (levels and node dimensions)
+    ///               as this tree but with the B tree possibly having a different value type
+    /// @param op     a functor of the form <tt>void op(const T1& a, const T2& b, T1& result)</tt>,
+    ///               where @c T1 is this tree's and the A tree's @c ValueType and @c T2 is the
+    ///               B tree's @c ValueType, that computes <tt>result = f(a, b)</tt>
+    /// @param prune  if true, prune the resulting tree one branch at a time (this is usually
+    ///               more space-efficient than pruning the entire tree in one pass)
+    ///
+    /// @throw TypeError if the B tree's configuration doesn't match this tree's
+    /// or if this tree's ValueType is not constructible from the B tree's ValueType.
+    ///
+    /// @par Example:
+    ///     Compute the per-voxel difference between two floating-point trees,
+    ///     @c aTree and @c bTree, and store the result in a third tree.
+    /// @code
+    /// {
+    ///     struct Local {
+    ///         static inline void diff(const float& a, const float& b, float& result) {
+    ///             result = a - b;
+    ///         }
+    ///     };
+    ///     FloatTree resultTree;
+    ///     resultTree.combine2(aTree, bTree, Local::diff);
+    /// }
+    /// @endcode
+    template<typename CombineOp, typename OtherTreeType /*= Tree*/>
+    void combine2(const Tree& a, const OtherTreeType& b, CombineOp& op, bool prune = false);
+#ifndef _MSC_VER
+    template<typename CombineOp, typename OtherTreeType /*= Tree*/>
+    void combine2(const Tree& a, const OtherTreeType& b, const CombineOp& op, bool prune = false);
+#endif
+
+    /// Like combine2(), but with
+    /// @param a,b    two trees with the same configuration (levels and node dimensions)
+    ///               as this tree but with the B tree possibly having a different value type
+    /// @param op     a functor of the form <tt>void op(CombineArgs<T1, T2>& args)</tt>, where
+    ///               @c T1 is this tree's and the A tree's @c ValueType and @c T2 is the B tree's
+    ///               @c ValueType, that computes <tt>args.setResult(f(args.a(), args.b()))</tt>
+    ///               and, optionally,
+    ///               <tt>args.setResultIsActive(g(args.aIsActive(), args.bIsActive()))</tt>
+    ///               for some functions @c f and @c g
+    /// @param prune  if true, prune the resulting tree one branch at a time (this is usually
+    ///               more space-efficient than pruning the entire tree in one pass)
+    /// This variant passes not only the @em a and @em b values but also the active states
+    /// of the @em a and @em b values to the functor, which may then return, by calling
+    /// <tt>args.setResultIsActive()</tt>, a computed active state for the result value.
+    /// By default, the result is active if either the @em a or the @em b value is active.
+    ///
+    /// @throw TypeError if the B tree's configuration doesn't match this tree's
+    /// or if this tree's ValueType is not constructible from the B tree's ValueType.
+    ///
+    /// @see openvdb/Types.h for the definition of the CombineArgs struct.
+    ///
+    /// @par Example:
+    ///     Compute the per-voxel maximum values of two single-precision floating-point trees,
+    ///     @c aTree and @c bTree, and store the result in a third tree.  Set the active state
+    ///     of each output value to that of the larger of the two input values.
+    /// @code
+    /// {
+    ///     struct Local {
+    ///         static inline void max(CombineArgs<float>& args) {
+    ///             if (args.b() > args.a()) {
+    ///                 // Transfer the B value and its active state.
+    ///                 args.setResult(args.b());
+    ///                 args.setResultIsActive(args.bIsActive());
+    ///             } else {
+    ///                 // Preserve the A value and its active state.
+    ///                 args.setResult(args.a());
+    ///                 args.setResultIsActive(args.aIsActive());
+    ///             }
+    ///         }
+    ///     };
+    ///     FloatTree aTree = ...;
+    ///     FloatTree bTree = ...;
+    ///     FloatTree resultTree;
+    ///     resultTree.combine2Extended(aTree, bTree, Local::max);
+    /// }
+    /// @endcode
+    ///
+    /// @par Example:
+    ///     Compute the per-voxel maximum values of a double-precision and a single-precision
+    ///     floating-point tree, @c aTree and @c bTree, and store the result in a third,
+    ///     double-precision tree.  Set the active state of each output value to that of
+    ///     the larger of the two input values.
+    /// @code
+    /// {
+    ///     struct Local {
+    ///         static inline void max(CombineArgs<double, float>& args) {
+    ///             if (args.b() > args.a()) {
+    ///                 // Transfer the B value and its active state.
+    ///                 args.setResult(args.b());
+    ///                 args.setResultIsActive(args.bIsActive());
+    ///             } else {
+    ///                 // Preserve the A value and its active state.
+    ///                 args.setResult(args.a());
+    ///                 args.setResultIsActive(args.aIsActive());
+    ///             }
+    ///         }
+    ///     };
+    ///     DoubleTree aTree = ...;
+    ///     FloatTree bTree = ...;
+    ///     DoubleTree resultTree;
+    ///     resultTree.combine2Extended(aTree, bTree, Local::max);
+    /// }
+    /// @endcode
+    template<typename ExtendedCombineOp, typename OtherTreeType /*= Tree*/>
+    void combine2Extended(const Tree& a, const OtherTreeType& b, ExtendedCombineOp& op,
+        bool prune = false);
+#ifndef _MSC_VER
+    template<typename ExtendedCombineOp, typename OtherTreeType /*= Tree*/>
+    void combine2Extended(const Tree& a, const OtherTreeType& b, const ExtendedCombineOp&,
+        bool prune = false);
+#endif
+
+    /// @brief Use sparse traversal to call the given functor with bounding box
+    /// information for all active tiles and leaf nodes or active voxels in the tree.
+    ///
+    /// @note The bounding boxes are guaranteed to be non-overlapping.
+    /// @param op  a functor with a templated call operator of the form
+    ///     <tt>template<Index LEVEL> void operator()(const CoordBBox& bbox)</tt>,
+    ///     where <tt>bbox</tt> is the bounding box of either an active tile
+    ///     (if @c LEVEL > 0), a leaf node or an active voxel.
+    ///     The functor must also provide a templated method of the form
+    ///     <tt>template<Index LEVEL> bool descent()</tt> that returns @c false
+    ///     if bounding boxes below the specified tree level are not to be visited.
+    ///     In such cases of early tree termination, a bounding box is instead
+    ///     derived from each terminating child node.
+    ///
+    /// @par Example:
+    ///     Visit and process all active tiles and leaf nodes in a tree, but don't
+    ///     descend to the active voxels.  The smallest bounding boxes that will be
+    ///     visited are those of leaf nodes or level-1 active tiles.
+    /// @code
+    /// {
+    ///     struct ProcessTilesAndLeafNodes {
+    ///         // Descend to leaf nodes, but no further.
+    ///         template<Index LEVEL> inline bool descent() { return LEVEL > 0; }
+    ///         // Use this version to descend to voxels:
+    ///         //template<Index LEVEL> inline bool descent() { return true; }
+    ///
+    ///         template<Index LEVEL>
+    ///         inline void operator()(const CoordBBox &bbox) {
+    ///             if (LEVEL > 0) {
+    ///                 // code to process an active tile
+    ///             } else {
+    ///                 // code to process a leaf node
+    ///             }
+    ///         }
+    ///     };
+    ///     ProcessTilesAndLeafNodes op;
+    ///     aTree.visitActiveBBox(op);
+    /// }
+    /// @endcode
+    /// @see openvdb/unittest/TestTree.cc for another example.
+    template<typename BBoxOp> void visitActiveBBox(BBoxOp& op) const { mRoot.visitActiveBBox(op); }
+
+    /// Traverse this tree in depth-first order, and at each node call the given functor
+    /// with a @c DenseIterator (see Iterator.h) that points to either a child node or a
+    /// tile value.  If the iterator points to a child node and the functor returns true,
+    /// do not descend to the child node; instead, continue the traversal at the next
+    /// iterator position.
+    /// @param op  a functor of the form <tt>template<typename IterT> bool op(IterT&)</tt>,
+    ///            where @c IterT is either a RootNode::ChildAllIter,
+    ///            an InternalNode::ChildAllIter or a LeafNode::ChildAllIter
+    ///
+    /// @note There is no iterator that points to a RootNode, so to visit the root node,
+    /// retrieve the @c parent() of a RootNode::ChildAllIter.
+    ///
+    /// @par Example:
+    ///     Print information about the nodes and tiles of a tree, but not individual voxels.
+    /// @code
+    /// namespace {
+    ///     template<typename TreeT>
+    ///     struct PrintTreeVisitor
+    ///     {
+    ///         typedef typename TreeT::RootNodeType RootT;
+    ///         bool visitedRoot;
+    ///
+    ///         PrintTreeVisitor(): visitedRoot(false) {}
+    ///
+    ///         template<typename IterT>
+    ///         inline bool operator()(IterT& iter)
+    ///         {
+    ///             if (!visitedRoot && iter.parent().getLevel() == RootT::LEVEL) {
+    ///                 visitedRoot = true;
+    ///                 std::cout << "Level-" << RootT::LEVEL << " node" << std::endl;
+    ///             }
+    ///             typename IterT::NonConstValueType value;
+    ///             typename IterT::ChildNodeType* child = iter.probeChild(value);
+    ///             if (child == NULL) {
+    ///                 std::cout << "Tile with value " << value << std::endl;
+    ///                 return true; // no child to visit, so stop descending
+    ///             }
+    ///             std::cout << "Level-" << child->getLevel() << " node" << std::endl;
+    ///             return (child->getLevel() == 0); // don't visit leaf nodes
+    ///         }
+    ///
+    ///         // The generic method, above, calls iter.probeChild(), which is not defined
+    ///         // for LeafNode::ChildAllIter.  These overloads ensure that the generic
+    ///         // method template doesn't get instantiated for LeafNode iterators.
+    ///         bool operator()(typename TreeT::LeafNodeType::ChildAllIter&) { return true; }
+    ///         bool operator()(typename TreeT::LeafNodeType::ChildAllCIter&) { return true; }
+    ///     };
+    /// }
+    /// {
+    ///     PrintTreeVisitor<FloatTree> visitor; // e.g., if tree is a FloatTree
+    ///     tree.visit(visitor);
+    /// }
+    /// @endcode
+    template<typename VisitorOp> void visit(VisitorOp& op);
+    template<typename VisitorOp> void visit(const VisitorOp& op);
+
+    /// Like visit(), but using @c const iterators, i.e., with
+    /// @param op  a functor of the form <tt>template<typename IterT> bool op(IterT&)</tt>,
+    ///            where @c IterT is either a RootNode::ChildAllCIter,
+    ///            an InternalNode::ChildAllCIter or a LeafNode::ChildAllCIter
+    template<typename VisitorOp> void visit(VisitorOp& op) const;
+    template<typename VisitorOp> void visit(const VisitorOp& op) const;
+
+    /// Traverse this tree and another tree in depth-first order, and for corresponding
+    /// subregions of index space call the given functor with two @c DenseIterators
+    /// (see Iterator.h), each of which points to either a child node or a tile value
+    /// of this tree and the other tree.  If the A iterator points to a child node
+    /// and the functor returns a nonzero value with bit 0 set (e.g., 1), do not descend
+    /// to the child node; instead, continue the traversal at the next A iterator position.
+    /// Similarly, if the B iterator points to a child node and the functor returns a value
+    /// with bit 1 set (e.g., 2), continue the traversal at the next B iterator position.
+    /// @note The other tree must have the same index space and fan-out factors as
+    /// this tree, but it may have a different @c ValueType and a different topology.
+    /// @param other  a tree with the same index space and fan-out factors as this tree
+    /// @param op     a functor of the form
+    ///               <tt>template<class AIterT, class BIterT> int op(AIterT&, BIterT&)</tt>,
+    ///               where @c AIterT and @c BIterT are any combination of a
+    ///               RootNode::ChildAllIter, an InternalNode::ChildAllIter or a
+    ///               LeafNode::ChildAllIter with an @c OtherTreeType::RootNode::ChildAllIter,
+    ///               an @c OtherTreeType::InternalNode::ChildAllIter
+    ///               or an @c OtherTreeType::LeafNode::ChildAllIter
+    ///
+    /// @par Example:
+    ///     Given two trees of the same type, @c aTree and @c bTree, replace leaf nodes of
+    ///     @c aTree with corresponding leaf nodes of @c bTree, leaving @c bTree partially empty.
+    /// @code
+    /// namespace {
+    ///     template<typename AIterT, typename BIterT>
+    ///     inline int stealLeafNodes(AIterT& aIter, BIterT& bIter)
+    ///     {
+    ///         typename AIterT::NonConstValueType aValue;
+    ///         typename AIterT::ChildNodeType* aChild = aIter.probeChild(aValue);
+    ///         typename BIterT::NonConstValueType bValue;
+    ///         typename BIterT::ChildNodeType* bChild = bIter.probeChild(bValue);
+    ///
+    ///         const bool aIsLeaf = (aChild != NULL && aChild->getLevel() == 0);
+    ///         const bool bIsLeaf = (bChild != NULL && bChild->getLevel() == 0);
+    ///         if (aIsLeaf && bIsLeaf) { // both are leaf nodes
+    ///             aIter.setChild(bChild); // give B's child to A
+    ///             bIter.setValue(bValue); // replace B's child with a constant tile value
+    ///         }
+    ///         // Don't iterate over leaf node voxels of either A or B.
+    ///         const int skipA = aIsLeaf ? 1 : 0, skipB = bIsLeaf ? 2 : 0;
+    ///         return skipA | skipB;
+    ///     }
+    /// }
+    /// {
+    ///     aTree.visit2(bTree, stealLeafNodes);
+    /// }
+    /// @endcode
+    template<typename OtherTreeType, typename VisitorOp>
+    void visit2(OtherTreeType& other, VisitorOp& op);
+    template<typename OtherTreeType, typename VisitorOp>
+    void visit2(OtherTreeType& other, const VisitorOp& op);
+
+    /// Like visit2(), but using @c const iterators, i.e., with
+    /// @param other  a tree with the same index space and fan-out factors as this tree
+    /// @param op     a functor of the form
+    ///               <tt>template<class AIterT, class BIterT> int op(AIterT&, BIterT&)</tt>,
+    ///               where @c AIterT and @c BIterT are any combination of a
+    ///               RootNode::ChildAllCIter, an InternalNode::ChildAllCIter
+    ///               or a LeafNode::ChildAllCIter with an
+    ///               @c OtherTreeType::RootNode::ChildAllCIter,
+    ///               an @c OtherTreeType::InternalNode::ChildAllCIter
+    ///               or an @c OtherTreeType::LeafNode::ChildAllCIter
+    template<typename OtherTreeType, typename VisitorOp>
+    void visit2(OtherTreeType& other, VisitorOp& op) const;
+    template<typename OtherTreeType, typename VisitorOp>
+    void visit2(OtherTreeType& other, const VisitorOp& op) const;
+
+
+    //
+    // Iteration
+    //
+    //@{
+    /// Return an iterator (or const iterator) over the root node's child nodes.
+    typename RootNodeType::ChildOnCIter  beginRootChildren() const { return mRoot.cbeginChildOn(); }
+    typename RootNodeType::ChildOnCIter cbeginRootChildren() const { return mRoot.cbeginChildOn(); }
+    typename RootNodeType::ChildOnIter   beginRootChildren() { return mRoot.beginChildOn(); }
+    //@}
+
+    //@{
+    /// Return an iterator (or const iterator) over non-child entries of the root node's table.
+    typename RootNodeType::ChildOffCIter  beginRootTiles() const { return mRoot.cbeginChildOff(); }
+    typename RootNodeType::ChildOffCIter cbeginRootTiles() const { return mRoot.cbeginChildOff(); }
+    typename RootNodeType::ChildOffIter   beginRootTiles() { return mRoot.beginChildOff(); }
+    //@}
+
+    //@{
+    /// Return an iterator (or const iterator) over all entries of the root node's table.
+    typename RootNodeType::ChildAllCIter  beginRootDense() const { return mRoot.cbeginChildAll(); }
+    typename RootNodeType::ChildAllCIter cbeginRootDense() const { return mRoot.cbeginChildAll(); }
+    typename RootNodeType::ChildAllIter   beginRootDense() { return mRoot.beginChildAll(); }
+    //@}
+
+
+    //@{
+    /// Iterator over all nodes in this tree
+    typedef NodeIteratorBase<Tree, typename RootNodeType::ChildOnIter>        NodeIter;
+    typedef NodeIteratorBase<const Tree, typename RootNodeType::ChildOnCIter> NodeCIter;
+    //@}
+
+    //@{
+    /// Iterator over all leaf nodes in this tree
+    typedef LeafIteratorBase<Tree, typename RootNodeType::ChildOnIter>        LeafIter;
+    typedef LeafIteratorBase<const Tree, typename RootNodeType::ChildOnCIter> LeafCIter;
+    //@}
+
+    //@{
+    /// Return an iterator over all nodes in this tree.
+    NodeIter   beginNode() { return NodeIter(*this); }
+    NodeCIter  beginNode() const { return NodeCIter(*this); }
+    NodeCIter cbeginNode() const { return NodeCIter(*this); }
+    //@}
+
+    //@{
+    /// Return an iterator over all leaf nodes in this tree.
+    // Each overload constructs the leaf iterator directly from this tree.
+    LeafIter beginLeaf()
+    {
+        return LeafIter(*this);
+    }
+    LeafCIter beginLeaf() const
+    {
+        return LeafCIter(*this);
+    }
+    LeafCIter cbeginLeaf() const
+    {
+        return LeafCIter(*this);
+    }
+    //@}
+
+    // Tree-wide value iterator types.  Each one wraps the root node's value
+    // iterator of the same name in a TreeValueIteratorBase; the "C" variants
+    // are instantiated on a const Tree and the root node's const iterator.
+    typedef TreeValueIteratorBase<Tree, typename RootNodeType::ValueAllIter> ValueAllIter;
+    typedef TreeValueIteratorBase<const Tree, typename RootNodeType::ValueAllCIter> ValueAllCIter;
+    typedef TreeValueIteratorBase<Tree, typename RootNodeType::ValueOnIter> ValueOnIter;
+    typedef TreeValueIteratorBase<const Tree, typename RootNodeType::ValueOnCIter> ValueOnCIter;
+    typedef TreeValueIteratorBase<Tree, typename RootNodeType::ValueOffIter> ValueOffIter;
+    typedef TreeValueIteratorBase<const Tree, typename RootNodeType::ValueOffCIter> ValueOffCIter;
+
+    //@{
+    /// Return an iterator over all values (tile and voxel) across all nodes.
+    // Each overload constructs the iterator directly from this tree.
+    ValueAllIter beginValueAll()
+    {
+        return ValueAllIter(*this);
+    }
+    ValueAllCIter beginValueAll() const
+    {
+        return ValueAllCIter(*this);
+    }
+    ValueAllCIter cbeginValueAll() const
+    {
+        return ValueAllCIter(*this);
+    }
+    //@}
+    //@{
+    /// Return an iterator over active values (tile and voxel) across all nodes.
+    // Each overload constructs the iterator directly from this tree.
+    ValueOnIter beginValueOn()
+    {
+        return ValueOnIter(*this);
+    }
+    ValueOnCIter beginValueOn() const
+    {
+        return ValueOnCIter(*this);
+    }
+    ValueOnCIter cbeginValueOn() const
+    {
+        return ValueOnCIter(*this);
+    }
+    //@}
+    //@{
+    /// Return an iterator over inactive values (tile and voxel) across all nodes.
+    // Each overload constructs the iterator directly from this tree.
+    ValueOffIter beginValueOff()
+    {
+        return ValueOffIter(*this);
+    }
+    ValueOffCIter beginValueOff() const
+    {
+        return ValueOffCIter(*this);
+    }
+    ValueOffCIter cbeginValueOff() const
+    {
+        return ValueOffCIter(*this);
+    }
+    //@}
+
+    /// @brief Return an iterator of type @c IterT (for example, begin<ValueOnIter>() is
+    /// equivalent to beginValueOn()).
+    template<typename IterT> IterT begin();
+    /// @brief Return a const iterator of type CIterT (for example, cbegin<ValueOnCIter>()
+    /// is equivalent to cbeginValueOn()).
+    template<typename CIterT> CIterT cbegin() const;
+
+
+protected:
+    // Registries of the value accessors attached to this tree, keyed by
+    // accessor pointer.  Only the key is read by the code in this file; the
+    // bool mapped value is never inspected here.
+    // One registry holds accessors on a mutable Tree, the other on a const Tree.
+    typedef tbb::concurrent_hash_map<ValueAccessorBase<Tree, true>*, bool> AccessorRegistry;
+    typedef tbb::concurrent_hash_map<ValueAccessorBase<const Tree, true>*, bool> ConstAccessorRegistry;
+
+    // Disallow assignment of instances of this class.
+    Tree& operator=(const Tree&);
+
+    /// @brief Notify all registered accessors, by calling ValueAccessor::release(),
+    /// that this tree is about to be deleted.
+    void releaseAllAccessors();
+
+    // TBB body object used to deallocate leaf nodes in parallel.
+    struct DeallocateLeafNodes {
+        // First element of the caller's node array, or NULL if it was empty.
+        LeafNodeType ** const mNodes;
+
+        DeallocateLeafNodes(std::vector<LeafNodeType*>& nodes)
+            : mNodes(nodes.empty() ? NULL : &nodes.front())
+        {
+        }
+
+        // Delete every node whose index lies in @a range and null out its slot.
+        void operator()(const tbb::blocked_range<size_t>& range) const
+        {
+            const size_t last = range.end();
+            for (size_t idx = range.begin(); idx < last; ++idx) {
+                delete mNodes[idx];
+                mNodes[idx] = NULL;
+            }
+        }
+    };
+
+    //
+    // Data members
+    //
+    RootNodeType mRoot; // root node of the tree
+    // Accessor registries are mutable because attachAccessor() and
+    // releaseAccessor() are const member functions that modify them.
+    mutable AccessorRegistry mAccessorRegistry;
+    mutable ConstAccessorRegistry mConstAccessorRegistry;
+
+    // Type-name string shared by all instances of this tree type;
+    // it is created on first use by treeType().
+    static tbb::atomic<const Name*> sTreeTypeName;
+}; // end of Tree class
+
+// Out-of-class definition of the static type-name pointer declared in Tree.
+// No initializer is given here; treeType() tests it against NULL before
+// creating the name.
+template<typename _RootNodeType>
+tbb::atomic<const Name*> Tree<_RootNodeType>::sTreeTypeName;
+
+
+/// @brief Tree3<T, N1, N2>::Type names a tree with three levels
+/// (Root, Internal, Leaf) that stores values of type T, where N1 and N2 are
+/// the log dimensions of the internal and leaf nodes, respectively.
+/// @note Tree4, not Tree3, is the standard tree configuration.
+template<typename T, Index N1=4, Index N2=3>
+struct Tree3
+{
+    typedef Tree<
+        RootNode<
+            InternalNode<LeafNode<T, N2>, N1>
+        >
+    > Type;
+};
+
+
+/// @brief Tree4<T, N1, N2, N3>::Type names a tree with four levels
+/// (Root, Internal, Internal, Leaf) that stores values of type T, where
+/// N1, N2 and N3 are the log dimensions of the two internal node levels
+/// and the leaf nodes, respectively.
+/// @note This is the standard tree configuration.
+template<typename T, Index N1=5, Index N2=4, Index N3=3>
+struct Tree4
+{
+    typedef Tree<
+        RootNode<
+            InternalNode<
+                InternalNode<LeafNode<T, N3>, N2>,
+            N1>
+        >
+    > Type;
+};
+
+/// @brief Tree5<T, N1, N2, N3, N4>::Type names a tree with five levels
+/// (Root, Internal, Internal, Internal, Leaf) that stores values of type T,
+/// where N1, N2, N3 and N4 are the log dimensions of the three internal node
+/// levels and the leaf nodes, respectively.
+/// @note Tree4, not Tree5, is the standard tree configuration.
+template<typename T, Index N1=6, Index N2=5, Index N3=4, Index N4=3>
+struct Tree5
+{
+    typedef Tree<
+        RootNode<
+            InternalNode<
+                InternalNode<
+                    InternalNode<LeafNode<T, N4>, N3>,
+                N2>,
+            N1>
+        >
+    > Type;
+};
+
+
+////////////////////////////////////////
+
+
+/// @brief Read the tree-level topology header from @a is.
+/// @details The header is a single 32-bit buffer count.  A warning is logged
+/// when the count is not 1.  The boolean argument is unused by this function.
+inline void
+TreeBase::readTopology(std::istream& is, bool /*saveFloatAsHalf*/)
+{
+    // Zero-initialize so that a failed or short read never leaves the
+    // comparison below operating on an indeterminate value.
+    int32_t bufferCount = 0;
+    is.read(reinterpret_cast<char*>(&bufferCount), sizeof(int32_t));
+    if (bufferCount != 1) OPENVDB_LOG_WARN("multi-buffer trees are no longer supported");
+}
+
+
+/// @brief Write the tree-level topology header to @a os.
+/// @details The header is a 32-bit buffer count that is always 1.
+/// The boolean argument is unused by this function.
+inline void
+TreeBase::writeTopology(std::ostream& os, bool /*saveFloatAsHalf*/) const
+{
+    const int32_t bufferCount = 1;
+    const char* const bytes = reinterpret_cast<const char*>(&bufferCount);
+    os.write(bytes, sizeof(bufferCount));
+}
+
+
+/// @brief Write a short summary of this tree (type and voxel, tile and
+/// node counts) to @a os.  The verbosity argument is unused by this function.
+inline void
+TreeBase::print(std::ostream& os, int /*verboseLevel*/) const
+{
+    // No line break is emitted between the type and the first count.
+    os << "    Tree Type: " << type();
+    os << "    Active Voxel Count: " << activeVoxelCount() << std::endl;
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    os << "    Active tile Count: " << activeTileCount() << std::endl;
+#endif
+    os << "    Inactive Voxel Count: " << inactiveVoxelCount() << std::endl;
+    os << "    Leaf Node Count: " << leafCount() << std::endl;
+    os << "    Non-leaf Node Count: " << nonLeafCount() << std::endl;
+}
+
+
+////////////////////////////////////////
+
+
+//
+// Type traits for tree iterators
+//
+
+/// @brief TreeIterTraits provides, for all tree iterators, a begin(tree) function
+/// that returns an iterator over a tree of arbitrary type.
+template<typename TreeT, typename IterT> struct TreeIterTraits;
+
+// Root-table iterators: each specialization forwards begin() to the
+// beginRoot*() or cbeginRoot*() method of the tree that returns that type.
+
+template<typename TreeT>
+struct TreeIterTraits<TreeT, typename TreeT::RootNodeType::ChildOnIter>
+{
+    static typename TreeT::RootNodeType::ChildOnIter
+    begin(TreeT& t) { return t.beginRootChildren(); }
+};
+
+template<typename TreeT>
+struct TreeIterTraits<TreeT, typename TreeT::RootNodeType::ChildOnCIter>
+{
+    static typename TreeT::RootNodeType::ChildOnCIter
+    begin(const TreeT& t) { return t.cbeginRootChildren(); }
+};
+
+template<typename TreeT>
+struct TreeIterTraits<TreeT, typename TreeT::RootNodeType::ChildOffIter>
+{
+    static typename TreeT::RootNodeType::ChildOffIter
+    begin(TreeT& t) { return t.beginRootTiles(); }
+};
+
+template<typename TreeT>
+struct TreeIterTraits<TreeT, typename TreeT::RootNodeType::ChildOffCIter>
+{
+    static typename TreeT::RootNodeType::ChildOffCIter
+    begin(const TreeT& t) { return t.cbeginRootTiles(); }
+};
+
+template<typename TreeT>
+struct TreeIterTraits<TreeT, typename TreeT::RootNodeType::ChildAllIter>
+{
+    static typename TreeT::RootNodeType::ChildAllIter
+    begin(TreeT& t) { return t.beginRootDense(); }
+};
+
+template<typename TreeT>
+struct TreeIterTraits<TreeT, typename TreeT::RootNodeType::ChildAllCIter>
+{
+    static typename TreeT::RootNodeType::ChildAllCIter
+    begin(const TreeT& t) { return t.cbeginRootDense(); }
+};
+
+// Node and leaf iterators: begin() forwards to the tree's beginNode() /
+// cbeginNode() / beginLeaf() / cbeginLeaf().
+
+template<typename TreeT>
+struct TreeIterTraits<TreeT, typename TreeT::NodeIter>
+{
+    static typename TreeT::NodeIter
+    begin(TreeT& t) { return t.beginNode(); }
+};
+
+template<typename TreeT>
+struct TreeIterTraits<TreeT, typename TreeT::NodeCIter>
+{
+    static typename TreeT::NodeCIter
+    begin(const TreeT& t) { return t.cbeginNode(); }
+};
+
+template<typename TreeT>
+struct TreeIterTraits<TreeT, typename TreeT::LeafIter>
+{
+    static typename TreeT::LeafIter
+    begin(TreeT& t) { return t.beginLeaf(); }
+};
+
+template<typename TreeT>
+struct TreeIterTraits<TreeT, typename TreeT::LeafCIter>
+{
+    static typename TreeT::LeafCIter
+    begin(const TreeT& t) { return t.cbeginLeaf(); }
+};
+
+// Value iterators: begin() forwards to the tree's beginValue*() for the
+// mutable types and cbeginValue*() for the const types.
+
+template<typename TreeT>
+struct TreeIterTraits<TreeT, typename TreeT::ValueOnIter>
+{
+    static typename TreeT::ValueOnIter
+    begin(TreeT& t) { return t.beginValueOn(); }
+};
+
+template<typename TreeT>
+struct TreeIterTraits<TreeT, typename TreeT::ValueOnCIter>
+{
+    static typename TreeT::ValueOnCIter
+    begin(const TreeT& t) { return t.cbeginValueOn(); }
+};
+
+template<typename TreeT>
+struct TreeIterTraits<TreeT, typename TreeT::ValueOffIter>
+{
+    static typename TreeT::ValueOffIter
+    begin(TreeT& t) { return t.beginValueOff(); }
+};
+
+template<typename TreeT>
+struct TreeIterTraits<TreeT, typename TreeT::ValueOffCIter>
+{
+    static typename TreeT::ValueOffCIter
+    begin(const TreeT& t) { return t.cbeginValueOff(); }
+};
+
+template<typename TreeT>
+struct TreeIterTraits<TreeT, typename TreeT::ValueAllIter>
+{
+    static typename TreeT::ValueAllIter
+    begin(TreeT& t) { return t.beginValueAll(); }
+};
+
+template<typename TreeT>
+struct TreeIterTraits<TreeT, typename TreeT::ValueAllCIter>
+{
+    static typename TreeT::ValueAllCIter
+    begin(const TreeT& t) { return t.cbeginValueAll(); }
+};
+
+
+/// Return an iterator of type @c IterT, obtained from the TreeIterTraits
+/// specialization for this tree type and @c IterT.
+template<typename RootNodeType>
+template<typename IterT>
+inline IterT
+Tree<RootNodeType>::begin()
+{
+    typedef TreeIterTraits<Tree, IterT> TraitsT;
+    return TraitsT::begin(*this);
+}
+
+
+/// Return a const iterator of type @c CIterT, obtained from the
+/// TreeIterTraits specialization for this tree type and @c CIterT.
+template<typename RootNodeType>
+template<typename CIterT>
+inline CIterT
+Tree<RootNodeType>::cbegin() const
+{
+    typedef TreeIterTraits<Tree, CIterT> TraitsT;
+    return TraitsT::begin(*this);
+}
+
+
+////////////////////////////////////////
+
+
+/// Read this tree's topology from @a is: clear all registered accessors,
+/// read the TreeBase header, then let the root node read its own topology.
+template<typename RootNodeType>
+void
+Tree<RootNodeType>::readTopology(std::istream& is, bool saveFloatAsHalf)
+{
+    // Accessors are cleared first, before the topology is replaced.
+    this->clearAllAccessors();
+
+    TreeBase::readTopology(is, saveFloatAsHalf);
+    this->mRoot.readTopology(is, saveFloatAsHalf);
+}
+
+
+/// Write this tree's topology to @a os: the TreeBase header first,
+/// followed by the root node's topology.
+template<typename RootNodeType>
+void
+Tree<RootNodeType>::writeTopology(std::ostream& os, bool saveFloatAsHalf) const
+{
+    TreeBase::writeTopology(os, saveFloatAsHalf);
+    this->mRoot.writeTopology(os, saveFloatAsHalf);
+}
+
+
+/// Clear all registered accessors, then have the root node read the
+/// node buffers from @a is.
+template<typename RootNodeType>
+inline void
+Tree<RootNodeType>::readBuffers(std::istream& is, bool saveFloatAsHalf)
+{
+    this->clearAllAccessors();
+    this->mRoot.readBuffers(is, saveFloatAsHalf);
+}
+
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+
+/// Clear all registered accessors, then have the root node read the
+/// node buffers from @a is, passing @a bbox through to the root node.
+template<typename RootNodeType>
+inline void
+Tree<RootNodeType>::readBuffers(std::istream& is, const CoordBBox& bbox, bool saveFloatAsHalf)
+{
+    this->clearAllAccessors();
+    this->mRoot.readBuffers(is, bbox, saveFloatAsHalf);
+}
+
+
+/// Visit every leaf node and read one voxel value from each.
+template<typename RootNodeType>
+inline void
+Tree<RootNodeType>::readNonresidentBuffers() const
+{
+    LeafCIter leafIter = this->cbeginLeaf();
+    while (leafIter) {
+        // Retrieving the value of a leaf voxel forces loading of the leaf node's voxel buffer.
+        leafIter->getValue(Index(0));
+        ++leafIter;
+    }
+}
+
+#endif // !OPENVDB_2_ABI_COMPATIBLE
+
+
+/// Have the root node write the node buffers to @a os.
+template<typename RootNodeType>
+inline void
+Tree<RootNodeType>::writeBuffers(std::ostream& os, bool saveFloatAsHalf) const
+{
+    this->mRoot.writeBuffers(os, saveFloatAsHalf);
+}
+
+
+/// Remove all nodes from this tree and clear all registered accessors.
+template<typename RootNodeType>
+inline void
+Tree<RootNodeType>::clear()
+{
+    // Detach the leaf nodes first so that they can be deleted in parallel.
+    std::vector<LeafNodeType*> detached;
+    this->stealNodes(detached);
+
+    // Then clear whatever remains under the root.
+    mRoot.clear();
+
+    const tbb::blocked_range<size_t> leafRange(0, detached.size());
+    const DeallocateLeafNodes deleter(detached);
+    tbb::parallel_for(leafRange, deleter);
+
+    this->clearAllAccessors();
+}
+
+
+////////////////////////////////////////
+
+
+/// Register @a accessor (an accessor on a mutable tree) by inserting its
+/// address as a key into the accessor registry.
+template<typename RootNodeType>
+inline void
+Tree<RootNodeType>::attachAccessor(ValueAccessorBase<Tree, true>& accessor) const
+{
+    typename AccessorRegistry::accessor entry;
+    ValueAccessorBase<Tree, true>* const key = &accessor;
+    mAccessorRegistry.insert(entry, key);
+}
+
+
+/// Register @a accessor (an accessor on a const tree) by inserting its
+/// address as a key into the const accessor registry.
+template<typename RootNodeType>
+inline void
+Tree<RootNodeType>::attachAccessor(ValueAccessorBase<const Tree, true>& accessor) const
+{
+    typename ConstAccessorRegistry::accessor entry;
+    ValueAccessorBase<const Tree, true>* const key = &accessor;
+    mConstAccessorRegistry.insert(entry, key);
+}
+
+
+/// Unregister @a accessor by erasing its address from the accessor registry.
+template<typename RootNodeType>
+inline void
+Tree<RootNodeType>::releaseAccessor(ValueAccessorBase<Tree, true>& accessor) const
+{
+    ValueAccessorBase<Tree, true>* const key = &accessor;
+    mAccessorRegistry.erase(key);
+}
+
+
+/// Unregister @a accessor by erasing its address from the const accessor registry.
+template<typename RootNodeType>
+inline void
+Tree<RootNodeType>::releaseAccessor(ValueAccessorBase<const Tree, true>& accessor) const
+{
+    ValueAccessorBase<const Tree, true>* const key = &accessor;
+    mConstAccessorRegistry.erase(key);
+}
+
+
+/// Call clear() on every non-null accessor in both registries.
+/// The registries themselves are left unchanged.
+template<typename RootNodeType>
+inline void
+Tree<RootNodeType>::clearAllAccessors()
+{
+    typename AccessorRegistry::iterator entry = mAccessorRegistry.begin();
+    while (entry != mAccessorRegistry.end()) {
+        if (entry->first) entry->first->clear();
+        ++entry;
+    }
+
+    typename ConstAccessorRegistry::iterator constEntry = mConstAccessorRegistry.begin();
+    while (constEntry != mConstAccessorRegistry.end()) {
+        if (constEntry->first) constEntry->first->clear();
+        ++constEntry;
+    }
+}
+
+
+/// Call release() on every registered accessor, then empty both registries.
+/// A NULL key is erased from each registry beforehand, because the loops
+/// below dereference every key without a null check.
+template<typename RootNodeType>
+inline void
+Tree<RootNodeType>::releaseAllAccessors()
+{
+    mAccessorRegistry.erase(NULL);
+    for (typename AccessorRegistry::iterator it = mAccessorRegistry.begin();
+        it != mAccessorRegistry.end(); ++it)
+    {
+        it->first->release();
+    }
+    mAccessorRegistry.clear();
+
+    // Erase the NULL key from the const registry; erasing it from
+    // mAccessorRegistry again here would leave this loop unprotected.
+    mConstAccessorRegistry.erase(NULL);
+    for (typename ConstAccessorRegistry::iterator it = mConstAccessorRegistry.begin();
+        it != mConstAccessorRegistry.end(); ++it)
+    {
+        it->first->release();
+    }
+    mConstAccessorRegistry.clear();
+}
+
+
+////////////////////////////////////////
+
+
+/// Return the value at @a xyz as reported by the root node.
+template<typename RootNodeType>
+inline const typename RootNodeType::ValueType&
+Tree<RootNodeType>::getValue(const Coord& xyz) const
+{
+    return this->mRoot.getValue(xyz);
+}
+
+
+/// Return the value at @a xyz as reported by @a accessor; the lookup is
+/// delegated entirely to the accessor.
+template<typename RootNodeType>
+template<typename AccessT>
+inline const typename RootNodeType::ValueType&
+Tree<RootNodeType>::getValue(const Coord& xyz, AccessT& accessor) const
+{
+    return accessor.getValue(xyz);
+}
+
+
+/// Return the result of the root node's getValueDepth() for @a xyz.
+template<typename RootNodeType>
+inline int
+Tree<RootNodeType>::getValueDepth(const Coord& xyz) const
+{
+    return this->mRoot.getValueDepth(xyz);
+}
+
+
+/// Forward to the root node's one-argument setValueOff() for @a xyz.
+template<typename RootNodeType>
+inline void
+Tree<RootNodeType>::setValueOff(const Coord& xyz)
+{
+    this->mRoot.setValueOff(xyz);
+}
+
+
+/// Forward to the root node's two-argument setValueOff() for @a xyz and @a value.
+template<typename RootNodeType>
+inline void
+Tree<RootNodeType>::setValueOff(const Coord& xyz, const ValueType& value)
+{
+    this->mRoot.setValueOff(xyz, value);
+}
+
+
+/// Forward to the root node's setActiveState() for @a xyz and @a on.
+template<typename RootNodeType>
+inline void
+Tree<RootNodeType>::setActiveState(const Coord& xyz, bool on)
+{
+    this->mRoot.setActiveState(xyz, on);
+}
+
+
+/// Set the value at @a xyz by forwarding to the root node's setValueOn().
+template<typename RootNodeType>
+inline void
+Tree<RootNodeType>::setValue(const Coord& xyz, const ValueType& value)
+{
+    this->mRoot.setValueOn(xyz, value);
+}
+
+/// Forward to the root node's setValueOnly() for @a xyz and @a value.
+template<typename RootNodeType>
+inline void
+Tree<RootNodeType>::setValueOnly(const Coord& xyz, const ValueType& value)
+{
+    this->mRoot.setValueOnly(xyz, value);
+}
+
+/// Set the value at @a xyz through @a accessor; the operation is
+/// delegated entirely to the accessor's setValue().
+template<typename RootNodeType>
+template<typename AccessT>
+inline void
+Tree<RootNodeType>::setValue(const Coord& xyz, const ValueType& value, AccessT& accessor)
+{
+    accessor.setValue(xyz, value);
+}
+
+
+/// Mark @a xyz active by calling the root node's setActiveState() with @c true.
+template<typename RootNodeType>
+inline void
+Tree<RootNodeType>::setValueOn(const Coord& xyz)
+{
+    const bool active = true;
+    this->mRoot.setActiveState(xyz, active);
+}
+
+
+/// Forward to the root node's setValueOn() for @a xyz and @a value.
+template<typename RootNodeType>
+inline void
+Tree<RootNodeType>::setValueOn(const Coord& xyz, const ValueType& value)
+{
+    this->mRoot.setValueOn(xyz, value);
+}
+
+
+/// Pass @a op to the root node's modifyValue() for @a xyz.
+template<typename RootNodeType>
+template<typename ModifyOp>
+inline void
+Tree<RootNodeType>::modifyValue(const Coord& xyz, const ModifyOp& op)
+{
+    this->mRoot.modifyValue(xyz, op);
+}
+
+
+/// Pass @a op to the root node's modifyValueAndActiveState() for @a xyz.
+template<typename RootNodeType>
+template<typename ModifyOp>
+inline void
+Tree<RootNodeType>::modifyValueAndActiveState(const Coord& xyz, const ModifyOp& op)
+{
+    this->mRoot.modifyValueAndActiveState(xyz, op);
+}
+
+
+/// Forward to the root node's probeValue(), which receives @a value by
+/// reference, and return its boolean result.
+template<typename RootNodeType>
+inline bool
+Tree<RootNodeType>::probeValue(const Coord& xyz, ValueType& value) const
+{
+    return this->mRoot.probeValue(xyz, value);
+}
+
+
+////////////////////////////////////////
+
+
+/// Forward all four arguments to the root node's addTile().
+template<typename RootNodeType>
+inline void
+Tree<RootNodeType>::addTile(Index level, const Coord& xyz,
+    const ValueType& value, bool active)
+{
+    this->mRoot.addTile(level, xyz, value, active);
+}
+
+
+/// Clear all registered accessors, then return the result of the root
+/// node's stealNode<NodeT>() for the given arguments.
+template<typename RootNodeType>
+template<typename NodeT>
+inline NodeT*
+Tree<RootNodeType>::stealNode(const Coord& xyz, const ValueType& value, bool active)
+{
+    this->clearAllAccessors();
+    NodeT* const stolen = mRoot.template stealNode<NodeT>(xyz, value, active);
+    return stolen;
+}
+
+
+/// Return the result of the root node's touchLeaf() for @a xyz.
+template<typename RootNodeType>
+inline typename RootNodeType::LeafNodeType*
+Tree<RootNodeType>::touchLeaf(const Coord& xyz)
+{
+    return this->mRoot.touchLeaf(xyz);
+}
+
+
+/// Return the result of the root node's probeLeaf() for @a xyz.
+template<typename RootNodeType>
+inline typename RootNodeType::LeafNodeType*
+Tree<RootNodeType>::probeLeaf(const Coord& xyz)
+{
+    return this->mRoot.probeLeaf(xyz);
+}
+
+
+/// Return the result of the root node's probeConstLeaf() for @a xyz.
+template<typename RootNodeType>
+inline const typename RootNodeType::LeafNodeType*
+Tree<RootNodeType>::probeConstLeaf(const Coord& xyz) const
+{
+    return this->mRoot.probeConstLeaf(xyz);
+}
+
+
+/// Return the result of the root node's probeNode<NodeType>() for @a xyz.
+template<typename RootNodeType>
+template<typename NodeType>
+inline NodeType*
+Tree<RootNodeType>::probeNode(const Coord& xyz)
+{
+    NodeType* const node = mRoot.template probeNode<NodeType>(xyz);
+    return node;
+}
+
+
+/// Const overload of probeNode(); it delegates to probeConstNode().
+template<typename RootNodeType>
+template<typename NodeType>
+inline const NodeType*
+Tree<RootNodeType>::probeNode(const Coord& xyz) const
+{
+    const NodeType* const node = this->template probeConstNode<NodeType>(xyz);
+    return node;
+}
+
+
+/// Return the result of the root node's probeConstNode<NodeType>() for @a xyz.
+template<typename RootNodeType>
+template<typename NodeType>
+inline const NodeType*
+Tree<RootNodeType>::probeConstNode(const Coord& xyz) const
+{
+    const NodeType* const node = mRoot.template probeConstNode<NodeType>(xyz);
+    return node;
+}
+
+
+////////////////////////////////////////
+
+
+/// Clear all registered accessors, then forward @a bbox to the root node's clip().
+template<typename RootNodeType>
+inline void
+Tree<RootNodeType>::clip(const CoordBBox& bbox)
+{
+    this->clearAllAccessors();
+    mRoot.clip(bbox);
+}
+
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+/// Clear all registered accessors, then replace every leaf node for which
+/// isAllocated() is false with an inactive background-valued tile at level 0.
+template<typename RootNodeType>
+inline void
+Tree<RootNodeType>::clipUnallocatedNodes()
+{
+    this->clearAllAccessors();
+
+    LeafIter leafIter = this->beginLeaf();
+    while (leafIter) {
+        const LeafNodeType* current = leafIter.getLeaf();
+        // Step past the current leaf before it can be replaced (and deleted) below.
+        ++leafIter;
+        if (current->isAllocated()) continue;
+        this->addTile(/*level=*/0, current->origin(), this->background(), /*active=*/false);
+    }
+}
+#endif
+
+
+/// Clear all registered accessors, then forward all four arguments to the
+/// root node's fill().
+template<typename RootNodeType>
+inline void
+Tree<RootNodeType>::fill(const CoordBBox& bbox, const ValueType& value, bool active, bool sparse)
+{
+    this->clearAllAccessors();
+    mRoot.fill(bbox, value, active, sparse);
+}
+
+
+/// Return a metadata object holding this tree's background value.
+/// The returned pointer is left default-constructed when the value type is
+/// not a registered metadata type.
+template<typename RootNodeType>
+Metadata::Ptr
+Tree<RootNodeType>::getBackgroundValue() const
+{
+    Metadata::Ptr result;
+    if (!Metadata::isRegisteredType(valueType())) return result;
+
+    typedef TypedMetadata<ValueType> MetadataT;
+    result = Metadata::createMetadata(valueType());
+
+    // Only downcast when the created object really is a MetadataT.
+    const bool typeMatches = (result->typeName() == MetadataT::staticTypeName());
+    if (typeMatches) {
+        MetadataT* typed = static_cast<MetadataT*>(result.get());
+        typed->value() = mRoot.background();
+    }
+    return result;
+}
+
+
+////////////////////////////////////////
+
+
+/// Clear all registered accessors, then forward @a threaded to the root
+/// node's voxelizeActiveTiles().
+template<typename RootNodeType>
+inline void
+Tree<RootNodeType>::voxelizeActiveTiles(bool threaded)
+{
+    this->clearAllAccessors();
+    this->mRoot.voxelizeActiveTiles(threaded);
+}
+
+
+/// Merge @a other into this tree using @a policy.  The registered accessors
+/// of both trees are cleared first.  The policy selects which instantiation
+/// of the root node's merge<>() is called; any other policy value does nothing.
+template<typename RootNodeType>
+inline void
+Tree<RootNodeType>::merge(Tree& other, MergePolicy policy)
+{
+    this->clearAllAccessors();
+    other.clearAllAccessors();
+
+    if (policy == MERGE_ACTIVE_STATES) {
+        mRoot.template merge<MERGE_ACTIVE_STATES>(other.mRoot);
+    } else if (policy == MERGE_NODES) {
+        mRoot.template merge<MERGE_NODES>(other.mRoot);
+    } else if (policy == MERGE_ACTIVE_STATES_AND_NODES) {
+        mRoot.template merge<MERGE_ACTIVE_STATES_AND_NODES>(other.mRoot);
+    }
+}
+
+
+/// Clear all registered accessors, then pass the other tree's root node to
+/// this root node's topologyUnion().
+template<typename RootNodeType>
+template<typename OtherRootNodeType>
+inline void
+Tree<RootNodeType>::topologyUnion(const Tree<OtherRootNodeType>& other)
+{
+    this->clearAllAccessors();
+    this->mRoot.topologyUnion(other.root());
+}
+
+/// Clear all registered accessors, then pass the other tree's root node to
+/// this root node's topologyIntersection().
+template<typename RootNodeType>
+template<typename OtherRootNodeType>
+inline void
+Tree<RootNodeType>::topologyIntersection(const Tree<OtherRootNodeType>& other)
+{
+    this->clearAllAccessors();
+    this->mRoot.topologyIntersection(other.root());
+}
+
+/// Clear all registered accessors, then pass the other tree's root node to
+/// this root node's topologyDifference().
+template<typename RootNodeType>
+template<typename OtherRootNodeType>
+inline void
+Tree<RootNodeType>::topologyDifference(const Tree<OtherRootNodeType>& other)
+{
+    this->clearAllAccessors();
+    this->mRoot.topologyDifference(other.root());
+}
+
+////////////////////////////////////////
+
+
+/// @brief Adapter that wraps a three-argument (a, b, result) CombineOp functor
+/// so that it can be called with a single CombineArgs struct.
+/// @details The wrapped functor is held by reference, not copied.
+template<typename AValueT, typename CombineOp, typename BValueT = AValueT>
+struct CombineOpAdapter
+{
+    CombineOp& op;
+
+    CombineOpAdapter(CombineOp& wrapped): op(wrapped) {}
+
+    /// Unpack @a args and invoke the wrapped functor on (a, b, result).
+    void operator()(CombineArgs<AValueT, BValueT>& args) const
+    {
+        op(args.a(), args.b(), args.result());
+    }
+};
+
+
+/// Wrap @a op in a CombineOpAdapter and delegate to combineExtended().
+template<typename RootNodeType>
+template<typename CombineOp>
+inline void
+Tree<RootNodeType>::combine(Tree& other, CombineOp& op, bool prune)
+{
+    typedef CombineOpAdapter<ValueType, CombineOp> AdapterT;
+    AdapterT adapter(op);
+    this->combineExtended(other, adapter, prune);
+}
+
+
+/// @internal This overload is needed (for ICC and GCC, but not for VC) to disambiguate
+/// code like this: <tt>aTree.combine(bTree, MyCombineOp(...))</tt>.
+#ifndef _MSC_VER
+template<typename RootNodeType>
+template<typename CombineOp>
+inline void
+Tree<RootNodeType>::combine(Tree& other, const CombineOp& op, bool prune)
+{
+    typedef CombineOpAdapter<ValueType, const CombineOp> AdapterT;
+    AdapterT adapter(op);
+    this->combineExtended(other, adapter, prune);
+}
+#endif
+
+
+/// Clear this tree's registered accessors, then have the root node combine
+/// itself with the other tree's root node using @a op.
+template<typename RootNodeType>
+template<typename ExtendedCombineOp>
+inline void
+Tree<RootNodeType>::combineExtended(Tree& other, ExtendedCombineOp& op, bool prune)
+{
+    this->clearAllAccessors();
+    this->mRoot.combine(other.root(), op, prune);
+}
+
+
+/// @internal This overload is needed (for ICC and GCC, but not for VC) to disambiguate
+/// code like this: <tt>aTree.combineExtended(bTree, MyCombineOp(...))</tt>.
+#ifndef _MSC_VER
+template<typename RootNodeType>
+template<typename ExtendedCombineOp>
+inline void
+Tree<RootNodeType>::combineExtended(Tree& other, const ExtendedCombineOp& op, bool prune)
+{
+    this->clearAllAccessors();
+    // The functor type is named explicitly as const for this overload.
+    typedef const ExtendedCombineOp ConstOpT;
+    mRoot.template combine<ConstOpT>(other.mRoot, op, prune);
+}
+#endif
+
+
+/// Wrap @a op in a CombineOpAdapter (with the other tree's value type as
+/// the B value type) and delegate to combine2Extended().
+template<typename RootNodeType>
+template<typename CombineOp, typename OtherTreeType>
+inline void
+Tree<RootNodeType>::combine2(const Tree& a, const OtherTreeType& b, CombineOp& op, bool prune)
+{
+    typedef typename OtherTreeType::ValueType OtherValueT;
+    CombineOpAdapter<ValueType, CombineOp, OtherValueT> adapter(op);
+    this->combine2Extended(a, b, adapter, prune);
+}
+
+
+/// @internal This overload is needed (for ICC and GCC, but not for VC) to disambiguate
+/// code like this: <tt>tree.combine2(aTree, bTree, MyCombineOp(...))</tt>.
+#ifndef _MSC_VER
+template<typename RootNodeType>
+template<typename CombineOp, typename OtherTreeType>
+inline void
+Tree<RootNodeType>::combine2(const Tree& a, const OtherTreeType& b, const CombineOp& op, bool prune)
+{
+    typedef typename OtherTreeType::ValueType OtherValueT;
+    CombineOpAdapter<ValueType, const CombineOp, OtherValueT> adapter(op);
+    this->combine2Extended(a, b, adapter, prune);
+}
+#endif
+
+
+/// Clear this tree's registered accessors, then have the root node run
+/// combine2() on the root nodes of @a a and @a b using @a op.
+template<typename RootNodeType>
+template<typename ExtendedCombineOp, typename OtherTreeType>
+inline void
+Tree<RootNodeType>::combine2Extended(const Tree& a, const OtherTreeType& b,
+    ExtendedCombineOp& op, bool prune)
+{
+    this->clearAllAccessors();
+    this->mRoot.combine2(a.root(), b.root(), op, prune);
+}
+
+
+/// @internal This overload is needed (for ICC and GCC, but not for VC) to disambiguate
+/// code like the following, where the functor argument is a temporary:
+/// <tt>tree.combine2Extended(aTree, bTree, MyCombineOp(...))</tt>.
+#ifndef _MSC_VER
+template<typename RootNodeType>
+template<typename ExtendedCombineOp, typename OtherTreeType>
+inline void
+Tree<RootNodeType>::combine2Extended(const Tree& a, const OtherTreeType& b,
+    const ExtendedCombineOp& op, bool prune)
+{
+    this->clearAllAccessors();
+    // The functor type is named explicitly as const for this overload.
+    typedef const ExtendedCombineOp ConstOpT;
+    mRoot.template combine2<ConstOpT>(a.root(), b.root(), op, prune);
+}
+#endif
+
+
+////////////////////////////////////////
+
+
+/// Non-const visit: clear all registered accessors, then hand @a op to the
+/// root node's visit<VisitorOp>().
+template<typename RootNodeType>
+template<typename VisitorOp>
+inline void
+Tree<RootNodeType>::visit(VisitorOp& op)
+{
+    this->clearAllAccessors();
+    this->mRoot.template visit<VisitorOp>(op);
+}
+
+
+/// Const visit: hand @a op to the root node's visit<VisitorOp>() without
+/// touching the accessors.
+template<typename RootNodeType>
+template<typename VisitorOp>
+inline void
+Tree<RootNodeType>::visit(VisitorOp& op) const
+{
+    this->mRoot.template visit<VisitorOp>(op);
+}
+
+
+/// @internal This overload is needed (for ICC and GCC, but not for VC) to disambiguate
+/// code like this: <tt>tree.visit(MyVisitorOp(...))</tt>.
+template<typename RootNodeType>
+template<typename VisitorOp>
+inline void
+Tree<RootNodeType>::visit(const VisitorOp& op)
+{
+    this->clearAllAccessors();
+    this->mRoot.template visit<const VisitorOp>(op);
+}
+
+
+/// @internal This overload is needed (for ICC and GCC, but not for VC) to disambiguate
+/// code like this: <tt>tree.visit(MyVisitorOp(...))</tt>.
+template<typename RootNodeType>
+template<typename VisitorOp>
+inline void
+Tree<RootNodeType>::visit(const VisitorOp& op) const
+{
+    this->mRoot.template visit<const VisitorOp>(op);
+}
+
+
+////////////////////////////////////////
+
+
+/// Non-const visit2: clear all registered accessors of this tree, then hand
+/// the other tree's root node and @a op to the root node's visit2<>().
+template<typename RootNodeType>
+template<typename OtherTreeType, typename VisitorOp>
+inline void
+Tree<RootNodeType>::visit2(OtherTreeType& other, VisitorOp& op)
+{
+    typedef typename OtherTreeType::RootNodeType OtherRootT;
+    this->clearAllAccessors();
+    mRoot.template visit2<OtherRootT, VisitorOp>(other.root(), op);
+}
+
+
+/// Const visit2: hand the other tree's root node and @a op to the root
+/// node's visit2<>() without touching the accessors.
+template<typename RootNodeType>
+template<typename OtherTreeType, typename VisitorOp>
+inline void
+Tree<RootNodeType>::visit2(OtherTreeType& other, VisitorOp& op) const
+{
+    typedef typename OtherTreeType::RootNodeType OtherRootT;
+    mRoot.template visit2<OtherRootT, VisitorOp>(other.root(), op);
+}
+
+
+/// @internal This overload is needed (for ICC and GCC, but not for VC) to disambiguate
+/// code like this: <tt>aTree.visit2(bTree, MyVisitorOp(...))</tt>.
+template<typename RootNodeType>
+template<typename OtherTreeType, typename VisitorOp>
+inline void
+Tree<RootNodeType>::visit2(OtherTreeType& other, const VisitorOp& op)
+{
+    typedef typename OtherTreeType::RootNodeType OtherRootT;
+    this->clearAllAccessors();
+    mRoot.template visit2<OtherRootT, const VisitorOp>(other.root(), op);
+}
+
+
+/// @internal This overload is needed (for ICC and GCC, but not for VC) to disambiguate
+/// code like this: <tt>aTree.visit2(bTree, MyVisitorOp(...))</tt>.
+template<typename RootNodeType>
+template<typename OtherTreeType, typename VisitorOp>
+inline void
+Tree<RootNodeType>::visit2(OtherTreeType& other, const VisitorOp& op) const
+{
+    typedef typename OtherTreeType::RootNodeType OtherRootT;
+    mRoot.template visit2<OtherRootT, const VisitorOp>(other.root(), op);
+}
+
+
+////////////////////////////////////////
+
+
+/// Return the name of this tree type, of the form
+/// "Tree_<build type>_<dim>_<dim>...", where the dims are the node log2
+/// dimensions below the root.  The string is built the first time it is
+/// requested and stored in sTreeTypeName.
+template<typename RootNodeType>
+inline const Name&
+Tree<RootNodeType>::treeType()
+{
+    if (sTreeTypeName == NULL) {
+        std::vector<Index> log2Dims;
+        Tree::getNodeLog2Dims(log2Dims);
+
+        std::ostringstream nameStream;
+        nameStream << "Tree_" << typeNameAsString<BuildType>();
+        // Index 0 belongs to the RootNode and is skipped.
+        const size_t numDims = log2Dims.size();
+        for (size_t level = 1; level < numDims; ++level) {
+            nameStream << "_" << log2Dims[level];
+        }
+
+        // Publish the name only if the pointer is still NULL; if another
+        // value was stored in the meantime, discard this copy.
+        Name* candidate = new Name(nameStream.str());
+        const Name* const previous = sTreeTypeName.compare_and_swap(candidate, NULL);
+        if (previous != NULL) delete candidate;
+    }
+    return *sTreeTypeName;
+}
+
+
+/// Return the result of the root node's hasSameTopology() applied to the
+/// other tree's root node.
+template<typename RootNodeType>
+template<typename OtherRootNodeType>
+inline bool
+Tree<RootNodeType>::hasSameTopology(const Tree<OtherRootNodeType>& other) const
+{
+    return this->mRoot.hasSameTopology(other.root());
+}
+
+
+/// Return the number of voxels inside the bounding box of the active voxels
+/// that are not themselves active, i.e. the box volume minus activeVoxelCount().
+template<typename RootNodeType>
+Index64
+Tree<RootNodeType>::inactiveVoxelCount() const
+{
+    Coord dim(0, 0, 0);
+    this->evalActiveVoxelDim(dim);
+    // Widen each extent to 64 bits before multiplying: the product of three
+    // 32-bit extents overflows for large bounding boxes.
+    const Index64
+        totalVoxels = Index64(dim.x()) * Index64(dim.y()) * Index64(dim.z()),
+        activeVoxels = this->activeVoxelCount();
+    assert(totalVoxels >= activeVoxels);
+    return totalVoxels - activeVoxels;
+}
+
+
+/// Reset @a bbox and, unless this tree is empty, have the root node compute
+/// its active bounding box with the second argument set to @c false.
+/// @return @c false if the tree is empty (bbox is left reset), @c true otherwise.
+template<typename RootNodeType>
+inline bool
+Tree<RootNodeType>::evalLeafBoundingBox(CoordBBox& bbox) const
+{
+    bbox.reset(); // default invalid bbox
+
+    const bool nonEmpty = !this->empty();
+    if (nonEmpty) {
+        mRoot.evalActiveBoundingBox(bbox, false);
+    }
+    return nonEmpty;
+}
+
+/// Reset @a bbox and, unless this tree is empty, have the root node compute
+/// its active bounding box with the second argument set to @c true.
+/// @return @c false if the tree is empty (bbox is left reset), @c true otherwise.
+template<typename RootNodeType>
+inline bool
+Tree<RootNodeType>::evalActiveVoxelBoundingBox(CoordBBox& bbox) const
+{
+    bbox.reset(); // default invalid bbox
+
+    const bool nonEmpty = !this->empty();
+    if (nonEmpty) {
+        mRoot.evalActiveBoundingBox(bbox, true);
+    }
+    return nonEmpty;
+}
+
+
+/// Store in @a dim the extents of the box computed by
+/// evalActiveVoxelBoundingBox() and return that function's result.
+/// @note @a dim is overwritten even when the result is @c false.
+template<typename RootNodeType>
+inline bool
+Tree<RootNodeType>::evalActiveVoxelDim(Coord& dim) const
+{
+    CoordBBox activeBox;
+    const bool nonEmpty = this->evalActiveVoxelBoundingBox(activeBox);
+    dim = activeBox.extents();
+    return nonEmpty;
+}
+
+
+/// Store in @a dim the extents of the box computed by evalLeafBoundingBox()
+/// and return that function's result.
+/// @note @a dim is overwritten even when the result is @c false.
+template<typename RootNodeType>
+inline bool
+Tree<RootNodeType>::evalLeafDim(Coord& dim) const
+{
+    CoordBBox leafBox;
+    const bool nonEmpty = this->evalLeafBoundingBox(leafBox);
+    dim = leafBox.extents();
+    return nonEmpty;
+}
+
+
+/// Compute the minimum and maximum over all values visited by the
+/// ValueOn iterator.  Both outputs are set to zeroVal<ValueType>() when
+/// that iterator visits nothing.
+template<typename RootNodeType>
+inline void
+Tree<RootNodeType>::evalMinMax(ValueType& minVal, ValueType& maxVal) const
+{
+    minVal = maxVal = zeroVal<ValueType>();
+
+    ValueOnCIter iter = this->cbeginValueOn();
+    if (!iter) return;
+
+    // Seed both extrema with the first value, then scan the rest.
+    minVal = maxVal = *iter;
+    ++iter;
+    while (iter) {
+        const ValueType& current = *iter;
+        if (current < minVal) minVal = current;
+        if (current > maxVal) maxVal = current;
+        ++iter;
+    }
+}
+
+
+/// Empty @a dims, then let RootNodeType::getNodeLog2Dims() fill it.
+template<typename RootNodeType>
+inline void
+Tree<RootNodeType>::getNodeLog2Dims(std::vector<Index>& dims)
+{
+    // Start from an empty vector so that earlier contents are discarded.
+    dims.clear();
+    RootNodeType::getNodeLog2Dims(dims);
+}
+
+/// Print a summary of this tree to @a os; larger @a verboseLevel values add costlier statistics.
+template<typename RootNodeType>
+inline void
+Tree<RootNodeType>::print(std::ostream& os, int verboseLevel) const
+{
+    if (verboseLevel <= 0) return; // nothing is printed at level 0 or below
+
+    /// @todo Consider using boost::io::ios_precision_saver instead.
+    struct OnExit {
+        std::ostream& os;
+        std::streamsize savedPrecision;
+        OnExit(std::ostream& _os): os(_os), savedPrecision(os.precision()) {}
+        ~OnExit() { os.precision(savedPrecision); }
+    };
+    OnExit restorePrecision(os); // puts back os's precision on every exit path
+
+    std::vector<Index> dims;
+    Tree::getNodeLog2Dims(dims);
+
+    os << "Information about Tree:\n"
+        << "  Type: " << this->type() << "\n";
+
+    os << "  Configuration:\n";
+
+    if (verboseLevel <= 1) {
+        // Print node types and sizes.
+        os << "    Root(" << mRoot.getTableSize() << ")";
+        if (dims.size() > 1) {
+            for (size_t i = 1, N = dims.size() - 1; i < N; ++i) {
+                os << ", Internal(" << (1 << dims[i]) << "^3)";
+            }
+            os << ", Leaf(" << (1 << *dims.rbegin()) << "^3)\n";
+        }
+        os << "  Background value: " << mRoot.background() << "\n";
+        return; // level 1 prints only the configuration and the background value
+    }
+
+    // The following is tree information that is expensive to extract.
+
+    ValueType minVal = zeroVal<ValueType>(), maxVal = zeroVal<ValueType>();
+    if (verboseLevel > 3) {
+        // This forces loading of all non-resident nodes.
+        this->evalMinMax(minVal, maxVal);
+    }
+
+    std::vector<Index64> nodeCount(dims.size()); // one counter per tree depth
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    Index64 unallocatedLeafCount = 0;
+#endif
+    for (NodeCIter it = cbeginNode(); it; ++it) {
+        ++(nodeCount[it.getDepth()]);
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+        if (it.getLevel() == 0) {
+            const LeafNodeType* leaf = NULL;
+            it.getNode(leaf);
+            if (leaf && !leaf->isAllocated()) ++unallocatedLeafCount;
+        }
+#endif
+    }
+    Index64 totalNodeCount = 0;
+    for (size_t i = 0; i < nodeCount.size(); ++i) totalNodeCount += nodeCount[i];
+
+    // Print node types, counts and sizes.
+    os << "    Root(1 x " << mRoot.getTableSize() << ")";
+    if (dims.size() > 1) {
+        for (size_t i = 1, N = dims.size() - 1; i < N; ++i) {
+            os << ", Internal(" << util::formattedInt(nodeCount[i]);
+            os << " x " << (1 << dims[i]) << "^3)";
+        }
+        os << ", Leaf(" << util::formattedInt(*nodeCount.rbegin());
+        os << " x " << (1 << *dims.rbegin()) << "^3)\n";
+    }
+    os << "  Background value: " << mRoot.background() << "\n";
+
+    // Statistics of topology and values
+
+    if (verboseLevel > 3) {
+        os << "  Min value: " << minVal << "\n";
+        os << "  Max value: " << maxVal << "\n";
+    }
+
+    const uint64_t
+        leafCount = *nodeCount.rbegin(),
+        numActiveVoxels = this->activeVoxelCount(),
+        numActiveLeafVoxels = this->activeLeafVoxelCount(),
+        numActiveTiles = this->activeTileCount();
+
+    os << "  Number of active voxels:       " << util::formattedInt(numActiveVoxels) << "\n";
+    os << "  Number of active tiles:        " << util::formattedInt(numActiveTiles) << "\n";
+
+    Coord dim(0, 0, 0);
+    uint64_t totalVoxels = 0;
+    if (numActiveVoxels) { // nonempty
+        CoordBBox bbox;
+        this->evalActiveVoxelBoundingBox(bbox);
+        dim = bbox.extents();
+        totalVoxels = dim.x() * uint64_t(dim.y()) * dim.z(); // product of the bbox extents
+
+        os << "  Bounding box of active voxels: " << bbox << "\n";
+        os << "  Dimensions of active voxels:   "
+            << dim[0] << " x " << dim[1] << " x " << dim[2] << "\n";
+
+        const double activeRatio = (100.0 * double(numActiveVoxels)) / double(totalVoxels);
+        os << "  Percentage of active voxels:   " << std::setprecision(3) << activeRatio << "%\n";
+
+        if (leafCount > 0) {
+            const double fillRatio = (100.0 * double(numActiveLeafVoxels))
+                / (double(leafCount) * double(LeafNodeType::NUM_VOXELS));
+            os << "  Average leaf node fill ratio:  " << fillRatio << "%\n";
+        }
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+        if (verboseLevel > 2) {
+            os << "  Number of unallocated nodes:   "
+                << util::formattedInt(unallocatedLeafCount) << " ("
+                << (100.0 * double(unallocatedLeafCount) / double(totalNodeCount)) << "%)\n";
+        }
+#endif
+    } else {
+        os << "  Tree is empty!\n";
+    }
+    os << std::flush;
+
+    if (verboseLevel == 2) return; // level 2 stops before the memory footprint section
+
+    // Memory footprint in bytes
+    const uint64_t
+        actualMem = this->memUsage(),
+        denseMem = sizeof(ValueType) * totalVoxels,
+        voxelsMem = sizeof(ValueType) * numActiveLeafVoxels;
+            ///< @todo not accurate for BoolTree (and probably should count tile values)
+
+    os << "Memory footprint:\n";
+    util::printBytes(os, actualMem, "  Actual:             ");
+    util::printBytes(os, voxelsMem, "  Active leaf voxels: ");
+
+    if (numActiveVoxels) {
+        util::printBytes(os, denseMem, "  Dense equivalent:   ");
+        os << "  Actual footprint is " << (100.0 * double(actualMem) / double(denseMem))
+            << "% of an equivalent dense volume\n";
+        os << "  Leaf voxel footprint is " << (100.0 * double(voxelsMem) / double(actualMem))
+           << "% of actual footprint\n";
+    }
+}
+
+} // namespace tree
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TREE_TREE_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tree/TreeIterator.h b/nuparu/include/openvdb_new/tree/TreeIterator.h
new file mode 100644
index 00000000..aadc1414
--- /dev/null
+++ b/nuparu/include/openvdb_new/tree/TreeIterator.h
@@ -0,0 +1,1405 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file TreeIterator.h
+
+#ifndef OPENVDB_TREE_TREEITERATOR_HAS_BEEN_INCLUDED
+#define OPENVDB_TREE_TREEITERATOR_HAS_BEEN_INCLUDED
+
+#include <boost/mpl/front.hpp>
+#include <boost/mpl/pop_front.hpp>
+#include <boost/mpl/push_back.hpp>
+#include <boost/mpl/size.hpp>
+#include <boost/mpl/vector.hpp>
+#include <boost/static_assert.hpp>
+#include <boost/type_traits/remove_const.hpp>
+#include <tbb/blocked_range.h>
+#include <tbb/parallel_for.h>
+#include <openvdb/version.h>
+#include <openvdb/Types.h>
+
+// Prior to 0.96.1, depth-bounded value iterators always descended to the leaf level
+// and iterated past leaf nodes.  Now, they never descend past the maximum depth.
+// Comment out the following line to restore the older, less-efficient behavior:
+#define ENABLE_TREE_VALUE_DEPTH_BOUND_OPTIMIZATION
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tree {
+
+/// @brief CopyConstness<T1, T2>::Type is either const T2 or T2 with no const qualifier,
+/// depending on whether T1 is const.  For example,
+/// - CopyConstness<int, int>::Type is int
+/// - CopyConstness<int, const int>::Type is int
+/// - CopyConstness<const int, int>::Type is const int
+/// - CopyConstness<const int, const int>::Type is const int
+template<typename FromType, typename ToType> struct CopyConstness {
+    typedef typename boost::remove_const<ToType>::type Type; // FromType not const: strip const
+};
+template<typename FromType, typename ToType> struct CopyConstness<const FromType, ToType> {
+    typedef const ToType Type; // FromType is const: make the result const as well
+};
+
+
+////////////////////////////////////////
+
+
+namespace iter {
+/// Type is an mpl::vector of HeadT's ChildNodeType chain, deepest node type first, HeadT last.
+template<typename HeadT, int HeadLevel>
+struct InvertedTree {
+    typedef typename InvertedTree<typename HeadT::ChildNodeType, HeadLevel-1>::Type SubtreeT;
+    typedef typename boost::mpl::push_back<SubtreeT, HeadT>::type Type; // HeadT goes last
+};
+template<typename HeadT>
+struct InvertedTree<HeadT, /*HeadLevel=*/1> { // base case: stop recursing at HeadLevel 1
+    typedef typename boost::mpl::vector<typename HeadT::ChildNodeType, HeadT>::type Type;
+};
+
+} // namespace iter
+
+
+////////////////////////////////////////
+
+
+/// @brief IterTraits provides the following for iterators of the standard types,
+/// i.e., for {Child,Value}{On,Off,All}{Iter,CIter}:
+/// - a NodeConverter template to convert an iterator for one type of node
+///   to an iterator of the same type for another type of node; for example,
+///   IterTraits<RootNode, RootNode::ValueOnIter>::NodeConverter<LeafNode>::Type
+///   is synonymous with LeafNode::ValueOnIter.
+/// - a begin(node) function that returns a begin iterator for a node of arbitrary type;
+///   for example, IterTraits<LeafNode, LeafNode::ValueOnIter>::begin(leaf) returns
+///   leaf.beginValueOn()
+/// - a getChild() function that returns a pointer to the child node to which the iterator
+///   is currently pointing (declared only by the ChildOn and ChildAll specializations)
+template<typename NodeT, typename IterT>
+struct IterTraits // primary template: its getChild() always returns NULL
+{
+    template<typename ChildT> static ChildT* getChild(const IterT&) { return NULL; }
+};
+
+template<typename NodeT>
+struct IterTraits<NodeT, typename NodeT::ChildOnIter>
+{
+    typedef typename NodeT::ChildOnIter IterT;
+    /// NodeConverter<N>::Type names the ChildOnIter of node type N.
+    template<typename OtherNodeT>
+    struct NodeConverter { typedef typename OtherNodeT::ChildOnIter Type; };
+    /// Return @a parent.beginChildOn().
+    static IterT begin(NodeT& parent) { return parent.beginChildOn(); }
+    /// Return the address of the child obtained from @a it.getValue().
+    template<typename ChildT> static ChildT* getChild(const IterT& it) { return &it.getValue(); }
+};
+
+template<typename NodeT>
+struct IterTraits<NodeT, typename NodeT::ChildOnCIter>
+{
+    typedef typename NodeT::ChildOnCIter IterT;
+    /// NodeConverter<N>::Type names the ChildOnCIter of node type N.
+    template<typename OtherNodeT>
+    struct NodeConverter { typedef typename OtherNodeT::ChildOnCIter Type; };
+    static IterT begin(const NodeT& parent) { return parent.cbeginChildOn(); }
+    /// Return, as a pointer to const, the address of the child obtained from @a it.getValue().
+    template<typename ChildT>
+    static const ChildT* getChild(const IterT& it) { return &it.getValue(); }
+};
+
+template<typename NodeT>
+struct IterTraits<NodeT, typename NodeT::ChildOffIter>
+{
+    typedef typename NodeT::ChildOffIter IterT;
+    /// NodeConverter<N>::Type names the ChildOffIter of node type N.
+    template<typename OtherNodeT>
+    struct NodeConverter { typedef typename OtherNodeT::ChildOffIter Type; };
+    static IterT begin(NodeT& parent) { return parent.beginChildOff(); }
+};
+
+template<typename NodeT>
+struct IterTraits<NodeT, typename NodeT::ChildOffCIter>
+{
+    typedef typename NodeT::ChildOffCIter IterT;
+    /// NodeConverter<N>::Type names the ChildOffCIter of node type N.
+    template<typename OtherNodeT>
+    struct NodeConverter { typedef typename OtherNodeT::ChildOffCIter Type; };
+    static IterT begin(const NodeT& parent) { return parent.cbeginChildOff(); }
+};
+
+template<typename NodeT>
+struct IterTraits<NodeT, typename NodeT::ChildAllIter>
+{
+    typedef typename NodeT::ChildAllIter IterT;
+    /// NodeConverter<N>::Type names the ChildAllIter of node type N.
+    template<typename OtherNodeT>
+    struct NodeConverter { typedef typename OtherNodeT::ChildAllIter Type; };
+    static IterT begin(NodeT& parent) { return parent.beginChildAll(); }
+    template<typename ChildT> static ChildT* getChild(const IterT& it) {
+        typename IterT::NonConstValueType unused; // handed to probeChild(), otherwise ignored
+        return it.probeChild(unused);
+    }
+};
+
+template<typename NodeT>
+struct IterTraits<NodeT, typename NodeT::ChildAllCIter>
+{
+    typedef typename NodeT::ChildAllCIter IterT;
+    /// NodeConverter<N>::Type names the ChildAllCIter of node type N.
+    template<typename OtherNodeT>
+    struct NodeConverter { typedef typename OtherNodeT::ChildAllCIter Type; };
+    static IterT begin(const NodeT& parent) { return parent.cbeginChildAll(); }
+    template<typename ChildT> static ChildT* getChild(const IterT& it) {
+        typename IterT::NonConstValueType unused; // handed to probeChild(), otherwise ignored
+        return it.probeChild(unused);
+    }
+};
+
+template<typename NodeT>
+struct IterTraits<NodeT, typename NodeT::ValueOnIter>
+{
+    typedef typename NodeT::ValueOnIter IterT;
+    /// NodeConverter<N>::Type names the ValueOnIter of node type N.
+    template<typename OtherNodeT>
+    struct NodeConverter { typedef typename OtherNodeT::ValueOnIter Type; };
+    static IterT begin(NodeT& parent) { return parent.beginValueOn(); }
+};
+
+template<typename NodeT>
+struct IterTraits<NodeT, typename NodeT::ValueOnCIter>
+{
+    typedef typename NodeT::ValueOnCIter IterT;
+    /// NodeConverter<N>::Type names the ValueOnCIter of node type N.
+    template<typename OtherNodeT>
+    struct NodeConverter { typedef typename OtherNodeT::ValueOnCIter Type; };
+    static IterT begin(const NodeT& parent) { return parent.cbeginValueOn(); }
+};
+
+template<typename NodeT>
+struct IterTraits<NodeT, typename NodeT::ValueOffIter>
+{
+    typedef typename NodeT::ValueOffIter IterT;
+    /// NodeConverter<N>::Type names the ValueOffIter of node type N.
+    template<typename OtherNodeT>
+    struct NodeConverter { typedef typename OtherNodeT::ValueOffIter Type; };
+    static IterT begin(NodeT& parent) { return parent.beginValueOff(); }
+};
+
+template<typename NodeT>
+struct IterTraits<NodeT, typename NodeT::ValueOffCIter>
+{
+    typedef typename NodeT::ValueOffCIter IterT;
+    /// NodeConverter<N>::Type names the ValueOffCIter of node type N.
+    template<typename OtherNodeT>
+    struct NodeConverter { typedef typename OtherNodeT::ValueOffCIter Type; };
+    static IterT begin(const NodeT& parent) { return parent.cbeginValueOff(); }
+};
+
+template<typename NodeT>
+struct IterTraits<NodeT, typename NodeT::ValueAllIter>
+{
+    typedef typename NodeT::ValueAllIter IterT;
+    /// NodeConverter<N>::Type names the ValueAllIter of node type N.
+    template<typename OtherNodeT>
+    struct NodeConverter { typedef typename OtherNodeT::ValueAllIter Type; };
+    static IterT begin(NodeT& parent) { return parent.beginValueAll(); }
+};
+
+template<typename NodeT>
+struct IterTraits<NodeT, typename NodeT::ValueAllCIter>
+{
+    typedef typename NodeT::ValueAllCIter IterT;
+    /// NodeConverter<N>::Type names the ValueAllCIter of node type N.
+    template<typename OtherNodeT>
+    struct NodeConverter { typedef typename OtherNodeT::ValueAllCIter Type; };
+    static IterT begin(const NodeT& parent) { return parent.cbeginValueAll(); }
+};
+
+
+////////////////////////////////////////
+
+
+/// @brief An IterListItem is an element of a compile-time linked list of iterators
+/// to nodes of different types.
+///
+/// The list is constructed by traversing the template hierarchy of a Tree in reverse order,
+/// so typically the elements will be a LeafNode iterator of some type (e.g., ValueOnCIter),
+/// followed by one or more InternalNode iterators of the same type, followed by a RootNode
+/// iterator of the same type.
+///
+/// The length of the list is fixed at compile time, and because it is implemented using
+/// nested, templated classes, much of the list traversal logic can be optimized away.
+template<typename PrevItemT, typename NodeVecT, size_t VecSize, Index _Level>
+class IterListItem
+{
+public:
+    /// The type of iterator stored in the previous list item
+    typedef typename PrevItemT::IterT PrevIterT;
+    /// The type of node (non-const) whose iterator is stored in this list item
+    typedef typename boost::mpl::front<NodeVecT>::type _NodeT;
+    /// The type of iterator stored in this list item (e.g., InternalNode::ValueOnCIter)
+    typedef typename IterTraits<typename PrevIterT::NonConstNodeType, PrevIterT>::template
+        NodeConverter<_NodeT>::Type IterT;
+
+    /// The type of node (const or non-const) over which IterT iterates (e.g., const RootNode<...>)
+    typedef typename IterT::NodeType NodeT;
+    /// The type of the node with const qualifiers removed ("Non-Const")
+    typedef typename IterT::NonConstNodeType NCNodeT;
+    /// The type of value (with const qualifiers removed) to which the iterator points
+    typedef typename IterT::NonConstValueType NCValueT;
+    /// NodeT's child node type, with the same constness (e.g., const InternalNode<...>)
+    typedef typename CopyConstness<NodeT, typename NodeT::ChildNodeType>::Type ChildT;
+    /// NodeT's child node type with const qualifiers removed
+    typedef typename CopyConstness<NCNodeT, typename NCNodeT::ChildNodeType>::Type NCChildT;
+    typedef IterTraits<NCNodeT, IterT> ITraits;
+    /// NodeT's level in its tree (0 = LeafNode)
+    static const Index Level = _Level;
+
+    IterListItem(PrevItemT* prev): mNext(this), mPrev(prev) {} // mNext points back at this item
+
+    IterListItem(const IterListItem& other): mIter(other.mIter), mNext(other.mNext), mPrev(NULL) {}
+    IterListItem& operator=(const IterListItem& other)
+    {
+        if (&other != this) {
+            mIter = other.mIter;
+            mNext = other.mNext;
+            mPrev = NULL; ///< @note external call to updateBackPointers() required
+        }
+        return *this;
+    }
+
+    void updateBackPointers(PrevItemT* prev) { mPrev = prev; mNext.updateBackPointers(this); }
+
+    void setIter(const IterT& iter) { mIter = iter; }
+    template<typename OtherIterT>
+    void setIter(const OtherIterT& iter) { mNext.setIter(iter); } // other type: pass it on
+
+    /// Set @a node to this element's iterator's parent node, or to NULL if @a lvl > Level.
+    void getNode(Index lvl, NodeT*& node) const
+    {
+        node = (lvl <= Level) ? mIter.getParentNode() : NULL;
+    }
+    /// Forward to the following list elements if @a node is not of this element's node type.
+    template<typename OtherNodeT>
+    void getNode(Index lvl, OtherNodeT*& node) const { mNext.getNode(lvl, node); }
+
+    /// @brief Initialize the iterator for level @a lvl of the tree with the node
+    /// over which the corresponding iterator of @a otherListItem is iterating.
+    ///
+    /// For example, if @a otherListItem contains a LeafNode::ValueOnIter,
+    /// initialize this list's leaf iterator with the same LeafNode.
+    template<typename OtherIterListItemT>
+    void initLevel(Index lvl, OtherIterListItemT& otherListItem)
+    {
+        if (lvl == Level) {
+            const NodeT* node = NULL;
+            otherListItem.getNode(lvl, node);
+            mIter = (node == NULL) ? IterT() : ITraits::begin(*const_cast<NodeT*>(node));
+        } else {
+            // Forward to one of the following list elements.
+            mNext.initLevel(lvl, otherListItem);
+        }
+    }
+
+    /// Return the table offset of the iterator at level @a lvl of the tree.
+    Index pos(Index lvl) const { return (lvl == Level) ? mIter.pos() : mNext.pos(lvl); }
+
+    /// Return @c true if the iterator at level @a lvl of the tree has not yet reached its end.
+    bool test(Index lvl) const { return (lvl == Level) ? mIter.test() : mNext.test(lvl); }
+
+    /// Increment the iterator at level @a lvl of the tree.
+    bool next(Index lvl) { return (lvl == Level) ? mIter.next() : mNext.next(lvl); }
+
+    /// @brief If the iterator at level @a lvl of the tree points to a child node,
+    /// initialize the next iterator in this list with that child node.
+    bool down(Index lvl)
+    {
+        if (lvl == Level && mPrev != NULL && mIter) {
+            if (ChildT* child = ITraits::template getChild<ChildT>(mIter)) {
+                mPrev->setIter(PrevItemT::ITraits::begin(*child));
+                return true;
+            }
+        }
+        return (lvl > Level) ? mNext.down(lvl) : false; // forward only to higher levels
+    }
+
+    /// @brief Return the global coordinates of the voxel or tile to which the iterator
+    /// at level @a lvl of the tree is currently pointing.
+    Coord getCoord(Index lvl) const
+    {
+        return (lvl == Level) ? mIter.getCoord() : mNext.getCoord(lvl);
+    }
+    Index getChildDim(Index lvl) const
+    {
+        return (lvl == Level) ? NodeT::getChildDim() : mNext.getChildDim(lvl);
+    }
+    /// Return the number of (virtual) voxels spanned by a tile value or child node
+    Index64 getVoxelCount(Index lvl) const
+    {
+        return (lvl == Level) ? ChildT::NUM_VOXELS : mNext.getVoxelCount(lvl);
+    }
+
+    /// Return @c true if the iterator at level @a lvl of the tree points to an active value.
+    bool isValueOn(Index lvl) const
+    {
+        return (lvl == Level) ? mIter.isValueOn() : mNext.isValueOn(lvl);
+    }
+
+    /// Return the value to which the iterator at level @a lvl of the tree points.
+    const NCValueT& getValue(Index lvl) const
+    {
+        if (lvl == Level) return mIter.getValue();
+        return mNext.getValue(lvl);
+    }
+
+    /// @brief Set the value (to @a val) to which the iterator at level @a lvl
+    /// of the tree points and mark the value as active.
+    /// @note Not valid when @c IterT is a const iterator type
+    void setValue(Index lvl, const NCValueT& val) const
+    {
+        if (lvl == Level) mIter.setValue(val); else mNext.setValue(lvl, val);
+    }
+    /// @brief Mark the value to which the iterator at level @a lvl of the tree
+    /// points as active if @a on is @c true, or as inactive otherwise.
+    /// @note Not valid when @c IterT is a const iterator type
+    void setValueOn(Index lvl, bool on = true) const
+    {
+        if (lvl == Level) mIter.setValueOn(on); else mNext.setValueOn(lvl, on);
+    }
+    /// @brief Mark the value to which the iterator at level @a lvl of the tree points
+    /// as inactive.
+    /// @note Not valid when @c IterT is a const iterator type
+    void setValueOff(Index lvl) const
+    {
+        if (lvl == Level) mIter.setValueOff(); else mNext.setValueOff(lvl);
+    }
+
+    /// @brief Apply a functor to the value to which the iterator at level @a lvl points.
+    /// @note Not valid when @c IterT is a const iterator type
+    template<typename ModifyOp>
+    void modifyValue(Index lvl, const ModifyOp& op) const
+    {
+        if (lvl == Level) mIter.modifyValue(op); else mNext.modifyValue(lvl, op);
+    }
+
+private:
+    typedef typename boost::mpl::pop_front<NodeVecT>::type RestT; // NodeVecT minus its first item
+    typedef IterListItem<IterListItem, RestT, VecSize - 1, Level + 1> NextItem;
+
+    IterT mIter; // the iterator for this element's level
+    NextItem mNext; // the list element for level Level + 1
+    PrevItemT* mPrev; // the preceding list element; reset to NULL by copy and assignment
+};
+
+
+/// The initial element of a compile-time linked list of iterators to nodes of different types
+template<typename PrevItemT, typename NodeVecT, size_t VecSize>
+class IterListItem<PrevItemT, NodeVecT, VecSize, /*Level=*/0U>
+{
+public:
+    /// The type of iterator stored in the previous list item
+    typedef typename PrevItemT::IterT PrevIterT;
+    /// The type of node (non-const) whose iterator is stored in this list item
+    typedef typename boost::mpl::front<NodeVecT>::type _NodeT;
+    /// The type of iterator stored in this list item (e.g., InternalNode::ValueOnCIter)
+    typedef typename IterTraits<typename PrevIterT::NonConstNodeType, PrevIterT>::template
+        NodeConverter<_NodeT>::Type IterT;
+
+    /// The type of node (const or non-const) over which IterT iterates (e.g., const RootNode<...>)
+    typedef typename IterT::NodeType NodeT;
+    /// The type of the node with const qualifiers removed ("Non-Const")
+    typedef typename IterT::NonConstNodeType NCNodeT;
+    /// The type of value (with const qualifiers removed) to which the iterator points
+    typedef typename IterT::NonConstValueType NCValueT;
+    typedef IterTraits<NCNodeT, IterT> ITraits;
+    /// NodeT's level in its tree (0 = LeafNode)
+    static const Index Level = 0;
+
+    IterListItem(PrevItemT*): mNext(this), mPrev(NULL) {} // argument ignored: mPrev stays NULL
+
+    IterListItem(const IterListItem& other): mIter(other.mIter), mNext(other.mNext), mPrev(NULL) {}
+    IterListItem& operator=(const IterListItem& other)
+    {
+        if (&other != this) {
+            mIter = other.mIter;
+            mNext = other.mNext;
+            mPrev = NULL;
+        }
+        return *this;
+    }
+
+    void updateBackPointers(PrevItemT* = NULL) { mPrev = NULL; mNext.updateBackPointers(this); }
+
+    void setIter(const IterT& iter) { mIter = iter; }
+    template<typename OtherIterT>
+    void setIter(const OtherIterT& iter) { mNext.setIter(iter); } // other type: pass it on
+
+    void getNode(Index lvl, NodeT*& node) const
+    {
+        node = (lvl == 0) ? mIter.getParentNode() : NULL;
+    }
+    template<typename OtherNodeT>
+    void getNode(Index lvl, OtherNodeT*& node) const { mNext.getNode(lvl, node); }
+
+    template<typename OtherIterListItemT>
+    void initLevel(Index lvl, OtherIterListItemT& otherListItem)
+    {
+        if (lvl == 0) {
+            const NodeT* node = NULL;
+            otherListItem.getNode(lvl, node);
+            mIter = (node == NULL) ? IterT() : ITraits::begin(*const_cast<NodeT*>(node));
+        } else {
+            mNext.initLevel(lvl, otherListItem); // forward to the following list elements
+        }
+    }
+
+    Index pos(Index lvl) const { return (lvl == 0) ? mIter.pos() : mNext.pos(lvl); }
+
+    bool test(Index lvl) const { return (lvl == 0) ? mIter.test() : mNext.test(lvl); }
+
+    bool next(Index lvl) { return (lvl == 0) ? mIter.next() : mNext.next(lvl); }
+
+    bool down(Index lvl) { return (lvl == 0) ? false : mNext.down(lvl); } // never from level 0
+
+    Coord getCoord(Index lvl) const
+    {
+        return (lvl == 0) ?  mIter.getCoord() : mNext.getCoord(lvl);
+    }
+    Index getChildDim(Index lvl) const
+    {
+        return (lvl == 0) ? NodeT::getChildDim() : mNext.getChildDim(lvl);
+    }
+
+    Index64 getVoxelCount(Index lvl) const
+    {
+        return (lvl == 0) ? 1 : mNext.getVoxelCount(lvl); // a level-0 value counts as one voxel
+    }
+
+    bool isValueOn(Index lvl) const
+    {
+        return (lvl == 0) ? mIter.isValueOn() : mNext.isValueOn(lvl);
+    }
+
+    const NCValueT& getValue(Index lvl) const
+    {
+        if (lvl == 0) return mIter.getValue();
+        return mNext.getValue(lvl);
+    }
+
+    void setValue(Index lvl, const NCValueT& val) const
+    {
+        if (lvl == 0) mIter.setValue(val); else mNext.setValue(lvl, val);
+    }
+    void setValueOn(Index lvl, bool on = true) const
+    {
+        if (lvl == 0) mIter.setValueOn(on); else mNext.setValueOn(lvl, on);
+    }
+    void setValueOff(Index lvl) const
+    {
+        if (lvl == 0) mIter.setValueOff(); else mNext.setValueOff(lvl);
+    }
+
+    template<typename ModifyOp>
+    void modifyValue(Index lvl, const ModifyOp& op) const
+    {
+        if (lvl == 0) mIter.modifyValue(op); else mNext.modifyValue(lvl, op);
+    }
+
+private:
+    typedef typename boost::mpl::pop_front<NodeVecT>::type RestT; // NodeVecT minus its first item
+    typedef IterListItem<IterListItem, RestT, VecSize - 1, /*Level=*/1> NextItem;
+
+    IterT mIter; // the level-0 iterator
+    NextItem mNext; // the list element for level 1
+    PrevItemT* mPrev; // always NULL in this specialization
+};
+
+
+/// The final element of a compile-time linked list of iterators to nodes of different types
+template<typename PrevItemT, typename NodeVecT, Index _Level>
+class IterListItem<PrevItemT, NodeVecT, /*VecSize=*/1, _Level>
+{
+public:
+    typedef typename boost::mpl::front<NodeVecT>::type _NodeT;
+    /// The type of iterator stored in the previous list item
+    typedef typename PrevItemT::IterT PrevIterT;
+    /// The type of iterator stored in this list item (e.g., RootNode::ValueOnCIter)
+    typedef typename IterTraits<typename PrevIterT::NonConstNodeType, PrevIterT>::template
+        NodeConverter<_NodeT>::Type IterT;
+
+    /// The type of node over which IterT iterates (e.g., const RootNode<...>)
+    typedef typename IterT::NodeType NodeT;
+    /// The type of the node with const qualifiers removed ("Non-Const")
+    typedef typename IterT::NonConstNodeType NCNodeT;
+    /// The type of value (with const qualifiers removed) to which the iterator points
+    typedef typename IterT::NonConstValueType NCValueT;
+    /// NodeT's child node type, with the same constness (e.g., const InternalNode<...>)
+    typedef typename CopyConstness<NodeT, typename NodeT::ChildNodeType>::Type ChildT;
+    /// NodeT's child node type with const qualifiers removed
+    typedef typename CopyConstness<NCNodeT, typename NCNodeT::ChildNodeType>::Type NCChildT;
+    typedef IterTraits<NCNodeT, IterT> ITraits;
+    /// NodeT's level in its tree (0 = LeafNode)
+    static const Index Level = _Level;
+
+    IterListItem(PrevItemT* prev): mPrev(prev) {} // last element: there is no mNext member
+
+    IterListItem(const IterListItem& other): mIter(other.mIter), mPrev(NULL) {}
+    IterListItem& operator=(const IterListItem& other)
+    {
+        if (&other != this) {
+            mIter = other.mIter;
+            mPrev = NULL; ///< @note external call to updateBackPointers() required
+        }
+        return *this;
+    }
+
+    void updateBackPointers(PrevItemT* prev) { mPrev = prev; }
+
+    // The following method specializations differ from the default template implementations
+    // mainly in that they don't forward: a level other than Level gets a default or a no-op.
+
+    void setIter(const IterT& iter) { mIter = iter; }
+
+    void getNode(Index lvl, NodeT*& node) const
+    {
+        node = (lvl <= Level) ? mIter.getParentNode() : NULL;
+    }
+
+    template<typename OtherIterListItemT>
+    void initLevel(Index lvl, OtherIterListItemT& otherListItem)
+    {
+        if (lvl == Level) {
+            const NodeT* node = NULL;
+            otherListItem.getNode(lvl, node);
+            mIter = (node == NULL) ? IterT() : ITraits::begin(*const_cast<NodeT*>(node));
+        }
+    }
+
+    Index pos(Index lvl) const { return (lvl == Level) ? mIter.pos() : Index(-1); }
+
+    bool test(Index lvl) const { return (lvl == Level) ? mIter.test() : false; }
+
+    bool next(Index lvl) { return (lvl == Level) ? mIter.next() : false; }
+
+    bool down(Index lvl)
+    {
+        if (lvl == Level && mPrev != NULL && mIter) {
+            if (ChildT* child = ITraits::template getChild<ChildT>(mIter)) {
+                mPrev->setIter(PrevItemT::ITraits::begin(*child)); // restart the level below
+                return true;
+            }
+        }
+        return false;
+    }
+
+    Coord getCoord(Index lvl) const { return (lvl == Level) ? mIter.getCoord() : Coord(); }
+    Index getChildDim(Index lvl) const { return (lvl == Level) ? NodeT::getChildDim() : 0; }
+    Index64 getVoxelCount(Index lvl) const { return (lvl == Level) ? ChildT::NUM_VOXELS : 0; }
+
+    bool isValueOn(Index lvl) const { return (lvl == Level) ? mIter.isValueOn() : false; }
+
+    const NCValueT& getValue(Index lvl) const
+    {
+        assert(lvl == Level); // nothing to forward to, so any other level is a caller error
+        (void)lvl; // avoid unused variable warning in optimized builds
+        return mIter.getValue();
+    }
+
+    void setValue(Index lvl, const NCValueT& val) const { if (lvl == Level) mIter.setValue(val); }
+    void setValueOn(Index lvl, bool on = true) const { if (lvl == Level) mIter.setValueOn(on); }
+    void setValueOff(Index lvl) const { if (lvl == Level) mIter.setValueOff(); }
+
+    template<typename ModifyOp>
+    void modifyValue(Index lvl, const ModifyOp& op) const
+    {
+        if (lvl == Level) mIter.modifyValue(op);
+    }
+
+private:
+    IterT mIter; // the iterator for this element's level
+    PrevItemT* mPrev; // the preceding list element; reset to NULL by copy and assignment
+};
+
+
+////////////////////////////////////////
+
+
+//#define DEBUG_TREE_VALUE_ITERATOR
+
+/// @brief Base class for tree-traversal iterators over tile and voxel values
+template<typename _TreeT, typename _ValueIterT>
+class TreeValueIteratorBase
+{
+public:
+    typedef _TreeT TreeT;
+    typedef _ValueIterT ValueIterT;
+    typedef typename ValueIterT::NodeType NodeT;
+    typedef typename ValueIterT::NonConstValueType ValueT;
+    typedef typename NodeT::ChildOnCIter ChildOnIterT;
+    static const Index ROOT_LEVEL = NodeT::LEVEL;
+    BOOST_STATIC_ASSERT(ValueIterT::NodeType::LEVEL == ROOT_LEVEL);
+    static const Index LEAF_LEVEL = 0, ROOT_DEPTH = 0, LEAF_DEPTH = ROOT_LEVEL;
+
+    TreeValueIteratorBase(TreeT&);
+
+    TreeValueIteratorBase(const TreeValueIteratorBase& other);
+    TreeValueIteratorBase& operator=(const TreeValueIteratorBase& other);
+
+    /// Specify the depth of the highest level of the tree to which to ascend (depth 0 = root).
+    void setMinDepth(Index minDepth);
+    /// Return the depth of the highest level of the tree to which this iterator ascends.
+    Index getMinDepth() const { return ROOT_LEVEL - Index(mMaxLevel); }
+    /// Specify the depth of the lowest level of the tree to which to descend (depth 0 = root).
+    void setMaxDepth(Index maxDepth);
+    /// Return the depth of the lowest level of the tree to which this iterator descends.
+    Index getMaxDepth() const { return ROOT_LEVEL - Index(mMinLevel); }
+
+    //@{
+    /// Return @c true if this iterator is not yet exhausted.
+    bool test() const { return mValueIterList.test(mLevel); }
+    operator bool() const { return this->test(); }
+    //@}
+
+    /// @brief Advance to the next tile or voxel value.
+    /// Return @c true if this iterator is not yet exhausted.
+    bool next();
+    /// Advance to the next tile or voxel value.
+    TreeValueIteratorBase& operator++() { this->next(); return *this; }
+
+    /// @brief Return the level in the tree (0 = leaf) of the node to which
+    /// this iterator is currently pointing.
+    Index getLevel() const { return mLevel; }
+    /// @brief Return the depth in the tree (0 = root) of the node to which
+    /// this iterator is currently pointing.
+    Index getDepth() const { return ROOT_LEVEL - mLevel; }
+    static Index getLeafDepth() { return LEAF_DEPTH; }
+
+    /// @brief Return in @a node a pointer to the node over which this iterator is
+    /// currently iterating or one of that node's parents, as determined by @a NodeType.
+    /// @return a null pointer if @a NodeType specifies a node at a lower level
+    /// of the tree than that given by getLevel().
+    template<typename NodeType>
+    void getNode(NodeType*& node) const { mValueIterList.getNode(mLevel, node); }
+
+    /// @brief Return the global coordinates of the voxel or tile to which
+    /// this iterator is currently pointing.
+    Coord getCoord() const { return mValueIterList.getCoord(mLevel); }
+    /// @brief Return in @a bbox the axis-aligned bounding box of
+    /// the voxel or tile to which this iterator is currently pointing.
+    /// @return false if the bounding box is empty.
+    bool getBoundingBox(CoordBBox&) const;
+    /// @brief Return the axis-aligned bounding box of the voxel or tile to which
+    /// this iterator is currently pointing.
+    CoordBBox getBoundingBox() const { CoordBBox b; this->getBoundingBox(b); return b; }
+
+    /// Return the number of (virtual) voxels corresponding to the value
+    Index64 getVoxelCount() const { return mValueIterList.getVoxelCount(mLevel);}
+
+    /// Return @c true if this iterator is currently pointing to a (non-leaf) tile value.
+    bool isTileValue() const { return mLevel != 0 && this->test(); }
+    /// Return @c true if this iterator is currently pointing to a (leaf) voxel value.
+    bool isVoxelValue() const { return mLevel == 0 && this->test(); }
+    /// Return @c true if the value to which this iterator is currently pointing is active.
+    bool isValueOn() const { return mValueIterList.isValueOn(mLevel); }
+
+    //@{
+    /// Return the tile or voxel value to which this iterator is currently pointing.
+    const ValueT& getValue() const { return mValueIterList.getValue(mLevel); }
+    const ValueT& operator*() const { return this->getValue(); }
+    const ValueT* operator->() const { return &(this->operator*()); }
+    //@}
+
+    /// @brief Change the tile or voxel value to which this iterator is currently pointing
+    /// and mark it as active.
+    void setValue(const ValueT& val) const { mValueIterList.setValue(mLevel, val); }
+    /// @brief Change the active/inactive state of the tile or voxel value to which
+    /// this iterator is currently pointing.
+    void setActiveState(bool on) const { mValueIterList.setValueOn(mLevel, on); }
+    /// Mark the tile or voxel value to which this iterator is currently pointing as inactive.
+    void setValueOff() const { mValueIterList.setValueOff(mLevel); }
+
+    /// @brief Apply a functor to the item to which this iterator is pointing.
+    /// (Not valid for const iterators.)
+    /// @param op  a functor of the form <tt>void op(ValueType&) const</tt> that modifies
+    ///            its argument in place
+    /// @see Tree::modifyValue()
+    template<typename ModifyOp>
+    void modifyValue(const ModifyOp& op) const { mValueIterList.modifyValue(mLevel, op); }
+
+    /// Return a pointer to the tree over which this iterator is iterating.
+    TreeT* getTree() const { return mTree; }
+
+    /// Return a string (for debugging, mainly) describing this iterator's current state.
+    std::string summary() const;
+
+private:
+    bool advance(bool dontIncrement = false);
+
+    typedef typename iter::InvertedTree<NodeT, NodeT::LEVEL>::Type InvTreeT;
+    struct PrevChildItem { typedef ChildOnIterT IterT; };
+    struct PrevValueItem { typedef ValueIterT IterT; };
+
+    IterListItem<PrevChildItem, InvTreeT, /*VecSize=*/ROOT_LEVEL+1, /*Level=*/0> mChildIterList;
+    IterListItem<PrevValueItem, InvTreeT, /*VecSize=*/ROOT_LEVEL+1, /*Level=*/0> mValueIterList;
+    Index mLevel;
+    int mMinLevel, mMaxLevel;
+    TreeT* mTree;
+}; // class TreeValueIteratorBase
+
+
+/// Construct an iterator over @a tree, starting at its root with the full level range.
+template<typename TreeT, typename ValueIterT>
+inline TreeValueIteratorBase<TreeT, ValueIterT>::TreeValueIteratorBase(TreeT& tree)
+    : mChildIterList(NULL)
+    , mValueIterList(NULL)
+    , mLevel(ROOT_LEVEL)
+    , mMinLevel(static_cast<int>(LEAF_LEVEL))
+    , mMaxLevel(static_cast<int>(ROOT_LEVEL))
+    , mTree(&tree)
+{
+    mChildIterList.setIter(IterTraits<NodeT, ChildOnIterT>::begin(tree.root()));
+    mValueIterList.setIter(IterTraits<NodeT, ValueIterT>::begin(tree.root()));
+    this->advance(/*dontIncrement=*/true); // position only; do not step past the current item
+}
+
+
+/// Copy @a other's traversal state, then call updateBackPointers() on both copied lists.
+template<typename TreeT, typename ValueIterT>
+inline TreeValueIteratorBase<TreeT, ValueIterT>::TreeValueIteratorBase(const TreeValueIteratorBase& other)
+    : mChildIterList(other.mChildIterList)
+    , mValueIterList(other.mValueIterList)
+    , mLevel(other.mLevel)
+    , mMinLevel(other.mMinLevel)
+    , mMaxLevel(other.mMaxLevel)
+    , mTree(other.mTree)
+{
+    mChildIterList.updateBackPointers();
+    mValueIterList.updateBackPointers();
+}
+
+
+/// Assign @a other's traversal state to this iterator (self-assignment is a no-op).
+template<typename TreeT, typename ValueIterT>
+inline TreeValueIteratorBase<TreeT, ValueIterT>&
+TreeValueIteratorBase<TreeT, ValueIterT>::operator=(const TreeValueIteratorBase& other)
+{
+    if (this == &other) return *this;
+    mChildIterList = other.mChildIterList;
+    mValueIterList = other.mValueIterList;
+    mLevel = other.mLevel;
+    mMinLevel = other.mMinLevel;
+    mMaxLevel = other.mMaxLevel;
+    mTree = other.mTree;
+    mChildIterList.updateBackPointers();
+    mValueIterList.updateBackPointers();
+    return *this;
+}
+
+
+/// Convert @a minDepth to a maximum level and call next() if the current level exceeds it.
+template<typename TreeT, typename ValueIterT>
+inline void TreeValueIteratorBase<TreeT, ValueIterT>::setMinDepth(Index minDepth)
+{
+    mMaxLevel = static_cast<int>(ROOT_LEVEL - minDepth); // level = ROOT_LEVEL - depth
+    if (mMaxLevel < static_cast<int>(mLevel)) this->next();
+}
+
+
+/// Convert @a maxDepth (clamped to the leaf depth) to a minimum level; call next() if below it.
+template<typename TreeT, typename ValueIterT>
+inline void TreeValueIteratorBase<TreeT, ValueIterT>::setMaxDepth(Index maxDepth)
+{
+    const Index clampedDepth = std::min(maxDepth, this->getLeafDepth());
+    mMinLevel = static_cast<int>(ROOT_LEVEL - clampedDepth); // level = ROOT_LEVEL - depth
+    if (mMinLevel > static_cast<int>(mLevel)) this->next();
+}
+
+
+/// Step with advance() until the iterator lands on a level within [mMinLevel, mMaxLevel].
+template<typename TreeT, typename ValueIterT>
+inline bool TreeValueIteratorBase<TreeT, ValueIterT>::next()
+{
+    while (this->advance()) {
+        if (mMinLevel <= int(mLevel) && int(mLevel) <= mMaxLevel) return true;
+    }
+    return false; // advance() reported exhaustion
+}
+
+
+template<typename TreeT, typename ValueIterT>
+inline bool
+TreeValueIteratorBase<TreeT, ValueIterT>::advance(bool dontIncrement) // false once the root is exhausted
+{
+    bool recurse = false; // set when the final loop ascends, so the whole step is repeated
+    do {
+        recurse = false;
+        Index
+            vPos = mValueIterList.pos(mLevel),
+            cPos = mChildIterList.pos(mLevel);
+        if (vPos == cPos && mChildIterList.test(mLevel)) {
+            /// @todo Once ValueOff iterators properly skip child pointers, remove this block.
+            mValueIterList.next(mLevel);
+            vPos = mValueIterList.pos(mLevel);
+        }
+        if (vPos < cPos) {
+            if (dontIncrement) return true; // a value precedes the next child: stay on it
+            if (mValueIterList.next(mLevel)) {
+                if (mValueIterList.pos(mLevel) == cPos && mChildIterList.test(mLevel)) {
+                    /// @todo Once ValueOff iterators properly skip child pointers,
+                    /// remove this block.
+                    mValueIterList.next(mLevel);
+                }
+                // If there is a next value and it precedes the next child, return.
+                if (mValueIterList.pos(mLevel) < cPos) return true;
+            }
+        } else {
+            // Advance to the next child, which may or may not precede the next value.
+            if (!dontIncrement) mChildIterList.next(mLevel);
+        }
+#ifdef DEBUG_TREE_VALUE_ITERATOR
+        std::cout << "\n" << this->summary() << std::flush;
+#endif
+
+        // Descend while the next child precedes the next value at the current level.
+        while (mChildIterList.pos(mLevel) < mValueIterList.pos(mLevel)) {
+#ifdef ENABLE_TREE_VALUE_DEPTH_BOUND_OPTIMIZATION
+            if (int(mLevel) == mMinLevel) {
+                // If the current node lies at the lowest allowed level, none of its
+                // children can be visited, so just advance its child iterator.
+                mChildIterList.next(mLevel);
+                if (mValueIterList.pos(mLevel) == mChildIterList.pos(mLevel)
+                    && mChildIterList.test(mLevel))
+                {
+                    /// @todo Once ValueOff iterators properly skip child pointers,
+                    /// remove this block.
+                    mValueIterList.next(mLevel);
+                }
+            } else
+#endif
+                if (mChildIterList.down(mLevel)) {
+                    --mLevel; // descend one level
+                    mValueIterList.initLevel(mLevel, mChildIterList);
+                    if (mValueIterList.pos(mLevel) == mChildIterList.pos(mLevel)
+                        && mChildIterList.test(mLevel))
+                    {
+                        /// @todo Once ValueOff iterators properly skip child pointers,
+                        /// remove this block.
+                        mValueIterList.next(mLevel);
+                    }
+                } else break; // down() failed: stop descending
+#ifdef DEBUG_TREE_VALUE_ITERATOR
+            std::cout << "\n" << this->summary() << std::flush;
+#endif
+        }
+        // Ascend to the nearest level at which one of the iterators is not yet exhausted.
+        while (!mChildIterList.test(mLevel) && !mValueIterList.test(mLevel)) {
+            if (mLevel == ROOT_LEVEL) return false; // nothing left anywhere in the tree
+            ++mLevel;
+            mChildIterList.next(mLevel); // move the parent past the child just finished
+            dontIncrement = true; // the parent was already stepped; do not step it again
+            recurse = true;
+        }
+    } while (recurse);
+    return true;
+}
+
+
+/// Fill @a bbox with the extent of the current tile or voxel; reset it when test() is false.
+template<typename TreeT, typename ValueIterT>
+inline bool TreeValueIteratorBase<TreeT, ValueIterT>::getBoundingBox(CoordBBox& bbox) const
+{
+    if (this->test()) {
+        bbox.min() = mValueIterList.getCoord(mLevel);
+        bbox.max() = bbox.min().offsetBy(mValueIterList.getChildDim(mLevel) - 1);
+        return true;
+    }
+    bbox = CoordBBox(); // exhausted: hand back a default-constructed box
+    return false;
+}
+
+
+/// Describe value/child positions from the root down to the current level, plus location.
+template<typename TreeT, typename ValueIterT>
+inline std::string TreeValueIteratorBase<TreeT, ValueIterT>::summary() const
+{
+    const int rootLevel = int(ROOT_LEVEL), curLevel = int(mLevel);
+    std::ostringstream os;
+    for (int level = rootLevel; level >= 0 && level >= curLevel; --level) {
+        if (level == 0) { os << "leaf"; }
+        else if (level == rootLevel) { os << "root"; }
+        else { os << "int" << (ROOT_LEVEL - level); }
+        os << " v" << mValueIterList.pos(level) << " c" << mChildIterList.pos(level);
+        if (level > curLevel) os << " / ";
+    }
+    const bool onValue =
+        this->test() && mValueIterList.pos(mLevel) < mChildIterList.pos(mLevel);
+    if (onValue && mLevel == 0) {
+        os << " " << this->getCoord();
+    } else if (onValue) {
+        os << " " << this->getBoundingBox();
+    }
+    return os.str();
+}
+
+
+////////////////////////////////////////
+
+
+/// @brief Base class for tree-traversal iterators over all nodes
+template<typename _TreeT, typename RootChildOnIterT>
+class NodeIteratorBase
+{
+public:
+    typedef _TreeT TreeT;
+    typedef RootChildOnIterT RootIterT;
+    typedef typename RootIterT::NodeType RootNodeT;
+    typedef typename RootIterT::NonConstNodeType NCRootNodeT;
+    static const Index ROOT_LEVEL = RootNodeT::LEVEL; // levels count up from the leaves (leaf = 0)
+    typedef typename iter::InvertedTree<NCRootNodeT, ROOT_LEVEL>::Type InvTreeT;
+    static const Index LEAF_LEVEL = 0, ROOT_DEPTH = 0, LEAF_DEPTH = ROOT_LEVEL;
+
+    typedef IterTraits<NCRootNodeT, RootIterT> RootIterTraits;
+
+    NodeIteratorBase(); // exhausted iterator with no tree (mDone = true, mTree = NULL)
+    NodeIteratorBase(TreeT&); // start at the root node of the given tree
+
+    NodeIteratorBase(const NodeIteratorBase& other);
+    NodeIteratorBase& operator=(const NodeIteratorBase& other);
+
+    /// Specify the depth of the highest level of the tree to which to ascend (depth 0 = root).
+    void setMinDepth(Index minDepth);
+    /// Return the depth of the highest level of the tree to which this iterator ascends.
+    Index getMinDepth() const { return ROOT_LEVEL - Index(mMaxLevel); }
+    /// Specify the depth of the lowest level of the tree to which to descend (depth 0 = root).
+    void setMaxDepth(Index maxDepth);
+    /// Return the depth of the lowest level of the tree to which this iterator descends.
+    Index getMaxDepth() const { return ROOT_LEVEL - Index(mMinLevel); }
+
+    //@{
+    /// Return @c true if this iterator is not yet exhausted.
+    bool test() const { return !mDone; }
+    operator bool() const { return this->test(); }
+    //@}
+
+    /// @brief Advance to the next node within the configured depth range.
+    /// @return @c true if this iterator is not yet exhausted.
+    bool next();
+    /// Advance the iterator to the next node (the result of next() is discarded).
+    void increment() { this->next(); }
+    NodeIteratorBase& operator++() { this->increment(); return *this; }
+    /// Increment the iterator n times, stopping early if next() returns @c false.
+    void increment(Index n) { for (Index i = 0; i < n && this->next(); ++i) {} }
+
+    /// @brief Return the level in the tree (0 = leaf) of the node to which
+    /// this iterator is currently pointing.
+    Index getLevel() const { return mLevel; }
+    /// @brief Return the depth in the tree (0 = root) of the node to which
+    /// this iterator is currently pointing.
+    Index getDepth() const { return ROOT_LEVEL - mLevel; }
+    static Index getLeafDepth() { return LEAF_DEPTH; } // depth of the leaf level (== ROOT_LEVEL)
+
+    /// @brief Return the global coordinates of the node to which
+    /// this iterator is currently pointing.
+    Coord getCoord() const;
+    /// @brief Return in @a bbox the axis-aligned bounding box of
+    /// the node to which this iterator is currently pointing.
+    /// @return false if the iterator is at the root level and getNode() yields no root node.
+    bool getBoundingBox(CoordBBox& bbox) const;
+    /// @brief Return the axis-aligned bounding box of the node to which
+    /// this iterator is currently pointing.
+    CoordBBox getBoundingBox() const { CoordBBox b; this->getBoundingBox(b); return b; }
+
+    //@{
+    /// @brief Return the node to which the iterator is pointing.
+    /// @note This iterator doesn't have the usual dereference operators (* and ->),
+    /// because they would have to be overloaded by the returned node type.
+    template<typename NodeT>
+    void getNode(NodeT*& node) const { node = NULL; mIterList.getNode(mLevel, node); }
+    template<typename NodeT>
+    void getNode(const NodeT*& node) const { node = NULL; mIterList.getNode(mLevel, node); }
+    //@}
+
+    TreeT* getTree() const { return mTree; } // tree given at construction, or NULL
+
+    std::string summary() const; // debugging string: per-level positions and bounding box
+
+private:
+    struct PrevItem { typedef RootIterT IterT; };
+
+    IterListItem<PrevItem, InvTreeT, /*VecSize=*/ROOT_LEVEL+1, LEAF_LEVEL> mIterList;
+    Index mLevel; // level (0 = leaf) of the node currently pointed to
+    int mMinLevel, mMaxLevel; // inclusive range of levels at which next() may stop
+    bool mDone; // set by next() when it cannot ascend above the root
+    TreeT* mTree;
+}; // class NodeIteratorBase
+
+
+/// Construct an exhausted iterator that is not attached to any tree.
+template<typename TreeT, typename RootChildOnIterT>
+inline NodeIteratorBase<TreeT, RootChildOnIterT>::NodeIteratorBase()
+    : mIterList(NULL)
+    , mLevel(ROOT_LEVEL)
+    , mMinLevel(static_cast<int>(LEAF_LEVEL))
+    , mMaxLevel(static_cast<int>(ROOT_LEVEL))
+    , mDone(true)
+    , mTree(NULL)
+{
+}
+
+
+/// Construct an iterator over @a tree, starting at its root with the full level range.
+template<typename TreeT, typename RootChildOnIterT>
+inline NodeIteratorBase<TreeT, RootChildOnIterT>::NodeIteratorBase(TreeT& tree)
+    : mIterList(NULL)
+    , mLevel(ROOT_LEVEL)
+    , mMinLevel(static_cast<int>(LEAF_LEVEL))
+    , mMaxLevel(static_cast<int>(ROOT_LEVEL))
+    , mDone(false)
+    , mTree(&tree)
+{
+    mIterList.setIter(RootIterTraits::begin(tree.root()));
+}
+
+
+/// Copy @a other's traversal state, then call updateBackPointers() on the copied list.
+template<typename TreeT, typename RootChildOnIterT>
+inline NodeIteratorBase<TreeT, RootChildOnIterT>::NodeIteratorBase(const NodeIteratorBase& other)
+    : mIterList(other.mIterList)
+    , mLevel(other.mLevel)
+    , mMinLevel(other.mMinLevel)
+    , mMaxLevel(other.mMaxLevel)
+    , mDone(other.mDone)
+    , mTree(other.mTree)
+{
+    mIterList.updateBackPointers();
+}
+
+
+/// Assign @a other's traversal state to this iterator (self-assignment is a no-op).
+template<typename TreeT, typename RootChildOnIterT>
+inline NodeIteratorBase<TreeT, RootChildOnIterT>&
+NodeIteratorBase<TreeT, RootChildOnIterT>::operator=(const NodeIteratorBase& other)
+{
+    if (this == &other) return *this;
+    mLevel = other.mLevel;
+    mMinLevel = other.mMinLevel;
+    mMaxLevel = other.mMaxLevel;
+    mDone = other.mDone;
+    mTree = other.mTree;
+    mIterList = other.mIterList;
+    mIterList.updateBackPointers();
+    return *this;
+}
+
+
+/// Convert @a minDepth to a maximum level and call next() if the current level exceeds it.
+template<typename TreeT, typename RootChildOnIterT>
+inline void NodeIteratorBase<TreeT, RootChildOnIterT>::setMinDepth(Index minDepth)
+{
+    mMaxLevel = static_cast<int>(ROOT_LEVEL - minDepth); // level = ROOT_LEVEL - depth
+    if (mMaxLevel < static_cast<int>(mLevel)) this->next();
+}
+
+
+/// Convert @a maxDepth (clamped to the leaf depth) to a minimum level; call next() if below it.
+template<typename TreeT, typename RootChildOnIterT>
+inline void NodeIteratorBase<TreeT, RootChildOnIterT>::setMaxDepth(Index maxDepth)
+{
+    const Index clampedDepth = std::min(maxDepth, this->getLeafDepth());
+    mMinLevel = static_cast<int>(ROOT_LEVEL - clampedDepth); // level = ROOT_LEVEL - depth
+    if (mMinLevel > static_cast<int>(mLevel)) this->next();
+}
+
+
+template<typename TreeT, typename RootChildOnIterT>
+inline bool
+NodeIteratorBase<TreeT, RootChildOnIterT>::next() // false when exhausted or when down() fails
+{
+    do {
+        if (mDone) return false; // already exhausted (or default-constructed)
+
+        // If the iterator over the current node points to a child and the depth bound
+        // allows it, descend to the child (depth-first traversal).
+        if (int(mLevel) > mMinLevel && mIterList.test(mLevel)) {
+            if (!mIterList.down(mLevel)) return false; // note: mDone is left unchanged here
+            --mLevel;
+        } else {
+            // Ascend to the nearest ancestor that has other children.
+            while (!mIterList.test(mLevel)) {
+                if (mLevel == ROOT_LEVEL) {
+                    // Can't ascend higher than the root.
+                    mDone = true;
+                    return false;
+                }
+                ++mLevel; // ascend one level
+                mIterList.next(mLevel); // advance to the next child, if there is one
+            }
+            // Descend to the child.
+            if (!mIterList.down(mLevel)) return false; // note: mDone is left unchanged here
+            --mLevel;
+        }
+    } while (int(mLevel) < mMinLevel || int(mLevel) > mMaxLevel); // skip levels out of bounds
+    return true;
+}
+
+
+/// Below the root, ask the parent-level iterator; at the root, use the root's minimum index.
+template<typename TreeT, typename RootChildOnIterT>
+inline Coord NodeIteratorBase<TreeT, RootChildOnIterT>::getCoord() const
+{
+    if (mLevel != ROOT_LEVEL) return mIterList.getCoord(mLevel + 1);
+    RootNodeT* rootNode = NULL;
+    this->getNode(rootNode);
+    return (rootNode == NULL) ? Coord::min() : rootNode->getMinIndex();
+}
+
+
+/// Root level: the root's index range; otherwise the extent from the parent-level iterator.
+template<typename TreeT, typename RootChildOnIterT>
+inline bool NodeIteratorBase<TreeT, RootChildOnIterT>::getBoundingBox(CoordBBox& bbox) const
+{
+    if (mLevel != ROOT_LEVEL) {
+        bbox.min() = mIterList.getCoord(mLevel + 1);
+        bbox.max() = bbox.min().offsetBy(mIterList.getChildDim(mLevel + 1) - 1);
+        return true;
+    }
+    RootNodeT* rootNode = NULL;
+    this->getNode(rootNode);
+    if (rootNode != NULL) {
+        rootNode->getIndexRange(bbox);
+        return true;
+    }
+    bbox = CoordBBox(); // no root node available: hand back a default-constructed box
+    return false;
+}
+
+
+/// Describe the child-iterator positions from the root down to the current level,
+/// followed by the bounding box of the current node.
+template<typename TreeT, typename RootChildOnIterT>
+inline std::string NodeIteratorBase<TreeT, RootChildOnIterT>::summary() const
+{
+    const int rootLevel = int(ROOT_LEVEL), curLevel = int(mLevel);
+    std::ostringstream os;
+    for (int level = rootLevel; level >= 0 && level >= curLevel; --level) {
+        if (level == 0) { os << "leaf"; }
+        else if (level == rootLevel) { os << "root"; }
+        else { os << "int" << (ROOT_LEVEL - level); }
+        os << " c" << mIterList.pos(level);
+        if (level > curLevel) os << " / ";
+    }
+    os << " " << this->getBoundingBox();
+    return os.str();
+}
+
+
+////////////////////////////////////////
+
+
+/// @brief Base class for tree-traversal iterators over all leaf nodes (but not leaf voxels)
+template<typename TreeT, typename RootChildOnIterT>
+class LeafIteratorBase
+{
+public:
+    typedef RootChildOnIterT RootIterT;
+    typedef typename RootIterT::NodeType RootNodeT;
+    typedef typename RootIterT::NonConstNodeType NCRootNodeT;
+    static const Index ROOT_LEVEL = RootNodeT::LEVEL;
+    typedef typename iter::InvertedTree<NCRootNodeT, ROOT_LEVEL>::Type InvTreeT;
+    typedef typename boost::mpl::front<InvTreeT>::type NCLeafNodeT; // first type in InvTreeT
+    typedef typename CopyConstness<RootNodeT, NCLeafNodeT>::Type LeafNodeT;
+    static const Index LEAF_LEVEL = 0, LEAF_PARENT_LEVEL = LEAF_LEVEL + 1;
+
+    typedef IterTraits<NCRootNodeT, RootIterT> RootIterTraits;
+
+    LeafIteratorBase(): mIterList(NULL), mTree(NULL) {} // iterator with no tree attached
+
+    LeafIteratorBase(TreeT& tree): mIterList(NULL), mTree(&tree)
+    {
+        // Initialize the iterator list with a root node iterator.
+        mIterList.setIter(RootIterTraits::begin(tree.root()));
+        // Descend along the first branch, initializing the node iterator at each level.
+        Index lvl = ROOT_LEVEL;
+        for ( ; lvl > 0 && mIterList.down(lvl); --lvl) {}
+        // If the first branch terminated above the leaf level, backtrack to the next branch.
+        if (lvl > 0) this->next();
+    }
+
+    LeafIteratorBase(const LeafIteratorBase& other): mIterList(other.mIterList), mTree(other.mTree)
+    {
+        mIterList.updateBackPointers(); // called on the copied list after member-wise copy
+    }
+    LeafIteratorBase& operator=(const LeafIteratorBase& other)
+    {
+        if (&other != this) { // self-assignment is a no-op
+            mTree = other.mTree;
+            mIterList = other.mIterList;
+            mIterList.updateBackPointers();
+        }
+        return *this;
+    }
+
+    //@{
+    /// Return the leaf node to which the iterator is pointing (operator* does no null check).
+    LeafNodeT* getLeaf() const { LeafNodeT* n = NULL; mIterList.getNode(LEAF_LEVEL, n); return n; }
+    LeafNodeT& operator*() const { return *this->getLeaf(); }
+    LeafNodeT* operator->() const { return this->getLeaf(); }
+    //@}
+
+    bool test() const { return mIterList.test(LEAF_PARENT_LEVEL); } // valid at leaf-parent level?
+    operator bool() const { return this->test(); }
+
+    //@{
+    /// Advance the iterator to the next leaf node.
+    bool next();
+    void increment() { this->next(); }
+    LeafIteratorBase& operator++() { this->increment(); return *this; }
+    //@}
+    /// Increment the iterator n times, stopping early if next() returns @c false.
+    void increment(Index n) { for (Index i = 0; i < n && this->next(); ++i) {} }
+
+    TreeT* getTree() const { return mTree; } // tree given at construction, or NULL
+
+private:
+    struct PrevItem { typedef RootIterT IterT; };
+
+    /// @note Even though a LeafIterator doesn't iterate over leaf voxels,
+    /// the first item of this linked list of node iterators is a leaf node iterator,
+    /// whose purpose is only to provide access to its parent leaf node.
+    IterListItem<PrevItem, InvTreeT, /*VecSize=*/ROOT_LEVEL+1, LEAF_LEVEL> mIterList;
+    TreeT* mTree;
+}; // class LeafIteratorBase
+
+
+template<typename TreeT, typename RootChildOnIterT>
+inline bool
+LeafIteratorBase<TreeT, RootChildOnIterT>::next() // false once the root level is exhausted
+{
+    // If the iterator is valid for the current node one level above the leaf level,
+    // advance the iterator to the node's next child.
+    if (mIterList.test(LEAF_PARENT_LEVEL) && mIterList.next(LEAF_PARENT_LEVEL)) {
+        mIterList.down(LEAF_PARENT_LEVEL); // initialize the leaf iterator
+        return true;
+    }
+
+    // Otherwise search the remaining branches until the leaf-parent iterator is valid again.
+    Index lvl = LEAF_PARENT_LEVEL;
+    while (!mIterList.test(LEAF_PARENT_LEVEL)) {
+        if (mIterList.test(lvl)) {
+            mIterList.next(lvl);
+        } else {
+            do {
+                // Ascend to the nearest level at which
+                // one of the iterators is not yet exhausted.
+                if (lvl == ROOT_LEVEL) return false; // no further leaf nodes
+                ++lvl;
+                if (mIterList.test(lvl)) mIterList.next(lvl);
+            } while (!mIterList.test(lvl));
+        }
+        // Descend to the lowest child, but not as far as the leaf iterator.
+        while (lvl > LEAF_PARENT_LEVEL && mIterList.down(lvl)) --lvl;
+    }
+    mIterList.down(LEAF_PARENT_LEVEL); // initialize the leaf iterator
+    return true;
+}
+
+
+////////////////////////////////////////
+
+
+/// An IteratorRange wraps a tree or node iterator, giving the iterator TBB
+/// splittable range semantics.
+template<typename IterT>
+class IteratorRange
+{
+public:
+    IteratorRange(const IterT& iter, size_t grainSize = 8): // counts the items up front
+        mIter(iter),
+        mGrainSize(grainSize),
+        mSize(0)
+    {
+        mSize = this->size();
+    }
+    IteratorRange(IteratorRange& other, tbb::split): // splitting constructor
+        mIter(other.mIter),
+        mGrainSize(other.mGrainSize),
+        mSize(other.mSize >> 1)
+    {
+        other.increment(mSize); // this range keeps the first half; other skips past it
+    }
+
+    /// @brief Return a reference to this range's iterator.
+    /// @note The reference is const, because the iterator should not be
+    /// incremented directly.  Use this range object's increment() instead.
+    const IterT& iterator() const { return mIter; }
+
+    bool empty() const { return mSize == 0 || !mIter.test(); }
+    bool test() const { return !this->empty(); }
+    operator bool() const { return !this->empty(); }
+
+    /// @brief Return @c true if this range is splittable, i.e., if it holds at least two items
+    /// and more than mGrainSize items (a one-item range would split into an empty half).
+    bool is_divisible() const { return mSize > 1 && mSize > mGrainSize; }
+
+    /// Advance the iterator @a n times, or until the estimated size reaches zero.
+    void increment(Index n = 1) { for ( ; n > 0 && mSize > 0; --n, --mSize, ++mIter) {} }
+    /// Advance the iterator to the next item.
+    IteratorRange& operator++() { this->increment(); return *this; }
+    /// @brief Advance the iterator to the next item.
+    /// @return @c true if the iterator is not yet exhausted.
+    bool next() { this->increment(); return this->test(); }
+
+private:
+    Index size() const { Index n = 0; for (IterT it(mIter); it.test(); ++n, ++it) {} return n; }
+
+    IterT mIter;
+    size_t mGrainSize;
+    /// @note mSize is only an estimate of the number of times mIter can be incremented
+    /// before it is exhausted (because the topology of the underlying tree could change
+    /// during iteration).  For the purpose of range splitting, though, that should be
+    /// sufficient, since the two halves need not be of exactly equal size.
+    Index mSize;
+};
+
+
+////////////////////////////////////////
+
+
+/// @brief Base class for tree-traversal iterators over real and virtual voxel values
+/// @todo class TreeVoxelIteratorBase;
+
+} // namespace tree
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TREE_TREEITERATOR_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/tree/ValueAccessor.h b/nuparu/include/openvdb_new/tree/ValueAccessor.h
new file mode 100644
index 00000000..87c3f41e
--- /dev/null
+++ b/nuparu/include/openvdb_new/tree/ValueAccessor.h
@@ -0,0 +1,2666 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file ValueAccessor.h
+///
+/// When traversing a grid in a spatially coherent pattern (e.g., iterating
+/// over neighboring voxels), request a @c ValueAccessor from the grid
+/// (with Grid::getAccessor()) and use the accessor's @c getValue() and
+/// @c setValue() methods.  These will typically be significantly faster
+/// than accessing voxels directly in the grid's tree.
+///
+/// @par Example:
+///
+/// @code
+/// FloatGrid grid;
+/// FloatGrid::Accessor acc = grid.getAccessor();
+/// // First access is slow:
+/// acc.setValue(Coord(0, 0, 0), 100);
+/// // Subsequent nearby accesses are fast, since the accessor now holds pointers
+/// // to nodes that contain (0, 0, 0) along the path from the root of the grid's
+/// // tree to the leaf:
+/// acc.setValue(Coord(0, 0, 1), 100);
+/// acc.getValue(Coord(0, 2, 0));
+/// // Slow, because the accessor must be repopulated:
+/// acc.getValue(Coord(-1, -1, -1));
+/// // Fast:
+/// acc.getValue(Coord(-1, -1, -2));
+/// acc.setValue(Coord(-1, -2, 0), -100);
+/// @endcode
+
+#ifndef OPENVDB_TREE_VALUEACCESSOR_HAS_BEEN_INCLUDED
+#define OPENVDB_TREE_VALUEACCESSOR_HAS_BEEN_INCLUDED
+
+#include <boost/mpl/front.hpp>
+#include <boost/mpl/pop_front.hpp>
+#include <boost/mpl/push_back.hpp>
+#include <boost/mpl/size.hpp>
+#include <boost/mpl/at.hpp>
+#include <boost/mpl/equal_to.hpp>
+#include <boost/mpl/comparison.hpp>
+#include <boost/mpl/vector.hpp>
+#include <boost/mpl/assert.hpp>
+#include <boost/mpl/erase.hpp>
+#include <boost/mpl/find.hpp>
+#include <boost/static_assert.hpp>
+#include <boost/type_traits/is_const.hpp>
+#include <tbb/null_mutex.h>
+#include <tbb/spin_mutex.h>
+#include <openvdb/version.h>
+#include <openvdb/Types.h>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tree {
+
+// Forward declarations of local classes that are not intended for general use.
+// The IsSafe template parameter is explained in the warning on ValueAccessorBase below.
+template<typename TreeType, bool IsSafe = true>
+class ValueAccessor0;
+template<typename TreeType, bool IsSafe = true, Index L0 = 0>
+class ValueAccessor1;
+template<typename TreeType, bool IsSafe = true, Index L0 = 0, Index L1 = 1>
+class ValueAccessor2;
+template<typename TreeType, bool IsSafe = true, Index L0 = 0, Index L1 = 1, Index L2 = 2>
+class ValueAccessor3;
+template<typename TreeCacheT, typename NodeVecT, bool AtRoot> class CacheItem;
+
+
+/// @brief This base class for ValueAccessors manages registration of an accessor
+/// with a tree so that the tree can automatically clear the accessor whenever
+/// one of its nodes is deleted.
+///
+/// @internal A base class is needed because ValueAccessor is templated on a
+/// Tree type, a cache depth and a mutex type.  The various instantiations of the
+/// template are distinct, unrelated types, so they can't easily be stored in a
+/// container such as the Tree's CacheRegistry.  This base class, in contrast, is
+/// templated only on the Tree type and the IsSafe flag, so for any given Tree only
+/// a few instantiations are possible: ValueAccessorBase<[const] Tree, true|false>.
+///
+/// @warning If IsSafe = false then the ValueAccessor will not register itself
+/// with the tree from which it is constructed. While in some rare cases this can
+/// lead to better performance (since it avoids the small overhead of insertion
+/// on creation and deletion on destruction) it is also unsafe if the tree is
+/// modified. So unless you're an expert it is highly recommended to set
+/// IsSafe = true, which is the default in all derived ValueAccessors defined
+/// below. However if you know that the tree is not being modified for the lifespan
+/// of the ValueAccessor AND the work performed per ValueAccessor is small relative
+/// to the overhead of registering it you should consider setting IsSafe = false. If
+/// this turns out to improve performance you should really rewrite your code so as
+/// to better amortize the construction of the ValueAccessor, i.e. reuse it as much
+/// as possible!
+template<typename TreeType, bool IsSafe>
+class ValueAccessorBase
+{
+public:
+    static const bool IsConstTree = boost::is_const<TreeType>::value;
+
+    /// @brief Return true if this accessor is safe, i.e. registered
+    /// with the tree from which it is constructed. Un-registered
+    /// accessors can in rare cases be faster because they avoid the
+    /// (small) overhead of registration, but they are unsafe if the
+    /// tree is modified. So unless you're an expert it is highly
+    /// recommended to set IsSafe = true (which is the default).
+    static bool isSafe() { return IsSafe; }
+
+    ValueAccessorBase(TreeType& tree): mTree(&tree)
+    {
+        if (IsSafe) tree.attachAccessor(*this);
+    }
+
+    virtual ~ValueAccessorBase() { if (IsSafe && mTree) mTree->releaseAccessor(*this); }
+
+    /// @brief Return a pointer to the tree associated with this accessor.
+    /// @details The pointer will be null only if the tree from which this accessor
+    /// was constructed was subsequently deleted (which generally leaves the
+    /// accessor in an unsafe state).
+    TreeType* getTree() const { return mTree; }
+    /// Return a reference to the tree associated with this accessor (asserts that it exists).
+    TreeType& tree() const { assert(mTree); return *mTree; }
+
+    ValueAccessorBase(const ValueAccessorBase& other): mTree(other.mTree)
+    {
+        if (IsSafe && mTree) mTree->attachAccessor(*this);
+    }
+
+    ValueAccessorBase& operator=(const ValueAccessorBase& other)
+    {
+        if (&other != this) {
+            if (IsSafe && mTree) mTree->releaseAccessor(*this); // leave the old tree first
+            mTree = other.mTree;
+            if (IsSafe && mTree) mTree->attachAccessor(*this);
+        }
+        return *this;
+    }
+
+    virtual void clear() = 0;
+
+protected:
+    // Allow trees to deregister themselves.
+    template<typename> friend class Tree;
+
+    virtual void release() { mTree = NULL; } // forget the tree without calling back into it
+
+    TreeType* mTree;
+}; // class ValueAccessorBase
+
+
+////////////////////////////////////////
+
+
+/// When traversing a grid in a spatially coherent pattern (e.g., iterating
+/// over neighboring voxels), request a @c ValueAccessor from the grid
+/// (with Grid::getAccessor()) and use the accessor's @c getValue() and
+/// @c setValue() methods.  These will typically be significantly faster
+/// than accessing voxels directly in the grid's tree.
+///
+/// A ValueAccessor caches pointers to tree nodes along the path to a voxel (x, y, z).
+/// A subsequent access to voxel (x', y', z') starts from the cached leaf node and
+/// moves up until a cached node that encloses (x', y', z') is found, then traverses
+/// down the tree from that node to a leaf, updating the cache with the new path.
+/// This leads to significant acceleration of spatially-coherent accesses.
+///
+/// @param _TreeType    the type of the tree to be accessed [required]
+/// @param IsSafe       if IsSafe = false then the ValueAccessor will
+///                     not register itself with the tree from which
+///                     it is constructed (see warning).
+/// @param CacheLevels  the number of nodes to be cached, starting from the leaf level
+///                     and not including the root (i.e., CacheLevels < DEPTH),
+///                     and defaulting to all non-root nodes
+/// @param MutexType    the type of mutex to use (see note)
+///
+/// @warning If IsSafe = false then the ValueAccessor will not register itself
+/// with the tree from which it is constructed. While in some rare cases this can
+/// lead to better performance (since it avoids the small overhead of insertion
+/// on creation and deletion on destruction) it is also unsafe if the tree is
+/// modified. So unless you're an expert it is highly recommended to set
+/// IsSafe = true, which is the default. However if you know that the tree is not
+/// being modified for the lifespan of the ValueAccessor AND the work performed
+/// per ValueAccessor is small relative to the overhead of registering it you should
+/// consider setting IsSafe = false. If this improves performance you should
+/// really rewrite your code so as to better amortize the construction of the
+/// ValueAccessor, i.e. reuse it as much as possible!
+///
+/// @note If @c MutexType is a TBB-compatible mutex, then multiple threads may
+/// safely access a single, shared accessor.  However, it is highly recommended
+/// that, instead, each thread be assigned its own, non-mutex-protected accessor.
+template<typename _TreeType,
+         bool IsSafe = true,
+         Index CacheLevels = _TreeType::DEPTH-1,
+         typename MutexType = tbb::null_mutex>
+class ValueAccessor: public ValueAccessorBase<_TreeType, IsSafe>
+{
+public:
+    BOOST_STATIC_ASSERT(CacheLevels < _TreeType::DEPTH);
+
+    typedef _TreeType                           TreeType;
+    typedef typename TreeType::RootNodeType     RootNodeT;
+    typedef typename TreeType::LeafNodeType     LeafNodeT;
+    typedef typename RootNodeT::ValueType       ValueType;
+    typedef ValueAccessorBase<TreeType, IsSafe> BaseT;
+    typedef typename MutexType::scoped_lock     LockT;
+    using BaseT::IsConstTree;
+
+    ValueAccessor(TreeType& tree): BaseT(tree), mCache(*this)
+    {
+        mCache.insert(Coord(), &tree.root());
+    }
+
+    ValueAccessor(const ValueAccessor& other): BaseT(other), mCache(*this, other.mCache) {}
+
+    ValueAccessor& operator=(const ValueAccessor& other)
+    {
+        if (&other != this) {
+            this->BaseT::operator=(other);
+            mCache.copy(*this, other.mCache);
+        }
+        return *this;
+    }
+    virtual ~ValueAccessor() {}
+
+    /// Return the number of cache levels employed by this accessor.
+    static Index numCacheLevels() { return CacheLevels; }
+
+    /// Return @c true if nodes along the path to the given voxel have been cached.
+    bool isCached(const Coord& xyz) const { LockT lock(mMutex); return mCache.isCached(xyz); }
+
+    /// Return the value of the voxel at the given coordinates.
+    const ValueType& getValue(const Coord& xyz) const
+    {
+        LockT lock(mMutex);
+        return mCache.getValue(xyz);
+    }
+
+    /// Return the active state of the voxel at the given coordinates.
+    bool isValueOn(const Coord& xyz) const { LockT lock(mMutex); return mCache.isValueOn(xyz); }
+
+    /// Return the active state of the voxel as well as its value
+    bool probeValue(const Coord& xyz, ValueType& value) const
+    {
+        LockT lock(mMutex);
+        return mCache.probeValue(xyz,value);
+    }
+
+    /// Return the tree depth (0 = root) at which the value of voxel (x, y, z) resides,
+    /// or -1 if (x, y, z) isn't explicitly represented in the tree (i.e., if it is
+    /// implicitly a background voxel).
+    int getValueDepth(const Coord& xyz) const
+    {
+        LockT lock(mMutex);
+        return mCache.getValueDepth(xyz);
+    }
+
+    /// Return @c true if the value of voxel (x, y, z) resides at the leaf level
+    /// of the tree, i.e., if it is not a tile value.
+    bool isVoxel(const Coord& xyz) const { LockT lock(mMutex); return mCache.isVoxel(xyz); }
+
+    //@{
+    /// Set the value of the voxel at the given coordinates and mark the voxel as active.
+    void setValue(const Coord& xyz, const ValueType& value)
+    {
+        LockT lock(mMutex);
+        mCache.setValue(xyz, value);
+    }
+    void setValueOn(const Coord& xyz, const ValueType& value) { this->setValue(xyz, value); }
+    //@}
+
+    /// Set the value of the voxel at the given coordinate but don't change its active state.
+    void setValueOnly(const Coord& xyz, const ValueType& value)
+    {
+        LockT lock(mMutex);
+        mCache.setValueOnly(xyz, value);
+    }
+
+    /// Set the value of the voxel at the given coordinates and mark the voxel
+    /// as active.  [Experimental]
+    void newSetValue(const Coord& xyz, const ValueType& value)
+    {
+        LockT lock(mMutex);
+        mCache.newSetValue(xyz, value);
+    }
+
+    /// Set the value of the voxel at the given coordinates and mark the voxel as inactive.
+    void setValueOff(const Coord& xyz, const ValueType& value)
+    {
+        LockT lock(mMutex);
+        mCache.setValueOff(xyz, value);
+    }
+
+    /// @brief Apply a functor to the value of the voxel at the given coordinates
+    /// and mark the voxel as active.
+    /// @details See Tree::modifyValue() for details.
+    template<typename ModifyOp>
+    void modifyValue(const Coord& xyz, const ModifyOp& op)
+    {
+        LockT lock(mMutex);
+        mCache.modifyValue(xyz, op);
+    }
+
+    /// @brief Apply a functor to the voxel at the given coordinates.
+    /// @details See Tree::modifyValueAndActiveState() for details.
+    template<typename ModifyOp>
+    void modifyValueAndActiveState(const Coord& xyz, const ModifyOp& op)
+    {
+        LockT lock(mMutex);
+        mCache.modifyValueAndActiveState(xyz, op);
+    }
+
+    /// Set the active state of the voxel at the given coordinates but don't change its value.
+    void setActiveState(const Coord& xyz, bool on = true)
+    {
+        LockT lock(mMutex);
+        mCache.setActiveState(xyz, on);
+    }
+    /// Mark the voxel at the given coordinates as active but don't change its value.
+    void setValueOn(const Coord& xyz) { this->setActiveState(xyz, true); }
+    /// Mark the voxel at the given coordinates as inactive but don't change its value.
+    void setValueOff(const Coord& xyz) { this->setActiveState(xyz, false); }
+
+    /// Return the cached node of type @a NodeType.  [Mainly for internal use]
+    template<typename NodeType>
+    NodeType* getNode()
+    {
+        LockT lock(mMutex);
+        NodeType* node = NULL;
+        mCache.getNode(node);
+        return node;
+    }
+
+    /// Cache the given node, which should lie along the path from the root node to
+    /// the node containing voxel (x, y, z).  [Mainly for internal use]
+    template<typename NodeType>
+    void insertNode(const Coord& xyz, NodeType& node)
+    {
+        LockT lock(mMutex);
+        mCache.insert(xyz, &node);
+    }
+
+    /// If a node of the given type exists in the cache, remove it, so that
+    /// isCached(xyz) returns @c false for any voxel (x, y, z) contained in
+    /// that node.  [Mainly for internal use]
+    template<typename NodeType>
+    void eraseNode() { LockT lock(mMutex); NodeType* node = NULL; mCache.erase(node); }
+
+    /// @brief Add the specified leaf to this tree, possibly creating a child branch
+    /// in the process.  If the leaf node already exists, replace it.
+    void addLeaf(LeafNodeT* leaf)
+    {
+        LockT lock(mMutex);
+        mCache.addLeaf(leaf);
+    }
+
+    /// @brief Add a tile at the specified tree level that contains voxel (x, y, z),
+    /// possibly deleting existing nodes or creating new nodes in the process.
+    void addTile(Index level, const Coord& xyz, const ValueType& value, bool state)
+    {
+        LockT lock(mMutex);
+        mCache.addTile(level, xyz, value, state);
+    }
+
+    /// @brief Return a pointer to the leaf node that contains voxel (x, y, z).
+    /// If no such node exists, create one, but preserve the values and
+    /// active states of all voxels.
+    /// @details Use this method to preallocate a static tree topology
+    /// over which to safely perform multithreaded processing.
+    LeafNodeT* touchLeaf(const Coord& xyz)
+    {
+        LockT lock(mMutex);
+        return mCache.touchLeaf(xyz);
+    }
+
+    //@{
+    /// @brief Return a pointer to the node of the specified type that contains
+    /// voxel (x, y, z), or NULL if no such node exists.
+    template<typename NodeT>
+    NodeT* probeNode(const Coord& xyz)
+    {
+        LockT lock(mMutex);
+        return mCache.template probeNode<NodeT>(xyz);
+    }
+    template<typename NodeT>
+    const NodeT* probeConstNode(const Coord& xyz) const
+    {
+        LockT lock(mMutex);
+        return mCache.template probeConstNode<NodeT>(xyz);
+    }
+    template<typename NodeT>
+    const NodeT* probeNode(const Coord& xyz) const
+    {
+        return this->template probeConstNode<NodeT>(xyz);
+    }
+    //@}
+
+    //@{
+    /// @brief Return a pointer to the leaf node that contains voxel (x, y, z),
+    /// or NULL if no such node exists.
+    LeafNodeT* probeLeaf(const Coord& xyz)
+    {
+        LockT lock(mMutex);
+        return mCache.probeLeaf(xyz);
+    }
+    const LeafNodeT* probeConstLeaf(const Coord& xyz) const
+    {
+        LockT lock(mMutex);
+        return mCache.probeConstLeaf(xyz);
+    }
+    const LeafNodeT* probeLeaf(const Coord& xyz) const { return this->probeConstLeaf(xyz); }
+    //@}
+
+    /// Remove all nodes from this cache, then reinsert the root node.
+    virtual void clear()
+    {
+        LockT lock(mMutex);
+        mCache.clear();
+        if (this->mTree) mCache.insert(Coord(), &(this->mTree->root()));
+    }
+
+private:
+    // Allow nodes to insert themselves into the cache.
+    template<typename> friend class RootNode;
+    template<typename, Index> friend class InternalNode;
+    template<typename, Index> friend class LeafNode;
+    // Allow trees to deregister themselves.
+    template<typename> friend class Tree;
+
+    /// Prevent this accessor from calling Tree::releaseAccessor() on a tree that
+    /// no longer exists.  (Called by mTree when it is destroyed.)
+    virtual void release()
+    {
+        LockT lock(mMutex);
+        this->BaseT::release();
+        mCache.clear();
+    }
+
+    /// Cache the given node, which should lie along the path from the root node to
+    /// the node containing voxel (x, y, z).
+    /// @note This operation is not mutex-protected and is intended to be called
+    /// only by nodes and only in the context of a getValue() or setValue() call.
+    template<typename NodeType>
+    void insert(const Coord& xyz, NodeType* node) { mCache.insert(xyz, node); }
+
+    // Define a list of all tree node types from LeafNode to RootNode
+    typedef typename RootNodeT::NodeChainType InvTreeT;
+    // Remove all tree node types that are excluded from the cache
+    typedef typename boost::mpl::begin<InvTreeT>::type BeginT;
+    typedef typename boost::mpl::advance<BeginT,boost::mpl::int_<CacheLevels> >::type FirstT;
+    typedef typename boost::mpl::find<InvTreeT, RootNodeT>::type LastT;
+    typedef typename boost::mpl::erase<InvTreeT,FirstT,LastT>::type SubtreeT;
+    typedef CacheItem<ValueAccessor, SubtreeT, boost::mpl::size<SubtreeT>::value==1> CacheItemT;
+
+    // Private member data
+    mutable CacheItemT mCache;
+    mutable MutexType  mMutex;
+
+}; // class ValueAccessor
+
+
+/// @brief Specialization of ValueAccessor for tbb::null_mutex and zero cache levels
+/// @details It exists mainly for benchmarking; an accessor that caches nodes
+/// will almost always be faster.
+template<typename TreeType, bool IsSafe>
+class ValueAccessor<TreeType, IsSafe, 0, tbb::null_mutex>: public ValueAccessor0<TreeType, IsSafe>
+{
+    typedef ValueAccessor0<TreeType, IsSafe> FixedDepthT;
+public:
+    ValueAccessor(TreeType& tree): FixedDepthT(tree) {}
+    ValueAccessor(const ValueAccessor& other): FixedDepthT(other) {}
+    virtual ~ValueAccessor() {}
+};
+
+
+/// Specialization of ValueAccessor for tbb::null_mutex and a single cache level
+template<typename TreeType, bool IsSafe>
+class ValueAccessor<TreeType, IsSafe, 1, tbb::null_mutex>: public ValueAccessor1<TreeType, IsSafe>
+{
+    typedef ValueAccessor1<TreeType, IsSafe> FixedDepthT;
+public:
+    ValueAccessor(TreeType& tree): FixedDepthT(tree) {}
+    ValueAccessor(const ValueAccessor& other): FixedDepthT(other) {}
+    virtual ~ValueAccessor() {}
+};
+
+
+/// Specialization of ValueAccessor for tbb::null_mutex and two cache levels
+template<typename TreeType, bool IsSafe>
+class ValueAccessor<TreeType, IsSafe, 2, tbb::null_mutex>: public ValueAccessor2<TreeType, IsSafe>
+{
+    typedef ValueAccessor2<TreeType, IsSafe> FixedDepthT;
+public:
+    ValueAccessor(TreeType& tree): FixedDepthT(tree) {}
+    ValueAccessor(const ValueAccessor& other): FixedDepthT(other) {}
+    virtual ~ValueAccessor() {}
+};
+
+
+/// Specialization of ValueAccessor for tbb::null_mutex and three cache levels
+template<typename TreeType, bool IsSafe>
+class ValueAccessor<TreeType, IsSafe, 3, tbb::null_mutex>: public ValueAccessor3<TreeType, IsSafe>
+{
+    typedef ValueAccessor3<TreeType, IsSafe> FixedDepthT;
+public:
+    ValueAccessor(TreeType& tree): FixedDepthT(tree) {}
+    ValueAccessor(const ValueAccessor& other): FixedDepthT(other) {}
+    virtual ~ValueAccessor() {}
+};
+
+
+////////////////////////////////////////
+
+
+/// @brief An accessor that is thread-safe (at the cost of speed) for both reading
+/// from and writing to a tree: a single ValueAccessorRW may be shared by
+/// multiple threads.
+///
+/// @warning The mutex locking employed by ValueAccessorRW can seriously
+/// impair the performance of multithreaded applications.  It is
+/// recommended that each thread instead be assigned its own
+/// (non-mutex-protected) accessor.
+template<typename TreeType, bool IsSafe = true>
+class ValueAccessorRW: public ValueAccessor<TreeType, IsSafe, TreeType::DEPTH-1, tbb::spin_mutex>
+{
+    typedef ValueAccessor<TreeType, IsSafe, TreeType::DEPTH-1, tbb::spin_mutex> LockedAccessorT;
+
+public:
+    /// Construct a spin-mutex-protected accessor that caches all non-root levels of @a tree.
+    ValueAccessorRW(TreeType& tree): LockedAccessorT(tree) {}
+};
+
+
+////////////////////////////////////////
+
+
+//
+// The classes below are for internal use and should rarely be used directly.
+//
+
+// An element of a compile-time linked list of node pointers, ordered from LeafNode to RootNode
+template<typename TreeCacheT, typename NodeVecT, bool AtRoot>
+class CacheItem
+{
+public:
+    typedef typename boost::mpl::front<NodeVecT>::type NodeType;
+    typedef typename NodeType::ValueType               ValueType;
+    typedef typename NodeType::LeafNodeType            LeafNodeType;
+    typedef std::numeric_limits<Int32>                 CoordLimits;
+
+    CacheItem(TreeCacheT& parent)
+        : mParent(&parent)
+        , mHash(CoordLimits::max())
+        , mNode(NULL) // nothing is cached at this level yet
+        , mNext(parent) // the remaining levels share the same owner
+    {
+    }
+
+    //@{
+    /// Copy another CacheItem's node pointers and hash keys, but not its parent pointer.
+    CacheItem(TreeCacheT& parent, const CacheItem& other)
+        : mParent(&parent) // bound to the new owner, not to other's
+        , mHash(other.mHash)
+        , mNode(other.mNode)
+        , mNext(parent, other.mNext)
+    {
+    }
+
+    CacheItem& copy(TreeCacheT& parent, const CacheItem& other)
+    {
+        mNode = other.mNode;
+        mHash = other.mHash;
+        mParent = &parent; // rebind to the new owner rather than adopting other's
+        mNext.copy(parent, other.mNext); // repeat for the rest of the chain
+        return *this;
+    }
+    //@}
+
+    bool isCached(const Coord& xyz) const
+    {
+        return this->isHashed(xyz) ? true : mNext.isCached(xyz); // stop at the first hit
+    }
+
+    /// Store @a node at this level; a null @a node resets the key to Coord::max().
+    void insert(const Coord& xyz, const NodeType* node)
+    {
+        mNode = node;
+        if (node == NULL) mHash = Coord::max(); else mHash = xyz & ~(NodeType::DIM-1);
+    }
+    /// Pass a node of any other type along to the next level of the cache.
+    template<typename OtherNodeType>
+    void insert(const Coord& xyz, const OtherNodeType* node) { mNext.insert(xyz, node); }
+
+    /// Drop the node cached at this level.
+    void erase(const NodeType*) { mNode = NULL; mHash = Coord::max(); }
+    /// Pass a request to drop a node of any other type along to the next level of the cache.
+    template<typename OtherNodeType>
+    void erase(const OtherNodeType* node) { mNext.erase(node); }
+
+    /// Empty this level of the cache and every level after it.
+    void clear() { mNext.clear(); mNode = NULL; mHash = Coord::max(); }
+
+    /// Return, via @a node, the node (if any) cached at this level.
+    void getNode(const NodeType*& node) const { node = mNode; }
+    void getNode(const NodeType*& node) { node = mNode; }
+    void getNode(NodeType*& node)
+    {
+        // mNode is stored as a pointer to const.  The static assertion restricts this
+        // overload to accessors on non-const trees, which is what justifies the const_cast.
+        BOOST_STATIC_ASSERT(!TreeCacheT::IsConstTree);
+        node = const_cast<NodeType*>(mNode);
+    }
+    /// Forward a request for a node of any other type to the next level of the cache.
+    template<typename OtherNodeType>
+    void getNode(OtherNodeType*& node) { mNext.getNode(node); }
+
+    /// Look up the value of the voxel at @a xyz, starting from this level of the cache.
+    const ValueType& getValue(const Coord& xyz)
+    {
+        // Miss: xyz is not covered by the node cached here, so defer to the next level.
+        if (!this->isHashed(xyz)) return mNext.getValue(xyz);
+        // Hit: continue the lookup from the cached node.
+        assert(mNode);
+        return mNode->getValueAndCache(xyz, *mParent);
+    }
+
+    void addLeaf(LeafNodeType* leaf)
+    {
+        BOOST_STATIC_ASSERT(!TreeCacheT::IsConstTree);
+        // Nothing to do at the leaf level of the cache.
+        if (NodeType::LEVEL == 0) return;
+        // Miss: the leaf's origin is not covered by the node cached here.
+        if (!this->isHashed(leaf->origin())) { mNext.addLeaf(leaf); return; }
+        assert(mNode);
+        const_cast<NodeType*>(mNode)->addLeafAndCache(leaf, *mParent);
+    }
+
+    void addTile(Index level, const Coord& xyz, const ValueType& value, bool state)
+    {
+        BOOST_STATIC_ASSERT(!TreeCacheT::IsConstTree);
+        if (NodeType::LEVEL < level) return; // this cache level is below the requested level
+        if (!this->isHashed(xyz)) {
+            mNext.addTile(level, xyz, value, state); // miss: try the next cache level
+            return;
+        }
+        assert(mNode);
+        const_cast<NodeType*>(mNode)->addTileAndCache(level, xyz, value, state, *mParent);
+    }
+
+    LeafNodeType* touchLeaf(const Coord& xyz)
+    {
+        BOOST_STATIC_ASSERT(!TreeCacheT::IsConstTree);
+        // Miss: xyz is not covered by the node cached here, so defer to the next level.
+        if (!this->isHashed(xyz)) return mNext.touchLeaf(xyz);
+        // Hit: continue from the cached node.
+        assert(mNode);
+        return const_cast<NodeType*>(mNode)->touchLeafAndCache(xyz, *mParent);
+    }
+
+    LeafNodeType* probeLeaf(const Coord& xyz)
+    {
+        BOOST_STATIC_ASSERT(!TreeCacheT::IsConstTree);
+        // Miss: xyz is not covered by the node cached here, so defer to the next level.
+        if (!this->isHashed(xyz)) return mNext.probeLeaf(xyz);
+        // Hit: continue from the cached node.
+        assert(mNode);
+        return const_cast<NodeType*>(mNode)->probeLeafAndCache(xyz, *mParent);
+    }
+
+    const LeafNodeType* probeConstLeaf(const Coord& xyz)
+    {
+        // Miss: xyz is not covered by the node cached here, so defer to the next level.
+        if (!this->isHashed(xyz)) return mNext.probeConstLeaf(xyz);
+        // Hit: continue from the cached node.
+        assert(mNode);
+        return mNode->probeConstLeafAndCache(xyz, *mParent);
+    }
+
+    /// Return the node of type @a NodeT containing @a xyz, starting from this cache level.
+    template<typename NodeT>
+    NodeT* probeNode(const Coord& xyz)
+    {
+        BOOST_STATIC_ASSERT(!TreeCacheT::IsConstTree);
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if (this->isHashed(xyz)) {
+            assert(mNode); // checked before either branch; the second one dereferences mNode
+            if ((boost::is_same<NodeT, NodeType>::value))
+                return reinterpret_cast<NodeT*>(const_cast<NodeType*>(mNode));
+            return const_cast<NodeType*>(mNode)->template probeNodeAndCache<NodeT>(xyz, *mParent);
+        }
+        return mNext.template probeNode<NodeT>(xyz);
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+
+    /// Return the const node of type @a NodeT containing @a xyz, starting from this cache level.
+    template<typename NodeT>
+    const NodeT* probeConstNode(const Coord& xyz)
+    {
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if (this->isHashed(xyz)) {
+            assert(mNode); // checked before either branch; the second one dereferences mNode
+            if ((boost::is_same<NodeT, NodeType>::value))
+                return reinterpret_cast<const NodeT*>(mNode);
+            return mNode->template probeConstNodeAndCache<NodeT>(xyz, *mParent);
+        }
+        return mNext.template probeConstNode<NodeT>(xyz);
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+
+    /// Look up the active state of the voxel at @a xyz, starting from this cache level.
+    bool isValueOn(const Coord& xyz)
+    {
+        // Miss: xyz is not covered by the node cached here, so defer to the next level.
+        if (!this->isHashed(xyz)) return mNext.isValueOn(xyz);
+        // Hit: continue from the cached node.
+        assert(mNode);
+        return mNode->isValueOnAndCache(xyz, *mParent);
+    }
+
+    /// Look up both the active state (returned) and the value (via @a value) of the voxel.
+    bool probeValue(const Coord& xyz, ValueType& value)
+    {
+        // Miss: xyz is not covered by the node cached here, so defer to the next level.
+        if (!this->isHashed(xyz)) return mNext.probeValue(xyz, value);
+        // Hit: continue from the cached node.
+        assert(mNode);
+        return mNode->probeValueAndCache(xyz, value, *mParent);
+    }
+
+     int getValueDepth(const Coord& xyz)
+    {
+        if (!this->isHashed(xyz)) return mNext.getValueDepth(xyz); // miss: ask the next level
+        assert(mNode);
+        // Convert the node level reported for xyz into a depth measured from the root.
+        const int rootLevel = static_cast<int>(TreeCacheT::RootNodeT::LEVEL);
+        const int valueLevel =
+            static_cast<int>(mNode->getValueLevelAndCache(xyz, *mParent));
+        return rootLevel - valueLevel;
+    }
+
+    bool isVoxel(const Coord& xyz)
+    {
+        // Miss: xyz is not covered by the node cached here, so defer to the next level.
+        if (!this->isHashed(xyz)) return mNext.isVoxel(xyz);
+        assert(mNode);
+        // Hit: the value is a voxel exactly when the level reported for it
+        // by the cached node is zero.
+        return 0 == mNode->getValueLevelAndCache(xyz, *mParent);
+    }
+
+    /// Assign @a value to the voxel at @a xyz and mark that voxel as active.
+    void setValue(const Coord& xyz, const ValueType& value)
+    {
+        BOOST_STATIC_ASSERT(!TreeCacheT::IsConstTree);
+        if (!this->isHashed(xyz)) {
+            mNext.setValue(xyz, value); // miss: try the next cache level
+            return;
+        }
+        assert(mNode);
+        const_cast<NodeType*>(mNode)->setValueAndCache(xyz, value, *mParent);
+    }
+    void setValueOnly(const Coord& xyz, const ValueType& value)
+    {
+        BOOST_STATIC_ASSERT(!TreeCacheT::IsConstTree);
+        if (!this->isHashed(xyz)) {
+            mNext.setValueOnly(xyz, value); // miss: try the next cache level
+            return;
+        }
+        assert(mNode);
+        const_cast<NodeType*>(mNode)->setValueOnlyAndCache(xyz, value, *mParent);
+    }
+    void setValueOn(const Coord& xyz, const ValueType& value) { this->setValue(xyz, value); } // same as setValue()
+
+    /// @brief Apply the functor @a op to the value of the voxel at @a xyz
+    /// and mark that voxel as active.
+    /// @details See Tree::modifyValue() for details.
+    template<typename ModifyOp>
+    void modifyValue(const Coord& xyz, const ModifyOp& op)
+    {
+        BOOST_STATIC_ASSERT(!TreeCacheT::IsConstTree);
+        if (!this->isHashed(xyz)) {
+            mNext.modifyValue(xyz, op); // miss: try the next cache level
+            return;
+        }
+        assert(mNode);
+        const_cast<NodeType*>(mNode)->modifyValueAndCache(xyz, op, *mParent);
+    }
+
+    /// @brief Apply the functor @a op to the voxel at @a xyz.
+    /// @details See Tree::modifyValueAndActiveState() for details.
+    template<typename ModifyOp>
+    void modifyValueAndActiveState(const Coord& xyz, const ModifyOp& op)
+    {
+        BOOST_STATIC_ASSERT(!TreeCacheT::IsConstTree);
+        if (!this->isHashed(xyz)) {
+            mNext.modifyValueAndActiveState(xyz, op); // miss: try the next cache level
+            return;
+        }
+        assert(mNode);
+        const_cast<NodeType*>(mNode)->modifyValueAndActiveStateAndCache(xyz, op, *mParent);
+    }
+
+    /// Set the value of the voxel at the given coordinates and mark the voxel as inactive.
+    /// A miss at this cache level hands the call on to the next cache item.
+    void setValueOff(const Coord& xyz, const ValueType& value)
+    {
+        BOOST_STATIC_ASSERT(!TreeCacheT::IsConstTree); // writing needs a non-const tree
+        if (!this->isHashed(xyz)) { mNext.setValueOff(xyz, value); return; }
+        // Hit: the cached node performs the update.
+        assert(mNode);
+        NodeType* const node = const_cast<NodeType*>(mNode);
+        node->setValueOffAndCache(xyz, value, *mParent);
+    }
+
+    /// Set the active state of the voxel at the given coordinates.
+    /// A miss at this cache level hands the call on to the next cache item.
+    void setActiveState(const Coord& xyz, bool on)
+    {
+        BOOST_STATIC_ASSERT(!TreeCacheT::IsConstTree); // writing needs a non-const tree
+        if (!this->isHashed(xyz)) { mNext.setActiveState(xyz, on); return; }
+        // Hit: the cached node performs the update.
+        assert(mNode);
+        NodeType* const node = const_cast<NodeType*>(mNode);
+        node->setActiveStateAndCache(xyz, on, *mParent);
+    }
+
+private:
+    CacheItem(const CacheItem&);
+    CacheItem& operator=(const CacheItem&);
+
+    bool isHashed(const Coord& xyz) const
+    {
+        const Coord::ValueType mask = ~Coord::ValueType(NodeType::DIM-1); // applied per component
+        return (xyz[0] & mask) == mHash[0]
+            && (xyz[1] & mask) == mHash[1] && (xyz[2] & mask) == mHash[2];
+    }
+
+    TreeCacheT* mParent;
+    Coord mHash;
+    const NodeType* mNode;
+    typedef typename boost::mpl::pop_front<NodeVecT>::type RestT; // NodeVecT minus its first item
+    CacheItem<TreeCacheT, RestT, /*AtRoot=*/boost::mpl::size<RestT>::value == 1> mNext;
+};// end of CacheItem
+
+
+/// The tail of a compile-time list of cached node pointers, ordered from LeafNode to RootNode.
+template<typename TreeCacheT, typename NodeVecT>
+class CacheItem<TreeCacheT, NodeVecT, /*AtRoot=*/true>
+{
+public:
+    typedef typename boost::mpl::front<NodeVecT>::type RootNodeType;
+    typedef typename RootNodeType::ValueType           ValueType;
+    typedef typename RootNodeType::LeafNodeType        LeafNodeType;
+    // This AtRoot=true specialization stores only a root-node pointer (NULL until one is set).
+    CacheItem(TreeCacheT& parent): mParent(&parent), mRoot(NULL) {}
+    CacheItem(TreeCacheT& parent, const CacheItem& other): mParent(&parent), mRoot(other.mRoot) {}
+    /// Re-parent this item to @a parent and take over @a other's root pointer.
+    CacheItem& copy(TreeCacheT& parent, const CacheItem& other)
+    {
+        mParent = &parent;
+        mRoot = other.mRoot;
+        return *this;
+    }
+    /// Always @c false, because isHashed() in this specialization returns false.
+    bool isCached(const Coord& xyz) const { return this->isHashed(xyz); }
+    /// Store @a root as the cached root node; the coordinate argument is ignored.
+    void insert(const Coord&, const RootNodeType* root) { mRoot = root; }
+
+    // Needed for node types that are not cached
+    template <typename OtherNodeType>
+    void insert(const Coord&, const OtherNodeType*) {}
+    /// Drop the cached root pointer (the argument itself is unused).
+    void erase(const RootNodeType*) { mRoot = NULL; }
+    /// Drop the cached root pointer.
+    void clear() { mRoot = NULL; }
+    /// Hand back the stored root pointer; the non-const overload needs a non-const tree.
+    void getNode(RootNodeType*& node)
+    {
+        BOOST_STATIC_ASSERT(!TreeCacheT::IsConstTree);
+        node = const_cast<RootNodeType*>(mRoot);
+    }
+    void getNode(const RootNodeType*& node) const { node = mRoot; }
+    // The methods below assert that a root is cached, then forward to it with *mParent.
+    void addLeaf(LeafNodeType* leaf)
+    {
+        assert(mRoot);
+        BOOST_STATIC_ASSERT(!TreeCacheT::IsConstTree);
+        const_cast<RootNodeType*>(mRoot)->addLeafAndCache(leaf, *mParent);
+    }
+
+    void addTile(Index level, const Coord& xyz, const ValueType& value, bool state)
+    {
+        assert(mRoot);
+        BOOST_STATIC_ASSERT(!TreeCacheT::IsConstTree);
+        const_cast<RootNodeType*>(mRoot)->addTileAndCache(level, xyz, value, state, *mParent);
+    }
+
+    LeafNodeType* touchLeaf(const Coord& xyz)
+    {
+        assert(mRoot);
+        BOOST_STATIC_ASSERT(!TreeCacheT::IsConstTree);
+        return const_cast<RootNodeType*>(mRoot)->touchLeafAndCache(xyz, *mParent);
+    }
+
+    LeafNodeType* probeLeaf(const Coord& xyz)
+    {
+        assert(mRoot);
+        BOOST_STATIC_ASSERT(!TreeCacheT::IsConstTree);
+        return const_cast<RootNodeType*>(mRoot)->probeLeafAndCache(xyz, *mParent);
+    }
+
+    const LeafNodeType* probeConstLeaf(const Coord& xyz)
+    {
+        assert(mRoot);
+        return mRoot->probeConstLeafAndCache(xyz, *mParent);
+    }
+
+    template<typename NodeType>
+    NodeType* probeNode(const Coord& xyz)
+    {
+        assert(mRoot);
+        BOOST_STATIC_ASSERT(!TreeCacheT::IsConstTree);
+        return const_cast<RootNodeType*>(mRoot)->template probeNodeAndCache<NodeType>(xyz, *mParent);
+    }
+
+    template<typename NodeType>
+    const NodeType* probeConstNode(const Coord& xyz)
+    {
+        assert(mRoot);
+        return mRoot->template probeConstNodeAndCache<NodeType>(xyz, *mParent);
+    }
+
+    int getValueDepth(const Coord& xyz)
+    {
+        assert(mRoot);
+        return mRoot->getValueDepthAndCache(xyz, *mParent);
+    }
+    bool isValueOn(const Coord& xyz)
+    {
+        assert(mRoot);
+        return mRoot->isValueOnAndCache(xyz, *mParent);
+    }
+
+    bool probeValue(const Coord& xyz, ValueType& value)
+    {
+        assert(mRoot);
+        return mRoot->probeValueAndCache(xyz, value, *mParent);
+    }
+    bool isVoxel(const Coord& xyz)
+    {
+        assert(mRoot);
+        // A voxel (as opposed to a tile) is a value whose depth equals the root's LEVEL.
+        return mRoot->getValueDepthAndCache(xyz, *mParent) ==
+               static_cast<int>(RootNodeType::LEVEL);
+    }
+    const ValueType& getValue(const Coord& xyz)
+    {
+        assert(mRoot);
+        return mRoot->getValueAndCache(xyz, *mParent);
+    }
+
+    void setValue(const Coord& xyz, const ValueType& value)
+    {
+        assert(mRoot);
+        BOOST_STATIC_ASSERT(!TreeCacheT::IsConstTree);
+        const_cast<RootNodeType*>(mRoot)->setValueAndCache(xyz, value, *mParent);
+    }
+    void setValueOnly(const Coord& xyz, const ValueType& value)
+    {
+        assert(mRoot);
+        BOOST_STATIC_ASSERT(!TreeCacheT::IsConstTree);
+        const_cast<RootNodeType*>(mRoot)->setValueOnlyAndCache(xyz, value, *mParent);
+    }
+    void setValueOn(const Coord& xyz, const ValueType& value) { this->setValue(xyz, value); }
+
+    template<typename ModifyOp>
+    void modifyValue(const Coord& xyz, const ModifyOp& op)
+    {
+        assert(mRoot);
+        BOOST_STATIC_ASSERT(!TreeCacheT::IsConstTree);
+        const_cast<RootNodeType*>(mRoot)->modifyValueAndCache(xyz, op, *mParent);
+    }
+
+    template<typename ModifyOp>
+    void modifyValueAndActiveState(const Coord& xyz, const ModifyOp& op)
+    {
+        assert(mRoot);
+        BOOST_STATIC_ASSERT(!TreeCacheT::IsConstTree);
+        const_cast<RootNodeType*>(mRoot)->modifyValueAndActiveStateAndCache(xyz, op, *mParent);
+    }
+
+    void setValueOff(const Coord& xyz, const ValueType& value)
+    {
+        assert(mRoot);
+        BOOST_STATIC_ASSERT(!TreeCacheT::IsConstTree);
+        const_cast<RootNodeType*>(mRoot)->setValueOffAndCache(xyz, value, *mParent);
+    }
+
+    void setActiveState(const Coord& xyz, bool on)
+    {
+        assert(mRoot);
+        BOOST_STATIC_ASSERT(!TreeCacheT::IsConstTree);
+        const_cast<RootNodeType*>(mRoot)->setActiveStateAndCache(xyz, on, *mParent);
+    }
+
+private:
+    CacheItem(const CacheItem&); // private, no body visible here: presumably forbids copying
+    CacheItem& operator=(const CacheItem&);
+
+    bool isHashed(const Coord&) const { return false; } // no hash key is kept for the root
+
+    TreeCacheT* mParent; // passed along to every ...AndCache() call above
+    const RootNodeType* mRoot; // NULL when no root is cached
+};// end of CacheItem specialized for RootNode
+
+
+////////////////////////////////////////
+
+
+/// @brief ValueAccessor with no mutex and no node caching.
+/// @details This specialization is provided mainly for benchmarking.
+/// Accessors with caching will almost always be faster.
+template<typename _TreeType, bool IsSafe>
+class ValueAccessor0: public ValueAccessorBase<_TreeType, IsSafe>
+{
+public:
+    typedef _TreeType                           TreeType;
+    typedef typename TreeType::ValueType        ValueType;
+    typedef typename TreeType::RootNodeType     RootNodeT;
+    typedef typename TreeType::LeafNodeType     LeafNodeT;
+    typedef ValueAccessorBase<TreeType, IsSafe> BaseT;
+    /// Bind to @a tree; the tree is simply passed on to the ValueAccessorBase constructor.
+    ValueAccessor0(TreeType& tree): BaseT(tree) {}
+    /// Copy constructor; only the base-class part is copied (this class adds no data members).
+    ValueAccessor0(const ValueAccessor0& other): BaseT(other) {}
+
+    /// Return the number of cache levels employed by this accessor.
+    static Index numCacheLevels() { return 0; }
+    /// Assign the base-class part of @a other; self-assignment is a no-op.
+    ValueAccessor0& operator=(const ValueAccessor0& other)
+    {
+        if (&other != this) this->BaseT::operator=(other);
+        return *this;
+    }
+
+    virtual ~ValueAccessor0() {}
+
+    /// Always return @c false: this accessor never caches nodes along the path to a voxel.
+    bool isCached(const Coord&) const { return false; }
+
+    /// Return the value of the voxel at the given coordinates.
+    const ValueType& getValue(const Coord& xyz) const
+    {
+        assert(BaseT::mTree);
+        return BaseT::mTree->getValue(xyz);
+    }
+
+    /// Return the active state of the voxel at the given coordinates.
+    bool isValueOn(const Coord& xyz) const
+    {
+        assert(BaseT::mTree);
+        return BaseT::mTree->isValueOn(xyz);
+    }
+
+    /// Return the active state and, in @a value, the value of the voxel at the given coordinates.
+    bool probeValue(const Coord& xyz, ValueType& value) const
+    {
+        assert(BaseT::mTree);
+        return BaseT::mTree->probeValue(xyz, value);
+    }
+
+    /// Return the tree depth (0 = root) at which the value of voxel (x, y, z) resides,
+    /// or -1 if (x, y, z) isn't explicitly represented in the tree (i.e., if it is
+    /// implicitly a background voxel).
+    int getValueDepth(const Coord& xyz) const
+    {
+        assert(BaseT::mTree);
+        return BaseT::mTree->getValueDepth(xyz);
+    }
+
+    /// Return @c true if the value of voxel (x, y, z) resides at the leaf level
+    /// of the tree, i.e., if it is not a tile value.
+    bool isVoxel(const Coord& xyz) const
+    {
+        assert(BaseT::mTree);
+        return BaseT::mTree->getValueDepth(xyz) == static_cast<int>(RootNodeT::LEVEL);
+    }
+
+    //@{
+    /// Set the value of the voxel at the given coordinates and mark the voxel as active.
+    void setValue(const Coord& xyz, const ValueType& value)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        BaseT::mTree->setValue(xyz, value);
+    }
+    void setValueOn(const Coord& xyz, const ValueType& value) { this->setValue(xyz, value); }
+    //@}
+
+    /// Set the value of the voxel at the given coordinate but don't change its active state.
+    void setValueOnly(const Coord& xyz, const ValueType& value)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        BaseT::mTree->setValueOnly(xyz, value);
+    }
+
+    /// Set the value of the voxel at the given coordinates and mark the voxel as inactive.
+    void setValueOff(const Coord& xyz, const ValueType& value)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        BaseT::mTree->root().setValueOff(xyz, value);
+    }
+
+    /// @brief Apply a functor to the value of the voxel at the given coordinates
+    /// and mark the voxel as active.
+    /// @details See Tree::modifyValue() for details.
+    template<typename ModifyOp>
+    void modifyValue(const Coord& xyz, const ModifyOp& op)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        BaseT::mTree->modifyValue(xyz, op);
+    }
+
+    /// @brief Apply a functor to the voxel at the given coordinates.
+    /// @details See Tree::modifyValueAndActiveState() for details.
+    template<typename ModifyOp>
+    void modifyValueAndActiveState(const Coord& xyz, const ModifyOp& op)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        BaseT::mTree->modifyValueAndActiveState(xyz, op);
+    }
+
+    /// Set the active state of the voxel at the given coordinates but don't change its value.
+    void setActiveState(const Coord& xyz, bool on = true)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        BaseT::mTree->setActiveState(xyz, on);
+    }
+    /// Mark the voxel at the given coordinates as active but don't change its value.
+    void setValueOn(const Coord& xyz) { this->setActiveState(xyz, true); }
+    /// Mark the voxel at the given coordinates as inactive but don't change its value.
+    void setValueOff(const Coord& xyz) { this->setActiveState(xyz, false); }
+
+    /// Always return NULL, as no node of type @a NodeT is ever cached.  [Mainly for internal use]
+    template<typename NodeT> NodeT* getNode() { return NULL; }
+
+    /// No-op, since this accessor has no cache.  In caching accessors this stores a node lying
+    /// on the path from the root to the node containing voxel (x, y, z).  [Mainly for internal use]
+    template<typename NodeT> void insertNode(const Coord&, NodeT&) {}
+
+    /// @brief Add the specified leaf to this tree, possibly creating a child branch
+    /// in the process.  If the leaf node already exists, replace it.
+    void addLeaf(LeafNodeT* leaf)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        BaseT::mTree->root().addLeaf(leaf);
+    }
+
+    /// @brief Add a tile at the specified tree level that contains voxel (x, y, z),
+    /// possibly deleting existing nodes or creating new nodes in the process.
+    void addTile(Index level, const Coord& xyz, const ValueType& value, bool state)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        BaseT::mTree->root().addTile(level, xyz, value, state);
+    }
+
+    /// No-op here, since nothing is ever cached.  In caching accessors: if a node of the
+    /// given type exists in the cache, remove it, so that isCached(xyz) returns @c false
+    /// for any voxel (x, y, z) contained in that node.  [Mainly for internal use]
+    template<typename NodeT> void eraseNode() {}
+    /// Forward to the tree's touchLeaf(); requires a non-const tree.
+    LeafNodeT* touchLeaf(const Coord& xyz)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        return BaseT::mTree->touchLeaf(xyz);
+    }
+    /// Forward to the tree's probeNode<NodeT>(); requires a non-const tree.
+    template <typename NodeT>
+    NodeT* probeNode(const Coord& xyz)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        return BaseT::mTree->template probeNode<NodeT>(xyz);
+    }
+    /// Forward to the tree's probeConstNode<NodeT>().
+    template <typename NodeT>
+    const NodeT* probeConstNode(const Coord& xyz) const
+    {
+        assert(BaseT::mTree);
+        return BaseT::mTree->template probeConstNode<NodeT>(xyz);
+    }
+    /// Shorthand for probeNode<LeafNodeT>().
+    LeafNodeT* probeLeaf(const Coord& xyz)
+    {
+        return this->template probeNode<LeafNodeT>(xyz);
+    }
+    /// Shorthand for probeConstNode<LeafNodeT>().
+    const LeafNodeT* probeConstLeaf(const Coord& xyz) const
+    {
+        return this->template probeConstNode<LeafNodeT>(xyz);
+    }
+    /// Const overload of probeLeaf(); forwards to probeConstLeaf().
+    const LeafNodeT* probeLeaf(const Coord& xyz) const
+    {
+        return this->probeConstLeaf(xyz);
+    }
+
+    /// No-op: this accessor has no cached nodes to remove.
+    virtual void clear() {}
+
+private:
+    // Allow trees to deregister themselves.
+    template<typename> friend class Tree;
+
+    /// Prevent this accessor from calling Tree::releaseCache() on a tree that
+    /// no longer exists.  (Called by mTree when it is destroyed.)
+    virtual void release() { this->BaseT::release(); }
+
+}; // ValueAccessor0
+
+
+/// @brief Value accessor with one level of node caching.
+/// @details The node cache level is specified by L0 with the default value 0
+/// (defined in the forward declaration) corresponding to a LeafNode.
+///
+/// @note This class is for experts only and should rarely be used
+/// directly. Instead use ValueAccessor with its default template arguments.
+template<typename _TreeType, bool IsSafe, Index L0>
+class ValueAccessor1 : public ValueAccessorBase<_TreeType, IsSafe>
+{
+public:
+    BOOST_STATIC_ASSERT(_TreeType::DEPTH >= 2);
+    BOOST_STATIC_ASSERT( L0 < _TreeType::RootNodeType::LEVEL );
+    typedef _TreeType                           TreeType;
+    typedef typename TreeType::ValueType        ValueType;
+    typedef typename TreeType::RootNodeType     RootNodeT;
+    typedef typename TreeType::LeafNodeType     LeafNodeT;
+    typedef ValueAccessorBase<TreeType, IsSafe> BaseT;
+    typedef typename RootNodeT::NodeChainType   InvTreeT;
+    typedef typename boost::mpl::at<InvTreeT, boost::mpl::int_<L0> >::type NodeT0;
+
+    /// Constructor from a tree; the cache starts out empty (key Coord::max(), node NULL)
+    ValueAccessor1(TreeType& tree) : BaseT(tree), mKey0(Coord::max()), mNode0(NULL)
+    {
+    }
+
+    /// Copy constructor: copies the base class, then the cached key and node via copy()
+    ValueAccessor1(const ValueAccessor1& other) : BaseT(other) { this->copy(other); }
+
+    /// Return the number of cache levels employed by this ValueAccessor
+    static Index numCacheLevels() { return 1; }
+
+    /// Assignment operator
+    ValueAccessor1& operator=(const ValueAccessor1& other)
+    {
+        if (&other != this) {
+            this->BaseT::operator=(other);
+            this->copy(other);
+        }
+        return *this;
+    }
+
+    /// Virtual destructor
+    virtual ~ValueAccessor1() {}
+
+    /// Return @c true if any of the nodes along the path to the given
+    /// voxel have been cached.
+    bool isCached(const Coord& xyz) const
+    {
+        assert(BaseT::mTree);
+        return this->isHashed(xyz);
+    }
+
+    /// Return the value of the voxel at the given coordinates.
+    const ValueType& getValue(const Coord& xyz) const
+    {
+        assert(BaseT::mTree);
+        if (this->isHashed(xyz)) {
+            assert(mNode0);
+            return mNode0->getValueAndCache(xyz, this->self());
+        }
+        return BaseT::mTree->root().getValueAndCache(xyz, this->self());
+    }
+
+    /// Return the active state of the voxel at the given coordinates.
+    bool isValueOn(const Coord& xyz) const
+    {
+        assert(BaseT::mTree);
+        if (this->isHashed(xyz)) {
+            assert(mNode0);
+            return mNode0->isValueOnAndCache(xyz, this->self());
+        }
+        return BaseT::mTree->root().isValueOnAndCache(xyz, this->self());
+    }
+
+    /// Return the active state of the voxel as well as its value
+    bool probeValue(const Coord& xyz, ValueType& value) const
+    {
+        assert(BaseT::mTree);
+        if (this->isHashed(xyz)) {
+            assert(mNode0);
+            return mNode0->probeValueAndCache(xyz, value, this->self());
+        }
+        return BaseT::mTree->root().probeValueAndCache(xyz, value, this->self());
+    }
+
+    /// Return the tree depth (0 = root) at which the value of voxel (x, y, z) resides,
+    /// or -1 if (x, y, z) isn't explicitly represented in the tree (i.e., if it is
+    /// implicitly a background voxel).
+    int getValueDepth(const Coord& xyz) const
+    {
+        assert(BaseT::mTree);
+        if (this->isHashed(xyz)) {
+            assert(mNode0);
+            return RootNodeT::LEVEL - mNode0->getValueLevelAndCache(xyz, this->self());
+        }
+        return BaseT::mTree->root().getValueDepthAndCache(xyz, this->self());
+    }
+
+    /// Return @c true if the value of voxel (x, y, z) resides at the leaf level
+    /// of the tree, i.e., if it is not a tile value.
+    bool isVoxel(const Coord& xyz) const
+    {
+        assert(BaseT::mTree);
+        if (this->isHashed(xyz)) {
+            assert(mNode0);
+            return mNode0->getValueLevelAndCache(xyz, this->self()) == 0;
+        }
+        return BaseT::mTree->root().getValueDepthAndCache(xyz, this->self()) ==
+               static_cast<int>(RootNodeT::LEVEL);
+    }
+
+    //@{
+    /// Set the value of the voxel at the given coordinates and mark the voxel as active.
+    void setValue(const Coord& xyz, const ValueType& value)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        if (this->isHashed(xyz)) {
+            assert(mNode0);
+            const_cast<NodeT0*>(mNode0)->setValueAndCache(xyz, value, *this);
+        } else {
+            BaseT::mTree->root().setValueAndCache(xyz, value, *this);
+        }
+    }
+    void setValueOn(const Coord& xyz, const ValueType& value) { this->setValue(xyz, value); }
+    //@}
+
+    /// Set the value of the voxel at the given coordinate but preserve its active state.
+    void setValueOnly(const Coord& xyz, const ValueType& value)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        if (this->isHashed(xyz)) {
+            assert(mNode0);
+            const_cast<NodeT0*>(mNode0)->setValueOnlyAndCache(xyz, value, *this);
+        } else {
+            BaseT::mTree->root().setValueOnlyAndCache(xyz, value, *this);
+        }
+    }
+
+    /// Set the value of the voxel at the given coordinates and mark the voxel as inactive.
+    void setValueOff(const Coord& xyz, const ValueType& value)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        if (this->isHashed(xyz)) {
+            assert(mNode0);
+            const_cast<NodeT0*>(mNode0)->setValueOffAndCache(xyz, value, *this);
+        } else {
+            BaseT::mTree->root().setValueOffAndCache(xyz, value, *this);
+        }
+    }
+
+    /// @brief Apply a functor to the value of the voxel at the given coordinates
+    /// and mark the voxel as active.
+    /// @details See Tree::modifyValue() for details.
+    template<typename ModifyOp>
+    void modifyValue(const Coord& xyz, const ModifyOp& op)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        if (this->isHashed(xyz)) {
+            assert(mNode0);
+            const_cast<NodeT0*>(mNode0)->modifyValueAndCache(xyz, op, *this);
+        } else {
+            BaseT::mTree->root().modifyValueAndCache(xyz, op, *this);
+        }
+    }
+
+    /// @brief Apply a functor to the voxel at the given coordinates.
+    /// @details See Tree::modifyValueAndActiveState() for details.
+    template<typename ModifyOp>
+    void modifyValueAndActiveState(const Coord& xyz, const ModifyOp& op)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        if (this->isHashed(xyz)) {
+            assert(mNode0);
+            const_cast<NodeT0*>(mNode0)->modifyValueAndActiveStateAndCache(xyz, op, *this);
+        } else {
+            BaseT::mTree->root().modifyValueAndActiveStateAndCache(xyz, op, *this);
+        }
+    }
+
+    /// Set the active state of the voxel at the given coordinates but don't change its value.
+    void setActiveState(const Coord& xyz, bool on = true)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        if (this->isHashed(xyz)) {
+            assert(mNode0);
+            const_cast<NodeT0*>(mNode0)->setActiveStateAndCache(xyz, on, *this);
+        } else {
+            BaseT::mTree->root().setActiveStateAndCache(xyz, on, *this);
+        }
+    }
+    /// Mark the voxel at the given coordinates as active but don't change its value.
+    void setValueOn(const Coord& xyz) { this->setActiveState(xyz, true); }
+    /// Mark the voxel at the given coordinates as inactive but don't change its value.
+    void setValueOff(const Coord& xyz) { this->setActiveState(xyz, false); }
+
+    /// Return the cached node of type @a NodeT.  [Mainly for internal use]
+    template<typename NodeT>
+    NodeT* getNode()
+    {
+        const NodeT* node = NULL;
+        this->getNode(node);
+        return const_cast<NodeT*>(node);
+    }
+
+    /// Cache the given node, which should lie along the path from the root node to
+    /// the node containing voxel (x, y, z).  [Mainly for internal use]
+    template<typename NodeT>
+    void insertNode(const Coord& xyz, NodeT& node) { this->insert(xyz, &node); }
+
+    /// If a node of the given type exists in the cache, remove it, so that
+    /// isCached(xyz) returns @c false for any voxel (x, y, z) contained in
+    /// that node.  [Mainly for internal use]
+    template<typename NodeT>
+    void eraseNode()
+    {
+        const NodeT* node = NULL;
+        this->eraseNode(node);
+    }
+
+    /// @brief Add the specified leaf to this tree, possibly creating a child branch
+    /// in the process.  If the leaf node already exists, replace it.
+    void addLeaf(LeafNodeT* leaf)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        BaseT::mTree->root().addLeaf(leaf);
+    }
+
+    /// @brief Add a tile at the specified tree level that contains voxel (x, y, z),
+    /// possibly deleting existing nodes or creating new nodes in the process.
+    void addTile(Index level, const Coord& xyz, const ValueType& value, bool state)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        BaseT::mTree->root().addTile(level, xyz, value, state);
+    }
+
+    /// @brief @return the leaf node that contains voxel (x, y, z) and
+    /// if it doesn't exist, create it, but preserve the values and
+    /// active states of all voxels.
+    ///
+    /// Use this method to preallocate a static tree topology over which to
+    /// safely perform multithreaded processing.
+    LeafNodeT* touchLeaf(const Coord& xyz)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        if (this->isHashed(xyz)) {
+            assert(mNode0);
+            return const_cast<NodeT0*>(mNode0)->touchLeafAndCache(xyz, *this);
+        }
+        return BaseT::mTree->root().touchLeafAndCache(xyz, *this);
+    }
+
+    /// @brief @return a pointer to the node of the specified type that contains voxel (x, y, z),
+    /// or NULL if it doesn't exist or if @a NodeT is not the cached node type NodeT0.
+    template <typename NodeT>
+    NodeT* probeNode(const Coord& xyz)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if ((boost::is_same<NodeT, NodeT0>::value)) {
+            if (this->isHashed(xyz)) {
+                assert(mNode0);
+                return reinterpret_cast<NodeT*>(const_cast<NodeT0*>(mNode0));
+            }
+            return BaseT::mTree->root().template probeNodeAndCache<NodeT>(xyz, *this);
+        }
+        return NULL;
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+    LeafNodeT* probeLeaf(const Coord& xyz)
+    {
+        return this->template probeNode<LeafNodeT>(xyz);
+    }
+
+    /// @brief @return a const pointer to the node of the specified type that contains voxel
+    /// (x, y, z), or NULL if it doesn't exist or if @a NodeT is not the cached type NodeT0.
+    template <typename NodeT>
+    const NodeT* probeConstNode(const Coord& xyz) const
+    {
+        assert(BaseT::mTree);
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if ((boost::is_same<NodeT, NodeT0>::value)) {
+            if (this->isHashed(xyz)) {
+                assert(mNode0);
+                return reinterpret_cast<const NodeT*>(mNode0);
+            }
+            return BaseT::mTree->root().template probeConstNodeAndCache<NodeT>(xyz, this->self());
+        }
+        return NULL;
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+    const LeafNodeT* probeConstLeaf(const Coord& xyz) const
+    {
+        return this->template probeConstNode<LeafNodeT>(xyz);
+    }
+    const LeafNodeT* probeLeaf(const Coord& xyz) const { return this->probeConstLeaf(xyz); }
+
+    /// Remove all the cached nodes and invalidate the corresponding hash-keys.
+    virtual void clear()
+    {
+        mKey0  = Coord::max();
+        mNode0 = NULL;
+    }
+
+private:
+    // Allow nodes to insert themselves into the cache.
+    template<typename> friend class RootNode;
+    template<typename, Index> friend class InternalNode;
+    template<typename, Index> friend class LeafNode;
+    // Allow trees to deregister themselves.
+    template<typename> friend class Tree;
+
+    // Convenience: casts away constness so that const methods can pass this accessor to nodes.
+    inline ValueAccessor1& self() const { return const_cast<ValueAccessor1&>(*this); }
+    // Overloads behind getNode<NodeT>(): cached node for NodeT0, tree root, NULL otherwise.
+    void getNode(const NodeT0*& node) { node = mNode0; }
+    void getNode(const RootNodeT*& node)
+    {
+        node = (BaseT::mTree ? &BaseT::mTree->root() : NULL);
+    }
+    template <typename OtherNodeType> void getNode(const OtherNodeType*& node) { node = NULL; }
+    void eraseNode(const NodeT0*) { mKey0 = Coord::max(); mNode0 = NULL; }
+    template <typename OtherNodeType> void eraseNode(const OtherNodeType*) {}
+
+    /// Private copy method: copies only the cached key and node pointer
+    inline void copy(const ValueAccessor1& other)
+    {
+        mKey0  = other.mKey0;
+        mNode0 = other.mNode0;
+    }
+
+    /// Prevent this accessor from calling Tree::releaseCache() on a tree that
+    /// no longer exists.  (Called by mTree when it is destroyed.)
+    virtual void release()
+    {
+        this->BaseT::release();
+        this->clear();
+    }
+    /// Cache the given node, which should lie along the path from the root node to
+    /// the node containing voxel (x, y, z).
+    /// @note This operation is not mutex-protected and is intended to be called
+    /// only by nodes and only in the context of a getValue() or setValue() call.
+    inline void insert(const Coord& xyz, const NodeT0* node)
+    {
+        assert(node);
+        mKey0  = xyz & ~(NodeT0::DIM-1);
+        mNode0 = node;
+    }
+
+    /// No-op in case a tree traversal attempts to insert a node that
+    /// is not cached by the ValueAccessor
+    template<typename OtherNodeType> inline void insert(const Coord&, const OtherNodeType*) {}
+    /// Return @c true if @a xyz, masked with ~(NodeT0::DIM-1), matches the cached key.
+    inline bool isHashed(const Coord& xyz) const
+    {
+        return (xyz[0] & ~Coord::ValueType(NodeT0::DIM-1)) == mKey0[0]
+            && (xyz[1] & ~Coord::ValueType(NodeT0::DIM-1)) == mKey0[1]
+            && (xyz[2] & ~Coord::ValueType(NodeT0::DIM-1)) == mKey0[2];
+    }
+    mutable Coord mKey0; // key of the cached node; Coord::max() when the cache is empty
+    mutable const NodeT0* mNode0; // cached node, or NULL
+}; // ValueAccessor1
+
+
+/// @brief Value accessor with two levels of node caching.
+/// @details The node cache levels are specified by L0 and L1
+/// with the default values 0 and 1 (defined in the forward declaration)
+/// corresponding to a LeafNode and its parent InternalNode.
+///
+/// @note This class is for experts only and should rarely be used directly.
+/// Instead use ValueAccessor with its default template arguments.
+template<typename _TreeType, bool IsSafe, Index L0, Index L1>
+class ValueAccessor2 : public ValueAccessorBase<_TreeType, IsSafe>
+{
+public:
+    BOOST_STATIC_ASSERT(_TreeType::DEPTH >= 3);
+    BOOST_STATIC_ASSERT( L0 < L1 && L1 < _TreeType::RootNodeType::LEVEL );
+    typedef _TreeType                           TreeType;
+    typedef typename TreeType::ValueType        ValueType;
+    typedef typename TreeType::RootNodeType     RootNodeT;
+    typedef typename TreeType::LeafNodeType     LeafNodeT;
+    typedef ValueAccessorBase<TreeType, IsSafe> BaseT;
+    typedef typename RootNodeT::NodeChainType   InvTreeT;
+    typedef typename boost::mpl::at<InvTreeT, boost::mpl::int_<L0> >::type NodeT0;
+    typedef typename boost::mpl::at<InvTreeT, boost::mpl::int_<L1> >::type NodeT1;
+
+    /// Constructor from a tree; both cache levels start empty (keys Coord::max(), nodes NULL)
+    ValueAccessor2(TreeType& tree) : BaseT(tree),
+                                     mKey0(Coord::max()), mNode0(NULL),
+                                     mKey1(Coord::max()), mNode1(NULL) {}
+
+    /// Copy constructor: copy-constructs the base class, then calls copy() with @a other
+    ValueAccessor2(const ValueAccessor2& other) : BaseT(other) { this->copy(other); }
+
+    /// Return the number of cache levels employed by this ValueAccessor (always 2)
+    static Index numCacheLevels() { return 2; }
+
+    /// Assignment operator.  Self-assignment is a no-op; otherwise the base-class
+    /// part is assigned first and copy() is then called with @a other.
+    ValueAccessor2& operator=(const ValueAccessor2& other)
+    {
+        if (this == &other) return *this;
+        this->BaseT::operator=(other);
+        this->copy(other);
+        return *this;
+    }
+
+    /// Virtual destructor (empty body)
+    virtual ~ValueAccessor2() {}
+
+    /// Return @c true if the given coordinates match the key of either
+    /// cache slot (the level-1 slot is tested first, then the level-0 slot).
+    bool isCached(const Coord& xyz) const
+    {
+        assert(BaseT::mTree);
+        if (this->isHashed1(xyz)) return true;
+        return this->isHashed0(xyz);
+    }
+
+    /// Return the value of the voxel at the given coordinates.
+    /// @details The query is forwarded to the first cache slot whose key matches
+    /// @a xyz (slot 0, then slot 1); if neither matches it starts at the root node.
+    const ValueType& getValue(const Coord& xyz) const
+    {
+        assert(BaseT::mTree);
+        ValueAccessor2& acc = this->self();
+        if (this->isHashed0(xyz)) {
+            assert(mNode0);
+            return mNode0->getValueAndCache(xyz, acc);
+        }
+        if (this->isHashed1(xyz)) {
+            assert(mNode1);
+            return mNode1->getValueAndCache(xyz, acc);
+        }
+        return BaseT::mTree->root().getValueAndCache(xyz, acc);
+    }
+
+    /// Return @c true if the voxel at the given coordinates is active.
+    /// @details Cache slot 0 is consulted first, then slot 1, then the root node.
+    bool isValueOn(const Coord& xyz) const
+    {
+        assert(BaseT::mTree);
+        ValueAccessor2& acc = this->self();
+        if (this->isHashed0(xyz)) {
+            assert(mNode0);
+            return mNode0->isValueOnAndCache(xyz, acc);
+        }
+        if (this->isHashed1(xyz)) {
+            assert(mNode1);
+            return mNode1->isValueOnAndCache(xyz, acc);
+        }
+        return BaseT::mTree->root().isValueOnAndCache(xyz, acc);
+    }
+
+    /// Write the voxel's value into @a value and return its active state.
+    /// @details Cache slot 0 is consulted first, then slot 1, then the root node.
+    bool probeValue(const Coord& xyz, ValueType& value) const
+    {
+        assert(BaseT::mTree);
+        ValueAccessor2& acc = this->self();
+        if (this->isHashed0(xyz)) {
+            assert(mNode0);
+            return mNode0->probeValueAndCache(xyz, value, acc);
+        }
+        if (this->isHashed1(xyz)) {
+            assert(mNode1);
+            return mNode1->probeValueAndCache(xyz, value, acc);
+        }
+        return BaseT::mTree->root().probeValueAndCache(xyz, value, acc);
+    }
+
+    /// Return the tree depth (0 = root) at which the value of voxel (x, y, z) resides,
+    /// or -1 if (x, y, z) isn't explicitly represented in the tree (i.e., if it is
+    /// implicitly a background voxel).
+    /// @details On a cache hit the node reports a level, which is converted to a
+    /// depth by subtracting it from the root level; the root reports a depth directly.
+    int getValueDepth(const Coord& xyz) const
+    {
+        assert(BaseT::mTree);
+        if (this->isHashed0(xyz)) {
+            assert(mNode0);
+            return RootNodeT::LEVEL - mNode0->getValueLevelAndCache(xyz, this->self());
+        }
+        if (this->isHashed1(xyz)) {
+            assert(mNode1);
+            return RootNodeT::LEVEL - mNode1->getValueLevelAndCache(xyz, this->self());
+        }
+        return BaseT::mTree->root().getValueDepthAndCache(xyz, this->self());
+    }
+
+    /// Return @c true if the value of voxel (x, y, z) is stored at the leaf level
+    /// of the tree rather than as a tile value.
+    /// @details A cached node answers with a level (leaf level is 0); the root
+    /// answers with a depth, which equals the root's level for a leaf-level value.
+    bool isVoxel(const Coord& xyz) const
+    {
+        assert(BaseT::mTree);
+        if (this->isHashed0(xyz)) {
+            assert(mNode0);
+            return mNode0->getValueLevelAndCache(xyz, this->self())==0;
+        }
+        if (this->isHashed1(xyz)) {
+            assert(mNode1);
+            return mNode1->getValueLevelAndCache(xyz, this->self())==0;
+        }
+        return BaseT::mTree->root().getValueDepthAndCache(xyz, this->self()) ==
+               static_cast<int>(RootNodeT::LEVEL);
+    }
+
+    //@{
+    /// Assign @a value to the voxel at the given coordinates and mark the voxel as active.
+    /// @details The write is forwarded to cache slot 0 if its key matches @a xyz,
+    /// else to slot 1, else to the root node.
+    void setValue(const Coord& xyz, const ValueType& value)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        if (this->isHashed0(xyz)) {
+            assert(mNode0);
+            const_cast<NodeT0*>(mNode0)->setValueAndCache(xyz, value, *this);
+            return;
+        }
+        if (this->isHashed1(xyz)) {
+            assert(mNode1);
+            const_cast<NodeT1*>(mNode1)->setValueAndCache(xyz, value, *this);
+            return;
+        }
+        BaseT::mTree->root().setValueAndCache(xyz, value, *this);
+    }
+    void setValueOn(const Coord& xyz, const ValueType& value) { this->setValue(xyz, value); }
+    //@}
+
+    /// Assign @a value to the voxel at the given coordinates, leaving its
+    /// active state as it was.
+    void setValueOnly(const Coord& xyz, const ValueType& value)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        if (this->isHashed0(xyz)) {
+            assert(mNode0);
+            const_cast<NodeT0*>(mNode0)->setValueOnlyAndCache(xyz, value, *this);
+            return;
+        }
+        if (this->isHashed1(xyz)) {
+            assert(mNode1);
+            const_cast<NodeT1*>(mNode1)->setValueOnlyAndCache(xyz, value, *this);
+            return;
+        }
+        BaseT::mTree->root().setValueOnlyAndCache(xyz, value, *this);
+    }
+
+    /// Assign @a value to the voxel at the given coordinates and mark the voxel as inactive.
+    void setValueOff(const Coord& xyz, const ValueType& value)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        if (this->isHashed0(xyz)) {
+            assert(mNode0);
+            const_cast<NodeT0*>(mNode0)->setValueOffAndCache(xyz, value, *this);
+            return;
+        }
+        if (this->isHashed1(xyz)) {
+            assert(mNode1);
+            const_cast<NodeT1*>(mNode1)->setValueOffAndCache(xyz, value, *this);
+            return;
+        }
+        BaseT::mTree->root().setValueOffAndCache(xyz, value, *this);
+    }
+
+    /// @brief Apply the functor @a op to the value of the voxel at the given
+    /// coordinates and mark the voxel as active.
+    /// @details See Tree::modifyValue() for details.
+    template<typename ModifyOp>
+    void modifyValue(const Coord& xyz, const ModifyOp& op)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        if (this->isHashed0(xyz)) {
+            assert(mNode0);
+            const_cast<NodeT0*>(mNode0)->modifyValueAndCache(xyz, op, *this);
+            return;
+        }
+        if (this->isHashed1(xyz)) {
+            assert(mNode1);
+            const_cast<NodeT1*>(mNode1)->modifyValueAndCache(xyz, op, *this);
+            return;
+        }
+        BaseT::mTree->root().modifyValueAndCache(xyz, op, *this);
+    }
+
+    /// @brief Apply the functor @a op to the voxel at the given coordinates.
+    /// @details See Tree::modifyValueAndActiveState() for details.
+    template<typename ModifyOp>
+    void modifyValueAndActiveState(const Coord& xyz, const ModifyOp& op)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        if (this->isHashed0(xyz)) {
+            assert(mNode0);
+            const_cast<NodeT0*>(mNode0)->modifyValueAndActiveStateAndCache(xyz, op, *this);
+            return;
+        }
+        if (this->isHashed1(xyz)) {
+            assert(mNode1);
+            const_cast<NodeT1*>(mNode1)->modifyValueAndActiveStateAndCache(xyz, op, *this);
+            return;
+        }
+        BaseT::mTree->root().modifyValueAndActiveStateAndCache(xyz, op, *this);
+    }
+
+    /// Switch the voxel at the given coordinates on (@a on == true, the default)
+    /// or off, without changing its value.
+    void setActiveState(const Coord& xyz, bool on = true)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        if (this->isHashed0(xyz)) {
+            assert(mNode0);
+            const_cast<NodeT0*>(mNode0)->setActiveStateAndCache(xyz, on, *this);
+            return;
+        }
+        if (this->isHashed1(xyz)) {
+            assert(mNode1);
+            const_cast<NodeT1*>(mNode1)->setActiveStateAndCache(xyz, on, *this);
+            return;
+        }
+        BaseT::mTree->root().setActiveStateAndCache(xyz, on, *this);
+    }
+    /// Mark the voxel at the given coordinates as active without changing its value.
+    void setValueOn(const Coord& xyz) { this->setActiveState(xyz, true); }
+    /// Mark the voxel at the given coordinates as inactive without changing its value.
+    void setValueOff(const Coord& xyz) { this->setActiveState(xyz, false); }
+
+    /// Return the node of type @a NodeT held by this accessor, as selected by the
+    /// private getNode() overloads.  [Mainly for internal use]
+    template<typename NodeT>
+    NodeT* getNode()
+    {
+        const NodeT* cached = NULL;
+        this->getNode(cached); // overload chosen by the pointer's type
+        return const_cast<NodeT*>(cached);
+    }
+
+    /// Cache the given node, which should lie along the path from the root node to
+    /// the node containing voxel (x, y, z).  [Mainly for internal use]
+    template<typename NodeT>
+    void insertNode(const Coord& xyz, NodeT& node) { this->insert(xyz, &node); }
+
+    /// If this accessor caches nodes of type @a NodeT, drop that cache entry, so
+    /// that isCached(xyz) no longer reports a hit through it for any voxel
+    /// (x, y, z) contained in that node.  [Mainly for internal use]
+    template<typename NodeT>
+    void eraseNode()
+    {
+        // The null pointer is never dereferenced; its type picks the overload.
+        const NodeT* typeTag = NULL;
+        this->eraseNode(typeTag);
+    }
+
+    /// @brief Add the specified leaf to this tree, possibly creating a child branch
+    /// in the process.  If the leaf node already exists, replace it.
+    /// @details The insertion starts at the slot-1 cached node when its key matches
+    /// the leaf's origin, and at the root node otherwise.
+    void addLeaf(LeafNodeT* leaf)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        if (this->isHashed1(leaf->origin())) {
+            assert(mNode1);
+            const_cast<NodeT1*>(mNode1)->addLeafAndCache(leaf, *this);
+        } else {
+            BaseT::mTree->root().addLeafAndCache(leaf, *this);
+        }
+    }
+
+    /// @brief Add a tile at the specified tree level that contains voxel (x, y, z),
+    /// possibly deleting existing nodes or creating new nodes in the process.
+    /// @details The insertion starts at the slot-1 cached node when its key matches
+    /// @a xyz, and at the root node otherwise.
+    void addTile(Index level, const Coord& xyz, const ValueType& value, bool state)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        if (this->isHashed1(xyz)) {
+            assert(mNode1);
+            const_cast<NodeT1*>(mNode1)->addTileAndCache(level, xyz, value, state, *this);
+        } else {
+            BaseT::mTree->root().addTileAndCache(level, xyz, value, state, *this);
+        }
+    }
+
+    /// @brief Return the leaf node that contains voxel (x, y, z).
+    /// If it doesn't exist, create it, but preserve the values and
+    /// active states of all voxels.
+    ///
+    /// Use this method to preallocate a static tree topology over which to
+    /// safely perform multithreaded processing.
+    LeafNodeT* touchLeaf(const Coord& xyz)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        if (this->isHashed0(xyz)) {
+            assert(mNode0);
+            return const_cast<NodeT0*>(mNode0)->touchLeafAndCache(xyz, *this);
+        }
+        if (this->isHashed1(xyz)) {
+            assert(mNode1);
+            return const_cast<NodeT1*>(mNode1)->touchLeafAndCache(xyz, *this);
+        }
+        return BaseT::mTree->root().touchLeafAndCache(xyz, *this);
+    }
+    /// @brief Return a pointer to the node of the specified type that contains
+    /// voxel (x, y, z), or NULL if it doesn't exist.
+    /// @details Only the two node types cached by this accessor (NodeT0 and NodeT1)
+    /// are handled; for any other @a NodeT this method returns NULL.
+    template <typename NodeT>
+    NodeT* probeNode(const Coord& xyz)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        // The is_same tests below are compile-time constants evaluated in ordinary
+        // if statements, so one branch is always dead for a given NodeT.
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if ((boost::is_same<NodeT, NodeT0>::value)) {
+            if (this->isHashed0(xyz)) {
+                assert(mNode0);
+                // Cache hit. NodeT is NodeT0 here; the reinterpret_cast is presumably
+                // there so this line still compiles when NodeT is a different type.
+                return reinterpret_cast<NodeT*>(const_cast<NodeT0*>(mNode0));
+            } else if (this->isHashed1(xyz)) {
+                assert(mNode1);
+                // Resume the search from the cached slot-1 node.
+                return const_cast<NodeT1*>(mNode1)->template probeNodeAndCache<NodeT>(xyz, *this);
+            }
+            return BaseT::mTree->root().template probeNodeAndCache<NodeT>(xyz, *this);
+        } else if ((boost::is_same<NodeT, NodeT1>::value)) {
+            if (this->isHashed1(xyz)) {
+                assert(mNode1);
+                return reinterpret_cast<NodeT*>(const_cast<NodeT1*>(mNode1));
+            }
+            return BaseT::mTree->root().template probeNodeAndCache<NodeT>(xyz, *this);
+        }
+        // NodeT is neither of the cached node types.
+        return NULL;
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+    /// @brief @return a pointer to the leaf node that contains
+    /// voxel (x, y, z) and if it doesn't exist, return NULL.
+    LeafNodeT* probeLeaf(const Coord& xyz) { return this->template probeNode<LeafNodeT>(xyz); }
+
+    /// @brief Return a const pointer to the node of the specified type that contains
+    /// voxel (x, y, z), or NULL if it doesn't exist.
+    /// @note Despite its name, this template overload is not restricted to leaf
+    /// nodes: it behaves exactly like probeConstNode<NodeT>() and is kept under
+    /// this name only so that existing callers continue to compile.  Prefer
+    /// probeConstNode<NodeT>() in new code.
+    template <typename NodeT>
+    const NodeT* probeConstLeaf(const Coord& xyz) const
+    {
+        // Delegate instead of duplicating the lookup; this also gives this overload
+        // the tree-pointer assertion performed by probeConstNode().
+        return this->template probeConstNode<NodeT>(xyz);
+    }
+    /// @brief @return a const pointer to the leaf node that contains
+    /// voxel (x, y, z) and if it doesn't exist, return NULL.
+    const LeafNodeT* probeConstLeaf(const Coord& xyz) const
+    {
+        return this->template probeConstNode<LeafNodeT>(xyz);
+    }
+    const LeafNodeT* probeLeaf(const Coord& xyz) const { return this->probeConstLeaf(xyz); }
+
+    /// @brief Return a const pointer to the node of the specified type that contains
+    /// voxel (x, y, z), or NULL if it doesn't exist.
+    /// @details Only the two node types cached by this accessor (NodeT0 and NodeT1)
+    /// are handled; for any other @a NodeT this method returns NULL.
+    template <typename NodeT>
+    const NodeT* probeConstNode(const Coord& xyz) const
+    {
+        assert(BaseT::mTree);
+        // The is_same tests below are compile-time constants evaluated in ordinary
+        // if statements, so one branch is always dead for a given NodeT.
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if ((boost::is_same<NodeT, NodeT0>::value)) {
+            if (this->isHashed0(xyz)) {
+                assert(mNode0);
+                // Cache hit. NodeT is NodeT0 here; the reinterpret_cast is presumably
+                // there so this line still compiles when NodeT is a different type.
+                return reinterpret_cast<const NodeT*>(mNode0);
+            } else if (this->isHashed1(xyz)) {
+                assert(mNode1);
+                // Resume the search from the cached slot-1 node.
+                return mNode1->template probeConstNodeAndCache<NodeT>(xyz, this->self());
+            }
+            return BaseT::mTree->root().template probeConstNodeAndCache<NodeT>(xyz, this->self());
+        } else if ((boost::is_same<NodeT, NodeT1>::value)) {
+            if (this->isHashed1(xyz)) {
+                assert(mNode1);
+                return reinterpret_cast<const NodeT*>(mNode1);
+            }
+            return BaseT::mTree->root().template probeConstNodeAndCache<NodeT>(xyz, this->self());
+        }
+        // NodeT is neither of the cached node types.
+        return NULL;
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+
+    /// Empty both cache slots: forget the cached node pointers and reset the
+    /// corresponding hash-keys to Coord::max().
+    virtual void clear()
+    {
+        mNode0 = NULL;
+        mNode1 = NULL;
+        mKey0  = Coord::max();
+        mKey1  = Coord::max();
+    }
+
+private:
+    // Allow nodes to insert themselves into the cache.
+    template<typename> friend class RootNode;
+    template<typename, Index> friend class InternalNode;
+    template<typename, Index> friend class LeafNode;
+    // Allow trees to deregister themselves.
+    template<typename> friend class Tree;
+
+    // This private method is merely for convenience.
+    inline ValueAccessor2& self() const { return const_cast<ValueAccessor2&>(*this); }
+
+    // Type-dispatched helpers behind the public getNode<NodeT>() template:
+    // overload resolution on the pointer type selects the matching cache slot.
+    void getNode(const NodeT0*& node) { node = mNode0; }
+    void getNode(const NodeT1*& node) { node = mNode1; }
+    // The root node is not held in a cache slot; it is read from the tree
+    // (NULL if this accessor has no tree).
+    void getNode(const RootNodeT*& node)
+    {
+        node = (BaseT::mTree ? &BaseT::mTree->root() : NULL);
+    }
+    // Any other node type is not held by this accessor.
+    template <typename OtherNodeType> void getNode(const OtherNodeType*& node) { node = NULL; }
+
+    // Type-dispatched helpers behind the public eraseNode<NodeT>() template.
+    // The pointer argument is unused; only its type matters.
+    void eraseNode(const NodeT0*) { mKey0 = Coord::max(); mNode0 = NULL; }
+    void eraseNode(const NodeT1*) { mKey1 = Coord::max(); mNode1 = NULL; }
+    // Erasing a node type that is not cached is a no-op.
+    template <typename OtherNodeType> void eraseNode(const OtherNodeType*) {}
+
+    /// Private copy method
+    inline void copy(const ValueAccessor2& other)
+    {
+        mKey0  = other.mKey0;
+        mNode0 = other.mNode0;
+        mKey1  = other.mKey1;
+        mNode1 = other.mNode1;
+    }
+
+    /// Prevent this accessor from calling Tree::releaseCache() on a tree that
+    /// no longer exists.  (Called by mTree when it is destroyed.)
+    /// @details Runs the base-class release() first and then empties both cache
+    /// slots via clear(), so no cached node pointer outlives the tree.
+    virtual void release()
+    {
+        this->BaseT::release();
+        this->clear();
+    }
+
+    /// Cache the given node, which should lie along the path from the root node to
+    /// the node containing voxel (x, y, z).
+    /// @details The slot's key is @a xyz with the bits below the node's DIM masked
+    /// off (assumes DIM is a power of two -- TODO confirm), which is the same
+    /// masking isHashed0()/isHashed1() apply when testing for a hit.
+    /// @note This operation is not mutex-protected and is intended to be called
+    /// only by nodes and only in the context of a getValue() or setValue() call.
+    inline void insert(const Coord& xyz, const NodeT0* node)
+    {
+        assert(node);
+        mKey0  = xyz & ~(NodeT0::DIM-1);
+        mNode0 = node;
+    }
+    /// Same as above, for the second cache slot (nodes of type NodeT1).
+    inline void insert(const Coord& xyz, const NodeT1* node)
+    {
+        assert(node);
+        mKey1  = xyz & ~(NodeT1::DIM-1);
+        mNode1 = node;
+    }
+    /// No-op in case a tree traversal attempts to insert a node that
+    /// is not cached by the ValueAccessor
+    template<typename NodeT> inline void insert(const Coord&, const NodeT*) {}
+
+    /// Return @c true if @a xyz, with the bits below NodeT0::DIM masked off in
+    /// each component, equals the key stored for cache slot 0.
+    inline bool isHashed0(const Coord& xyz) const
+    {
+        const Coord::ValueType mask = ~Coord::ValueType(NodeT0::DIM-1);
+        if ((xyz[0] & mask) != mKey0[0]) return false;
+        if ((xyz[1] & mask) != mKey0[1]) return false;
+        return (xyz[2] & mask) == mKey0[2];
+    }
+    /// Return @c true if @a xyz, with the bits below NodeT1::DIM masked off in
+    /// each component, equals the key stored for cache slot 1.
+    inline bool isHashed1(const Coord& xyz) const
+    {
+        const Coord::ValueType mask = ~Coord::ValueType(NodeT1::DIM-1);
+        if ((xyz[0] & mask) != mKey1[0]) return false;
+        if ((xyz[1] & mask) != mKey1[1]) return false;
+        return (xyz[2] & mask) == mKey1[2];
+    }
+    mutable Coord mKey0;
+    mutable const NodeT0* mNode0;
+    mutable Coord mKey1;
+    mutable const NodeT1* mNode1;
+}; // ValueAccessor2
+
+
+/// @brief Value accessor with three levels of node caching.
+/// @details The node cache levels are specified by L0, L1, and L2
+/// with the default values 0, 1 and 2 (defined in the forward declaration)
+/// corresponding to a LeafNode, its parent InternalNode, and its parent InternalNode.
+/// Since the default configuration of all typed trees and grids, e.g.,
+/// FloatTree or FloatGrid, has a depth of four, this value accessor is the one
+/// used by default.
+///
+/// @note This class is for experts only and should rarely be used
+/// directly. Instead use ValueAccessor with its default template arguments
+template<typename _TreeType, bool IsSafe, Index L0, Index L1, Index L2>
+class ValueAccessor3 : public ValueAccessorBase<_TreeType, IsSafe>
+{
+public:
+    BOOST_STATIC_ASSERT(_TreeType::DEPTH >= 4);
+    BOOST_STATIC_ASSERT(L0 < L1 && L1 < L2 && L2 < _TreeType::RootNodeType::LEVEL);
+    typedef _TreeType                           TreeType;
+    typedef typename TreeType::ValueType        ValueType;
+    typedef typename TreeType::RootNodeType     RootNodeT;
+    typedef typename TreeType::LeafNodeType     LeafNodeT;
+    typedef ValueAccessorBase<TreeType, IsSafe> BaseT;
+    typedef typename RootNodeT::NodeChainType   InvTreeT;
+    typedef typename boost::mpl::at<InvTreeT, boost::mpl::int_<L0> >::type NodeT0;
+    typedef typename boost::mpl::at<InvTreeT, boost::mpl::int_<L1> >::type NodeT1;
+    typedef typename boost::mpl::at<InvTreeT, boost::mpl::int_<L2> >::type NodeT2;
+
+    /// Constructor from a tree
+    ValueAccessor3(TreeType& tree) : BaseT(tree),
+                                     mKey0(Coord::max()), mNode0(NULL),
+                                     mKey1(Coord::max()), mNode1(NULL),
+                                     mKey2(Coord::max()), mNode2(NULL) {}
+
+    /// Copy constructor
+    ValueAccessor3(const ValueAccessor3& other) : BaseT(other) { this->copy(other); }
+
+    /// Assignment operator
+    ValueAccessor3& operator=(const ValueAccessor3& other)
+    {
+        if (&other != this) {
+            this->BaseT::operator=(other);
+            this->copy(other);
+        }
+        return *this;
+    }
+
+    /// Return the number of cache levels employed by this ValueAccessor
+    static Index numCacheLevels() { return 3; }
+
+    /// Virtual destructor
+    virtual ~ValueAccessor3() {}
+
+    /// Return @c true if any of the nodes along the path to the given
+    /// voxel have been cached.
+    bool isCached(const Coord& xyz) const
+    {
+        assert(BaseT::mTree);
+        return this->isHashed2(xyz) || this->isHashed1(xyz) || this->isHashed0(xyz);
+    }
+
+    /// Return the value of the voxel at the given coordinates.
+    const ValueType& getValue(const Coord& xyz) const
+    {
+        assert(BaseT::mTree);
+        if (this->isHashed0(xyz)) {
+            assert(mNode0);
+            return mNode0->getValueAndCache(xyz, this->self());
+        } else if (this->isHashed1(xyz)) {
+            assert(mNode1);
+            return mNode1->getValueAndCache(xyz, this->self());
+        } else if (this->isHashed2(xyz)) {
+            assert(mNode2);
+            return mNode2->getValueAndCache(xyz, this->self());
+        }
+        return BaseT::mTree->root().getValueAndCache(xyz, this->self());
+    }
+
+    /// Return the active state of the voxel at the given coordinates.
+    bool isValueOn(const Coord& xyz) const
+    {
+        assert(BaseT::mTree);
+        if (this->isHashed0(xyz)) {
+            assert(mNode0);
+            return mNode0->isValueOnAndCache(xyz, this->self());
+        } else if (this->isHashed1(xyz)) {
+            assert(mNode1);
+            return mNode1->isValueOnAndCache(xyz, this->self());
+        } else if (this->isHashed2(xyz)) {
+            assert(mNode2);
+            return mNode2->isValueOnAndCache(xyz, this->self());
+        }
+        return BaseT::mTree->root().isValueOnAndCache(xyz, this->self());
+    }
+
+    /// Return the active state of the voxel as well as its value
+    bool probeValue(const Coord& xyz, ValueType& value) const
+    {
+        assert(BaseT::mTree);
+        if (this->isHashed0(xyz)) {
+            assert(mNode0);
+            return mNode0->probeValueAndCache(xyz, value, this->self());
+        } else if (this->isHashed1(xyz)) {
+            assert(mNode1);
+            return mNode1->probeValueAndCache(xyz, value, this->self());
+        } else if (this->isHashed2(xyz)) {
+            assert(mNode2);
+            return mNode2->probeValueAndCache(xyz, value, this->self());
+        }
+        return BaseT::mTree->root().probeValueAndCache(xyz, value, this->self());
+    }
+
+    /// Return the tree depth (0 = root) at which the value of voxel (x, y, z) resides,
+    /// or -1 if (x, y, z) isn't explicitly represented in the tree (i.e., if it is
+    /// implicitly a background voxel).
+    int getValueDepth(const Coord& xyz) const
+    {
+        assert(BaseT::mTree);
+        if (this->isHashed0(xyz)) {
+            assert(mNode0);
+            return RootNodeT::LEVEL - mNode0->getValueLevelAndCache(xyz, this->self());
+        } else if (this->isHashed1(xyz)) {
+            assert(mNode1);
+            return RootNodeT::LEVEL - mNode1->getValueLevelAndCache(xyz, this->self());
+        } else if (this->isHashed2(xyz)) {
+            assert(mNode2);
+            return RootNodeT::LEVEL - mNode2->getValueLevelAndCache(xyz, this->self());
+        }
+        return BaseT::mTree->root().getValueDepthAndCache(xyz, this->self());
+    }
+
+    /// Return @c true if the value of voxel (x, y, z) resides at the leaf level
+    /// of the tree, i.e., if it is not a tile value.
+    bool isVoxel(const Coord& xyz) const
+    {
+        assert(BaseT::mTree);
+        if (this->isHashed0(xyz)) {
+            assert(mNode0);
+            return mNode0->getValueLevelAndCache(xyz, this->self())==0;
+        } else if (this->isHashed1(xyz)) {
+            assert(mNode1);
+            return mNode1->getValueLevelAndCache(xyz, this->self())==0;
+        } else if (this->isHashed2(xyz)) {
+            assert(mNode2);
+            return mNode2->getValueLevelAndCache(xyz, this->self())==0;
+        }
+        return BaseT::mTree->root().getValueDepthAndCache(xyz, this->self()) ==
+               static_cast<int>(RootNodeT::LEVEL);
+    }
+
+    //@{
+    /// Set the value of the voxel at the given coordinates and mark the voxel as active.
+    void setValue(const Coord& xyz, const ValueType& value)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        if (this->isHashed0(xyz)) {
+            assert(mNode0);
+            const_cast<NodeT0*>(mNode0)->setValueAndCache(xyz, value, *this);
+        } else if (this->isHashed1(xyz)) {
+            assert(mNode1);
+            const_cast<NodeT1*>(mNode1)->setValueAndCache(xyz, value, *this);
+        } else if (this->isHashed2(xyz)) {
+            assert(mNode2);
+            const_cast<NodeT2*>(mNode2)->setValueAndCache(xyz, value, *this);
+        } else {
+            BaseT::mTree->root().setValueAndCache(xyz, value, *this);
+        }
+    }
+    void setValueOn(const Coord& xyz, const ValueType& value) { this->setValue(xyz, value); }
+    //@}
+
+    /// Set the value of the voxel at the given coordinate but preserves its active state.
+    void setValueOnly(const Coord& xyz, const ValueType& value)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        if (this->isHashed0(xyz)) {
+            assert(mNode0);
+            const_cast<NodeT0*>(mNode0)->setValueOnlyAndCache(xyz, value, *this);
+        } else if (this->isHashed1(xyz)) {
+            assert(mNode1);
+            const_cast<NodeT1*>(mNode1)->setValueOnlyAndCache(xyz, value, *this);
+        } else if (this->isHashed2(xyz)) {
+            assert(mNode2);
+            const_cast<NodeT2*>(mNode2)->setValueOnlyAndCache(xyz, value, *this);
+        } else {
+            BaseT::mTree->root().setValueOnlyAndCache(xyz, value, *this);
+        }
+    }
+
+    /// Set the value of the voxel at the given coordinates and mark the voxel as inactive.
+    void setValueOff(const Coord& xyz, const ValueType& value)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        if (this->isHashed0(xyz)) {
+            assert(mNode0);
+            const_cast<NodeT0*>(mNode0)->setValueOffAndCache(xyz, value, *this);
+        } else if (this->isHashed1(xyz)) {
+            assert(mNode1);
+            const_cast<NodeT1*>(mNode1)->setValueOffAndCache(xyz, value, *this);
+        } else if (this->isHashed2(xyz)) {
+            assert(mNode2);
+            const_cast<NodeT2*>(mNode2)->setValueOffAndCache(xyz, value, *this);
+        } else {
+            BaseT::mTree->root().setValueOffAndCache(xyz, value, *this);
+        }
+    }
+
+    /// @brief Apply a functor to the value of the voxel at the given coordinates
+    /// and mark the voxel as active.
+    /// @details See Tree::modifyValue() for details.
+    template<typename ModifyOp>
+    void modifyValue(const Coord& xyz, const ModifyOp& op)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        if (this->isHashed0(xyz)) {
+            assert(mNode0);
+            const_cast<NodeT0*>(mNode0)->modifyValueAndCache(xyz, op, *this);
+        } else if (this->isHashed1(xyz)) {
+            assert(mNode1);
+            const_cast<NodeT1*>(mNode1)->modifyValueAndCache(xyz, op, *this);
+        } else if (this->isHashed2(xyz)) {
+            assert(mNode2);
+            const_cast<NodeT2*>(mNode2)->modifyValueAndCache(xyz, op, *this);
+        } else {
+            BaseT::mTree->root().modifyValueAndCache(xyz, op, *this);
+        }
+    }
+
+    /// @brief Apply a functor to the voxel at the given coordinates.
+    /// @details See Tree::modifyValueAndActiveState() for details.
+    template<typename ModifyOp>
+    void modifyValueAndActiveState(const Coord& xyz, const ModifyOp& op)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        if (this->isHashed0(xyz)) {
+            assert(mNode0);
+            const_cast<NodeT0*>(mNode0)->modifyValueAndActiveStateAndCache(xyz, op, *this);
+        } else if (this->isHashed1(xyz)) {
+            assert(mNode1);
+            const_cast<NodeT1*>(mNode1)->modifyValueAndActiveStateAndCache(xyz, op, *this);
+        } else if (this->isHashed2(xyz)) {
+            assert(mNode2);
+            const_cast<NodeT2*>(mNode2)->modifyValueAndActiveStateAndCache(xyz, op, *this);
+        } else {
+            BaseT::mTree->root().modifyValueAndActiveStateAndCache(xyz, op, *this);
+        }
+    }
+
+    /// Set the active state of the voxel at the given coordinates without changing its value.
+    void setActiveState(const Coord& xyz, bool on = true)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        if (this->isHashed0(xyz)) {
+            assert(mNode0);
+            const_cast<NodeT0*>(mNode0)->setActiveStateAndCache(xyz, on, *this);
+        } else if (this->isHashed1(xyz)) {
+            assert(mNode1);
+            const_cast<NodeT1*>(mNode1)->setActiveStateAndCache(xyz, on, *this);
+        } else if (this->isHashed2(xyz)) {
+            assert(mNode2);
+            const_cast<NodeT2*>(mNode2)->setActiveStateAndCache(xyz, on, *this);
+        } else {
+            BaseT::mTree->root().setActiveStateAndCache(xyz, on, *this);
+        }
+    }
+    /// Mark the voxel at the given coordinates as active without changing its value.
+    void setValueOn(const Coord& xyz) { this->setActiveState(xyz, true); }
+    /// Mark the voxel at the given coordinates as inactive without changing its value.
+    void setValueOff(const Coord& xyz) { this->setActiveState(xyz, false); }
+
+    /// Return the cached node of type @a NodeType.  [Mainly for internal use]
+    template<typename NodeT>
+    NodeT* getNode()
+    {
+        const NodeT* node = NULL;
+        this->getNode(node);
+        return const_cast<NodeT*>(node);
+    }
+
+    /// Cache the given node, which should lie along the path from the root node to
+    /// the node containing voxel (x, y, z).  [Mainly for internal use]
+    template<typename NodeT>
+    void insertNode(const Coord& xyz, NodeT& node) { this->insert(xyz, &node); }
+
+    /// If a node of the given type exists in the cache, remove it, so that
+    /// isCached(xyz) returns @c false for any voxel (x, y, z) contained in
+    /// that node.  [Mainly for internal use]
+    template<typename NodeT>
+    void eraseNode()
+    {
+        const NodeT* node = NULL;
+        this->eraseNode(node);
+    }
+
+    /// @brief Add the specified leaf to this tree, possibly creating a child branch
+    /// in the process.  If the leaf node already exists, replace it.
+    void addLeaf(LeafNodeT* leaf)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        if (this->isHashed1(leaf->origin())) {
+            assert(mNode1);
+            return const_cast<NodeT1*>(mNode1)->addLeafAndCache(leaf, *this);
+        } else if (this->isHashed2(leaf->origin())) {
+            assert(mNode2);
+            return const_cast<NodeT2*>(mNode2)->addLeafAndCache(leaf, *this);
+        }
+        BaseT::mTree->root().addLeafAndCache(leaf, *this);
+    }
+
+    /// @brief Add a tile at the specified tree level that contains voxel (x, y, z),
+    /// possibly deleting existing nodes or creating new nodes in the process.
+    /// @details The insertion starts at the slot-1 cached node if its key matches
+    /// @a xyz, else at the slot-2 cached node if its key matches, else at the root.
+    void addTile(Index level, const Coord& xyz, const ValueType& value, bool state)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        if (this->isHashed1(xyz)) {
+            assert(mNode1);
+            return const_cast<NodeT1*>(mNode1)->addTileAndCache(level, xyz, value, state, *this);
+        } else if (this->isHashed2(xyz)) {
+            // Written as "else if" to match addLeaf() and the other dispatch chains;
+            // behavior is unchanged because the branch above always returns.
+            assert(mNode2);
+            return const_cast<NodeT2*>(mNode2)->addTileAndCache(level, xyz, value, state, *this);
+        }
+        BaseT::mTree->root().addTileAndCache(level, xyz, value, state, *this);
+    }
+
+    /// @brief @return the leaf node that contains voxel (x, y, z) and
+    /// if it doesn't exist, create it, but preserve the values and
+    /// active states of all voxels.
+    ///
+    /// Use this method to preallocate a static tree topology over which to
+    /// safely perform multithreaded processing.
+    LeafNodeT* touchLeaf(const Coord& xyz)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        // The leaf itself is cached: nothing to create.
+        if (this->isHashed0(xyz)) {
+            assert(mNode0);
+            return const_cast<NodeT0*>(mNode0);
+        }
+        // Otherwise descend from the lowest cached internal node containing xyz.
+        if (this->isHashed1(xyz)) {
+            assert(mNode1);
+            NodeT1* const cached = const_cast<NodeT1*>(mNode1);
+            return cached->touchLeafAndCache(xyz, *this);
+        }
+        if (this->isHashed2(xyz)) {
+            assert(mNode2);
+            NodeT2* const cached = const_cast<NodeT2*>(mNode2);
+            return cached->touchLeafAndCache(xyz, *this);
+        }
+        return BaseT::mTree->root().touchLeafAndCache(xyz, *this);
+    }
+    /// @brief @return a pointer to the node of the specified type that contains
+    /// voxel (x, y, z) and if it doesn't exist, return NULL.
+    ///
+    /// @details @a NodeT is compared against the three cached node types
+    /// (NodeT0, NodeT1, NodeT2).  If the cache slot of that type covers @a xyz,
+    /// the cached node is returned directly; otherwise the query is forwarded to
+    /// the lowest cached node above that level that covers @a xyz, or to the root node.
+    /// NULL is returned when @a NodeT is none of the three cached node types.
+    template <typename NodeT>
+    NodeT* probeNode(const Coord& xyz)
+    {
+        assert(BaseT::mTree);
+        BOOST_STATIC_ASSERT(!BaseT::IsConstTree);
+        // The is_same tests below are compile-time constants, so some branches
+        // are unreachable for any given NodeT; suppress the resulting warnings.
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if ((boost::is_same<NodeT, NodeT0>::value)) {
+            // NodeT is the level-0 cached type.
+            if (this->isHashed0(xyz)) {
+                assert(mNode0);
+                // NodeT and NodeT0 are the same type here, so the cast is an identity.
+                return reinterpret_cast<NodeT*>(const_cast<NodeT0*>(mNode0));
+            } else if (this->isHashed1(xyz)) {
+                assert(mNode1);
+                return const_cast<NodeT1*>(mNode1)->template probeNodeAndCache<NodeT>(xyz, *this);
+            } else if (this->isHashed2(xyz)) {
+                assert(mNode2);
+                return const_cast<NodeT2*>(mNode2)->template probeNodeAndCache<NodeT>(xyz, *this);
+            }
+            return BaseT::mTree->root().template probeNodeAndCache<NodeT>(xyz, *this);
+        } else if ((boost::is_same<NodeT, NodeT1>::value)) {
+            // NodeT is the level-1 cached type.
+            if (this->isHashed1(xyz)) {
+                assert(mNode1);
+                return reinterpret_cast<NodeT*>(const_cast<NodeT1*>(mNode1));
+            } else if (this->isHashed2(xyz)) {
+                assert(mNode2);
+                return const_cast<NodeT2*>(mNode2)->template probeNodeAndCache<NodeT>(xyz, *this);
+            }
+            return BaseT::mTree->root().template probeNodeAndCache<NodeT>(xyz, *this);
+        } else if ((boost::is_same<NodeT, NodeT2>::value)) {
+            // NodeT is the level-2 cached type.
+            if (this->isHashed2(xyz)) {
+                assert(mNode2);
+                return reinterpret_cast<NodeT*>(const_cast<NodeT2*>(mNode2));
+            }
+            return BaseT::mTree->root().template probeNodeAndCache<NodeT>(xyz, *this);
+        }
+        // NodeT is not one of the cached node types.
+        return NULL;
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+    /// @brief @return a pointer to the leaf node that contains
+    /// voxel (x, y, z) and if it doesn't exist, return NULL.
+    LeafNodeT* probeLeaf(const Coord& xyz)
+    {
+        LeafNodeT* const leaf = this->template probeNode<LeafNodeT>(xyz);
+        return leaf;
+    }
+
+    /// @brief @return a const pointer to the node of the specified type that contains
+    /// voxel (x, y, z) and if it doesn't exist, return NULL.
+    ///
+    /// @details @a NodeT is compared against the three cached node types
+    /// (NodeT0, NodeT1, NodeT2).  If the cache slot of that type covers @a xyz,
+    /// the cached node is returned directly; otherwise the query is forwarded to
+    /// the lowest cached node above that level that covers @a xyz, or to the root node.
+    /// NULL is returned when @a NodeT is none of the three cached node types.
+    /// @note Although this method is const, the forwarded calls receive self(),
+    /// a non-const reference to this accessor.
+    template <typename NodeT>
+    const NodeT* probeConstNode(const Coord& xyz) const
+    {
+        assert(BaseT::mTree);
+        // The is_same tests below are compile-time constants, so some branches
+        // are unreachable for any given NodeT; suppress the resulting warnings.
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if ((boost::is_same<NodeT, NodeT0>::value)) {
+            // NodeT is the level-0 cached type.
+            if (this->isHashed0(xyz)) {
+                assert(mNode0);
+                // NodeT and NodeT0 are the same type here, so the cast is an identity.
+                return reinterpret_cast<const NodeT*>(mNode0);
+            } else if (this->isHashed1(xyz)) {
+                assert(mNode1);
+                return mNode1->template probeConstNodeAndCache<NodeT>(xyz, this->self());
+            } else if (this->isHashed2(xyz)) {
+                assert(mNode2);
+                return mNode2->template probeConstNodeAndCache<NodeT>(xyz, this->self());
+            }
+            return BaseT::mTree->root().template probeConstNodeAndCache<NodeT>(xyz, this->self());
+        } else if ((boost::is_same<NodeT, NodeT1>::value)) {
+            // NodeT is the level-1 cached type.
+            if (this->isHashed1(xyz)) {
+                assert(mNode1);
+                return reinterpret_cast<const NodeT*>(mNode1);
+            } else if (this->isHashed2(xyz)) {
+                assert(mNode2);
+                return mNode2->template probeConstNodeAndCache<NodeT>(xyz, this->self());
+            }
+            return BaseT::mTree->root().template probeConstNodeAndCache<NodeT>(xyz, this->self());
+        } else if ((boost::is_same<NodeT, NodeT2>::value)) {
+            // NodeT is the level-2 cached type.
+            if (this->isHashed2(xyz)) {
+                assert(mNode2);
+                return reinterpret_cast<const NodeT*>(mNode2);
+            }
+            return BaseT::mTree->root().template probeConstNodeAndCache<NodeT>(xyz, this->self());
+        }
+        // NodeT is not one of the cached node types.
+        return NULL;
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+    /// @brief @return a const pointer to the leaf node that contains
+    /// voxel (x, y, z) and if it doesn't exist, return NULL.
+    const LeafNodeT* probeConstLeaf(const Coord& xyz) const
+    {
+        const LeafNodeT* const leaf = this->template probeConstNode<LeafNodeT>(xyz);
+        return leaf;
+    }
+    /// Const overload of probeLeaf(); forwards to probeConstLeaf().
+    const LeafNodeT* probeLeaf(const Coord& xyz) const
+    {
+        return this->probeConstLeaf(xyz);
+    }
+
+    /// Empty every cache slot: reset all three node pointers to NULL and
+    /// invalidate the corresponding hash-keys.
+    virtual void clear()
+    {
+        mNode0 = NULL;
+        mNode1 = NULL;
+        mNode2 = NULL;
+        // Coord::max() is the key value stored in a slot that holds no node.
+        mKey0 = Coord::max();
+        mKey1 = Coord::max();
+        mKey2 = Coord::max();
+    }
+
+private:
+    // Allow nodes to insert themselves into the cache.
+    template<typename> friend class RootNode;
+    template<typename, Index> friend class InternalNode;
+    template<typename, Index> friend class LeafNode;
+    // Allow trees to deregister themselves.
+    template<typename> friend class Tree;
+
+    // Convenience helper: obtain a non-const reference to this accessor
+    // from within a const method.
+    inline ValueAccessor3& self() const
+    {
+        ValueAccessor3* const mutableThis = const_cast<ValueAccessor3*>(this);
+        return *mutableThis;
+    }
+
+    /// Private copy method: take over the hash-keys and cached node pointers of @a other.
+    inline void copy(const ValueAccessor3& other)
+    {
+        // Hash-keys
+        mKey0 = other.mKey0;
+        mKey1 = other.mKey1;
+        mKey2 = other.mKey2;
+        // Cached nodes
+        mNode0 = other.mNode0;
+        mNode1 = other.mNode1;
+        mNode2 = other.mNode2;
+    }
+
+    /// Prevent this accessor from calling Tree::releaseCache() on a tree that
+    /// no longer exists.  (Called by mTree when it is destroyed.)
+    /// The base class is released first, then every cache slot is emptied.
+    virtual void release()
+    {
+        BaseT::release();
+        this->clear();
+    }
+    // Type-dispatched readers: each overload copies the cache slot matching
+    // the pointer's type into @a node.
+    void getNode(const NodeT0*& node)
+    {
+        node = mNode0;
+    }
+    void getNode(const NodeT1*& node)
+    {
+        node = mNode1;
+    }
+    void getNode(const NodeT2*& node)
+    {
+        node = mNode2;
+    }
+    // The root node is not cached; it is fetched from the tree, if there is one.
+    void getNode(const RootNodeT*& node)
+    {
+        if (BaseT::mTree) {
+            node = &BaseT::mTree->root();
+        } else {
+            node = NULL;
+        }
+    }
+    // Any other node type is not held by this accessor.
+    template <typename OtherNodeType>
+    void getNode(const OtherNodeType*& node)
+    {
+        node = NULL;
+    }
+
+    // Type-dispatched erasers: each overload empties the cache slot matching
+    // the pointer's type (the pointer value itself is ignored).
+    void eraseNode(const NodeT0*)
+    {
+        mNode0 = NULL;
+        mKey0 = Coord::max();
+    }
+    void eraseNode(const NodeT1*)
+    {
+        mNode1 = NULL;
+        mKey1 = Coord::max();
+    }
+    void eraseNode(const NodeT2*)
+    {
+        mNode2 = NULL;
+        mKey2 = Coord::max();
+    }
+    // No-op for node types that this accessor does not cache.
+    template <typename OtherNodeType>
+    void eraseNode(const OtherNodeType*)
+    {
+    }
+
+    /// Cache the given node, which should lie along the path from the root node to
+    /// the node containing voxel (x, y, z).  The hash-key stored with the node is
+    /// @a xyz with the bits below the node's DIM masked off.
+    /// @note This operation is not mutex-protected and is intended to be called
+    /// only by nodes and only in the context of a getValue() or setValue() call.
+    inline void insert(const Coord& xyz, const NodeT0* node)
+    {
+        assert(node);
+        mNode0 = node;
+        mKey0 = xyz & ~(NodeT0::DIM-1);
+    }
+    /// Same as above, for the level-1 cache slot.
+    inline void insert(const Coord& xyz, const NodeT1* node)
+    {
+        assert(node);
+        mNode1 = node;
+        mKey1 = xyz & ~(NodeT1::DIM-1);
+    }
+    /// Same as above, for the level-2 cache slot.
+    inline void insert(const Coord& xyz, const NodeT2* node)
+    {
+        assert(node);
+        mNode2 = node;
+        mKey2 = xyz & ~(NodeT2::DIM-1);
+    }
+    /// No-op in case a tree traversal attempts to insert a node that
+    /// is not cached by the ValueAccessor
+    template<typename OtherNodeType>
+    inline void insert(const Coord&, const OtherNodeType*)
+    {
+    }
+    /// Return @c true if @a xyz, with the bits below NodeT0::DIM masked off,
+    /// equals the level-0 hash-key in all three components.
+    inline bool isHashed0(const Coord& xyz) const
+    {
+        const Coord::ValueType mask = ~Coord::ValueType(NodeT0::DIM-1);
+        return (xyz[0] & mask) == mKey0[0]
+            && (xyz[1] & mask) == mKey0[1]
+            && (xyz[2] & mask) == mKey0[2];
+    }
+    /// Return @c true if @a xyz, with the bits below NodeT1::DIM masked off,
+    /// equals the level-1 hash-key in all three components.
+    inline bool isHashed1(const Coord& xyz) const
+    {
+        const Coord::ValueType mask = ~Coord::ValueType(NodeT1::DIM-1);
+        return (xyz[0] & mask) == mKey1[0]
+            && (xyz[1] & mask) == mKey1[1]
+            && (xyz[2] & mask) == mKey1[2];
+    }
+    /// Return @c true if @a xyz, with the bits below NodeT2::DIM masked off,
+    /// equals the level-2 hash-key in all three components.
+    inline bool isHashed2(const Coord& xyz) const
+    {
+        const Coord::ValueType mask = ~Coord::ValueType(NodeT2::DIM-1);
+        return (xyz[0] & mask) == mKey2[0]
+            && (xyz[1] & mask) == mKey2[1]
+            && (xyz[2] & mask) == mKey2[2];
+    }
+    mutable Coord mKey0;
+    mutable const NodeT0* mNode0;
+    mutable Coord mKey1;
+    mutable const NodeT1* mNode1;
+    mutable Coord mKey2;
+    mutable const NodeT2* mNode2;
+}; // ValueAccessor3
+
+} // namespace tree
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_TREE_VALUEACCESSOR_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/util/CpuTimer.h b/nuparu/include/openvdb_new/util/CpuTimer.h
new file mode 100644
index 00000000..f3c5e5d9
--- /dev/null
+++ b/nuparu/include/openvdb_new/util/CpuTimer.h
@@ -0,0 +1,128 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_UTIL_CPUTIMER_HAS_BEEN_INCLUDED
+#define OPENVDB_UTIL_CPUTIMER_HAS_BEEN_INCLUDED
+
+#include <openvdb/version.h>
+#include <string>
+#include <tbb/tick_count.h>
+#include <iostream>// for std::cerr
+#include <sstream>// for ostringstream
+#include <iomanip>//for setprecision
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace util {
+
+/// @brief Simple timer for basic profiling.
+///
+/// @code
+///    CpuTimer timer;
+///    // code here will not be timed!
+///    timer.start("algorithm");
+///    // code to be timed goes here
+///    timer.stop();
+/// @endcode
+///
+/// or to time multiple blocks of code
+///
+/// @code
+///    CpuTimer timer("algorithm 1");
+///    // code to be timed goes here
+///    timer.restart("algorithm 2");
+///    // code to be timed goes here
+///    timer.stop();
+/// @endcode
+///
+/// @note All messages and timings are written to std::cerr.
+class CpuTimer
+{
+public:
+
+    /// @brief Initiate timer
+    CpuTimer() : mT0(tbb::tick_count::now()) {}
+
+    /// @brief Print message and start timer.
+    ///
+    /// @note Should normally be followed by a call to stop()
+    CpuTimer(const std::string& msg) { this->start(msg); }
+
+    /// @brief Start timer.
+    ///
+    /// @note Should normally be followed by a call to delta() or stop()
+    inline void start() { mT0 = tbb::tick_count::now(); }
+
+    /// @brief Print message and re-start timer.
+    ///
+    /// @note Should normally be followed by a call to stop()
+    inline void start(const std::string& msg)
+    {
+        // The message is printed before the clock is reset, so the time
+        // spent printing is not included in the measurement.
+        std::cerr << msg << " ... ";
+        this->start();
+    }
+
+    /// @brief Stop previous timer, print message and re-start timer.
+    ///
+    /// @note Should normally be followed by a call to stop()
+    inline void restart(const std::string& msg)
+    {
+        this->stop();
+        this->start(msg);
+    }
+
+    /// Return time difference in milliseconds since construction or start was called.
+    inline double delta() const
+    {
+        tbb::tick_count::interval_t dt = tbb::tick_count::now() - mT0;
+        return 1000.0*dt.seconds();
+    }
+
+    /// @brief Print time in milliseconds since construction or start was called.
+    ///
+    /// @note The timer itself keeps running; only the elapsed time is reported.
+    inline void stop() const
+    {
+        const double t = this->delta();
+        // Format into a local buffer first so that the report reaches
+        // std::cerr as a single insertion.
+        std::ostringstream ostr;
+        ostr << "completed in " << std::setprecision(3) << t << " ms\n";
+        std::cerr << ostr.str();
+    }
+
+private:
+
+    /// Time stamp taken at construction or at the latest call to start()
+    tbb::tick_count mT0;
+};// CpuTimer
+
+} // namespace util
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+
+#endif // OPENVDB_UTIL_CPUTIMER_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/util/Formats.h b/nuparu/include/openvdb_new/util/Formats.h
new file mode 100644
index 00000000..52a8306e
--- /dev/null
+++ b/nuparu/include/openvdb_new/util/Formats.h
@@ -0,0 +1,140 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @author Ken Museth
+///
+/// @file Formats.h
+///
+/// @brief Utility routines to output nicely-formatted numeric values
+
+
+#ifndef OPENVDB_UTIL_FORMATS_HAS_BEEN_INCLUDED
+#define OPENVDB_UTIL_FORMATS_HAS_BEEN_INCLUDED
+
+#include <iosfwd>
+#include <sstream>
+#include <string>
+#include <openvdb/version.h>
+#include <openvdb/Platform.h>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace util {
+
+/// Output a byte count with the correct binary suffix (KB, MB, GB or TB).
+/// @param os         the output stream
+/// @param bytes      the byte count to be output
+/// @param head       a string to be output before the numeric text
+/// @param tail       a string to be output after the numeric text
+/// @param exact      if true, also output the unmodified count, e.g., "4.6 KB (4620 Bytes)"
+/// @param width      a fixed width for the numeric text
+/// @param precision  the number of digits after the decimal point
+/// @return 0, 1, 2, 3 or 4, denoting the order of magnitude of the count.
+OPENVDB_API int
+printBytes(std::ostream& os, uint64_t bytes,
+    const std::string& head = "",
+    const std::string& tail = "\n",
+    bool exact = false, int width = 8, int precision = 3);
+
+/// Output a number with the correct SI suffix (thousand, million, billion or trillion)
+/// @param os         the output stream
+/// @param number     the number to be output
+/// @param head       a string to be output before the numeric text
+/// @param tail       a string to be output after the numeric text
+/// @param exact      if true, also output the unmodified count, e.g., "4.6 Thousand (4620)"
+/// @param width      a fixed width for the numeric text
+/// @param precision  the number of digits after the decimal point
+/// @return 0, 1, 2, 3 or 4, denoting the order of magnitude of the number.
+OPENVDB_API int
+printNumber(std::ostream& os, uint64_t number,
+    const std::string& head = "",
+    const std::string& tail = "\n",
+    bool exact = true, int width = 8, int precision = 3);
+
+
+////////////////////////////////////////
+
+
+/// @brief I/O manipulator that formats integer values with thousands separators
+///
+/// @details For example, 1234567 is output as "1,234,567" and -1234 as "-1,234".
+/// A leading minus sign is never followed directly by a separator.
+template<typename IntT>
+class FormattedInt
+{
+public:
+    /// @return the character inserted between groups of three digits
+    static char sep() { return ','; }
+
+    FormattedInt(IntT n): mInt(n) {}
+
+    /// @brief Write the stored value, with separators, to @a os.
+    /// @return @a os, to allow chaining
+    std::ostream& put(std::ostream& os) const
+    {
+        // Convert the integer to a string.
+        std::ostringstream ostr;
+        ostr << mInt;
+        const std::string s = ostr.str();
+        // A leading minus sign is not a digit and must not take part in the
+        // grouping (otherwise -123 would be output as "-,123").
+        const size_t start = (!s.empty() && s[0] == '-') ? 1 : 0;
+        const size_t numDigits = s.size() - start;
+        // Copy the sign (if any), then the digits, inserting a separator
+        // whenever the number of digits still to come is a multiple of three.
+        std::string grouped = s.substr(0, start);
+        for (size_t i = 0; i < numDigits; ++i) {
+            if (i > 0 && (numDigits - i) % 3 == 0) {
+                grouped += sep();
+            }
+            grouped += s[start + i];
+        }
+        os << grouped;
+        return os;
+    }
+
+private:
+    IntT mInt;
+};
+
+/// Stream insertion for FormattedInt: delegates to FormattedInt::put().
+template<typename IntT>
+std::ostream& operator<<(std::ostream& os, const FormattedInt<IntT>& n)
+{
+    n.put(os);
+    return os;
+}
+
+/// @return an I/O manipulator that formats the given integer value for output to a stream.
+template<typename IntT>
+FormattedInt<IntT> formattedInt(IntT n)
+{
+    const FormattedInt<IntT> manipulator(n);
+    return manipulator;
+}
+
+} // namespace util
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_UTIL_FORMATS_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/util/MapsUtil.h b/nuparu/include/openvdb_new/util/MapsUtil.h
new file mode 100644
index 00000000..6a21ecdd
--- /dev/null
+++ b/nuparu/include/openvdb_new/util/MapsUtil.h
@@ -0,0 +1,321 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file MapsUtil.h
+
+#ifndef OPENVDB_UTIL_MAPSUTIL_HAS_BEEN_INCLUDED
+#define OPENVDB_UTIL_MAPSUTIL_HAS_BEEN_INCLUDED
+
+#include <openvdb/math/Maps.h>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace util {
+
+// Utility methods for calculating bounding boxes
+
+/// @brief Calculate an axis-aligned bounding box in the given map's domain
+/// (e.g., index space) from an axis-aligned bounding box in its range
+/// (e.g., world space)
+///
+/// @param map  the map whose inverse is applied to the corners of @a in
+/// @param in   the axis-aligned box in the map's range
+/// @param out  on return, the axis-aligned box that encloses the pre-images
+///             of all eight corners of @a in
+template<typename MapType>
+inline void
+calculateBounds(const MapType& map, const BBoxd& in, BBoxd& out)
+{
+    const Vec3d& min = in.min();
+    const Vec3d& max = in.max();
+
+    // The eight corners of the input box.  Each of the eight min/max
+    // combinations must appear exactly once.
+    Vec3d corners[8];
+    corners[0] = min;
+    corners[1] = Vec3d(max(0), min(1), min(2));
+    corners[2] = Vec3d(max(0), max(1), min(2));
+    corners[3] = Vec3d(min(0), max(1), min(2));
+    corners[4] = Vec3d(min(0), min(1), max(2));
+    corners[5] = Vec3d(max(0), min(1), max(2));
+    corners[6] = max;
+    corners[7] = Vec3d(min(0), max(1), max(2));
+
+    // Seed both extrema with the pre-image of the first corner (a point in the
+    // map's domain), then grow the box to include the remaining pre-images.
+    Vec3d& out_min = out.min();
+    Vec3d& out_max = out.max();
+    out_min = map.applyInverseMap(corners[0]);
+    out_max = out_min;
+    for (int i = 1; i < 8; ++i) {
+        const Vec3d pre_image = map.applyInverseMap(corners[i]);
+        for (int j = 0; j < 3; ++j) {
+            out_min(j) = std::min( out_min(j), pre_image(j));
+            out_max(j) = std::max( out_max(j), pre_image(j));
+        }
+    }
+}
+
+
+/// @brief Calculate an axis-aligned bounding box in the given map's domain
+/// from a spherical bounding box in its range.
+///
+/// @param map     the map relating the domain (e.g., index space) to the range
+/// @param center  the center of the sphere, in the map's range
+/// @param radius  the radius of the sphere
+/// @param out     on return, a bounding box in continuous index space
+///                that encloses the sphere
+template<typename MapType>
+inline void
+calculateBounds(const MapType& map, const Vec3d& center, const Real radius, BBoxd& out)
+{
+    if (!math::is_linear<MapType>::value) {
+        // This is some unknown map type.  Form an axis-aligned bounding box
+        // for the sphere in world space, and let the box overload compute the
+        // axis-aligned box of the pre-images of its corners in index space.
+        const Vec3d halfExtent = radius*Vec3d(1,1,1);
+        BBoxd bounding_box(center - halfExtent, center + halfExtent);
+        calculateBounds<MapType>(map, bounding_box, out);
+        return;
+    }
+
+    // Linear map: the image of a sphere under the inverse map is an ellipsoid.
+    //
+    // I want to find extrema for three functions f(x', y', z') = x', or = y', or = z'
+    // with the constraint that g = (x-xo)^2 + (y-yo)^2 + (z-zo)^2 = r^2.
+    // Where the point x,y,z is the image of x',y',z'
+    // Solve: \lambda Grad(g) = Grad(f) and g = r^2.
+    // Note: here (x,y,z) is the image of (x',y',z'), and the gradient
+    // is w.r.t the (') space.
+    //
+    // This can be solved exactly: e_a^T (x' -xo') =\pm r\sqrt(e_a^T J^(-1)J^(-T)e_a)
+    // where e_a is one of the three unit vectors.   -  djh.
+
+    // The pre-image of the center of the sphere
+    const Vec3d center_pre_image = map.applyInverseMap(center);
+
+    // The three unit vectors e_a
+    const Vec3d coordinate_units[3] = { Vec3d(1,0,0), Vec3d(0,1,0), Vec3d(0,0,1) };
+
+    Vec3d& out_min = out.min();
+    Vec3d& out_max = out.max();
+    for (int axis = 0; axis < 3; ++axis) {
+        const Vec3d row = map.applyIJT(coordinate_units[axis]);
+        // Half-width of the ellipsoid along this axis: r times the length of row.
+        const double offset =
+            radius * sqrt(row.x()*row.x() + row.y()*row.y() + row.z()*row.z());
+        out_min(axis) = center_pre_image(axis) - offset;
+        out_max(axis) = center_pre_image(axis) + offset;
+    }
+}
+
+
+namespace { // anonymous namespace for this helper function
+
+/// @brief Find the points at which lines through the focal point
+/// \f$ (x=0, z=-1/g)\f$ are tangent to the circle
+/// \f$ (x-xo)^2 + (z-zo)^2 = r^2 \f$.
+/// @param g       the reciprocal of the focal distance (the focal point is at z = -1/g)
+/// @param xo, zo  the center of the circle
+/// @param r       the radius of the circle
+/// @param xp, zp  on return, the first tangent point (set only if the result is non-zero)
+/// @param xm, zm  on return, the second tangent point (set only if the result is non-zero)
+/// @return 0 if the focal point (0, -1/g) is inside the circle,
+/// 1 if the focal point touches the circle, or 2 when both points are found.
+inline int
+findTangentPoints(const double g, const double xo, const double zo,
+    const double r, double& xp, double& zp, double& xm, double& zm)
+{
+    const double xoSqr = xo * xo;
+    const double rSqr = r * r;
+    const double gx = g * xo;
+    const double gz1 = g * zo + 1.;
+    const double gz1Sqr = gz1*gz1;
+    const double grSqr = rSqr*g*g;
+
+    // g^2 times the squared distance from the focal point to the circle's center ...
+    const double focalSqr = gx*gx + gz1Sqr;
+    // ... and the same quantity less g^2 r^2: positive when the focal point is outside.
+    const double tangentSqr = focalSqr - grSqr;
+
+    if (tangentSqr > 0) {
+        const double root = sqrt(tangentSqr);
+
+        // The two solutions share a common part and differ in the sign of the rest.
+        const double xCommon = xo - xo*grSqr/focalSqr;
+        const double xSwing = r * gz1 *root / focalSqr;
+        xp = xCommon + xSwing;
+        xm = xCommon - xSwing;
+
+        const double zCommon = zo*gz1Sqr + gz1*g*(xoSqr - rSqr) - xo*xo*g;
+        const double zSwing = r*gx*root;
+        zp = (zCommon - zSwing) / focalSqr;
+        zm = (zCommon + zSwing) / focalSqr;
+
+        return 2;
+    }
+
+    if (tangentSqr <= 0 && tangentSqr >= -1e-9) {
+        // the circle touches the focal point (x=0, z = -1/g)
+        const double zFocal = -1/g;
+        xp = 0;
+        xm = 0;
+        zp = zFocal;
+        zm = zFocal;
+
+        return 1;
+    }
+
+    return 0;
+}
+
+} // end anonymous namespace
+
+
+/// @brief Calculate an axis-aligned bounding box in index space
+/// from a spherical bounding box in world space.
+/// @note This specialization is optimized for a frustum map
+///
+/// @param frustum  the frustum map
+/// @param center   the center of the sphere in world space
+/// @param radius   the radius of the sphere in world space
+/// @param out      on return, the bounding box in index space; its extents are
+///                 clamped to the frustum's own bounding box (frustum.getBBox())
+template<>
+inline void
+calculateBounds<math::NonlinearFrustumMap>(const math::NonlinearFrustumMap& frustum,
+    const Vec3d& center, const Real radius, BBoxd& out)
+{
+    // The frustum is a nonlinear map followed by a uniform scale, rotation, translation.
+    // First we invert the translation, rotation and scale to find the spherical pre-image
+    // of the sphere in "local" coordinates where the frustum is aligned with the near plane
+    // on the z=0 plane and the "camera" is located at (x=0, y=0, z=-1/g).
+
+    // check that the internal map has no shear.
+    const math::AffineMap& secondMap = frustum.secondMap();
+    // test if the linear part has shear or non-uniform scaling
+    if (!frustum.hasSimpleAffine()) {
+
+        // In this case, we form an axis-aligned bounding box for the sphere in world space
+        // and find the pre-images of the corners in voxel space.  From these corners we
+        // compute an axis-aligned bounding box in voxel space
+        BBoxd bounding_box(center - radius*Vec3d(1,1,1), center + radius*Vec3d(1,1,1));
+        calculateBounds<math::NonlinearFrustumMap>(frustum, bounding_box, out);
+        return;
+    }
+
+    // for convenience
+    Vec3d& out_min = out.min();
+    Vec3d& out_max = out.max();
+
+    // The sphere's center and radius in the "local" space described above
+    Vec3d centerLS = secondMap.applyInverseMap(center);
+    Vec3d voxelSize = secondMap.voxelSize();
+
+    // all the voxels have the same size since we know this is a simple affine map
+    double radiusLS = radius / voxelSize(0);
+
+    double gamma = frustum.getGamma();
+    // Outputs of findTangentPoints(): the two tangent points (xp, zp) and (xm, zm)
+    double xp;
+    double zp;
+    double xm;
+    double zm;
+    int soln_number;
+
+    // the bounding box in index space for the points in the frustum
+    const BBoxd&  bbox = frustum.getBBox();
+    // initialize min and max
+    const double x_min = bbox.min().x();
+    const double y_min = bbox.min().y();
+    const double z_min = bbox.min().z();
+
+    const double x_max = bbox.max().x();
+    const double y_max = bbox.max().y();
+    const double z_max = bbox.max().z();
+
+    // Default the x and y ranges to the full extent of the frustum; they are
+    // narrowed below only when two tangent points are found.
+    out_min.x() = x_min;
+    out_max.x() = x_max;
+    out_min.y() = y_min;
+    out_max.y() = y_max;
+
+    Vec3d extreme;
+    Vec3d extreme2;
+    Vec3d pre_image;
+    // find the x-range
+    soln_number = findTangentPoints(gamma, centerLS.x(), centerLS.z(), radiusLS, xp, zp, xm, zm);
+    if (soln_number == 2) {
+        // The "p" tangent point gives the upper x bound.
+        extreme.x() = xp;
+        extreme.y() = centerLS.y();
+        extreme.z() = zp;
+
+        // location in world space of the tangent point
+        extreme2 = secondMap.applyMap(extreme);
+        // convert back to voxel space
+        pre_image = frustum.applyInverseMap(extreme2);
+        // clamp to [x_min, x_max]
+        out_max.x() = std::max(x_min, std::min(x_max, pre_image.x()));
+
+        // The "m" tangent point gives the lower x bound.
+        extreme.x() = xm;
+        extreme.y() = centerLS.y();
+        extreme.z() = zm;
+        // location in world space of the tangent point
+        extreme2 = secondMap.applyMap(extreme);
+
+        // convert back to voxel space
+        pre_image = frustum.applyInverseMap(extreme2);
+        // clamp to [x_min, x_max]
+        out_min.x() = std::max(x_min, std::min(x_max, pre_image.x()));
+
+    } else if (soln_number == 1) {
+        // the circle was tangent at the focal point
+        // (the x-range keeps the full frustum extent set above)
+    } else if (soln_number == 0) {
+        // the focal point was inside the circle
+        // (the x-range keeps the full frustum extent set above)
+    }
+
+    // find the y-range
+    // (the same 2D construction, applied in the y-z plane: xp and xm now hold y values)
+    soln_number = findTangentPoints(gamma, centerLS.y(), centerLS.z(), radiusLS, xp, zp, xm, zm);
+    if (soln_number == 2) {
+        // The "p" tangent point gives the upper y bound.
+        extreme.x() = centerLS.x();
+        extreme.y() = xp;
+        extreme.z() = zp;
+
+        // location in world space of the tangent point
+        extreme2 = secondMap.applyMap(extreme);
+        // convert back to voxel space
+        pre_image = frustum.applyInverseMap(extreme2);
+        // clamp to [y_min, y_max]
+        out_max.y() = std::max(y_min, std::min(y_max, pre_image.y()));
+
+        // The "m" tangent point gives the lower y bound.
+        extreme.x() = centerLS.x();
+        extreme.y() = xm;
+        extreme.z() = zm;
+        extreme2 = secondMap.applyMap(extreme);
+
+        // convert back to voxel space
+        pre_image = frustum.applyInverseMap(extreme2);
+        // clamp to [y_min, y_max]
+        out_min.y() = std::max(y_min, std::min(y_max, pre_image.y()));
+
+    } else if (soln_number == 1) {
+        // the circle was tangent at the focal point
+        // (the y-range keeps the full frustum extent set above)
+    } else if (soln_number == 0) {
+        // the focal point was inside the circle
+        // (the y-range keeps the full frustum extent set above)
+    }
+
+    // the near and far
+    // the closest point.  The front of the frustum is at 0 in index space
+    double near_dist = std::max(centerLS.z() - radiusLS, 0.);
+    // the farthest point.  The back of the frustum is at mDepth in index space
+    double far_dist = std::min(centerLS.z() + radiusLS, frustum.getDepth() );
+
+    // Points on the local z axis at the near and far distances
+    Vec3d near_point(0.f, 0.f, near_dist);
+    Vec3d far_point(0.f, 0.f, far_dist);
+
+    // Map both points to world space and back through the full frustum map,
+    // limiting the result to the frustum's own z-range.
+    out_min.z() = std::max(z_min, frustum.applyInverseMap(secondMap.applyMap(near_point)).z());
+    out_max.z() = std::min(z_max, frustum.applyInverseMap(secondMap.applyMap(far_point)).z());
+
+}
+
+} // namespace util
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_UTIL_MAPSUTIL_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/util/Name.h b/nuparu/include/openvdb_new/util/Name.h
new file mode 100644
index 00000000..b5df425f
--- /dev/null
+++ b/nuparu/include/openvdb_new/util/Name.h
@@ -0,0 +1,72 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_UTIL_NAME_HAS_BEEN_INCLUDED
+#define OPENVDB_UTIL_NAME_HAS_BEEN_INCLUDED
+
+#include <openvdb/Platform.h>
+#include <openvdb/version.h>
+#include <string>
+#include <iostream>
+#include <vector>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+
+typedef std::string Name;
+
+/// Read a length-prefixed string (raw 32-bit size, then that many characters); empty on a failed size read.
+inline Name readString(std::istream& is)
+{
+    uint32_t size = 0; // initialized so a failed read cannot leave it indeterminate
+    if (!is.read(reinterpret_cast<char*>(&size), sizeof(uint32_t))) return Name();
+    std::string buffer(size, ' ');
+    if (size>0) is.read(&buffer[0], size);
+    return buffer;
+}
+
+
+/// Write @a name as a raw 32-bit length followed by its characters (counterpart of readString).
+inline void writeString(std::ostream& os, const Name& name)
+{
+    const uint32_t length = static_cast<uint32_t>(name.size());
+    os.write(reinterpret_cast<const char*>(&length), sizeof(length));
+    os.write(name.data(), length);
+}
+
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_UTIL_NAME_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/util/NodeMasks.h b/nuparu/include/openvdb_new/util/NodeMasks.h
new file mode 100644
index 00000000..8c525a51
--- /dev/null
+++ b/nuparu/include/openvdb_new/util/NodeMasks.h
@@ -0,0 +1,1411 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @author Ken Museth
+///
+/// @file NodeMasks.h
+
+#ifndef OPENVDB_UTIL_NODEMASKS_HAS_BEEN_INCLUDED
+#define OPENVDB_UTIL_NODEMASKS_HAS_BEEN_INCLUDED
+
+#include <cassert>
+#include <cstring>
+#include <iostream>// for cout
+#include <openvdb/Platform.h>
+#include <openvdb/Types.h>
+//#include <boost/mpl/if.hpp>
+//#include <strings.h> // for ffs
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace util {
+
+/// @brief Return the number of on bits in the given 8-bit value.
+/// @details Looks the answer up in a 256-entry table built by helper macros; the
+/// macros are uniquely named and undefined after use so they cannot leak to includers.
+inline Index32
+CountOn(Byte v)
+{
+#ifndef _MSC_VER // Visual C++ doesn't guarantee thread-safe initialization of local statics
+    static
+#endif
+    const Byte numBits[256] = {
+#   define OPENVDB_COUNTON_B2(n)  n, n+1, n+1, n+2
+#   define OPENVDB_COUNTON_B4(n)  OPENVDB_COUNTON_B2(n), OPENVDB_COUNTON_B2(n+1), OPENVDB_COUNTON_B2(n+1), OPENVDB_COUNTON_B2(n+2)
+#   define OPENVDB_COUNTON_B6(n)  OPENVDB_COUNTON_B4(n), OPENVDB_COUNTON_B4(n+1), OPENVDB_COUNTON_B4(n+1), OPENVDB_COUNTON_B4(n+2)
+        OPENVDB_COUNTON_B6(0), OPENVDB_COUNTON_B6(1), OPENVDB_COUNTON_B6(1), OPENVDB_COUNTON_B6(2)
+    };
+#   undef OPENVDB_COUNTON_B6
+#   undef OPENVDB_COUNTON_B4
+#   undef OPENVDB_COUNTON_B2
+    return numBits[v];
+
+    // Alternatives that were considered but are not used:
+    //  - clearing the least significant on bit in a loop: for (c = 0; v; c++) v &= v - 0x01U;
+    //  - (v * UINT64_C(0x200040008001) & UINT64_C(0x111111111111111)) % 0xF, only fast with fast "%" and "*"
+}
+/// Return the number of off bits in the given 8-bit value (the on bits of its complement).
+inline Index32 CountOff(Byte v) { const Byte flipped = static_cast<Byte>(~v); return CountOn(flipped); }
+
+/// Population count of a 32-bit value, computed branch-free by summing bit fields in parallel.
+inline Index32
+CountOn(Index32 v)
+{
+    const Index32 pairs   = v - ((v >> 1) & 0x55555555U);                         // count per 2-bit field
+    const Index32 nibbles = (pairs & 0x33333333U) + ((pairs >> 2) & 0x33333333U); // count per 4-bit field
+    return (((nibbles + (nibbles >> 4)) & 0xF0F0F0FU) * 0x1010101U) >> 24;        // byte counts summed into the top byte
+}
+
+/// Return the number of off bits in the given 32-bit value (the on bits of its complement).
+inline Index32 CountOff(Index32 v) { const Index32 flipped = ~v; return CountOn(flipped); }
+
+/// Population count of a 64-bit value, computed branch-free by summing bit fields in parallel.
+inline Index32
+CountOn(Index64 v)
+{
+    const Index64 pairs   = v - ((v >> 1) & UINT64_C(0x5555555555555555)); // count per 2-bit field
+    const Index64 nibbles = (pairs & UINT64_C(0x3333333333333333)) + ((pairs >> 2) & UINT64_C(0x3333333333333333));
+    const Index64 bytes   = (nibbles + (nibbles >> 4)) & UINT64_C(0xF0F0F0F0F0F0F0F); // count per byte
+    return static_cast<Index32>((bytes * UINT64_C(0x101010101010101)) >> 56); // byte counts summed into the top byte
+}
+
+/// Return the number of off bits in the given 64-bit value (the on bits of its complement).
+inline Index32 CountOff(Index64 v) { const Index64 flipped = ~v; return CountOn(flipped); }
+
+/// Return the index of the least significant on bit of the given (non-zero) 8-bit value.
+inline Index32 FindLowestOn(Byte v)
+{
+    assert(v);
+#ifndef _MSC_VER // Visual C++ doesn't guarantee thread-safe initialization of local statics
+    static
+#endif
+    const Byte bitIndex[8] = {0, 1, 6, 2, 7, 5, 4, 3};
+    const Byte lowest = static_cast<Byte>(v & -v); // isolates the least significant on bit
+    return bitIndex[static_cast<Byte>(lowest * 0x1DU) >> 5]; // top 3 bits of the 8-bit product select the entry
+}
+
+/// Return the index of the least significant on bit of the given (non-zero) 32-bit value.
+inline Index32
+FindLowestOn(Index32 v)
+{
+    assert(v);
+#ifndef _MSC_VER // Visual C++ doesn't guarantee thread-safe initialization of local statics
+    static
+#endif
+    const Byte bitIndex[32] = {
+        0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
+        31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
+    };
+    const Index32 lowest = v & -v; // isolates the least significant on bit
+    return bitIndex[Index32(lowest * 0x077CB531U) >> 27]; // top 5 bits of the product select the entry
+}
+
+/// Return the index of the least significant on bit of the given (non-zero) 64-bit value.
+inline Index32
+FindLowestOn(Index64 v)
+{
+    assert(v);
+#ifndef _MSC_VER // Visual C++ doesn't guarantee thread-safe initialization of local statics
+    static
+#endif
+    const Byte bitIndex[64] = {
+        0,   1,  2, 53,  3,  7, 54, 27, 4,  38, 41,  8, 34, 55, 48, 28,
+        62,  5, 39, 46, 44, 42, 22,  9, 24, 35, 59, 56, 49, 18, 29, 11,
+        63, 52,  6, 26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10,
+        51, 25, 36, 32, 60, 20, 57, 16, 50, 31, 19, 15, 30, 14, 13, 12,
+    };
+    const Index64 lowest = v & -v; // isolates the least significant on bit
+    return bitIndex[Index64(lowest * UINT64_C(0x022FDD63CC95386D)) >> 58]; // top 6 bits of the product select the entry
+}
+
+/// @brief Return the index of the most significant on bit of the given 32-bit value.
+/// @details Every bit below the highest on bit is switched on first, giving one less
+/// than a power of two; the top 5 bits of the product then select the table entry.
+/// @note No assertion guards against @a v == 0; in that case the lookup yields 0.
+inline Index32
+FindHighestOn(Index32 v)
+{
+#ifndef _MSC_VER // Visual C++ doesn't guarantee thread-safe initialization of local statics
+    static
+#endif
+    const Byte bitIndex[32] = {
+        0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
+        8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
+    };
+    for (Index32 shift = 1; shift <= 16; shift <<= 1) v |= v >> shift; // shifts of 1, 2, 4, 8 and 16
+    const Index32 hashed = Index32(v * 0x07C4ACDDU);
+    return bitIndex[hashed >> 27];
+}
+
+
+////////////////////////////////////////
+
+
+/// @brief Base class for the bit mask iterators
+/// @details Stores a bit position and a read-only pointer to the mask being traversed;
+/// a position equal to NodeMask::SIZE marks the end of the iteration.
+template <typename NodeMask>
+class BaseMaskIterator
+{
+public:
+    BaseMaskIterator() : mPos(NodeMask::SIZE), mParent(NULL) {}
+    BaseMaskIterator(Index32 pos, const NodeMask* parent) : mPos(pos), mParent(parent)
+    {
+        assert( (parent==NULL && pos==0 ) ||  (parent!=NULL && pos<=NodeMask::SIZE) );
+    }
+    BaseMaskIterator& operator=(const BaseMaskIterator& rhs)
+    {
+        mPos = rhs.mPos; mParent = rhs.mParent;
+        return *this;
+    }
+    // The comparisons below look at the bit position only, not at the parent mask.
+    bool operator==(const BaseMaskIterator& rhs) const { return mPos == rhs.mPos; }
+    bool operator!=(const BaseMaskIterator& rhs) const { return !(*this == rhs); }
+    bool operator< (const BaseMaskIterator& rhs) const { return mPos <  rhs.mPos; }
+    Index32 offset() const { return mPos; }
+    Index32 pos() const { return this->offset(); }
+    bool test() const { assert(mPos  <= NodeMask::SIZE); return (mPos != NodeMask::SIZE); }
+    operator bool() const { return this->test(); }
+protected:
+    Index32          mPos;    // bit position
+    const NodeMask*  mParent; // this iterator can't change the parent mask
+}; // class BaseMaskIterator
+
+
+/// @brief Iterator that stops only on the on bits of a NodeMask
+/// @note This happens to be a const-iterator!
+template <typename NodeMask>
+class OnMaskIterator: public BaseMaskIterator<NodeMask>
+{
+    typedef BaseMaskIterator<NodeMask> BaseType;
+    using BaseType::mPos;    // bit position
+    using BaseType::mParent; // mask being traversed (never modified here)
+public:
+    OnMaskIterator() : BaseType() {}
+    OnMaskIterator(Index32 pos, const NodeMask* parent) : BaseType(pos, parent) {}
+    /// Move to the position the parent's findNextOn() reports for the following bit
+    void increment()
+    {
+        assert(mParent != NULL);
+        const Index32 searchFrom = mPos + 1;
+        mPos = mParent->findNextOn(searchFrom);
+        assert(mPos <= NodeMask::SIZE);
+    }
+    /// Advance up to @a n times, stopping early once next() reports the end
+    void increment(Index n)
+    {
+        for (; n != 0; --n) { if (!this->next()) break; }
+    }
+    /// Advance once and return @c true if the end has not been reached
+    bool next() { this->increment(); return this->test(); }
+    /// Always @c true for this iterator
+    bool operator*() const { return true; }
+    OnMaskIterator& operator++() { this->increment(); return *this; }
+}; // class OnMaskIterator
+
+
+/// @brief Iterator that stops only on the off bits of a NodeMask
+template <typename NodeMask>
+class OffMaskIterator: public BaseMaskIterator<NodeMask>
+{
+    typedef BaseMaskIterator<NodeMask> BaseType;
+    using BaseType::mPos;    // bit position
+    using BaseType::mParent; // mask being traversed (never modified here)
+public:
+    OffMaskIterator() : BaseType()  {}
+    OffMaskIterator(Index32 pos, const NodeMask* parent) : BaseType(pos, parent) {}
+    /// Move to the position the parent's findNextOff() reports for the following bit
+    void increment()
+    {
+        assert(mParent != NULL);
+        const Index32 searchFrom = mPos + 1;
+        mPos = mParent->findNextOff(searchFrom);
+        assert(mPos <= NodeMask::SIZE);
+    }
+    /// Advance up to @a n times, stopping early once next() reports the end
+    void increment(Index n)
+    {
+        for (; n != 0; --n) { if (!this->next()) break; }
+    }
+    /// Advance once and return @c true if the end has not been reached
+    bool next() { this->increment(); return this->test(); }
+    /// Always @c false for this iterator
+    bool operator*() const { return false; }
+    OffMaskIterator& operator++() { this->increment(); return *this; }
+}; // class OffMaskIterator
+
+
+/// @brief Iterator that visits every bit position of a NodeMask in order
+/// @details Dereferencing yields the state (on/off) of the bit at the current position.
+template <typename NodeMask>
+class DenseMaskIterator: public BaseMaskIterator<NodeMask>
+{
+    typedef BaseMaskIterator<NodeMask> BaseType;
+    using BaseType::mPos;    // bit position
+    using BaseType::mParent; // mask being traversed (never modified here)
+
+public:
+    DenseMaskIterator() : BaseType() {}
+    DenseMaskIterator(Index32 pos, const NodeMask* parent) : BaseType(pos, parent) {}
+    /// Step to the next bit; going beyond NodeMask::SIZE is only caught by the assertion
+    void increment()
+    {
+        assert(mParent != NULL);
+        ++mPos;
+        assert(mPos<= NodeMask::SIZE);
+    }
+    /// Advance up to @a n times, stopping early once next() reports the end
+    void increment(Index n)
+    {
+        for (; n != 0; --n) { if (!this->next()) break; }
+    }
+    /// Advance once and return @c true if the end has not been reached
+    bool next() { this->increment(); return this->test(); }
+    /// Return the state of the bit at the current position
+    bool operator*() const { return mParent->isOn(mPos); }
+    DenseMaskIterator& operator++() { this->increment(); return *this; }
+}; // class DenseMaskIterator
+
+
+/// @brief Bit mask for the internal and leaf nodes of VDB. This
+/// is a 64-bit implementation.
+///
+/// @note Template specializations for Log2Dim=1 and Log2Dim=2 are
+/// given below.
+template<Index Log2Dim>
+class NodeMask
+{
+public:
+    BOOST_STATIC_ASSERT( Log2Dim>2 );
+
+    static const Index32 LOG2DIM    = Log2Dim;
+    static const Index32 DIM        = 1<<Log2Dim;
+    static const Index32 SIZE       = 1<<3*Log2Dim;// total number of bits, i.e. DIM^3
+    static const Index32 WORD_COUNT = SIZE >> 6;// number of 64-bit words (2^6=64 bits each)
+    typedef Index64 Word;
+
+private:
+
+    // The bits are represented as a linear array of Words; Word is
+    // always the 64-bit Index64 (see the typedef above). The commented-out
+    // lines below sketch a platform-dependent word size that is not in use.
+    //static const Index32 BIT_MASK   = sizeof(void*) == 8 ? 63 : 31;
+    //static const Index32 LOG2WORD   = BIT_MASK == 63 ? 6 : 5;
+    //static const Index32 WORD_COUNT = SIZE >> LOG2WORD;
+    //typedef boost::mpl::if_c<BIT_MASK == 63, Index64, Index32>::type Word;
+
+    Word mWords[WORD_COUNT];//only member data!
+
+public:
+    /// Default constructor sets all bits off
+    NodeMask() { this->setOff(); }
+    /// All bits are set to the specified state
+     NodeMask(bool on) { this->set(on); }
+    /// Copy constructor
+    NodeMask(const NodeMask &other) { *this = other; }
+    /// Destructor
+    ~NodeMask() {}
+    /// Assignment operator (word-by-word copy)
+    NodeMask& operator=(const NodeMask& other)
+    {
+        Index32 n = WORD_COUNT;
+        const Word* w2 = other.mWords;
+        for (Word* w1 = mWords; n--; ++w1, ++w2) *w1 = *w2;
+        return *this;
+    }
+
+    typedef OnMaskIterator<NodeMask>    OnIterator;
+    typedef OffMaskIterator<NodeMask>   OffIterator;
+    typedef DenseMaskIterator<NodeMask> DenseIterator;
+
+    OnIterator beginOn() const       { return OnIterator(this->findFirstOn(),this); }
+    OnIterator endOn() const         { return OnIterator(SIZE,this); }
+    OffIterator beginOff() const     { return OffIterator(this->findFirstOff(),this); }
+    OffIterator endOff() const       { return OffIterator(SIZE,this); }
+    DenseIterator beginDense() const { return DenseIterator(0,this); }
+    DenseIterator endDense() const   { return DenseIterator(SIZE,this); }
+
+    bool operator == (const NodeMask &other) const
+    {
+        int n = WORD_COUNT;
+        for (const Word *w1=mWords, *w2=other.mWords; n-- && *w1++ == *w2++;) ;
+        return n == -1;// n only reaches -1 if no pair of words differed
+    }
+
+    bool operator != (const NodeMask &other) const { return !(*this == other); }
+
+    //
+    // Bitwise logical operations
+    //
+
+    /// @brief Apply a functor to the words of this mask and the other mask.
+    ///
+    /// @details An example that implements the "operator&=" method:
+    /// @code
+    /// struct Op { inline void operator()(W &w1, const W& w2) const { w1 &= w2; } };
+    /// @endcode
+    template<typename WordOp>
+    const NodeMask& foreach(const NodeMask& other, const WordOp& op)
+    {
+        Word *w1 = mWords;
+        const Word *w2 = other.mWords;
+        for (Index32 n = WORD_COUNT; n--;  ++w1, ++w2) op( *w1, *w2);
+        return *this;
+    }
+    template<typename WordOp>
+    const NodeMask& foreach(const NodeMask& other1, const NodeMask& other2, const WordOp& op)
+    {
+        Word *w1 = mWords;
+        const Word *w2 = other1.mWords, *w3 = other2.mWords;
+        for (Index32 n = WORD_COUNT; n--;  ++w1, ++w2, ++w3) op( *w1, *w2, *w3);
+        return *this;
+    }
+    template<typename WordOp>
+    const NodeMask& foreach(const NodeMask& other1, const NodeMask& other2, const NodeMask& other3,
+                            const WordOp& op)
+    {
+        Word *w1 = mWords;
+        const Word *w2 = other1.mWords, *w3 = other2.mWords, *w4 = other3.mWords;
+        for (Index32 n = WORD_COUNT; n--;  ++w1, ++w2, ++w3, ++w4) op( *w1, *w2, *w3, *w4);
+        return *this;
+    }
+    /// @brief Bitwise intersection
+    const NodeMask& operator&=(const NodeMask& other)
+    {
+        Word *w1 = mWords;
+        const Word *w2 = other.mWords;
+        for (Index32 n = WORD_COUNT; n--;  ++w1, ++w2) *w1 &= *w2;
+        return *this;
+    }
+    /// @brief Bitwise union
+    const NodeMask& operator|=(const NodeMask& other)
+    {
+        Word *w1 = mWords;
+        const Word *w2 = other.mWords;
+        for (Index32 n = WORD_COUNT; n--;  ++w1, ++w2) *w1 |= *w2;
+        return *this;
+    }
+    /// @brief Bitwise difference (clears every bit that is on in @a other)
+    const NodeMask& operator-=(const NodeMask& other)
+    {
+        Word *w1 = mWords;
+        const Word *w2 = other.mWords;
+        for (Index32 n = WORD_COUNT; n--;  ++w1, ++w2) *w1 &= ~*w2;
+        return *this;
+    }
+    /// @brief Bitwise XOR
+    const NodeMask& operator^=(const NodeMask& other)
+    {
+        Word *w1 = mWords;
+        const Word *w2 = other.mWords;
+        for (Index32 n = WORD_COUNT; n--;  ++w1, ++w2) *w1 ^= *w2;
+        return *this;
+    }
+    NodeMask operator!()                      const { NodeMask m(*this); m.toggle(); return m; }
+    NodeMask operator&(const NodeMask& other) const { NodeMask m(*this); m &= other; return m; }
+    NodeMask operator|(const NodeMask& other) const { NodeMask m(*this); m |= other; return m; }
+    NodeMask operator^(const NodeMask& other) const { NodeMask m(*this); m ^= other; return m; }
+
+    /// Return the byte size of this NodeMask
+    static Index32 memUsage() { return static_cast<Index32>(WORD_COUNT*sizeof(Word)); }
+    /// Return the total number of on bits
+    Index32 countOn() const
+    {
+        Index32 sum = 0, n = WORD_COUNT;
+        for (const Word* w = mWords; n--; ++w) sum += CountOn(*w);
+        return sum;
+    }
+    /// Return the total number of off bits
+    Index32 countOff() const { return SIZE-this->countOn(); }
+    /// Set the <i>n</i>th  bit on
+    void setOn(Index32 n) {
+        assert( (n >> 6) < WORD_COUNT );
+        mWords[n >> 6] |=  Word(1) << (n & 63);// word index is n/64, bit within the word is n%64
+    }
+    /// Set the <i>n</i>th bit off
+    void setOff(Index32 n) {
+        assert( (n >> 6) < WORD_COUNT );
+        mWords[n >> 6] &=  ~(Word(1) << (n & 63));
+    }
+    /// Set the <i>n</i>th bit to the specified state
+    void set(Index32 n, bool On) { On ? this->setOn(n) : this->setOff(n); }
+    /// Set all bits to the specified state
+    void set(bool on)
+    {
+        const Word state = on ? ~Word(0) : Word(0);
+        Index32 n = WORD_COUNT;
+        for (Word* w = mWords; n--; ++w) *w = state;
+    }
+    /// Set all bits on
+    void setOn()
+    {
+        Index32 n = WORD_COUNT;
+        for (Word* w = mWords; n--; ++w) *w = ~Word(0);
+    }
+    /// Set all bits off
+    void setOff()
+    {
+        Index32 n = WORD_COUNT;
+        for (Word* w = mWords; n--; ++w) *w = Word(0);
+    }
+    /// Toggle the state of the <i>n</i>th bit
+    void toggle(Index32 n) {
+        assert( (n >> 6) < WORD_COUNT );
+        mWords[n >> 6] ^= Word(1) << (n & 63);
+    }
+    /// Toggle the state of all bits in the mask
+    void toggle()
+    {
+        Index32 n = WORD_COUNT;
+        for (Word* w = mWords; n--; ++w) *w = ~*w;
+    }
+    /// Set the first bit on
+    void setFirstOn()  { this->setOn(0); }
+    /// Set the last bit on
+    void setLastOn()   { this->setOn(SIZE-1); }
+    /// Set the first bit off
+    void setFirstOff() { this->setOff(0); }
+    /// Set the last bit off
+    void setLastOff()  { this->setOff(SIZE-1); }
+    /// Return @c true if the <i>n</i>th bit is on
+    bool isOn(Index32 n) const
+    {
+        assert( (n >> 6) < WORD_COUNT );
+        return 0 != (mWords[n >> 6] & (Word(1) << (n & 63)));
+    }
+    /// Return @c true if the <i>n</i>th bit is off
+    bool isOff(Index32 n) const {return !this->isOn(n); }
+    /// Return @c true if all the bits are on
+    bool isOn() const
+    {
+        int n = WORD_COUNT;
+        for (const Word *w = mWords; n-- && *w++ == ~Word(0);) ;
+        return n == -1;// n only reaches -1 if every word had all bits on
+    }
+    /// Return @c true if all the bits are off
+    bool isOff() const
+    {
+        int n = WORD_COUNT;
+        for (const Word *w = mWords; n-- && *w++ == Word(0);) ;
+        return n == -1;// n only reaches -1 if every word was zero
+    }
+    Index32 findFirstOn() const// index of the first on bit, or SIZE if there is none
+    {
+        Index32 n = 0;
+        const Word* w = mWords;
+        for (; n<WORD_COUNT && !*w; ++w, ++n) ;// skip words that are entirely off
+        return n==WORD_COUNT ? SIZE : (n << 6) + FindLowestOn(*w);
+    }
+    Index32 findFirstOff() const// index of the first off bit, or SIZE if there is none
+    {
+        Index32 n = 0;
+        const Word* w = mWords;
+        for (; n<WORD_COUNT && !~*w; ++w, ++n) ;// skip words that are entirely on
+        return n==WORD_COUNT ? SIZE : (n << 6) + FindLowestOn(~*w);
+    }
+
+    //@{
+    /// Return the <i>n</i>th word of the bit mask, for a word of arbitrary size.
+    template<typename WordT>
+    WordT getWord(Index n) const
+    {
+        assert(n*8*sizeof(WordT) < SIZE);// the first bit of word n must lie inside the mask
+        return reinterpret_cast<const WordT*>(mWords)[n];
+    }
+    template<typename WordT>
+    WordT& getWord(Index n)
+    {
+        assert(n*8*sizeof(WordT) < SIZE);// the first bit of word n must lie inside the mask
+        return reinterpret_cast<WordT*>(mWords)[n];
+    }
+    //@}
+
+    void save(std::ostream& os) const// writes the words as raw bytes (memUsage() of them)
+    {
+        os.write(reinterpret_cast<const char*>(mWords), this->memUsage());
+    }
+    void load(std::istream& is) {// reads memUsage() raw bytes straight into the words
+        is.read(reinterpret_cast<char*>(mWords), this->memUsage());
+    }
+    /// @brief simple print method for debugging
+    void printInfo(std::ostream& os=std::cout) const
+    {
+        os << "NodeMask: Dim=" << DIM << " Log2Dim=" << Log2Dim
+            << " Bit count=" << SIZE << " word count=" << WORD_COUNT << std::endl;
+    }
+    void printBits(std::ostream& os=std::cout, Index32 max_out=80u) const
+    {
+        const Index32 n=(SIZE>max_out ? max_out : SIZE);// print at most max_out bits
+        for (Index32 i=0; i < n; ++i) {
+            if ( !(i & 63) )
+                os << "||";// separator at every 64-bit word boundary
+            else if ( !(i%8) )
+                os << "|";// separator at every byte boundary
+            os << this->isOn(i);
+        }
+        os << "|" << std::endl;
+    }
+    void printAll(std::ostream& os=std::cout, Index32 max_out=80u) const
+    {
+        this->printInfo(os);
+        this->printBits(os, max_out);
+    }
+
+    Index32 findNextOn(Index32 start) const// first on bit at or after start, or SIZE if none
+    {
+        Index32 n = start >> 6;//initiate
+        if (n >= WORD_COUNT) return SIZE; // check for out of bounds
+        Index32 m = start & 63;
+        Word b = mWords[n];
+        if (b & (Word(1) << m)) return start;//simple case: start is on
+        b &= ~Word(0) << m;// mask out lower bits
+        while(!b && ++n<WORD_COUNT) b = mWords[n];// find next non-zero word
+        return (!b ? SIZE : (n << 6) + FindLowestOn(b));//catch last word=0
+    }
+
+    Index32 findNextOff(Index32 start) const// first off bit at or after start, or SIZE if none
+    {
+        Index32 n = start >> 6;//initiate
+        if (n >= WORD_COUNT) return SIZE; // check for out of bounds
+        Index32 m = start & 63;
+        Word b = ~mWords[n];// inverted, so off bits of the mask appear as on bits of b
+        if (b & (Word(1) << m)) return start;//simple case: start is off
+        b &= ~Word(0) << m;// mask out lower bits
+        while(!b && ++n<WORD_COUNT) b = ~mWords[n];// find next word with an off bit
+        return (!b ? SIZE : (n << 6) + FindLowestOn(b));//catch last word=0
+    }
+};// NodeMask
+
+
+/// @brief Template specialization of NodeMask for Log2Dim=1, i.e. 2^3 nodes
+template<>
+class NodeMask<1>
+{
+public:
+    static const Index32 LOG2DIM    = 1;
+    static const Index32 DIM        = 2;
+    static const Index32 SIZE       = 8;
+    static const Index32 WORD_COUNT = 1;
+    typedef Byte Word;
+
+private:
+    Byte mByte;//only member data!
+
+public:
+    /// Default constructor sets all bits off
+    NodeMask() : mByte(0x00U) {}
+    /// All bits are set to the specified state
+    NodeMask(bool on) : mByte(on ? 0xFFU : 0x00U) {}
+    /// Copy constructor
+    NodeMask(const NodeMask &other) : mByte(other.mByte) {}
+    /// Destructor
+    ~NodeMask() {}
+    /// Assignment operator; returns @c *this like the general NodeMask template
+    NodeMask& operator=(const NodeMask& other) { mByte = other.mByte; return *this; }
+
+    typedef OnMaskIterator<NodeMask>    OnIterator;
+    typedef OffMaskIterator<NodeMask>   OffIterator;
+    typedef DenseMaskIterator<NodeMask> DenseIterator;
+
+    OnIterator beginOn() const       { return OnIterator(this->findFirstOn(),this); }
+    OnIterator endOn() const         { return OnIterator(SIZE,this); }
+    OffIterator beginOff() const     { return OffIterator(this->findFirstOff(),this); }
+    OffIterator endOff() const       { return OffIterator(SIZE,this); }
+    DenseIterator beginDense() const { return DenseIterator(0,this); }
+    DenseIterator endDense() const   { return DenseIterator(SIZE,this); }
+
+    bool operator == (const NodeMask &other) const { return mByte == other.mByte; }
+
+    bool operator != (const NodeMask &other) const {return mByte != other.mByte; }
+
+    // Bitwise logical operations
+
+    /// @brief Apply a functor to the words of this mask and the other mask.
+    ///
+    /// @details An example that implements the "operator&=" method:
+    /// @code
+    /// struct Op { inline void operator()(Word &w1, const Word& w2) const { w1 &= w2; } };
+    /// @endcode
+    template<typename WordOp>
+    const NodeMask& foreach(const NodeMask& other, const WordOp& op)
+    {
+        op(mByte, other.mByte);
+        return *this;
+    }
+    template<typename WordOp>
+    const NodeMask& foreach(const NodeMask& other1, const NodeMask& other2, const WordOp& op)
+    {
+        op(mByte, other1.mByte, other2.mByte);
+        return *this;
+    }
+    template<typename WordOp>
+    const NodeMask& foreach(const NodeMask& other1, const NodeMask& other2, const NodeMask& other3,
+                            const WordOp& op)
+    {
+        op(mByte, other1.mByte, other2.mByte, other3.mByte);
+        return *this;
+    }
+    /// @brief Bitwise intersection
+    const NodeMask& operator&=(const NodeMask& other)
+    {
+        mByte &= other.mByte;
+        return *this;
+    }
+    /// @brief Bitwise union
+    const NodeMask& operator|=(const NodeMask& other)
+    {
+        mByte |= other.mByte;
+        return *this;
+    }
+    /// @brief Bitwise difference (clears every bit that is on in @a other)
+    const NodeMask& operator-=(const NodeMask& other)
+    {
+        mByte &= static_cast<Byte>(~other.mByte);
+        return *this;
+    }
+    /// @brief Bitwise XOR
+    const NodeMask& operator^=(const NodeMask& other)
+    {
+        mByte ^= other.mByte;
+        return *this;
+    }
+    NodeMask operator!()                      const { NodeMask m(*this); m.toggle(); return m; }
+    NodeMask operator&(const NodeMask& other) const { NodeMask m(*this); m &= other; return m; }
+    NodeMask operator|(const NodeMask& other) const { NodeMask m(*this); m |= other; return m; }
+    NodeMask operator^(const NodeMask& other) const { NodeMask m(*this); m ^= other; return m; }
+    /// Return the byte size of this NodeMask
+    static Index32 memUsage() { return 1; }
+    /// Return the total number of on bits
+    Index32 countOn() const { return CountOn(mByte); }
+    /// Return the total number of off bits
+    Index32 countOff() const { return CountOff(mByte); }
+    /// Set the <i>n</i>th  bit on
+    void setOn(Index32 n) {
+        assert( n  < 8 );
+        mByte = mByte | static_cast<Byte>(0x01U << (n & 7));
+    }
+    /// Set the <i>n</i>th bit off
+    void setOff(Index32 n) {
+        assert( n  < 8 );
+        mByte = mByte & static_cast<Byte>(~(0x01U << (n & 7)));
+    }
+    /// Set the <i>n</i>th bit to the specified state
+    void set(Index32 n, bool On) { On ? this->setOn(n) : this->setOff(n); }
+    /// Set all bits to the specified state
+    void set(bool on) { mByte = on ? 0xFFU : 0x00U; }
+    /// Set all bits on
+    void setOn() { mByte = 0xFFU; }
+    /// Set all bits off
+    void setOff() { mByte = 0x00U; }
+    /// Toggle the state of the <i>n</i>th bit
+    void toggle(Index32 n) {
+        assert( n  < 8 );
+        mByte = mByte ^ static_cast<Byte>(0x01U << (n & 7));
+    }
+    /// Toggle the state of all bits in the mask
+    void toggle() { mByte = static_cast<Byte>(~mByte); }
+    /// Set the first bit on
+    void setFirstOn()  { this->setOn(0); }
+    /// Set the last bit on
+    void setLastOn()   { this->setOn(7); }
+    /// Set the first bit off
+    void setFirstOff() { this->setOff(0); }
+    /// Set the last bit off
+    void setLastOff()  { this->setOff(7); }
+    /// Return true if the <i>n</i>th bit is on
+    bool isOn(Index32 n) const
+    {
+        assert( n  < 8 );
+        return (mByte & (0x01U << (n & 7))) != 0;// explicit test, no implicit int-to-bool conversion
+    }
+    /// Return true if the <i>n</i>th bit is off
+    bool isOff(Index32 n) const {return !this->isOn(n); }
+    /// Return true if all the bits are on
+    bool isOn() const { return mByte == 0xFFU; }
+    /// Return true if all the bits are off
+    bool isOff() const { return mByte == 0; }
+    /// Return the index of the first on bit, or 8 (SIZE) if all bits are off
+    Index32 findFirstOn() const { return mByte ? FindLowestOn(mByte) : 8; }
+    /// Return the index of the first off bit, or 8 (SIZE) if all bits are on
+    Index32 findFirstOff() const
+    {
+        const Byte b = static_cast<Byte>(~mByte);
+        return b ? FindLowestOn(b) : 8;
+    }
+    /*
+    //@{
+    /// Return the <i>n</i>th word of the bit mask, for a word of arbitrary size.
+    /// @note This version assumes WordT=Byte and n=0!
+    template<typename WordT>
+    WordT getWord(Index n) const
+    {
+        BOOST_STATIC_ASSERT(sizeof(WordT) == sizeof(Byte));
+        assert(n == 0);
+        return reinterpret_cast<WordT>(mByte);
+    }
+    template<typename WordT>
+    WordT& getWord(Index n)
+    {
+        BOOST_STATIC_ASSERT(sizeof(WordT) == sizeof(Byte));
+        assert(n == 0);
+        return reinterpret_cast<WordT&>(mByte);
+    }
+    //@}
+    */
+    /// Write the single mask byte to the stream
+    void save(std::ostream& os) const
+    {
+        os.write(reinterpret_cast<const char*>(&mByte), 1);
+    }
+    /// Read the single mask byte from the stream
+    void load(std::istream& is) { is.read(reinterpret_cast<char*>(&mByte), 1); }
+    /// @brief simple print method for debugging
+    void printInfo(std::ostream& os=std::cout) const
+    {
+        os << "NodeMask: Dim=2, Log2Dim=1, Bit count=8, Word count=1"<<std::endl;
+    }
+    void printBits(std::ostream& os=std::cout) const
+    {
+        os << "||";
+        for (Index32 i=0; i < 8; ++i) os << this->isOn(i);
+        os << "||" << std::endl;
+    }
+    void printAll(std::ostream& os=std::cout) const
+    {
+        this->printInfo(os);
+        this->printBits(os);
+    }
+    /// Return the index of the first on bit at or after @a start, or 8 (SIZE) if there is none
+    Index32 findNextOn(Index32 start) const
+    {
+        if (start>=8) return 8;
+        const Byte b = static_cast<Byte>(mByte & (0xFFU << start));// drop the bits below start
+        return  b ? FindLowestOn(b) : 8;
+    }
+    /// Return the index of the first off bit at or after @a start, or 8 (SIZE) if there is none
+    Index32 findNextOff(Index32 start) const
+    {
+        if (start>=8) return 8;
+        const Byte b = static_cast<Byte>(~mByte & (0xFFU << start));// inverted mask, bits below start dropped
+        return  b ? FindLowestOn(b) : 8;
+    }
+
+};// NodeMask<1>
+
+
+/// @brief Template specialization of NodeMask for Log2Dim=2, i.e. 4^3 nodes
+template<>
+class NodeMask<2>
+{
+public:
+    // Compile-time layout: all SIZE = 64 bits are stored in a single 64-bit word.
+    static const Index32 LOG2DIM    =  2;
+    static const Index32 DIM        =  4;
+    static const Index32 SIZE       = 64;
+    static const Index32 WORD_COUNT = 1;
+    typedef Index64 Word;
+
+private:
+
+    Word mWord;//only member data!
+
+public:
+    /// Default constructor sets all bits off
+    NodeMask() : mWord(UINT64_C(0x00)) {}
+    /// All bits are set to the specified state
+    NodeMask(bool on) : mWord(on ? UINT64_C(0xFFFFFFFFFFFFFFFF) : UINT64_C(0x00)) {}
+    /// Copy constructor
+    NodeMask(const NodeMask &other) : mWord(other.mWord) {}
+    /// Destructor
+    ~NodeMask() {}
+    /// Assignment operator (returns void, so assignments cannot be chained)
+    void operator = (const NodeMask &other) { mWord = other.mWord; }
+
+    typedef OnMaskIterator<NodeMask>    OnIterator;
+    typedef OffMaskIterator<NodeMask>   OffIterator;
+    typedef DenseMaskIterator<NodeMask> DenseIterator;
+
+    OnIterator beginOn() const       { return OnIterator(this->findFirstOn(),this); }
+    OnIterator endOn() const         { return OnIterator(SIZE,this); }
+    OffIterator beginOff() const     { return OffIterator(this->findFirstOff(),this); }
+    OffIterator endOff() const       { return OffIterator(SIZE,this); }
+    DenseIterator beginDense() const { return DenseIterator(0,this); }
+    DenseIterator endDense() const   { return DenseIterator(SIZE,this); }
+
+    bool operator == (const NodeMask &other) const { return mWord == other.mWord; }
+
+    bool operator != (const NodeMask &other) const {return mWord != other.mWord; }
+
+    //
+    // Bitwise logical operations
+    //
+
+    /// @brief Apply a functor to the words of the this and the other mask.
+    ///
+    /// @details An example that implements the "operator&=" method:
+    /// @code
+    /// struct Op { inline void operator()(Word &w1, const Word& w2) const { w1 &= w2; } };
+    /// @endcode
+    template<typename WordOp>
+    const NodeMask& foreach(const NodeMask& other, const WordOp& op)
+    {
+        op(mWord, other.mWord);
+        return *this;
+    }
+    template<typename WordOp>
+    const NodeMask& foreach(const NodeMask& other1, const NodeMask& other2, const WordOp& op)
+    {
+        op(mWord, other1.mWord, other2.mWord);
+        return *this;
+    }
+    template<typename WordOp>
+    const NodeMask& foreach(const NodeMask& other1, const NodeMask& other2, const NodeMask& other3,
+                            const WordOp& op)
+    {
+        op(mWord, other1.mWord, other2.mWord, other3.mWord);
+        return *this;
+    }
+    /// @brief Bitwise intersection
+    const NodeMask& operator&=(const NodeMask& other)
+    {
+        mWord &= other.mWord;
+        return *this;
+    }
+    /// @brief Bitwise union
+    const NodeMask& operator|=(const NodeMask& other)
+    {
+        mWord |= other.mWord;
+        return *this;
+    }
+    /// @brief Bitwise difference (clears every bit that is on in @a other)
+    const NodeMask& operator-=(const NodeMask& other)
+    {
+        mWord &= ~other.mWord;
+        return *this;
+    }
+    /// @brief Bitwise XOR
+    const NodeMask& operator^=(const NodeMask& other)
+    {
+        mWord ^= other.mWord;
+        return *this;
+    }
+    NodeMask operator!()                      const { NodeMask m(*this); m.toggle(); return m; }
+    NodeMask operator&(const NodeMask& other) const { NodeMask m(*this); m &= other; return m; }
+    NodeMask operator|(const NodeMask& other) const { NodeMask m(*this); m |= other; return m; }
+    NodeMask operator^(const NodeMask& other) const { NodeMask m(*this); m ^= other; return m; }
+    /// Return the byte size of this NodeMask
+    static Index32 memUsage() { return 8; }
+    /// Return the total number of on bits
+    Index32 countOn() const { return CountOn(mWord); }
+    /// Return the total number of off bits
+    Index32 countOff() const { return CountOff(mWord); }
+    /// Set the <i>n</i>th  bit on
+    void setOn(Index32 n) {
+        assert( n  < 64 );
+        mWord |= UINT64_C(0x01) << (n & 63);
+    }
+    /// Set the <i>n</i>th bit off
+    void setOff(Index32 n) {
+        assert( n  < 64 );
+        mWord &= ~(UINT64_C(0x01) << (n & 63));
+    }
+    /// Set the <i>n</i>th bit to the specified state
+    void set(Index32 n, bool On) { On ? this->setOn(n) : this->setOff(n); }
+    /// Set all bits to the specified state
+    void set(bool on) { mWord = on ? UINT64_C(0xFFFFFFFFFFFFFFFF) : UINT64_C(0x00); }
+    /// Set all bits on
+    void setOn() { mWord = UINT64_C(0xFFFFFFFFFFFFFFFF); }
+    /// Set all bits off
+    void setOff() { mWord = UINT64_C(0x00); }
+    /// Toggle the state of the <i>n</i>th bit
+    void toggle(Index32 n) {
+        assert( n  < 64 );
+        mWord ^= UINT64_C(0x01) << (n & 63);
+    }
+    /// Toggle the state of all bits in the mask
+    void toggle() { mWord = ~mWord; }
+    /// Set the first bit on
+    void setFirstOn()  { this->setOn(0); }
+    /// Set the last bit on
+    void setLastOn()   { this->setOn(63); }
+    /// Set the first bit off
+    void setFirstOff() { this->setOff(0); }
+    /// Set the last bit off
+    void setLastOff()  { this->setOff(63); }
+    /// Return true if the <i>n</i>th bit is on
+    bool isOn(Index32 n) const
+    {
+        assert( n  < 64 );
+        return 0 != (mWord & (UINT64_C(0x01) << (n & 63)));
+    }
+    /// Return true if the <i>n</i>th bit is off
+    bool isOff(Index32 n) const {return !this->isOn(n); }
+    /// Return true if all the bits are on
+    bool isOn() const { return mWord == UINT64_C(0xFFFFFFFFFFFFFFFF); }
+    /// Return true if all the bits are off
+    bool isOff() const { return mWord == 0; }
+    Index32 findFirstOn() const { return mWord ? FindLowestOn(mWord) : 64; }
+    Index32 findFirstOff() const
+    {
+        const Word w = ~mWord;
+        return w ? FindLowestOn(w) : 64;
+    }
+    //@{
+    /// Return the <i>n</i>th word of the bit mask, for a word of arbitrary size.
+    template<typename WordT>
+    WordT getWord(Index n) const
+    {
+        assert(n*8*sizeof(WordT) < SIZE);
+        return reinterpret_cast<const WordT*>(&mWord)[n];
+    }
+    template<typename WordT>
+    WordT& getWord(Index n)
+    {
+        assert(n*8*sizeof(WordT) < SIZE);
+        return reinterpret_cast<WordT*>(&mWord)[n];// view the word's storage, not its value
+    }
+    //@}
+    void save(std::ostream& os) const
+    {
+        os.write(reinterpret_cast<const char*>(&mWord), 8);
+    }
+    void load(std::istream& is) { is.read(reinterpret_cast<char*>(&mWord), 8); }
+    /// @brief simple print method for debugging
+    void printInfo(std::ostream& os=std::cout) const
+    {
+        os << "NodeMask: Dim=4, Log2Dim=2, Bit count=64, Word count=1"<<std::endl;
+    }
+    void printBits(std::ostream& os=std::cout) const
+    {
+        os << "|";
+        for (Index32 i=0; i < 64; ++i) {
+            if ( !(i%8) ) os << "|";
+            os << this->isOn(i);
+        }
+        os << "||" << std::endl;
+    }
+    void printAll(std::ostream& os=std::cout) const
+    {
+        this->printInfo(os);
+        this->printBits(os);
+    }
+    /// Return what FindLowestOn reports for the on bits at or above @a start, or 64 if none
+    Index32 findNextOn(Index32 start) const
+    {
+        if (start>=64) return 64;
+        const Word w = mWord & (UINT64_C(0xFFFFFFFFFFFFFFFF) << start);
+        return  w ? FindLowestOn(w) : 64;
+    }
+    /// Return what FindLowestOn reports for the off bits at or above @a start, or 64 if none
+    Index32 findNextOff(Index32 start) const
+    {
+        if (start>=64) return 64;
+        const Word w = ~mWord & (UINT64_C(0xFFFFFFFFFFFFFFFF) << start);
+        return  w ? FindLowestOn(w) : 64;
+    }
+
+};// NodeMask<2>
+
+
+// Unlike NodeMask above this RootNodeMask has a run-time defined size.
+// It is only included for backward compatibility and will likely be
+// deprecated in the future!
+// This class is 32-bit specific, hence the use of Index32 vs Index!
+class RootNodeMask
+{
+protected:
+    Index32   mBitSize, mIntSize;// number of bits and number of 32-bit words backing them
+    Index32  *mBits;// owned word array (NULL until a size is given)
+
+public:
+    RootNodeMask(): mBitSize(0), mIntSize(0), mBits(NULL) {}
+    RootNodeMask(Index32 bit_size):
+        mBitSize(bit_size), mIntSize(((bit_size-1)>>5)+1), mBits(new Index32[mIntSize])
+    {
+        for (Index32 i=0; i<mIntSize; ++i) mBits[i]=0x00000000;
+    }
+    RootNodeMask(const RootNodeMask& B):
+        mBitSize(B.mBitSize), mIntSize(B.mIntSize), mBits(new Index32[mIntSize])
+    {
+        for (Index32 i=0; i<mIntSize; ++i) mBits[i]=B.mBits[i];
+    }
+    ~RootNodeMask() {delete [] mBits;}
+    /// Resize the mask to @a bit_size bits and turn every bit off
+    void init(Index32 bit_size) {
+        const Index32 int_size = ((bit_size-1)>>5)+1;
+        Index32* bits = new Index32[int_size];// allocate first so a throwing new leaves *this intact
+        delete [] mBits;
+        mBits = bits; mBitSize = bit_size; mIntSize = int_size;
+        for (Index32 i=0; i<mIntSize; ++i) mBits[i]=0x00000000;
+    }
+
+    Index getBitSize() const {return mBitSize;}
+
+    Index getIntSize() const {return mIntSize;}
+    /// Copy the words of @a B, reallocating only when the bit sizes differ
+    RootNodeMask& operator=(const RootNodeMask& B) {
+        if (mBitSize!=B.mBitSize) {
+            Index32* bits = new Index32[B.mIntSize];// allocate first so a throwing new leaves *this intact
+            delete [] mBits;
+            mBits = bits;
+            mBitSize=B.mBitSize; mIntSize=B.mIntSize;
+        }
+        for (Index32 i=0; i<mIntSize; ++i) mBits[i]=B.mBits[i];
+        return *this;
+    }
+    /// State shared by the iterators below: a bit position plus a read-only parent mask
+    class BaseIterator
+    {
+    protected:
+        Index32             mPos;//bit position
+        Index32             mBitSize;
+        const RootNodeMask* mParent;//this iterator can't change the parent_mask!
+    public:
+        BaseIterator() : mPos(0), mBitSize(0), mParent(NULL) {}
+        BaseIterator(Index32 pos,const RootNodeMask *parent)
+            : mPos(pos), mBitSize(parent->getBitSize()), mParent(parent) {
+            assert( pos<=mBitSize );
+        }
+        bool operator==(const BaseIterator &iter) const {return mPos == iter.mPos;}
+        bool operator!=(const BaseIterator &iter) const {return mPos != iter.mPos;}
+        bool operator< (const BaseIterator &iter) const {return mPos <  iter.mPos;}
+        BaseIterator& operator=(const BaseIterator& iter) {
+            mPos      = iter.mPos;
+            mBitSize  = iter.mBitSize;
+            mParent   = iter.mParent;
+            return *this;
+        }
+
+        Index32 offset() const {return mPos;}
+
+        Index32 pos() const {return mPos;}
+        /// Return true while the position has not reached the end (mBitSize)
+        bool test() const {
+            assert(mPos  <= mBitSize);
+            return (mPos != mBitSize);
+        }
+
+        operator bool() const {return this->test();}
+    }; // class BaseIterator
+
+    /// @note This happens to be a const-iterator!
+    class OnIterator: public BaseIterator
+    {
+    protected:
+        using BaseIterator::mPos;//bit position;
+        using BaseIterator::mBitSize;//bit size;
+        using BaseIterator::mParent;//this iterator can't change the parent_mask!
+    public:
+        OnIterator() : BaseIterator() {}
+        OnIterator(Index32 pos,const RootNodeMask *parent) : BaseIterator(pos,parent) {}
+        void increment() {
+            assert(mParent!=NULL);
+            mPos=mParent->findNextOn(mPos+1);
+            assert(mPos <= mBitSize);
+        }
+        void increment(Index n) {
+            for (Index i=0; i<n && this->next(); ++i) {}
+        }
+        bool next() {
+            this->increment();
+            return this->test();
+        }
+        bool operator*() const {return true;}
+        OnIterator& operator++() {
+            this->increment();
+            return *this;
+        }
+    }; // class OnIterator
+    /// Iterator that advances with findNextOff(), i.e. from one off bit to the next
+    class OffIterator: public BaseIterator
+    {
+    protected:
+        using BaseIterator::mPos;//bit position;
+        using BaseIterator::mBitSize;//bit size;
+        using BaseIterator::mParent;//this iterator can't change the parent_mask!
+    public:
+        OffIterator() : BaseIterator()  {}
+        OffIterator(Index32 pos,const RootNodeMask *parent) : BaseIterator(pos,parent) {}
+        void increment() {
+            assert(mParent!=NULL);
+            mPos=mParent->findNextOff(mPos+1);
+            assert(mPos <= mBitSize);
+        }
+        void increment(Index n) {
+            for (Index i=0; i<n && this->next(); ++i) {}
+        }
+        bool next() {
+            this->increment();
+            return this->test();
+        }
+        bool operator*() const {return true;}
+        OffIterator& operator++() {
+            this->increment();
+            return *this;
+        }
+    }; // class OffIterator
+    /// Iterator that steps one bit at a time; dereferencing yields the state of that bit
+    class DenseIterator: public BaseIterator
+    {
+    protected:
+        using BaseIterator::mPos;//bit position;
+        using BaseIterator::mBitSize;//bit size;
+        using BaseIterator::mParent;//this iterator can't change the parent_mask!
+    public:
+        DenseIterator() : BaseIterator() {}
+        DenseIterator(Index32 pos,const RootNodeMask *parent) : BaseIterator(pos,parent) {}
+        void increment() {
+            assert(mParent!=NULL);
+            mPos += 1;//careful - the increment might go beyond the end
+            assert(mPos<= mBitSize);
+        }
+        void increment(Index n) {
+            for (Index i=0; i<n && this->next(); ++i) {}
+        }
+        bool next() {
+            this->increment();
+            return this->test();
+        }
+        bool operator*() const {return mParent->isOn(mPos);}
+        DenseIterator& operator++() {
+            this->increment();
+            return *this;
+        }
+    }; // class DenseIterator
+
+    OnIterator beginOn() const       { return OnIterator(this->findFirstOn(),this); }
+    OnIterator endOn() const         { return OnIterator(mBitSize,this); }
+    OffIterator beginOff() const     { return OffIterator(this->findFirstOff(),this); }
+    OffIterator endOff() const       { return OffIterator(mBitSize,this); }
+    DenseIterator beginDense() const { return DenseIterator(0,this); }
+    DenseIterator endDense() const   { return DenseIterator(mBitSize,this); }
+
+    bool operator == (const RootNodeMask &B) const {
+        if (mBitSize != B.mBitSize) return false;
+        for (Index32 i=0; i<mIntSize; ++i) if (mBits[i] !=  B.mBits[i]) return false;
+        return true;
+    }
+
+    bool operator != (const RootNodeMask &B) const {
+        if (mBitSize != B.mBitSize) return true;
+        for (Index32 i=0; i<mIntSize; ++i) if (mBits[i] !=  B.mBits[i]) return true;
+        return false;
+    }
+
+    //
+    // Bitwise logical operations
+    //
+    RootNodeMask operator!() const { RootNodeMask m = *this; m.toggle(); return m; }
+    const RootNodeMask& operator&=(const RootNodeMask& other) {
+        assert(mIntSize == other.mIntSize);
+        for (Index32 i = 0, N = std::min(mIntSize, other.mIntSize); i < N; ++i) {
+            mBits[i] &= other.mBits[i];
+        }
+        for (Index32 i = other.mIntSize; i < mIntSize; ++i) mBits[i] = 0x00000000;
+        return *this;
+    }
+    const RootNodeMask& operator|=(const RootNodeMask& other) {
+        assert(mIntSize == other.mIntSize);
+        for (Index32 i = 0, N = std::min(mIntSize, other.mIntSize); i < N; ++i) {
+            mBits[i] |= other.mBits[i];
+        }
+        return *this;
+    }
+    const RootNodeMask& operator^=(const RootNodeMask& other) {
+        assert(mIntSize == other.mIntSize);
+        for (Index32 i = 0, N = std::min(mIntSize, other.mIntSize); i < N; ++i) {
+            mBits[i] ^= other.mBits[i];
+        }
+        return *this;
+    }
+    RootNodeMask operator&(const RootNodeMask& other) const {
+        RootNodeMask m(*this); m &= other; return m;
+    }
+    RootNodeMask operator|(const RootNodeMask& other) const {
+        RootNodeMask m(*this); m |= other; return m;
+    }
+    RootNodeMask operator^(const RootNodeMask& other) const {
+        RootNodeMask m(*this); m ^= other; return m;
+    }
+
+    /// Return the byte size of the word array plus sizeof(*this)
+    Index32 getMemUsage() const {
+        return static_cast<Index32>(mIntSize*sizeof(Index32) + sizeof(*this));
+    }
+    /// Return the sum of CountOn() over all words of the mask
+    Index32 countOn() const {
+        assert(mBits);
+        Index32 n=0;
+        for (Index32 i=0; i< mIntSize; ++i) n += CountOn(mBits[i]);
+        return n;
+    }
+    /// Return mBitSize minus countOn()
+    Index32 countOff() const { return mBitSize-this->countOn(); }
+    /// Set bit @a i on (word i>>5, bit i&31 within that word)
+    void setOn(Index32 i) {
+        assert(mBits);
+        assert( (i>>5) < mIntSize);
+        mBits[i>>5] |=  1u<<(i&31);// unsigned literal: no signed shift into the sign bit at bit 31
+    }
+    /// Set bit @a i off
+    void setOff(Index32 i) {
+        assert(mBits);
+        assert( (i>>5) < mIntSize);
+        mBits[i>>5] &=  ~(1u<<(i&31));
+    }
+    /// Set bit @a i to the specified state
+    void set(Index32 i, bool On) { On ? this->setOn(i) : this->setOff(i); }
+    /// Set every word to all ones (this includes any bits past mBitSize in the last word)
+    void setOn() {
+        assert(mBits);
+        for (Index32 i=0; i<mIntSize; ++i) mBits[i]=0xFFFFFFFF;
+    }
+    void setOff() {
+        assert(mBits);
+        for (Index32 i=0; i<mIntSize; ++i) mBits[i]=0x00000000;
+    }
+    void toggle(Index32 i) {
+        assert(mBits);
+        assert( (i>>5) < mIntSize);
+        mBits[i>>5] ^= 1u<<(i&31);
+    }
+    void toggle() {
+        assert(mBits);
+        for (Index32 i=0; i<mIntSize; ++i) mBits[i]=~mBits[i];
+    }
+    void setFirstOn()  { this->setOn(0); }
+    void setLastOn()   { this->setOn(mBitSize-1); }
+    void setFirstOff() { this->setOff(0); }
+    void setLastOff()  { this->setOff(mBitSize-1); }
+    bool isOn(Index32 i) const {
+        assert(mBits);
+        assert( (i>>5) < mIntSize);
+        return ( mBits[i >> 5] & (1u<<(i&31)) );
+    }
+    bool isOff(Index32 i) const {
+        assert(mBits);
+        assert( (i>>5) < mIntSize);
+        return ( ~mBits[i >> 5] & (1u<<(i&31)) );
+    }
+    /// Return true if every word is all ones
+    bool isOn() const {
+        if (!mBits) return false;//undefined is off
+        for (Index32 i=0; i<mIntSize; ++i) if (mBits[i] != 0xFFFFFFFF) return false;
+        return true;
+    }
+    /// Return true if every word is zero
+    bool isOff() const {
+        if (!mBits) return true;//undefined is off
+        for (Index32 i=0; i<mIntSize; ++i) if (mBits[i] != 0) return false;
+        return true;
+    }
+    /// Scan for the first nonzero word; return mBitSize if every word is zero
+    Index32 findFirstOn() const {
+        assert(mBits);
+        Index32 i=0;
+        while(!mBits[i]) if (++i == mIntSize) return mBitSize;//reached end
+        return 32*i + FindLowestOn(mBits[i]);
+    }
+    /// Scan for the first word that is not all ones; return mBitSize if there is none
+    Index32 findFirstOff() const {
+        assert(mBits);
+        Index32 i=0;
+        while(!(~mBits[i])) if (++i == mIntSize) return mBitSize;//reached end
+        return 32*i + FindLowestOn(~mBits[i]);
+    }
+    /// Write the raw words to @a os; the sizes themselves are not written
+    void save(std::ostream& os) const {
+        assert(mBits);
+        os.write((const char *)mBits,mIntSize*sizeof(Index32));
+    }
+    void load(std::istream& is) {
+        assert(mBits);
+        is.read((char *)mBits,mIntSize*sizeof(Index32));
+    }
+    /// @brief simple print method for debugging
+    void printInfo(std::ostream& os=std::cout) const {
+        os << "RootNodeMask: Bit-size="<<mBitSize<<" Int-size="<<mIntSize<<std::endl;
+    }
+    /// @brief Print at most @a max_out bits, with "||" every 32 bits and "|" every 8 bits
+    void printBits(std::ostream& os=std::cout, Index32 max_out=80u) const {
+        const Index32 n=(mBitSize>max_out?max_out:mBitSize);
+        for (Index32 i=0; i < n; ++i) {
+            if ( !(i&31) )
+                os << "||";
+            else if ( !(i%8) )
+                os << "|";
+            os << this->isOn(i);
+        }
+        os << "|" << std::endl;
+    }
+
+    void printAll(std::ostream& os=std::cout, Index32 max_out=80u) const {
+        this->printInfo(os);
+        this->printBits(os,max_out);
+    }
+    /// Return @a start if that bit is on, else the next on bit after it, else mBitSize
+    Index32 findNextOn(Index32 start) const {
+        assert(mBits);
+        Index32 n = start >> 5, m = start & 31;//initiate
+        if (n>=mIntSize) return mBitSize; // check for out of bounds
+        Index32 b = mBits[n];
+        if (b & (1u<<m)) return start;//simple case
+        b &= 0xFFFFFFFF << m;// mask lower bits
+        while(!b && ++n<mIntSize) b = mBits[n];// find next nonzero int
+        return (!b ? mBitSize : 32*n + FindLowestOn(b));//catch last-int=0
+    }
+    /// Return @a start if that bit is off, else the next off bit after it, else mBitSize
+    Index32 findNextOff(Index32 start) const {
+        assert(mBits);
+        Index32 n = start >> 5, m = start & 31;//initiate
+        if (n>=mIntSize) return mBitSize; // check for out of bounds
+        Index32 b = ~mBits[n];
+        if (b & (1u<<m)) return start;//simple case
+        b &= 0xFFFFFFFF<<m;// mask lower bits
+        while(!b && ++n<mIntSize) b = ~mBits[n];// find next nonzero int
+        return (!b ? mBitSize : 32*n + FindLowestOn(b));//catch last-int=0
+    }
+
+    Index32 memUsage() const {
+        assert(mBits);
+        return static_cast<Index32>(sizeof(Index32*)+(2+mIntSize)*sizeof(Index32));//in bytes
+    }
+}; // class RootNodeMask
+
+} // namespace util
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_UTIL_NODEMASKS_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/util/NullInterrupter.h b/nuparu/include/openvdb_new/util/NullInterrupter.h
new file mode 100644
index 00000000..8022a723
--- /dev/null
+++ b/nuparu/include/openvdb_new/util/NullInterrupter.h
@@ -0,0 +1,90 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file NullInterrupter.h
+
+#ifndef OPENVDB_UTIL_NULL_INTERRUPTER_HAS_BEEN_INCLUDED
+#define OPENVDB_UTIL_NULL_INTERRUPTER_HAS_BEEN_INCLUDED
+
+#include <openvdb/version.h>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace util {
+
+/// @brief Dummy NOOP interrupter class defining interface
+///
+/// This documents the interface required of the @c InterrupterType template argument
+/// used by several threaded applications (e.g. tools/PointAdvect.h). The host
+/// application calls start() when an interruptible operation begins, end() when it
+/// finishes, and wasInterrupted() periodically while the operation is running.
+/// If any call to wasInterrupted() returns @c true, the operation will be aborted.
+/// @note This dummy interrupter will NEVER interrupt since wasInterrupted() always
+/// returns false!
+struct NullInterrupter
+{
+    /// Default constructor (there is no state to initialize)
+    NullInterrupter() {}
+    /// Signal the start of an interruptible operation (a no-op here).
+    /// @param name  an optional descriptive name for the operation; ignored
+    void start(const char* name = NULL) { static_cast<void>(name); }
+    /// Signal the end of an interruptible operation (a no-op here).
+    void end() {}
+    /// Check if an interruptible operation should be aborted.
+    /// @param percent  an optional (when >= 0) percentage indicating
+    ///     the fraction of the operation that has been completed; ignored
+    /// @return always @c false
+    /// @note this method is assumed to be thread-safe. Being a NOOP, it
+    /// should compile out during optimization!
+    bool wasInterrupted(int percent = -1) { static_cast<void>(percent); return false; }
+};
+
+/// Pointer-based interrupt query: a null pointer never interrupts, which lets
+/// NullInterrupter::wasInterrupted compile out when client code only has a pointer.
+/// @note Free-standing since C++ has no partial specialization of function templates.
+template<typename InterrupterT>
+inline bool wasInterrupted(InterrupterT* i, int percent = -1) {
+    return !(!i || !i->wasInterrupted(percent));
+}
+
+/// Specialization for NullInterrupter: the answer is always false, with no pointer test
+template<>
+inline bool wasInterrupted<NullInterrupter>(NullInterrupter*, int) { return false; }
+
+} // namespace util
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_UTIL_NULL_INTERRUPTER_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/util/PagedArray.h b/nuparu/include/openvdb_new/util/PagedArray.h
new file mode 100644
index 00000000..0fbef832
--- /dev/null
+++ b/nuparu/include/openvdb_new/util/PagedArray.h
@@ -0,0 +1,735 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+///
+/// @file   PagedArray.h
+///
+/// @author Ken Museth
+///
+/// @brief  Concurrent page-based linear data structure with O(1)
+///         random access and std-compliant iterators. It is
+///         primarily intended for applications that involve
+///         multi-threading of dynamically growing linear arrays with
+///         fast random access. 
+
+#ifndef OPENVDB_UTIL_PAGED_ARRAY_HAS_BEEN_INCLUDED
+#define OPENVDB_UTIL_PAGED_ARRAY_HAS_BEEN_INCLUDED
+
+
+#include <deque>
+#include <cassert>
+#include <iostream>
+#include <algorithm>// std::swap
+#include <tbb/atomic.h>
+#include <tbb/spin_mutex.h>
+#include <tbb/parallel_for.h>
+#include <tbb/parallel_sort.h>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace util {
+
+////////////////////////////////////////
+
+
+/// @brief   Concurrent page-based linear data structure with O(1)
+///          random access and std-compliant iterators. It is
+///          primarily intended for applications that involve
+///          multi-threading of dynamically growing linear arrays with
+///          fast random access. 
+///
+/// @note    Multiple threads can grow the page-table and push_back  
+///          new elements concurrently. A ValueBuffer provides accelerated
+///          and threadsafe push_back at the cost of potentially re-ordering
+///          elements (when multiple instances are used).
+///
+/// @details This data structure employs contiguous pages of elements
+///          (like a std::deque) which avoids moving data when the
+///          capacity is out-grown and new pages are allocated. The
+///          size of the pages can be controlled with the Log2PageSize
+///          template parameter (defaults to 1024 elements of type ValueT).
+///
+/// There are three fundamentally different ways to insert elements to
+/// this container - each with different advantages and disadvantages.
+///
+/// The simplest way to insert elements is to use PagedArray::push_back e.g.   
+/// @code
+///   PagedArray<int> array;
+///   for (int i=0; i<100000; ++i) array.push_back(i);
+/// @endcode
+/// or with tbb task-based multi-threading
+/// @code
+/// struct Functor1 {
+///   Functor1(int n, PagedArray<int>& _array) : array(&_array) {
+///     tbb::parallel_for(tbb::blocked_range<int>(0, n, PagedArray<int>::pageSize()), *this);
+///   }
+///   void operator()(const tbb::blocked_range<int>& r) const {
+///      for (int i=r.begin(), n=r.end(); i!=n; ++i) array->push_back(i);
+///   }
+///   PagedArray<int>* array;
+/// };    
+/// PagedArray<int> array;   
+/// Functor1 tmp(10000, array);  
+/// @endcode    
+/// PagedArray::push_back has the advantage that it's thread-safe and
+/// preserves the ordering of the inserted elements. In fact it returns
+/// the linear offset to the added element which can then be used for
+/// fast O(1) random access. The disadvantage is it's the slowest of
+/// the three different ways of inserting elements.
+///
+/// The fastest way (by far) to insert elements is to use one (or
+/// more) instances of a PagedArray::ValueBuffer, e.g.
+/// @code
+///   PagedArray<int> array;    
+///   PagedArray<int>::ValueBuffer buffer(array);
+///   for (int i=0; i<100000; ++i) buffer.push_back(i);    
+///   buffer.flush();    
+/// @endcode
+/// or    
+/// @code
+///   PagedArray<int> array;
+///   {//local scope of a single thread   
+///     PagedArray<int>::ValueBuffer buffer(array);
+///     for (int i=0; i<100000; ++i) buffer.push_back(i);    
+///   }    
+/// @endcode
+/// or with tbb task-based multi-threading
+/// @code
+/// struct Functor2 {
+///   Functor2(int n, PagedArray<int>& array) : buffer(array) {
+///     tbb::parallel_for(tbb::blocked_range<int>(0, n, PagedArray<int>::pageSize()), *this);
+///   }
+///   void operator()(const tbb::blocked_range<int>& r) const {
+///      for (int i=r.begin(), n=r.end(); i!=n; ++i) buffer.push_back(i);
+///   }
+///   mutable typename PagedArray<int>::ValueBuffer buffer;
+/// };
+/// PagedArray<int> array;    
+/// Functor2 tmp(10000, array);  
+/// @endcode
+/// or with tbb Thread Local Storage for even better performance (due
+/// to fewer concurrent instantiations of partially full ValueBuffers)
+/// @code
+/// struct Functor3 { 
+///   typedef tbb::enumerable_thread_specific<PagedArray<int>::ValueBuffer> PoolType;     
+///   Functor3(size_t n, PoolType& _pool) : pool(&_pool) {     
+///     tbb::parallel_for(tbb::blocked_range<int>(0, n, PagedArray<int>::pageSize()), *this);
+///   }
+///   void operator()(const tbb::blocked_range<int>& r) const {
+///      PagedArray<int>::ValueBuffer& buffer = pool->local();    
+///      for (int i=r.begin(), n=r.end(); i!=n; ++i) buffer.push_back(i);
+///   }
+///   PoolType* pool;
+/// };   
+/// PagedArray<int> array;   
+/// PagedArray<int>::ValueBuffer exemplar(array);//dummy used for initialization
+/// Functor3::PoolType pool(exemplar);//thread local storage pool of ValueBuffers
+/// Functor3 tmp(10000, pool);
+/// for (Functor3::PoolType::iterator i=pool.begin(); i!=pool.end(); ++i) i->flush();    
+/// @endcode
+/// This technique generally outperforms PagedArray::push_back, 
+/// std::vector::push_back, std::deque::push_back and even
+/// tbb::concurrent_vector::push_back. Additionally it
+/// is thread-safe as long as each thread has its own instance of a
+/// PagedArray::ValueBuffer. The only disadvantage is the ordering of
+/// the elements is undefined if multiple instances of a
+/// PagedArray::ValueBuffer are employed. This is typically the case
+/// in the context of multi-threading, where the
+/// ordering of inserts is undefined anyway. Note that a local scope
+/// can be used to guarantee that the ValueBuffer has inserted all its
+/// elements by the time the scope ends. Alternatively the ValueBuffer
+/// can be explicitly flushed by calling ValueBuffer::flush.
+///
+/// The third way to insert elements is to resize the container and use
+/// random access, e.g.
+/// @code
+///   PagedArray<int> array;
+///   array.resize(100000);
+///   for (int i=0; i<100000; ++i) array[i] = i;    
+/// @endcode
+/// or in terms of the random access iterator
+/// @code
+///   PagedArray<int> array;
+///   array.resize(100000);
+///   for (PagedArray<int>::Iterator i=array.begin(); i!=array.end(); ++i) *i = i.pos();    
+/// @endcode    
+/// While this approach is both fast and thread-safe it suffers from the
+/// major disadvantage that the problem size, i.e. number of elements, needs to
+/// be known in advance. If that's the case you might as well consider
+/// using std::vector or a raw c-style array! In other words the
+/// PagedArray is most useful in the context of applications that
+/// involve multi-threading of dynamically growing linear arrays that
+/// require fast random access. 
+template <typename ValueT, size_t Log2PageSize = 10UL>
+class PagedArray {
+
+  private:
+    class Page;
+    typedef std::deque<Page*> PageTableT;
+    
+  public:
+    typedef ValueT ValueType;
+
+    /// @brief Default constructor, creates an empty array without any pages
+    PagedArray() : mPageTable(), mSize(), mCapacity(0), mGrowthMutex() { mSize = 0; }
+
+    /// @brief Destructor removes all allocated pages (by calling clear)
+    ~PagedArray() { this->clear(); }
+    
+    /// @brief Caches values into a local memory Page to improve
+    ///        performance of push_back into a PagedArray.
+    ///
+    /// @note The ordering of inserted elements is undefined when
+    ///       multiple ValueBuffers are used!
+    ///
+    /// @warning By design this ValueBuffer is not threadsafe so
+    ///          make sure to create an instance per thread!
+    class ValueBuffer;
+    
+    /// Const std-compliant iterator
+    class ConstIterator;
+
+    /// Non-const std-compliant iterator
+    class Iterator;
+  
+    /// @brief  Thread safe insertion, adds a new element at
+    ///         the end and increases the container size by one.
+    ///
+    /// @return The linear offset of the inserted element.
+    ///
+    /// @note   Constant time complexity. May allocate a new page.
+    size_t push_back(const ValueType& value)
+    {
+        const size_t index = mSize.fetch_and_increment();//claim the next free offset
+        if (index >= mCapacity) this->grow(index);//adds pages (under the mutex) until index fits
+        (*mPageTable[index >> Log2PageSize])[index] = value;//Page::operator[] masks the offset
+        return index;
+    }
+
+    /// @brief Slightly faster than the thread-safe push_back above.
+    ///
+    /// @note For best performance consider using the ValueBuffer!
+    ///
+    /// @warning Not thread-safe!
+    size_t push_back_unsafe(const ValueType& value)
+    {
+        const size_t index = mSize.fetch_and_increment();
+        if (index >= mCapacity) {//allocate one more page without taking the growth mutex
+            mPageTable.push_back( new Page() );
+            mCapacity += Page::Size;
+        }
+        (*mPageTable[index >> Log2PageSize])[index] = value;
+        return index;
+    }
+
+    /// @brief Returns the last element, decrements the size by one.
+    ///
+    /// @details Consider subsequently calling shrink_to_fit to
+    /// reduce the page table to match the new size.
+    ///
+    /// @note Calling this method on an empty container is
+    /// undefined (as is also the case for std containers).
+    ///
+    /// @warning If values were added to the container by means of
+    /// multiple ValueBuffers the last value might not be what you
+    /// expect since the ordering is generally not preserved. Only
+    /// PagedArray::push_back preserves the ordering (or a single
+    /// instance of a ValueBuffer).
+    ValueType pop_back()
+    {
+        assert(mSize>0);
+        --mSize;
+        return (*mPageTable[mSize >> Log2PageSize])[mSize];
+    }
+
+    /// @brief Reduce the page table to fit the current size.
+    ///
+    /// @warning Not thread-safe!
+    void shrink_to_fit();
+    
+    /// @brief Return a reference to the value at the specified offset
+    ///
+    /// @note This random access has constant time complexity.
+    ///
+    /// @warning It is assumed that the i'th element is already allocated!
+    ValueType& operator[](size_t i)
+    {
+        assert(i<mCapacity);
+        return (*mPageTable[i>>Log2PageSize])[i];
+    }
+
+    /// @brief Return a const-reference to the value at the specified offset
+    ///
+    /// @note This random access has constant time complexity.
+    ///
+    /// @warning It is assumed that the i'th element is already allocated!
+    const ValueType& operator[](size_t i) const
+    {
+        assert(i<mCapacity);
+        return (*mPageTable[i>>Log2PageSize])[i];
+    }
+
+    /// @brief Set all elements (of every allocated page) to the specified value
+    void fill(const ValueType& v)
+    {
+        tbb::spin_mutex::scoped_lock lock(mGrowthMutex);
+        Fill tmp(this, v);//the constructor of Fill runs the parallel fill
+    }
+
+    /// @brief Resize this array to the specified size.
+    ///
+    /// @note This will grow or shrink the page table.
+    ///
+    /// @warning Not thread-safe!
+    void resize(size_t size)
+    {
+        mSize = size;
+        if (size > mCapacity) {
+            this->grow(size-1);//size-1 is the offset of the last element
+        } else {
+            this->shrink_to_fit();
+        }
+    }
+
+    /// @brief Resize this array to the specified size and
+    ///        set all elements to the specified value.
+    ///
+    /// @warning Not thread-safe!
+    void resize(size_t size, const ValueType& v)
+    {
+       this->resize(size);
+       this->fill(v);
+    }
+    
+    /// @brief Return the number of elements in this array.
+    size_t size() const { return mSize; }
+    
+    /// @brief Return the maximum number of elements that this array
+    /// can contain without allocating more memory pages.
+    size_t capacity() const { return mCapacity; }
+
+    /// @brief Return the number of additional elements that can be
+    /// added to this array without allocating more memory pages.
+    size_t freeCount() const { return mCapacity - mSize; }
+
+    /// @brief Return the number of allocated memory pages.
+    size_t pageCount() const { return mPageTable.size(); }
+
+    /// @brief Return the number of elements per memory page.
+    static size_t pageSize() { return Page::Size; }
+
+    /// @brief Return log2 of the number of elements per memory page.
+    static size_t log2PageSize() { return Log2PageSize; }
+
+    /// @brief Return the memory footprint of this array in bytes.
+    size_t memUsage() const
+    {
+        return sizeof(*this) + mPageTable.size() * Page::memUsage();
+    }
+
+    /// @brief Return true if the container contains no elements.
+    bool isEmpty() const { return mSize == 0; }
+    
+    /// @brief Return true if the page table is partially full, i.e. the 
+    ///        last non-empty page contains less than pageSize() elements.
+    ///
+    /// @details When the page table is partially full calling merge()
+    ///          or using a ValueBuffer will rearrange the ordering of
+    ///          existing elements. 
+    bool isPartiallyFull() const { return (mSize & Page::Mask) > 0; }
+
+    /// @brief  Removes all elements from the array and deletes all pages.
+    ///
+    /// @warning Not thread-safe!
+    void clear()
+    {
+        tbb::spin_mutex::scoped_lock lock(mGrowthMutex);
+        for (size_t i=0, n=mPageTable.size(); i<n; ++i) delete mPageTable[i];
+        PageTableT().swap(mPageTable);//swap with an empty table to drop the stale pointers
+        mSize     = 0;
+        mCapacity = 0;
+    }
+
+    /// @brief Return a non-const iterator pointing to the first element
+    Iterator begin() { return Iterator(*this, 0); }
+
+    /// @brief Return a non-const iterator pointing to the
+    /// past-the-last element.
+    ///
+    /// @warning Iterator does not point to a valid element and should not
+    /// be dereferenced! 
+    Iterator end() { return Iterator(*this, mSize); }
+
+    /// @brief Return a const iterator pointing to the first element
+    ConstIterator cbegin() const { return ConstIterator(*this, 0); }
+
+    /// @brief Return a const iterator pointing to the
+    /// past-the-last element.
+    ///
+    /// @warning Iterator does not point to a valid element and should not
+    /// be dereferenced! 
+    ConstIterator cend() const { return ConstIterator(*this, mSize); }
+
+    /// @brief Parallel sort of all the elements in ascending order.
+    void sort() { tbb::parallel_sort(this->begin(), this->end(), std::less<ValueT>() ); }
+
+    /// @brief Parallel sort of all the elements in descending order.
+    void invSort() { tbb::parallel_sort(this->begin(), this->end(), std::greater<ValueT>()); }
+
+    /// @brief Parallel sort of all the elements based on a custom
+    /// functor with the api:
+    /// @code bool operator()(const ValueT& a, const ValueT& b) @endcode
+    /// which returns true if a comes before b.
+    template <typename Functor>
+    void sort() { tbb::parallel_sort(this->begin(), this->end(), Functor() ); }
+
+    /// @brief Transfer all the elements (and pages) from the other array to this array.
+    ///
+    /// @note The other PagedArray is empty on return.
+    ///
+    /// @warning The ordering of elements is undefined if this page table is partially full!
+    void merge(PagedArray& other);
+
+    /// @brief Print information for debugging
+    void print(std::ostream& os = std::cout) const
+      {
+          os << "PagedArray:\n"
+             << "\tSize:       " << this->size() << " elements\n"
+             << "\tPage table: " << this->pageCount() << " pages\n"
+             << "\tPage size:  " << this->pageSize() << " elements\n"
+             << "\tCapacity:   " << this->capacity() << " elements\n"
+             << "\tFootrpint:  " << this->memUsage() << " bytes\n";
+      }
+
+private:
+    // Disallow copy construction and assignment
+    PagedArray(const PagedArray&);//not implemented
+    void operator=(const PagedArray&);//not implemented
+
+    friend class ValueBuffer;
+
+    // Private class for concurrent fill
+    struct Fill;
+
+    // Append pages, while holding the growth mutex, until index is below the capacity
+    void grow(size_t index)
+    {
+        tbb::spin_mutex::scoped_lock lock(mGrowthMutex);
+        while(index >= mCapacity) {
+            mPageTable.push_back( new Page() );
+            mCapacity += Page::Size;
+        }
+    }
+
+    void add_full(Page*& page, size_t size);
+    
+    void add_partially_full(Page*& page, size_t size);     
+    
+    void add(Page*& page, size_t size) {//dispatch on the fill level; an empty page is ignored
+        tbb::spin_mutex::scoped_lock lock(mGrowthMutex);
+        if (size == Page::Size) {//page is full
+            this->add_full(page, size);
+        } else if (size>0) {//page is only partially full
+            this->add_partially_full(page, size);
+        }
+    }
+    PageTableT mPageTable;//holds pointers to allocated pages
+    tbb::atomic<size_t> mSize;// current number of elements in array
+    size_t mCapacity;//capacity of array given the current page count
+    tbb::spin_mutex mGrowthMutex;//Mutex-lock required to grow pages
+}; // Public class PagedArray
+
+////////////////////////////////////////////////////////////////////////////////    
+    
+template <typename ValueT, size_t Log2PageSize>
+void PagedArray<ValueT, Log2PageSize>::shrink_to_fit()
+{
+    if (mPageTable.size() <= (mSize >> Log2PageSize) + 1) return;//unlocked pre-check
+    tbb::spin_mutex::scoped_lock lock(mGrowthMutex);
+    const size_t pageCount = (mSize >> Log2PageSize) + 1;//re-evaluated under the lock
+    // Release every surplus page rather than a single one, so that the page
+    // table and mCapacity really match the size after a large shrink.
+    while (mPageTable.size() > pageCount) {
+        delete mPageTable.back();
+        mPageTable.pop_back();
+        mCapacity -= Page::Size;
+    }
+}
+
+template <typename ValueT, size_t Log2PageSize>
+void PagedArray<ValueT, Log2PageSize>::merge(PagedArray& other)
+{
+    if (&other == this || other.isEmpty()) return;//a self-merge would corrupt the page table
+    tbb::spin_mutex::scoped_lock lock(mGrowthMutex);
+    // extract last partially full page if it exists
+    Page* page = NULL;
+    const size_t size = mSize & Page::Mask; //number of elements in the last page
+    if ( size > 0 ) {
+        page = mPageTable.back();
+        mPageTable.pop_back();
+        mSize -= size;
+    }
+    // transfer all pages from the other page table
+    mPageTable.insert(mPageTable.end(), other.mPageTable.begin(), other.mPageTable.end());
+    mSize          += other.mSize;
+    mCapacity       = Page::Size*mPageTable.size();
+    other.mSize     = 0;
+    other.mCapacity = 0;
+    PageTableT().swap(other.mPageTable);//the other array no longer owns any pages
+    // add back the last partially full page; add_partially_full adopts it (and resets
+    // page to NULL) only when it is appended whole, otherwise the values were copied
+    if (page) { this->add_partially_full(page, size); delete page; }
+}
+
+template <typename ValueT, size_t Log2PageSize>
+void PagedArray<ValueT, Log2PageSize>::add_full(Page*& page, size_t size)
+{
+    assert(size == Page::Size);//only completely filled pages are accepted
+    const bool partial = (mSize & Page::Mask) != 0;//is the last table page partially full?
+    // Trade the partially full last page for the incoming full one, so that the
+    // partially full page is the one appended below and therefore stays last.
+    if (partial) std::swap(mPageTable.back(), page);
+    mPageTable.push_back( page );
+    mSize     += size;
+    mCapacity += Page::Size;
+    page       = NULL;//the caller's pointer is cleared once the page is in the table
+}
+    
+template <typename ValueT, size_t Log2PageSize>
+void PagedArray<ValueT, Log2PageSize>::add_partially_full(Page*& page, size_t size)
+{
+    assert(size > 0 && size < Page::Size);//the incoming page must be partially full
+    if (const size_t used = mSize & Page::Mask) {//last table page is partially full too
+        const ValueT* src = page->data();
+        const size_t head = std::min(mSize+size, mCapacity) - mSize;//values that still fit
+        std::copy(src, src + head, mPageTable.back()->data() + used);
+        if (mSize+size > mCapacity) {//the remainder spills into a newly allocated page
+            mPageTable.push_back( new Page() );
+            std::copy(src + head, src + size, mPageTable.back()->data());
+            mCapacity += Page::Size;
+        }
+    } else {//last table page is full, so the incoming page is appended as is
+        mPageTable.push_back( page );
+        mCapacity += Page::Size;
+        page       = NULL;//the caller's pointer is cleared once the page is in the table
+    }
+    mSize += size;
+}
+    
+////////////////////////////////////////////////////////////////////////////////
+
+// Public member-class of PagedArray    
+template <typename ValueT, size_t Log2PageSize>
+class PagedArray<ValueT, Log2PageSize>::
+ValueBuffer
+{
+public:
+    typedef PagedArray<ValueT, Log2PageSize> PagedArrayType;
+    /// @brief Construct an empty buffer that feeds the given PagedArray.
+    ValueBuffer(PagedArray& parent) : mParent(&parent), mPage(new Page()), mSize(0) {}
+    /// @warning Copying is shallow: the copy shares the parent but starts
+    ///          out with its own empty page, i.e. size = 0.
+    ValueBuffer(const ValueBuffer& other) : mParent(other.mParent), mPage(new Page()), mSize(0) {}
+    /// @brief Destructor, hands any buffered values over to the parent PagedArray.
+    ~ValueBuffer() { this->flush(); delete mPage; }
+    /// @brief Append a value to the buffer and increment its size.
+    ///
+    /// @details Once the internal memory page is full it is
+    /// automatically flushed to the parent PagedArray.
+    void push_back(const ValueT& v) {
+        const size_t slot = mSize++;
+        (*mPage)[slot] = v;
+        if (mSize == Page::Size) this->flush();
+    }
+    /// @brief Transfer the buffered values to the parent PagedArray right away.
+    ///
+    /// @note The destructor and push_back already call this method, so
+    /// an explicit call is only needed to manually sync up the buffer
+    /// with the array, e.g. during debugging.
+    void flush() {
+        mParent->add(mPage, mSize);
+        if (!mPage) mPage = new Page();//the pointer was cleared by the parent: start a new page
+        mSize = 0;
+    }
+    /// @brief Return a reference to the parent PagedArray
+    PagedArrayType& parent() const { return *mParent; }
+    /// @brief Return the number of values currently held in this buffer.
+    size_t size() const { return mSize; }
+private:
+    ValueBuffer& operator=(const ValueBuffer& other);//not implemented
+    PagedArray* mParent;
+    Page*       mPage; 
+    size_t      mSize;
+};// Public class PagedArray::ValueBuffer
+  
+////////////////////////////////////////////////////////////////////////////////
+  
+// Const std-compliant iterator
+// Public member-class of PagedArray     
+template <typename ValueT, size_t Log2PageSize>
+class PagedArray<ValueT, Log2PageSize>::
+ConstIterator : public std::iterator<std::random_access_iterator_tag, ValueT>
+{
+public:
+    typedef std::iterator<std::random_access_iterator_tag, ValueT> BaseT;
+    typedef typename BaseT::difference_type difference_type;
+    // constructors and assignment (a default-constructed iterator has no parent)
+    ConstIterator() : mPos(0), mParent(NULL) {}
+    ConstIterator(const PagedArray& parent, size_t pos=0) : mPos(pos), mParent(&parent) {}
+    ConstIterator(const ConstIterator& other) : mPos(other.mPos), mParent(other.mParent) {}
+    ConstIterator& operator=(const ConstIterator& other) {
+        mPos=other.mPos;
+        mParent=other.mParent;
+        return *this;
+    }
+    // prefix
+    ConstIterator& operator++() { ++mPos; return *this; }
+    ConstIterator& operator--() { --mPos; return *this; }
+    // postfix (returns a copy taken before the position is changed)
+    ConstIterator  operator++(int) { ConstIterator tmp(*this); ++mPos; return tmp; }
+    ConstIterator  operator--(int) { ConstIterator tmp(*this); --mPos; return tmp; }
+    // value access (forwards to the const random access of the parent array)
+    const ValueT& operator*()  const { return (*mParent)[mPos]; }
+    const ValueT* operator->() const { return &(this->operator*()); }
+    const ValueT& operator[](const difference_type& pos) const { return (*mParent)[mPos+pos]; }
+    // offset; operator+/- must build a ConstIterator since Iterator needs a non-const parent
+    ConstIterator& operator+=(const difference_type& pos) { mPos += pos; return *this; }
+    ConstIterator& operator-=(const difference_type& pos) { mPos -= pos; return *this; }
+    ConstIterator operator+(const difference_type &pos) const { return ConstIterator(*mParent,mPos+pos); }
+    ConstIterator operator-(const difference_type &pos) const { return ConstIterator(*mParent,mPos-pos); }
+    difference_type operator-(const ConstIterator& other) const { return mPos - other.pos(); }
+    // comparisons (only the positions are compared, the parents are not)
+    bool operator==(const ConstIterator& other) const { return mPos == other.mPos; }
+    bool operator!=(const ConstIterator& other) const { return mPos != other.mPos; }
+    bool operator>=(const ConstIterator& other) const { return mPos >= other.mPos; }
+    bool operator<=(const ConstIterator& other) const { return mPos <= other.mPos; }
+    bool operator< (const ConstIterator& other) const { return mPos <  other.mPos; }
+    bool operator> (const ConstIterator& other) const { return mPos >  other.mPos; }
+    // non-std methods: isValid is true if a parent is set and the position is below its size
+    bool isValid() const { return mParent != NULL && mPos < mParent->size(); }
+    size_t pos()   const { return mPos; }
+private:
+    size_t            mPos;
+    const PagedArray* mParent;
+};// Public class PagedArray::ConstIterator
+  
+////////////////////////////////////////////////////////////////////////////////  
+
+// Public member-class of PagedArray     
+template <typename ValueT, size_t Log2PageSize>
+class PagedArray<ValueT, Log2PageSize>::
+Iterator : public std::iterator<std::random_access_iterator_tag, ValueT>
+{
+public:
+    typedef std::iterator<std::random_access_iterator_tag, ValueT> BaseT;
+    typedef typename BaseT::difference_type difference_type;
+    // construction and copying (a default-constructed iterator has no parent)
+    Iterator() : mPos(0), mParent(NULL) {}
+    Iterator(PagedArray& parent, size_t pos=0) : mPos(pos), mParent(&parent) {}
+    Iterator(const Iterator& other) : mPos(other.mPos), mParent(other.mParent) {}
+    Iterator& operator=(const Iterator& other) {
+        mParent = other.mParent;
+        mPos    = other.mPos;
+        return *this;
+    }
+    // pre-increment and pre-decrement
+    Iterator& operator++() { mPos += 1; return *this; }
+    Iterator& operator--() { mPos -= 1; return *this; }
+    // post-increment and post-decrement return the iterator as it was before the step
+    Iterator  operator++(int) { const Iterator old(*this); ++mPos; return old; }
+    Iterator  operator--(int) { const Iterator old(*this); --mPos; return old; }
+    // element access through the random access operator of the parent array
+    ValueT& operator*()  const { return mParent->operator[](mPos); }
+    ValueT* operator->() const { return &(**this); }
+    ValueT& operator[](const difference_type& pos) const { return mParent->operator[](mPos+pos); }
+    // arithmetic on the position
+    Iterator& operator+=(const difference_type& pos) { mPos = mPos + pos; return *this; }
+    Iterator& operator-=(const difference_type& pos) { mPos = mPos - pos; return *this; }
+    Iterator operator+(const difference_type &pos) const { return Iterator(*mParent, pos+mPos); }
+    Iterator operator-(const difference_type &pos) const { return Iterator(*mParent, mPos-pos); }
+    difference_type operator-(const Iterator& other) const { return mPos - other.mPos; }
+    // ordering and equality are decided by the positions alone
+    bool operator==(const Iterator& other) const { return !(mPos != other.mPos); }
+    bool operator!=(const Iterator& other) const { return !(mPos == other.mPos); }
+    bool operator>=(const Iterator& other) const { return !(mPos <  other.mPos); }
+    bool operator<=(const Iterator& other) const { return !(mPos >  other.mPos); }
+    bool operator< (const Iterator& other) const { return other.mPos >  mPos; }
+    bool operator> (const Iterator& other) const { return other.mPos <  mPos; }
+    // non-std helpers
+    bool isValid() const { return mParent && mPos < mParent->size(); }
+    size_t pos()   const { return mPos; }
+  private:
+    size_t      mPos;
+    PagedArray* mParent;
+};// Public class PagedArray::Iterator
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Private member-class of PagedArray implementing a memory page
+template <typename ValueT, size_t Log2PageSize>
+class PagedArray<ValueT, Log2PageSize>::
+Page
+{
+public:
+    static const size_t Size = 1UL << Log2PageSize;//number of values held by one page
+    static const size_t Mask = Size - 1UL;//extracts the in-page offset from a linear offset
+    static size_t memUsage() { return Size*sizeof(ValueT); }
+    Page() : mData(new ValueT[Size]) {}
+    ~Page() { delete [] mData; }
+    ValueT& operator[](const size_t i) { return *(mData + (i & Mask)); }
+    const ValueT& operator[](const size_t i) const { return *(mData + (i & Mask)); }
+    void fill(const ValueT& v) { std::fill(mData, mData + Size, v); }
+    ValueT* data() { return mData; }
+protected:
+    Page(const Page& other);//copy construction is not implemented
+    Page& operator=(const Page& rhs);//copy assignment is not implemented
+    ValueT* mData;//heap array of Size values, released by the destructor
+};// Private class PagedArray::Page
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Private member-class of PagedArray implementing concurrent fill of a Page
+template <typename ValueT, size_t Log2PageSize>
+struct PagedArray<ValueT, Log2PageSize>::
+Fill {
+    Fill(PagedArray* array, const ValueT& value) : mArray(array), mValue(value) {//runs the fill
+        tbb::parallel_for(tbb::blocked_range<size_t>(0, mArray->pageCount()), *this);
+    }
+    void operator()(const tbb::blocked_range<size_t>& r) const {//fills pages [begin, end)
+        for (size_t i=r.begin(), n=r.end(); i!=n; ++i) mArray->mPageTable[i]->fill(mValue);
+    }
+    PagedArray* mArray;//array whose pages are filled
+    const ValueT& mValue;//held by reference, so the value must outlive this functor
+};// Private class PagedArray::Fill
+
+} // namespace util
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_UTIL_PAGED_ARRAY_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/util/Util.h b/nuparu/include/openvdb_new/util/Util.h
new file mode 100644
index 00000000..2751fcbc
--- /dev/null
+++ b/nuparu/include/openvdb_new/util/Util.h
@@ -0,0 +1,166 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_UTIL_UTIL_HAS_BEEN_INCLUDED
+#define OPENVDB_UTIL_UTIL_HAS_BEEN_INCLUDED
+
+#include <openvdb/Types.h>
+#include <openvdb/tree/Tree.h>
+#include <openvdb/tools/ValueTransformer.h>
+#include <openvdb/tools/Prune.h>// for tree::pruneInactive
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace util {
+
+OPENVDB_API extern const Index32 INVALID_IDX;
+
+/// @brief coordinate offset table for neighboring voxels
+OPENVDB_API extern const Coord COORD_OFFSETS[26];
+
+
+////////////////////////////////////////
+
+
+/// @brief Return @a voxelCoord with each component rounded down to an integer.
+/// @note Uses std::floor(), so e.g. 1.7 maps to 1 and -0.5 to -1 (this is not round-to-nearest).
+inline Coord
+nearestCoord(const Vec3d& voxelCoord)
+{
+    Coord ijk;
+    // All three axes use the same floor-then-convert rule.
+    for (int axis = 0; axis < 3; ++axis) ijk[axis] = int(std::floor(voxelCoord[axis]));
+    return ijk;
+}
+
+
+////////////////////////////////////////
+
+
+/// @brief Functor for use with tools::foreach() that intersects the value mask of each
+/// visited leaf node with the value mask of the corresponding region of another tree
+template<class TreeType1, class TreeType2>
+class LeafTopologyIntOp
+{
+public:
+    LeafTopologyIntOp(const TreeType2& tree): mOtherTree(&tree) {}
+
+    inline void operator()(const typename TreeType1::LeafIter& lIter) const
+    {
+        typedef typename TreeType2::LeafNodeType OtherLeafT;
+        const Coord origin = lIter->origin();
+        if (const OtherLeafT* otherLeaf = mOtherTree->probeConstLeaf(origin)) {
+            lIter->topologyIntersection(*otherLeaf, zeroVal<typename TreeType1::ValueType>());
+        } else if (!mOtherTree->isValueOn(origin)) {
+            lIter->setValuesOff();// no leaf and an inactive value there: nothing overlaps
+        }
+    }
+
+private:
+    const TreeType2* mOtherTree;// address of the tree passed to the constructor
+};
+
+
+/// @brief Functor for use with tools::foreach() that subtracts, from the value mask of each
+/// visited leaf node, the value mask of the corresponding region of another tree
+template<class TreeType1, class TreeType2>
+class LeafTopologyDiffOp
+{
+public:
+    LeafTopologyDiffOp(const TreeType2& tree): mOtherTree(&tree) {}
+
+    inline void operator()(const typename TreeType1::LeafIter& lIter) const
+    {
+        typedef typename TreeType2::LeafNodeType OtherLeafT;
+        const Coord origin = lIter->origin();
+        if (const OtherLeafT* otherLeaf = mOtherTree->probeConstLeaf(origin)) {
+            lIter->topologyDifference(*otherLeaf, zeroVal<typename TreeType1::ValueType>());
+        } else if (mOtherTree->isValueOn(origin)) {
+            lIter->setValuesOff();// no leaf but an active value there: everything is removed
+        }
+    }
+
+private:
+    const TreeType2* mOtherTree;// address of the tree passed to the constructor
+};
+
+
+////////////////////////////////////////
+
+
+/// @brief Compute the boolean intersection of the leaf-level topology masks of two trees.
+/// @return a pointer to a new, boolean-valued tree containing the overlapping voxels.
+template<class TreeType1, class TreeType2>
+inline typename TreeType1::template ValueConverter<bool>::Type::Ptr
+leafTopologyIntersection(const TreeType1& lhs, const TreeType2& rhs, bool threaded = true)
+{
+    typedef typename TreeType1::template ValueConverter<bool>::Type BoolTreeType;
+    typedef LeafTopologyIntOp<BoolTreeType, TreeType2> IntersectOp;
+
+    // Start from a bool tree built as a topology copy of lhs (active = true, inactive = false).
+    typename BoolTreeType::Ptr result(
+        new BoolTreeType(lhs, /*inactiveValue=*/false, /*activeValue=*/true, TopologyCopy()));
+
+    tools::foreach(result->beginLeaf(), IntersectOp(rhs), threaded);// mask every leaf against rhs
+    tools::pruneInactive(*result, threaded);
+    return result;
+}
+
+
+/// @brief Compute the boolean difference of the leaf-level topology masks of two trees.
+/// @return a pointer to a new, boolean-valued tree containing the non-overlapping
+/// voxels from the lhs.
+template<class TreeType1, class TreeType2>
+inline typename TreeType1::template ValueConverter<bool>::Type::Ptr
+leafTopologyDifference(const TreeType1& lhs, const TreeType2& rhs, bool threaded = true)
+{
+    typedef typename TreeType1::template ValueConverter<bool>::Type BoolTreeType;
+    typedef LeafTopologyDiffOp<BoolTreeType, TreeType2> DifferenceOp;
+
+    // Start from a bool tree built as a topology copy of lhs (active = true, inactive = false).
+    typename BoolTreeType::Ptr result(
+        new BoolTreeType(lhs, /*inactiveValue=*/false, /*activeValue=*/true, TopologyCopy()));
+
+    tools::foreach(result->beginLeaf(), DifferenceOp(rhs), threaded);// remove rhs from every leaf
+    tools::pruneInactive(*result, threaded);
+    return result;
+}
+
+} // namespace util
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_UTIL_UTIL_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/util/logging.h b/nuparu/include/openvdb_new/util/logging.h
new file mode 100644
index 00000000..3d5cff79
--- /dev/null
+++ b/nuparu/include/openvdb_new/util/logging.h
@@ -0,0 +1,83 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_UTIL_LOGGING_HAS_BEEN_INCLUDED
+#define OPENVDB_UTIL_LOGGING_HAS_BEEN_INCLUDED
+
+#ifndef OPENVDB_USE_LOG4CPLUS
+#include <iostream> // std::cerr and std::endl, used by the fallback macros below
+/// Log an info message of the form '<TT>someVar << "some text" << ...</TT>'.
+#define OPENVDB_LOG_INFO(message)
+/// Log a warning message of the form '<TT>someVar << "some text" << ...</TT>'.
+#define OPENVDB_LOG_WARN(message)           do { std::cerr << message << std::endl; } while (0);
+/// Log an error message of the form '<TT>someVar << "some text" << ...</TT>'.
+#define OPENVDB_LOG_ERROR(message)          do { std::cerr << message << std::endl; } while (0);
+/// Log a fatal error message of the form '<TT>someVar << "some text" << ...</TT>'.
+#define OPENVDB_LOG_FATAL(message)          do { std::cerr << message << std::endl; } while (0);
+/// In debug builds only, log a debugging message of the form '<TT>someVar << "text" << ...</TT>'.
+#define OPENVDB_LOG_DEBUG(message)
+/// @brief Log a debugging message in both debug and optimized builds.
+/// @warning Don't use this in performance-critical code.
+#define OPENVDB_LOG_DEBUG_RUNTIME(message)
+// NOTE: in this fallback INFO/DEBUG expand to nothing; WARN/ERROR/FATAL write to std::cerr and already end in ';'.
+#else // ifdef OPENVDB_USE_LOG4CPLUS
+
+#include <log4cplus/logger.h>
+#include <log4cplus/loglevel.h>
+#include <sstream>
+/// Stream @a message into a string and log it at @a level through the log4cplus logger "main", if that level is enabled.
+#define OPENVDB_LOG(level, message) \
+    do { \
+        log4cplus::Logger _log = log4cplus::Logger::getInstance(LOG4CPLUS_TEXT("main")); \
+        if (_log.isEnabledFor(log4cplus::level##_LOG_LEVEL)) { \
+            std::ostringstream _buf; \
+            _buf << message; \
+            _log.forcedLog(log4cplus::level##_LOG_LEVEL, _buf.str(), __FILE__, __LINE__); \
+        } \
+    } while (0);
+// Per-level wrappers around OPENVDB_LOG; OPENVDB_LOG_DEBUG expands to nothing unless DEBUG is defined.
+#define OPENVDB_LOG_INFO(message)           OPENVDB_LOG(INFO, message)
+#define OPENVDB_LOG_WARN(message)           OPENVDB_LOG(WARN, message)
+#define OPENVDB_LOG_ERROR(message)          OPENVDB_LOG(ERROR, message)
+#define OPENVDB_LOG_FATAL(message)          OPENVDB_LOG(FATAL, message)
+#ifdef DEBUG
+#define OPENVDB_LOG_DEBUG(message)          OPENVDB_LOG(DEBUG, message)
+#else
+#define OPENVDB_LOG_DEBUG(message)
+#endif
+#define OPENVDB_LOG_DEBUG_RUNTIME(message)  OPENVDB_LOG(DEBUG, message)
+
+#endif // OPENVDB_USE_LOG4CPLUS
+
+#endif // OPENVDB_UTIL_LOGGING_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_new/version.h b/nuparu/include/openvdb_new/version.h
new file mode 100644
index 00000000..adc3d8ee
--- /dev/null
+++ b/nuparu/include/openvdb_new/version.h
@@ -0,0 +1,133 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of DreamWorks Animation nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_VERSION_HAS_BEEN_INCLUDED
+#define OPENVDB_VERSION_HAS_BEEN_INCLUDED
+
+#include "Platform.h"
+
+
+/// The version namespace name for this library version
+///
+/// Fully-namespace-qualified symbols are named as follows:
+/// openvdb::vX_Y_Z::Vec3i, openvdb::vX_Y_Z::io::File, openvdb::vX_Y_Z::tree::Tree, etc.,
+/// where X, Y and Z are OPENVDB_LIBRARY_MAJOR_VERSION, OPENVDB_LIBRARY_MINOR_VERSION
+/// and OPENVDB_LIBRARY_PATCH_VERSION, respectively (defined below).
+#define OPENVDB_VERSION_NAME v3_2_0
+
+// Library major, minor and patch version numbers (the same 3, 2, 0 that are spelled out
+// separately in OPENVDB_VERSION_NAME above and OPENVDB_LIBRARY_VERSION_STRING below)
+#define OPENVDB_LIBRARY_MAJOR_VERSION_NUMBER 3
+#define OPENVDB_LIBRARY_MINOR_VERSION_NUMBER 2
+#define OPENVDB_LIBRARY_PATCH_VERSION_NUMBER 0
+/// @brief Library version number string of the form "<major>.<minor>.<patch>"
+/// @details This is a macro rather than a static constant because we typically
+/// want the compile-time version number, not the runtime version number
+/// (although the two are usually the same).
+#define OPENVDB_LIBRARY_VERSION_STRING "3.2.0"
+
+/// Library version number as a packed integer ("%02x%02x%04x", major, minor, patch):
+/// major is shifted to bit 24, minor occupies bits 16-23, patch occupies bits 0-15.
+#define OPENVDB_LIBRARY_VERSION_NUMBER \
+    ((OPENVDB_LIBRARY_MAJOR_VERSION_NUMBER << 24) | \
+    ((OPENVDB_LIBRARY_MINOR_VERSION_NUMBER & 0xFF) << 16) | \
+    (OPENVDB_LIBRARY_PATCH_VERSION_NUMBER & 0xFFFF))
+
+/// If OPENVDB_REQUIRE_VERSION_NAME is undefined, symbols from the version
+/// namespace are promoted to the top-level namespace (e.g., openvdb::v3_2_0::io::File
+/// can be referred to simply as openvdb::io::File).  Otherwise, symbols must be fully
+/// namespace-qualified.
+#ifdef OPENVDB_REQUIRE_VERSION_NAME
+#define OPENVDB_USE_VERSION_NAMESPACE
+#else
+/// @note The empty namespace clause below ensures that
+/// OPENVDB_VERSION_NAME is recognized as a namespace name.
+#define OPENVDB_USE_VERSION_NAMESPACE \
+    namespace OPENVDB_VERSION_NAME {} \
+    using namespace OPENVDB_VERSION_NAME;
+#endif
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+
+/// @brief The magic number is stored in the first four bytes of every VDB file.
+/// @details This can be used to quickly test whether we have a valid file or not.
+const int32_t OPENVDB_MAGIC = 0x56444220; // hex digits 56 44 42 20 are the ASCII codes of "VDB "
+
+// Library major, minor and patch version numbers
+const uint32_t
+    OPENVDB_LIBRARY_MAJOR_VERSION = OPENVDB_LIBRARY_MAJOR_VERSION_NUMBER,
+    OPENVDB_LIBRARY_MINOR_VERSION = OPENVDB_LIBRARY_MINOR_VERSION_NUMBER,
+    OPENVDB_LIBRARY_PATCH_VERSION = OPENVDB_LIBRARY_PATCH_VERSION_NUMBER;
+/// Library version number as a packed integer ("%02x%02x%04x", major, minor, patch)
+const uint32_t OPENVDB_LIBRARY_VERSION = OPENVDB_LIBRARY_VERSION_NUMBER;
+
+/// @brief The current version number of the VDB file format
+/// @details  This can be used to enable various backwards compatibility switches
+/// or to reject files that cannot be read.
+const uint32_t OPENVDB_FILE_VERSION = 223;
+
+/// Notable file format version numbers
+enum {
+    OPENVDB_FILE_VERSION_ROOTNODE_MAP = 213,
+    OPENVDB_FILE_VERSION_INTERNALNODE_COMPRESSION = 214,
+    OPENVDB_FILE_VERSION_SIMPLIFIED_GRID_TYPENAME = 215,
+    OPENVDB_FILE_VERSION_GRID_INSTANCING = 216,
+    OPENVDB_FILE_VERSION_BOOL_LEAF_OPTIMIZATION = 217,
+    OPENVDB_FILE_VERSION_BOOST_UUID = 218,
+    OPENVDB_FILE_VERSION_NO_GRIDMAP = 219,
+    OPENVDB_FILE_VERSION_NEW_TRANSFORM = 219, // same number as NO_GRIDMAP
+    OPENVDB_FILE_VERSION_SELECTIVE_COMPRESSION = 220,
+    OPENVDB_FILE_VERSION_FLOAT_FRUSTUM_BBOX = 221,
+    OPENVDB_FILE_VERSION_NODE_MASK_COMPRESSION = 222,
+    OPENVDB_FILE_VERSION_BLOSC_COMPRESSION = 223,
+    OPENVDB_FILE_VERSION_POINT_INDEX_GRID = 223 // same number as BLOSC_COMPRESSION and OPENVDB_FILE_VERSION
+};
+
+
+inline const char* // the "<major>.<minor>.<patch>" string literal OPENVDB_LIBRARY_VERSION_STRING
+getLibraryVersionString() { return OPENVDB_LIBRARY_VERSION_STRING; }
+
+
+struct VersionId { // a pair of version numbers
+    uint32_t first, second; // first receives the "major" argument, second the "minor" argument
+    VersionId(): first(0), second(0) {} // default: both numbers are zero
+    VersionId(uint32_t major, uint32_t minor): first(major), second(minor) {}
+};
+
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_VERSION_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2012-2016 DreamWorks Animation LLC
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_points/Types.h b/nuparu/include/openvdb_points/Types.h
new file mode 100644
index 00000000..0231bee1
--- /dev/null
+++ b/nuparu/include/openvdb_points/Types.h
@@ -0,0 +1,65 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of Double Negative Visual Effects nor the names
+// of its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_POINTS_TYPES_HAS_BEEN_INCLUDED
+#define OPENVDB_POINTS_TYPES_HAS_BEEN_INCLUDED
+
+#include <openvdb/version.h>
+#include <openvdb/Platform.h>
+#include <openvdb/Types.h>
+#include <OpenEXR/half.h>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+
+// Extra typeNameAsString specializations for half, small integer types and vectors of them;
+// each one returns the fixed string literal that names the given value type.
+template<> inline const char* typeNameAsString<half>()                   { return "half"; }
+template<> inline const char* typeNameAsString<uint8_t>()                { return "uint8"; }
+template<> inline const char* typeNameAsString<int16_t>()                { return "int16"; }
+template<> inline const char* typeNameAsString<uint16_t>()               { return "uint16"; }
+template<> inline const char* typeNameAsString<math::Vec2<half> >()      { return "vec2h"; }
+template<> inline const char* typeNameAsString<math::Vec3<half> >()      { return "vec3h"; }
+template<> inline const char* typeNameAsString<math::Vec3<uint8_t> >()   { return "vec3u8"; }
+template<> inline const char* typeNameAsString<math::Vec3<uint16_t> >()  { return "vec3u16"; }
+
+
+////////////////////////////////////////
+
+
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_POINTS_TYPES_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_points/openvdb.h b/nuparu/include/openvdb_points/openvdb.h
new file mode 100644
index 00000000..9b95299c
--- /dev/null
+++ b/nuparu/include/openvdb_points/openvdb.h
@@ -0,0 +1,56 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of Double Negative Visual Effects nor the names
+// of its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_POINTS_INIT_HAS_BEEN_INCLUDED
+#define OPENVDB_POINTS_INIT_HAS_BEEN_INCLUDED
+
+#include <openvdb/version.h>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+
+namespace points {
+
+/// Global registration of basic types
+OPENVDB_API void initialize();
+
+/// Global deregistration of basic types
+OPENVDB_API void uninitialize();
+
+} // namespace points
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_POINTS_INIT_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_points/tools/AttributeArray.h b/nuparu/include/openvdb_points/tools/AttributeArray.h
new file mode 100644
index 00000000..9675e76b
--- /dev/null
+++ b/nuparu/include/openvdb_points/tools/AttributeArray.h
@@ -0,0 +1,1573 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of Double Negative Visual Effects nor the names
+// of its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file AttributeArray.h
+///
+/// @authors Dan Bailey, Mihai Alden, Peter Cucka
+///
+/// @brief  Attribute Array storage templated on type and compression codec.
+///
+
+
+#ifndef OPENVDB_TOOLS_ATTRIBUTE_ARRAY_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_ATTRIBUTE_ARRAY_HAS_BEEN_INCLUDED
+
+#include <openvdb_points/Types.h>
+#include <openvdb/math/QuantizedUnitVec.h>
+#include <openvdb/util/Name.h>
+#include <openvdb/util/logging.h>
+#include <openvdb/io/io.h> // MappedFile
+#include <openvdb/io/Compression.h> // COMPRESS_BLOSC
+
+#include <openvdb_points/tools/IndexIterator.h>
+
+#include <tbb/spin_mutex.h>
+#include <tbb/atomic.h>
+
+#include <boost/scoped_array.hpp>
+
+#include <string>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+
+
+// Add new typedef for a Name pair
+typedef std::pair<Name, Name> NamePair;
+
+namespace tools {
+
+
+////////////////////////////////////////
+
+// Attribute Compression methods
+
+
+namespace attribute_compression {
+
+/// @brief Returns true if compression is available
+bool canCompress();
+
+/// @brief Retrieves the size that the compressed @a buffer will have once uncompressed
+///
+/// @param buffer the compressed buffer
+int uncompressedSize(const char* buffer);
+
+/// @brief Retrieves the size that the uncompressed @a buffer will have once compressed
+///
+/// @param buffer the uncompressed buffer
+/// @param typeSize the size of the data type
+/// @param uncompressedBytes number of uncompressed bytes
+int compressedSize(const char* buffer, const size_t typeSize, const int uncompressedBytes);
+
+/// @brief Compress and return the compressed buffer.
+///
+/// @param buffer the buffer to compress
+/// @param typeSize the size of the data type
+/// @param uncompressedBytes number of uncompressed bytes
+/// @param compressedBytes number of compressed bytes (written to this variable)
+/// @param cleanup if true, the supplied buffer will be deleted prior to allocating new memory
+char* compress( char* buffer, const size_t typeSize,
+                const int uncompressedBytes, int& compressedBytes,
+                const bool cleanup = false);
+
+/// @brief Compress and return the compressed buffer.
+///
+/// @param buffer the buffer to compress
+/// @param typeSize the size of the data type
+/// @param uncompressedBytes number of uncompressed bytes
+/// @param compressedBytes number of compressed bytes (written to this variable)
+///
+/// @note Unlike the non-const buffer version, the buffer will never be deleted.
+char* compress( const char* buffer, const size_t typeSize,
+                const int uncompressedBytes, int& compressedBytes);
+
+/// @brief Decompress and return the uncompressed buffer.
+///
+/// @param buffer the buffer to decompress
+/// @param expectedBytes the number of bytes expected once the buffer is decompressed
+/// @param cleanup if true, the supplied buffer will be deleted prior to allocating new memory
+char* decompress(char* buffer, const int expectedBytes, const bool cleanup = false);
+
+/// @brief Decompress and return the uncompressed buffer.
+///
+/// @param buffer the buffer to decompress
+/// @param expectedBytes the number of bytes expected once the buffer is decompressed
+///
+/// @note Unlike the non-const buffer version, the buffer will never be deleted.
+char* decompress(const char* buffer, const int expectedBytes);
+
+} // namespace attribute_compression
+
+
+////////////////////////////////////////
+
+// Utility methods
+/// Quantize @a s from [0, 1] onto the full range of the unsigned IntegerT, clamping out-of-range input.
+template <typename IntegerT, typename FloatT>
+inline IntegerT
+floatingPointToFixedPoint(const FloatT s)
+{
+    BOOST_STATIC_ASSERT(boost::is_unsigned<IntegerT>::value);
+    if (!(s > FloatT(0.0))) return std::numeric_limits<IntegerT>::min();// negated so NaN lands here, never in the cast
+    if (FloatT(1.0) <= s) return std::numeric_limits<IntegerT>::max();
+    return IntegerT(std::floor(s * FloatT(std::numeric_limits<IntegerT>::max())));
+}
+
+
+template <typename FloatT, typename IntegerT>
+inline FloatT fixedPointToFloatingPoint(const IntegerT s)
+{
+    BOOST_STATIC_ASSERT(boost::is_unsigned<IntegerT>::value);
+    const FloatT fullScale = FloatT((std::numeric_limits<IntegerT>::max()));// integer that maps to 1
+    return FloatT(s) / fullScale;
+}
+
+
+template <typename IntegerVectorT, typename FloatT>
+inline IntegerVectorT
+floatingPointToFixedPoint(const math::Vec3<FloatT>& v)
+{
+    typedef typename IntegerVectorT::ValueType ElemT;// x, y and z are converted independently
+    return IntegerVectorT(floatingPointToFixedPoint<ElemT>(v.x()),
+                          floatingPointToFixedPoint<ElemT>(v.y()),
+                          floatingPointToFixedPoint<ElemT>(v.z()));
+}
+
+template <typename FloatVectorT, typename IntegerT>
+inline FloatVectorT
+fixedPointToFloatingPoint(const math::Vec3<IntegerT>& v)
+{
+    typedef typename FloatVectorT::ValueType ElemT;// x, y and z are converted independently
+    return FloatVectorT(fixedPointToFloatingPoint<ElemT>(v.x()),
+                        fixedPointToFloatingPoint<ElemT>(v.y()),
+                        fixedPointToFloatingPoint<ElemT>(v.z()));
+}
+
+
+////////////////////////////////////////
+
+// Attribute codec schemes
+
+template<typename StorageType_>
+struct NullAttributeCodec // codec whose storage type is the template argument; identified by the name "null"
+{
+    typedef StorageType_ StorageType;
+    template<typename ValueType> static void decode(const StorageType&, ValueType&);
+    template<typename ValueType> static void encode(const StorageType&, ValueType&);// NOTE(review): the other codecs declare encode(value, storage) - confirm the order against the definition
+    static const char* name() { return "null"; }
+};
+
+
+template<typename IntType>
+struct FixedPointAttributeCodec // codec that stores values as IntType; identified by the name "fxpt"
+{
+    typedef IntType StorageType;
+    template<typename ValueType> static void decode(const StorageType&, ValueType&);// storage in, value out
+    template<typename ValueType> static void encode(const ValueType&, StorageType&);// value in, storage out
+    static const char* name() { return "fxpt"; }
+};
+
+
+struct UnitVecAttributeCodec // codec that stores a math::Vec3<T> in one uint16_t; identified by the name "uvec"
+{
+    typedef uint16_t StorageType;
+    template<typename T> static void decode(const StorageType&, math::Vec3<T>&);// storage in, vector out
+    template<typename T> static void encode(const math::Vec3<T>&, StorageType&);// vector in, storage out
+    static const char* name() { return "uvec"; }
+};
+
+
+////////////////////////////////////////
+
+
+/// Base class for storing attribute data.
+///
+/// Declares the type-agnostic interface (copying, sizing, uniform and compressed
+/// state, flags, stream I/O) implemented by TypedAttributeArray, together with
+/// static functions for a registry that maps attribute type names to factories.
+class AttributeArray
+{
+protected:
+    // Accessor types are defined after the class; they carry the typed function
+    // pointers that AttributeHandle binds to.
+    struct AccessorBase;
+    template <typename T> struct Accessor;
+
+    typedef boost::shared_ptr<AccessorBase>             AccessorBasePtr;
+
+public:
+    /// Bit flags stored in mFlags.
+    /// TRANSIENT and HIDDEN are queried by isTransient() / isHidden();
+    /// OUTOFCORE marks delay-loaded data (see TypedAttributeArray::isOutOfCore());
+    /// the WRITE* flags are interpreted by TypedAttributeArray::read().
+    enum Flag { TRANSIENT = 0x1, HIDDEN = 0x2, GROUP=0x4, WRITEUNIFORM=0x8,
+                WRITEMEMCOMPRESS=0x10, WRITEDISKCOMPRESS=0x20, OUTOFCORE=0x40 };
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    /// Bookkeeping for a buffer whose contents still live in a memory-mapped file.
+    struct FileInfo
+    {
+        FileInfo(): bufpos(0), bytes(0) {}
+        std::streamoff bufpos;                      ///< stream position of the buffer
+        Index64 bytes;                              ///< number of buffer bytes in the stream
+        io::MappedFile::Ptr mapping;                ///< the memory-mapped file
+        boost::shared_ptr<io::StreamMetadata> meta; ///< metadata of the source stream
+    };
+#endif
+
+    typedef boost::shared_ptr<AttributeArray>           Ptr;
+    typedef boost::shared_ptr<const AttributeArray>     ConstPtr;
+
+    // AttributeHandle needs access to the protected getAccessor().
+    template <typename> friend class AttributeHandle;
+
+    /// Signature of a registered factory: builds an array of the given length.
+    typedef Ptr (*FactoryMethod)(size_t);
+
+    /// Construct with no compressed bytes and no flags set.
+    AttributeArray() : mCompressedBytes(0), mFlags(0) {}
+    virtual ~AttributeArray() {}
+
+    /// Return a copy of this attribute.
+    virtual AttributeArray::Ptr copy() const = 0;
+
+    /// Return an uncompressed copy of this attribute (will return a copy if not compressed).
+    virtual AttributeArray::Ptr copyUncompressed() const = 0;
+
+    /// Return the length of this array.
+    virtual size_t size() const = 0;
+
+    /// Return the number of bytes of memory used by this attribute.
+    virtual size_t memUsage() const = 0;
+
+    /// Create a new attribute array of the given (registered) type and length.
+    static Ptr create(const NamePair& type, size_t length);
+    /// Return @c true if the given attribute type name is registered.
+    static bool isRegistered(const NamePair& type);
+    /// Clear the attribute type registry.
+    static void clearRegistry();
+
+    /// Return the name of this attribute's type.
+    virtual const NamePair& type() const = 0;
+    /// Return @c true if this attribute is of the same type as the template parameter.
+    template<typename AttributeArrayType>
+    bool isType() const { return this->type() == AttributeArrayType::attributeType(); }
+
+    /// Return @c true if this attribute has a value type the same as the template parameter
+    /// (only the first element of the type name pair is compared, so the codec is ignored).
+    template<typename ValueType>
+    bool hasValueType() const { return this->type().first == typeNameAsString<ValueType>();}
+
+    /// Set value at given index @a n from @a sourceIndex of another @a sourceArray
+    virtual void set(const Index n, const AttributeArray& sourceArray, const Index sourceIndex) = 0;
+
+    /// Return @c true if this array is stored as a single uniform value.
+    virtual bool isUniform() const = 0;
+    /// @brief  If this array is uniform, replace it with an array of length size().
+    /// @param  fill if true, assign the uniform value to each element of the array.
+    virtual void expand(bool fill = true) = 0;
+    /// Replace the existing array with a uniform zero value.
+    virtual void collapse() = 0;
+    /// Compact the existing array to become uniform if all values are identical
+    virtual bool compact() = 0;
+
+    /// Return @c true if this array is compressed (i.e. mCompressedBytes is non-zero).
+    bool isCompressed() const { return mCompressedBytes != 0; }
+    /// Compress the attribute array.
+    virtual bool compress() = 0;
+    /// Uncompress the attribute array.
+    virtual bool decompress() = 0;
+
+    /// @brief   Specify whether this attribute should be hidden (e.g., from UI or iterators).
+    /// @details This is useful if the attribute is used for blind data or as scratch space
+    ///          for a calculation.
+    /// @note    Attributes are not hidden by default.
+    void setHidden(bool state);
+    /// Return @c true if this attribute is hidden (e.g., from UI or iterators).
+    bool isHidden() const { return bool(mFlags & HIDDEN); }
+
+    /// @brief Specify whether this attribute should only exist in memory
+    ///        and not be serialized during stream output.
+    /// @note  Attributes are not transient by default.
+    void setTransient(bool state);
+    /// Return @c true if this attribute is not serialized during stream output.
+    bool isTransient() const { return bool(mFlags & TRANSIENT); }
+
+    /// @brief Retrieve the attribute array flags
+    uint16_t flags() const { return mFlags; }
+
+    /// Return an index iterator for this array (defined outside this class body).
+    IndexIter beginIndex() const;
+
+    /// Read attribute metadata and buffers from a stream.
+    virtual void read(std::istream&) = 0;
+    /// Write attribute metadata and buffers to a stream.
+    virtual void write(std::ostream&) const = 0;
+
+    /// Ensures all data is in-core
+    virtual void loadData() const = 0;
+
+    /// Check the compressed bytes and flags. If they are equal, perform a deeper
+    /// comparison check necessary on the inherited types (TypedAttributeArray).
+    /// The type-specific part of the comparison is delegated to the private
+    /// virtual isEqual() because operators themselves are not dispatched virtually here.
+    bool operator==(const AttributeArray& other) const;
+    bool operator!=(const AttributeArray& other) const { return !this->operator==(other); }
+
+private:
+    /// Virtual function used by the comparison operator to perform
+    /// comparisons on inherited types
+    virtual bool isEqual(const AttributeArray& other) const = 0;
+
+protected:
+    /// Obtain an Accessor that stores getter and setter functors.
+    virtual AccessorBasePtr getAccessor() const = 0;
+
+    /// Register an attribute type along with a factory function.
+    static void registerType(const NamePair& type, FactoryMethod);
+    /// Remove an attribute type from the registry.
+    static void unregisterType(const NamePair& type);
+
+    /// Size in bytes of the compressed buffer; zero means "not compressed".
+    size_t mCompressedBytes;
+    /// Bitwise OR of Flag values.
+    uint16_t mFlags;
+
+    /// Out-of-core data
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    boost::shared_ptr<FileInfo> mFileInfo;
+#endif
+}; // class AttributeArray
+
+
+////////////////////////////////////////
+
+
+/// Accessor base class for AttributeArray storage where type is not available.
+/// Deliberately empty: it only gives the templated Accessor<T> a common base so
+/// that it can be held through an AccessorBasePtr.
+struct AttributeArray::AccessorBase { };
+
+/// Templated Accessor stores typed function pointers used in binding
+/// AttributeHandles.
+/// The pointer signatures match the static getUnsafe / setUnsafe / collapse / fill
+/// members of TypedAttributeArray.
+template <typename T>
+struct AttributeArray::Accessor : public AttributeArray::AccessorBase
+{
+    /// Read the value at index @a n of @a array.
+    typedef T (*GetterPtr)(const AttributeArray* array, const Index n);
+    /// Write @a value at index @a n of @a array.
+    typedef void (*SetterPtr)(AttributeArray* array, const Index n, const T& value);
+    /// Whole-array operation taking a single value (used for collapse and fill).
+    typedef void (*ValuePtr)(AttributeArray* array, const T& value);
+
+    /// Store the four function pointers unchanged.
+    Accessor(GetterPtr getter, SetterPtr setter, ValuePtr collapser, ValuePtr filler) :
+        mGetter(getter), mSetter(setter), mCollapser(collapser), mFiller(filler) { }
+
+    GetterPtr mGetter;
+    SetterPtr mSetter;
+    ValuePtr  mCollapser;
+    ValuePtr  mFiller;
+}; // struct AttributeArray::Accessor
+
+
+////////////////////////////////////////
+
+
+/// Typed class for storing attribute data.
+///
+/// @tparam ValueType_  the type handed to and returned from get()/set()
+/// @tparam Codec_      converts between ValueType and the stored Codec::StorageType;
+///                     defaults to NullAttributeCodec<ValueType_>
+///
+/// Values are kept in a heap buffer of StorageType (mData). A uniform array keeps a
+/// single element regardless of size(); the buffer may additionally be compressed
+/// (see isCompressed()) or out-of-core (see isOutOfCore()).
+template<typename ValueType_, typename Codec_ = NullAttributeCodec<ValueType_> >
+class TypedAttributeArray: public AttributeArray
+{
+public:
+    typedef boost::shared_ptr<TypedAttributeArray>          Ptr;
+    typedef boost::shared_ptr<const TypedAttributeArray>    ConstPtr;
+
+    typedef ValueType_                  ValueType;
+    typedef Codec_                      Codec;
+    typedef typename Codec::StorageType StorageType;
+
+    //////////
+
+    /// Default constructor, always constructs a uniform attribute.
+    /// A requested length of zero is raised to one by the implementation.
+    explicit TypedAttributeArray(size_t n = 1,
+        const ValueType& uniformValue = zeroVal<ValueType>());
+    /// Deep copy constructor (optionally decompress during copy).
+    TypedAttributeArray(const TypedAttributeArray&, bool uncompress = false);
+    /// Deep copy assignment operator.
+    TypedAttributeArray& operator=(const TypedAttributeArray&);
+
+    /// Releases the buffer (and detaches from any mapped file) via deallocate().
+    virtual ~TypedAttributeArray() { this->deallocate(); }
+
+    /// Return a copy of this attribute.
+    virtual AttributeArray::Ptr copy() const;
+
+    /// Return an uncompressed copy of this attribute (will just return a copy if not compressed).
+    virtual AttributeArray::Ptr copyUncompressed() const;
+
+    /// Return a new attribute array of the given length @a n with uniform value zero.
+    static Ptr create(size_t n);
+
+    /// Cast an AttributeArray to TypedAttributeArray<T>
+    /// @throw TypeError if @a attributeArray is not of this exact type (value type and codec).
+    static TypedAttributeArray& cast(AttributeArray& attributeArray);
+
+    /// Cast an AttributeArray to TypedAttributeArray<T>
+    /// @throw TypeError if @a attributeArray is not of this exact type (value type and codec).
+    static const TypedAttributeArray& cast(const AttributeArray& attributeArray);
+
+    /// Return the name of this attribute's type (includes codec)
+    static const NamePair& attributeType();
+    /// Return the name of this attribute's type.
+    virtual const NamePair& type() const { return attributeType(); }
+
+    /// Return @c true if this attribute type is registered.
+    static bool isRegistered();
+    /// Register this attribute type along with a factory function.
+    static void registerType();
+    /// Remove this attribute type from the registry.
+    static void unregisterType();
+
+    /// Return the length of this array.
+    virtual size_t size() const { return mSize; };
+
+    /// Return the number of bytes of memory used by this attribute.
+    virtual size_t memUsage() const;
+
+    /// Return the value at index @a n (assumes uncompressed and in-core)
+    ValueType getUnsafe(Index n) const;
+    /// Return the value at index @a n
+    ValueType get(Index n) const;
+    /// Return the @a value at index @a n (assumes uncompressed and in-core)
+    template<typename T> void getUnsafe(Index n, T& value) const;
+    /// Return the @a value at index @a n
+    template<typename T> void get(Index n, T& value) const;
+
+    /// Non-member equivalent to getUnsafe() that static_casts array to this TypedAttributeArray
+    /// (assumes uncompressed and in-core)
+    static ValueType getUnsafe(const AttributeArray* array, const Index n);
+
+    /// Set @a value at the given index @a n (assumes uncompressed and in-core)
+    void setUnsafe(Index n, const ValueType& value);
+    /// Set @a value at the given index @a n
+    void set(Index n, const ValueType& value);
+    /// Set @a value at the given index @a n (assumes uncompressed and in-core)
+    template<typename T> void setUnsafe(Index n, const T& value);
+    /// Set @a value at the given index @a n
+    template<typename T> void set(Index n, const T& value);
+
+    /// Non-member equivalent to setUnsafe() that static_casts array to this TypedAttributeArray
+    /// (assumes uncompressed and in-core)
+    static void setUnsafe(AttributeArray* array, const Index n, const ValueType& value);
+
+    /// Set value at given index @a n from @a sourceIndex of another @a sourceArray
+    virtual void set(const Index n, const AttributeArray& sourceArray, const Index sourceIndex);
+
+    /// Return @c true if this array is stored as a single uniform value.
+    virtual bool isUniform() const { return mIsUniform; }
+    /// @brief  Replace the single value storage with an array of length size().
+    /// @note   Non-uniform attributes are unchanged.
+    /// @param  fill toggle to initialize the array elements with the pre-expanded value.
+    virtual void expand(bool fill = true);
+    /// Replace the existing array with a uniform zero value.
+    virtual void collapse();
+    /// Compact the existing array to become uniform if all values are identical
+    virtual bool compact();
+
+    /// Replace the existing array with the given uniform value.
+    void collapse(const ValueType& uniformValue);
+    /// @brief Fill the existing array with the given value.
+    /// @note Identical to collapse() except a non-uniform array will not become uniform.
+    void fill(const ValueType& value);
+
+    /// Non-member equivalent to collapse() that static_casts array to this TypedAttributeArray
+    static void collapse(AttributeArray* array, const ValueType& value);
+    /// Non-member equivalent to fill() that static_casts array to this TypedAttributeArray
+    static void fill(AttributeArray* array, const ValueType& value);
+
+    /// Compress the attribute array.
+    virtual bool compress();
+    /// Uncompress the attribute array.
+    virtual bool decompress();
+
+    /// Read attribute data from a stream.
+    virtual void read(std::istream& is);
+    /// Write attribute data to a stream.
+    virtual void write(std::ostream& os) const;
+
+    /// Return @c true if this buffer's values have not yet been read from disk.
+    inline bool isOutOfCore() const;
+
+    /// Ensures all data is in-core
+    virtual void loadData() const;
+
+protected:
+    /// Obtain an Accessor bound to this type's static getter/setter functions.
+    virtual AccessorBasePtr getAccessor() const;
+
+private:
+    /// Load data from memory-mapped file.
+    inline void doLoad() const;
+    /// Load data from memory-mapped file (unsafe as this function is not protected by a mutex).
+    inline void doLoadUnsafe() const;
+
+    /// Toggle out-of-core state
+    inline void setOutOfCore(const bool);
+
+    /// Compare this data to another attribute array. Used by the base class comparison operator
+    virtual bool isEqual(const AttributeArray& other) const;
+
+    /// Bytes occupied by the buffer in its current state (uniform / out-of-core / compressed / plain).
+    size_t arrayMemUsage() const;
+    /// Allocate a fresh buffer of @a size elements; mData must be NULL beforehand.
+    void allocate(const size_t size);
+    /// Detach from any mapped file and free the buffer.
+    void deallocate();
+
+    /// Helper function for use with registerType()
+    static AttributeArray::Ptr factory(size_t n) { return TypedAttributeArray::create(n); }
+
+    /// Lazily created type name, shared by all instances of this specialization.
+    static tbb::atomic<const NamePair*> sTypeName;
+    /// Buffer of stored values (one element if uniform; raw bytes if compressed).
+    StorageType*    mData;
+    /// Logical number of elements.
+    size_t          mSize;
+    /// True while only a single value is stored.
+    bool            mIsUniform;
+    /// Guards buffer replacement (allocate/deallocate, compress, load).
+    tbb::spin_mutex mMutex;
+}; // class TypedAttributeArray
+
+
+////////////////////////////////////////
+
+
+/// AttributeHandles provide access to specific TypedAttributeArray methods without needing
+/// to know the compression codec, however these methods also incur the cost of a function pointer.
+/// This class exposes read access only; see AttributeWriteHandle for mutation.
+template <typename T>
+class AttributeHandle
+{
+public:
+    typedef boost::shared_ptr<AttributeHandle<T> > Ptr;
+
+protected:
+    // Same signatures as the typedefs in AttributeArray::Accessor<T>.
+    typedef T (*GetterPtr)(const AttributeArray* array, const Index n);
+    typedef void (*SetterPtr)(AttributeArray* array, const Index n, const T& value);
+    typedef void (*ValuePtr)(AttributeArray* array, const T& value);
+
+public:
+    /// Create a shared handle for @a array (arguments as for the constructor).
+    static Ptr create(const AttributeArray& array, const bool preserveCompression = true);
+
+    /// @param array                the array to read from
+    /// @param preserveCompression  presumably keeps @a array compressed by reading from a
+    ///                             local uncompressed copy (see mLocalArray) -- TODO confirm
+    ///                             against the constructor definition
+    AttributeHandle(const AttributeArray& array, const bool preserveCompression = true);
+
+    /// Return @c true if the underlying array is uniform.
+    bool isUniform() const;
+
+    /// Return the value at index @a n.
+    T get(Index n) const;
+
+protected:
+    /// The array this handle reads from.
+    const AttributeArray* mArray;
+
+    GetterPtr mGetter;
+    SetterPtr mSetter;
+    ValuePtr  mCollapser;
+    ValuePtr  mFiller;
+
+private:
+    // local copy of AttributeArray (to preserve compression)
+    AttributeArray::Ptr mLocalArray;
+}; // class AttributeHandle
+
+
+/// Write-able version of AttributeHandle.
+/// Adds the mutating operations (expand, collapse, compact, fill, set) on top of
+/// the read-only interface inherited from AttributeHandle<T>.
+template <typename T>
+class AttributeWriteHandle : public AttributeHandle<T>
+{
+public:
+    typedef boost::shared_ptr<AttributeWriteHandle<T> > Ptr;
+
+    /// Create a shared write handle for @a array.
+    static Ptr create(AttributeArray& array);
+
+    /// Bind this handle to the (mutable) @a array.
+    AttributeWriteHandle(AttributeArray& array);
+
+    /// @brief  If this array is uniform, replace it with an array of length size().
+    /// @param  fill if true, assign the uniform value to each element of the array.
+    void expand(bool fill = true);
+
+    /// Replace the existing array with a uniform value (zero if none provided).
+    void collapse();
+    void collapse(const T& uniformValue);
+
+    /// Compact the existing array to become uniform if all values are identical
+    bool compact();
+
+    /// @brief Fill the existing array with the given value.
+    /// @note Identical to collapse() except a non-uniform array will not become uniform.
+    void fill(const T& value);
+
+    /// Set @a value at index @a n.
+    void set(Index n, const T& value);
+}; // class AttributeWriteHandle
+
+
+// Convenience aliases: "RO" = read-only handle, "RW" = read-write handle.
+
+// float attributes
+typedef AttributeHandle<float> AttributeHandleROF;
+typedef AttributeWriteHandle<float> AttributeHandleRWF;
+
+// Vec3f attributes
+typedef AttributeHandle<Vec3f> AttributeHandleROVec3f;
+typedef AttributeWriteHandle<Vec3f> AttributeHandleRWVec3f;
+
+
+////////////////////////////////////////
+
+
+// Attribute codec implementation
+
+
+/// Pass-through decode: convert the stored value to @a ValueType with a static_cast.
+/// @param data  stored value
+/// @param val   receives the converted value
+template<typename StorageType_>
+template<typename ValueType>
+inline void
+NullAttributeCodec<StorageType_>::decode(const StorageType& data, ValueType& val)
+{
+    val = static_cast<ValueType>(data);
+}
+
+
+/// Pass-through encode: assign the input value to the output slot.
+/// @param val   value to store (declared here as StorageType)
+/// @param data  receives the value (declared here as the deduced ValueType)
+/// NOTE(review): compared with the other codecs' encode(const ValueType&, StorageType&)
+/// the two parameter types look swapped, which makes the static_cast below a no-op on
+/// @a val; any conversion happens implicitly in the assignment. Confirm against the
+/// declaration in NullAttributeCodec before changing either side.
+template<typename StorageType_>
+template<typename ValueType>
+inline void
+NullAttributeCodec<StorageType_>::encode(const StorageType& val, ValueType& data)
+{
+    data = static_cast<StorageType>(val);
+}
+
+
+/// Decode a fixed-point stored value into floating point.
+/// @param data  stored fixed-point value
+/// @param val   receives the decoded value, shifted down by 0.5
+template<typename IntType>
+template<typename ValueType>
+inline void
+FixedPointAttributeCodec<IntType>::decode(const StorageType& data, ValueType& val)
+{
+    // The stored range is re-centred to -0.5 => 0.5, the range most commonly
+    // used for position.
+    const ValueType offset(0.5);
+
+    val = fixedPointToFloatingPoint<ValueType>(data);
+    val -= offset;
+}
+
+
+/// Encode a floating-point value into its fixed-point stored form.
+/// @param val   value to store, expected in the -0.5 => 0.5 range
+/// @param data  receives the fixed-point representation
+template<typename IntType>
+template<typename ValueType>
+inline void
+FixedPointAttributeCodec<IntType>::encode(const ValueType& val, StorageType& data)
+{
+    // Undo the -0.5 => 0.5 centring (most commonly used for position) before
+    // converting to fixed point.
+    const ValueType offset(0.5);
+    const ValueType shifted = val + offset;
+
+    data = floatingPointToFixedPoint<StorageType>(shifted);
+}
+
+
+/// Unpack a 16-bit quantized vector into a Vec3.
+/// @param data  packed value
+/// @param val   receives the result of math::QuantizedUnitVec::unpack()
+template<typename T>
+inline void
+UnitVecAttributeCodec::decode(const StorageType& data, math::Vec3<T>& val)
+{
+    val = math::QuantizedUnitVec::unpack(data);
+}
+
+
+/// Pack a Vec3 into its 16-bit quantized form.
+/// @param val   vector to pack (presumably expected to be unit length -- TODO confirm
+///              against math::QuantizedUnitVec::pack())
+/// @param data  receives the packed value
+template<typename T>
+inline void
+UnitVecAttributeCodec::encode(const math::Vec3<T>& val, StorageType& data)
+{
+    data = math::QuantizedUnitVec::pack(val);
+}
+
+
+////////////////////////////////////////
+
+// TypedAttributeArray implementation
+
+// Definition of the per-specialization type-name pointer; it is populated on
+// first use by attributeType().
+template<typename ValueType_, typename Codec_>
+tbb::atomic<const NamePair*> TypedAttributeArray<ValueType_, Codec_>::sTypeName;
+
+
+/// Construct a uniform array.
+/// @param n             logical length; values below one are raised to one
+/// @param uniformValue  value encoded into the single stored element
+template<typename ValueType_, typename Codec_>
+TypedAttributeArray<ValueType_, Codec_>::TypedAttributeArray(
+    size_t n, const ValueType& uniformValue)
+    : AttributeArray()
+    , mData(new StorageType[1])
+    , mSize(std::max(size_t(1), n))
+    , mIsUniform(true)
+    , mMutex()
+{
+    // A uniform array only ever stores element zero.
+    StorageType& slot = mData[0];
+    Codec::encode(uniformValue, slot);
+}
+
+
+/// Deep copy constructor.
+///
+/// Base-class state is copied through AttributeArray(rhs); the buffer is then
+/// duplicated according to the state of the array:
+///  - uniform:      copy the single stored element
+///  - out-of-core:  allocate nothing, mData stays NULL
+///  - compressed:   copy the compressed bytes, or decompress if @a uncompress is true
+///  - otherwise:    copy all mSize stored elements
+///
+/// @param rhs         array to copy from
+/// @param uncompress  request an uncompressed copy; ignored when the data is not compressed
+template<typename ValueType_, typename Codec_>
+TypedAttributeArray<ValueType_, Codec_>::TypedAttributeArray(const TypedAttributeArray& rhs, bool uncompress)
+    : AttributeArray(rhs)
+    , mData(NULL)
+    , mSize(rhs.mSize)
+    , mIsUniform(rhs.mIsUniform)
+    , mMutex()
+{
+    // Block-scope using-declarations so that the unqualified calls below resolve to
+    // the free functions rather than to the member decompress().
+    using attribute_compression::decompress;
+    using attribute_compression::uncompressedSize;
+
+    // disable uncompress if data is not compressed
+
+    if (!this->isCompressed())  uncompress = false;
+
+    if (mIsUniform) {
+        this->allocate(1);
+        mData[0] = rhs.mData[0];
+    } else if (this->isOutOfCore()) {
+        // do nothing
+    } else if (this->isCompressed()) {
+        char* buffer = 0;
+        if (uncompress) {
+            rhs.doLoad();
+            const char* charBuffer = reinterpret_cast<char*>(rhs.mData);
+            buffer = decompress(charBuffer, uncompressedSize(charBuffer));
+        }
+        // A non-null buffer means decompression succeeded, so this copy is uncompressed.
+        if (buffer)         mCompressedBytes = 0;
+        else {
+            // decompression wasn't requested or failed so deep copy instead
+            buffer = new char[mCompressedBytes];
+            memcpy(buffer, rhs.mData, mCompressedBytes);
+        }
+        assert(buffer);
+        mData = reinterpret_cast<StorageType*>(buffer);
+    } else {
+        // Plain in-core array: byte-wise copy of every stored element.
+        this->allocate(mSize);
+        memcpy(mData, rhs.mData, mSize * sizeof(StorageType));
+    }
+}
+
+
+/// Deep copy assignment.
+///
+/// Releases the current buffer, copies flags / compressed size / length / uniform
+/// state from @a rhs and then duplicates the buffer according to that state
+/// (uniform, out-of-core, compressed or plain). Self-assignment is a no-op.
+///
+/// @param rhs  array to copy from
+/// @return a reference to this array
+template<typename ValueType_, typename Codec_>
+TypedAttributeArray<ValueType_, Codec_>&
+TypedAttributeArray<ValueType_, Codec_>::operator=(const TypedAttributeArray& rhs)
+{
+    if (&rhs != this) {
+        tbb::spin_mutex::scoped_lock lock(mMutex);
+
+        // Also detaches from a mapped file if this array was out-of-core.
+        this->deallocate();
+
+        mFlags = rhs.mFlags;
+        mCompressedBytes = rhs.mCompressedBytes;
+        mSize = rhs.mSize;
+        mIsUniform = rhs.mIsUniform;
+
+        if (mIsUniform) {
+            this->allocate(1);
+            mData[0] = rhs.mData[0];
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+        } else if (rhs.isOutOfCore()) {
+            // Share the file bookkeeping; the data is loaded on first access.
+            mFileInfo = rhs.mFileInfo;
+#endif
+        } else if (this->isCompressed()) {
+            // Copy the compressed bytes verbatim.
+            char* buffer = new char[mCompressedBytes];
+            memcpy(buffer, rhs.mData, mCompressedBytes);
+            mData = reinterpret_cast<StorageType*>(buffer);
+        } else {
+            this->allocate(mSize);
+            memcpy(mData, rhs.mData, mSize * sizeof(StorageType));
+        }
+    }
+
+    // The function previously fell off the end without returning a value,
+    // which is undefined behaviour for a function returning a reference.
+    return *this;
+}
+
+
+/// Return the (value type name, codec name) pair identifying this specialization.
+///
+/// The pair is built on first use: first = typeNameAsString<ValueType>(),
+/// second = Codec::name() + "_" + typeNameAsString<StorageType>().
+/// The result is published through the atomic sTypeName pointer and is never freed.
+template<typename ValueType_, typename Codec_>
+inline const NamePair&
+TypedAttributeArray<ValueType_, Codec_>::attributeType()
+{
+    if (sTypeName == NULL) {
+        std::ostringstream ostr1, ostr2;
+        ostr1 << typeNameAsString<ValueType>();
+        ostr2 << Codec::name() << "_" << typeNameAsString<StorageType>();
+        NamePair* s = new NamePair(ostr1.str(), ostr2.str());
+        // Install s only if the pointer is still NULL; if another caller installed a
+        // value first, compare_and_swap returns that non-NULL value and s is discarded.
+        if (sTypeName.compare_and_swap(s, NULL) != NULL) delete s;
+    }
+    return *sTypeName;
+}
+
+
+/// Query the base-class registry for this specialization's type name.
+template<typename ValueType_, typename Codec_>
+inline bool
+TypedAttributeArray<ValueType_, Codec_>::isRegistered()
+{
+    const NamePair& typeName = TypedAttributeArray::attributeType();
+    return AttributeArray::isRegistered(typeName);
+}
+
+
+/// Add this specialization to the base-class registry, using factory() to
+/// create new instances.
+template<typename ValueType_, typename Codec_>
+inline void
+TypedAttributeArray<ValueType_, Codec_>::registerType()
+{
+    const NamePair& typeName = TypedAttributeArray::attributeType();
+    AttributeArray::registerType(typeName, TypedAttributeArray::factory);
+}
+
+
+/// Remove this specialization from the base-class registry.
+template<typename ValueType_, typename Codec_>
+inline void
+TypedAttributeArray<ValueType_, Codec_>::unregisterType()
+{
+    const NamePair& typeName = TypedAttributeArray::attributeType();
+    AttributeArray::unregisterType(typeName);
+}
+
+
+/// Allocate a new array of length @a n holding the default uniform value.
+template<typename ValueType_, typename Codec_>
+inline typename TypedAttributeArray<ValueType_, Codec_>::Ptr
+TypedAttributeArray<ValueType_, Codec_>::create(size_t n)
+{
+    Ptr created(new TypedAttributeArray(n));
+    return created;
+}
+
+/// Checked downcast of a mutable AttributeArray.
+/// @throw TypeError if @a attributeArray does not report this specialization's type.
+template<typename ValueType_, typename Codec_>
+inline TypedAttributeArray<ValueType_, Codec_>&
+TypedAttributeArray<ValueType_, Codec_>::cast(AttributeArray& attributeArray)
+{
+    const bool typeMatches = attributeArray.isType<TypedAttributeArray>();
+    if (!typeMatches) {
+        OPENVDB_THROW(TypeError, "Invalid Attribute Type");
+    }
+    return static_cast<TypedAttributeArray&>(attributeArray);
+}
+
+/// Checked downcast of a const AttributeArray.
+/// @throw TypeError if @a attributeArray does not report this specialization's type.
+template<typename ValueType_, typename Codec_>
+inline const TypedAttributeArray<ValueType_, Codec_>&
+TypedAttributeArray<ValueType_, Codec_>::cast(const AttributeArray& attributeArray)
+{
+    const bool typeMatches = attributeArray.isType<TypedAttributeArray>();
+    if (!typeMatches) {
+        OPENVDB_THROW(TypeError, "Invalid Attribute Type");
+    }
+    return static_cast<const TypedAttributeArray&>(attributeArray);
+}
+
+/// Return a deep copy of this array that keeps its current compression state.
+template<typename ValueType_, typename Codec_>
+AttributeArray::Ptr
+TypedAttributeArray<ValueType_, Codec_>::copy() const
+{
+    AttributeArray::Ptr duplicate(new TypedAttributeArray<ValueType, Codec>(*this));
+    return duplicate;
+}
+
+
+/// Return a deep copy of this array, asking the copy constructor to decompress.
+template<typename ValueType_, typename Codec_>
+AttributeArray::Ptr
+TypedAttributeArray<ValueType_, Codec_>::copyUncompressed() const
+{
+    const bool uncompress = true;
+    AttributeArray::Ptr duplicate(new TypedAttributeArray<ValueType, Codec>(*this, uncompress));
+    return duplicate;
+}
+
+
+/// Return the number of bytes held by the buffer in its current state.
+/// Checks are ordered: uniform, then out-of-core, then compressed, then plain.
+template<typename ValueType_, typename Codec_>
+size_t
+TypedAttributeArray<ValueType_, Codec_>::arrayMemUsage() const
+{
+    size_t bytes;
+
+    if (mIsUniform) {
+        bytes = sizeof(StorageType);            // a single stored element
+    } else if (this->isOutOfCore()) {
+        bytes = 0;                              // nothing loaded yet
+    } else if (this->isCompressed()) {
+        bytes = mCompressedBytes;
+    } else {
+        bytes = mSize * sizeof(StorageType);
+    }
+
+    return bytes;
+}
+
+
+/// Allocate an uninitialised buffer of @a size stored elements.
+/// The previous buffer must already have been released (asserted).
+template<typename ValueType_, typename Codec_>
+void
+TypedAttributeArray<ValueType_, Codec_>::allocate(const size_t size)
+{
+    assert(mData == NULL);
+    StorageType* const fresh = new StorageType[size];
+    mData = fresh;
+}
+
+
+/// Release the buffer and, if the array is delay-loaded, drop the link to the file.
+template<typename ValueType_, typename Codec_>
+void
+TypedAttributeArray<ValueType_, Codec_>::deallocate()
+{
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    // detach from file if delay-loaded
+    if (this->isOutOfCore()) {
+        this->setOutOfCore(false);
+        this->mFileInfo.reset();
+    }
+#endif
+    // delete[] on a null pointer is a no-op, so no explicit check is needed.
+    delete[] mData;
+    mData = NULL;
+}
+
+
+/// Return the size of this object plus the size of its buffer, if one is allocated.
+template<typename ValueType_, typename Codec_>
+size_t
+TypedAttributeArray<ValueType_, Codec_>::memUsage() const
+{
+    size_t bytes = sizeof(*this);
+    if (mData != NULL) {
+        bytes += this->arrayMemUsage();
+    }
+    return bytes;
+}
+
+
+/// Decode and return the value at index @a n.
+/// The array must be uncompressed and in-core (asserted); a uniform array
+/// always reads element zero.
+template<typename ValueType_, typename Codec_>
+typename TypedAttributeArray<ValueType_, Codec_>::ValueType
+TypedAttributeArray<ValueType_, Codec_>::getUnsafe(Index n) const
+{
+    assert(!this->isCompressed());
+    assert(!this->isOutOfCore());
+
+    const StorageType& stored = mData[mIsUniform ? 0 : n];
+
+    ValueType decoded;
+    Codec::decode(/*in=*/stored, /*out=*/decoded);
+    return decoded;
+}
+
+
+/// Return the value at index @a n, first decompressing or loading the data if needed.
+template<typename ValueType_, typename Codec_>
+typename TypedAttributeArray<ValueType_, Codec_>::ValueType
+TypedAttributeArray<ValueType_, Codec_>::get(Index n) const
+{
+    if (this->isCompressed()) {
+        // decompress() is non-const, hence the cast on a logically-const read.
+        TypedAttributeArray* self = const_cast<TypedAttributeArray*>(this);
+        self->decompress();
+    } else if (this->isOutOfCore()) {
+        this->doLoad();
+    }
+
+    return this->getUnsafe(n);
+}
+
+
+/// Decode the value at index @a n and convert it to @a T.
+/// The array must be uncompressed and in-core (asserted).
+template<typename ValueType_, typename Codec_>
+template<typename T>
+void
+TypedAttributeArray<ValueType_, Codec_>::getUnsafe(Index n, T& val) const
+{
+    assert(!this->isCompressed());
+    assert(!this->isOutOfCore());
+
+    const StorageType& stored = mData[mIsUniform ? 0 : n];
+
+    // Decode into the native value type first, then convert to the caller's type.
+    ValueType decoded;
+    Codec::decode(/*in=*/stored, /*out=*/decoded);
+    val = static_cast<T>(decoded);
+}
+
+
+/// Write the value at index @a n into @a val, first decompressing or loading
+/// the data if needed.
+template<typename ValueType_, typename Codec_>
+template<typename T>
+void
+TypedAttributeArray<ValueType_, Codec_>::get(Index n, T& val) const
+{
+    if (this->isCompressed()) {
+        // decompress() is non-const, hence the cast on a logically-const read.
+        TypedAttributeArray* self = const_cast<TypedAttributeArray*>(this);
+        self->decompress();
+    } else if (this->isOutOfCore()) {
+        this->doLoad();
+    }
+
+    this->getUnsafe(n, val);
+}
+
+
+/// Static form of getUnsafe(): @a array is static_cast to this specialization
+/// without any type check.
+template<typename ValueType_, typename Codec_>
+typename TypedAttributeArray<ValueType_, Codec_>::ValueType
+TypedAttributeArray<ValueType_, Codec_>::getUnsafe(const AttributeArray* array, const Index n)
+{
+    typedef TypedAttributeArray<ValueType, Codec> ThisType;
+    const ThisType* typed = static_cast<const ThisType*>(array);
+    return typed->getUnsafe(n);
+}
+
+
+/// Encode @a val into element @a n.
+/// The array must be uncompressed and in-core (asserted); a uniform array is
+/// expanded first.
+template<typename ValueType_, typename Codec_>
+void
+TypedAttributeArray<ValueType_, Codec_>::setUnsafe(Index n, const ValueType& val)
+{
+    assert(!this->isCompressed());
+    assert(!this->isOutOfCore());
+
+    if (mIsUniform) {
+        this->expand();
+    }
+
+    // Take the reference only after expand(), which replaces the buffer.
+    StorageType& slot = mData[n];
+    Codec::encode(/*in=*/val, /*out=*/slot);
+}
+
+
+/// Store @a val at index @a n, first decompressing or loading the data if needed.
+template<typename ValueType_, typename Codec_>
+void
+TypedAttributeArray<ValueType_, Codec_>::set(Index n, const ValueType& val)
+{
+    if (this->isCompressed()) {
+        this->decompress();
+    } else if (this->isOutOfCore()) {
+        this->doLoad();
+    }
+
+    this->setUnsafe(n, val);
+}
+
+
+/// Convert @a val to the native value type and encode it into element @a n.
+/// The array must be uncompressed and in-core (asserted); a uniform array is
+/// expanded first.
+template<typename ValueType_, typename Codec_>
+template<typename T>
+void
+TypedAttributeArray<ValueType_, Codec_>::setUnsafe(Index n, const T& val)
+{
+    assert(!this->isCompressed());
+    assert(!this->isOutOfCore());
+
+    if (mIsUniform) {
+        this->expand();
+    }
+
+    const ValueType converted = static_cast<ValueType>(val);
+
+    // Take the reference only after expand(), which replaces the buffer.
+    StorageType& slot = mData[n];
+    Codec::encode(/*in=*/converted, /*out=*/slot);
+}
+
+
+/// Store @a val (of arbitrary convertible type) at index @a n, first
+/// decompressing or loading the data if needed.
+template<typename ValueType_, typename Codec_>
+template<typename T>
+void
+TypedAttributeArray<ValueType_, Codec_>::set(Index n, const T& val)
+{
+    if (this->isCompressed()) {
+        this->decompress();
+    } else if (this->isOutOfCore()) {
+        this->doLoad();
+    }
+
+    this->setUnsafe(n, val);
+}
+
+
+/// Static form of setUnsafe(): @a array is static_cast to this specialization
+/// without any type check.
+template<typename ValueType_, typename Codec_>
+void
+TypedAttributeArray<ValueType_, Codec_>::setUnsafe(AttributeArray* array, const Index n, const ValueType& value)
+{
+    typedef TypedAttributeArray<ValueType, Codec> ThisType;
+    ThisType* typed = static_cast<ThisType*>(array);
+    typed->setUnsafe(n, value);
+}
+
+
+/// Copy one value from @a sourceArray[@a sourceIndex] into this array at index @a n.
+/// @note @a sourceArray is static_cast to this specialization without a type
+///       check, so the caller must pass an array of the same type.
+template<typename ValueType_, typename Codec_>
+void
+TypedAttributeArray<ValueType_, Codec_>::set(Index n, const AttributeArray& sourceArray, const Index sourceIndex)
+{
+    const TypedAttributeArray& source = static_cast<const TypedAttributeArray&>(sourceArray);
+
+    // Read through the safe getter so a compressed or out-of-core source is handled.
+    ValueType copied;
+    source.get(sourceIndex, copied);
+
+    this->set(n, copied);
+}
+
+
+/// Turn a uniform array into a full array of size() elements.
+/// @param fill  if true, every element receives the former uniform value;
+///              otherwise the new buffer is left uninitialised.
+/// Non-uniform arrays are left untouched.
+template<typename ValueType_, typename Codec_>
+void
+TypedAttributeArray<ValueType_, Codec_>::expand(bool fill)
+{
+    if (!mIsUniform)    return;
+
+    // Remember the stored uniform value before the buffer is replaced.
+    const StorageType uniformValue = mData[0];
+
+    {
+        tbb::spin_mutex::scoped_lock lock(mMutex);
+        this->deallocate();
+        this->allocate(mSize);
+    }
+
+    mCompressedBytes = 0;
+    mIsUniform = false;
+
+    if (fill) {
+        StorageType* const end = mData + mSize;
+        for (StorageType* it = mData; it != end; ++it) {
+            *it = uniformValue;
+        }
+    }
+}
+
+
+/// Collapse the array to a uniform value if every element is identical.
+/// @return true if the array is uniform on return, false if differing values
+///         prevented compaction.
+template<typename ValueType_, typename Codec_>
+bool
+TypedAttributeArray<ValueType_, Codec_>::compact()
+{
+    if (mIsUniform)     return true;
+
+    // Compare every element against the first; any mismatch rules out compaction.
+    const ValueType_ first = this->get(0);
+    size_t index = 1;
+    while (index < size()) {
+        if (this->get(index) != first)    return false;
+        index++;
+    }
+
+    this->collapse(this->get(0));
+    return true;
+}
+
+
+/// Replace the array with a uniform array holding zeroVal<ValueType>().
+template<typename ValueType_, typename Codec_>
+void
+TypedAttributeArray<ValueType_, Codec_>::collapse()
+{
+    const ValueType zero = zeroVal<ValueType>();
+    this->collapse(zero);
+}
+
+
+/// Replace the array with a uniform array holding @a uniformValue.
+///
+/// A non-uniform buffer (plain, compressed or out-of-core) is released and
+/// replaced by a single-element buffer. The compressed byte count is reset at
+/// the same time, so that isCompressed() does not keep reporting true for the
+/// new single-element buffer (expand() resets it in the same way).
+///
+/// @param uniformValue  value encoded into the single stored element
+template<typename ValueType_, typename Codec_>
+void
+TypedAttributeArray<ValueType_, Codec_>::collapse(const ValueType& uniformValue)
+{
+    if (!mIsUniform) {
+        tbb::spin_mutex::scoped_lock lock(mMutex);
+        this->deallocate();
+        this->allocate(1);
+        // The new buffer is a plain element, never compressed data.
+        mCompressedBytes = 0;
+        mIsUniform = true;
+    }
+    Codec::encode(uniformValue, mData[0]);
+}
+
+
+/// Static form of collapse(value): @a array is static_cast to this
+/// specialization without any type check.
+template<typename ValueType_, typename Codec_>
+void
+TypedAttributeArray<ValueType_, Codec_>::collapse(AttributeArray* array, const ValueType& value)
+{
+    typedef TypedAttributeArray<ValueType, Codec> ThisType;
+    ThisType* typed = static_cast<ThisType*>(array);
+    typed->collapse(value);
+}
+
+
+/// Overwrite every stored element with @a value; the uniform state is unchanged.
+///
+/// If the current buffer cannot be written element-wise -- it is out-of-core or
+/// holds compressed bytes -- it is discarded and replaced by a fresh uncompressed
+/// buffer first. Previously only the out-of-core case was handled, so filling a
+/// compressed array wrote size() elements into the compressed buffer and left
+/// the compressed byte count stale.
+///
+/// @param value  value encoded into each stored element
+template<typename ValueType_, typename Codec_>
+void
+TypedAttributeArray<ValueType_, Codec_>::fill(const ValueType& value)
+{
+    // A uniform array stores exactly one element.
+    const size_t size = mIsUniform ? 1 : mSize;
+
+    if (this->isOutOfCore() || this->isCompressed()) {
+        tbb::spin_mutex::scoped_lock lock(mMutex);
+        // Existing contents are about to be overwritten, so there is no need to
+        // load or decompress them.
+        this->deallocate();
+        this->allocate(size);
+        mCompressedBytes = 0;
+    }
+
+    for (size_t i = 0; i < size; ++i)  {
+        Codec::encode(value, mData[i]);
+    }
+}
+
+
+/// Static form of fill(value): @a array is static_cast to this specialization
+/// without any type check.
+template<typename ValueType_, typename Codec_>
+void
+TypedAttributeArray<ValueType_, Codec_>::fill(AttributeArray* array, const ValueType& value)
+{
+    typedef TypedAttributeArray<ValueType, Codec> ThisType;
+    ThisType* typed = static_cast<ThisType*>(array);
+    typed->fill(value);
+}
+
+
+/// Compress the buffer in place.
+/// @return true if the buffer was compressed by this call; false if compression
+///         is unavailable, the array is uniform or already compressed, or the
+///         compressor returned no buffer.
+template<typename ValueType_, typename Codec_>
+inline bool
+TypedAttributeArray<ValueType_, Codec_>::compress()
+{
+    // Block-scope using-declarations so the unqualified compress() call below
+    // resolves to the free function rather than to this member.
+    using attribute_compression::canCompress;
+    using attribute_compression::compress;
+
+    if (!canCompress())     return false;
+
+    // Uniform and already-compressed arrays are left alone.
+    if (mIsUniform || this->isCompressed())     return false;
+
+    tbb::spin_mutex::scoped_lock lock(mMutex);
+
+    // Make sure the data is in-core before handing it to the compressor.
+    this->doLoadUnsafe();
+
+    const size_t typeSize = sizeof(typename Codec_::StorageType);
+    const int inBytes = int(mSize * sizeof(StorageType));
+    int outBytes;
+    char* uncompressed = reinterpret_cast<char*>(mData);
+    char* compressed = compress(uncompressed, typeSize, inBytes, outBytes, /*cleanup=*/true);
+
+    if (!compressed)        return false;
+
+    mData = reinterpret_cast<StorageType*>(compressed);
+    mCompressedBytes = size_t(outBytes);
+    return true;
+}
+
+
+/// Decompress the buffer in place.
+/// @return true if the buffer was decompressed by this call; false if the array
+///         was not compressed or the decompressor returned no buffer.
+/// NOTE(review): the previous compressed buffer is not freed here and the call
+/// below does not pass a cleanup argument; confirm that
+/// attribute_compression::decompress() releases its input, otherwise the old
+/// buffer is leaked.
+template<typename ValueType_, typename Codec_>
+inline bool
+TypedAttributeArray<ValueType_, Codec_>::decompress()
+{
+    // Block-scope using-declarations so the unqualified decompress() call below
+    // resolves to the free function rather than to this member.
+    using attribute_compression::decompress;
+    using attribute_compression::uncompressedSize;
+
+    tbb::spin_mutex::scoped_lock lock(mMutex);
+
+    if (!this->isCompressed())      return false;
+
+    // Make sure the compressed bytes are in-core first.
+    this->doLoadUnsafe();
+
+    char* compressed = reinterpret_cast<char*>(this->mData);
+    char* uncompressed = decompress(compressed, uncompressedSize(compressed));
+
+    if (!uncompressed)              return false;
+
+    mData = reinterpret_cast<StorageType*>(uncompressed);
+    mCompressedBytes = 0;
+    return true;
+}
+
+
+/// Return true if the OUTOFCORE flag is set.
+/// Always false when built with OPENVDB_2_ABI_COMPATIBLE.
+template<typename ValueType_, typename Codec_>
+bool
+TypedAttributeArray<ValueType_, Codec_>::isOutOfCore() const
+{
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    return (mFlags & OUTOFCORE) != 0;
+#else
+    return false;
+#endif
+}
+
+
+/// Set or clear the OUTOFCORE flag.
+/// Has no effect when built with OPENVDB_2_ABI_COMPATIBLE.
+template<typename ValueType_, typename Codec_>
+void
+TypedAttributeArray<ValueType_, Codec_>::setOutOfCore(const bool b)
+{
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    if (!b) {
+        mFlags &= ~OUTOFCORE;
+    } else {
+        mFlags |= OUTOFCORE;
+    }
+#else
+    (void) b;
+#endif
+}
+
+
+/// Load delay-loaded data while holding mMutex.
+/// Does nothing if the array is not out-of-core, or when built with
+/// OPENVDB_2_ABI_COMPATIBLE.
+template<typename ValueType_, typename Codec_>
+void
+TypedAttributeArray<ValueType_, Codec_>::doLoad() const
+{
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    if (this->isOutOfCore()) {
+        // The mutex is a non-mutable member, so a non-const view is needed to lock it.
+        TypedAttributeArray* self = const_cast<TypedAttributeArray*>(this);
+
+        // This lock will be contended at most once, after which this buffer
+        // will no longer be out-of-core.
+        tbb::spin_mutex::scoped_lock lock(self->mMutex);
+        this->doLoadUnsafe();
+    }
+#endif
+}
+
+
+/// Public entry point that ensures the data is in-core; forwards to doLoad(),
+/// which takes the mutex and is a no-op for arrays that are already loaded.
+template<typename ValueType_, typename Codec_>
+void
+TypedAttributeArray<ValueType_, Codec_>::loadData() const
+{
+    this->doLoad();
+}
+
+
+/// Read this array from @a is.
+///
+/// Stream layout consumed here, in order: a byte count (Index64), the flags
+/// (Int16), the element count (Index64), then the buffer bytes.
+/// When the stream is backed by a memory-mapped file the buffer is not kept;
+/// its location is recorded in mFileInfo and the array is marked out-of-core.
+///
+/// NOTE(review): stream errors are not checked after any is.read(), and a byte
+/// count smaller than the two header fields would wrap around in the
+/// subtraction below -- confirm that callers only pass well-formed streams.
+template<typename ValueType_, typename Codec_>
+void
+TypedAttributeArray<ValueType_, Codec_>::read(std::istream& is)
+{
+    // Makes the unqualified decompress() call below refer to the free function
+    // rather than to the member of the same name.
+    using attribute_compression::decompress;
+
+    // read data
+
+    Index64 bytes = Index64(0);
+    is.read(reinterpret_cast<char*>(&bytes), sizeof(Index64));
+    // The stored count covers the flags and size fields as well; keep only the
+    // number of buffer bytes.
+    bytes = bytes - /*flags*/sizeof(Int16) - /*size*/sizeof(Index64);
+
+    Int16 flags = Int16(0);
+    is.read(reinterpret_cast<char*>(&flags), sizeof(Int16));
+    mFlags = flags;
+
+    Index64 size = Index64(0);
+    is.read(reinterpret_cast<char*>(&size), sizeof(Index64));
+    mSize = size;
+
+    // Allocated up front; ownership passes to mData on the in-core path and the
+    // buffer is freed on the delay-load path.
+    char* buffer = new char[bytes];
+
+    // read uniform and compressed state
+
+    mIsUniform = mFlags & WRITEUNIFORM;
+    mCompressedBytes = mFlags & WRITEMEMCOMPRESS ? bytes : Index64(0);
+
+    // clear uniform and compress flags
+
+    mFlags &= Int16(~WRITEUNIFORM & ~WRITEMEMCOMPRESS);
+
+    tbb::spin_mutex::scoped_lock lock(mMutex);
+
+    // Releases any previous buffer; this also clears the out-of-core flag and
+    // file info, so it has to happen before setOutOfCore(true) below.
+    this->deallocate();
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    // If this array is being read from a memory-mapped file, delay loading of its data
+    // until the data is actually accessed.
+    io::MappedFile::Ptr mappedFile = io::getMappedFilePtr(is);
+    const bool delayLoad = (mappedFile.get() != NULL);
+
+    if (delayLoad) {
+        this->setOutOfCore(true);
+        mFileInfo.reset(new FileInfo);
+        mFileInfo->bufpos = is.tellg();
+        mFileInfo->mapping = mappedFile;
+        mFileInfo->bytes = bytes;
+        mFileInfo->meta = io::getStreamMetadataPtr(is);
+
+        // read and discard buffer
+        // (this advances the stream past the buffer; mData stays NULL, and the
+        // WRITEDISKCOMPRESS flag is left untouched on this path)
+        is.read(buffer, bytes);
+        delete[] buffer;
+        return;
+    }
+#endif
+
+    is.read(buffer, bytes);
+
+    // compressed on-disk
+
+    if (mFlags & WRITEDISKCOMPRESS) {
+
+        // decompress buffer
+
+        const int inBytes = int(mSize * sizeof(StorageType));
+        char* newBuffer = decompress(buffer, inBytes, /*cleanup=*/true);
+        // NOTE(review): if decompression fails, the original pointer is kept even
+        // though cleanup was requested -- confirm it is still valid in that case.
+        if (newBuffer)  buffer = newBuffer;
+    }
+
+    // set data to buffer
+
+    mData = reinterpret_cast<StorageType*>(buffer);
+
+    // clear all write flags
+
+    mFlags &= Int16(~WRITEDISKCOMPRESS);
+}
+
+
+/// Serialize this array to @a os as a byte count, flags, size and data buffer; transient arrays are skipped.
+template<typename ValueType_, typename Codec_>
+void TypedAttributeArray<ValueType_, Codec_>::write(std::ostream& os) const
+{
+    using attribute_compression::compress;
+
+    if (this->isTransient())    return;
+
+    // Load delay-loaded data before snapshotting the flags: loading clears OUTOFCORE and
+    // WRITEDISKCOMPRESS from mFlags, and neither may leak into the flags written below.
+    this->doLoad();
+    Int16 flags(mFlags);
+    Index64 size(mSize);
+    boost::scoped_array<char> compressedBuffer;
+    int compressedBytes = 0;
+
+    if (mIsUniform)
+    {
+        flags |= WRITEUNIFORM;
+    }
+    else if (this->isCompressed())
+    {
+        flags |= WRITEMEMCOMPRESS;
+    }
+    else if (io::getDataCompression(os) & io::COMPRESS_BLOSC)
+    {
+        const char* charBuffer = reinterpret_cast<const char*>(mData);
+        const size_t typeSize = sizeof(typename Codec_::StorageType);
+        const int inBytes = int(mSize * sizeof(StorageType));
+        compressedBuffer.reset(compress(charBuffer, typeSize, inBytes, compressedBytes));
+        if (compressedBuffer)   flags |= WRITEDISKCOMPRESS;
+    }
+
+    Index64 bytes = /*flags*/ sizeof(Int16) + /*size*/ sizeof(Index64);
+
+    bytes += compressedBuffer ? compressedBytes : this->arrayMemUsage();
+
+    // write the header (byte count, flags, size) followed by the data buffer
+
+    os.write(reinterpret_cast<const char*>(&bytes), sizeof(Index64));
+    os.write(reinterpret_cast<const char*>(&flags), sizeof(Int16));
+    os.write(reinterpret_cast<const char*>(&size), sizeof(Index64));
+
+    if (compressedBuffer)   os.write(reinterpret_cast<const char*>(compressedBuffer.get()), compressedBytes);
+    else                    os.write(reinterpret_cast<const char*>(mData), this->arrayMemUsage());
+}
+
+
+template<typename ValueType_, typename Codec_>
+void
+TypedAttributeArray<ValueType_, Codec_>::doLoadUnsafe() const
+{
+    using attribute_compression::decompress;
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    if (!(this->isOutOfCore()))     return;
+
+    // this function expects the mutex to already be locked
+    // (state is modified through a const_cast because this method is const)
+    TypedAttributeArray<ValueType_, Codec_>* self = const_cast<TypedAttributeArray<ValueType_, Codec_>*>(this);
+
+    assert(self->mFileInfo);
+    assert(self->mFileInfo->mapping.get() != NULL);
+
+    FileInfo& info = *(self->mFileInfo);
+    // read through a new stream over the file mapping, starting at the recorded buffer position
+    boost::shared_ptr<std::streambuf> buf = info.mapping->createBuffer();
+    std::istream is(buf.get());
+
+    const Index64 bytes = info.bytes;
+
+    is.seekg(info.bufpos);
+
+    char* buffer = new char[bytes];
+    is.read(buffer, bytes);
+
+    // compressed on-disk
+
+    if (mFlags & WRITEDISKCOMPRESS) {
+
+        // decompress buffer (a null result keeps the buffer as read from the stream)
+
+        const int inBytes = int(mSize * sizeof(StorageType));
+        char* newBuffer = decompress(buffer, inBytes, /*cleanup=*/true);
+        if (newBuffer)  buffer = newBuffer;
+    }
+
+    // set data to buffer
+
+    self->mData = reinterpret_cast<StorageType*>(buffer);
+
+    // clear write and out-of-core flags, after which isOutOfCore() returns false
+
+    self->mFlags &= Int16(~WRITEDISKCOMPRESS & ~OUTOFCORE);
+#endif
+}
+
+
+/// Return an accessor bound to the unsafe get/set and the collapse/fill methods of this array type.
+template<typename ValueType_, typename Codec_>
+AttributeArray::AccessorBasePtr
+TypedAttributeArray<ValueType_, Codec_>::getAccessor() const
+{
+    typedef TypedAttributeArray<ValueType_, Codec_>     ArrayT;
+    typedef AttributeArray::Accessor<ValueType_>        AccessorT;
+
+    // Attribute handles ensure the data is uncompressed and in-core when they
+    // are constructed, which allows the faster 'unsafe' get and set to be used.
+    return AccessorBasePtr(new AccessorT(
+        &ArrayT::getUnsafe, &ArrayT::setUnsafe, &ArrayT::collapse, &ArrayT::fill));
+}
+
+
+/// Return @c true if @a other is the same array type and matches this array in size, uniformity and element values.
+template<typename ValueType_, typename Codec_>
+bool TypedAttributeArray<ValueType_, Codec_>::isEqual(const AttributeArray& other) const
+{
+    const TypedAttributeArray<ValueType_, Codec_>* const otherT = dynamic_cast<const TypedAttributeArray<ValueType_, Codec_>* >(&other);
+    if(!otherT) return false;
+    if(this->mSize != otherT->mSize ||
+       this->mIsUniform != otherT->mIsUniform ||
+       *this->sTypeName != *otherT->sTypeName) return false;
+    // load both arrays, as delay-loaded data is only read from disk by doLoad()
+    this->doLoad();
+    otherT->doLoad();
+    const StorageType *target = this->mData, *source = otherT->mData;
+    if (!target && !source) return true;
+    if (!target || !source) return false;
+    Index n = this->mIsUniform ? 1 : mSize;
+    while (n && math::isExactlyEqual(*target++, *source++)) --n;
+    return n == 0;
+}
+
+////////////////////////////////////////
+
+// AttributeHandle implementation
+
+template <typename T>
+typename AttributeHandle<T>::Ptr
+AttributeHandle<T>::create(const AttributeArray& array, const bool preserveCompression)
+{
+    // within the member definition Ptr already denotes AttributeHandle<T>::Ptr
+    return Ptr(new AttributeHandle<T>(array, preserveCompression));
+}
+
+template <typename T>
+AttributeHandle<T>::AttributeHandle(const AttributeArray& array, const bool preserveCompression)
+    : mArray(&array)
+{
+    // load data if delay-loaded
+
+    mArray->loadData();
+
+    // if array is compressed and preserve compression is true, copy and decompress
+    // into a local copy that is destroyed with handle to maintain thread-safety
+    // (otherwise the array passed in is itself decompressed through a const_cast)
+    if (array.isCompressed())
+    {
+        if (preserveCompression) {
+            mLocalArray = array.copyUncompressed();
+            mLocalArray->decompress();
+            mArray = mLocalArray.get();
+        }
+        else {
+            const_cast<AttributeArray*>(mArray)->decompress();
+        }
+    }
+
+    // bind getter, setter, collapser and filler methods
+
+    AttributeArray::AccessorBasePtr accessor = mArray->getAccessor();
+    assert(accessor);
+
+    AttributeArray::Accessor<T>* typedAccessor = static_cast<AttributeArray::Accessor<T>*>(accessor.get());
+    // NOTE(review): static_cast never yields null for a non-null pointer, so this check cannot detect mis-matching types
+    if (!typedAccessor) {
+        OPENVDB_THROW(RuntimeError, "Cannot bind AttributeHandle due to mis-matching types.");
+    }
+
+    mGetter = typedAccessor->mGetter;
+    mSetter = typedAccessor->mSetter;
+    mCollapser = typedAccessor->mCollapser;
+    mFiller = typedAccessor->mFiller;
+}
+
+/// Return the value at index @a n through the getter bound at construction.
+template <typename T>
+T AttributeHandle<T>::get(Index n) const
+{
+    return mGetter(mArray, n);
+}
+/// Return the uniform state of the array this handle refers to.
+template <typename T>
+bool AttributeHandle<T>::isUniform() const
+{
+    return mArray->isUniform();
+}
+
+////////////////////////////////////////
+
+// AttributeWriteHandle implementation
+/// Create a new write handle for @a array.
+template <typename T>
+typename AttributeWriteHandle<T>::Ptr
+AttributeWriteHandle<T>::create(AttributeArray& array)
+{
+    return typename AttributeWriteHandle<T>::Ptr(new AttributeWriteHandle<T>(array));
+}
+/// Construct on the base handle without preserving compression, so the handle refers to @a array itself.
+template <typename T>
+AttributeWriteHandle<T>::AttributeWriteHandle(AttributeArray& array)
+    : AttributeHandle<T>(array, /*preserveCompression = */ false) { }
+/// Set the value at index @a n through the setter bound at construction.
+template <typename T>
+void AttributeWriteHandle<T>::set(Index n, const T& value)
+{
+    this->mSetter(const_cast<AttributeArray*>(this->mArray), n, value);
+}
+/// Forward to AttributeArray::expand() on the underlying array.
+template <typename T>
+void AttributeWriteHandle<T>::expand(const bool fill)
+{
+    const_cast<AttributeArray*>(this->mArray)->expand(fill);
+}
+/// Forward to AttributeArray::collapse() on the underlying array.
+template <typename T>
+void AttributeWriteHandle<T>::collapse()
+{
+    const_cast<AttributeArray*>(this->mArray)->collapse();
+}
+/// Forward to AttributeArray::compact() on the underlying array and return its result.
+template <typename T>
+bool AttributeWriteHandle<T>::compact()
+{
+    return const_cast<AttributeArray*>(this->mArray)->compact();
+}
+/// Collapse the underlying array to @a uniformValue through the collapser bound at construction.
+template <typename T>
+void AttributeWriteHandle<T>::collapse(const T& uniformValue)
+{
+    this->mCollapser(const_cast<AttributeArray*>(this->mArray), uniformValue);
+}
+/// Fill the underlying array with @a value through the filler bound at construction.
+template <typename T>
+void AttributeWriteHandle<T>::fill(const T& value)
+{
+    this->mFiller(const_cast<AttributeArray*>(this->mArray), value);
+}
+
+} // namespace tools
+
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+
+#endif // OPENVDB_TOOLS_ATTRIBUTE_ARRAY_HAS_BEEN_INCLUDED
+
+
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_points/tools/AttributeGroup.h b/nuparu/include/openvdb_points/tools/AttributeGroup.h
new file mode 100644
index 00000000..830be73a
--- /dev/null
+++ b/nuparu/include/openvdb_points/tools/AttributeGroup.h
@@ -0,0 +1,209 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of Double Negative Visual Effects nor the names
+// of its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file AttributeGroup.h
+///
+/// @authors Dan Bailey
+///
+/// @brief  Attribute Group access and filtering for iteration.
+///
+
+
+#ifndef OPENVDB_TOOLS_ATTRIBUTE_GROUP_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_ATTRIBUTE_GROUP_HAS_BEEN_INCLUDED
+
+#include <openvdb_points/tools/AttributeArray.h>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+
+typedef uint8_t GroupType;
+
+
+////////////////////////////////////////
+
+
+class GroupAttributeArray : public TypedAttributeArray<GroupType, NullAttributeCodec<GroupType> >
+{
+public:
+    /// Default constructor, always constructs a uniform attribute.
+    explicit GroupAttributeArray(   size_t n = 1,
+                                    const ValueType& uniformValue = zeroVal<ValueType>());
+    /// Deep copy constructor (optionally decompress during copy).
+    GroupAttributeArray(const GroupAttributeArray& array,
+                        const bool decompress = false);
+
+    /// Cast an AttributeArray to GroupAttributeArray, throwing TypeError on a type mismatch
+    static GroupAttributeArray& cast(AttributeArray& attributeArray);
+
+    /// Cast a const AttributeArray to GroupAttributeArray, throwing TypeError on a type mismatch
+    static const GroupAttributeArray& cast(const AttributeArray& attributeArray);
+
+    /// Return @c true if the AttributeArray provided is a GroupAttributeArray with its group flag set
+    static bool isGroup(const AttributeArray& attributeArray);
+
+    /// @brief Specify whether this attribute is for tracking group membership
+    /// @note  Attributes are not group attributes by default.
+    void setGroup(bool state);
+    /// Return @c true if this attribute is for tracking groups (the GROUP flag is set)
+    bool isGroup() const { return bool(mFlags & GROUP); }
+
+}; // class GroupAttributeArray
+
+
+inline GroupAttributeArray&
+GroupAttributeArray::cast(AttributeArray& attributeArray)
+{
+    // the downcast is only valid when the array really is a GroupAttributeArray
+    const bool isGroupArray = attributeArray.isType<GroupAttributeArray>();
+    if (isGroupArray)   return static_cast<GroupAttributeArray&>(attributeArray);
+    OPENVDB_THROW(TypeError, "Invalid Attribute Type");
+}
+
+
+inline const GroupAttributeArray&
+GroupAttributeArray::cast(const AttributeArray& attributeArray)
+{
+    // the downcast is only valid when the array really is a GroupAttributeArray
+    const bool isGroupArray = attributeArray.isType<GroupAttributeArray>();
+    if (isGroupArray)   return static_cast<const GroupAttributeArray&>(attributeArray);
+    OPENVDB_THROW(TypeError, "Invalid Attribute Type");
+}
+
+
+inline bool
+GroupAttributeArray::isGroup(const AttributeArray& attributeArray)
+{
+    // the type test comes first, cast() throws for any other array type
+    return attributeArray.isType<GroupAttributeArray>() &&
+        GroupAttributeArray::cast(attributeArray).isGroup();
+}
+
+
+////////////////////////////////////////
+
+
+class GroupHandle
+{
+public:
+    /// Tag type that distinguishes a bitmask from an offset on construction
+    struct BitMask { };
+
+    typedef std::pair<size_t, uint8_t> GroupIndex;
+    /// Construct from a group @a offset or, when tagged with BitMask, from a @a bitMask
+    GroupHandle(const GroupAttributeArray& array, const GroupType& offset);
+    GroupHandle(const GroupAttributeArray& array, const GroupType& bitMask, BitMask);
+
+    size_t size() const { return mArray.size(); }
+    bool isUniform() const { return mArray.isUniform(); }
+    /// NOTE(review): presumably tests mBitMask against element @a n - defined out of line
+    bool get(Index n) const;
+
+protected:
+    const GroupAttributeArray& mArray; // held by reference
+    const GroupType mBitMask;
+}; // class GroupHandle
+
+
+////////////////////////////////////////
+
+
+class GroupWriteHandle : public GroupHandle
+{
+public:
+    /// Construct a write handle for the group at @a offset in @a array
+    GroupWriteHandle(GroupAttributeArray& array, const GroupType& offset);
+    /// NOTE(review): presumably sets the membership of index @a n in this group - defined out of line
+    void set(Index n, bool on);
+
+    /// @brief Set membership for the whole array and attempt to collapse
+    ///
+    /// @param on True or false for inclusion in group
+    ///
+    /// @note This method guarantees that all attributes will have group membership
+    /// changed according to the input bool, however compaction will not be performed
+    /// if other groups that share the same underlying array are non-uniform.
+    /// @return @c true if the group array ends up being uniform.
+    bool collapse(bool on);
+
+}; // class GroupWriteHandle
+
+
+////////////////////////////////////////
+
+
+/// Index filtering on group membership
+class GroupFilter
+{
+public:
+    /// Construction data for create(): the name passed to the leaf's groupHandle()
+    struct Data
+    {
+        Data(const Name& _attribute)
+            : attribute(_attribute) { }
+        const Name attribute;
+    };
+    /// Construct a filter that stores its own copy of @a handle
+    GroupFilter(const GroupHandle& handle)
+        : mHandle(handle) { }
+    /// Build a filter from the group handle that @a leaf returns for data.attribute
+    template <typename LeafT>
+    static GroupFilter create(const LeafT& leaf, const Data& data) {
+        return GroupFilter(leaf.groupHandle(data.attribute));
+    }
+    /// Return the handle's value at the index that @a iter dereferences to
+    template <typename IterT>
+    bool valid(const IterT& iter) const {
+        return mHandle.get(*iter);
+    }
+
+private:
+    const GroupHandle mHandle;
+}; // class GroupFilter
+
+
+////////////////////////////////////////
+
+
+} // namespace tools
+
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+
+#endif // OPENVDB_TOOLS_ATTRIBUTE_GROUP_HAS_BEEN_INCLUDED
+
+
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_points/tools/AttributeSet.h b/nuparu/include/openvdb_points/tools/AttributeSet.h
new file mode 100644
index 00000000..3bb5aafd
--- /dev/null
+++ b/nuparu/include/openvdb_points/tools/AttributeSet.h
@@ -0,0 +1,405 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of Double Negative Visual Effects nor the names
+// of its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file AttributeSet.h
+///
+/// @authors Dan Bailey, Mihai Alden, Peter Cucka
+///
+/// @brief  Set of Attribute Arrays which tracks metadata about each array.
+///
+
+
+#ifndef OPENVDB_TOOLS_ATTRIBUTE_SET_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_ATTRIBUTE_SET_HAS_BEEN_INCLUDED
+
+#include <openvdb/version.h>
+#include <openvdb/metadata/MetaMap.h>
+
+#include <boost/integer_traits.hpp> // integer_traits
+#include <boost/shared_ptr.hpp> // shared_ptr
+
+#include <vector>
+
+#include <openvdb_points/tools/AttributeArray.h>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+
+////////////////////////////////////////
+
+
+/// Ordered collection of uniquely-named attribute arrays
+class AttributeSet
+{
+public:
+    enum { INVALID_POS = boost::integer_traits<size_t>::const_max };
+
+    typedef boost::shared_ptr<AttributeSet> Ptr;
+    typedef boost::shared_ptr<const AttributeSet> ConstPtr;
+
+    class Descriptor;
+
+    typedef boost::shared_ptr<Descriptor> DescriptorPtr;
+    typedef boost::shared_ptr<const Descriptor> DescriptorConstPtr;
+
+    //////////
+
+    struct Util
+    {
+        /// Attribute and type name pair.
+        struct NameAndType {
+            NameAndType(const std::string& n, const NamePair& t)
+                : name(n), type(t) {}
+            Name name;
+            NamePair type;
+        };
+
+        typedef std::vector<NameAndType> NameAndTypeVec;
+        typedef std::map<std::string, size_t> NameToPosMap;
+        typedef std::pair<size_t, uint8_t> GroupIndex;
+    };
+
+    //////////
+
+    AttributeSet();
+
+    /// Construct from the given descriptor
+    explicit AttributeSet(const DescriptorPtr&, size_t arrayLength = 1);
+
+    /// Shallow copy constructor, the descriptor and attribute arrays will be shared.
+    AttributeSet(const AttributeSet&);
+
+    //@{
+    /// @brief  Return a reference to this attribute set's descriptor, which might
+    ///         be shared with other sets.
+    Descriptor& descriptor() { return *mDescr; }
+    const Descriptor& descriptor() const { return *mDescr; }
+    //@}
+
+    /// @brief Return a pointer to this attribute set's descriptor, which might be
+    /// shared with other sets
+    DescriptorPtr descriptorPtr() const { return mDescr; }
+
+    /// Return the number of attributes in this set.
+    size_t size() const { return mAttrs.size(); }
+
+    /// Return the number of attributes that have @a flag set
+    size_t size(const uint16_t flag) const;
+
+    /// Return the number of bytes of memory used by this attribute set.
+    size_t memUsage() const;
+
+    /// @brief  Return the position of the attribute array whose name is @a name,
+    ///         or @c INVALID_POS if no match is found.
+    size_t find(const std::string& name) const;
+
+    /// @brief  Replace the attribute array whose name is @a name.
+    /// @return The position of the updated attribute array or @c INVALID_POS
+    ///         if the given name does not exist or if the replacement failed because
+    ///         the new array type does not comply with the descriptor.
+    size_t replace(const std::string& name, const AttributeArray::Ptr&);
+
+    /// @brief  Replace the attribute array stored at position @a pos in this container.
+    /// @return The position of the updated attribute array or @c INVALID_POS
+    ///         if replacement failed because the new array type does not comply with
+    ///         the descriptor.
+    size_t replace(size_t pos, const AttributeArray::Ptr&);
+
+    //@{
+    /// @brief  Return a pointer to the attribute array whose name is @a name or
+    ///         a null pointer if no match is found.
+    const AttributeArray* getConst(const std::string& name) const;
+    const AttributeArray* get(const std::string& name) const;
+    AttributeArray*       get(const std::string& name);
+    //@}
+
+    //@{
+    /// @brief  Return a pointer to the attribute array stored at position @a pos
+    ///         in this set.
+    const AttributeArray* getConst(size_t pos) const;
+    const AttributeArray* get(size_t pos) const;
+    AttributeArray*       get(size_t pos);
+    //@}
+
+    //@{
+    /// @brief Return the group offset from the name or index of the group
+    /// A group attribute array is a single byte (8-bit), each bit of which
+    /// can denote a group. The group offset is the position of the bit that
+    /// denotes the requested group if all group attribute arrays in the set
+    /// (and only attribute arrays marked as group) were to be laid out linearly
+    /// according to their order in the set.
+    size_t groupOffset(const Name& groupName) const;
+    size_t groupOffset(const Util::GroupIndex& index) const;
+    //@}
+
+    /// Return the group index from the name of the group
+    Util::GroupIndex groupIndex(const Name& groupName) const;
+    /// Return the group index from the offset of the group
+    /// @note see offset description for groupOffset()
+    Util::GroupIndex groupIndex(const size_t offset) const;
+
+    /// Create an iterator for iterating through point indices
+    IndexIter beginIndex() const;
+
+    /// Return true if the attribute array stored at position @a pos is shared.
+    bool isShared(size_t pos) const;
+    /// @brief  If the attribute array stored at position @a pos is shared,
+    ///         replace the array with a deep copy of itself that is not
+    ///         shared with anyone else.
+    void makeUnique(size_t pos);
+
+    /// Append attribute @a attribute (simple method)
+    AttributeArray::Ptr appendAttribute(const Util::NameAndType& attribute,
+                                        Metadata::Ptr defaultValue = Metadata::Ptr());
+
+    /// Append attribute @a attribute (descriptor-sharing)
+    /// Requires current descriptor to match @a expected
+    /// On append, current descriptor is replaced with @a replacement
+    AttributeArray::Ptr appendAttribute(const Util::NameAndType& attribute,
+                                        const Descriptor& expected, DescriptorPtr& replacement);
+
+    /// Drop attributes with @a pos indices (simple method)
+    /// Creates a new descriptor for this attribute set
+    void dropAttributes(const std::vector<size_t>& pos);
+
+    /// Drop attributes with @a pos indices (descriptor-sharing method)
+    /// Requires current descriptor to match @a expected
+    /// On drop, current descriptor is replaced with @a replacement
+    void dropAttributes(const std::vector<size_t>& pos,
+                        const Descriptor& expected, DescriptorPtr& replacement);
+
+    /// Reorder the attribute set to match a provided descriptor
+    /// Replaces own descriptor with @a replacement
+    void reorderAttributes(const DescriptorPtr& replacement);
+
+    /// Rename the attributes in this set to match a provided descriptor
+    /// Replaces own descriptor with @a replacement
+    void renameAttributes(const Descriptor& expected, DescriptorPtr& replacement);
+
+    /// Read the entire set from a stream.
+    void read(std::istream&);
+    /// Write the entire set to a stream.
+    void write(std::ostream&) const;
+
+    /// This will read the attribute descriptor from a stream, but no attribute data.
+    void readMetadata(std::istream&);
+    /// This will write the attribute descriptor to a stream, but no attribute data.
+    void writeMetadata(std::ostream&) const;
+
+    /// Read attribute data from a stream.
+    void readAttributes(std::istream&);
+    /// Write attribute data to a stream.
+    void writeAttributes(std::ostream&) const;
+
+    /// Compare the descriptors and attribute arrays on the attribute sets
+    /// Exit early if the descriptors do not match
+    bool operator==(const AttributeSet& other) const;
+    bool operator!=(const AttributeSet& other) const { return !this->operator==(other); }
+
+private:
+    /// Disallow assignment, since it wouldn't be obvious whether the copy is deep or shallow.
+    AttributeSet& operator=(const AttributeSet&);
+
+    typedef std::vector<AttributeArray::Ptr> AttrArrayVec;
+
+    DescriptorPtr mDescr;
+    AttrArrayVec  mAttrs;
+}; // class AttributeSet
+
+////////////////////////////////////////
+
+
+/// @brief  An immutable object that stores name, type and AttributeSet position
+///         for a constant collection of attribute arrays.
+/// @note   The attribute name is actually mutable, but the attribute type
+///         and position can not be changed after creation.
+class AttributeSet::Descriptor
+{
+public:
+    typedef boost::shared_ptr<Descriptor> Ptr;
+
+    typedef Util::NameAndType             NameAndType;
+    typedef Util::NameAndTypeVec          NameAndTypeVec;
+    typedef Util::GroupIndex              GroupIndex;
+    typedef Util::NameToPosMap            NameToPosMap;
+    typedef NameToPosMap::const_iterator  ConstIterator;
+
+    /// Utility struct to construct a NameAndType sequence by chaining add() calls.
+    struct Inserter {
+        NameAndTypeVec vec;
+        Inserter& add(const NameAndType& nameAndType) {
+            vec.push_back(nameAndType); return *this;
+        }
+        Inserter& add(const Name& name, const NamePair& type) {
+            vec.push_back(NameAndType(name, type)); return *this;
+        }
+        Inserter& add(const NameAndTypeVec& other) {
+            for (NameAndTypeVec::const_iterator it = other.begin(), itEnd = other.end(); it != itEnd; ++it) {
+                vec.push_back(NameAndType(it->name, it->type));
+            }
+            return *this;
+        }
+    };
+
+    //////////
+
+    Descriptor();
+
+    /// Copy constructor
+    Descriptor(const Descriptor&);
+
+    /// Create a new descriptor from the given attribute and type name pairs.
+    static Ptr create(const NameAndTypeVec&);
+
+    /// Create a new descriptor from the given attribute and type name pairs
+    /// and copy the group maps and metamap.
+    static Ptr create(const NameAndTypeVec&, const NameToPosMap&, const MetaMap&);
+
+    /// Create a new descriptor from a position attribute type and assumes "P" (for convenience).
+    static Ptr create(const NamePair&);
+    /// NOTE(review): presumably return modified copies, this descriptor is not changed (the methods are const)
+    Ptr duplicateAppend(const NameAndType& attribute) const;
+    Ptr duplicateAppend(const NameAndTypeVec& vec) const;
+    Ptr duplicateDrop(const std::vector<size_t>& pos) const;
+
+    /// Return the number of attributes in this descriptor.
+    size_t size() const { return mTypes.size(); }
+
+    /// Return the number of bytes of memory used by this descriptor.
+    size_t memUsage() const;
+
+    /// @brief  Return the position of the attribute array whose name is @a name,
+    ///         or @c INVALID_POS if no match is found.
+    size_t find(const std::string& name) const;
+
+    /// Rename an attribute array
+    size_t rename(const std::string& fromName, const std::string& toName);
+
+    /// Return the value type name of the attribute array at position @a pos.
+    const Name& valueType(size_t pos) const;
+    /// Return the type name pair of the attribute array at position @a pos.
+    const NamePair& type(size_t pos) const;
+
+    /// Retrieve metadata map
+    MetaMap& getMetadata();
+    const MetaMap& getMetadata() const;
+
+    /// Return true if the attribute has a default value
+    bool hasDefaultValue(const Name& name) const;
+    /// Get a default value for an existing attribute
+    template <typename ValueType>
+    ValueType getDefaultValue(const Name& name) const;
+    /// Set a default value for an existing attribute
+    void setDefaultValue(const Name& name, const Metadata& defaultValue);
+    /// Remove the default value if it exists
+    void removeDefaultValue(const Name& name);
+    /// Prune any default values for which the key is no longer present
+    void pruneUnusedDefaultValues();
+
+    /// Return true if this descriptor is equal to the given one.
+    bool operator==(const Descriptor&) const;
+    /// Return true if this descriptor is not equal to the given one.
+    bool operator!=(const Descriptor& rhs) const { return !this->operator==(rhs); }
+    /// Return true if this descriptor contains the same attributes
+    /// as the given descriptor, ignoring attribute order
+    bool hasSameAttributes(const Descriptor& rhs) const;
+
+    /// Return a reference to the name-to-position map.
+    const NameToPosMap& map() const { return mNameMap; }
+    /// Return a reference to the name-to-position group map.
+    const NameToPosMap& groupMap() const { return mGroupMap; }
+
+    /// Append to a vector of names and types from this Descriptor in position order
+    void appendTo(NameAndTypeVec& attrs) const;
+
+    /// Return @c true if group exists
+    bool hasGroup(const Name& group) const;
+    /// Define a group name to offset mapping
+    void setGroup(const Name& group, const size_t offset);
+    /// Drop any mapping keyed by group name
+    void dropGroup(const Name& group);
+    /// Clear all groups
+    void clearGroups();
+
+    /// Return a unique name for an attribute array based on given name
+    const Name uniqueName(const Name& name) const;
+
+    /// Serialize this descriptor to the given stream.
+    void write(std::ostream&) const;
+    /// Unserialize this descriptor from the given stream.
+    void read(std::istream&);
+
+private:
+    size_t insert(const std::string& name, const NamePair& typeName);
+    NameToPosMap                mNameMap;
+    std::vector<NamePair>       mTypes;
+    NameToPosMap                mGroupMap;
+    MetaMap                     mMetadata;
+}; // class Descriptor
+
+
+/// Return the default value stored for attribute @a name, or zeroVal<ValueType>() if none is found.
+/// @throw LookupError if this descriptor has no attribute called @a name.
+template <typename ValueType>
+ValueType AttributeSet::Descriptor::getDefaultValue(const Name& name) const
+{
+    typedef typename TypedMetadata<ValueType>::ConstPtr MetadataPtr;
+
+    if (find(name) == INVALID_POS) {
+        OPENVDB_THROW(LookupError, "Cannot find attribute name to get default value.");
+    }
+
+    // default values are looked up in the descriptor metadata under the key "default:<name>"
+    std::stringstream ss;
+    ss << "default:" << name;
+
+    MetadataPtr metadata = mMetadata.getMetadata<TypedMetadata<ValueType> >(ss.str());
+    if (metadata)   return metadata->value();
+
+    return zeroVal<ValueType>();
+}
+
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+
+#endif // OPENVDB_TOOLS_ATTRIBUTE_SET_HAS_BEEN_INCLUDED
+
+
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+
diff --git a/nuparu/include/openvdb_points/tools/IndexFilter.h b/nuparu/include/openvdb_points/tools/IndexFilter.h
new file mode 100644
index 00000000..4e45a5c6
--- /dev/null
+++ b/nuparu/include/openvdb_points/tools/IndexFilter.h
@@ -0,0 +1,161 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of Double Negative Visual Effects nor the names
+// of its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file IndexFilter.h
+///
+/// @authors Dan Bailey
+///
+/// @brief  Index filters primarily designed to be used with a FilterIndexIter.
+///
+
+
+#ifndef OPENVDB_TOOLS_INDEX_FILTER_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_INDEX_FILTER_HAS_BEEN_INCLUDED
+
+#include <openvdb/version.h>
+#include <openvdb/Types.h>
+
+#include <openvdb/math/Transform.h>
+
+#include <openvdb_points/tools/IndexIterator.h>
+#include <openvdb_points/tools/AttributeArray.h>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+
+////////////////////////////////////////
+
+
+/// @brief Index filter that accepts indices at random, using one seeded generator per leaf
+template <typename RandGenT>
+class RandomLeafFilter
+{
+public:
+    typedef std::map<openvdb::Coord, Index64> LeafSeedMap;
+    typedef boost::uniform_01<RandGenT> Distribution;
+    typedef typename Distribution::result_type ResultT;
+
+    /// Shared settings: the acceptance threshold and the leaf origin -> seed lookup.
+    struct Data
+    {
+        Data(const ResultT _factor, const LeafSeedMap& _leafSeedMap)
+            : factor(_factor), leafSeedMap(_leafSeedMap) { }
+        const ResultT factor;
+        const LeafSeedMap& leafSeedMap; // held by reference, the map itself is not copied
+    };
+
+    RandomLeafFilter(const Data& data, const unsigned int seed)
+        : mData(data)
+        , mDistribution(RandGenT(seed)) { }
+
+    /// Draw the next random number; the cast names Distribution so that any RandGenT compiles.
+    inline ResultT next() const { return const_cast<Distribution&>(mDistribution)(); }
+
+    /// Build the filter for @a leaf; throws KeyError when leaf.origin() has no seed entry.
+    template <typename LeafT>
+    static RandomLeafFilter create(const LeafT& leaf, const Data& data) {
+        const LeafSeedMap::const_iterator it = data.leafSeedMap.find(leaf.origin());
+        if (it == data.leafSeedMap.end()) {
+            OPENVDB_THROW(openvdb::KeyError, "Cannot find leaf origin in offset map for random filter");
+        }
+        return RandomLeafFilter(data, (unsigned int) it->second);
+    }
+
+    /// Accept the current index when the next random draw is below the factor.
+    template <typename IterT>
+    bool valid(const IterT&) const { return next() < mData.factor; }
+
+private:
+    const Data mData;
+    Distribution mDistribution;
+}; // class RandomLeafFilter
+
+
+/// @brief Index filter that accepts points lying inside a world-space bounding box
+class BBoxFilter
+{
+public:
+    /// Copy of the transform plus the box converted once from world space to index space.
+    struct Data
+    {
+        Data(const openvdb::math::Transform& _transform,
+             const openvdb::BBoxd& _bboxWS)
+            : transform(_transform)
+            , bbox(transform.worldToIndex(_bboxWS)) { } // uses the member, which is declared first
+        const openvdb::math::Transform transform;
+        const openvdb::BBoxd bbox;
+    };
+
+    BBoxFilter( const Data& data,
+                const AttributeHandle<openvdb::Vec3f>::Ptr& positionHandle)
+        : mData(data)
+        , mPositionHandle(positionHandle) { }
+
+    /// Build the filter for @a leaf, reading positions from the leaf's "P" attribute array.
+    template <typename LeafT>
+    static BBoxFilter create(const LeafT& leaf, const Data& data) {
+        return BBoxFilter(data, AttributeHandle<openvdb::Vec3f>::create(leaf.constAttributeArray("P")));
+    }
+
+    /// Return @c true if the point at the iterator's current index lies inside the box.
+    template <typename IterT>
+    bool valid(const IterT& iter) const {
+        // Stay in double precision (the box is a BBoxd): float loses sub-voxel offsets on large coordinates
+        const openvdb::Vec3d voxelIndexSpace = iter.getCoord().asVec3d();
+        // Retrieve point position in voxel space
+        const openvdb::Vec3f pointVoxelSpace = mPositionHandle->get(*iter);
+
+        // Compute point position in index space
+        return mData.bbox.isInside(voxelIndexSpace + openvdb::Vec3d(pointVoxelSpace));
+    }
+
+private:
+    const Data mData;
+    const AttributeHandle<openvdb::Vec3f>::Ptr mPositionHandle;
+}; // class BBoxFilter
+
+
+////////////////////////////////////////
+
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+
+#endif // OPENVDB_TOOLS_INDEX_FILTER_HAS_BEEN_INCLUDED
+
+
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_points/tools/IndexIterator.h b/nuparu/include/openvdb_points/tools/IndexIterator.h
new file mode 100644
index 00000000..06d70fe5
--- /dev/null
+++ b/nuparu/include/openvdb_points/tools/IndexIterator.h
@@ -0,0 +1,347 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of Double Negative Visual Effects nor the names
+// of its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @file IndexIterator.h
+///
+/// @authors Dan Bailey
+///
+/// @brief  Index Iterators.
+///
+
+
+#ifndef OPENVDB_TOOLS_INDEX_ITERATOR_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_INDEX_ITERATOR_HAS_BEEN_INCLUDED
+
+#include <openvdb/version.h>
+#include <openvdb/Types.h>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+
+/// @brief Count up the number of times the iterator can iterate
+///
+/// @param iter the iterator.
+///
+/// @note counting by iteration is only performed where a dynamic filter is in use.
+template <typename IterT>
+inline Index64 iterCount(const IterT& iter);
+
+
+////////////////////////////////////////
+
+
+/// @brief Forward iterator that walks the array indices from its current item up to, but excluding, end
+class IndexIter
+{
+public:
+    /// Default-construct an exhausted iterator (item and end both zero).
+    IndexIter(): mEnd(0), mItem(0) { }
+    /// Construct an iterator positioned at @a item that is exhausted once it reaches @a end.
+    IndexIter(Index32 item, Index32 end): mEnd(end), mItem(item) { }
+    /// Copy the position and end marker of @a other.
+    IndexIter(const IndexIter& other): mEnd(other.mEnd), mItem(other.mItem) { }
+
+    /// @brief Return the end marker, i.e. the first index that is no longer visited.
+    inline Index32 end() const { return mEnd; }
+
+    /// @brief Move the iterator to @a item and replace the end marker with @a end.
+    inline void reset(Index32 item, Index32 end) {
+        mEnd = end;
+        mItem = item;
+    }
+
+    /// @brief Return the index at the current position.
+    inline Index32 operator*() { return mItem; }
+    inline Index32 operator*() const { return mItem; }
+
+    /// @brief Return @c true while the current index is still below the end marker.
+    inline bool test() const { return mItem < mEnd; }
+    inline operator bool() const { return mItem < mEnd; }
+
+    /// @brief Step to the following index (prefix form).
+    inline IndexIter& operator++() {
+        ++mItem;
+        return *this;
+    }
+
+    /// @brief Step to the following index (postfix form); returns the pre-increment state.
+    inline IndexIter operator++(int /*dummy*/) {
+        IndexIter previous(*this);
+        ++mItem;
+        return previous;
+    }
+
+    /// @brief Step to the following index and report whether the iterator is still valid.
+    inline bool next() { ++mItem; return mItem < mEnd; }
+    inline bool increment() { ++mItem; return mItem < mEnd; }
+
+    /// Coordinates are not available on a plain index iterator, so both overloads always throw.
+    inline Coord getCoord() const { OPENVDB_THROW(RuntimeError, "IndexIter does not provide a valid Coord, use a ValueIndexIter instead."); }
+    inline void getCoord(Coord&) const { OPENVDB_THROW(RuntimeError, "IndexIter does not provide a valid Coord, use a ValueIndexIter instead."); }
+
+    /// @brief Two iterators compare equal when they point at the same index (end markers are ignored).
+    inline bool operator==(const IndexIter& other) const { return other.mItem == mItem; }
+    inline bool operator!=(const IndexIter& other) const { return other.mItem != mItem; }
+
+private:
+    Index32 mEnd, mItem;
+}; // class IndexIter
+
+
+/// @brief A forward iterator over array indices from a value iterator (such as ValueOnCIter)
+template <typename ValueIterT>
+class ValueIndexIter
+{
+public:
+    /// Copy @a iter and position on the first index; each value is the end index of its range.
+    ValueIndexIter(ValueIterT& iter)
+        : mIndexIter(), mIter(iter), mParent(mIter.parent())
+    {
+        if (mIter) {
+            Index32 start = mIter.offset() > 0 ? Index32(mParent.getValue(mIter.offset() - 1)) : Index32(0);
+            mIndexIter.reset(start, *mIter);
+            if (!mIndexIter.test())   this->operator++(); // skip leading empty ranges
+        }
+    }
+    ValueIndexIter(const ValueIndexIter& other)
+        : mIndexIter(other.mIndexIter), mIter(other.mIter), mParent(other.mParent) { }
+
+    /// @brief Return the end index of the range currently being traversed.
+    inline Index32 end() const { return mIndexIter.end(); }
+    /// @brief Reset only the inner index range; the value iterator is left where it is.
+    inline void reset(Index32 item, Index32 end) { mIndexIter.reset(item, end); }
+
+    /// @brief  Returns the item to which this iterator is currently pointing.
+    inline Index32 operator*() { return *mIndexIter; }
+    inline Index32 operator*() const { return *mIndexIter; }
+
+    /// @brief  Return @c true if this iterator is not yet exhausted.
+    inline operator bool() const { return mIter; }
+    inline bool test() const { return mIter; }
+
+    /// @brief  Advance to the next (valid) item (prefix).
+    inline ValueIndexIter& operator++() {
+        mIndexIter.next();
+        while (!mIndexIter.test() && mIter.next()) { // current range used up: try the next value
+            mIndexIter.reset(mParent.getValue(mIter.offset() - 1), *mIter);
+        }
+        return *this;
+    }
+
+    /// @brief  Advance to the next (valid) item (postfix).
+    inline ValueIndexIter operator++(int /*dummy*/) {
+        ValueIndexIter newIterator(*this); // snapshot must be of this type, not the inner IndexIter
+        this->operator++();
+        return newIterator;
+    }
+
+    /// @brief  Advance to the next (valid) item.
+    inline bool next() { this->operator++(); return this->test(); }
+    inline bool increment() { this->next(); return this->test(); }
+
+    /// Return the coordinates of the item to which the value iterator is pointing.
+    inline Coord getCoord() const { return mIter.getCoord(); }
+    /// Return in @a xyz the coordinates of the item to which the value iterator is pointing.
+    inline void getCoord(Coord& xyz) const { xyz = mIter.getCoord(); }
+
+    /// Return the const index iterator
+    inline const IndexIter& indexIter() const { return mIndexIter; }
+    /// Return the const value iterator
+    inline const ValueIterT& valueIter() const { return mIter; }
+
+    /// @brief Equality operators
+    bool operator==(const ValueIndexIter& other) const { return *mIndexIter == *other.mIndexIter; }
+    bool operator!=(const ValueIndexIter& other) const { return !this->operator==(other); }
+
+private:
+    IndexIter mIndexIter;
+    ValueIterT mIter;
+    const typename ValueIterT::NodeType& mParent;
+}; // ValueIndexIter
+
+
+/// IndexIterTraits maps a leaf value iterator type, i.e. one of
+/// Value{On,Off,All}CIter, onto the matching index iterator. Each specialization provides:
+/// - a begin(leaf) function that returns an index iterator or an index value
+///   iterator for the leaf provided,
+///   eg IndexIterTraits<Tree, Tree::LeafNodeType::ValueOnCIter>::begin(leaf) returns
+///   leaf.beginIndexOn()
+/// - an Iterator typedef that aliases to the index iterator for this value type
+template<typename TreeT, typename ValueT> struct IndexIterTraits;
+/// All values: Iterator is a plain IndexIter constructed from leaf.beginIndexAll().
+template<typename TreeT>
+struct IndexIterTraits<TreeT, typename TreeT::LeafNodeType::ValueAllCIter> {
+    typedef IndexIter Iterator;
+    static Iterator begin(const typename TreeT::LeafNodeType& leaf) {
+        return Iterator(leaf.beginIndexAll());
+    }
+};
+/// On values: Iterator is the leaf's IndexOnIter, constructed from leaf.beginIndexOn().
+template<typename TreeT>
+struct IndexIterTraits<TreeT, typename TreeT::LeafNodeType::ValueOnCIter> {
+    typedef typename TreeT::LeafNodeType::IndexOnIter Iterator;
+    static Iterator begin(const typename TreeT::LeafNodeType& leaf) {
+        return Iterator(leaf.beginIndexOn());
+    }
+};
+/// Off values: Iterator is the leaf's IndexOffIter, constructed from leaf.beginIndexOff().
+template<typename TreeT>
+struct IndexIterTraits<TreeT, typename TreeT::LeafNodeType::ValueOffCIter> {
+    typedef typename TreeT::LeafNodeType::IndexOffIter Iterator;
+    static Iterator begin(const typename TreeT::LeafNodeType& leaf) {
+        return Iterator(leaf.beginIndexOff());
+    }
+};
+
+
+/// @brief A forward index iterator that skips every index rejected by a filter
+/// IteratorT can be either IndexIter or ValueIndexIter (or some custom index iterator)
+/// FilterT should be a struct or class with a template valid() method that is evaluated
+/// against the wrapped iterator. A simple filter that only accepts even indices:
+///
+/// struct EvenIndexFilter
+/// {
+///     template <typename IterT>
+///     bool valid(const IterT& iter) const { return (*iter % 2) == 0; }
+/// };
+///
+/// Both the wrapped iterator and the filter are stored by value.
+template <typename IteratorT, typename FilterT>
+class FilterIndexIter
+{
+public:
+    /// Wrap copies of @a iterator and @a filter, skipping ahead to the first accepted index.
+    FilterIndexIter(const IteratorT& iterator, const FilterT& filter)
+        : mIterator(iterator), mFilter(filter)
+    {
+        if (mIterator)  this->reset(*mIterator, mIterator.end());
+    }
+    FilterIndexIter(const FilterIndexIter& other)
+        : mIterator(other.mIterator), mFilter(other.mFilter) { }
+
+    /// @brief Return the end index reported by the wrapped iterator.
+    Index32 end() const { return mIterator.end(); }
+
+    /// @brief Reset the wrapped iterator with @a begin and @a end, then skip rejected indices.
+    void reset(Index32 begin, Index32 end) {
+        mIterator.reset(begin, end);
+        for (; mIterator.test() && !mFilter.template valid<IteratorT>(mIterator); ++mIterator) { }
+    }
+
+    /// @brief Return the index at the current position.
+    Index32 operator*() { return *mIterator; }
+    Index32 operator*() const { return *mIterator; }
+
+    /// @brief Return @c true while the wrapped iterator is not exhausted.
+    bool test() const { return mIterator.test(); }
+    operator bool() const { return mIterator.test(); }
+
+    /// @brief Advance to the next accepted index, or to exhaustion (prefix).
+    FilterIndexIter& operator++() {
+        do {
+            ++mIterator;
+        } while (mIterator.test() && !mFilter.template valid<IteratorT>(mIterator));
+        return *this;
+    }
+
+    /// @brief Advance to the next accepted index (postfix), returning the previous state.
+    FilterIndexIter operator++(int /*dummy*/) {
+        FilterIndexIter previous(*this);
+        ++(*this);
+        return previous;
+    }
+
+    /// @brief Advance to the next accepted index and report whether one was found.
+    bool next() { ++(*this); return this->test(); }
+    bool increment() { this->next(); return this->test(); }
+
+    /// Read-only access to the wrapped index iterator
+    inline const IteratorT& indexIter() const { return mIterator; }
+    /// Read-only access to the stored filter
+    inline const FilterT& filter() const { return mFilter; }
+
+    /// @brief Iterators compare equal when their wrapped index iterators compare equal
+    bool operator==(const FilterIndexIter& other) const { return mIterator == other.mIterator; }
+    bool operator!=(const FilterIndexIter& other) const { return !(*this == other); }
+
+private:
+    IteratorT mIterator;
+    const FilterT mFilter;
+}; // class FilterIndexIter
+
+
+////////////////////////////////////////
+
+
+template <typename IterT>
+inline Index64 iterCount(const IterT& iter)
+{
+    Index64 total = 0; // generic fallback: step a copy of the iterator until it is exhausted
+    for (IterT walker(iter); walker; ) { ++walker; ++total; }
+    return total;
+}
+
+
+template <>
+inline Index64 iterCount(const IndexIter& iter)
+{
+    return iter.test() ? Index64(iter.end() - *iter) : Index64(0); // remaining indices, no stepping needed
+}
+
+
+template <typename T>
+inline Index64 iterCount(const ValueIndexIter<T>& iter)
+{
+    Index64 total = 0;
+    for (T voxel(iter.valueIter()); voxel; ++voxel) { // sum (end - start) over the remaining values
+        const Index32 start = voxel.offset() == 0 ? Index32(0) : Index32(voxel.parent().getValue(voxel.offset() - 1));
+        total += *voxel - start;
+    }
+    return total;
+}
+
+
+////////////////////////////////////////
+
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+
+#endif // OPENVDB_TOOLS_INDEX_ITERATOR_HAS_BEEN_INCLUDED
+
+
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_points/tools/PointAttribute.h b/nuparu/include/openvdb_points/tools/PointAttribute.h
new file mode 100644
index 00000000..5506b6d3
--- /dev/null
+++ b/nuparu/include/openvdb_points/tools/PointAttribute.h
@@ -0,0 +1,545 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of Double Negative Visual Effects nor the names
+// of its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @author Dan Bailey
+///
+/// @file PointAttribute.h
+///
+/// @brief  Point attribute manipulation in a VDB Point Grid.
+///
+
+
+#ifndef OPENVDB_TOOLS_POINT_ATTRIBUTE_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_POINT_ATTRIBUTE_HAS_BEEN_INCLUDED
+
+#include <openvdb/openvdb.h>
+
+#include <openvdb_points/tools/AttributeSet.h>
+#include <openvdb_points/tools/AttributeGroup.h>
+#include <openvdb_points/tools/PointDataGrid.h>
+
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+/// @brief Appends a new attribute to the VDB tree.
+/// @note Returns without changes if the tree has no leaves; throws KeyError if the name already exists.
+/// @param tree          the PointDataTree to be appended to.
+/// @param newAttribute  name and type for the new attribute.
+/// @param defaultValue  metadata default attribute value (stored in the new descriptor when non-null)
+/// @param hidden        mark attribute as hidden
+/// @param transient     mark attribute as transient
+/// @param group         mark attribute as group
+template <typename PointDataTree>
+inline void appendAttribute(PointDataTree& tree,
+                            const AttributeSet::Util::NameAndType& newAttribute,
+                            Metadata::Ptr defaultValue = Metadata::Ptr(),
+                            const bool hidden = false,
+                            const bool transient = false,
+                            const bool group = false);
+
+/// @brief Drops attributes from the VDB tree.
+/// @note Throws KeyError if @a indices contains the index of the mandatory position attribute "P".
+/// @param tree          the PointDataTree to be dropped from.
+/// @param indices       indices of the attributes to drop.
+template <typename PointDataTree>
+inline void dropAttributes( PointDataTree& tree,
+                            const std::vector<size_t>& indices);
+
+/// @brief Drops attributes from the VDB tree.
+/// @note Throws KeyError if any of the names does not exist in the first leaf's descriptor.
+/// @param tree          the PointDataTree to be dropped from.
+/// @param names         names of the attributes to drop.
+template <typename PointDataTree>
+inline void dropAttributes( PointDataTree& tree,
+                            const std::vector<Name>& names);
+
+/// @brief Drop one attribute from the VDB tree (convenience method).
+/// @note Forwards to dropAttributes() with a single-element index list.
+/// @param tree          the PointDataTree to be dropped from.
+/// @param index         index of the attribute to drop.
+template <typename PointDataTree>
+inline void dropAttribute(  PointDataTree& tree,
+                            const size_t& index);
+
+/// @brief Drop one attribute from the VDB tree (convenience method).
+/// @note Forwards to dropAttributes() with a single-element name list.
+/// @param tree          the PointDataTree to be dropped from.
+/// @param name          name of the attribute to drop.
+template <typename PointDataTree>
+inline void dropAttribute(  PointDataTree& tree,
+                            const Name& name);
+
+/// @brief Rename attributes in a VDB tree.
+///
+/// @param tree          the PointDataTree.
+/// @param oldNames      a list of old attribute names to rename from.
+/// @param newNames      a list of new attribute names to rename to.
+///
+/// @note Number of oldNames must match the number of newNames, otherwise ValueError is thrown.
+///
+/// @note Duplicate names and renaming group attributes are not allowed.
+template <typename PointDataTree>
+inline void renameAttributes(PointDataTree& tree,
+                            const std::vector<Name>& oldNames,
+                            const std::vector<Name>& newNames);
+
+/// @brief Rename an attribute in a VDB tree.
+///
+/// @param tree          the PointDataTree.
+/// @param oldName       the old attribute name to rename from.
+/// @param newName       the new attribute name to rename to.
+///
+/// @note newName must not already exist and must not be a group attribute.
+template <typename PointDataTree>
+inline void renameAttribute(PointDataTree& tree,
+                            const Name& oldName,
+                            const Name& newName);
+
+/// @brief Compact attributes in a VDB tree (if possible).
+///
+/// @param tree          the PointDataTree.
+template <typename PointDataTree>
+inline void compactAttributes(PointDataTree& tree);
+
+/// @brief Apply Blosc compression to one attribute in the VDB tree.
+///
+/// @param tree          the PointDataTree.
+/// @param name          name of the attribute to compress.
+template <typename PointDataTree>
+inline void bloscCompressAttribute( PointDataTree& tree,
+                                    const Name& name);
+
+////////////////////////////////////////
+
+
+namespace point_attribute_internal {
+
+/// @brief Leaf-range functor that appends one attribute to every leaf in the range
+template<typename PointDataTreeType>
+struct AppendAttributeOp {
+    typedef typename tree::LeafManager<PointDataTreeType>       LeafManagerT;
+    typedef typename LeafManagerT::LeafRange                    LeafRangeT;
+    typedef AttributeSet::Descriptor::NameAndType               NameAndType;
+
+    /// The tree, attribute and descriptor arguments are stored as references, not copies.
+    AppendAttributeOp(  PointDataTreeType& tree,
+                        const NameAndType& newAttribute,
+                        AttributeSet::DescriptorPtr& descriptor,
+                        const bool hidden = false,
+                        const bool transient = false,
+                        const bool group = false)
+        : mTree(tree), mNewAttribute(newAttribute), mDescriptor(descriptor)
+        , mHidden(hidden), mTransient(transient), mGroup(group) { }
+
+    /// Append the attribute to each leaf of @a range, passing the leaf's current descriptor
+    /// together with the shared new one, then apply the requested flags to the new array.
+    void operator()(const LeafRangeT& range) const {
+        typename LeafRangeT::Iterator leafIter = range.begin();
+        while (leafIter) {
+            // the descriptor the leaf holds right now is passed as the expected one
+            const AttributeSet::Descriptor& current = leafIter->attributeSet().descriptor();
+            AttributeArray::Ptr appended = leafIter->appendAttribute(mNewAttribute, current, mDescriptor);
+
+            if (mHidden) {
+                appended->setHidden(true);
+            }
+            if (mTransient) {
+                appended->setTransient(true);
+            }
+            if (mGroup)     GroupAttributeArray::cast(*appended).setGroup(true);
+            ++leafIter;
+        }
+    }
+
+    //////////
+
+    PointDataTreeType&              mTree;
+    const NameAndType&              mNewAttribute;
+    AttributeSet::DescriptorPtr&    mDescriptor;
+    const bool                      mHidden;
+    const bool                      mTransient;
+    const bool                      mGroup;
+}; // class AppendAttributeOp
+
+
+////////////////////////////////////////
+
+
+/// @brief Leaf-range functor that drops the attributes at the given indices from every leaf
+template<typename PointDataTreeType>
+struct DropAttributesOp {
+    typedef typename tree::LeafManager<PointDataTreeType>       LeafManagerT;
+    typedef typename LeafManagerT::LeafRange                    LeafRangeT;
+    typedef std::vector<size_t>                                 Indices;
+
+    /// The tree, index list and descriptor arguments are stored as references, not copies.
+    DropAttributesOp(   PointDataTreeType& tree,
+                        const Indices& indices,
+                        AttributeSet::DescriptorPtr& descriptor)
+        : mTree(tree), mIndices(indices), mDescriptor(descriptor) { }
+
+    /// Drop the attributes from each leaf of @a range, handing over the leaf's current
+    /// descriptor as the expected one together with the shared new descriptor.
+    void operator()(const LeafRangeT& range) const {
+        typename LeafRangeT::Iterator leafIter = range.begin();
+        while (leafIter) {
+            const AttributeSet::Descriptor& current = leafIter->attributeSet().descriptor();
+            leafIter->dropAttributes(mIndices, current, mDescriptor);
+            ++leafIter;
+        }
+    }
+
+    //////////
+
+    PointDataTreeType&              mTree;
+    const Indices&                  mIndices;
+    AttributeSet::DescriptorPtr&    mDescriptor;
+}; // class DropAttributesOp
+
+
+////////////////////////////////////////
+
+
+/// @brief Leaf-range functor that calls compactAttributes() on every leaf in the range
+template<typename PointDataTreeType>
+struct CompactAttributesOp {
+    typedef typename tree::LeafManager<PointDataTreeType>       LeafManagerT;
+    typedef typename LeafManagerT::LeafRange                    LeafRangeT;
+
+    CompactAttributesOp() { }
+
+    void operator()(const LeafRangeT& range) const {
+        // visit each leaf of the range in turn
+        typename LeafRangeT::Iterator leafIter = range.begin();
+        while (leafIter) { leafIter->compactAttributes(); ++leafIter; }
+    }
+}; // class CompactAttributesOp
+
+
+////////////////////////////////////////
+
+
+/// @brief Leaf-range functor that compresses the attribute arrays at the given indices in every leaf
+template<typename PointDataTreeType>
+struct BloscCompressAttributesOp {
+    typedef typename tree::LeafManager<PointDataTreeType>       LeafManagerT;
+    typedef typename LeafManagerT::LeafRange                    LeafRangeT;
+    typedef std::vector<size_t>                                 Indices;
+
+    /// The tree and the index list are stored as references, not copies.
+    BloscCompressAttributesOp(  PointDataTreeType& tree,
+                                const Indices& indices)
+        : mTree(tree), mIndices(indices) { }
+
+    /// For each leaf of @a range, look up the attribute array at every listed index
+    /// and call compress() on it.
+    void operator()(const LeafRangeT& range) const {
+        const size_t indexCount = mIndices.size();
+        for (typename LeafRangeT::Iterator leafIter = range.begin(); leafIter; ++leafIter) {
+            for (size_t n = 0; n < indexCount; ++n) {
+                // each entry is an index into the leaf's attribute arrays
+                AttributeArray& array = leafIter->attributeArray(mIndices[n]);
+                array.compress();
+            }
+        }
+    }
+
+    //////////
+
+    PointDataTreeType&              mTree;
+    const Indices&                  mIndices;
+}; // class BloscCompressAttributesOp
+
+
+} // namespace point_attribute_internal
+
+
+////////////////////////////////////////
+
+
+/// Appends @a newAttribute to every leaf of @a tree. Returns without doing anything when
+/// the tree has no leaves and throws KeyError when the first leaf's descriptor already
+/// contains an attribute of that name.
+template <typename PointDataTree>
+inline void appendAttribute(PointDataTree& tree,
+                            const AttributeSet::Util::NameAndType& newAttribute,
+                            Metadata::Ptr defaultValue,
+                            const bool hidden, const bool transient, const bool group)
+{
+    typedef typename tree::LeafManager<PointDataTree>             LeafManagerT;
+    typedef AttributeSet::Util::NameAndTypeVec                    NameAndTypeVec;
+    typedef AttributeSet::Descriptor                              Descriptor;
+
+    using point_attribute_internal::AppendAttributeOp;
+
+    // a tree without leaves has no attribute sets to extend
+    typename PointDataTree::LeafCIter firstLeaf = tree.cbeginLeaf();
+    if (!firstLeaf)  return;
+
+    // the first leaf's descriptor is the starting point for the new one
+    const Descriptor& oldDescriptor = firstLeaf->attributeSet().descriptor();
+
+    // attribute names have to stay unique
+    if (oldDescriptor.find(newAttribute.name) != AttributeSet::INVALID_POS) {
+        OPENVDB_THROW(KeyError, "Cannot append an attribute with a non-unique name - " << newAttribute.name << ".");
+    }
+
+    // duplicate the descriptor with the new attribute appended
+    NameAndTypeVec appended;
+    appended.push_back(newAttribute);
+    Descriptor::Ptr newDescriptor = oldDescriptor.duplicateAppend(appended);
+
+    // a supplied default value is recorded on the new descriptor
+    if (defaultValue) {
+        newDescriptor->setDefaultValue(newAttribute.name, *defaultValue);
+    }
+
+    // append to every leaf in parallel, sharing the new descriptor
+    AppendAttributeOp<PointDataTree> appendOp(tree, newAttribute, newDescriptor, hidden, transient, group);
+    tbb::parallel_for(LeafManagerT(tree).leafRange(), appendOp);
+}
+
+
+////////////////////////////////////////
+
+
+/// Drops the attributes at @a indices from every leaf of @a tree. Returns without doing
+/// anything when the tree has no leaves; throws KeyError when the position attribute "P" is listed.
+template <typename PointDataTree>
+inline void dropAttributes( PointDataTree& tree,
+                            const std::vector<size_t>& indices)
+{
+    typedef typename tree::LeafManager<PointDataTree>       LeafManagerT;
+    typedef AttributeSet::Descriptor                        Descriptor;
+
+    using point_attribute_internal::DropAttributesOp;
+
+    typename PointDataTree::LeafCIter firstLeaf = tree.cbeginLeaf();
+    if (!firstLeaf)  return;
+
+    const Descriptor& oldDescriptor = firstLeaf->attributeSet().descriptor();
+
+    // the position attribute is mandatory, so refuse to drop it
+    const size_t positionIndex = oldDescriptor.find("P");
+    const bool hasPosition = positionIndex != AttributeSet::INVALID_POS;
+    if (hasPosition && std::find(indices.begin(), indices.end(), positionIndex) != indices.end()) {
+        OPENVDB_THROW(KeyError, "Cannot drop mandatory position attribute.");
+    }
+
+    // drop from every leaf in parallel, sharing a descriptor duplicated without the listed attributes
+    Descriptor::Ptr newDescriptor = oldDescriptor.duplicateDrop(indices);
+    DropAttributesOp<PointDataTree> dropOp(tree, indices, newDescriptor);
+    tbb::parallel_for(LeafManagerT(tree).leafRange(), dropOp);
+}
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataTree>
+inline void dropAttributes( PointDataTree& tree,
+                            const std::vector<Name>& names)
+{
+    // Name-based overload: resolves each name to its index in the first leaf's descriptor
+    // and forwards to the index-based dropAttributes(). No-op for a tree without leaf nodes.
+    typename PointDataTree::LeafCIter firstLeaf = tree.cbeginLeaf();
+    if (!firstLeaf)  return;
+
+    const AttributeSet::Descriptor& lookup = firstLeaf->attributeSet().descriptor();
+
+    std::vector<size_t> resolved;
+    for (size_t n = 0, count = names.size(); n < count; ++n) {
+        const Name& attributeName = names[n];
+        const size_t position = lookup.find(attributeName);
+
+        // an unknown name is reported as an error instead of being ignored
+        if (position == AttributeSet::INVALID_POS) {
+            OPENVDB_THROW(KeyError, "Cannot drop an attribute that does not exist - " << attributeName << ".");
+        }
+
+        resolved.push_back(position);
+    }
+
+    dropAttributes(tree, resolved);
+}
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataTree>
+inline void dropAttribute(  PointDataTree& tree,
+                            const size_t& index)
+{
+    // single-index convenience wrapper around the vector-based dropAttributes()
+    const std::vector<size_t> single(1, index);
+    dropAttributes(tree, single);
+}
+
+
+template <typename PointDataTree>
+inline void dropAttribute(  PointDataTree& tree,
+                            const Name& name)
+{
+    // single-name convenience wrapper around the name-based dropAttributes()
+    const std::vector<Name> single(1, name);
+    dropAttributes(tree, single);
+}
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataTree>
+inline void renameAttributes(   PointDataTree& tree,
+                                const std::vector<Name>& oldNames,
+                                const std::vector<Name>& newNames)
+{
+    // Renames oldNames[k] to newNames[k] on the leaf nodes. Every pair is validated against
+    // the attribute set of the first leaf before any leaf is asked to rename.
+    typedef AttributeSet::Descriptor                        DescriptorT;
+    // the two lists are matched by position, so their lengths must agree
+    const size_t pairCount = oldNames.size();
+    if (pairCount != newNames.size()) {
+        OPENVDB_THROW(ValueError, "Mis-matching sizes of name vectors, cannot rename attributes.");
+    }
+
+    typename PointDataTree::LeafIter leaf = tree.beginLeaf();
+    if (!leaf)  return;
+
+    const AttributeSet& firstSet = leaf->attributeSet();
+    const DescriptorT& current = firstSet.descriptor();
+    AttributeSet::DescriptorPtr renamed(new DescriptorT(current));
+
+    for (size_t k = 0; k != pairCount; ++k) {
+        const Name& from = oldNames[k];
+        const Name& to = newNames[k];
+
+        // the source must exist and the target must not, both checked on the original descriptor
+        if (current.find(from) == AttributeSet::INVALID_POS) {
+            OPENVDB_THROW(KeyError, "Cannot find requested attribute - " << from << ".");
+        }
+        if (current.find(to) != AttributeSet::INVALID_POS) {
+            OPENVDB_THROW(KeyError, "Cannot rename attribute as new name already exists - " << to << ".");
+        }
+
+        const AttributeArray* source = firstSet.getConst(from);
+        assert(source);
+        if (GroupAttributeArray::isGroup(*source)) {
+            OPENVDB_THROW(KeyError, "Cannot rename group attribute - " << from << ".");
+        }
+
+        renamed->rename(from, to);
+    }
+
+    for (; leaf; ++leaf)  leaf->renameAttributes(current, renamed);
+}
+
+
+template <typename PointDataTree>
+inline void renameAttribute(PointDataTree& tree,
+                            const Name& oldName,
+                            const Name& newName)
+{
+    // single-pair convenience wrapper: puts each name in a one-element vector
+    // and defers to renameAttributes()
+    const std::vector<Name> from(1, oldName);
+    const std::vector<Name> to(1, newName);
+    renameAttributes(tree, from, to);
+}
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataTree>
+inline void compactAttributes(PointDataTree& tree)
+{
+    // Runs CompactAttributesOp over the leaf range of the tree with tbb::parallel_for;
+    // returns immediately when the tree has no leaf nodes.
+    using point_attribute_internal::CompactAttributesOp;
+    typedef typename tree::LeafManager<PointDataTree>       ManagerT;
+    typename PointDataTree::LeafIter firstLeaf = tree.beginLeaf();
+    if (!firstLeaf)  return;
+
+    tbb::parallel_for(ManagerT(tree).leafRange(), CompactAttributesOp<PointDataTree>());
+}
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataTree>
+inline void bloscCompressAttribute( PointDataTree& tree,
+                                    const Name& name)
+{
+    // Applies BloscCompressAttributesOp for the named attribute over the leaf range.
+    // A tree with no leaf nodes is a no-op; an unknown attribute name raises KeyError.
+    typedef AttributeSet::Descriptor                        DescriptorT;
+    typedef typename tree::LeafManager<PointDataTree>       ManagerT;
+
+    using point_attribute_internal::BloscCompressAttributesOp;
+    typedef BloscCompressAttributesOp<PointDataTree>        CompressOpT;
+
+    typename PointDataTree::LeafCIter firstLeaf = tree.cbeginLeaf();
+    if (!firstLeaf)  return;
+
+    // resolve the name to an index using the first leaf's descriptor
+    const DescriptorT& lookup = firstLeaf->attributeSet().descriptor();
+    const size_t position = lookup.find(name);
+
+    if (position == AttributeSet::INVALID_POS) {
+        OPENVDB_THROW(KeyError, "Cannot find requested attribute - " << name << ".");
+    }
+
+    // the operator takes a list of indices, so wrap the single index in a vector
+    std::vector<size_t> targets(1, position);
+
+    CompressOpT compress(tree, targets);
+    tbb::parallel_for(ManagerT(tree).leafRange(), compress);
+}
+
+////////////////////////////////////////
+
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+
+#endif // OPENVDB_TOOLS_POINT_ATTRIBUTE_HAS_BEEN_INCLUDED
+
+
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_points/tools/PointConversion.h b/nuparu/include/openvdb_points/tools/PointConversion.h
new file mode 100644
index 00000000..5593919d
--- /dev/null
+++ b/nuparu/include/openvdb_points/tools/PointConversion.h
@@ -0,0 +1,414 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of Double Negative Visual Effects nor the names
+// of its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @author Dan Bailey
+///
+/// @file PointConversion.h
+///
+/// @brief  Convert existing points and attributes into VDB Point Data grids and attributes.
+///
+
+
+#ifndef OPENVDB_TOOLS_POINT_CONVERSION_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_POINT_CONVERSION_HAS_BEEN_INCLUDED
+
+#include <openvdb/math/Transform.h>
+
+#include <openvdb/tools/PointIndexGrid.h>
+
+#include <openvdb_points/tools/AttributeSet.h>
+#include <openvdb_points/tools/PointDataGrid.h>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+
+/// @brief  Localises points with position into a @c PointDataGrid in two stages:
+///         allocation of the leaf attribute data and population of the positions.
+///
+/// @param  pointIndexGrid  a PointIndexGrid into the points.
+/// @param  positions       list of world space point positions.
+/// @param  positionType    the type of the position (includes compression info).
+/// @param  xform           world to index space transform.
+/// @param  positionDefaultValue  optional metadata default position value (defaults to an empty pointer).
+/// @return a pointer to the newly created @c PointDataGridT.
+/// @note   The position data must be supplied in a Point-Partitioner compatible
+///         data structure. A convenience PointAttributeVector class is offered.
+///
+/// @note   The position data is populated separately to perform world space to
+///         voxel space conversion and apply quantisation.
+///
+/// @note   A @c PointIndexGrid to the points must be supplied to perform this
+///         operation. The std::vector overload of this function builds one implicitly.
+
+template<typename PointDataGridT, typename PositionArrayT, typename PointIndexGridT>
+inline typename PointDataGridT::Ptr
+createPointDataGrid(const PointIndexGridT& pointIndexGrid, const PositionArrayT& positions,
+                    const openvdb::NamePair& positionType, const math::Transform& xform,
+                    Metadata::Ptr positionDefaultValue = Metadata::Ptr());
+
+
+/// @brief  Convenience method to create a @c PointDataGrid from a std::vector of
+///         world space point positions.
+///
+/// @param  positions     list of world space point positions.
+/// @param  positionType  the type of the position (includes compression info).
+/// @param  xform         world to index space transform.
+/// @param  positionDefaultValue  optional metadata default position value (defaults to an empty pointer).
+///
+/// @note   This method implicitly wraps the std::vector in a Point-Partitioner compatible
+///         data structure (PointAttributeVector) and creates the required @c PointIndexGrid to the points.
+
+template <typename PointDataGridT, typename ValueT>
+inline typename PointDataGridT::Ptr
+createPointDataGrid(const std::vector<ValueT>& positions,
+                    const openvdb::NamePair& positionType, const math::Transform& xform,
+                    Metadata::Ptr positionDefaultValue = Metadata::Ptr());
+
+
+/// @brief  Stores point attribute data in an existing attribute of a point data tree.
+///
+/// @param  tree            the point data tree to be populated.
+/// @param  pointIndexTree  a PointIndexTree into the points.
+/// @param  attributeName   the name of the VDB Points attribute to be populated.
+/// @param  data            a wrapper to the attribute data.
+///
+/// @note   A point index tree to the points must be supplied to perform this
+///         operation. This is required to ensure the same point index ordering.
+
+template <typename PointDataTreeT, typename PointIndexTreeT, typename PointArrayT>
+inline void
+populateAttribute(  PointDataTreeT& tree, const PointIndexTreeT& pointIndexTree,
+                    const openvdb::Name& attributeName, const PointArrayT& data);
+
+
+////////////////////////////////////////
+
+
+/// @brief Point-partitioner compatible, non-owning read-only wrapper around a std::vector
+template<typename ValueType>
+class PointAttributeVector {
+public:
+    typedef ValueType value_type;
+    typedef ValueType PosType;
+
+    /// @note only a reference is kept, so @a data must outlive this wrapper
+    PointAttributeVector(const std::vector<value_type>& data) : mSource(data) { }
+
+    size_t size() const { return mSource.size(); }
+    /// copy element @a n into @a xyz (unchecked access)
+    void getPos(size_t n, ValueType& xyz) const { xyz = mSource[n]; }
+    /// assign element @a n to @a value, whose type @c T may differ from @c ValueType
+    template <typename T> void get(size_t n, T& value) const { value = mSource[n]; }
+
+private:
+    const std::vector<value_type>& mSource;
+}; // PointAttributeVector
+
+
+////////////////////////////////////////
+
+
+namespace point_conversion_internal {
+
+/// @brief Leaf-range functor that initialises attribute storage on each point data leaf,
+///        sized by the number of indices in the point index leaf at the same origin.
+template<typename PointDataTreeType, typename PointIndexTreeType>
+struct InitialiseAttributesOp {
+
+    typedef typename tree::LeafManager<PointDataTreeType> LeafManagerT;
+    typedef typename LeafManagerT::LeafRange LeafRangeT;
+
+    typedef typename PointIndexTreeType::LeafNodeType PointIndexLeafNode;
+    typedef typename PointIndexLeafNode::IndexArray IndexArray;
+
+    /// @note all three arguments are held by reference and must outlive this functor
+    InitialiseAttributesOp( PointDataTreeType& tree,
+                            const PointIndexTreeType& pointIndexTree,
+                            const AttributeSet::Descriptor::Ptr& attributeDescriptor)
+        : mTree(tree)
+        , mPointIndexTree(pointIndexTree)
+        , mAttributeDescriptor(attributeDescriptor) { }
+
+    /// @brief Initialise the attributes of every leaf in @a range.
+    void operator()(const LeafRangeT& range) const {
+        for (typename LeafRangeT::Iterator leaf = range.begin(); leaf; ++leaf) {
+
+            // obtain the PointIndexLeafNode (using the origin of the current leaf);
+            // leaves without a matching point index leaf are skipped
+            const PointIndexLeafNode* pointIndexLeaf = mPointIndexTree.probeConstLeaf(leaf->origin());
+            if (!pointIndexLeaf)    continue;
+
+            // initialise the attribute storage, passing the number of point indices
+            // as the point count (the index array is queried once only)
+            const IndexArray& indices = pointIndexLeaf->indices();
+            const Index64 pointCount = indices.size();
+
+            leaf->initializeAttributes(mAttributeDescriptor, pointCount);
+        }
+    }
+
+    //////////
+
+    const PointDataTreeType&                mTree;
+    const PointIndexTreeType&               mPointIndexTree;
+    const AttributeSet::Descriptor::Ptr&    mAttributeDescriptor;
+};
+
+/// @brief Leaf-range functor that writes the "P" attribute of each point data leaf from
+///        world space positions, stored as offsets from the rounded index space position.
+template<   typename PointDataTreeType,
+            typename PointIndexTreeType,
+            typename PositionListType>
+struct PopulatePositionAttributeOp {
+
+    typedef typename tree::LeafManager<PointDataTreeType> LeafManagerT;
+    typedef typename LeafManagerT::LeafRange LeafRangeT;
+
+    typedef typename PointIndexTreeType::LeafNodeType PointIndexLeafNode;
+    typedef typename PointIndexLeafNode::IndexArray IndexArray;
+
+    typedef typename PositionListType::value_type ValueType;
+
+    /// @note all three arguments are held by reference and must outlive this functor
+    PopulatePositionAttributeOp(const PointIndexTreeType& pointIndexTree,
+                                const math::Transform& transform,
+                                const PositionListType& positions)
+        : mPointIndexTree(pointIndexTree)
+        , mTransform(transform)
+        , mPositions(positions) { }
+
+    /// @brief Populate the position attribute of every leaf in @a range.
+    void operator()(const typename LeafManagerT::LeafRange& range) const {
+        for (typename LeafManagerT::LeafRange::Iterator leaf=range.begin(); leaf; ++leaf) {
+            // obtain the PointIndexLeafNode (using the origin of the current leaf);
+            // leaves without a matching point index leaf are skipped
+            const PointIndexLeafNode* pointIndexLeaf = mPointIndexTree.probeConstLeaf(leaf->origin());
+            if (!pointIndexLeaf)    continue;
+
+            // plain member call: 'template' is only valid before a name followed by template arguments
+            typename AttributeWriteHandle<Vec3f>::Ptr attributeWriteHandle =
+                AttributeWriteHandle<Vec3f>::create(leaf->attributeArray("P"));
+
+            // positions are written in the order of the point index leaf's index array
+            Index64 index = 0;
+            const IndexArray& indices = pointIndexLeaf->indices();
+
+            for (typename IndexArray::const_iterator it = indices.begin(), it_end = indices.end(); it != it_end; ++it)
+            {
+                ValueType positionWorldSpace;
+                mPositions.getPos(*it, positionWorldSpace);
+                const ValueType positionIndexSpace = mTransform.worldToIndex(positionWorldSpace);
+
+                // offset of the point from the rounded index space coordinate
+                const ValueType positionVoxelSpace = ValueType(
+                            positionIndexSpace.x() - math::Round(positionIndexSpace.x()),
+                            positionIndexSpace.y() - math::Round(positionIndexSpace.y()),
+                            positionIndexSpace.z() - math::Round(positionIndexSpace.z()));
+                attributeWriteHandle->set(index, Vec3f(positionVoxelSpace));
+                index++;
+            }
+        }
+    }
+
+    //////////
+
+    const PointIndexTreeType&   mPointIndexTree;
+    const math::Transform&      mTransform;
+    const PositionListType&     mPositions;
+};
+
+/// @brief Leaf-range functor that copies values from an attribute list into the named
+///        attribute of each point data leaf, ordered by the matching point index leaf.
+template<   typename PointDataTreeType,
+            typename PointIndexTreeType,
+            typename AttributeListType>
+struct PopulateAttributeOp {
+
+    typedef typename tree::LeafManager<PointDataTreeType> LeafManagerT;
+    typedef typename LeafManagerT::LeafRange LeafRangeT;
+
+    typedef typename PointIndexTreeType::LeafNodeType PointIndexLeafNode;
+    typedef typename PointIndexLeafNode::IndexArray IndexArray;
+
+    typedef typename AttributeListType::value_type ValueType;
+
+    /// @note the arguments are stored as references and must outlive this functor
+    PopulateAttributeOp(const PointIndexTreeType& pointIndexTree,
+                        const AttributeListType& data,
+                        const openvdb::Name& attributeName)
+        : mPointIndexTree(pointIndexTree)
+        , mData(data)
+        , mAttributeName(attributeName) { }
+
+    /// @brief Populate the named attribute of every leaf in @a range.
+    void operator()(const LeafRangeT& range) const {
+        typedef typename LeafRangeT::Iterator                   LeafIterT;
+        typedef typename IndexArray::const_iterator             IndexIterT;
+        typedef AttributeWriteHandle<ValueType>                 HandleT;
+
+        for (LeafIterT leaf = range.begin(); leaf; ++leaf) {
+
+            // look up the point index leaf sharing this leaf's origin; skip if there is none
+            const PointIndexLeafNode* sourceLeaf = mPointIndexTree.probeConstLeaf(leaf->origin());
+            if (!sourceLeaf)    continue;
+
+            typename HandleT::Ptr writer = HandleT::create(leaf->attributeArray(mAttributeName));
+
+            // slot n of the leaf attribute receives the value addressed by the n-th point index
+            Index64 slot = 0;
+            const IndexArray& order = sourceLeaf->indices();
+
+            for (IndexIterT it = order.begin(), last = order.end(); it != last; ++it, ++slot) {
+                ValueType element;
+                mData.template get<ValueType>(*it, element);
+                writer->set(slot, element);
+            }
+        }
+    }
+
+    //////////
+
+    const PointIndexTreeType&   mPointIndexTree;
+    const AttributeListType&    mData;
+    const openvdb::Name&        mAttributeName;
+};
+
+} // namespace point_conversion_internal
+
+
+////////////////////////////////////////
+
+
+// Implementation of the PointIndexGrid-based createPointDataGrid() overload: builds the
+// tree from the index grid's tree, initialises attribute storage on every leaf, then
+// fills the position attribute from the supplied world space positions.
+template<typename PointDataGridT, typename PositionArrayT, typename PointIndexGridT>
+inline typename PointDataGridT::Ptr
+createPointDataGrid(const PointIndexGridT& pointIndexGrid, const PositionArrayT& positions,
+                    const openvdb::NamePair& positionType, const math::Transform& xform,
+                    Metadata::Ptr positionDefaultValue)
+{
+    using point_conversion_internal::InitialiseAttributesOp;
+    using point_conversion_internal::PopulatePositionAttributeOp;
+
+    typedef typename PointDataGridT::TreeType                       DataTreeT;
+    typedef typename PointIndexGridT::TreeType                      IndexTreeT;
+    typedef typename tree::template LeafManager<DataTreeT>          ManagerT;
+    typedef typename ManagerT::LeafRange                            RangeT;
+    typedef InitialiseAttributesOp<DataTreeT, IndexTreeT>           InitialiseOpT;
+    typedef PopulatePositionAttributeOp<DataTreeT, IndexTreeT, PositionArrayT> PopulateOpT;
+
+    // the point data tree is constructed as a topology copy of the point index tree
+
+    const IndexTreeT& indexTree = pointIndexGrid.tree();
+    typename DataTreeT::Ptr dataTree(new DataTreeT(indexTree));
+
+    // both parallel passes below iterate the same leaf range
+
+    ManagerT manager = ManagerT(*dataTree);
+    RangeT leaves = manager.leafRange();
+
+    // the descriptor is created from the position type alone; a default value
+    // for "P" is recorded on it only when one was supplied
+    AttributeSet::Descriptor::Ptr layout = AttributeSet::Descriptor::create(positionType);
+    if (positionDefaultValue)   layout->setDefaultValue("P", *positionDefaultValue);
+
+    // stage 1: create point attribute storage on each leaf
+
+    InitialiseOpT allocate(*dataTree, indexTree, layout);
+    tbb::parallel_for(leaves, allocate);
+
+    // stage 2: populate the position attribute
+
+    PopulateOpT fill(indexTree, xform, positions);
+    tbb::parallel_for(leaves, fill);
+
+    // wrap the tree in a grid that carries a copy of the transform
+
+    typename PointDataGridT::Ptr result = PointDataGridT::create(dataTree);
+    result->setTransform(xform.copy());
+    return result;
+}
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataGridT, typename ValueT>
+inline typename PointDataGridT::Ptr
+createPointDataGrid(const std::vector<ValueT>& positions,
+                    const openvdb::NamePair& positionType,
+                    const math::Transform& xform,
+                    Metadata::Ptr positionDefaultValue)
+{
+    // wrap the vector, build a point index grid from it, then defer to the grid-based overload
+    const PointAttributeVector<ValueT> wrapped(positions);
+    PointIndexGrid::Ptr indexGrid = createPointIndexGrid<PointIndexGrid>(wrapped, xform);
+    return createPointDataGrid<PointDataGridT>(*indexGrid, wrapped, positionType, xform, positionDefaultValue);
+}
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataTreeT, typename PointIndexTreeT, typename PointArrayT>
+inline void
+populateAttribute(  PointDataTreeT& tree, const PointIndexTreeT& pointIndexTree,
+                    const openvdb::Name& attributeName, const PointArrayT& data)
+{
+    using point_conversion_internal::PopulateAttributeOp;
+
+    // the leaf manager must be instantiated on the caller's tree type (PointDataTreeT) so that
+    // its LeafRange matches the range type PopulateAttributeOp<PointDataTreeT, ...> accepts
+    typedef typename tree::template LeafManager<PointDataTreeT> LeafManagerT;
+
+    PopulateAttributeOp<PointDataTreeT, PointIndexTreeT, PointArrayT> populate(pointIndexTree, data, attributeName);
+
+    tbb::parallel_for(LeafManagerT(tree).leafRange(), populate);
+}
+
+
+////////////////////////////////////////
+
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+
+#endif // OPENVDB_TOOLS_POINT_CONVERSION_HAS_BEEN_INCLUDED
+
+
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_points/tools/PointCount.h b/nuparu/include/openvdb_points/tools/PointCount.h
new file mode 100644
index 00000000..a3ada00e
--- /dev/null
+++ b/nuparu/include/openvdb_points/tools/PointCount.h
@@ -0,0 +1,277 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of Double Negative Visual Effects nor the names
+// of its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @author Dan Bailey
+///
+/// @file PointCount.h
+///
+/// @brief  Various point counting methods using a VDB Point Grid.
+///
+
+
+#ifndef OPENVDB_TOOLS_POINT_COUNT_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_POINT_COUNT_HAS_BEEN_INCLUDED
+
+#include <openvdb/openvdb.h>
+
+#include <openvdb_points/tools/AttributeSet.h>
+#include <openvdb_points/tools/PointDataGrid.h>
+#include <openvdb_points/tools/PointAttribute.h>
+
+#include <boost/ptr_container/ptr_vector.hpp>
+
+#include <tbb/parallel_reduce.h>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+
+/// @brief Returns the total number of points in the PointDataTree.
+/// @param tree the PointDataTree whose points are counted.
+/// @param inCoreOnly if true, points in out-of-core leaf nodes are not counted (no effect when OPENVDB_2_ABI_COMPATIBLE is defined)
+template <typename PointDataTreeT>
+Index64 pointCount(const PointDataTreeT& tree, const bool inCoreOnly = false);
+
+
+/// @brief Returns the total number of active points in the PointDataTree.
+/// @param tree the PointDataTree whose points are counted.
+/// @param inCoreOnly if true, points in out-of-core leaf nodes are not counted (no effect when OPENVDB_2_ABI_COMPATIBLE is defined)
+template <typename PointDataTreeT>
+Index64 activePointCount(const PointDataTreeT& tree, const bool inCoreOnly = false);
+
+
+/// @brief Returns the total number of inactive points in the PointDataTree.
+/// @param tree the PointDataTree whose points are counted.
+/// @param inCoreOnly if true, points in out-of-core leaf nodes are not counted (no effect when OPENVDB_2_ABI_COMPATIBLE is defined)
+template <typename PointDataTreeT>
+Index64 inactivePointCount(const PointDataTreeT& tree, const bool inCoreOnly = false);
+
+
+/// @brief Returns the total number of points of the PointDataTree that are in the named group.
+/// @param tree the PointDataTree whose points are counted.
+/// @param name group name.
+/// @param inCoreOnly if true, points in out-of-core leaf nodes are not counted (no effect when OPENVDB_2_ABI_COMPATIBLE is defined)
+template <typename PointDataTreeT>
+Index64 groupPointCount(const PointDataTreeT& tree, const Name& name, const bool inCoreOnly = false);
+
+
+/// @brief Returns the total number of active points of the PointDataTree that are in the named group.
+/// @param tree the PointDataTree whose points are counted.
+/// @param name group name.
+/// @param inCoreOnly if true, points in out-of-core leaf nodes are not counted (no effect when OPENVDB_2_ABI_COMPATIBLE is defined)
+template <typename PointDataTreeT>
+Index64 activeGroupPointCount(const PointDataTreeT& tree, const Name& name, const bool inCoreOnly = false);
+
+
+/// @brief Returns the total number of inactive points of the PointDataTree that are in the named group.
+/// @param tree the PointDataTree whose points are counted.
+/// @param name group name.
+/// @param inCoreOnly if true, points in out-of-core leaf nodes are not counted (no effect when OPENVDB_2_ABI_COMPATIBLE is defined)
+template <typename PointDataTreeT>
+Index64 inactiveGroupPointCount(const PointDataTreeT& tree, const Name& name, const bool inCoreOnly = false);
+
+
+////////////////////////////////////////
+
+
+namespace point_count_internal {
+
+/// @brief tbb::parallel_reduce body that adds the filtered point count of each leaf in a range to a running total.
+template <  typename PointDataTreeT,
+            typename ValueIterT,
+            typename FilterT>
+struct PointCountOp
+{
+    typedef typename tree::LeafManager<const PointDataTreeT>    LeafManagerT;
+    typedef IndexIterTraits<PointDataTreeT, ValueIterT>         IndexIteratorFromLeafT;
+    typedef typename IndexIteratorFromLeafT::Iterator           IndexIterator;
+    typedef typename FilterT::Data                              FilterDataT;
+    typedef FilterIndexIter<IndexIterator, FilterT>             Iterator;
+
+    /// @note @a filterData is held by reference and must outlive this object
+    PointCountOp(const FilterDataT& filterData, const bool inCoreOnly = false)
+        : mFilterData(filterData), mInCoreOnly(inCoreOnly) { }
+
+    /// @brief Returns @a size plus the filtered point count of every leaf in @a range.
+    Index64 operator()(const typename LeafManagerT::LeafRange& range, Index64 size) const {
+        typedef typename LeafManagerT::LeafRange::Iterator LeafIterT;
+        for (LeafIterT leaf = range.begin(); leaf; ++leaf) {
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+            if (mInCoreOnly && leaf->buffer().isOutOfCore())     continue;
+#endif
+            // wrap the leaf's index iterator in a filter created from the shared filter data
+            IndexIterator unfiltered(IndexIteratorFromLeafT::begin(*leaf));
+            FilterT leafFilter(FilterT::create(*leaf, mFilterData));
+            Iterator filtered(unfiltered, leafFilter);
+            size += iterCount(filtered);
+        }
+        return size;
+    }
+
+    /// @brief Reduction step: the sum of two partial counts.
+    static Index64 join(Index64 size1, Index64 size2) { return size1 + size2; }
+
+private:
+    const FilterDataT& mFilterData;
+    const bool mInCoreOnly;
+}; // struct PointCountOp
+
+
+template <typename PointDataTreeT, typename FilterT, typename ValueIterT>
+Index64 threadedFilterPointCount(   const PointDataTreeT& tree,
+                                    const typename FilterT::Data& filter,
+                                    const bool inCoreOnly = false)
+{
+    // parallel sum over the leaf range: identity 0, CountOpT as body, CountOpT::join as reduction
+    typedef point_count_internal::PointCountOp<PointDataTreeT, ValueIterT, FilterT> CountOpT;
+    typename tree::LeafManager<const PointDataTreeT> manager(tree);
+    const CountOpT counter(filter, inCoreOnly);
+    return tbb::parallel_reduce(manager.leafRange(), Index64(0), counter, CountOpT::join);
+}
+
+
+template <typename PointDataTreeT, typename FilterT>
+Index64 filterPointCount(const PointDataTreeT& tree,
+                         const typename FilterT::Data& filter, const bool inCoreOnly = false)
+{
+    // filtered count that visits each leaf through its ValueAllCIter
+    typedef typename PointDataTreeT::LeafNodeType LeafT;
+    return threadedFilterPointCount<PointDataTreeT, FilterT, typename LeafT::ValueAllCIter>(tree, filter, inCoreOnly);
+}
+
+
+template <typename PointDataTreeT, typename FilterT>
+Index64 filterActivePointCount( const PointDataTreeT& tree,
+                                const typename FilterT::Data& filter, const bool inCoreOnly = false)
+{
+    // filtered count that visits each leaf through its ValueOnCIter
+    typedef typename PointDataTreeT::LeafNodeType LeafT;
+    return threadedFilterPointCount<PointDataTreeT, FilterT, typename LeafT::ValueOnCIter>(tree, filter, inCoreOnly);
+}
+
+
+template <typename PointDataTreeT, typename FilterT>
+Index64 filterInactivePointCount(   const PointDataTreeT& tree,
+                                    const typename FilterT::Data& filter, const bool inCoreOnly = false)
+{
+    // filtered count that visits each leaf through its ValueOffCIter
+    typedef typename PointDataTreeT::LeafNodeType LeafT;
+    return threadedFilterPointCount<PointDataTreeT, FilterT, typename LeafT::ValueOffCIter>(tree, filter, inCoreOnly);
+}
+
+
+} // namespace point_count_internal
+
+
+template <typename PointDataTreeT>
+Index64 pointCount(const PointDataTreeT& tree, const bool inCoreOnly)
+{
+    (void) inCoreOnly; // otherwise unreferenced when OPENVDB_2_ABI_COMPATIBLE is defined
+    Index64 total = 0; // serial sum of pointCount() over the leaf nodes
+    for (typename PointDataTreeT::LeafCIter leaf = tree.cbeginLeaf(); leaf; ++leaf) {
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+        if (inCoreOnly && leaf->buffer().isOutOfCore())     continue;
+#endif
+        total += leaf->pointCount();
+    }
+    return total;
+}
+
+
+template <typename PointDataTreeT>
+Index64 activePointCount(const PointDataTreeT& tree, const bool inCoreOnly)
+{
+    (void) inCoreOnly; // otherwise unreferenced when OPENVDB_2_ABI_COMPATIBLE is defined
+    Index64 total = 0; // serial sum of onPointCount() over the leaf nodes
+    for (typename PointDataTreeT::LeafCIter leaf = tree.cbeginLeaf(); leaf; ++leaf) {
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+        if (inCoreOnly && leaf->buffer().isOutOfCore())     continue;
+#endif
+        total += leaf->onPointCount();
+    }
+    return total;
+}
+
+
+template <typename PointDataTreeT>
+Index64 inactivePointCount(const PointDataTreeT& tree, const bool inCoreOnly)
+{
+    (void) inCoreOnly; // otherwise unreferenced when OPENVDB_2_ABI_COMPATIBLE is defined
+    Index64 total = 0; // serial sum of offPointCount() over the leaf nodes
+    for (typename PointDataTreeT::LeafCIter leaf = tree.cbeginLeaf(); leaf; ++leaf) {
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+        if (inCoreOnly && leaf->buffer().isOutOfCore())     continue;
+#endif
+        total += leaf->offPointCount();
+    }
+    return total;
+}
+
+
+template <typename PointDataTreeT>
+Index64 groupPointCount(const PointDataTreeT& tree, const Name& name, const bool inCoreOnly)
+{
+    const GroupFilter::Data membership(name); // filter data built from the group name
+    return point_count_internal::filterPointCount<PointDataTreeT, GroupFilter>(tree, membership, inCoreOnly);
+}
+
+
+template <typename PointDataTreeT>
+Index64 activeGroupPointCount(const PointDataTreeT& tree, const Name& name, const bool inCoreOnly)
+{
+    const GroupFilter::Data membership(name); // filter data built from the group name
+    return point_count_internal::filterActivePointCount<PointDataTreeT, GroupFilter>(tree, membership, inCoreOnly);
+}
+
+
+template <typename PointDataTreeT>
+Index64 inactiveGroupPointCount(const PointDataTreeT& tree, const Name& name, const bool inCoreOnly)
+{
+    const GroupFilter::Data membership(name); // filter data built from the group name
+    return point_count_internal::filterInactivePointCount<PointDataTreeT, GroupFilter>(tree, membership, inCoreOnly);
+}
+
+
+////////////////////////////////////////
+
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+
+#endif // OPENVDB_TOOLS_POINT_COUNT_HAS_BEEN_INCLUDED
+
+
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_points/tools/PointDataGrid.h b/nuparu/include/openvdb_points/tools/PointDataGrid.h
new file mode 100644
index 00000000..2a691cfd
--- /dev/null
+++ b/nuparu/include/openvdb_points/tools/PointDataGrid.h
@@ -0,0 +1,920 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of Double Negative Visual Effects nor the names
+// of its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @author Dan Bailey, Nick Avramoussis, Matt Warner
+///
+/// @file PointDataGrid.h
+///
+/// @brief  Attribute-owned data structure for points. Point attributes are
+///         stored in leaf nodes and ordered by voxel for fast random and
+///         sequential access.
+///
+
+
+#ifndef OPENVDB_TOOLS_POINT_DATA_GRID_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_POINT_DATA_GRID_HAS_BEEN_INCLUDED
+
+#include <openvdb/Grid.h>
+#include <openvdb/tree/Tree.h>
+#include <openvdb/tree/LeafNode.h>
+
+#include <openvdb/tools/PointIndexGrid.h>
+
+#include <openvdb_points/tools/AttributeSet.h>
+#include <openvdb_points/tools/AttributeGroup.h>
+
+#include <utility> // std::pair, std::make_pair
+
+
+class TestPointDataLeaf;
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+
+// forward declaration
+namespace tree {
+    template<Index, typename> struct SameLeafConfig;
+}
+
+
+////////////////////////////////////////
+
+
+namespace tools {
+
+
+// forward declaration
+template<typename T, Index Log2Dim> class PointDataLeafNode;
+
+/// @brief Point data tree configured to match the default VDB configurations.
+typedef tree::Tree<tree::RootNode<tree::InternalNode<tree::InternalNode
+    <PointDataLeafNode<PointDataIndex32, 3>, 4>, 5> > > PointDataTree;
+
+
+/// @brief Point data grid.
+typedef Grid<PointDataTree> PointDataGrid;
+
+
+////////////////////////////////////////
+
+// Internal helpers, not intended for use outside this header
+namespace point_data_grid_internal {
+
+/// @brief Type selector for the owning smart pointer to @c T:
+/// @c std::unique_ptr when OPENVDB_HAS_CXX11 is defined, @c std::auto_ptr otherwise.
+template<typename T>
+struct UniquePtr
+{
+#ifndef OPENVDB_HAS_CXX11
+    typedef std::auto_ptr<T>    type;
+#else
+    typedef std::unique_ptr<T>  type;
+#endif
+};
+
+} // namespace point_data_grid_internal
+
+
+/// @brief Leaf node derived from tree::LeafNode that additionally owns an
+/// AttributeSet. The voxel values of this leaf are used as offsets into the
+/// attribute arrays (see setOffsets() and validateOffsets()), so the
+/// value-writing methods of the base leaf are replaced here by versions that
+/// only call assertNonmodifiable().
+template <typename T, Index Log2Dim>
+class PointDataLeafNode : public tree::LeafNode<T, Log2Dim> {
+
+public:
+    typedef PointDataLeafNode<T, Log2Dim>           LeafNodeType;
+    typedef boost::shared_ptr<PointDataLeafNode>    Ptr;
+
+    typedef T                                       ValueType;
+    typedef std::pair<ValueType, ValueType>         ValueTypePair;
+    typedef std::vector<ValueType>                  IndexArray;
+
+    typedef AttributeSet::Descriptor                Descriptor;
+
+    ////////////////////////////////////////
+
+    // The following methods had to be copied from the LeafNode class
+    // to make the derived PointDataLeafNode class compatible with the tree structure.
+
+    typedef tree::LeafNode<T, Log2Dim>    BaseLeaf;
+    typedef util::NodeMask<Log2Dim> NodeMaskType;
+
+    using BaseLeaf::LOG2DIM;
+    using BaseLeaf::TOTAL;
+    using BaseLeaf::DIM;
+    using BaseLeaf::NUM_VALUES;
+    using BaseLeaf::NUM_VOXELS;
+    using BaseLeaf::SIZE;
+    using BaseLeaf::LEVEL;
+
+    /// Default constructor
+    PointDataLeafNode()
+        : BaseLeaf()
+        , mAttributeSet(new AttributeSet) { }
+
+    ~PointDataLeafNode() { }
+
+    /// Construct using deep copy of other PointDataLeafNode
+    explicit PointDataLeafNode(const PointDataLeafNode& other)
+        : BaseLeaf(other)
+        , mAttributeSet(new AttributeSet(*other.mAttributeSet)) { }
+
+    /// Construct using supplied origin, value and active status
+    explicit
+    PointDataLeafNode(const Coord& coords, const T& value = zeroVal<T>(), bool active = false)
+        : BaseLeaf(coords, value, active)
+        , mAttributeSet(new AttributeSet) { }
+
+    /// Construct using supplied origin, value and active status
+    /// use attribute map from another PointDataLeafNode
+    PointDataLeafNode(const PointDataLeafNode& other, const Coord& coords, const T& value = zeroVal<T>(), bool active = false)
+        : BaseLeaf(coords, value, active)
+        , mAttributeSet(new AttributeSet(*other.mAttributeSet)) { }
+
+    // Copy-construct from a PointIndexLeafNode with the same configuration but a different ValueType.
+    template<typename OtherValueType>
+    PointDataLeafNode(const tools::PointIndexLeafNode<OtherValueType, Log2Dim>& other)
+        : BaseLeaf(other)
+        , mAttributeSet(new AttributeSet) { }
+
+    // Copy-construct from a LeafNode with the same configuration but a different ValueType.
+    // Used for topology copies - explicitly sets the value (background) to zeroVal
+    template <typename ValueType>
+    PointDataLeafNode(const tree::LeafNode<ValueType, Log2Dim>& other, const T& /*value*/, TopologyCopy)
+        : BaseLeaf(other, zeroVal<T>(), TopologyCopy())
+        , mAttributeSet(new AttributeSet) { }
+
+    // Copy-construct from a LeafNode with the same configuration but a different ValueType.
+    // Used for topology copies - explicitly sets the on and off value (background) to zeroVal
+    template <typename ValueType>
+    PointDataLeafNode(const tree::LeafNode<ValueType, Log2Dim>& other, const T& /*offValue*/, const T& /*onValue*/, TopologyCopy)
+        : BaseLeaf(other, zeroVal<T>(), zeroVal<T>(), TopologyCopy())
+        , mAttributeSet(new AttributeSet) { }
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+    // Forwards to the base leaf's PartialCreate constructor and starts with an
+    // empty attribute set. Only compiled when not building for ABI 2 compatibility.
+    PointDataLeafNode(PartialCreate, const Coord& coords,
+        const T& value = zeroVal<T>(), bool active = false)
+        : BaseLeaf(PartialCreate(), coords, value, active)
+        , mAttributeSet(new AttributeSet) { }
+#endif
+
+public:
+
+    /// Retrieve the attribute set.
+    const AttributeSet& attributeSet() const { return *mAttributeSet; }
+
+    /// @brief Create a new attribute set. Existing attributes will be removed.
+    void initializeAttributes(const Descriptor::Ptr& descriptor, const size_t arrayLength);
+    /// @brief Clear the attribute set.
+    void clearAttributes(const bool updateValueMask = true);
+
+    /// @brief Returns @c true if an attribute with this index exists.
+    /// @param pos Index of the attribute
+    bool hasAttribute(const size_t pos) const;
+    /// @brief Returns @c true if an attribute with this name exists.
+    /// @param attributeName    Name of the attribute
+    bool hasAttribute(const Name& attributeName) const;
+
+    /// @brief Append an attribute to the leaf.
+    /// @param attribute Name and type of the attribute to append.
+    /// @param expected Existing descriptor is expected to match this parameter.
+    /// @param replacement New descriptor to replace the existing one.
+    AttributeArray::Ptr appendAttribute(const AttributeSet::Util::NameAndType& attribute,
+                                        const Descriptor& expected, Descriptor::Ptr& replacement);
+    /// @brief Drop list of attributes.
+    /// @param pos vector of attribute indices to drop
+    /// @param expected Existing descriptor is expected to match this parameter.
+    /// @param replacement New descriptor to replace the existing one.
+    void dropAttributes(const std::vector<size_t>& pos,
+                        const Descriptor& expected, Descriptor::Ptr& replacement);
+    /// @brief Reorder attribute set.
+    /// @param replacement New descriptor to replace the existing one.
+    void reorderAttributes(const Descriptor::Ptr& replacement);
+    /// @brief Rename attributes in attribute set (order must remain the same).
+    /// @param expected Existing descriptor is expected to match this parameter.
+    /// @param replacement New descriptor to replace the existing one.
+    void renameAttributes(const Descriptor& expected, Descriptor::Ptr& replacement);
+    /// @brief Compact all attributes in attribute set.
+    void compactAttributes();
+
+    /// @brief Swap the underlying attribute set with the given @a attributeSet.
+    /// This leaf will assume ownership of the given attribute set. The descriptors must
+    /// match and the voxel offsets values will need updating if the point order is different.
+    void swap(AttributeSet* attributeSet);
+
+    /// @brief Sets all of the voxel offset values on this leaf, from the given vector
+    /// of @a offsets. If @a updateValueMask is true, then the active value mask will
+    /// be updated so voxels with points are active and empty voxels are inactive.
+    void setOffsets(const std::vector<ValueType>& offsets, const bool updateValueMask = true);
+
+    /// @brief Throws an error if the voxel values on this leaf are not monotonically
+    /// increasing or within the bounds of the attribute arrays
+    void validateOffsets() const;
+
+    /// @brief Read-write attribute array reference from index
+    /// {
+    AttributeArray& attributeArray(const size_t pos);
+    const AttributeArray& attributeArray(const size_t pos) const;
+    const AttributeArray& constAttributeArray(const size_t pos) const;
+    /// }
+    /// @brief Read-write attribute array reference from name
+    /// {
+    AttributeArray& attributeArray(const Name& attributeName);
+    const AttributeArray& attributeArray(const Name& attributeName) const;
+    const AttributeArray& constAttributeArray(const Name& attributeName) const;
+    /// }
+
+    /// @brief Read-only group handle from group index
+    GroupHandle groupHandle(const AttributeSet::Descriptor::GroupIndex& index) const;
+    /// @brief Read-only group handle from group name
+    GroupHandle groupHandle(const Name& group) const;
+    /// @brief Read-write group handle from group index
+    GroupWriteHandle groupWriteHandle(const AttributeSet::Descriptor::GroupIndex& index);
+    /// @brief Read-write group handle from group name
+    GroupWriteHandle groupWriteHandle(const Name& name);
+
+    /// @brief Compute the total point count for the leaf
+    Index64 pointCount() const;
+    /// @brief Compute the total active (on) point count for the leaf
+    Index64 onPointCount() const;
+    /// @brief Compute the total inactive (off) point count for the leaf
+    Index64 offPointCount() const;
+    /// @brief Compute the point count in a specific group for the leaf
+    Index64 groupPointCount(const Name& groupName) const;
+
+    /// @brief Activate voxels with non-zero points, deactivate voxels with zero points.
+    void updateValueMask();
+
+    ////////////////////////////////////////
+
+    void setOffsetOn(Index offset, const ValueType& val);
+    void setOffsetOnly(Index offset, const ValueType& val);
+
+    /// @brief Return @c true if the given node (which may have a different @c ValueType
+    /// than this node) has the same active value topology as this node.
+    template<typename OtherType, Index OtherLog2Dim>
+    bool hasSameTopology(const PointDataLeafNode<OtherType, OtherLog2Dim>* other) const {
+        return BaseLeaf::hasSameTopology(other);
+    }
+
+    /// Check for buffer, state and origin equivalence first.
+    /// If this returns true, do a deeper comparison on the attribute set to check
+    /// that the two attribute sets are equal as well.
+    bool operator==(const PointDataLeafNode& other) const {
+        if(BaseLeaf::operator==(other) != true) return false;
+        return (*this->mAttributeSet == *other.mAttributeSet);
+    }
+
+    /// Negation of operator==.
+    bool operator!=(const PointDataLeafNode& other) const { return !(other == *this); }
+
+    // Intentionally empty: the supplied leaf pointer is ignored.
+    void addLeaf(PointDataLeafNode*) {}
+    template<typename AccessorT>
+    void addLeafAndCache(PointDataLeafNode*, AccessorT&) {}
+
+    //@{
+    /// @brief Return a pointer to this node.
+    PointDataLeafNode* touchLeaf(const Coord&) { return this; }
+    template<typename AccessorT>
+    PointDataLeafNode* touchLeafAndCache(const Coord&, AccessorT&) { return this; }
+
+    // Returns this node only when NodeT is exactly PointDataLeafNode, NULL otherwise.
+    template<typename NodeT, typename AccessorT>
+    NodeT* probeNodeAndCache(const Coord&, AccessorT&)
+    {
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if (!(boost::is_same<NodeT,PointDataLeafNode>::value)) return NULL;
+        return reinterpret_cast<NodeT*>(this);
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+    PointDataLeafNode* probeLeaf(const Coord&) { return this; }
+    template<typename AccessorT>
+    PointDataLeafNode* probeLeafAndCache(const Coord&, AccessorT&) { return this; }
+    //@}
+
+    //@{
+    /// @brief Return a @c const pointer to this node.
+    const PointDataLeafNode* probeConstLeaf(const Coord&) const { return this; }
+    template<typename AccessorT>
+    const PointDataLeafNode* probeConstLeafAndCache(const Coord&, AccessorT&) const { return this; }
+    template<typename AccessorT>
+    const PointDataLeafNode* probeLeafAndCache(const Coord&, AccessorT&) const { return this; }
+    const PointDataLeafNode* probeLeaf(const Coord&) const { return this; }
+    // Returns this node only when NodeT is exactly PointDataLeafNode, NULL otherwise.
+    template<typename NodeT, typename AccessorT>
+    const NodeT* probeConstNodeAndCache(const Coord&, AccessorT&) const
+    {
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_BEGIN
+        if (!(boost::is_same<NodeT,PointDataLeafNode>::value)) return NULL;
+        return reinterpret_cast<const NodeT*>(this);
+        OPENVDB_NO_UNREACHABLE_CODE_WARNING_END
+    }
+    //@}
+
+    // I/O methods
+
+    void readTopology(std::istream& is, bool fromHalf = false);
+    void writeTopology(std::ostream& os, bool toHalf = false) const;
+
+    void readBuffers(std::istream& is, bool fromHalf = false);
+    void readBuffers(std::istream& is, const CoordBBox&, bool fromHalf = false);
+    void writeBuffers(std::ostream& os, bool toHalf = false) const;
+
+
+    Index64 memUsage() const;
+
+    ////////////////////////////////////////
+
+    // Disable all write methods to avoid unintentional changes
+    // to the point-array offsets.
+
+    // The check is a plain assert(), so when assertions are compiled out
+    // (NDEBUG) the disabled methods below do nothing at all.
+    void assertNonmodifiable() {
+        assert(false && "Cannot modify voxel values in a PointDataTree.");
+    }
+
+    // Methods that only change the active state (not the voxel value) are
+    // forwarded to the base leaf.
+    void setActiveState(const Coord& xyz, bool on) { BaseLeaf::setActiveState(xyz, on); }
+    void setActiveState(Index offset, bool on) { BaseLeaf::setActiveState(offset, on); }
+
+    void setValueOnly(const Coord&, const ValueType&) { assertNonmodifiable(); }
+    void setValueOnly(Index, const ValueType&) { assertNonmodifiable(); }
+
+    void setValueOff(const Coord& xyz) { BaseLeaf::setValueOff(xyz); }
+    void setValueOff(Index offset) { BaseLeaf::setValueOff(offset); }
+
+    void setValueOff(const Coord&, const ValueType&) { assertNonmodifiable(); }
+    void setValueOff(Index, const ValueType&) { assertNonmodifiable(); }
+
+    void setValueOn(const Coord& xyz) { BaseLeaf::setValueOn(xyz); }
+    void setValueOn(Index offset) {  BaseLeaf::setValueOn(offset); }
+
+    void setValueOn(const Coord&, const ValueType&) { assertNonmodifiable(); }
+    void setValueOn(Index, const ValueType&) { assertNonmodifiable(); }
+
+    void setValue(const Coord&, const ValueType&) { assertNonmodifiable(); }
+
+    void setValuesOn() { BaseLeaf::setValuesOn(); }
+    void setValuesOff() { BaseLeaf::setValuesOff(); }
+
+    template<typename ModifyOp>
+    void modifyValue(Index, const ModifyOp&) { assertNonmodifiable(); }
+
+    template<typename ModifyOp>
+    void modifyValue(const Coord&, const ModifyOp&) { assertNonmodifiable(); }
+
+    template<typename ModifyOp>
+    void modifyValueAndActiveState(const Coord&, const ModifyOp&) { assertNonmodifiable(); }
+
+    void clip(const CoordBBox&, const ValueType&) { assertNonmodifiable(); }
+
+    void fill(const CoordBBox&, const ValueType&, bool) { assertNonmodifiable(); }
+    // NOTE(review): unlike the other fill() overloads this one is a silent
+    // no-op rather than an assertion - confirm this is intended.
+    void fill(const ValueType&) {}
+    void fill(const ValueType&, bool) { assertNonmodifiable(); }
+
+    template<typename AccessorT>
+    void setValueOnlyAndCache(const Coord&, const ValueType&, AccessorT&) {assertNonmodifiable();}
+
+    template<typename ModifyOp, typename AccessorT>
+    void modifyValueAndActiveStateAndCache(const Coord&, const ModifyOp&, AccessorT&) {
+        assertNonmodifiable();
+    }
+
+    template<typename AccessorT>
+    void setValueOffAndCache(const Coord&, const ValueType&, AccessorT&) { assertNonmodifiable(); }
+
+    template<typename AccessorT>
+    void setActiveStateAndCache(const Coord& xyz, bool on, AccessorT& parent) { BaseLeaf::setActiveStateAndCache(xyz, on, parent); }
+
+    void resetBackground(const ValueType&, const ValueType&) { assertNonmodifiable(); }
+
+    void signedFloodFill(const ValueType&) { assertNonmodifiable(); }
+    void signedFloodFill(const ValueType&, const ValueType&) { assertNonmodifiable(); }
+
+    void negate() { assertNonmodifiable(); }
+
+    friend class ::TestPointDataLeaf;
+
+    typedef typename BaseLeaf::ValueOn ValueOn;
+    typedef typename BaseLeaf::ValueOff ValueOff;
+    typedef typename BaseLeaf::ValueAll ValueAll;
+
+private:
+    // Owning pointer to this leaf's attribute set; every constructor
+    // initializes it with a newly allocated AttributeSet.
+    point_data_grid_internal::UniquePtr<AttributeSet>::type mAttributeSet;
+
+protected:
+    typedef typename BaseLeaf::ChildOn ChildOn;
+    typedef typename BaseLeaf::ChildOff ChildOff;
+    typedef typename BaseLeaf::ChildAll ChildAll;
+
+    typedef typename NodeMaskType::OnIterator    MaskOnIterator;
+    typedef typename NodeMaskType::OffIterator   MaskOffIterator;
+    typedef typename NodeMaskType::DenseIterator MaskDenseIterator;
+
+    // During topology-only construction, access is needed
+    // to protected/private members of other template instances.
+    template<typename, Index> friend class PointDataLeafNode;
+
+    friend class tree::IteratorBase<MaskOnIterator, PointDataLeafNode>;
+    friend class tree::IteratorBase<MaskOffIterator, PointDataLeafNode>;
+    friend class tree::IteratorBase<MaskDenseIterator, PointDataLeafNode>;
+
+public:
+
+    // Iterator types re-declared in terms of PointDataLeafNode. Note that all
+    // of the value iterators, including the non-const ones, expose the value
+    // as const ValueType.
+    typedef typename BaseLeaf::template ValueIter<
+        MaskOnIterator, PointDataLeafNode, const ValueType, ValueOn> ValueOnIter;
+    typedef typename BaseLeaf::template ValueIter<
+        MaskOnIterator, const PointDataLeafNode, const ValueType, ValueOn> ValueOnCIter;
+    typedef typename BaseLeaf::template ValueIter<
+        MaskOffIterator, PointDataLeafNode, const ValueType, ValueOff> ValueOffIter;
+    typedef typename BaseLeaf::template ValueIter<
+        MaskOffIterator,const PointDataLeafNode,const ValueType,ValueOff> ValueOffCIter;
+    typedef typename BaseLeaf::template ValueIter<
+        MaskDenseIterator, PointDataLeafNode, const ValueType, ValueAll> ValueAllIter;
+    typedef typename BaseLeaf::template ValueIter<
+        MaskDenseIterator,const PointDataLeafNode,const ValueType,ValueAll> ValueAllCIter;
+    typedef typename BaseLeaf::template ChildIter<
+        MaskOnIterator, PointDataLeafNode, ChildOn> ChildOnIter;
+    typedef typename BaseLeaf::template ChildIter<
+        MaskOnIterator, const PointDataLeafNode, ChildOn> ChildOnCIter;
+    typedef typename BaseLeaf::template ChildIter<
+        MaskOffIterator, PointDataLeafNode, ChildOff> ChildOffIter;
+    typedef typename BaseLeaf::template ChildIter<
+        MaskOffIterator, const PointDataLeafNode, ChildOff> ChildOffCIter;
+    typedef typename BaseLeaf::template DenseIter<
+        PointDataLeafNode, ValueType, ChildAll> ChildAllIter;
+    typedef typename BaseLeaf::template DenseIter<
+        const PointDataLeafNode, const ValueType, ChildAll> ChildAllCIter;
+
+    typedef openvdb::tools::IndexIter IndexIter;
+    typedef ValueIndexIter<ValueOnCIter> IndexOnIter;
+    typedef ValueIndexIter<ValueOffCIter> IndexOffIter;
+
+    /// @brief Leaf index iterator
+    IndexIter beginIndexAll() const;
+    IndexOnIter beginIndexOn() const;
+    IndexOffIter beginIndexOff() const;
+    /// @brief Leaf index iterator from voxel
+    IndexIter beginIndex(const unsigned index) const;
+    IndexIter beginIndex(const Coord& ijk) const;
+
+// Shorthand for the value mask, local to the iterator accessors below
+#define VMASK_ this->getValueMask()
+    ValueOnCIter  cbeginValueOn() const  { return ValueOnCIter(VMASK_.beginOn(), this); }
+    ValueOnCIter   beginValueOn() const  { return ValueOnCIter(VMASK_.beginOn(), this); }
+    ValueOnIter    beginValueOn()        { return ValueOnIter(VMASK_.beginOn(), this); }
+    ValueOffCIter cbeginValueOff() const { return ValueOffCIter(VMASK_.beginOff(), this); }
+    ValueOffCIter  beginValueOff() const { return ValueOffCIter(VMASK_.beginOff(), this); }
+    ValueOffIter   beginValueOff()       { return ValueOffIter(VMASK_.beginOff(), this); }
+    ValueAllCIter cbeginValueAll() const { return ValueAllCIter(VMASK_.beginDense(), this); }
+    ValueAllCIter  beginValueAll() const { return ValueAllCIter(VMASK_.beginDense(), this); }
+    ValueAllIter   beginValueAll()       { return ValueAllIter(VMASK_.beginDense(), this); }
+
+    ValueOnCIter  cendValueOn() const    { return ValueOnCIter(VMASK_.endOn(), this); }
+    ValueOnCIter   endValueOn() const    { return ValueOnCIter(VMASK_.endOn(), this); }
+    ValueOnIter    endValueOn()          { return ValueOnIter(VMASK_.endOn(), this); }
+    ValueOffCIter cendValueOff() const   { return ValueOffCIter(VMASK_.endOff(), this); }
+    ValueOffCIter  endValueOff() const   { return ValueOffCIter(VMASK_.endOff(), this); }
+    ValueOffIter   endValueOff()         { return ValueOffIter(VMASK_.endOff(), this); }
+    ValueAllCIter cendValueAll() const   { return ValueAllCIter(VMASK_.endDense(), this); }
+    ValueAllCIter  endValueAll() const   { return ValueAllCIter(VMASK_.endDense(), this); }
+    ValueAllIter   endValueAll()         { return ValueAllIter(VMASK_.endDense(), this); }
+
+    // The ChildOn/ChildOff begin iterators are built from the mask's end
+    // position, exactly like the matching end iterators further down, so
+    // those two ranges are always empty (presumably because a leaf has no
+    // child nodes).
+    ChildOnCIter  cbeginChildOn() const  { return ChildOnCIter(VMASK_.endOn(), this); }
+    ChildOnCIter   beginChildOn() const  { return ChildOnCIter(VMASK_.endOn(), this); }
+    ChildOnIter    beginChildOn()        { return ChildOnIter(VMASK_.endOn(), this); }
+    ChildOffCIter cbeginChildOff() const { return ChildOffCIter(VMASK_.endOff(), this); }
+    ChildOffCIter  beginChildOff() const { return ChildOffCIter(VMASK_.endOff(), this); }
+    ChildOffIter   beginChildOff()       { return ChildOffIter(VMASK_.endOff(), this); }
+    ChildAllCIter cbeginChildAll() const { return ChildAllCIter(VMASK_.beginDense(), this); }
+    ChildAllCIter  beginChildAll() const { return ChildAllCIter(VMASK_.beginDense(), this); }
+    ChildAllIter   beginChildAll()       { return ChildAllIter(VMASK_.beginDense(), this); }
+
+    ChildOnCIter  cendChildOn() const    { return ChildOnCIter(VMASK_.endOn(), this); }
+    ChildOnCIter   endChildOn() const    { return ChildOnCIter(VMASK_.endOn(), this); }
+    ChildOnIter    endChildOn()          { return ChildOnIter(VMASK_.endOn(), this); }
+    ChildOffCIter cendChildOff() const   { return ChildOffCIter(VMASK_.endOff(), this); }
+    ChildOffCIter  endChildOff() const   { return ChildOffCIter(VMASK_.endOff(), this); }
+    ChildOffIter   endChildOff()         { return ChildOffIter(VMASK_.endOff(), this); }
+    ChildAllCIter cendChildAll() const   { return ChildAllCIter(VMASK_.endDense(), this); }
+    ChildAllCIter  endChildAll() const   { return ChildAllCIter(VMASK_.endDense(), this); }
+    ChildAllIter   endChildAll()         { return ChildAllIter(VMASK_.endDense(), this); }
+#undef VMASK_
+}; // class PointDataLeafNode
+
+////////////////////////////////////////
+
+// PointDataLeafNode implementation
+
+/// Replace this leaf's attribute set with a newly constructed one built from
+/// @a descriptor and @a arrayLength. The previously held set is released.
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::initializeAttributes(const Descriptor::Ptr& descriptor, const size_t arrayLength)
+{
+    AttributeSet* const fresh = new AttributeSet(descriptor, arrayLength);
+    mAttributeSet.reset(fresh);
+}
+
+/// Replace the attribute set with a zero-length one that shares the current
+/// descriptor, reset every voxel offset to zero and, if @a updateValueMask
+/// is true, switch all voxels off.
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::clearAttributes(const bool updateValueMask)
+{
+    // the replacement is built from the current descriptor before the old set is released
+    AttributeSet* const emptied = new AttributeSet(mAttributeSet->descriptorPtr(), 0);
+    mAttributeSet.reset(emptied);
+
+    // every voxel offset goes back to zero
+    Index voxel = 0;
+    while (voxel < LeafNodeType::NUM_VALUES) {
+        this->setOffsetOnly(voxel, 0);
+        voxel++;
+    }
+
+    // optionally de-activate all voxels too
+    if (updateValueMask) {
+        this->setValuesOff();
+    }
+}
+
+/// Return @c true if @a pos is a valid index into the attribute set,
+/// i.e. it is smaller than the number of attributes.
+template<typename T, Index Log2Dim>
+inline bool
+PointDataLeafNode<T, Log2Dim>::hasAttribute(const size_t pos) const
+{
+    const size_t attributeCount = mAttributeSet->size();
+    return pos < attributeCount;
+}
+
+/// Return @c true if looking up @a attributeName in the attribute set
+/// yields a position other than AttributeSet::INVALID_POS.
+template<typename T, Index Log2Dim>
+inline bool
+PointDataLeafNode<T, Log2Dim>::hasAttribute(const Name& attributeName) const
+{
+    return mAttributeSet->find(attributeName) != AttributeSet::INVALID_POS;
+}
+
+/// Forward the append request to the owned attribute set and hand back the
+/// array pointer it returns.
+template<typename T, Index Log2Dim>
+inline AttributeArray::Ptr
+PointDataLeafNode<T, Log2Dim>::appendAttribute(const AttributeSet::Util::NameAndType& attribute,
+                     const Descriptor& expected, Descriptor::Ptr& replacement)
+{
+    AttributeSet& attributes = *mAttributeSet;
+    return attributes.appendAttribute(attribute, expected, replacement);
+}
+
+/// Forward the drop request for the attribute indices in @a pos to the owned
+/// attribute set.
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::dropAttributes(const std::vector<size_t>& pos,
+                    const Descriptor& expected, Descriptor::Ptr& replacement)
+{
+    AttributeSet& attributes = *mAttributeSet;
+    attributes.dropAttributes(pos, expected, replacement);
+}
+
+/// Forward the reorder request to the owned attribute set.
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::reorderAttributes(const Descriptor::Ptr& replacement)
+{
+    AttributeSet& attributes = *mAttributeSet;
+    attributes.reorderAttributes(replacement);
+}
+
+/// Forward the rename request to the owned attribute set.
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::renameAttributes(const Descriptor& expected, Descriptor::Ptr& replacement)
+{
+    AttributeSet& attributes = *mAttributeSet;
+    attributes.renameAttributes(expected, replacement);
+}
+
+/// Call compact() on every attribute array in the set, in index order.
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::compactAttributes()
+{
+    size_t pos = 0;
+    while (pos < mAttributeSet->size()) {
+        mAttributeSet->get(pos)->compact();
+        pos++;
+    }
+}
+
+/// Take ownership of @a attributeSet in place of the current attribute set.
+/// @throw ValueError if @a attributeSet is null or if its descriptor compares
+/// unequal to the current one; in both cases this leaf is left untouched and
+/// does not take ownership of the pointer.
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::swap(AttributeSet* attributeSet)
+{
+    if (attributeSet == NULL) {
+        OPENVDB_THROW(ValueError, "Cannot swap with a null attribute set");
+    }
+
+    const bool descriptorsDiffer = mAttributeSet->descriptor() != attributeSet->descriptor();
+    if (descriptorsDiffer) {
+        OPENVDB_THROW(ValueError, "Attribute set descriptors are not equal.");
+    }
+
+    // the previously owned set is released here
+    mAttributeSet.reset(attributeSet);
+}
+
+/// Copy @a offsets into the voxel offset values of this leaf, one per voxel.
+/// @param offsets          must contain exactly NUM_VALUES entries
+/// @param updateValueMask  if true, updateValueMask() is called afterwards
+/// @throw ValueError if the size of @a offsets differs from NUM_VALUES
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::setOffsets(const std::vector<ValueType>& offsets, const bool updateValueMask)
+{
+    if (offsets.size() != LeafNodeType::NUM_VALUES) {
+        OPENVDB_THROW(ValueError, "Offset vector size doesn't match number of voxels.");
+    }
+
+    // the size check above guarantees NUM_VALUES entries, so iterate with the
+    // Index type that setOffsetOnly() takes rather than narrowing from size_t
+    for (Index index = 0; index < LeafNodeType::NUM_VALUES; ++index) {
+        this->setOffsetOnly(index, offsets[index]);
+    }
+
+    if (updateValueMask) this->updateValueMask();
+}
+
+/// Sanity-check the voxel offsets against the attribute arrays.
+/// @throw ValueError if a voxel value is smaller than the value of the voxel
+/// before it, if two neighbouring attribute arrays differ in size, or if the
+/// last voxel value differs from the size of the first attribute array.
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::validateOffsets() const
+{
+    // no voxel offset may be smaller than the one preceding it
+    for (size_t voxel = 1; voxel < BaseLeaf::SIZE; ++voxel) {
+        const bool decreasing = this->getValue(voxel-1) > this->getValue(voxel);
+        if (decreasing) {
+            OPENVDB_THROW(ValueError, "Voxel offset values are not monotonically increasing");
+        }
+    }
+
+    // each attribute array must be as long as the one before it
+    for (size_t pos = 1; pos < mAttributeSet->size(); ++pos) {
+        const bool sizesDiffer =
+            mAttributeSet->getConst(pos-1)->size() != mAttributeSet->getConst(pos)->size();
+        if (sizesDiffer) {
+            OPENVDB_THROW(ValueError, "Attribute arrays have inconsistent length");
+        }
+    }
+
+    // with at least one attribute, the final voxel offset must equal the array length
+    if (mAttributeSet->size() > 0) {
+        if (this->getValue(BaseLeaf::SIZE-1) != mAttributeSet->getConst(0)->size()) {
+            OPENVDB_THROW(ValueError, "Last voxel offset value does not match attribute array length");
+        }
+    }
+}
+
+/// Mutable access to the attribute array at index @a pos.
+/// @throw LookupError if @a pos is not smaller than the number of attributes
+template<typename T, Index Log2Dim>
+inline AttributeArray&
+PointDataLeafNode<T, Log2Dim>::attributeArray(const size_t pos)
+{
+    if (pos >= mAttributeSet->size()) {
+        OPENVDB_THROW(LookupError, "Attribute Out Of Range - " << pos);
+    }
+    AttributeArray* const array = mAttributeSet->get(pos);
+    return *array;
+}
+
+/// Read-only access to the attribute array at index @a pos.
+/// @throw LookupError if @a pos is not smaller than the number of attributes
+template<typename T, Index Log2Dim>
+inline const AttributeArray&
+PointDataLeafNode<T, Log2Dim>::attributeArray(const size_t pos) const
+{
+    if (pos >= mAttributeSet->size()) {
+        OPENVDB_THROW(LookupError, "Attribute Out Of Range - " << pos);
+    }
+    const AttributeArray* const array = mAttributeSet->getConst(pos);
+    return *array;
+}
+
+/// Explicitly named read-only accessor; delegates to the const overload of
+/// attributeArray() for index @a pos.
+template<typename T, Index Log2Dim>
+inline const AttributeArray&
+PointDataLeafNode<T, Log2Dim>::constAttributeArray(const size_t pos) const
+{
+    const PointDataLeafNode& self = *this;
+    return self.attributeArray(pos);
+}
+
+/// Mutable access to the attribute array named @a attributeName.
+/// @throw LookupError if the name is not found in the attribute set
+template<typename T, Index Log2Dim>
+inline AttributeArray&
+PointDataLeafNode<T, Log2Dim>::attributeArray(const Name& attributeName)
+{
+    const size_t found = mAttributeSet->find(attributeName);
+    if (found == AttributeSet::INVALID_POS) {
+        OPENVDB_THROW(LookupError, "Attribute Not Found - " << attributeName);
+    }
+    AttributeArray* const array = mAttributeSet->get(found);
+    return *array;
+}
+
+/// Read-only access to the attribute array named @a attributeName.
+/// @throw LookupError if the name is not found in the attribute set
+template<typename T, Index Log2Dim>
+inline const AttributeArray&
+PointDataLeafNode<T, Log2Dim>::attributeArray(const Name& attributeName) const
+{
+    const size_t found = mAttributeSet->find(attributeName);
+    if (found == AttributeSet::INVALID_POS) {
+        OPENVDB_THROW(LookupError, "Attribute Not Found - " << attributeName);
+    }
+    const AttributeArray* const array = mAttributeSet->getConst(found);
+    return *array;
+}
+
+/// Explicitly named read-only accessor; delegates to the const overload of
+/// attributeArray() for the name @a attributeName.
+template<typename T, Index Log2Dim>
+inline const AttributeArray&
+PointDataLeafNode<T, Log2Dim>::constAttributeArray(const Name& attributeName) const
+{
+    const PointDataLeafNode& self = *this;
+    return self.attributeArray(attributeName);
+}
+
+/// Build a read-only group handle from a group index pair: @c index.first
+/// selects the attribute array and @c index.second is passed on to the handle.
+/// The array is only checked to be a group array by an assert.
+template<typename T, Index Log2Dim>
+inline GroupHandle
+PointDataLeafNode<T, Log2Dim>::groupHandle(const AttributeSet::Descriptor::GroupIndex& index) const
+{
+    const AttributeArray& array = this->attributeArray(index.first);
+    assert(GroupAttributeArray::isGroup(array));
+
+    return GroupHandle(GroupAttributeArray::cast(array), index.second);
+}
+
+/// Build a read-only group handle for the group called @a name by resolving
+/// the name to a group index through the attribute set.
+template<typename T, Index Log2Dim>
+inline GroupHandle
+PointDataLeafNode<T, Log2Dim>::groupHandle(const Name& name) const
+{
+    const AttributeSet& attributes = this->attributeSet();
+    const AttributeSet::Descriptor::GroupIndex resolved = attributes.groupIndex(name);
+    return this->groupHandle(resolved);
+}
+
+/// Build a read-write group handle from a group index pair: @c index.first
+/// selects the attribute array and @c index.second is passed on to the handle.
+/// The array is only checked to be a group array by an assert.
+template<typename T, Index Log2Dim>
+inline GroupWriteHandle
+PointDataLeafNode<T, Log2Dim>::groupWriteHandle(const AttributeSet::Descriptor::GroupIndex& index)
+{
+    AttributeArray& array = this->attributeArray(index.first);
+    assert(GroupAttributeArray::isGroup(array));
+
+    return GroupWriteHandle(GroupAttributeArray::cast(array), index.second);
+}
+
+/// Build a read-write group handle for the group called @a name by resolving
+/// the name to a group index through the attribute set.
+template<typename T, Index Log2Dim>
+inline GroupWriteHandle
+PointDataLeafNode<T, Log2Dim>::groupWriteHandle(const Name& name)
+{
+    const AttributeSet& attributes = this->attributeSet();
+    const AttributeSet::Descriptor::GroupIndex resolved = attributes.groupIndex(name);
+    return this->groupWriteHandle(resolved);
+}
+
+/// Index iterator over the whole leaf: it runs from zero to the value stored
+/// in the last voxel.
+template<typename T, Index Log2Dim>
+inline IndexIter
+PointDataLeafNode<T, Log2Dim>::beginIndexAll() const
+{
+    const ValueType last = this->getValue(NUM_VOXELS - 1);
+    const ValueType first = 0;
+    return IndexIter(first, last);
+}
+
+template<typename T, Index Log2Dim>
+inline typename PointDataLeafNode<T, Log2Dim>::IndexOnIter
+PointDataLeafNode<T, Log2Dim>::beginIndexOn() const
+{
+    ValueOnCIter iter = this->cbeginValueOn();  // iterator obtained from cbeginValueOn() ...
+    return IndexOnIter(iter);                   // ... wrapped into the IndexOnIter type
+}
+
+template<typename T, Index Log2Dim>
+inline typename PointDataLeafNode<T, Log2Dim>::IndexOffIter
+PointDataLeafNode<T, Log2Dim>::beginIndexOff() const
+{
+    ValueOffCIter iter = this->cbeginValueOff();    // iterator obtained from cbeginValueOff() ...
+    return IndexOffIter(iter);                      // ... wrapped into the IndexOffIter type
+}
+
+template<typename T, Index Log2Dim>
+inline IndexIter
+PointDataLeafNode<T, Log2Dim>::beginIndex(const unsigned index) const
+{
+    assert(index < BaseLeaf::SIZE);
+    // the range starts at the previous offset's value, or at 0 for the very first offset
+    const ValueType first = (index > 0) ? this->getValue(index - 1) : ValueType(0);
+    return IndexIter(first, this->getValue(index));
+}
+
+template<typename T, Index Log2Dim>
+inline IndexIter
+PointDataLeafNode<T, Log2Dim>::beginIndex(const Coord& ijk) const
+{
+    return this->beginIndex(LeafNodeType::coordToOffset(ijk)); // defer to the offset-based overload
+}
+
+template<typename T, Index Log2Dim>
+inline Index64
+PointDataLeafNode<T, Log2Dim>::pointCount() const
+{
+    return iterCount(this->beginIndexAll()); // iterCount() applied to the full index range
+}
+
+template<typename T, Index Log2Dim>
+inline Index64
+PointDataLeafNode<T, Log2Dim>::onPointCount() const
+{
+    // shortcuts: an empty leaf reports 0, a dense leaf reports its full point count
+    if (this->isEmpty())    return 0;
+    return this->isDense() ? this->pointCount() : iterCount(this->beginIndexOn());
+}
+
+template<typename T, Index Log2Dim>
+inline Index64
+PointDataLeafNode<T, Log2Dim>::offPointCount() const
+{
+    // shortcuts: an empty leaf reports its full point count, a dense leaf reports 0
+    if (this->isEmpty())    return this->pointCount();
+    return this->isDense() ? Index64(0) : iterCount(this->beginIndexOff());
+}
+
+template<typename T, Index Log2Dim>
+inline Index64
+PointDataLeafNode<T, Log2Dim>::groupPointCount(const Name& groupName) const
+{
+    IndexIter indexIter = this->beginIndexAll(); // start from the full index range of this leaf
+    GroupFilter filter(GroupFilter::create(*this, GroupFilter::Data(groupName))); // filter built for groupName
+    FilterIndexIter<IndexIter, GroupFilter> filterIndexIter(indexIter, filter); // range combined with the filter
+    return iterCount(filterIndexIter);
+}
+
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::updateValueMask()
+{
+    ValueType previous = 0;
+    for (Index offset = 0; offset < LeafNodeType::NUM_VALUES; ++offset) {
+        ValueType current = this->getValue(offset);
+        this->setValueMask(offset, (current - previous) > 0); // compared with the previous offset's value
+        previous = current;
+    }
+}
+
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::setOffsetOn(Index offset, const ValueType& val)
+{
+    this->buffer().setValue(offset, val);   // store the value directly in the buffer ...
+    this->setValueMaskOn(offset);           // ... and switch the value mask on for that offset
+}
+
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::setOffsetOnly(Index offset, const ValueType& val)
+{
+    this->buffer().setValue(offset, val); // unlike setOffsetOn(), the value mask is not touched
+}
+
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::readTopology(std::istream& is, bool fromHalf)
+{
+    BaseLeaf::readTopology(is, fromHalf); // pure pass-through to the base leaf
+}
+
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::writeTopology(std::ostream& os, bool toHalf) const
+{
+    BaseLeaf::writeTopology(os, toHalf); // pure pass-through to the base leaf
+}
+
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::readBuffers(std::istream& is, bool fromHalf)
+{
+    BaseLeaf::readBuffers(is, fromHalf);
+    // the attribute set is read from the same stream, directly after the base leaf buffers
+    mAttributeSet->read(is);
+}
+
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::readBuffers(std::istream& is, const CoordBBox& bbox, bool fromHalf)
+{
+    // Read and clip voxel values (no clipping yet).
+    BaseLeaf::readBuffers(is, bbox, fromHalf);
+    // the attribute set follows in the stream; bbox is not passed on to it
+    mAttributeSet->read(is);
+}
+
+template<typename T, Index Log2Dim>
+inline void
+PointDataLeafNode<T, Log2Dim>::writeBuffers(std::ostream& os, bool toHalf) const
+{
+    BaseLeaf::writeBuffers(os, toHalf);
+    // same order as readBuffers(): base leaf buffers first, then the attribute set
+    mAttributeSet->write(os);
+}
+
+template<typename T, Index Log2Dim>
+inline Index64
+PointDataLeafNode<T, Log2Dim>::memUsage() const
+{
+    return BaseLeaf::memUsage() + mAttributeSet->memUsage(); // base leaf plus its attribute set
+}
+
+} // namespace tools
+
+////////////////////////////////////////
+
+namespace tree
+{
+
+/// Specialization of the helper metafunction used to implement LeafNode::SameConfiguration (which,
+/// as an inner class, can't be independently specialized): true for PointDataLeafNode<T2, Dim1>
+template<Index Dim1, typename T2>
+struct SameLeafConfig<Dim1, tools::PointDataLeafNode<T2, Dim1> > { static const bool value = true; };
+
+} // namespace tree
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+
+#endif // OPENVDB_TOOLS_POINT_DATA_GRID_HAS_BEEN_INCLUDED
+
+
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_points/tools/PointGroup.h b/nuparu/include/openvdb_points/tools/PointGroup.h
new file mode 100644
index 00000000..413fc6a2
--- /dev/null
+++ b/nuparu/include/openvdb_points/tools/PointGroup.h
@@ -0,0 +1,671 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of Double Negative Visual Effects nor the names
+// of its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @author Dan Bailey
+///
+/// @file PointGroup.h
+///
+/// @brief  Point group manipulation in a VDB Point Grid.
+///
+
+
+#ifndef OPENVDB_TOOLS_POINT_GROUP_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_POINT_GROUP_HAS_BEEN_INCLUDED
+
+#include <openvdb/openvdb.h>
+
+#include <openvdb_points/tools/AttributeSet.h>
+#include <openvdb_points/tools/PointDataGrid.h>
+#include <openvdb_points/tools/PointAttribute.h>
+
+#include <boost/ptr_container/ptr_vector.hpp>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+/// @brief Appends a new empty group to the VDB tree.
+///
+/// @param tree          the PointDataTree to be appended to.
+/// @param group         name of the new group.
+template <typename PointDataTree>
+inline void appendGroup(PointDataTree& tree,
+                        const Name& group);
+
+/// @brief Appends new empty groups to the VDB tree.
+///
+/// @param tree          the PointDataTree to be appended to.
+/// @param groups        names of the new groups.
+template <typename PointDataTree>
+inline void appendGroups(PointDataTree& tree,
+                         const std::vector<Name>& groups);
+
+/// @brief Drops an existing group from the VDB tree.
+///
+/// @param tree          the PointDataTree to be dropped from.
+/// @param group         name of the group.
+/// @param compact       compact attributes if possible to reduce memory - if dropping
+///                      more than one group, compacting once at the end will be faster
+template <typename PointDataTree>
+inline void dropGroup(  PointDataTree& tree,
+                        const Name& group,
+                        const bool compact = true);
+
+/// @brief Drops existing groups from the VDB tree, the tree is compacted after dropping.
+///
+/// @param tree          the PointDataTree to be dropped from.
+/// @param groups        names of the groups.
+template <typename PointDataTree>
+inline void dropGroups( PointDataTree& tree,
+                        const std::vector<Name>& groups);
+
+/// @brief Drops all existing groups from the VDB tree, the tree is compacted after dropping.
+///
+/// @param tree          the PointDataTree to be dropped from.
+template <typename PointDataTree>
+inline void dropGroups( PointDataTree& tree);
+
+/// @brief Compacts existing groups of a VDB Tree to use less memory if possible.
+///
+/// @param tree          the PointDataTree to be compacted.
+template <typename PointDataTree>
+inline void compactGroups(PointDataTree& tree);
+
+/// @brief Sets group membership from a PointIndexTree-ordered vector.
+///
+/// @param tree          the PointDataTree.
+/// @param indexTree     the PointIndexTree.
+/// @param membership    @c true if the point is in the group.
+/// @param group         the name of the group.
+/// @param remove        if @c true, points whose membership entry is @c false are also removed from the group.
+template <typename PointDataTree, typename PointIndexTree>
+inline void setGroup(   PointDataTree& tree,
+                        const PointIndexTree& indexTree,
+                        const std::vector<bool>& membership,
+                        const Name& group,
+                        const bool remove = false);
+
+
+////////////////////////////////////////
+
+
+namespace point_group_internal {
+
+
+/// Functor run with tbb::parallel_for over a leaf range: for every point index of each
+/// leaf, copies the membership flag stored at the source group index to the target one.
+template<typename PointDataTreeType>
+struct CopyGroupOp {
+
+    typedef typename tree::LeafManager<PointDataTreeType>       LeafManagerT;
+    typedef typename LeafManagerT::LeafRange                    LeafRangeT;
+    typedef AttributeSet::Descriptor::NameAndType               NameAndType;
+    typedef AttributeSet::Descriptor::GroupIndex                GroupIndex;
+
+    /// @param tree         kept by reference; operator() itself only uses the leaf range
+    /// @param targetIndex  group index that is written
+    /// @param sourceIndex  group index that is read
+    CopyGroupOp(PointDataTreeType& tree,
+                const GroupIndex& targetIndex,
+                const GroupIndex& sourceIndex)
+        : mTree(tree), mTargetIndex(targetIndex), mSourceIndex(sourceIndex) { }
+
+    void operator()(const LeafRangeT& range) const {
+        for (typename LeafRangeT::Iterator leaf = range.begin(); leaf; ++leaf) {
+            // read-only handle on the source group, writable handle on the target group
+            GroupHandle source = leaf->groupHandle(mSourceIndex);
+            GroupWriteHandle target = leaf->groupWriteHandle(mTargetIndex);
+
+            for (IndexIter point = leaf->beginIndexAll(); point; ++point) {
+                target.set(*point, source.get(*point));
+            }
+        }
+    }
+
+    //////////
+
+    PointDataTreeType&      mTree;
+    const GroupIndex        mTargetIndex;
+    const GroupIndex        mSourceIndex;
+};
+
+
+/// Functor run with tbb::parallel_for over a leaf range that collapses the group at the
+/// given index to the compile-time value @c Member in every leaf of the range.
+template <typename PointDataTree, bool Member>
+struct SetGroupOp
+{
+    typedef typename tree::LeafManager<PointDataTree>   LeafManagerT;
+    typedef AttributeSet::Descriptor::GroupIndex        GroupIndex;
+
+    /// @param index  group index handed to each leaf's groupWriteHandle()
+    SetGroupOp(const GroupIndex& index)
+        : mIndex(index) { }
+
+    void operator()(const typename LeafManagerT::LeafRange& range) const
+    {
+        typedef typename LeafManagerT::LeafRange::Iterator LeafIterT;
+
+        for (LeafIterT leaf = range.begin(); leaf; ++leaf) {
+            // collapse the whole group of this leaf to the Member flag
+            GroupWriteHandle handle(leaf->groupWriteHandle(mIndex));
+            handle.collapse(Member);
+        }
+    }
+
+    //////////
+
+    /// index of the group being set
+    const GroupIndex        mIndex;
+}; // struct SetGroupOp
+
+
+/// Functor run with tbb::parallel_for over a leaf range that assigns group membership
+/// from a membership vector, addressed through the indices of the PointIndexTree leaf
+/// sharing each leaf's origin. With @c Remove every visited point is overwritten with its
+/// membership value; without it only points whose membership is @c true are switched on.
+template <typename PointDataTree, typename PointIndexTree, bool Remove>
+struct SetGroupFromIndexOp
+{
+    typedef typename tree::LeafManager<PointDataTree>   LeafManagerT;
+    typedef typename LeafManagerT::LeafRange            LeafRangeT;
+    typedef typename PointIndexTree::LeafNodeType       PointIndexLeafNode;
+    typedef typename PointIndexLeafNode::IndexArray     IndexArray;
+    typedef AttributeSet::Descriptor::GroupIndex        GroupIndex;
+    typedef std::vector<bool>                           BoolArray;
+
+    /// @param indexTree   held by reference; probed with the origin of every visited leaf
+    /// @param membership  held by reference; addressed by the values in the index arrays
+    /// @param index       group index handed to each leaf's groupWriteHandle()
+    SetGroupFromIndexOp(const PointIndexTree& indexTree,
+                        const BoolArray& membership,
+                        const GroupIndex& index)
+        : mIndexTree(indexTree), mMembership(membership), mIndex(index) { }
+
+    void operator()(const LeafRangeT& range) const
+    {
+        for (typename LeafRangeT::Iterator leaf = range.begin(); leaf; ++leaf) {
+
+            // find the index leaf at this leaf's origin; leaves without one are skipped
+
+            const PointIndexLeafNode* indexLeaf = mIndexTree.probeConstLeaf(leaf->origin());
+            if (!indexLeaf)     continue;
+
+            // writable handle onto the group of this leaf
+
+            GroupWriteHandle group(leaf->groupWriteHandle(mIndex));
+            const IndexArray& indices = indexLeaf->indices();
+
+            // n is the position within the index array and addresses the group;
+            // the stored value *it addresses the membership vector
+
+            Index64 n = 0;
+            for (typename IndexArray::const_iterator it = indices.begin(),
+                                                     itEnd = indices.end(); it != itEnd; ++it, ++n)
+            {
+                const bool member = mMembership.at(*it);
+
+                // Remove: write on and off alike; otherwise only ever switch points on
+
+                if (Remove || member)   group.set(n, member);
+            }
+        }
+    }
+
+    //////////
+
+    const PointIndexTree& mIndexTree;
+    const BoolArray& mMembership;
+    const GroupIndex mIndex;
+}; // struct SetGroupFromIndexOp
+
+
+////////////////////////////////////////
+
+
+/// Read-only helper for inspecting how groups are laid out in an AttributeSet
+class GroupInfo
+{
+public:
+    typedef AttributeSet::Descriptor Descriptor;
+
+    /// @param attributeSet  held by reference, so it has to outlive this object
+    GroupInfo(const AttributeSet& attributeSet)
+        : mAttributeSet(attributeSet) { }
+
+    /// Number of group slots held by one group attribute array, i.e. the bit
+    /// width of GroupType (typically 8)
+    static size_t groupBits() { return sizeof(GroupType) * CHAR_BIT; }
+
+    /// Number of free group slots, i.e. how many more groups fit without adding another
+    /// group attribute array; 0 when there is no group attribute array at all
+    size_t unusedGroups() const
+    {
+        const size_t groupArrays = mAttributeSet.size(AttributeArray::GROUP);
+
+        if (groupArrays == 0)   return 0;
+
+        // one slot exists per bit of every group attribute array ...
+
+        const size_t slotCount = groupArrays * this->groupBits();
+
+        // ... and each entry of the descriptor's group map occupies one of them
+
+        return slotCount - mAttributeSet.descriptor().groupMap().size();
+    }
+
+    /// Return @c true when at least one array's worth of group slots is unused
+    bool canCompactGroups() const
+    {
+        const size_t slotsPerArray = this->groupBits();
+
+        return this->unusedGroups() >= slotsPerArray;
+    }
+
+    /// Return the lowest group offset that no group is currently mapped to
+    size_t nextUnusedOffset() const
+    {
+        const Descriptor::NameToPosMap& groupMap = mAttributeSet.descriptor().groupMap();
+
+        // gather the offsets in use and put them in ascending order
+
+        std::vector<size_t> used;
+        for (Descriptor::ConstIterator  it = groupMap.begin(),
+                                        itEnd = groupMap.end(); it != itEnd; ++it) {
+            used.push_back(it->second);
+        }
+
+        std::sort(used.begin(), used.end());
+
+        // count up from zero for as long as the sorted offsets keep pace with the counter
+
+        size_t candidate = 0;
+        while (candidate < used.size() && used[candidate] == candidate) {
+            ++candidate;
+        }
+
+        return candidate;
+    }
+
+    /// Append to @p indices the position of every attribute array that is a group
+    /// array; anything already stored in @p indices is kept
+    void populateGroupIndices(std::vector<size_t>& indices) const
+    {
+        const Descriptor::NameToPosMap& attributes = mAttributeSet.descriptor().map();
+
+        for (Descriptor::ConstIterator  it = attributes.begin(),
+                                        itEnd = attributes.end(); it != itEnd; ++it) {
+
+            // it->first is the attribute name, it->second its position
+
+            const AttributeArray* array = mAttributeSet.getConst(it->first);
+            if (!GroupAttributeArray::isGroup(*array))  continue;
+
+            indices.push_back(it->second);
+        }
+    }
+
+    /// Look for a group to move into the first unused offset. @p targetOffset is always
+    /// set to that offset; @p sourceName and @p sourceOffset are only written when a
+    /// group at or beyond it is found, in which case @c true is returned
+    bool requiresMove(Name& sourceName, size_t& sourceOffset, size_t& targetOffset) const {
+
+        typedef Descriptor::NameToPosMap::const_reverse_iterator ReverseMapIterator;
+
+        targetOffset = this->nextUnusedOffset();
+
+        const Descriptor::NameToPosMap& groupMap = mAttributeSet.descriptor().groupMap();
+
+        // walk the map back to front; the first group not below the target is reported
+
+        for (ReverseMapIterator it = groupMap.rbegin(); it != groupMap.rend(); ++it) {
+
+            if (it->second < targetOffset)  continue;
+
+            sourceName = it->first;
+            sourceOffset = it->second;
+            return true;
+        }
+
+        return false;
+    }
+
+private:
+    const AttributeSet& mAttributeSet;
+}; // class GroupInfo
+
+
+} // namespace point_group_internal
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataTree>
+inline void appendGroup(PointDataTree& tree, const Name& group)
+{
+    typedef AttributeSet::Descriptor                              Descriptor;
+    typedef AttributeSet::Util::NameAndType                       NameAndType;
+    typedef typename tree::template LeafManager<PointDataTree>    LeafManagerT;
+
+    using point_attribute_internal::AppendAttributeOp;
+    using point_group_internal::GroupInfo;
+
+    // an empty name is rejected as a group key
+
+    if (group.empty()) {
+        OPENVDB_THROW(KeyError, "Cannot use an empty group name as a key.");
+    }
+
+    // the first leaf supplies the attribute set and descriptor; a tree without
+    // leaves is left untouched
+
+    typename PointDataTree::LeafCIter firstLeaf = tree.cbeginLeaf();
+    if (!firstLeaf)     return;
+
+    const AttributeSet& attributeSet = firstLeaf->attributeSet();
+    Descriptor::Ptr descriptor = attributeSet.descriptorPtr();
+
+    // nothing to do when the group is already present
+
+    if (descriptor->hasGroup(group))    return;
+
+    GroupInfo groupInfo(attributeSet);
+
+    if (groupInfo.unusedGroups() == 0) {
+
+        // every slot is taken: append one more group attribute array, under a unique
+        // internal name, to all leaves using a duplicated descriptor
+
+        const NameAndType groupAttribute(descriptor->uniqueName("__group"),
+                                         GroupAttributeArray::attributeType());
+        descriptor = descriptor->duplicateAppend(groupAttribute);
+
+        AppendAttributeOp<PointDataTree> append(tree, groupAttribute, descriptor,
+                                                /*hidden=*/false, /*transient=*/false, /*group=*/true);
+        tbb::parallel_for(LeafManagerT(tree).leafRange(), append);
+    }
+
+    // by now a free slot has to exist
+
+    assert(groupInfo.unusedGroups() > 0);
+
+    // map the group name onto the first free offset
+
+    const size_t offset = groupInfo.nextUnusedOffset();
+    descriptor->setGroup(group, offset);
+}
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataTree>
+inline void appendGroups(PointDataTree& tree,
+                         const std::vector<Name>& groups)
+{
+    // TODO: appending all groups in a single pass could be more efficient than
+    // going one-by-one, however this is likely not that common a use case
+
+    typedef std::vector<Name>::const_iterator NameIter;
+    for (NameIter name = groups.begin(); name != groups.end(); ++name) {
+        appendGroup(tree, *name);
+    }
+}
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataTree>
+inline void dropGroup(PointDataTree& tree, const Name& group, const bool compact)
+{
+    if (group.empty()) {
+        OPENVDB_THROW(KeyError, "Cannot use an empty group name as a key.");
+    }
+
+    // a tree without leaves has no descriptor to drop the group from
+
+    typename PointDataTree::LeafCIter firstLeaf = tree.cbeginLeaf();
+    if (!firstLeaf)     return;
+
+    // drop the group from the descriptor reached through the first leaf
+
+    AttributeSet::Descriptor::Ptr descriptor = firstLeaf->attributeSet().descriptorPtr();
+    descriptor->dropGroup(group);
+
+    // compaction is optional so that a caller dropping several groups can compact once
+    if (compact) {
+        compactGroups(tree);
+    }
+}
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataTree>
+inline void dropGroups( PointDataTree& tree,
+                        const std::vector<Name>& groups)
+{
+    // drop every listed group without compacting in between ...
+    typedef std::vector<Name>::const_iterator NameIter;
+    for (NameIter name = groups.begin(); name != groups.end(); ++name) {
+        dropGroup(tree, *name, /*compact=*/false);
+    }
+
+    // ... and compact a single time at the end, for efficiency
+    compactGroups(tree);
+}
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataTree>
+inline void dropGroups( PointDataTree& tree)
+{
+    using point_group_internal::GroupInfo;
+
+    // nothing to drop for a tree without leaves
+
+    typename PointDataTree::LeafCIter firstLeaf = tree.cbeginLeaf();
+    if (!firstLeaf)     return;
+
+    const AttributeSet& attributeSet = firstLeaf->attributeSet();
+
+    // clear every group mapping held by the descriptor
+
+    AttributeSet::Descriptor::Ptr descriptor = attributeSet.descriptorPtr();
+    descriptor->clearGroups();
+
+    // collect the positions of all group attribute arrays ...
+
+    std::vector<size_t> groupArrayIndices;
+    GroupInfo(attributeSet).populateGroupIndices(groupArrayIndices);
+
+    // ... and drop those arrays from the tree
+
+    dropAttributes(tree, groupArrayIndices);
+}
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataTree>
+inline void compactGroups(PointDataTree& tree)
+{
+    typedef AttributeSet::Descriptor                              Descriptor;
+    typedef Descriptor::GroupIndex                                GroupIndex;
+    typedef typename tree::template LeafManager<PointDataTree>    LeafManagerT;
+
+    using point_group_internal::CopyGroupOp;
+    using point_group_internal::GroupInfo;
+
+    // a tree without leaves has no groups to compact
+
+    typename PointDataTree::LeafCIter firstLeaf = tree.cbeginLeaf();
+    if (!firstLeaf)     return;
+
+    const AttributeSet& attributeSet = firstLeaf->attributeSet();
+    Descriptor::Ptr descriptor = attributeSet.descriptorPtr();
+    GroupInfo groupInfo(attributeSet);
+
+    // bail out unless at least one whole group attribute array could be freed
+
+    if (!groupInfo.canCompactGroups())    return;
+
+    // relocate groups one at a time into the lowest free offset until no move is required
+    // TODO: improve this algorithm to move multiple groups per array at once
+    // though this is likely not that common a use case
+
+    Name sourceName;
+    size_t sourceOffset, targetOffset;
+
+    while (groupInfo.requiresMove(sourceName, sourceOffset, targetOffset)) {
+        // copy the membership data from the source slot to the target slot in every leaf
+        const GroupIndex source = attributeSet.groupIndex(sourceOffset);
+        const GroupIndex target = attributeSet.groupIndex(targetOffset);
+
+        CopyGroupOp<PointDataTree> copy(tree, target, source);
+        tbb::parallel_for(LeafManagerT(tree).leafRange(), copy);
+        // then point the group name at its new offset
+        descriptor->setGroup(sourceName, targetOffset);
+    }
+
+    // drop the trailing entries of the group array list, one per fully unused array
+
+    std::vector<size_t> indices;
+    groupInfo.populateGroupIndices(indices);
+
+    const size_t totalAttributesToDrop = groupInfo.unusedGroups() / groupInfo.groupBits();
+    assert(totalAttributesToDrop <= indices.size());
+
+    std::vector<size_t> indicesToDrop(indices.end() - totalAttributesToDrop, indices.end());
+    dropAttributes(tree, indicesToDrop);
+}
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataTree, typename PointIndexTree>
+inline void setGroup(   PointDataTree& tree,
+                        const PointIndexTree& indexTree,
+                        const std::vector<bool>& membership,
+                        const Name& group,
+                        const bool remove)
+{
+    typedef AttributeSet::Descriptor Descriptor;
+    typedef typename tree::template LeafManager<PointDataTree> LeafManagerT;
+
+    using point_group_internal::SetGroupFromIndexOp;
+
+    // one membership entry is required per point in the tree (LookupError otherwise)
+    if (membership.size() != pointCount(tree)) {
+        OPENVDB_THROW(LookupError, "Membership vector size must match number of points.");
+    }
+
+    // the first leaf supplies the attribute set; a tree without leaves is left untouched
+    typename PointDataTree::LeafCIter iter = tree.cbeginLeaf();
+    if (!iter)  return;
+
+    const AttributeSet& attributeSet = iter->attributeSet();
+    const Descriptor& descriptor = attributeSet.descriptor();
+
+    if (!descriptor.hasGroup(group)) {
+        OPENVDB_THROW(LookupError, "Group must exist on Tree before defining membership.");
+    }
+
+    const Descriptor::GroupIndex index = attributeSet.groupIndex(group);
+
+    // The op's Remove flag must mirror the remove argument (it used to be inverted):
+    // with Remove set, points whose entry is false are switched off as well; without
+    // it only points whose entry is true are switched on and all others are kept as is
+    if (remove) {
+        SetGroupFromIndexOp<PointDataTree, PointIndexTree, true> set(indexTree, membership, index);
+        tbb::parallel_for(LeafManagerT(tree).leafRange(), set);
+    }
+    else {
+        SetGroupFromIndexOp<PointDataTree, PointIndexTree, false> set(indexTree, membership, index);
+        tbb::parallel_for(LeafManagerT(tree).leafRange(), set);
+    }
+}
+
+
+////////////////////////////////////////
+
+
+template <typename PointDataTree>
+inline void setGroup(   PointDataTree& tree,
+                        const Name& group,
+                        const bool member = true)
+{
+    typedef AttributeSet::Descriptor::GroupIndex                GroupIndex;
+    typedef typename tree::template LeafManager<PointDataTree>  LeafManagerT;
+
+    using point_group_internal::SetGroupOp;
+
+    // a tree without leaves has no group to set
+
+    typename PointDataTree::LeafCIter firstLeaf = tree.cbeginLeaf();
+    if (!firstLeaf)     return;
+
+    const AttributeSet& attributeSet = firstLeaf->attributeSet();
+
+    if (!attributeSet.descriptor().hasGroup(group)) {
+        OPENVDB_THROW(LookupError, "Group must exist on Tree before defining membership.");
+    }
+
+    const GroupIndex index = attributeSet.groupIndex(group);
+
+    // Member is a template argument, so the runtime flag picks between two instantiations
+
+    if (member)     tbb::parallel_for(LeafManagerT(tree).leafRange(), SetGroupOp<PointDataTree, true>(index));
+    else            tbb::parallel_for(LeafManagerT(tree).leafRange(), SetGroupOp<PointDataTree, false>(index));
+}
+
+
+////////////////////////////////////////
+
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+
+#endif // OPENVDB_TOOLS_POINT_GROUP_HAS_BEEN_INCLUDED
+
+
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_points/tools/PointLoad.h b/nuparu/include/openvdb_points/tools/PointLoad.h
new file mode 100644
index 00000000..65ebaaca
--- /dev/null
+++ b/nuparu/include/openvdb_points/tools/PointLoad.h
@@ -0,0 +1,161 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of Double Negative Visual Effects nor the names
+// of its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+//
+/// @author Dan Bailey
+///
+/// @file PointLoad.h
+///
+/// @brief  Various point loading methods using a VDB Point Grid.
+///
+
+
+#ifndef OPENVDB_TOOLS_POINT_LOAD_HAS_BEEN_INCLUDED
+#define OPENVDB_TOOLS_POINT_LOAD_HAS_BEEN_INCLUDED
+
+#include <openvdb/openvdb.h>
+
+#include <openvdb_points/tools/AttributeSet.h>
+#include <openvdb_points/tools/PointDataGrid.h>
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+namespace tools {
+
+
+/// @brief Loads all leaf node voxel data in the given grid.
+///
+/// @param grid  the Grid to be loaded.
+/// @note This method wraps readNonresidentBuffers().
+template <typename PointDataGridT>
+void loadPoints(PointDataGridT& grid);
+
+
+/// @brief Loads all leaf node voxel data in the given grid that
+/// overlap with mask grid leaf nodes.
+///
+/// @param grid  the Grid to be loaded.
+/// @param mask  the mask to denote region of points to load
+template <typename PointDataGridT, typename MaskGridT>
+void loadPoints(PointDataGridT& grid, const MaskGridT& mask);
+
+
+/// @brief Load the leaf node voxel data in the given grid that
+/// overlap with a world-space bounding box.
+///
+/// @param grid  the Grid to be loaded.
+/// @param bbox  the bbox to denote region of points to load
+///
+/// @note Does not clip to the bounding box, leaf nodes with any
+/// overlap will be loaded.
+template <typename PointDataGridT>
+void loadPoints(PointDataGridT& grid, const BBoxd& bbox);
+
+
+////////////////////////////////////////
+
+
+#ifndef OPENVDB_2_ABI_COMPATIBLE
+template <typename PointDataGridT>
+void loadPoints(PointDataGridT& grid)
+{
+    grid.constTree().readNonresidentBuffers(); // the whole load is delegated to the tree
+}
+#else // OPENVDB_2_ABI_COMPATIBLE
+template <typename PointDataGridT>
+void loadPoints(PointDataGridT&)
+{
+    // out-of-core not supported with ABI 2, so this variant intentionally does nothing
+}
+#endif // OPENVDB_2_ABI_COMPATIBLE
+
+
+template <typename PointDataGridT, typename MaskGridT>
+void loadPoints(PointDataGridT& grid, const MaskGridT& mask)
+{
+    typedef typename PointDataGridT::TreeType           PointDataTreeT;
+    typedef typename PointDataTreeT::LeafNodeType       PointDataLeafT;
+    typedef typename MaskGridT::TreeType::LeafCIter     MaskLeafCIterT;
+
+    tree::ValueAccessor<const PointDataTreeT> accessor(grid.constTree());
+
+    // visit each mask leaf and probe for the point leaf at the same origin
+
+    for (MaskLeafCIterT maskLeaf = mask.constTree().cbeginLeaf(); maskLeaf; ++maskLeaf) {
+
+        const PointDataLeafT* pointLeaf = accessor.probeConstLeaf(maskLeaf->origin());
+
+        // load out of core leaf nodes (mask leaves without a point leaf are skipped)
+        if (pointLeaf && pointLeaf->buffer().isOutOfCore())     pointLeaf->buffer().data();
+    }
+}
+
+
+template <typename PointDataGridT>
+void loadPoints(PointDataGridT& grid, const BBoxd& bbox)
+{
+    typedef typename PointDataGridT::template ValueConverter<bool>::Type BoolGridT;
+
+    // MaskGrid introduced in OpenVDB 3.2
+    typedef BoolGrid MaskType;
+
+    // Map the world-space box into the grid's index space and floor it to coordinates.
+    Vec3d indexMin, indexMax;
+    math::calculateBounds(grid.constTransform(), bbox.min(), bbox.max(), indexMin, indexMax);
+    CoordBBox indexRegion(Coord::floor(indexMin), Coord::floor(indexMax));
+
+    // Boolean grid that is active and true inside the index-space region only.
+    BoolGridT regionMask(/*background=*/false);
+    regionMask.fill(indexRegion, /*value=*/true, /*active=*/true);
+
+    // Start from the topology of the point grid, then keep only what overlaps the region.
+    MaskType::Ptr loadMask = MaskType::create(/*background=*/false);
+    loadMask->topologyUnion(grid);
+    loadMask->topologyIntersection(regionMask);
+
+    // Defer to the mask-based overload for the actual loading.
+    loadPoints(grid, *loadMask);
+}
+
+
+////////////////////////////////////////
+
+
+} // namespace tools
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+
+#endif // OPENVDB_TOOLS_POINT_LOAD_HAS_BEEN_INCLUDED
+
+
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/include/openvdb_points/version.h b/nuparu/include/openvdb_points/version.h
new file mode 100644
index 00000000..5ee0be85
--- /dev/null
+++ b/nuparu/include/openvdb_points/version.h
@@ -0,0 +1,85 @@
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+//
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
+//
+// Redistributions of source code must retain the above copyright
+// and license notice and the following restrictions and disclaimer.
+//
+// *     Neither the name of Double Negative Visual Effects nor the names
+// of its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IN NO EVENT SHALL THE COPYRIGHT HOLDERS' AND CONTRIBUTORS' AGGREGATE
+// LIABILITY FOR ALL CLAIMS REGARDLESS OF THEIR BASIS EXCEED US$250.00.
+//
+///////////////////////////////////////////////////////////////////////////
+
+#ifndef OPENVDB_POINTS_VERSION_HAS_BEEN_INCLUDED
+#define OPENVDB_POINTS_VERSION_HAS_BEEN_INCLUDED
+
+#include <openvdb/version.h>
+
+/// Always disable Houdini warnings: define IGNORE_HDK_DEPRECATIONS unless it is already defined
+#ifndef IGNORE_HDK_DEPRECATIONS
+#define IGNORE_HDK_DEPRECATIONS
+#endif
+
+// Library major, minor and patch version numbers (the version string below is spelled out separately)
+#define OPENVDB_POINTS_LIBRARY_MAJOR_VERSION_NUMBER 0
+#define OPENVDB_POINTS_LIBRARY_MINOR_VERSION_NUMBER 2
+#define OPENVDB_POINTS_LIBRARY_PATCH_VERSION_NUMBER 0
+
+/// @brief Library version number string of the form "<major>.<minor>.<patch>"
+/// @details This is a macro rather than a static constant because we typically
+/// want the compile-time version number, not the runtime version number
+/// (although the two are usually the same).
+#define OPENVDB_POINTS_LIBRARY_VERSION_STRING "0.2.0"
+
+/// Packed version integer ("%02x%02x%04x"): major << 24 (not masked), minor in bits 16-23, patch in bits 0-15
+#define OPENVDB_POINTS_LIBRARY_VERSION_NUMBER \
+    ((OPENVDB_POINTS_LIBRARY_MAJOR_VERSION_NUMBER << 24) | \
+    ((OPENVDB_POINTS_LIBRARY_MINOR_VERSION_NUMBER & 0xFF) << 16) | \
+    (OPENVDB_POINTS_LIBRARY_PATCH_VERSION_NUMBER & 0xFFFF))
+
+namespace openvdb {
+OPENVDB_USE_VERSION_NAMESPACE
+namespace OPENVDB_VERSION_NAME {
+
+namespace points {
+
+/// Major, minor and patch components of the library version, taken from the macros above.
+const uint32_t OPENVDB_POINTS_LIBRARY_MAJOR_VERSION = OPENVDB_POINTS_LIBRARY_MAJOR_VERSION_NUMBER;
+const uint32_t OPENVDB_POINTS_LIBRARY_MINOR_VERSION = OPENVDB_POINTS_LIBRARY_MINOR_VERSION_NUMBER;
+const uint32_t OPENVDB_POINTS_LIBRARY_PATCH_VERSION = OPENVDB_POINTS_LIBRARY_PATCH_VERSION_NUMBER;
+
+/// The library version packed into a single integer ("%02x%02x%04x", major, minor, patch).
+const uint32_t OPENVDB_POINTS_LIBRARY_VERSION = OPENVDB_POINTS_LIBRARY_VERSION_NUMBER;
+
+/// @return the library version as a string of the form "<major>.<minor>.<patch>".
+inline const char*
+getLibraryVersionString() { return OPENVDB_POINTS_LIBRARY_VERSION_STRING; }
+
+} // namespace points
+
+} // namespace OPENVDB_VERSION_NAME
+} // namespace openvdb
+
+#endif // OPENVDB_POINTS_VERSION_HAS_BEEN_INCLUDED
+
+// Copyright (c) 2015-2016 Double Negative Visual Effects
+// All rights reserved. This software is distributed under the
+// Mozilla Public License 2.0 ( http://www.mozilla.org/MPL/2.0/ )
diff --git a/nuparu/lib/linux/asdfasdfasdf.a b/nuparu/lib/linux/asdfasdfasdf.a
new file mode 100644
index 00000000..659cd9ad
Binary files /dev/null and b/nuparu/lib/linux/asdfasdfasdf.a differ
diff --git a/nuparu/lib/linux/libAntTweakBar.a b/nuparu/lib/linux/libAntTweakBar.a
new file mode 100644
index 00000000..20f53ca3
Binary files /dev/null and b/nuparu/lib/linux/libAntTweakBar.a differ
diff --git a/nuparu/lib/linux/libHalf.a b/nuparu/lib/linux/libHalf.a
new file mode 100644
index 00000000..6eebeb6a
Binary files /dev/null and b/nuparu/lib/linux/libHalf.a differ
diff --git a/nuparu/lib/linux/libIex.a b/nuparu/lib/linux/libIex.a
new file mode 100644
index 00000000..ae1e2b8d
Binary files /dev/null and b/nuparu/lib/linux/libIex.a differ
diff --git a/nuparu/lib/linux/libIexMath.a b/nuparu/lib/linux/libIexMath.a
new file mode 100644
index 00000000..e31d0c88
Binary files /dev/null and b/nuparu/lib/linux/libIexMath.a differ
diff --git a/nuparu/lib/linux/libIlmThread.a b/nuparu/lib/linux/libIlmThread.a
new file mode 100644
index 00000000..099d0276
Binary files /dev/null and b/nuparu/lib/linux/libIlmThread.a differ
diff --git a/nuparu/lib/linux/libImath.a b/nuparu/lib/linux/libImath.a
new file mode 100644
index 00000000..81ccafe8
Binary files /dev/null and b/nuparu/lib/linux/libImath.a differ
diff --git a/nuparu/lib/linux/libboost_iostreams.a b/nuparu/lib/linux/libboost_iostreams.a
new file mode 100644
index 00000000..b5ab54c3
Binary files /dev/null and b/nuparu/lib/linux/libboost_iostreams.a differ
diff --git a/nuparu/lib/linux/libboost_system.a b/nuparu/lib/linux/libboost_system.a
new file mode 100644
index 00000000..cc274bd4
Binary files /dev/null and b/nuparu/lib/linux/libboost_system.a differ
diff --git a/nuparu/lib/linux/libjsoncpp.a b/nuparu/lib/linux/libjsoncpp.a
new file mode 100644
index 00000000..a5509650
Binary files /dev/null and b/nuparu/lib/linux/libjsoncpp.a differ
diff --git a/nuparu/lib/linux/libopenvdb.a b/nuparu/lib/linux/libopenvdb.a
new file mode 100644
index 00000000..71f9d58d
Binary files /dev/null and b/nuparu/lib/linux/libopenvdb.a differ
diff --git a/nuparu/lib/linux/libopenvdb_new.a b/nuparu/lib/linux/libopenvdb_new.a
new file mode 100644
index 00000000..4600d2b3
Binary files /dev/null and b/nuparu/lib/linux/libopenvdb_new.a differ
diff --git a/nuparu/lib/linux/libopenvdb_points.a b/nuparu/lib/linux/libopenvdb_points.a
new file mode 100644
index 00000000..f335ac1c
Binary files /dev/null and b/nuparu/lib/linux/libopenvdb_points.a differ
diff --git a/nuparu/lib/linux/libtbb.so b/nuparu/lib/linux/libtbb.so
new file mode 100644
index 00000000..43e23674
--- /dev/null
+++ b/nuparu/lib/linux/libtbb.so
@@ -0,0 +1 @@
+INPUT (libtbb.so.2)
diff --git a/nuparu/lib/linux/libtbbmalloc.so b/nuparu/lib/linux/libtbbmalloc.so
new file mode 100644
index 00000000..2ee0cac0
--- /dev/null
+++ b/nuparu/lib/linux/libtbbmalloc.so
@@ -0,0 +1 @@
+INPUT (libtbbmalloc.so.2)
diff --git a/nuparu/lib/win/AntTweakBar.dll b/nuparu/lib/win/AntTweakBar.dll
new file mode 100644
index 00000000..eeeff2a7
Binary files /dev/null and b/nuparu/lib/win/AntTweakBar.dll differ
diff --git a/nuparu/lib/win/AntTweakBar.lib b/nuparu/lib/win/AntTweakBar.lib
new file mode 100644
index 00000000..0301a125
Binary files /dev/null and b/nuparu/lib/win/AntTweakBar.lib differ
diff --git a/src/camera/camera.cpp b/src/camera/camera.cpp
deleted file mode 100644
index fb246b12..00000000
--- a/src/camera/camera.cpp
+++ /dev/null
@@ -1,6 +0,0 @@
-//
-//  camera.cpp
-//  Thanda
-//
-
-#include "camera.hpp"
diff --git a/src/camera/camera.hpp b/src/camera/camera.hpp
deleted file mode 100644
index 59ad2b12..00000000
--- a/src/camera/camera.hpp
+++ /dev/null
@@ -1,10 +0,0 @@
-//
-//  camera.hpp
-//  Thanda
-//
-
-#ifndef camera_hpp
-#define camera_hpp
-
-
-#endif /* camera_hpp */
diff --git a/src/fluidSolver/fluidSolver.cpp b/src/fluidSolver/fluidSolver.cpp
deleted file mode 100644
index 9c9a1663..00000000
--- a/src/fluidSolver/fluidSolver.cpp
+++ /dev/null
@@ -1,6 +0,0 @@
-//
-//  fluidSolver.cpp
-//  Thanda
-
-
-#include "fluidSolver.hpp"
diff --git a/src/fluidSolver/fluidSolver.hpp b/src/fluidSolver/fluidSolver.hpp
deleted file mode 100644
index 6429c4e3..00000000
--- a/src/fluidSolver/fluidSolver.hpp
+++ /dev/null
@@ -1,9 +0,0 @@
-//
-//  fluidSolver.hpp
-//  Thanda
-
-#ifndef fluidSolver_hpp
-#define fluidSolver_hpp
-
-
-#endif /* fluidSolver_hpp */
diff --git a/src/geom/geom.cpp b/src/geom/geom.cpp
deleted file mode 100644
index c4438fdc..00000000
--- a/src/geom/geom.cpp
+++ /dev/null
@@ -1,5 +0,0 @@
-//
-//  geom.cpp
-//  Thanda
-
-#include "geom.hpp"
diff --git a/src/geom/geom.hpp b/src/geom/geom.hpp
deleted file mode 100644
index 91e658f7..00000000
--- a/src/geom/geom.hpp
+++ /dev/null
@@ -1,9 +0,0 @@
-//
-//  geom.hpp
-//  Thanda
-
-#ifndef geom_hpp
-#define geom_hpp
-
-
-#endif /* geom_hpp */
diff --git a/src/main.cpp b/src/main.cpp
deleted file mode 100644
index 2a00d1e4..00000000
--- a/src/main.cpp
+++ /dev/null
@@ -1,11 +0,0 @@
-#include "main.hpp"
-
-using namespace std;
-
-
-int main()
-{
-
-    return 0;
-}
-
diff --git a/src/main.hpp b/src/main.hpp
deleted file mode 100644
index fe1f64d5..00000000
--- a/src/main.hpp
+++ /dev/null
@@ -1,9 +0,0 @@
-//
-//  main.hpp
-//  Thanda
-
-#ifndef main_hpp
-#define main_hpp
-
-#endif /* main_hpp */
-
diff --git a/src/scene/scene.cpp b/src/scene/scene.cpp
deleted file mode 100644
index 32300ca9..00000000
--- a/src/scene/scene.cpp
+++ /dev/null
@@ -1,5 +0,0 @@
-//
-//  scene.cpp
-//  Thanda
-
-#include "scene.hpp"
diff --git a/src/scene/scene.hpp b/src/scene/scene.hpp
deleted file mode 100644
index 84022a43..00000000
--- a/src/scene/scene.hpp
+++ /dev/null
@@ -1,3 +0,0 @@
-//
-//  scene.hpp
-//  Thanda
\ No newline at end of file
diff --git a/src/scene/scene.json b/src/scene/scene.json
deleted file mode 100644
index 789b4a42..00000000
--- a/src/scene/scene.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-	"containerDim" : {
-		"scaleX" : 5.0,
-		"scaleY" : 5.0,
-		"scaleZ" : 5.0
-	},
-	"particleDim" : {
-		"boundX" : 3.7,
-		"boundY" : 3.7,
-		"boundZ" : 3.7
-	},
-	"particleSeparation" : 0.1
-}
\ No newline at end of file
diff --git a/src/viewer/viewer.cpp b/src/viewer/viewer.cpp
deleted file mode 100644
index 60d1868a..00000000
--- a/src/viewer/viewer.cpp
+++ /dev/null
@@ -1,5 +0,0 @@
-//
-//  viewer.cpp
-//  Thanda
-
-#include "viewer.hpp"
diff --git a/src/viewer/viewer.hpp b/src/viewer/viewer.hpp
deleted file mode 100644
index abb78a17..00000000
--- a/src/viewer/viewer.hpp
+++ /dev/null
@@ -1,9 +0,0 @@
-//
-//  viewer.hpp
-//  Thanda
-
-#ifndef viewer_hpp
-#define viewer_hpp
-
-
-#endif /* viewer_hpp */